1 /*
2  * Copyright © 2014 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * Authors:
24  *    Ben Widawsky <ben@bwidawsk.net>
25  *    Michel Thierry <michel.thierry@intel.com>
26  *    Thomas Daniel <thomas.daniel@intel.com>
27  *    Oscar Mateo <oscar.mateo@intel.com>
28  *
29  */
30
31 /**
32  * DOC: Logical Rings, Logical Ring Contexts and Execlists
33  *
34  * Motivation:
35  * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
36  * These expanded contexts enable a number of new abilities, especially
37  * "Execlists" (also implemented in this file).
38  *
39  * One of the main differences with the legacy HW contexts is that logical
40  * ring contexts incorporate many more things into the context's state, like
41  * PDPs or ringbuffer control registers:
42  *
43  * The reason why PDPs are included in the context is straightforward: as
44  * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
45  * contained there means you don't need to do a ppgtt->switch_mm yourself;
46  * instead, the GPU will do it for you on the context switch.
47  *
48  * But what about the ringbuffer control registers (head, tail, etc.)?
49  * Shouldn't we just need one set of those per engine command streamer? This is
50  * where the name "Logical Rings" starts to make sense: by virtualizing the
51  * rings, the engine cs shifts to a new "ring buffer" with every context
52  * switch. When you want to submit a workload to the GPU you: A) choose your
53  * context, B) find its appropriate virtualized ring, C) write commands to it
54  * and then, finally, D) tell the GPU to switch to that context.
55  *
56  * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
57  * to a context is via a context execution list, ergo "Execlists".
58  *
59  * LRC implementation:
60  * Regarding the creation of contexts, we have:
61  *
62  * - One global default context.
63  * - One local default context for each opened fd.
64  * - One local extra context for each context create ioctl call.
65  *
66  * Now that ringbuffers belong per-context (and not per-engine, like before)
67  * and that contexts are uniquely tied to a given engine (and not reusable,
68  * like before) we need:
69  *
70  * - One ringbuffer per-engine inside each context.
71  * - One backing object per-engine inside each context.
72  *
73  * The global default context starts its life with these new objects fully
74  * allocated and populated. The local default context for each opened fd is
75  * more complex, because we don't know at creation time which engine is going
76  * to use it. To handle this, we have implemented a deferred creation of LR
77  * contexts:
78  *
79  * The local context starts its life as a hollow or blank holder, that only
80  * gets populated for a given engine once we receive an execbuffer. If later
81  * on we receive another execbuffer ioctl for the same context but a different
82  * engine, we allocate/populate a new ringbuffer and context backing object and
83  * so on.
84  *
85  * Finally, regarding local contexts created using the ioctl call: as they are
86  * only allowed with the render ring, we can allocate & populate them right
87  * away (no need to defer anything, at least for now).
88  *
89  * Execlists implementation:
90  * Execlists are the new method by which, on gen8+ hardware, workloads are
91  * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
92  * This method works as follows:
93  *
94  * When a request is committed, its commands (the BB start and any leading or
95  * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
96  * for the appropriate context. The tail pointer in the hardware context is not
97  * updated at this time but is instead kept by the driver in the ringbuffer
98  * structure. A structure representing this request is added to a request queue
99  * for the appropriate engine: this structure contains a copy of the context's
100  * tail after the request was written to the ring buffer and a pointer to the
101  * context itself.
102  *
103  * If the engine's request queue was empty before the request was added, the
104  * queue is processed immediately. Otherwise the queue will be processed during
105  * a context switch interrupt. In any case, elements on the queue will get sent
106  * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
107  * globally unique 20-bit submission ID.
108  *
109  * When execution of a request completes, the GPU updates the context status
110  * buffer with a context complete event and generates a context switch interrupt.
111  * During the interrupt handling, the driver examines the events in the buffer:
112  * for each context complete event, if the announced ID matches that on the head
113  * of the request queue, then that request is retired and removed from the queue.
114  *
115  * After processing, if any requests were retired and the queue is not empty
116  * then a new execution list can be submitted. The two requests at the front of
117  * the queue are next to be submitted but since a context may not occur twice in
118  * an execution list, if subsequent requests have the same ID as the first then
119  * the two requests must be combined. This is done simply by discarding requests
120  * at the head of the queue until either only one request is left (in which case
121  * we use a NULL second context) or the first two requests have unique IDs.
122  *
123  * By always executing the first two requests in the queue the driver ensures
124  * that the GPU is kept as busy as possible. In the case where a single context
125  * completes but a second context is still executing, the request for this second
126  * context will be at the head of the queue when we remove the first one. This
127  * request will then be resubmitted along with a new request for a different context,
128  * which will cause the hardware to continue executing the second request and queue
129  * the new request (the GPU detects the condition of a context getting preempted
130  * with the same context and optimizes the context switch flow by not doing
131  * preemption, but just sampling the new tail pointer).
132  *
133  */
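
/*
 * Illustrative sketch only (not driver code): the port-coalescing rule
 * described above amounts to something like the following, where
 * dequeue_request(), peek_request(), same_context() and merge_into() are
 * hypothetical helpers standing in for the real priolist handling below.
 *
 *	elsp[0] = dequeue_request(queue);
 *	while ((next = peek_request(queue)) && same_context(next, elsp[0]))
 *		elsp[0] = merge_into(elsp[0], dequeue_request(queue));
 *	elsp[1] = peek_request(queue) ? dequeue_request(queue) : NULL;
 *
 * i.e. consecutive requests for the same context are folded into a single
 * element so that a context never appears twice in one execution list.
 */
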
134 #include <linux/interrupt.h>
135
136 #include "i915_drv.h"
137 #include "i915_perf.h"
138 #include "i915_trace.h"
139 #include "i915_vgpu.h"
140 #include "intel_context.h"
141 #include "intel_engine_pm.h"
142 #include "intel_gt.h"
143 #include "intel_gt_pm.h"
144 #include "intel_gt_requests.h"
145 #include "intel_lrc_reg.h"
146 #include "intel_mocs.h"
147 #include "intel_reset.h"
148 #include "intel_ring.h"
149 #include "intel_workarounds.h"
150
151 #define RING_EXECLIST_QFULL             (1 << 0x2)
152 #define RING_EXECLIST1_VALID            (1 << 0x3)
153 #define RING_EXECLIST0_VALID            (1 << 0x4)
154 #define RING_EXECLIST_ACTIVE_STATUS     (3 << 0xE)
155 #define RING_EXECLIST1_ACTIVE           (1 << 0x11)
156 #define RING_EXECLIST0_ACTIVE           (1 << 0x12)
157
158 #define GEN8_CTX_STATUS_IDLE_ACTIVE     (1 << 0)
159 #define GEN8_CTX_STATUS_PREEMPTED       (1 << 1)
160 #define GEN8_CTX_STATUS_ELEMENT_SWITCH  (1 << 2)
161 #define GEN8_CTX_STATUS_ACTIVE_IDLE     (1 << 3)
162 #define GEN8_CTX_STATUS_COMPLETE        (1 << 4)
163 #define GEN8_CTX_STATUS_LITE_RESTORE    (1 << 15)
164
165 #define GEN8_CTX_STATUS_COMPLETED_MASK \
166          (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)
167
168 #define CTX_DESC_FORCE_RESTORE BIT_ULL(2)
169
170 #define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE  (0x1) /* lower csb dword */
171 #define GEN12_CTX_SWITCH_DETAIL(csb_dw) ((csb_dw) & 0xF) /* upper csb dword */
172 #define GEN12_CSB_SW_CTX_ID_MASK                GENMASK(25, 15)
173 #define GEN12_IDLE_CTX_ID               0x7FF
174 #define GEN12_CSB_CTX_VALID(csb_dw) \
175         (FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)
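
/*
 * Usage sketch (the real parsing is done in process_csb() later in this
 * file): given the upper dword of a CSB entry, GEN12_CSB_CTX_VALID()
 * reports whether a real context (rather than the idle pseudo-context)
 * is named, e.g.
 *
 *	if (GEN12_CSB_CTX_VALID(upper_dw))
 *		... a real SW context ID is reported ...
 *
 * where upper_dw stands for the entry's upper dword.
 */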
176
177 /* Typical size of the average request (2 pipecontrols and a MI_BB) */
178 #define EXECLISTS_REQUEST_SIZE 64 /* bytes */
179
180 struct virtual_engine {
181         struct intel_engine_cs base;
182         struct intel_context context;
183
184         /*
185          * We allow only a single request through the virtual engine at a time
186          * (each request in the timeline waits for the completion fence of
187          * the previous before being submitted). By restricting ourselves to
188          * only submitting a single request, each request is placed on to a
189          * physical engine to maximise load spreading (by virtue of the late greedy
190          * scheduling -- each real engine takes the next available request
191          * upon idling).
192          */
193         struct i915_request *request;
194
195         /*
196          * We keep a rbtree of available virtual engines inside each physical
197          * engine, sorted by priority. Here we preallocate the nodes we need
198          * for the virtual engine, indexed by physical_engine->id.
199          */
200         struct ve_node {
201                 struct rb_node rb;
202                 int prio;
203         } nodes[I915_NUM_ENGINES];
204
205         /*
206          * Keep track of bonded pairs -- restrictions upon our selection
207          * of physical engines any particular request may be submitted to.
208          * If we receive a submit-fence from a master engine, we will only
209          * use one of sibling_mask physical engines.
210          */
211         struct ve_bond {
212                 const struct intel_engine_cs *master;
213                 intel_engine_mask_t sibling_mask;
214         } *bonds;
215         unsigned int num_bonds;
216
217         /* And finally, which physical engines this virtual engine maps onto. */
218         unsigned int num_siblings;
219         struct intel_engine_cs *siblings[0];
220 };
221
222 static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine)
223 {
224         GEM_BUG_ON(!intel_engine_is_virtual(engine));
225         return container_of(engine, struct virtual_engine, base);
226 }
227
228 static int __execlists_context_alloc(struct intel_context *ce,
229                                      struct intel_engine_cs *engine);
230
231 static void execlists_init_reg_state(u32 *reg_state,
232                                      const struct intel_context *ce,
233                                      const struct intel_engine_cs *engine,
234                                      const struct intel_ring *ring,
235                                      bool close);
236 static void
237 __execlists_update_reg_state(const struct intel_context *ce,
238                              const struct intel_engine_cs *engine,
239                              u32 head);
240
241 static u32 intel_context_get_runtime(const struct intel_context *ce)
242 {
243         /*
244          * We can use either ppHWSP[16] which is recorded before the context
245          * switch (and so excludes the cost of context switches) or use the
246          * value from the context image itself, which is saved/restored earlier
247          * and so includes the cost of the save.
248          */
249         return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]);
250 }
251
252 static void mark_eio(struct i915_request *rq)
253 {
254         if (i915_request_completed(rq))
255                 return;
256
257         GEM_BUG_ON(i915_request_signaled(rq));
258
259         i915_request_set_error_once(rq, -EIO);
260         i915_request_mark_complete(rq);
261 }
262
263 static struct i915_request *
264 active_request(const struct intel_timeline * const tl, struct i915_request *rq)
265 {
266         struct i915_request *active = rq;
267
268         rcu_read_lock();
269         list_for_each_entry_continue_reverse(rq, &tl->requests, link) {
270                 if (i915_request_completed(rq))
271                         break;
272
273                 active = rq;
274         }
275         rcu_read_unlock();
276
277         return active;
278 }
279
280 static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine)
281 {
282         return (i915_ggtt_offset(engine->status_page.vma) +
283                 I915_GEM_HWS_PREEMPT_ADDR);
284 }
285
286 static inline void
287 ring_set_paused(const struct intel_engine_cs *engine, int state)
288 {
289         /*
290          * We inspect HWS_PREEMPT with a semaphore inside
291          * engine->emit_fini_breadcrumb. If the dword is true,
292          * the ring is paused as the semaphore will busywait
293          * until the dword is false.
294          */
295         engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state;
296         if (state)
297                 wmb();
298 }
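
/*
 * Sketch of the other half of this handshake (the actual emission lives in
 * the engine->emit_fini_breadcrumb callbacks further down this file): the
 * fini breadcrumb contains a semaphore poll roughly of the form
 *
 *	*cs++ = MI_SEMAPHORE_WAIT |
 *		MI_SEMAPHORE_GLOBAL_GTT |
 *		MI_SEMAPHORE_POLL |
 *		MI_SEMAPHORE_SAD_EQ_SDD;
 *	*cs++ = 0;
 *	*cs++ = intel_hws_preempt_address(engine);
 *	*cs++ = 0;
 *
 * so the CS busywaits at the breadcrumb for as long as HWS_PREEMPT is
 * non-zero, which is exactly the pause requested by ring_set_paused().
 */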
299
300 static inline struct i915_priolist *to_priolist(struct rb_node *rb)
301 {
302         return rb_entry(rb, struct i915_priolist, node);
303 }
304
305 static inline int rq_prio(const struct i915_request *rq)
306 {
307         return READ_ONCE(rq->sched.attr.priority);
308 }
309
310 static int effective_prio(const struct i915_request *rq)
311 {
312         int prio = rq_prio(rq);
313
314         /*
315          * If this request is special and must not be interrupted at any
316          * cost, so be it. Note we are only checking the most recent request
317          * in the context and so may be masking an earlier vip request. It
318          * is hoped that under the conditions where nopreempt is used, this
319          * will not matter (i.e. all requests to that context will be
320          * nopreempt for as long as desired).
321          */
322         if (i915_request_has_nopreempt(rq))
323                 prio = I915_PRIORITY_UNPREEMPTABLE;
324
325         /*
326          * On unwinding the active request, we give it a priority bump
327          * if it has completed waiting on any semaphore. If we know that
328          * the request has already started, we can prevent an unwanted
329          * preempt-to-idle cycle by taking that into account now.
330          */
331         if (__i915_request_has_started(rq))
332                 prio |= I915_PRIORITY_NOSEMAPHORE;
333
334         /* Restrict mere WAIT boosts from triggering preemption */
335         BUILD_BUG_ON(__NO_PREEMPTION & ~I915_PRIORITY_MASK); /* only internal */
336         return prio | __NO_PREEMPTION;
337 }
338
339 static int queue_prio(const struct intel_engine_execlists *execlists)
340 {
341         struct i915_priolist *p;
342         struct rb_node *rb;
343
344         rb = rb_first_cached(&execlists->queue);
345         if (!rb)
346                 return INT_MIN;
347
348         /*
349          * As the priolist[] is inverted, with the highest priority in [0],
350          * we have to flip the index value to recover the priority.
351          */
352         p = to_priolist(rb);
353         return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used);
354 }
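
/*
 * Worked example (sketch, assuming I915_USER_PRIORITY_SHIFT == 2 so each
 * user level carries four internal sub-levels, [0] being the highest):
 * with p->priority == 0 and only bit 0 of p->used set, ffs() returns 1
 * and the result is (1 << 2) - 1 = 3, i.e. user level 0 at its highest
 * internal bump; if instead only bit 3 were set, the result would be
 * (1 << 2) - 4 = 0, the lowest sub-level of the same user priority.
 */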
355
356 static inline bool need_preempt(const struct intel_engine_cs *engine,
357                                 const struct i915_request *rq,
358                                 struct rb_node *rb)
359 {
360         int last_prio;
361
362         if (!intel_engine_has_semaphores(engine))
363                 return false;
364
365         /*
366          * Check if the current priority hint merits a preemption attempt.
367          *
368          * We record the highest value priority we saw during rescheduling
369          * prior to this dequeue, therefore we know that if it is strictly
370          * less than the current tail of ELSP[0], we do not need to force
371          * a preempt-to-idle cycle.
372          *
373          * However, the priority hint is a mere hint that we may need to
374          * preempt. If that hint is stale or we may be trying to preempt
375          * ourselves, ignore the request.
376          *
377          * More naturally we would write
378          *      prio >= max(0, last);
379          * except that we wish to prevent triggering preemption at the same
380          * priority level: the task that is running should remain running
381          * to preserve FIFO ordering of dependencies.
382          */
383         last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1);
384         if (engine->execlists.queue_priority_hint <= last_prio)
385                 return false;
386
387         /*
388          * Check against the first request in ELSP[1], it will, thanks to the
389          * power of PI, be the highest priority of that context.
390          */
391         if (!list_is_last(&rq->sched.link, &engine->active.requests) &&
392             rq_prio(list_next_entry(rq, sched.link)) > last_prio)
393                 return true;
394
395         if (rb) {
396                 struct virtual_engine *ve =
397                         rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
398                 bool preempt = false;
399
400                 if (engine == ve->siblings[0]) { /* only preempt one sibling */
401                         struct i915_request *next;
402
403                         rcu_read_lock();
404                         next = READ_ONCE(ve->request);
405                         if (next)
406                                 preempt = rq_prio(next) > last_prio;
407                         rcu_read_unlock();
408                 }
409
410                 if (preempt)
411                         return preempt;
412         }
413
414         /*
415          * If the inflight context did not trigger the preemption, then maybe
416          * it was the set of queued requests? Pick the highest priority in
417          * the queue (the first active priolist) and see if it deserves to be
418          * running instead of ELSP[0].
419          *
420          * The highest priority request in the queue cannot be either
421          * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
422          * context, its priority would not exceed ELSP[0] aka last_prio.
423          */
424         return queue_prio(&engine->execlists) > last_prio;
425 }
426
427 __maybe_unused static inline bool
428 assert_priority_queue(const struct i915_request *prev,
429                       const struct i915_request *next)
430 {
431         /*
432          * Without preemption, the prev may refer to the still active element
433          * which we refuse to let go.
434          *
435          * Even with preemption, there are times when we think it is better not
436          * to preempt and leave an ostensibly lower priority request in flight.
437          */
438         if (i915_request_is_active(prev))
439                 return true;
440
441         return rq_prio(prev) >= rq_prio(next);
442 }
443
444 /*
445  * The context descriptor encodes various attributes of a context,
446  * including its GTT address and some flags. Because it's fairly
447  * expensive to calculate, we'll just do it once and cache the result,
448  * which remains valid until the context is unpinned.
449  *
450  * This is what a descriptor looks like, from LSB to MSB::
451  *
452  *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
453  *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
454  *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
455  *      bits 53-54:    mbz, reserved for use by hardware
456  *      bits 55-63:    group ID, currently unused and set to 0
457  *
458  * Starting from Gen11, the upper dword of the descriptor has a new format:
459  *
460  *      bits 32-36:    reserved
461  *      bits 37-47:    SW context ID
462  *      bits 48:53:    engine instance
463  *      bit 54:        mbz, reserved for use by hardware
464  *      bits 55-60:    SW counter
465  *      bits 61-63:    engine class
466  *
467  * engine info, SW context ID and SW counter need to form a unique number
468  * (Context ID) per lrc.
469  */
470 static u64
471 lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine)
472 {
473         u64 desc;
474
475         desc = INTEL_LEGACY_32B_CONTEXT;
476         if (i915_vm_is_4lvl(ce->vm))
477                 desc = INTEL_LEGACY_64B_CONTEXT;
478         desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
479
480         desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
481         if (IS_GEN(engine->i915, 8))
482                 desc |= GEN8_CTX_L3LLC_COHERENT;
483
484         desc |= i915_ggtt_offset(ce->state); /* bits 12-31 */
485         /*
486          * The following 32bits are copied into the OA reports (dword 2).
487          * Consider updating oa_get_render_ctx_id in i915_perf.c when changing
488          * anything below.
489          */
490         if (INTEL_GEN(engine->i915) >= 11) {
491                 desc |= (u64)engine->instance << GEN11_ENGINE_INSTANCE_SHIFT;
492                                                                 /* bits 48-53 */
493
494                 desc |= (u64)engine->class << GEN11_ENGINE_CLASS_SHIFT;
495                                                                 /* bits 61-63 */
496         }
497
498         return desc;
499 }
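
/*
 * For illustration only (gen11+ layout described in the comment above),
 * the upper dword of a descriptor could be unpacked roughly as:
 *
 *	u32 upper = upper_32_bits(desc);
 *	u32 sw_ctx_id = (upper >> (GEN11_SW_CTX_ID_SHIFT - 32)) & 0x7ff;
 *	u32 instance  = (upper >> (GEN11_ENGINE_INSTANCE_SHIFT - 32)) & 0x3f;
 *	u32 class     = (upper >> (GEN11_ENGINE_CLASS_SHIFT - 32)) & 0x7;
 *
 * Note the SW context ID itself is only filled in at submission time,
 * see __execlists_schedule_in().
 */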
500
501 static inline unsigned int dword_in_page(void *addr)
502 {
503         return offset_in_page(addr) / sizeof(u32);
504 }
505
506 static void set_offsets(u32 *regs,
507                         const u8 *data,
508                         const struct intel_engine_cs *engine,
509                         bool clear)
510 #define NOP(x) (BIT(7) | (x))
511 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
512 #define POSTED BIT(0)
513 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
514 #define REG16(x) \
515         (((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
516         (((x) >> 2) & 0x7f)
517 #define END(x) 0, (x)
518 {
519         const u32 base = engine->mmio_base;
520
521         while (*data) {
522                 u8 count, flags;
523
524                 if (*data & BIT(7)) { /* skip */
525                         count = *data++ & ~BIT(7);
526                         if (clear)
527                                 memset32(regs, MI_NOOP, count);
528                         regs += count;
529                         continue;
530                 }
531
532                 count = *data & 0x3f;
533                 flags = *data >> 6;
534                 data++;
535
536                 *regs = MI_LOAD_REGISTER_IMM(count);
537                 if (flags & POSTED)
538                         *regs |= MI_LRI_FORCE_POSTED;
539                 if (INTEL_GEN(engine->i915) >= 11)
540                         *regs |= MI_LRI_CS_MMIO;
541                 regs++;
542
543                 GEM_BUG_ON(!count);
544                 do {
545                         u32 offset = 0;
546                         u8 v;
547
548                         do {
549                                 v = *data++;
550                                 offset <<= 7;
551                                 offset |= v & ~BIT(7);
552                         } while (v & BIT(7));
553
554                         regs[0] = base + (offset << 2);
555                         if (clear)
556                                 regs[1] = 0;
557                         regs += 2;
558                 } while (--count);
559         }
560
561         if (clear) {
562                 u8 count = *++data;
563
564                 /* Clear past the tail for HW access */
565                 GEM_BUG_ON(dword_in_page(regs) > count);
566                 memset32(regs, MI_NOOP, count - dword_in_page(regs));
567
568                 /* Close the batch; used mainly by live_lrc_layout() */
569                 *regs = MI_BATCH_BUFFER_END;
570                 if (INTEL_GEN(engine->i915) >= 10)
571                         *regs |= BIT(0);
572         }
573 }
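
/*
 * Worked example (sketch): the table fragment
 *
 *	LRI(2, POSTED),
 *	REG16(0x244),
 *	REG(0x034),
 *
 * encodes "MI_LOAD_REGISTER_IMM(2) | MI_LRI_FORCE_POSTED" followed by the
 * register offsets 0x244 and 0x034 (relative to engine->mmio_base), so
 * set_offsets() expands it to
 *
 *	regs[0] = MI_LOAD_REGISTER_IMM(2) | MI_LRI_FORCE_POSTED;
 *	regs[1] = base + 0x244;	regs[2] = 0;
 *	regs[3] = base + 0x034;	regs[4] = 0;
 *
 * (plus MI_LRI_CS_MMIO on gen11+, and with the value dwords left
 * untouched when clear is false).
 */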
574
575 static const u8 gen8_xcs_offsets[] = {
576         NOP(1),
577         LRI(11, 0),
578         REG16(0x244),
579         REG(0x034),
580         REG(0x030),
581         REG(0x038),
582         REG(0x03c),
583         REG(0x168),
584         REG(0x140),
585         REG(0x110),
586         REG(0x11c),
587         REG(0x114),
588         REG(0x118),
589
590         NOP(9),
591         LRI(9, 0),
592         REG16(0x3a8),
593         REG16(0x28c),
594         REG16(0x288),
595         REG16(0x284),
596         REG16(0x280),
597         REG16(0x27c),
598         REG16(0x278),
599         REG16(0x274),
600         REG16(0x270),
601
602         NOP(13),
603         LRI(2, 0),
604         REG16(0x200),
605         REG(0x028),
606
607         END(80)
608 };
609
610 static const u8 gen9_xcs_offsets[] = {
611         NOP(1),
612         LRI(14, POSTED),
613         REG16(0x244),
614         REG(0x034),
615         REG(0x030),
616         REG(0x038),
617         REG(0x03c),
618         REG(0x168),
619         REG(0x140),
620         REG(0x110),
621         REG(0x11c),
622         REG(0x114),
623         REG(0x118),
624         REG(0x1c0),
625         REG(0x1c4),
626         REG(0x1c8),
627
628         NOP(3),
629         LRI(9, POSTED),
630         REG16(0x3a8),
631         REG16(0x28c),
632         REG16(0x288),
633         REG16(0x284),
634         REG16(0x280),
635         REG16(0x27c),
636         REG16(0x278),
637         REG16(0x274),
638         REG16(0x270),
639
640         NOP(13),
641         LRI(1, POSTED),
642         REG16(0x200),
643
644         NOP(13),
645         LRI(44, POSTED),
646         REG(0x028),
647         REG(0x09c),
648         REG(0x0c0),
649         REG(0x178),
650         REG(0x17c),
651         REG16(0x358),
652         REG(0x170),
653         REG(0x150),
654         REG(0x154),
655         REG(0x158),
656         REG16(0x41c),
657         REG16(0x600),
658         REG16(0x604),
659         REG16(0x608),
660         REG16(0x60c),
661         REG16(0x610),
662         REG16(0x614),
663         REG16(0x618),
664         REG16(0x61c),
665         REG16(0x620),
666         REG16(0x624),
667         REG16(0x628),
668         REG16(0x62c),
669         REG16(0x630),
670         REG16(0x634),
671         REG16(0x638),
672         REG16(0x63c),
673         REG16(0x640),
674         REG16(0x644),
675         REG16(0x648),
676         REG16(0x64c),
677         REG16(0x650),
678         REG16(0x654),
679         REG16(0x658),
680         REG16(0x65c),
681         REG16(0x660),
682         REG16(0x664),
683         REG16(0x668),
684         REG16(0x66c),
685         REG16(0x670),
686         REG16(0x674),
687         REG16(0x678),
688         REG16(0x67c),
689         REG(0x068),
690
691         END(176)
692 };
693
694 static const u8 gen12_xcs_offsets[] = {
695         NOP(1),
696         LRI(13, POSTED),
697         REG16(0x244),
698         REG(0x034),
699         REG(0x030),
700         REG(0x038),
701         REG(0x03c),
702         REG(0x168),
703         REG(0x140),
704         REG(0x110),
705         REG(0x1c0),
706         REG(0x1c4),
707         REG(0x1c8),
708         REG(0x180),
709         REG16(0x2b4),
710
711         NOP(5),
712         LRI(9, POSTED),
713         REG16(0x3a8),
714         REG16(0x28c),
715         REG16(0x288),
716         REG16(0x284),
717         REG16(0x280),
718         REG16(0x27c),
719         REG16(0x278),
720         REG16(0x274),
721         REG16(0x270),
722
723         END(80)
724 };
725
726 static const u8 gen8_rcs_offsets[] = {
727         NOP(1),
728         LRI(14, POSTED),
729         REG16(0x244),
730         REG(0x034),
731         REG(0x030),
732         REG(0x038),
733         REG(0x03c),
734         REG(0x168),
735         REG(0x140),
736         REG(0x110),
737         REG(0x11c),
738         REG(0x114),
739         REG(0x118),
740         REG(0x1c0),
741         REG(0x1c4),
742         REG(0x1c8),
743
744         NOP(3),
745         LRI(9, POSTED),
746         REG16(0x3a8),
747         REG16(0x28c),
748         REG16(0x288),
749         REG16(0x284),
750         REG16(0x280),
751         REG16(0x27c),
752         REG16(0x278),
753         REG16(0x274),
754         REG16(0x270),
755
756         NOP(13),
757         LRI(1, 0),
758         REG(0x0c8),
759
760         END(80)
761 };
762
763 static const u8 gen9_rcs_offsets[] = {
764         NOP(1),
765         LRI(14, POSTED),
766         REG16(0x244),
767         REG(0x34),
768         REG(0x30),
769         REG(0x38),
770         REG(0x3c),
771         REG(0x168),
772         REG(0x140),
773         REG(0x110),
774         REG(0x11c),
775         REG(0x114),
776         REG(0x118),
777         REG(0x1c0),
778         REG(0x1c4),
779         REG(0x1c8),
780
781         NOP(3),
782         LRI(9, POSTED),
783         REG16(0x3a8),
784         REG16(0x28c),
785         REG16(0x288),
786         REG16(0x284),
787         REG16(0x280),
788         REG16(0x27c),
789         REG16(0x278),
790         REG16(0x274),
791         REG16(0x270),
792
793         NOP(13),
794         LRI(1, 0),
795         REG(0xc8),
796
797         NOP(13),
798         LRI(44, POSTED),
799         REG(0x28),
800         REG(0x9c),
801         REG(0xc0),
802         REG(0x178),
803         REG(0x17c),
804         REG16(0x358),
805         REG(0x170),
806         REG(0x150),
807         REG(0x154),
808         REG(0x158),
809         REG16(0x41c),
810         REG16(0x600),
811         REG16(0x604),
812         REG16(0x608),
813         REG16(0x60c),
814         REG16(0x610),
815         REG16(0x614),
816         REG16(0x618),
817         REG16(0x61c),
818         REG16(0x620),
819         REG16(0x624),
820         REG16(0x628),
821         REG16(0x62c),
822         REG16(0x630),
823         REG16(0x634),
824         REG16(0x638),
825         REG16(0x63c),
826         REG16(0x640),
827         REG16(0x644),
828         REG16(0x648),
829         REG16(0x64c),
830         REG16(0x650),
831         REG16(0x654),
832         REG16(0x658),
833         REG16(0x65c),
834         REG16(0x660),
835         REG16(0x664),
836         REG16(0x668),
837         REG16(0x66c),
838         REG16(0x670),
839         REG16(0x674),
840         REG16(0x678),
841         REG16(0x67c),
842         REG(0x68),
843
844         END(176)
845 };
846
847 static const u8 gen11_rcs_offsets[] = {
848         NOP(1),
849         LRI(15, POSTED),
850         REG16(0x244),
851         REG(0x034),
852         REG(0x030),
853         REG(0x038),
854         REG(0x03c),
855         REG(0x168),
856         REG(0x140),
857         REG(0x110),
858         REG(0x11c),
859         REG(0x114),
860         REG(0x118),
861         REG(0x1c0),
862         REG(0x1c4),
863         REG(0x1c8),
864         REG(0x180),
865
866         NOP(1),
867         LRI(9, POSTED),
868         REG16(0x3a8),
869         REG16(0x28c),
870         REG16(0x288),
871         REG16(0x284),
872         REG16(0x280),
873         REG16(0x27c),
874         REG16(0x278),
875         REG16(0x274),
876         REG16(0x270),
877
878         LRI(1, POSTED),
879         REG(0x1b0),
880
881         NOP(10),
882         LRI(1, 0),
883         REG(0x0c8),
884
885         END(80)
886 };
887
888 static const u8 gen12_rcs_offsets[] = {
889         NOP(1),
890         LRI(13, POSTED),
891         REG16(0x244),
892         REG(0x034),
893         REG(0x030),
894         REG(0x038),
895         REG(0x03c),
896         REG(0x168),
897         REG(0x140),
898         REG(0x110),
899         REG(0x1c0),
900         REG(0x1c4),
901         REG(0x1c8),
902         REG(0x180),
903         REG16(0x2b4),
904
905         NOP(5),
906         LRI(9, POSTED),
907         REG16(0x3a8),
908         REG16(0x28c),
909         REG16(0x288),
910         REG16(0x284),
911         REG16(0x280),
912         REG16(0x27c),
913         REG16(0x278),
914         REG16(0x274),
915         REG16(0x270),
916
917         LRI(3, POSTED),
918         REG(0x1b0),
919         REG16(0x5a8),
920         REG16(0x5ac),
921
922         NOP(6),
923         LRI(1, 0),
924         REG(0x0c8),
925
926         END(80)
927 };
928
929 #undef END
930 #undef REG16
931 #undef REG
932 #undef LRI
933 #undef NOP
934
935 static const u8 *reg_offsets(const struct intel_engine_cs *engine)
936 {
937         /*
938          * The gen12+ lists only have the registers we program in the basic
939          * default state. We rely on the context image using relative
940          * addressing to automatically fix up the register state between the
941          * physical engines for the virtual engine.
942          */
943         GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 &&
944                    !intel_engine_has_relative_mmio(engine));
945
946         if (engine->class == RENDER_CLASS) {
947                 if (INTEL_GEN(engine->i915) >= 12)
948                         return gen12_rcs_offsets;
949                 else if (INTEL_GEN(engine->i915) >= 11)
950                         return gen11_rcs_offsets;
951                 else if (INTEL_GEN(engine->i915) >= 9)
952                         return gen9_rcs_offsets;
953                 else
954                         return gen8_rcs_offsets;
955         } else {
956                 if (INTEL_GEN(engine->i915) >= 12)
957                         return gen12_xcs_offsets;
958                 else if (INTEL_GEN(engine->i915) >= 9)
959                         return gen9_xcs_offsets;
960                 else
961                         return gen8_xcs_offsets;
962         }
963 }
964
965 static struct i915_request *
966 __unwind_incomplete_requests(struct intel_engine_cs *engine)
967 {
968         struct i915_request *rq, *rn, *active = NULL;
969         struct list_head *uninitialized_var(pl);
970         int prio = I915_PRIORITY_INVALID;
971
972         lockdep_assert_held(&engine->active.lock);
973
974         list_for_each_entry_safe_reverse(rq, rn,
975                                          &engine->active.requests,
976                                          sched.link) {
977                 if (i915_request_completed(rq))
978                         continue; /* XXX */
979
980                 __i915_request_unsubmit(rq);
981
982                 /*
983                  * Push the request back into the queue for later resubmission.
984                  * If this request is not native to this physical engine (i.e.
985                  * it came from a virtual source), push it back onto the virtual
986                  * engine so that it can be moved across onto another physical
987                  * engine as load dictates.
988                  */
989                 if (likely(rq->execution_mask == engine->mask)) {
990                         GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
991                         if (rq_prio(rq) != prio) {
992                                 prio = rq_prio(rq);
993                                 pl = i915_sched_lookup_priolist(engine, prio);
994                         }
995                         GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
996
997                         list_move(&rq->sched.link, pl);
998                         set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
999
1000                         active = rq;
1001                 } else {
1002                         struct intel_engine_cs *owner = rq->context->engine;
1003
1004                         /*
1005                          * Decouple the virtual breadcrumb before moving it
1006                          * back to the virtual engine -- we don't want the
1007                          * request to complete in the background and try
1008                          * and cancel the breadcrumb on the virtual engine
1009                          * (instead of the old engine where it is linked)!
1010                          */
1011                         if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
1012                                      &rq->fence.flags)) {
1013                                 spin_lock_nested(&rq->lock,
1014                                                  SINGLE_DEPTH_NESTING);
1015                                 i915_request_cancel_breadcrumb(rq);
1016                                 spin_unlock(&rq->lock);
1017                         }
1018                         WRITE_ONCE(rq->engine, owner);
1019                         owner->submit_request(rq);
1020                         active = NULL;
1021                 }
1022         }
1023
1024         return active;
1025 }
1026
1027 struct i915_request *
1028 execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
1029 {
1030         struct intel_engine_cs *engine =
1031                 container_of(execlists, typeof(*engine), execlists);
1032
1033         return __unwind_incomplete_requests(engine);
1034 }
1035
1036 static inline void
1037 execlists_context_status_change(struct i915_request *rq, unsigned long status)
1038 {
1039         /*
1040          * This is only used when GVT-g is enabled. When GVT-g is disabled,
1041          * the compiler should eliminate this function as dead code.
1042          */
1043         if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
1044                 return;
1045
1046         atomic_notifier_call_chain(&rq->engine->context_status_notifier,
1047                                    status, rq);
1048 }
1049
1050 static void intel_engine_context_in(struct intel_engine_cs *engine)
1051 {
1052         unsigned long flags;
1053
1054         if (READ_ONCE(engine->stats.enabled) == 0)
1055                 return;
1056
1057         write_seqlock_irqsave(&engine->stats.lock, flags);
1058
1059         if (engine->stats.enabled > 0) {
1060                 if (engine->stats.active++ == 0)
1061                         engine->stats.start = ktime_get();
1062                 GEM_BUG_ON(engine->stats.active == 0);
1063         }
1064
1065         write_sequnlock_irqrestore(&engine->stats.lock, flags);
1066 }
1067
1068 static void intel_engine_context_out(struct intel_engine_cs *engine)
1069 {
1070         unsigned long flags;
1071
1072         if (READ_ONCE(engine->stats.enabled) == 0)
1073                 return;
1074
1075         write_seqlock_irqsave(&engine->stats.lock, flags);
1076
1077         if (engine->stats.enabled > 0) {
1078                 ktime_t last;
1079
1080                 if (engine->stats.active && --engine->stats.active == 0) {
1081                         /*
1082                          * Decrement the active context count and, in case the GPU
1083                          * is now idle, add the elapsed time to the running total.
1084                          */
1085                         last = ktime_sub(ktime_get(), engine->stats.start);
1086
1087                         engine->stats.total = ktime_add(engine->stats.total,
1088                                                         last);
1089                 } else if (engine->stats.active == 0) {
1090                         /*
1091                          * After turning on engine stats, context out might be
1092                          * the first event, in which case we account from the
1093                          * time stats gathering was turned on.
1094                          */
1095                         last = ktime_sub(ktime_get(), engine->stats.enabled_at);
1096
1097                         engine->stats.total = ktime_add(engine->stats.total,
1098                                                         last);
1099                 }
1100         }
1101
1102         write_sequnlock_irqrestore(&engine->stats.lock, flags);
1103 }
1104
1105 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
1106 {
1107         if (INTEL_GEN(engine->i915) >= 12)
1108                 return 0x60;
1109         else if (INTEL_GEN(engine->i915) >= 9)
1110                 return 0x54;
1111         else if (engine->class == RENDER_CLASS)
1112                 return 0x58;
1113         else
1114                 return -1;
1115 }
1116
1117 static void
1118 execlists_check_context(const struct intel_context *ce,
1119                         const struct intel_engine_cs *engine)
1120 {
1121         const struct intel_ring *ring = ce->ring;
1122         u32 *regs = ce->lrc_reg_state;
1123         bool valid = true;
1124         int x;
1125
1126         if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1127                 pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1128                        engine->name,
1129                        regs[CTX_RING_START],
1130                        i915_ggtt_offset(ring->vma));
1131                 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1132                 valid = false;
1133         }
1134
1135         if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1136             (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1137                 pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1138                        engine->name,
1139                        regs[CTX_RING_CTL],
1140                        (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1141                 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1142                 valid = false;
1143         }
1144
1145         x = lrc_ring_mi_mode(engine);
1146         if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1147                 pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1148                        engine->name, regs[x + 1]);
1149                 regs[x + 1] &= ~STOP_RING;
1150                 regs[x + 1] |= STOP_RING << 16;
1151                 valid = false;
1152         }
1153
1154         WARN_ONCE(!valid, "Invalid lrc state found before submission\n");
1155 }
1156
1157 static void restore_default_state(struct intel_context *ce,
1158                                   struct intel_engine_cs *engine)
1159 {
1160         u32 *regs = ce->lrc_reg_state;
1161
1162         if (engine->pinned_default_state)
1163                 memcpy(regs, /* skip restoring the vanilla PPHWSP */
1164                        engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
1165                        engine->context_size - PAGE_SIZE);
1166
1167         execlists_init_reg_state(regs, ce, engine, ce->ring, false);
1168         ce->runtime.last = intel_context_get_runtime(ce);
1169 }
1170
1171 static void reset_active(struct i915_request *rq,
1172                          struct intel_engine_cs *engine)
1173 {
1174         struct intel_context * const ce = rq->context;
1175         u32 head;
1176
1177         /*
1178          * The executing context has been cancelled. We want to prevent
1179          * further execution along this context and propagate the error on
1180          * to anything depending on its results.
1181          *
1182          * In __i915_request_submit(), we apply the -EIO and remove the
1183          * requests' payloads for any banned requests. But first, we must
1184          * rewind the context back to the start of the incomplete request so
1185          * that we do not jump back into the middle of the batch.
1186          *
1187          * We preserve the breadcrumbs and semaphores of the incomplete
1188          * requests so that inter-timeline dependencies (i.e other timelines)
1189          * remain correctly ordered. And we defer to __i915_request_submit()
1190          * so that all asynchronous waits are correctly handled.
1191          */
1192         ENGINE_TRACE(engine, "{ rq=%llx:%lld }\n",
1193                      rq->fence.context, rq->fence.seqno);
1194
1195         /* On resubmission of the active request, payload will be scrubbed */
1196         if (i915_request_completed(rq))
1197                 head = rq->tail;
1198         else
1199                 head = active_request(ce->timeline, rq)->head;
1200         head = intel_ring_wrap(ce->ring, head);
1201
1202         /* Scrub the context image to prevent replaying the previous batch */
1203         restore_default_state(ce, engine);
1204         __execlists_update_reg_state(ce, engine, head);
1205
1206         /* We've switched away, so this should be a no-op, but intent matters */
1207         ce->lrc_desc |= CTX_DESC_FORCE_RESTORE;
1208 }
1209
1210 static void st_update_runtime_underflow(struct intel_context *ce, s32 dt)
1211 {
1212 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1213         ce->runtime.num_underflow += dt < 0;
1214         ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt);
1215 #endif
1216 }
1217
1218 static void intel_context_update_runtime(struct intel_context *ce)
1219 {
1220         u32 old;
1221         s32 dt;
1222
1223         if (intel_context_is_barrier(ce))
1224                 return;
1225
1226         old = ce->runtime.last;
1227         ce->runtime.last = intel_context_get_runtime(ce);
1228         dt = ce->runtime.last - old;
1229
1230         if (unlikely(dt <= 0)) {
1231                 CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
1232                          old, ce->runtime.last, dt);
1233                 st_update_runtime_underflow(ce, dt);
1234                 return;
1235         }
1236
1237         ewma_runtime_add(&ce->runtime.avg, dt);
1238         ce->runtime.total += dt;
1239 }
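
/*
 * Example of the arithmetic above (informal): CTX_TIMESTAMP is a 32b
 * counter, so the delta is taken modulo 2^32 and interpreted as s32.
 * E.g. old == 0xfffffff0 and last == 0x00000010 gives dt == 0x20, a
 * plausible 32-tick step across the wrap, whereas a stale or rewound
 * read shows up as dt <= 0 and is reported as an underflow instead of
 * being folded into the average.
 */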
1240
1241 static inline struct intel_engine_cs *
1242 __execlists_schedule_in(struct i915_request *rq)
1243 {
1244         struct intel_engine_cs * const engine = rq->engine;
1245         struct intel_context * const ce = rq->context;
1246
1247         intel_context_get(ce);
1248
1249         if (unlikely(intel_context_is_banned(ce)))
1250                 reset_active(rq, engine);
1251
1252         if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1253                 execlists_check_context(ce, engine);
1254
1255         ce->lrc_desc &= ~GENMASK_ULL(47, 37);
1256         if (ce->tag) {
1257                 /* Use a fixed tag for OA and friends */
1258                 ce->lrc_desc |= (u64)ce->tag << 32;
1259         } else {
1260                 /* We don't need a strict matching tag, just different values */
1261                 ce->lrc_desc |=
1262                         (u64)(++engine->context_tag % NUM_CONTEXT_TAG) <<
1263                         GEN11_SW_CTX_ID_SHIFT;
1264                 BUILD_BUG_ON(NUM_CONTEXT_TAG > GEN12_MAX_CONTEXT_HW_ID);
1265         }
1266
1267         __intel_gt_pm_get(engine->gt);
1268         execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
1269         intel_engine_context_in(engine);
1270
1271         return engine;
1272 }
1273
1274 static inline struct i915_request *
1275 execlists_schedule_in(struct i915_request *rq, int idx)
1276 {
1277         struct intel_context * const ce = rq->context;
1278         struct intel_engine_cs *old;
1279
1280         GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine));
1281         trace_i915_request_in(rq, idx);
1282
1283         old = READ_ONCE(ce->inflight);
1284         do {
1285                 if (!old) {
1286                         WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq));
1287                         break;
1288                 }
1289         } while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old)));
1290
1291         GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
1292         return i915_request_get(rq);
1293 }
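
/*
 * Note on the cmpxchg dance above (informal): ce->inflight packs a small
 * submission count into the low bits of the owning engine pointer, e.g.
 *
 *	ce->inflight == NULL            : not in any ELSP
 *	ce->inflight == engine          : one submission in flight
 *	ce->inflight == ptr_inc(engine) : a second submission (lite restore)
 *
 * execlists_schedule_out() walks this back with ptr_dec() and only calls
 * __execlists_schedule_out() once the count reaches zero.
 */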
1294
1295 static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
1296 {
1297         struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
1298         struct i915_request *next = READ_ONCE(ve->request);
1299
1300         if (next && next->execution_mask & ~rq->execution_mask)
1301                 tasklet_schedule(&ve->base.execlists.tasklet);
1302 }
1303
1304 static inline void
1305 __execlists_schedule_out(struct i915_request *rq,
1306                          struct intel_engine_cs * const engine)
1307 {
1308         struct intel_context * const ce = rq->context;
1309
1310         /*
1311          * NB process_csb() is not under the engine->active.lock and hence
1312          * schedule_out can race with schedule_in meaning that we should
1313          * refrain from doing non-trivial work here.
1314          */
1315
1316         /*
1317          * If we have just completed this context, the engine may now be
1318          * idle and we want to re-enter powersaving.
1319          */
1320         if (list_is_last_rcu(&rq->link, &ce->timeline->requests) &&
1321             i915_request_completed(rq))
1322                 intel_engine_add_retire(engine, ce->timeline);
1323
1324         intel_context_update_runtime(ce);
1325         intel_engine_context_out(engine);
1326         execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
1327         intel_gt_pm_put_async(engine->gt);
1328
1329         /*
1330          * If this is part of a virtual engine, its next request may
1331          * have been blocked waiting for access to the active context.
1332          * We have to kick all the siblings again in case we need to
1333          * switch (e.g. the next request is not runnable on this
1334          * engine). Hopefully, we will already have submitted the next
1335          * request before the tasklet runs and do not need to rebuild
1336          * each virtual tree and kick everyone again.
1337          */
1338         if (ce->engine != engine)
1339                 kick_siblings(rq, ce);
1340
1341         intel_context_put(ce);
1342 }
1343
1344 static inline void
1345 execlists_schedule_out(struct i915_request *rq)
1346 {
1347         struct intel_context * const ce = rq->context;
1348         struct intel_engine_cs *cur, *old;
1349
1350         trace_i915_request_out(rq);
1351
1352         old = READ_ONCE(ce->inflight);
1353         do
1354                 cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL;
1355         while (!try_cmpxchg(&ce->inflight, &old, cur));
1356         if (!cur)
1357                 __execlists_schedule_out(rq, old);
1358
1359         i915_request_put(rq);
1360 }
1361
1362 static u64 execlists_update_context(struct i915_request *rq)
1363 {
1364         struct intel_context *ce = rq->context;
1365         u64 desc = ce->lrc_desc;
1366         u32 tail, prev;
1367
1368         /*
1369          * WaIdleLiteRestore:bdw,skl
1370          *
1371          * We should never submit the context with the same RING_TAIL twice
1372          * just in case we submit an empty ring, which confuses the HW.
1373          *
1374          * We append a couple of NOOPs (gen8_emit_wa_tail) after the end of
1375          * the normal request to be able to always advance the RING_TAIL on
1376          * subsequent resubmissions (for lite restore). Should that fail us,
1377          * and we try and submit the same tail again, force the context
1378          * reload.
1379          *
1380          * If we need to return to a preempted context, we need to skip the
1381          * lite-restore and force it to reload the RING_TAIL. Otherwise, the
1382          * HW has a tendency to ignore us rewinding the TAIL to the end of
1383          * an earlier request.
1384          */
1385         tail = intel_ring_set_tail(rq->ring, rq->tail);
1386         prev = ce->lrc_reg_state[CTX_RING_TAIL];
1387         if (unlikely(intel_ring_direction(rq->ring, tail, prev) <= 0))
1388                 desc |= CTX_DESC_FORCE_RESTORE;
1389         ce->lrc_reg_state[CTX_RING_TAIL] = tail;
1390         rq->tail = rq->wa_tail;
1391
1392         /*
1393          * Make sure the context image is complete before we submit it to HW.
1394          *
1395          * Ostensibly, writes (including the WCB) should be flushed prior to
1396          * an uncached write such as our mmio register access, but the empirical
1397          * evidence (esp. on Braswell) suggests that the WC write into memory
1398          * may not be visible to the HW prior to the completion of the UC
1399          * register write and that we may begin execution from the context
1400          * before its image is complete leading to invalid PD chasing.
1401          */
1402         wmb();
1403
1404         ce->lrc_desc &= ~CTX_DESC_FORCE_RESTORE;
1405         return desc;
1406 }
1407
1408 static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
1409 {
1410         if (execlists->ctrl_reg) {
1411                 writel(lower_32_bits(desc), execlists->submit_reg + port * 2);
1412                 writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1);
1413         } else {
1414                 writel(upper_32_bits(desc), execlists->submit_reg);
1415                 writel(lower_32_bits(desc), execlists->submit_reg);
1416         }
1417 }
1418
1419 static __maybe_unused char *
1420 dump_port(char *buf, int buflen, const char *prefix, struct i915_request *rq)
1421 {
1422         if (!rq)
1423                 return "";
1424
1425         snprintf(buf, buflen, "%s%llx:%lld%s prio %d",
1426                  prefix,
1427                  rq->fence.context, rq->fence.seqno,
1428                  i915_request_completed(rq) ? "!" :
1429                  i915_request_started(rq) ? "*" :
1430                  "",
1431                  rq_prio(rq));
1432
1433         return buf;
1434 }
1435
1436 static __maybe_unused void
1437 trace_ports(const struct intel_engine_execlists *execlists,
1438             const char *msg,
1439             struct i915_request * const *ports)
1440 {
1441         const struct intel_engine_cs *engine =
1442                 container_of(execlists, typeof(*engine), execlists);
1443         char __maybe_unused p0[40], p1[40];
1444
1445         if (!ports[0])
1446                 return;
1447
1448         ENGINE_TRACE(engine, "%s { %s%s }\n", msg,
1449                      dump_port(p0, sizeof(p0), "", ports[0]),
1450                      dump_port(p1, sizeof(p1), ", ", ports[1]));
1451 }
1452
1453 static inline bool
1454 reset_in_progress(const struct intel_engine_execlists *execlists)
1455 {
1456         return unlikely(!__tasklet_is_enabled(&execlists->tasklet));
1457 }
1458
1459 static __maybe_unused bool
1460 assert_pending_valid(const struct intel_engine_execlists *execlists,
1461                      const char *msg)
1462 {
1463         struct i915_request * const *port, *rq;
1464         struct intel_context *ce = NULL;
1465         bool sentinel = false;
1466
1467         trace_ports(execlists, msg, execlists->pending);
1468
1469         /* We may be messing around with the lists during reset, lalala */
1470         if (reset_in_progress(execlists))
1471                 return true;
1472
1473         if (!execlists->pending[0]) {
1474                 GEM_TRACE_ERR("Nothing pending for promotion!\n");
1475                 return false;
1476         }
1477
1478         if (execlists->pending[execlists_num_ports(execlists)]) {
1479                 GEM_TRACE_ERR("Excess pending[%d] for promotion!\n",
1480                               execlists_num_ports(execlists));
1481                 return false;
1482         }
1483
1484         for (port = execlists->pending; (rq = *port); port++) {
1485                 unsigned long flags;
1486                 bool ok = true;
1487
1488                 GEM_BUG_ON(!kref_read(&rq->fence.refcount));
1489                 GEM_BUG_ON(!i915_request_is_active(rq));
1490
1491                 if (ce == rq->context) {
1492                         GEM_TRACE_ERR("Dup context:%llx in pending[%zd]\n",
1493                                       ce->timeline->fence_context,
1494                                       port - execlists->pending);
1495                         return false;
1496                 }
1497                 ce = rq->context;
1498
1499                 /*
1500                  * Sentinels are supposed to be lonely so they flush the
1501                  * current execution off the HW. Check that they are the
1502                  * only request in the pending submission.
1503                  */
1504                 if (sentinel) {
1505                         GEM_TRACE_ERR("context:%llx after sentinel in pending[%zd]\n",
1506                                       ce->timeline->fence_context,
1507                                       port - execlists->pending);
1508                         return false;
1509                 }
1510
1511                 sentinel = i915_request_has_sentinel(rq);
1512                 if (sentinel && port != execlists->pending) {
1513                         GEM_TRACE_ERR("sentinel context:%llx not in prime position[%zd]\n",
1514                                       ce->timeline->fence_context,
1515                                       port - execlists->pending);
1516                         return false;
1517                 }
1518
1519                 /* Hold tightly onto the lock to prevent concurrent retires! */
1520                 if (!spin_trylock_irqsave(&rq->lock, flags))
1521                         continue;
1522
1523                 if (i915_request_completed(rq))
1524                         goto unlock;
1525
1526                 if (i915_active_is_idle(&ce->active) &&
1527                     !intel_context_is_barrier(ce)) {
1528                         GEM_TRACE_ERR("Inactive context:%llx in pending[%zd]\n",
1529                                       ce->timeline->fence_context,
1530                                       port - execlists->pending);
1531                         ok = false;
1532                         goto unlock;
1533                 }
1534
1535                 if (!i915_vma_is_pinned(ce->state)) {
1536                         GEM_TRACE_ERR("Unpinned context:%llx in pending[%zd]\n",
1537                                       ce->timeline->fence_context,
1538                                       port - execlists->pending);
1539                         ok = false;
1540                         goto unlock;
1541                 }
1542
1543                 if (!i915_vma_is_pinned(ce->ring->vma)) {
1544                         GEM_TRACE_ERR("Unpinned ring:%llx in pending[%zd]\n",
1545                                       ce->timeline->fence_context,
1546                                       port - execlists->pending);
1547                         ok = false;
1548                         goto unlock;
1549                 }
1550
1551 unlock:
1552                 spin_unlock_irqrestore(&rq->lock, flags);
1553                 if (!ok)
1554                         return false;
1555         }
1556
1557         return ce;
1558 }
1559
1560 static void execlists_submit_ports(struct intel_engine_cs *engine)
1561 {
1562         struct intel_engine_execlists *execlists = &engine->execlists;
1563         unsigned int n;
1564
1565         GEM_BUG_ON(!assert_pending_valid(execlists, "submit"));
1566
1567         /*
1568          * We can skip acquiring intel_runtime_pm_get() here as it was taken
1569          * on our behalf by the request (see i915_gem_mark_busy()) and it will
1570          * not be relinquished until the device is idle (see
1571          * i915_gem_idle_work_handler()). As a precaution, we make sure
1572          * that all ELSP are drained i.e. we have processed the CSB,
1573          * before allowing ourselves to idle and calling intel_runtime_pm_put().
1574          */
1575         GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
1576
1577         /*
1578          * ELSQ note: the submit queue is not cleared after being submitted
1579          * to the HW so we need to make sure we always clean it up. This is
1580          * currently ensured by the fact that we always write the same number
1581          * of elsq entries, keep this in mind before changing the loop below.
1582          */
1583         for (n = execlists_num_ports(execlists); n--; ) {
1584                 struct i915_request *rq = execlists->pending[n];
1585
1586                 write_desc(execlists,
1587                            rq ? execlists_update_context(rq) : 0,
1588                            n);
1589         }
1590
1591         /* we need to manually load the submit queue */
1592         if (execlists->ctrl_reg)
1593                 writel(EL_CTRL_LOAD, execlists->ctrl_reg);
1594 }
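
/*
 * A rough sketch, assuming a 2-port engine: the loop above walks the
 * ports from last to first, so an ELSQ submission becomes, in order,
 *
 *	writel(lo(desc1)); writel(hi(desc1));	(submit_reg + 2, + 3)
 *	writel(lo(desc0)); writel(hi(desc0));	(submit_reg + 0, + 1)
 *	writel(EL_CTRL_LOAD, ctrl_reg);		(load the staged queue)
 *
 * with lo()/hi() standing in for lower_32_bits()/upper_32_bits(), while
 * the legacy ELSP path issues hi(desc1), lo(desc1), hi(desc0), lo(desc0)
 * to the single submit_reg. Empty ports are written as 0, so the submit
 * queue is always fully repopulated, per the ELSQ note above.
 */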
1595
1596 static bool ctx_single_port_submission(const struct intel_context *ce)
1597 {
1598         return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
1599                 intel_context_force_single_submission(ce));
1600 }
1601
1602 static bool can_merge_ctx(const struct intel_context *prev,
1603                           const struct intel_context *next)
1604 {
1605         if (prev != next)
1606                 return false;
1607
1608         if (ctx_single_port_submission(prev))
1609                 return false;
1610
1611         return true;
1612 }
1613
1614 static unsigned long i915_request_flags(const struct i915_request *rq)
1615 {
1616         return READ_ONCE(rq->fence.flags);
1617 }
1618
1619 static bool can_merge_rq(const struct i915_request *prev,
1620                          const struct i915_request *next)
1621 {
1622         GEM_BUG_ON(prev == next);
1623         GEM_BUG_ON(!assert_priority_queue(prev, next));
1624
1625         /*
1626          * We do not submit known completed requests. Therefore if the next
1627          * request is already completed, we can pretend to merge it in
1628          * with the previous context (and we will skip updating the ELSP
1629          * and tracking). Thus hopefully keeping the ELSP full with active
1630          * contexts, despite the best efforts of preempt-to-busy to confuse
1631          * us.
1632          */
1633         if (i915_request_completed(next))
1634                 return true;
1635
1636         if (unlikely((i915_request_flags(prev) ^ i915_request_flags(next)) &
1637                      (BIT(I915_FENCE_FLAG_NOPREEMPT) |
1638                       BIT(I915_FENCE_FLAG_SENTINEL))))
1639                 return false;
1640
1641         if (!can_merge_ctx(prev->context, next->context))
1642                 return false;
1643
1644         GEM_BUG_ON(i915_seqno_passed(prev->fence.seqno, next->fence.seqno));
1645         return true;
1646 }
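
/*
 * In practice this means that back-to-back requests from the same
 * intel_context coalesce into a single ELSP port. As an illustrative
 * example (req1/req2 are hypothetical): if req1 and req2 both belong to
 * context A, neither carries the NOPREEMPT or SENTINEL fence flag, and
 * GVT has not forced single-port submission for A, then the dequeue
 * below simply advances RING_TAIL to the end of req2 and the hardware
 * is never told about req1 separately.
 */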
1647
1648 static void virtual_update_register_offsets(u32 *regs,
1649                                             struct intel_engine_cs *engine)
1650 {
1651         set_offsets(regs, reg_offsets(engine), engine, false);
1652 }
1653
1654 static bool virtual_matches(const struct virtual_engine *ve,
1655                             const struct i915_request *rq,
1656                             const struct intel_engine_cs *engine)
1657 {
1658         const struct intel_engine_cs *inflight;
1659
1660         if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */
1661                 return false;
1662
1663         /*
1664          * We track when the HW has completed saving the context image
1665          * (i.e. when we have seen the final CS event switching out of
1666          * the context) and must not overwrite the context image before
1667          * then. This restricts us to only using the active engine
1668          * while the previous virtualized request is inflight (so
1669          * we reuse the register offsets). This is a very small
1670          * hysteresis on the greedy selection algorithm.
1671          */
1672         inflight = intel_context_inflight(&ve->context);
1673         if (inflight && inflight != engine)
1674                 return false;
1675
1676         return true;
1677 }
1678
1679 static void virtual_xfer_breadcrumbs(struct virtual_engine *ve,
1680                                      struct i915_request *rq)
1681 {
1682         struct intel_engine_cs *old = ve->siblings[0];
1683
1684         /* All unattached (rq->engine == old) must already be completed */
1685
1686         spin_lock(&old->breadcrumbs.irq_lock);
1687         if (!list_empty(&ve->context.signal_link)) {
1688                 list_del_init(&ve->context.signal_link);
1689
1690                 /*
1691                  * We cannot acquire the new engine->breadcrumbs.irq_lock
1692                  * (as we are holding a breadcrumbs.irq_lock already),
1693                  * so attach this request to the signaler on submission.
1694                  * The queued irq_work will occur when we finally drop
1695                  * the engine->active.lock after dequeue.
1696                  */
1697                 set_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &rq->fence.flags);
1698
1699                 /* Also transfer the pending irq_work for the old breadcrumb. */
1700                 intel_engine_signal_breadcrumbs(rq->engine);
1701         }
1702         spin_unlock(&old->breadcrumbs.irq_lock);
1703 }
1704
1705 #define for_each_waiter(p__, rq__) \
1706         list_for_each_entry_lockless(p__, \
1707                                      &(rq__)->sched.waiters_list, \
1708                                      wait_link)
1709
1710 #define for_each_signaler(p__, rq__) \
1711         list_for_each_entry_rcu(p__, \
1712                                 &(rq__)->sched.signalers_list, \
1713                                 signal_link)
1714
1715 static void defer_request(struct i915_request *rq, struct list_head * const pl)
1716 {
1717         LIST_HEAD(list);
1718
1719         /*
1720          * We want to move the interrupted request to the back of
1721          * the round-robin list (i.e. its priority level), but
1722          * in doing so, we must then move all requests that were in
1723          * flight and were waiting for the interrupted request to
1724          * be run after it again.
1725          */
1726         do {
1727                 struct i915_dependency *p;
1728
1729                 GEM_BUG_ON(i915_request_is_active(rq));
1730                 list_move_tail(&rq->sched.link, pl);
1731
1732                 for_each_waiter(p, rq) {
1733                         struct i915_request *w =
1734                                 container_of(p->waiter, typeof(*w), sched);
1735
1736                         /* Leave semaphores spinning on the other engines */
1737                         if (w->engine != rq->engine)
1738                                 continue;
1739
1740                         /* No waiter should start before its signaler */
1741                         GEM_BUG_ON(i915_request_started(w) &&
1742                                    !i915_request_completed(rq));
1743
1744                         GEM_BUG_ON(i915_request_is_active(w));
1745                         if (!i915_request_is_ready(w))
1746                                 continue;
1747
1748                         if (rq_prio(w) < rq_prio(rq))
1749                                 continue;
1750
1751                         GEM_BUG_ON(rq_prio(w) > rq_prio(rq));
1752                         list_move_tail(&w->sched.link, &list);
1753                 }
1754
1755                 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
1756         } while (rq);
1757 }
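
/*
 * A sketch of the effect, assuming request A is timesliced out while
 * same-priority requests B and C on this engine depend on it: A is
 * moved to the tail of its priority level, and the walk above then
 * drags B and C behind it again, so the round-robin rotation never
 * lets a waiter overtake its signaler. Waiters on other engines, and
 * waiters that are not yet ready, are deliberately left alone.
 */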
1758
1759 static void defer_active(struct intel_engine_cs *engine)
1760 {
1761         struct i915_request *rq;
1762
1763         rq = __unwind_incomplete_requests(engine);
1764         if (!rq)
1765                 return;
1766
1767         defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq)));
1768 }
1769
1770 static bool
1771 need_timeslice(const struct intel_engine_cs *engine,
1772                const struct i915_request *rq)
1773 {
1774         int hint;
1775
1776         if (!intel_engine_has_timeslices(engine))
1777                 return false;
1778
1779         hint = engine->execlists.queue_priority_hint;
1780         if (!list_is_last(&rq->sched.link, &engine->active.requests))
1781                 hint = max(hint, rq_prio(list_next_entry(rq, sched.link)));
1782
1783         return hint >= effective_prio(rq);
1784 }
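
/*
 * For illustration: with two runnable contexts of equal priority, the
 * queue_priority_hint is typically no less than the effective priority
 * of the active request, so need_timeslice() reports true and a timer
 * of timeslice_duration_ms is armed (see start_timeslice() and
 * set_timeslice() below). Once that timer expires, or the active
 * context yields a busy semaphore wait, the dequeue defers the active
 * request and the other context gets its turn.
 */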
1785
1786 static bool
1787 timeslice_yield(const struct intel_engine_execlists *el,
1788                 const struct i915_request *rq)
1789 {
1790         /*
1791          * Once bitten, forever smitten!
1792          *
1793          * If the active context ever busy-waited on a semaphore,
1794          * it will be treated as a hog until the end of its timeslice (i.e.
1795          * until it is scheduled out and replaced by a new submission,
1796          * possibly even its own lite-restore). The HW only sends an interrupt
1797          * on the first miss, and we do not know if that semaphore has been
1798          * signaled, or even if it is now stuck on another semaphore. Play
1799          * safe, yield if it might be stuck -- it will be given a fresh
1800          * timeslice in the near future.
1801          */
1802         return upper_32_bits(rq->context->lrc_desc) == READ_ONCE(el->yield);
1803 }
1804
1805 static bool
1806 timeslice_expired(const struct intel_engine_execlists *el,
1807                   const struct i915_request *rq)
1808 {
1809         return timer_expired(&el->timer) || timeslice_yield(el, rq);
1810 }
1811
1812 static int
1813 switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq)
1814 {
1815         if (list_is_last(&rq->sched.link, &engine->active.requests))
1816                 return INT_MIN;
1817
1818         return rq_prio(list_next_entry(rq, sched.link));
1819 }
1820
1821 static inline unsigned long
1822 timeslice(const struct intel_engine_cs *engine)
1823 {
1824         return READ_ONCE(engine->props.timeslice_duration_ms);
1825 }
1826
1827 static unsigned long active_timeslice(const struct intel_engine_cs *engine)
1828 {
1829         const struct intel_engine_execlists *execlists = &engine->execlists;
1830         const struct i915_request *rq = *execlists->active;
1831
1832         if (!rq || i915_request_completed(rq))
1833                 return 0;
1834
1835         if (READ_ONCE(execlists->switch_priority_hint) < effective_prio(rq))
1836                 return 0;
1837
1838         return timeslice(engine);
1839 }
1840
1841 static void set_timeslice(struct intel_engine_cs *engine)
1842 {
1843         unsigned long duration;
1844
1845         if (!intel_engine_has_timeslices(engine))
1846                 return;
1847
1848         duration = active_timeslice(engine);
1849         ENGINE_TRACE(engine, "bump timeslicing, interval:%lu", duration);
1850
1851         set_timer_ms(&engine->execlists.timer, duration);
1852 }
1853
1854 static void start_timeslice(struct intel_engine_cs *engine)
1855 {
1856         struct intel_engine_execlists *execlists = &engine->execlists;
1857         const int prio = queue_prio(execlists);
1858         unsigned long duration;
1859
1860         if (!intel_engine_has_timeslices(engine))
1861                 return;
1862
1863         WRITE_ONCE(execlists->switch_priority_hint, prio);
1864         if (prio == INT_MIN)
1865                 return;
1866
1867         if (timer_pending(&execlists->timer))
1868                 return;
1869
1870         duration = timeslice(engine);
1871         ENGINE_TRACE(engine,
1872                      "start timeslicing, prio:%d, interval:%lu",
1873                      prio, duration);
1874
1875         set_timer_ms(&execlists->timer, duration);
1876 }
1877
1878 static void record_preemption(struct intel_engine_execlists *execlists)
1879 {
1880         (void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++);
1881 }
1882
1883 static unsigned long active_preempt_timeout(struct intel_engine_cs *engine,
1884                                             const struct i915_request *rq)
1885 {
1886         if (!rq)
1887                 return 0;
1888
1889         /* Force a fast reset for terminated contexts (ignoring sysfs!) */
1890         if (unlikely(intel_context_is_banned(rq->context)))
1891                 return 1;
1892
1893         return READ_ONCE(engine->props.preempt_timeout_ms);
1894 }
1895
1896 static void set_preempt_timeout(struct intel_engine_cs *engine,
1897                                 const struct i915_request *rq)
1898 {
1899         if (!intel_engine_has_preempt_reset(engine))
1900                 return;
1901
1902         set_timer_ms(&engine->execlists.preempt,
1903                      active_preempt_timeout(engine, rq));
1904 }
1905
1906 static inline void clear_ports(struct i915_request **ports, int count)
1907 {
1908         memset_p((void **)ports, NULL, count);
1909 }
1910
1911 static void execlists_dequeue(struct intel_engine_cs *engine)
1912 {
1913         struct intel_engine_execlists * const execlists = &engine->execlists;
1914         struct i915_request **port = execlists->pending;
1915         struct i915_request ** const last_port = port + execlists->port_mask;
1916         struct i915_request * const *active;
1917         struct i915_request *last;
1918         struct rb_node *rb;
1919         bool submit = false;
1920
1921         /*
1922          * Hardware submission is through 2 ports. Conceptually each port
1923          * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
1924          * static for a context, and unique to each, so we only execute
1925          * requests belonging to a single context from each ring. RING_HEAD
1926          * is maintained by the CS in the context image, it marks the place
1927          * where it got up to last time, and through RING_TAIL we tell the CS
1928          * where we want to execute up to this time.
1929          *
1930          * In this list the requests are in order of execution. Consecutive
1931          * requests from the same context are adjacent in the ringbuffer. We
1932          * can combine these requests into a single RING_TAIL update:
1933          *
1934          *              RING_HEAD...req1...req2
1935          *                                    ^- RING_TAIL
1936          * since to execute req2 the CS must first execute req1.
1937          *
1938          * Our goal then is to point each port to the end of a consecutive
1939          * sequence of requests, as that is the optimal (fewest wake ups
1940          * and context switches) submission.
1941          */
1942
1943         for (rb = rb_first_cached(&execlists->virtual); rb; ) {
1944                 struct virtual_engine *ve =
1945                         rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1946                 struct i915_request *rq = READ_ONCE(ve->request);
1947
1948                 if (!rq) { /* lazily cleanup after another engine handled rq */
1949                         rb_erase_cached(rb, &execlists->virtual);
1950                         RB_CLEAR_NODE(rb);
1951                         rb = rb_first_cached(&execlists->virtual);
1952                         continue;
1953                 }
1954
1955                 if (!virtual_matches(ve, rq, engine)) {
1956                         rb = rb_next(rb);
1957                         continue;
1958                 }
1959
1960                 break;
1961         }
1962
1963         /*
1964          * If the queue is higher priority than the last
1965          * request in the currently active context, submit afresh.
1966          * We will resubmit again afterwards in case we need to split
1967          * the active context to interject the preemption request,
1968          * i.e. we will retrigger preemption following the ack in case
1969          * of trouble.
1970          */
1971         active = READ_ONCE(execlists->active);
1972
1973         /*
1974          * In theory we can skip over completed contexts that have not
1975          * yet been processed by events (as those events are in flight):
1976          *
1977          * while ((last = *active) && i915_request_completed(last))
1978          *      active++;
1979          *
1980          * However, the GPU cannot handle this as it will ultimately
1981          * find itself trying to jump back into a context it has just
1982          * completed and barf.
1983          */
1984
1985         if ((last = *active)) {
1986                 if (need_preempt(engine, last, rb)) {
1987                         if (i915_request_completed(last)) {
1988                                 tasklet_hi_schedule(&execlists->tasklet);
1989                                 return;
1990                         }
1991
1992                         ENGINE_TRACE(engine,
1993                                      "preempting last=%llx:%lld, prio=%d, hint=%d\n",
1994                                      last->fence.context,
1995                                      last->fence.seqno,
1996                                      last->sched.attr.priority,
1997                                      execlists->queue_priority_hint);
1998                         record_preemption(execlists);
1999
2000                         /*
2001                          * Don't let the RING_HEAD advance past the breadcrumb
2002                          * as we unwind (and until we resubmit) so that we do
2003                          * not accidentally tell it to go backwards.
2004                          */
2005                         ring_set_paused(engine, 1);
2006
2007                         /*
2008                          * Note that we have not stopped the GPU at this point,
2009                          * so we are unwinding the incomplete requests as they
2010                          * remain inflight and so by the time we do complete
2011                          * the preemption, some of the unwound requests may
2012                          * complete!
2013                          */
2014                         __unwind_incomplete_requests(engine);
2015
2016                         last = NULL;
2017                 } else if (need_timeslice(engine, last) &&
2018                            timeslice_expired(execlists, last)) {
2019                         if (i915_request_completed(last)) {
2020                                 tasklet_hi_schedule(&execlists->tasklet);
2021                                 return;
2022                         }
2023
2024                         ENGINE_TRACE(engine,
2025                                      "expired last=%llx:%lld, prio=%d, hint=%d, yield?=%s\n",
2026                                      last->fence.context,
2027                                      last->fence.seqno,
2028                                      last->sched.attr.priority,
2029                                      execlists->queue_priority_hint,
2030                                      yesno(timeslice_yield(execlists, last)));
2031
2032                         ring_set_paused(engine, 1);
2033                         defer_active(engine);
2034
2035                         /*
2036                          * Unlike for preemption, if we rewind and continue
2037                          * executing the same context as previously active,
2038                          * the order of execution will remain the same and
2039                          * the tail will only advance. We do not need to
2040                          * force a full context restore, as a lite-restore
2041                          * is sufficient to resample the monotonic TAIL.
2042                          *
2043                          * If we switch to any other context, similarly we
2044                          * will not rewind TAIL of current context, and
2045                          * normal save/restore will preserve state and allow
2046                          * us to later continue executing the same request.
2047                          */
2048                         last = NULL;
2049                 } else {
2050                         /*
2051                          * Otherwise if we already have a request pending
2052                          * for execution after the current one, we can
2053                          * just wait until the next CS event before
2054                          * queuing more. In either case we will force a
2055                          * lite-restore preemption event, but if we wait
2056                          * we hopefully coalesce several updates into a single
2057                          * submission.
2058                          */
2059                         if (!list_is_last(&last->sched.link,
2060                                           &engine->active.requests)) {
2061                                 /*
2062                                  * Even if ELSP[1] is occupied and not worthy
2063                                  * of timeslices, our queue might be.
2064                                  */
2065                                 start_timeslice(engine);
2066                                 return;
2067                         }
2068                 }
2069         }
2070
2071         while (rb) { /* XXX virtual is always taking precedence */
2072                 struct virtual_engine *ve =
2073                         rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
2074                 struct i915_request *rq;
2075
2076                 spin_lock(&ve->base.active.lock);
2077
2078                 rq = ve->request;
2079                 if (unlikely(!rq)) { /* lost the race to a sibling */
2080                         spin_unlock(&ve->base.active.lock);
2081                         rb_erase_cached(rb, &execlists->virtual);
2082                         RB_CLEAR_NODE(rb);
2083                         rb = rb_first_cached(&execlists->virtual);
2084                         continue;
2085                 }
2086
2087                 GEM_BUG_ON(rq != ve->request);
2088                 GEM_BUG_ON(rq->engine != &ve->base);
2089                 GEM_BUG_ON(rq->context != &ve->context);
2090
2091                 if (rq_prio(rq) >= queue_prio(execlists)) {
2092                         if (!virtual_matches(ve, rq, engine)) {
2093                                 spin_unlock(&ve->base.active.lock);
2094                                 rb = rb_next(rb);
2095                                 continue;
2096                         }
2097
2098                         if (last && !can_merge_rq(last, rq)) {
2099                                 spin_unlock(&ve->base.active.lock);
2100                                 start_timeslice(engine);
2101                                 return; /* leave this for another sibling */
2102                         }
2103
2104                         ENGINE_TRACE(engine,
2105                                      "virtual rq=%llx:%lld%s, new engine? %s\n",
2106                                      rq->fence.context,
2107                                      rq->fence.seqno,
2108                                      i915_request_completed(rq) ? "!" :
2109                                      i915_request_started(rq) ? "*" :
2110                                      "",
2111                                      yesno(engine != ve->siblings[0]));
2112
2113                         WRITE_ONCE(ve->request, NULL);
2114                         WRITE_ONCE(ve->base.execlists.queue_priority_hint,
2115                                    INT_MIN);
2116                         rb_erase_cached(rb, &execlists->virtual);
2117                         RB_CLEAR_NODE(rb);
2118
2119                         GEM_BUG_ON(!(rq->execution_mask & engine->mask));
2120                         WRITE_ONCE(rq->engine, engine);
2121
2122                         if (engine != ve->siblings[0]) {
2123                                 u32 *regs = ve->context.lrc_reg_state;
2124                                 unsigned int n;
2125
2126                                 GEM_BUG_ON(READ_ONCE(ve->context.inflight));
2127
2128                                 if (!intel_engine_has_relative_mmio(engine))
2129                                         virtual_update_register_offsets(regs,
2130                                                                         engine);
2131
2132                                 if (!list_empty(&ve->context.signals))
2133                                         virtual_xfer_breadcrumbs(ve, rq);
2134
2135                                 /*
2136                                  * Move the bound engine to the top of the list
2137                                  * for future execution. We then kick this
2138                                  * tasklet first before checking others, so that
2139                                  * we preferentially reuse this set of bound
2140                                  * registers.
2141                                  */
2142                                 for (n = 1; n < ve->num_siblings; n++) {
2143                                         if (ve->siblings[n] == engine) {
2144                                                 swap(ve->siblings[n],
2145                                                      ve->siblings[0]);
2146                                                 break;
2147                                         }
2148                                 }
2149
2150                                 GEM_BUG_ON(ve->siblings[0] != engine);
2151                         }
2152
2153                         if (__i915_request_submit(rq)) {
2154                                 submit = true;
2155                                 last = rq;
2156                         }
2157                         i915_request_put(rq);
2158
2159                         /*
2160                          * Hmm, we have a bunch of virtual engine requests,
2161                          * but the first one was already completed (thanks
2162                          * preempt-to-busy!). Keep looking at the veng queue
2163                          * until we have no more relevant requests (i.e.
2164                          * the normal submit queue has higher priority).
2165                          */
2166                         if (!submit) {
2167                                 spin_unlock(&ve->base.active.lock);
2168                                 rb = rb_first_cached(&execlists->virtual);
2169                                 continue;
2170                         }
2171                 }
2172
2173                 spin_unlock(&ve->base.active.lock);
2174                 break;
2175         }
2176
2177         while ((rb = rb_first_cached(&execlists->queue))) {
2178                 struct i915_priolist *p = to_priolist(rb);
2179                 struct i915_request *rq, *rn;
2180                 int i;
2181
2182                 priolist_for_each_request_consume(rq, rn, p, i) {
2183                         bool merge = true;
2184
2185                         /*
2186                          * Can we combine this request with the current port?
2187                          * It has to be the same context/ringbuffer and not
2188                          * have any exceptions (e.g. GVT saying never to
2189                          * combine contexts).
2190                          *
2191                          * If we can combine the requests, we can execute both
2192                          * by updating the RING_TAIL to point to the end of the
2193                          * second request, and so we never need to tell the
2194                          * hardware about the first.
2195                          */
2196                         if (last && !can_merge_rq(last, rq)) {
2197                                 /*
2198                                  * If we are on the second port and cannot
2199                                  * combine this request with the last, then we
2200                                  * are done.
2201                                  */
2202                                 if (port == last_port)
2203                                         goto done;
2204
2205                                 /*
2206                                  * We must not populate both ELSP[] with the
2207                                  * same LRCA, i.e. we must submit 2 different
2208                                  * contexts if we submit 2 ELSP.
2209                                  */
2210                                 if (last->context == rq->context)
2211                                         goto done;
2212
2213                                 if (i915_request_has_sentinel(last))
2214                                         goto done;
2215
2216                                 /*
2217                                  * If GVT overrides us we only ever submit
2218                                  * port[0], leaving port[1] empty. Note that we
2219                                  * also have to be careful that we don't queue
2220                                  * the same context (even though a different
2221                                  * request) to the second port.
2222                                  */
2223                                 if (ctx_single_port_submission(last->context) ||
2224                                     ctx_single_port_submission(rq->context))
2225                                         goto done;
2226
2227                                 merge = false;
2228                         }
2229
2230                         if (__i915_request_submit(rq)) {
2231                                 if (!merge) {
2232                                         *port = execlists_schedule_in(last, port - execlists->pending);
2233                                         port++;
2234                                         last = NULL;
2235                                 }
2236
2237                                 GEM_BUG_ON(last &&
2238                                            !can_merge_ctx(last->context,
2239                                                           rq->context));
2240                                 GEM_BUG_ON(last &&
2241                                            i915_seqno_passed(last->fence.seqno,
2242                                                              rq->fence.seqno));
2243
2244                                 submit = true;
2245                                 last = rq;
2246                         }
2247                 }
2248
2249                 rb_erase_cached(&p->node, &execlists->queue);
2250                 i915_priolist_free(p);
2251         }
2252
2253 done:
2254         /*
2255          * Here be a bit of magic! Or sleight-of-hand, whichever you prefer.
2256          *
2257          * We choose the priority hint such that if we add a request of greater
2258          * priority than this, we kick the submission tasklet to decide on
2259          * the right order of submitting the requests to hardware. We must
2260          * also be prepared to reorder requests as they are in-flight on the
2261          * HW. We derive the priority hint then as the first "hole" in
2262          * the HW submission ports and if there are no available slots,
2263          * the priority of the lowest executing request, i.e. last.
2264          *
2265          * When we do receive a higher priority request ready to run from the
2266          * user, see queue_request(), the priority hint is bumped to that
2267          * request triggering preemption on the next dequeue (or subsequent
2268          * interrupt for secondary ports).
2269          */
2270         execlists->queue_priority_hint = queue_prio(execlists);
2271
2272         if (submit) {
2273                 *port = execlists_schedule_in(last, port - execlists->pending);
2274                 execlists->switch_priority_hint =
2275                         switch_prio(engine, *execlists->pending);
2276
2277                 /*
2278                  * Skip if we ended up with exactly the same set of requests,
2279                  * e.g. trying to timeslice a pair of ordered contexts
2280                  */
2281                 if (!memcmp(active, execlists->pending,
2282                             (port - execlists->pending + 1) * sizeof(*port))) {
2283                         do
2284                                 execlists_schedule_out(fetch_and_zero(port));
2285                         while (port-- != execlists->pending);
2286
2287                         goto skip_submit;
2288                 }
2289                 clear_ports(port + 1, last_port - port);
2290
2291                 WRITE_ONCE(execlists->yield, -1);
2292                 execlists_submit_ports(engine);
2293                 set_preempt_timeout(engine, *active);
2294         } else {
2295 skip_submit:
2296                 ring_set_paused(engine, 0);
2297         }
2298 }
2299
2300 static void
2301 cancel_port_requests(struct intel_engine_execlists * const execlists)
2302 {
2303         struct i915_request * const *port;
2304
2305         for (port = execlists->pending; *port; port++)
2306                 execlists_schedule_out(*port);
2307         clear_ports(execlists->pending, ARRAY_SIZE(execlists->pending));
2308
2309         /* Mark the end of active before we overwrite *active */
2310         for (port = xchg(&execlists->active, execlists->pending); *port; port++)
2311                 execlists_schedule_out(*port);
2312         clear_ports(execlists->inflight, ARRAY_SIZE(execlists->inflight));
2313
2314         smp_wmb(); /* complete the seqlock for execlists_active() */
2315         WRITE_ONCE(execlists->active, execlists->inflight);
2316 }
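
/*
 * Presumably mirroring the promotion path in process_csb() below:
 * execlists->active is only ever repointed at an array that has already
 * been fully written, with an smp_wmb() beforehand, so a concurrent
 * execlists_active() reader sees either the old set of ports or the new
 * one, never a half-populated mix.
 */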
2317
2318 static inline void
2319 invalidate_csb_entries(const u32 *first, const u32 *last)
2320 {
2321         clflush((void *)first);
2322         clflush((void *)last);
2323 }
2324
2325 /*
2326  * Starting with Gen12, the status has a new format:
2327  *
2328  *     bit  0:     switched to new queue
2329  *     bit  1:     reserved
2330  *     bit  2:     semaphore wait mode (poll or signal), only valid when
2331  *                 switch detail is set to "wait on semaphore"
2332  *     bits 3-5:   engine class
2333  *     bits 6-11:  engine instance
2334  *     bits 12-14: reserved
2335  *     bits 15-25: sw context id of the lrc the GT switched to
2336  *     bits 26-31: sw counter of the lrc the GT switched to
2337  *     bits 32-35: context switch detail
2338  *                  - 0: ctx complete
2339  *                  - 1: wait on sync flip
2340  *                  - 2: wait on vblank
2341  *                  - 3: wait on scanline
2342  *                  - 4: wait on semaphore
2343  *                  - 5: context preempted (not on SEMAPHORE_WAIT or
2344  *                       WAIT_FOR_EVENT)
2345  *     bit  36:    reserved
2346  *     bits 37-43: wait detail (for switch detail 1 to 4)
2347  *     bits 44-46: reserved
2348  *     bits 47-57: sw context id of the lrc the GT switched away from
2349  *     bits 58-63: sw counter of the lrc the GT switched away from
2350  */
2351 static inline bool
2352 gen12_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
2353 {
2354         u32 lower_dw = csb[0];
2355         u32 upper_dw = csb[1];
2356         bool ctx_to_valid = GEN12_CSB_CTX_VALID(lower_dw);
2357         bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_dw);
2358         bool new_queue = lower_dw & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE;
2359
2360         /*
2361          * The context switch detail is not guaranteed to be 5 when a preemption
2362          * occurs, so we can't just check for that. The check below works for
2363          * all the cases we care about, including preemptions of WAIT
2364          * instructions and lite-restore. Preempt-to-idle via the CTRL register
2365          * would require some extra handling, but we don't support that.
2366          */
2367         if (!ctx_away_valid || new_queue) {
2368                 GEM_BUG_ON(!ctx_to_valid);
2369                 return true;
2370         }
2371
2372         /*
2373          * switch detail = 5 is covered by the case above and we do not expect a
2374          * context switch on an unsuccessful wait instruction since we always
2375          * use polling mode.
2376          */
2377         GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_dw));
2378         return false;
2379 }
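
/*
 * A worked reading, derived from the bit layout documented above: an
 * event whose low dword has bit 0 set ("switched to new queue") and a
 * valid sw context id in bits 15-25 is treated as a promotion, so
 * process_csb() swaps execlists->pending into execlists->inflight; a
 * plain "ctx complete" event, where only the outgoing context (bits
 * 47-57) remains valid and bit 0 is clear, merely advances
 * execlists->active past the finished port.
 */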
2380
2381 static inline bool
2382 gen8_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
2383 {
2384         return *csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED);
2385 }
2386
2387 static inline void flush_hwsp(const struct i915_request *rq)
2388 {
2389         mb();
2390         clflush((void *)READ_ONCE(rq->hwsp_seqno));
2391         mb();
2392 }
2393
2394 static void process_csb(struct intel_engine_cs *engine)
2395 {
2396         struct intel_engine_execlists * const execlists = &engine->execlists;
2397         const u32 * const buf = execlists->csb_status;
2398         const u8 num_entries = execlists->csb_size;
2399         u8 head, tail;
2400
2401         /*
2402          * As we modify our execlists state tracking we require exclusive
2403          * access. Either we are inside the tasklet, or the tasklet is disabled
2404          * and we assume that is only inside the reset paths and so serialised.
2405          */
2406         GEM_BUG_ON(!tasklet_is_locked(&execlists->tasklet) &&
2407                    !reset_in_progress(execlists));
2408         GEM_BUG_ON(!intel_engine_in_execlists_submission_mode(engine));
2409
2410         /*
2411          * Note that csb_write, csb_status may be either in HWSP or mmio.
2412          * When reading from the csb_write mmio register, we have to be
2413          * careful to only use the GEN8_CSB_WRITE_PTR portion, which is
2414          * the low 4 bits. As it happens we know the next 4 bits are always
2415          * zero and so we can simply mask off the low u8 of the register
2416          * and treat it identically to reading from the HWSP (without having
2417          * to use explicit shifting and masking, and probably bifurcating
2418          * the code to handle the legacy mmio read).
2419          */
2420         head = execlists->csb_head;
2421         tail = READ_ONCE(*execlists->csb_write);
2422         if (unlikely(head == tail))
2423                 return;
2424
2425         /*
2426          * Hopefully paired with a wmb() in HW!
2427          *
2428          * We must complete the read of the write pointer before any reads
2429          * from the CSB, so that we do not see stale values. Without an rmb
2430          * (lfence) the HW may speculatively perform the CSB[] reads *before*
2431          * we perform the READ_ONCE(*csb_write).
2432          */
2433         rmb();
2434
2435         ENGINE_TRACE(engine, "cs-irq head=%d, tail=%d\n", head, tail);
2436         do {
2437                 bool promote;
2438
2439                 if (++head == num_entries)
2440                         head = 0;
2441
2442                 /*
2443                  * We are flying near dragons again.
2444                  *
2445                  * We hold a reference to the request in execlist_port[]
2446                  * but no more than that. We are operating in softirq
2447                  * context and so cannot hold any mutex or sleep. That
2448                  * prevents us from stopping the requests we are processing
2449                  * in port[] from being retired simultaneously (the
2450                  * breadcrumb will be complete before we see the
2451                  * context-switch). As we only hold the reference to the
2452                  * request, any pointer chasing underneath the request
2453                  * is subject to a potential use-after-free. Thus we
2454                  * store all of the bookkeeping within port[] as
2455                  * required, and avoid using unguarded pointers beneath
2456                  * request itself. The same applies to the atomic
2457                  * status notifier.
2458                  */
2459
2460                 ENGINE_TRACE(engine, "csb[%d]: status=0x%08x:0x%08x\n",
2461                              head, buf[2 * head + 0], buf[2 * head + 1]);
2462
2463                 if (INTEL_GEN(engine->i915) >= 12)
2464                         promote = gen12_csb_parse(execlists, buf + 2 * head);
2465                 else
2466                         promote = gen8_csb_parse(execlists, buf + 2 * head);
2467                 if (promote) {
2468                         struct i915_request * const *old = execlists->active;
2469
2470                         ring_set_paused(engine, 0);
2471
2472                         /* Point active to the new ELSP; prevent overwriting */
2473                         WRITE_ONCE(execlists->active, execlists->pending);
2474                         smp_wmb(); /* notify execlists_active() */
2475
2476                         /* cancel old inflight, prepare for switch */
2477                         trace_ports(execlists, "preempted", old);
2478                         while (*old)
2479                                 execlists_schedule_out(*old++);
2480
2481                         /* switch pending to inflight */
2482                         GEM_BUG_ON(!assert_pending_valid(execlists, "promote"));
2483                         memcpy(execlists->inflight,
2484                                execlists->pending,
2485                                execlists_num_ports(execlists) *
2486                                sizeof(*execlists->pending));
2487                         smp_wmb(); /* complete the seqlock */
2488                         WRITE_ONCE(execlists->active, execlists->inflight);
2489
2490                         WRITE_ONCE(execlists->pending[0], NULL);
2491                 } else {
2492                         GEM_BUG_ON(!*execlists->active);
2493
2494                         /* port0 completed, advanced to port1 */
2495                         trace_ports(execlists, "completed", execlists->active);
2496
2497                         /*
2498                          * We rely on the hardware being strongly
2499                          * ordered, that the breadcrumb write is
2500                          * coherent (visible from the CPU) before the
2501                          * user interrupt and CSB is processed.
2502                          */
2503                         if (GEM_SHOW_DEBUG() &&
2504                             !i915_request_completed(*execlists->active)) {
2505                                 struct i915_request *rq = *execlists->active;
2506                                 const u32 *regs __maybe_unused =
2507                                         rq->context->lrc_reg_state;
2508
2509                                 /*
2510                                  * Flush the breadcrumb before crying foul.
2511                                  *
2512                                  * Since we have hit this on icl and seen the
2513                                  * breadcrumb advance as we print out the debug
2514                                  * info (so the problem corrected itself without
2515                                  * lasting damage), and we know that icl suffers
2516                                  * from missing global observation points in
2517                                  * execlists, presume that it affects coherency
2518                                  * even more widely.
2519                                  */
2520                                 flush_hwsp(rq);
2521
2522                                 ENGINE_TRACE(engine,
2523                                              "ring:{start:0x%08x, head:%04x, tail:%04x, ctl:%08x, mode:%08x}\n",
2524                                              ENGINE_READ(engine, RING_START),
2525                                              ENGINE_READ(engine, RING_HEAD) & HEAD_ADDR,
2526                                              ENGINE_READ(engine, RING_TAIL) & TAIL_ADDR,
2527                                              ENGINE_READ(engine, RING_CTL),
2528                                              ENGINE_READ(engine, RING_MI_MODE));
2529                                 ENGINE_TRACE(engine,
2530                                              "rq:{start:%08x, head:%04x, tail:%04x, seqno:%llx:%d, hwsp:%d}, ",
2531                                              i915_ggtt_offset(rq->ring->vma),
2532                                              rq->head, rq->tail,
2533                                              rq->fence.context,
2534                                              lower_32_bits(rq->fence.seqno),
2535                                              hwsp_seqno(rq));
2536                                 ENGINE_TRACE(engine,
2537                                              "ctx:{start:%08x, head:%04x, tail:%04x}, ",
2538                                              regs[CTX_RING_START],
2539                                              regs[CTX_RING_HEAD],
2540                                              regs[CTX_RING_TAIL]);
2541
2542                                 /* Still? Declare it kaput! */
2543                                 if (!i915_request_completed(rq) &&
2544                                     !reset_in_progress(execlists))
2545                                         GEM_BUG_ON("context completed before request");
2546                         }
2547
2548                         execlists_schedule_out(*execlists->active++);
2549
2550                         GEM_BUG_ON(execlists->active - execlists->inflight >
2551                                    execlists_num_ports(execlists));
2552                 }
2553         } while (head != tail);
2554
2555         execlists->csb_head = head;
2556         set_timeslice(engine);
2557
2558         /*
2559          * Gen11 has proven to fail wrt global observation point between
2560          * entry and tail update, failing on the ordering and thus
2561          * we see an old entry in the context status buffer.
2562          *
2563          * Forcibly evict the entries for the next gpu csb update,
2564          * to increase the odds that we get fresh entries even with
2565          * non-working hardware. The cost of doing so mostly comes out
2566          * in the wash, as the hardware, working or not, will need to
2567          * do the invalidation beforehand anyway.
2568          */
2569         invalidate_csb_entries(&buf[0], &buf[num_entries - 1]);
2570 }
2571
2572 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
2573 {
2574         lockdep_assert_held(&engine->active.lock);
2575         if (!READ_ONCE(engine->execlists.pending[0])) {
2576                 rcu_read_lock(); /* protect peeking at execlists->active */
2577                 execlists_dequeue(engine);
2578                 rcu_read_unlock();
2579         }
2580 }
2581
2582 static void __execlists_hold(struct i915_request *rq)
2583 {
2584         LIST_HEAD(list);
2585
2586         do {
2587                 struct i915_dependency *p;
2588
2589                 if (i915_request_is_active(rq))
2590                         __i915_request_unsubmit(rq);
2591
2592                 clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2593                 list_move_tail(&rq->sched.link, &rq->engine->active.hold);
2594                 i915_request_set_hold(rq);
2595                 RQ_TRACE(rq, "on hold\n");
2596
2597                 for_each_waiter(p, rq) {
2598                         struct i915_request *w =
2599                                 container_of(p->waiter, typeof(*w), sched);
2600
2601                         /* Leave semaphores spinning on the other engines */
2602                         if (w->engine != rq->engine)
2603                                 continue;
2604
2605                         if (!i915_request_is_ready(w))
2606                                 continue;
2607
2608                         if (i915_request_completed(w))
2609                                 continue;
2610
2611                         if (i915_request_on_hold(w))
2612                                 continue;
2613
2614                         list_move_tail(&w->sched.link, &list);
2615                 }
2616
2617                 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2618         } while (rq);
2619 }
2620
2621 static bool execlists_hold(struct intel_engine_cs *engine,
2622                            struct i915_request *rq)
2623 {
2624         spin_lock_irq(&engine->active.lock);
2625
2626         if (i915_request_completed(rq)) { /* too late! */
2627                 rq = NULL;
2628                 goto unlock;
2629         }
2630
2631         if (rq->engine != engine) { /* preempted virtual engine */
2632                 struct virtual_engine *ve = to_virtual_engine(rq->engine);
2633
2634                 /*
2635                  * intel_context_inflight() is only protected by virtue
2636                  * of process_csb() being called only by the tasklet (or
2637                  * directly from inside reset while the tasklet is suspended).
2638                  * Assert that neither of those are allowed to run while we
2639                  * poke at the request queues.
2640                  */
2641                 GEM_BUG_ON(!reset_in_progress(&engine->execlists));
2642
2643                 /*
2644                  * An unsubmitted request along a virtual engine will
2645                  * remain on the active (this) engine until we are able
2646                  * to process the context switch away (and so mark the
2647                  * context as no longer in flight). That cannot have happened
2648                  * yet, otherwise we would not be hanging!
2649                  */
2650                 spin_lock(&ve->base.active.lock);
2651                 GEM_BUG_ON(intel_context_inflight(rq->context) != engine);
2652                 GEM_BUG_ON(ve->request != rq);
2653                 ve->request = NULL;
2654                 spin_unlock(&ve->base.active.lock);
2655                 i915_request_put(rq);
2656
2657                 rq->engine = engine;
2658         }
2659
2660         /*
2661          * Transfer this request onto the hold queue to prevent it
2662          * being resubmitted to HW (and potentially completed) before we have
2663          * released it. Since we may have already submitted following
2664          * requests, we need to remove those as well.
2665          */
2666         GEM_BUG_ON(i915_request_on_hold(rq));
2667         GEM_BUG_ON(rq->engine != engine);
2668         __execlists_hold(rq);
2669         GEM_BUG_ON(list_empty(&engine->active.hold));
2670
2671 unlock:
2672         spin_unlock_irq(&engine->active.lock);
2673         return rq;
2674 }
2675
2676 static bool hold_request(const struct i915_request *rq)
2677 {
2678         struct i915_dependency *p;
2679         bool result = false;
2680
2681         /*
2682          * If one of our ancestors is on hold, we must also be on hold,
2683          * otherwise we will bypass it and execute before it.
2684          */
2685         rcu_read_lock();
2686         for_each_signaler(p, rq) {
2687                 const struct i915_request *s =
2688                         container_of(p->signaler, typeof(*s), sched);
2689
2690                 if (s->engine != rq->engine)
2691                         continue;
2692
2693                 result = i915_request_on_hold(s);
2694                 if (result)
2695                         break;
2696         }
2697         rcu_read_unlock();
2698
2699         return result;
2700 }
2701
2702 static void __execlists_unhold(struct i915_request *rq)
2703 {
2704         LIST_HEAD(list);
2705
2706         do {
2707                 struct i915_dependency *p;
2708
2709                 RQ_TRACE(rq, "hold release\n");
2710
2711                 GEM_BUG_ON(!i915_request_on_hold(rq));
2712                 GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit));
2713
2714                 i915_request_clear_hold(rq);
2715                 list_move_tail(&rq->sched.link,
2716                                i915_sched_lookup_priolist(rq->engine,
2717                                                           rq_prio(rq)));
2718                 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2719
2720                 /* Also release any children on this engine that are ready */
2721                 for_each_waiter(p, rq) {
2722                         struct i915_request *w =
2723                                 container_of(p->waiter, typeof(*w), sched);
2724
2725                         /* Propagate any change in error status */
2726                         if (rq->fence.error)
2727                                 i915_request_set_error_once(w, rq->fence.error);
2728
2729                         if (w->engine != rq->engine)
2730                                 continue;
2731
2732                         if (!i915_request_on_hold(w))
2733                                 continue;
2734
2735                         /* Check that no other parents are also on hold */
2736                         if (hold_request(w))
2737                                 continue;
2738
2739                         list_move_tail(&w->sched.link, &list);
2740                 }
2741
2742                 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2743         } while (rq);
2744 }
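/*
 * Note on __execlists_unhold() above: the walk is iterative rather than
 * recursive. Each request moved back to the priority queue may make its
 * own waiters on this engine eligible, so they are collected on a local
 * list and processed in turn.
 */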
2745
2746 static void execlists_unhold(struct intel_engine_cs *engine,
2747                              struct i915_request *rq)
2748 {
2749         spin_lock_irq(&engine->active.lock);
2750
2751         /*
2752          * Move this request back to the priority queue, and all of its
2753          * children and grandchildren that were suspended along with it.
2754          */
2755         __execlists_unhold(rq);
2756
2757         if (rq_prio(rq) > engine->execlists.queue_priority_hint) {
2758                 engine->execlists.queue_priority_hint = rq_prio(rq);
2759                 tasklet_hi_schedule(&engine->execlists.tasklet);
2760         }
2761
2762         spin_unlock_irq(&engine->active.lock);
2763 }
2764
2765 struct execlists_capture {
2766         struct work_struct work;
2767         struct i915_request *rq;
2768         struct i915_gpu_coredump *error;
2769 };
2770
2771 static void execlists_capture_work(struct work_struct *work)
2772 {
2773         struct execlists_capture *cap = container_of(work, typeof(*cap), work);
2774         const gfp_t gfp = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN;
2775         struct intel_engine_cs *engine = cap->rq->engine;
2776         struct intel_gt_coredump *gt = cap->error->gt;
2777         struct intel_engine_capture_vma *vma;
2778
2779         /* Compress all the objects attached to the request, slow! */
2780         vma = intel_engine_coredump_add_request(gt->engine, cap->rq, gfp);
2781         if (vma) {
2782                 struct i915_vma_compress *compress =
2783                         i915_vma_capture_prepare(gt);
2784
2785                 intel_engine_coredump_add_vma(gt->engine, vma, compress);
2786                 i915_vma_capture_finish(gt, compress);
2787         }
2788
2789         gt->simulated = gt->engine->simulated;
2790         cap->error->simulated = gt->simulated;
2791
2792         /* Publish the error state, and announce it to the world */
2793         i915_error_state_store(cap->error);
2794         i915_gpu_coredump_put(cap->error);
2795
2796         /* Return this request and all that depend upon it for signaling */
2797         execlists_unhold(engine, cap->rq);
2798         i915_request_put(cap->rq);
2799
2800         kfree(cap);
2801 }
2802
2803 static struct execlists_capture *capture_regs(struct intel_engine_cs *engine)
2804 {
2805         const gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
2806         struct execlists_capture *cap;
2807
2808         cap = kmalloc(sizeof(*cap), gfp);
2809         if (!cap)
2810                 return NULL;
2811
2812         cap->error = i915_gpu_coredump_alloc(engine->i915, gfp);
2813         if (!cap->error)
2814                 goto err_cap;
2815
2816         cap->error->gt = intel_gt_coredump_alloc(engine->gt, gfp);
2817         if (!cap->error->gt)
2818                 goto err_gpu;
2819
2820         cap->error->gt->engine = intel_engine_coredump_alloc(engine, gfp);
2821         if (!cap->error->gt->engine)
2822                 goto err_gt;
2823
2824         return cap;
2825
2826 err_gt:
2827         kfree(cap->error->gt);
2828 err_gpu:
2829         kfree(cap->error);
2830 err_cap:
2831         kfree(cap);
2832         return NULL;
2833 }
2834
2835 static struct i915_request *
2836 active_context(struct intel_engine_cs *engine, u32 ccid)
2837 {
2838         const struct intel_engine_execlists * const el = &engine->execlists;
2839         struct i915_request * const *port, *rq;
2840
2841         /*
2842          * Use the most recent result from process_csb(), but just in case
2843          * we trigger an error (via interrupt) before the first CS event has
2844          * been written, peek at the next submission.
2845          */
2846
2847         for (port = el->active; (rq = *port); port++) {
2848                 if (upper_32_bits(rq->context->lrc_desc) == ccid) {
2849                         ENGINE_TRACE(engine,
2850                                      "ccid found at active:%zd\n",
2851                                      port - el->active);
2852                         return rq;
2853                 }
2854         }
2855
2856         for (port = el->pending; (rq = *port); port++) {
2857                 if (upper_32_bits(rq->context->lrc_desc) == ccid) {
2858                         ENGINE_TRACE(engine,
2859                                      "ccid found at pending:%zd\n",
2860                                      port - el->pending);
2861                         return rq;
2862                 }
2863         }
2864
2865         ENGINE_TRACE(engine, "ccid:%x not found\n", ccid);
2866         return NULL;
2867 }
2868
2869 static u32 active_ccid(struct intel_engine_cs *engine)
2870 {
2871         return ENGINE_READ_FW(engine, RING_EXECLIST_STATUS_HI);
2872 }
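/*
 * Note: the upper dword of the execlists status register is expected to
 * hold the context ID of the context currently on the HW, i.e. the upper
 * half of the lrc descriptor, which is why active_context() compares it
 * against upper_32_bits(rq->context->lrc_desc).
 */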
2873
2874 static bool execlists_capture(struct intel_engine_cs *engine)
2875 {
2876         struct execlists_capture *cap;
2877
2878         if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR))
2879                 return true;
2880
2881         /*
2882          * We need to _quickly_ capture the engine state before we reset.
2883          * We are inside an atomic section (softirq) here and we are delaying
2884          * the forced preemption event.
2885          */
2886         cap = capture_regs(engine);
2887         if (!cap)
2888                 return true;
2889
2890         spin_lock_irq(&engine->active.lock);
2891         cap->rq = active_context(engine, active_ccid(engine));
2892         if (cap->rq) {
2893                 cap->rq = active_request(cap->rq->context->timeline, cap->rq);
2894                 cap->rq = i915_request_get_rcu(cap->rq);
2895         }
2896         spin_unlock_irq(&engine->active.lock);
2897         if (!cap->rq)
2898                 goto err_free;
2899
2900         /*
2901          * Remove the request from the execlists queue, and take ownership
2902          * of the request. We pass it to our worker who will _slowly_ compress
2903          * all the pages the _user_ requested for debugging their batch, after
2904          * which we return it to the queue for signaling.
2905          *
2906          * By removing them from the execlists queue, we also remove the
2907          * requests from being processed by __unwind_incomplete_requests()
2908          * during the intel_engine_reset(), and so they will *not* be replayed
2909          * afterwards.
2910          *
2911          * Note that because we have not yet reset the engine at this point,
2912          * it is possible that the request we have identified as being
2913          * guilty did in fact complete, and we will then hit an arbitration
2914          * point allowing the outstanding preemption to succeed. The likelihood
2915          * of that is very low (as capturing of the engine registers should be
2916          * fast enough to run inside an irq-off atomic section!), so we will
2917          * simply hold that request accountable for being non-preemptible
2918          * long enough to force the reset.
2919          */
2920         if (!execlists_hold(engine, cap->rq))
2921                 goto err_rq;
2922
2923         INIT_WORK(&cap->work, execlists_capture_work);
2924         schedule_work(&cap->work);
2925         return true;
2926
2927 err_rq:
2928         i915_request_put(cap->rq);
2929 err_free:
2930         i915_gpu_coredump_put(cap->error);
2931         kfree(cap);
2932         return false;
2933 }
2934
2935 static void execlists_reset(struct intel_engine_cs *engine, const char *msg)
2936 {
2937         const unsigned int bit = I915_RESET_ENGINE + engine->id;
2938         unsigned long *lock = &engine->gt->reset.flags;
2939
2940         if (!intel_has_reset_engine(engine->gt))
2941                 return;
2942
2943         if (test_and_set_bit(bit, lock))
2944                 return;
2945
2946         ENGINE_TRACE(engine, "reset for %s\n", msg);
2947
2948         /* Mark this tasklet as disabled to avoid waiting for it to complete */
2949         tasklet_disable_nosync(&engine->execlists.tasklet);
2950
2951         ring_set_paused(engine, 1); /* Freeze the current request in place */
2952         if (execlists_capture(engine))
2953                 intel_engine_reset(engine, msg);
2954         else
2955                 ring_set_paused(engine, 0);
2956
2957         tasklet_enable(&engine->execlists.tasklet);
2958         clear_and_wake_up_bit(bit, lock);
2959 }
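/*
 * Summary of the forced-reset flow above: pause the ring so the hanging
 * context makes no further progress, capture the engine state and move
 * the presumed-guilty request onto the hold list, perform the engine
 * reset, and let execlists_capture_work() later compress the capture and
 * release the held request for signaling.
 */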
2960
2961 static bool preempt_timeout(const struct intel_engine_cs *const engine)
2962 {
2963         const struct timer_list *t = &engine->execlists.preempt;
2964
2965         if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT)
2966                 return false;
2967
2968         if (!timer_expired(t))
2969                 return false;
2970
2971         return READ_ONCE(engine->execlists.pending[0]);
2972 }
2973
2974 /*
2975  * Check the unread Context Status Buffers and manage the submission of new
2976  * contexts to the ELSP accordingly.
2977  */
2978 static void execlists_submission_tasklet(unsigned long data)
2979 {
2980         struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
2981         bool timeout = preempt_timeout(engine);
2982
2983         process_csb(engine);
2984
2985         if (unlikely(READ_ONCE(engine->execlists.error_interrupt))) {
2986                 engine->execlists.error_interrupt = 0;
2987                 if (ENGINE_READ(engine, RING_ESR)) /* confirm the error */
2988                         execlists_reset(engine, "CS error");
2989         }
2990
2991         if (!READ_ONCE(engine->execlists.pending[0]) || timeout) {
2992                 unsigned long flags;
2993
2994                 spin_lock_irqsave(&engine->active.lock, flags);
2995                 __execlists_submission_tasklet(engine);
2996                 spin_unlock_irqrestore(&engine->active.lock, flags);
2997
2998                 /* Recheck after serialising with direct-submission */
2999                 if (unlikely(timeout && preempt_timeout(engine)))
3000                         execlists_reset(engine, "preemption time out");
3001         }
3002 }
3003
3004 static void __execlists_kick(struct intel_engine_execlists *execlists)
3005 {
3006         /* Kick the tasklet for some interrupt coalescing and reset handling */
3007         tasklet_hi_schedule(&execlists->tasklet);
3008 }
3009
3010 #define execlists_kick(t, member) \
3011         __execlists_kick(container_of(t, struct intel_engine_execlists, member))
3012
3013 static void execlists_timeslice(struct timer_list *timer)
3014 {
3015         execlists_kick(timer, timer);
3016 }
3017
3018 static void execlists_preempt(struct timer_list *timer)
3019 {
3020         execlists_kick(timer, preempt);
3021 }
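/*
 * execlists_timeslice() and execlists_preempt() are the expiry callbacks
 * for the execlists->timer and execlists->preempt timers (see the
 * container_of() in execlists_kick()); they are presumably wired up with
 * timer_setup() elsewhere in this file. Expiry merely kicks the
 * submission tasklet, which then re-evaluates timer_expired() and
 * preempt_timeout().
 */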
3022
3023 static void queue_request(struct intel_engine_cs *engine,
3024                           struct i915_request *rq)
3025 {
3026         GEM_BUG_ON(!list_empty(&rq->sched.link));
3027         list_add_tail(&rq->sched.link,
3028                       i915_sched_lookup_priolist(engine, rq_prio(rq)));
3029         set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
3030 }
3031
3032 static void __submit_queue_imm(struct intel_engine_cs *engine)
3033 {
3034         struct intel_engine_execlists * const execlists = &engine->execlists;
3035
3036         if (reset_in_progress(execlists))
3037                 return; /* defer until we restart the engine following reset */
3038
3039         /* Hopefully we clear execlists->pending[] to let us through */
3040         if (READ_ONCE(execlists->pending[0]) &&
3041             tasklet_trylock(&execlists->tasklet)) {
3042                 process_csb(engine);
3043                 tasklet_unlock(&execlists->tasklet);
3044         }
3045
3046         __execlists_submission_tasklet(engine);
3047 }
3048
3049 static void submit_queue(struct intel_engine_cs *engine,
3050                          const struct i915_request *rq)
3051 {
3052         struct intel_engine_execlists *execlists = &engine->execlists;
3053
3054         if (rq_prio(rq) <= execlists->queue_priority_hint)
3055                 return;
3056
3057         execlists->queue_priority_hint = rq_prio(rq);
3058         __submit_queue_imm(engine);
3059 }
3060
3061 static bool ancestor_on_hold(const struct intel_engine_cs *engine,
3062                              const struct i915_request *rq)
3063 {
3064         GEM_BUG_ON(i915_request_on_hold(rq));
3065         return !list_empty(&engine->active.hold) && hold_request(rq);
3066 }
3067
3068 static void execlists_submit_request(struct i915_request *request)
3069 {
3070         struct intel_engine_cs *engine = request->engine;
3071         unsigned long flags;
3072
3073         /* Will be called from irq-context when using foreign fences. */
3074         spin_lock_irqsave(&engine->active.lock, flags);
3075
3076         if (unlikely(ancestor_on_hold(engine, request))) {
3077                 RQ_TRACE(request, "ancestor on hold\n");
3078                 list_add_tail(&request->sched.link, &engine->active.hold);
3079                 i915_request_set_hold(request);
3080         } else {
3081                 queue_request(engine, request);
3082
3083                 GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
3084                 GEM_BUG_ON(list_empty(&request->sched.link));
3085
3086                 submit_queue(engine, request);
3087         }
3088
3089         spin_unlock_irqrestore(&engine->active.lock, flags);
3090 }
3091
3092 static void __execlists_context_fini(struct intel_context *ce)
3093 {
3094         intel_ring_put(ce->ring);
3095         i915_vma_put(ce->state);
3096 }
3097
3098 static void execlists_context_destroy(struct kref *kref)
3099 {
3100         struct intel_context *ce = container_of(kref, typeof(*ce), ref);
3101
3102         GEM_BUG_ON(!i915_active_is_idle(&ce->active));
3103         GEM_BUG_ON(intel_context_is_pinned(ce));
3104
3105         if (ce->state)
3106                 __execlists_context_fini(ce);
3107
3108         intel_context_fini(ce);
3109         intel_context_free(ce);
3110 }
3111
3112 static void
3113 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
3114 {
3115         if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
3116                 return;
3117
3118         vaddr += engine->context_size;
3119
3120         memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
3121 }
3122
3123 static void
3124 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
3125 {
3126         if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
3127                 return;
3128
3129         vaddr += engine->context_size;
3130
3131         if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
3132                 drm_err_once(&engine->i915->drm,
3133                              "%s context redzone overwritten!\n",
3134                              engine->name);
3135 }
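/*
 * Note: under CONFIG_DRM_I915_DEBUG_GEM, set_redzone() fills the area
 * immediately after engine->context_size with CONTEXT_REDZONE, presumably
 * when the context image is first populated, and check_redzone() verifies
 * it on unpin to catch HW or SW writes past the end of the image.
 */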
3136
3137 static void execlists_context_unpin(struct intel_context *ce)
3138 {
3139         check_redzone((void *)ce->lrc_reg_state - LRC_STATE_PN * PAGE_SIZE,
3140                       ce->engine);
3141
3142         i915_gem_object_unpin_map(ce->state->obj);
3143 }
3144
3145 static void
3146 __execlists_update_reg_state(const struct intel_context *ce,
3147                              const struct intel_engine_cs *engine,
3148                              u32 head)
3149 {
3150         struct intel_ring *ring = ce->ring;
3151         u32 *regs = ce->lrc_reg_state;
3152
3153         GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
3154         GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
3155
3156         regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
3157         regs[CTX_RING_HEAD] = head;
3158         regs[CTX_RING_TAIL] = ring->tail;
3159         regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
3160
3161         /* RPCS */
3162         if (engine->class == RENDER_CLASS) {
3163                 regs[CTX_R_PWR_CLK_STATE] =
3164                         intel_sseu_make_rpcs(engine->i915, &ce->sseu);
3165
3166                 i915_oa_init_reg_state(ce, engine);
3167         }
3168 }
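/*
 * Note: lrc_reg_state points into the register-state page of the logical
 * ring context image, so the RING_START/HEAD/TAIL/CTL values written here
 * are what the HW reloads on the next context restore; after a reset the
 * same helper is used to rewind the head (see __execlists_reset()).
 */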
3169
3170 static int
3171 __execlists_context_pin(struct intel_context *ce,
3172                         struct intel_engine_cs *engine)
3173 {
3174         void *vaddr;
3175
3176         GEM_BUG_ON(!ce->state);
3177         GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
3178
3179         vaddr = i915_gem_object_pin_map(ce->state->obj,
3180                                         i915_coherent_map_type(engine->i915) |
3181                                         I915_MAP_OVERRIDE);
3182         if (IS_ERR(vaddr))
3183                 return PTR_ERR(vaddr);
3184
3185         ce->lrc_desc = lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE;
3186         ce->lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE;
3187         __execlists_update_reg_state(ce, engine, ce->ring->tail);
3188
3189         return 0;
3190 }
3191
3192 static int execlists_context_pin(struct intel_context *ce)
3193 {
3194         return __execlists_context_pin(ce, ce->engine);
3195 }
3196
3197 static int execlists_context_alloc(struct intel_context *ce)
3198 {
3199         return __execlists_context_alloc(ce, ce->engine);
3200 }
3201
3202 static void execlists_context_reset(struct intel_context *ce)
3203 {
3204         CE_TRACE(ce, "reset\n");
3205         GEM_BUG_ON(!intel_context_is_pinned(ce));
3206
3207         intel_ring_reset(ce->ring, ce->ring->emit);
3208
3209         /* Scrub away the garbage */
3210         execlists_init_reg_state(ce->lrc_reg_state,
3211                                  ce, ce->engine, ce->ring, true);
3212         __execlists_update_reg_state(ce, ce->engine, ce->ring->tail);
3213
3214         ce->lrc_desc |= CTX_DESC_FORCE_RESTORE;
3215 }
3216
3217 static const struct intel_context_ops execlists_context_ops = {
3218         .alloc = execlists_context_alloc,
3219
3220         .pin = execlists_context_pin,
3221         .unpin = execlists_context_unpin,
3222
3223         .enter = intel_context_enter_engine,
3224         .exit = intel_context_exit_engine,
3225
3226         .reset = execlists_context_reset,
3227         .destroy = execlists_context_destroy,
3228 };
3229
3230 static int gen8_emit_init_breadcrumb(struct i915_request *rq)
3231 {
3232         u32 *cs;
3233
3234         if (!i915_request_timeline(rq)->has_initial_breadcrumb)
3235                 return 0;
3236
3237         cs = intel_ring_begin(rq, 6);
3238         if (IS_ERR(cs))
3239                 return PTR_ERR(cs);
3240
3241         /*
3242          * Check if we have been preempted before we even get started.
3243          *
3244          * After this point i915_request_started() reports true, even if
3245          * we get preempted and so are no longer running.
3246          */
3247         *cs++ = MI_ARB_CHECK;
3248         *cs++ = MI_NOOP;
3249
3250         *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
3251         *cs++ = i915_request_timeline(rq)->hwsp_offset;
3252         *cs++ = 0;
3253         *cs++ = rq->fence.seqno - 1;
3254
3255         intel_ring_advance(rq, cs);
3256
3257         /* Record the updated position of the request's payload */
3258         rq->infix = intel_ring_offset(rq, cs);
3259
3260         return 0;
3261 }
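/*
 * A hedged reading of the MI_STORE_DWORD_IMM above: it writes
 * fence.seqno - 1 into the timeline's HWSP slot, which appears to be the
 * value the CPU-side i915_request_started() check looks for, so the
 * request is reported as started as soon as the CS executes this
 * breadcrumb, even if it is preempted immediately afterwards.
 */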
3262
3263 static int execlists_request_alloc(struct i915_request *request)
3264 {
3265         int ret;
3266
3267         GEM_BUG_ON(!intel_context_is_pinned(request->context));
3268
3269         /*
3270          * Flush enough space to reduce the likelihood of waiting after
3271          * we start building the request - in which case we will just
3272          * have to repeat work.
3273          */
3274         request->reserved_space += EXECLISTS_REQUEST_SIZE;
3275
3276         /*
3277          * Note that after this point, we have committed to using
3278          * this request as it is being used to both track the
3279          * state of engine initialisation and liveness of the
3280          * golden renderstate above. Think twice before you try
3281          * to cancel/unwind this request now.
3282          */
3283
3284         /* Unconditionally invalidate GPU caches and TLBs. */
3285         ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
3286         if (ret)
3287                 return ret;
3288
3289         request->reserved_space -= EXECLISTS_REQUEST_SIZE;
3290         return 0;
3291 }
3292
3293 /*
3294  * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
3295  * PIPE_CONTROL instruction. This is required for the flush to happen correctly
3296  * but there is a slight complication as this is applied in WA batch where the
3297  * values are only initialized once, so we cannot take the register value at the
3298  * beginning and reuse it further; hence we save its value to memory, upload a
3299  * constant value with bit21 set and then we restore it back with the saved value.
3300  * To simplify the WA, a constant value is formed by using the default value
3301  * of this register. This shouldn't be a problem because we are only modifying
3302  * it for a short period and this batch is non-preemptible. We could of course
3303  * use additional instructions that read the actual value of the register
3304  * at that time and set our bit of interest but it makes the WA complicated.
3305  *
3306  * This WA is also required for Gen9 so extracting as a function avoids
3307  * code duplication.
3308  */
3309 static u32 *
3310 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
3311 {
3312         /* NB no one else is allowed to scribble over scratch + 256! */
3313         *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3314         *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3315         *batch++ = intel_gt_scratch_offset(engine->gt,
3316                                            INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3317         *batch++ = 0;
3318
3319         *batch++ = MI_LOAD_REGISTER_IMM(1);
3320         *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3321         *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
3322
3323         batch = gen8_emit_pipe_control(batch,
3324                                        PIPE_CONTROL_CS_STALL |
3325                                        PIPE_CONTROL_DC_FLUSH_ENABLE,
3326                                        0);
3327
3328         *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3329         *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3330         *batch++ = intel_gt_scratch_offset(engine->gt,
3331                                            INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3332         *batch++ = 0;
3333
3334         return batch;
3335 }
3336
3337 /*
3338  * Typically we only have one indirect_ctx and per_ctx batch buffer which are
3339  * initialized at the beginning and shared across all contexts, but this field
3340  * helps us to have multiple batches at different offsets and select them based
3341  * on a criterion. At the moment this batch always starts at the beginning of the page
3342  * and at this point we don't have multiple wa_ctx batch buffers.
3343  *
3344  * The number of WAs applied is not known at the beginning; we use this field
3345  * to return the number of DWORDs written.
3346  *
3347  * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
3348  * so it adds NOOPs as padding to make it cacheline aligned.
3349  * MI_BATCH_BUFFER_END will be added to the per-ctx batch, and both of them together
3350  * make a complete batch buffer.
3351  */
3352 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3353 {
3354         /* WaDisableCtxRestoreArbitration:bdw,chv */
3355         *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3356
3357         /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
3358         if (IS_BROADWELL(engine->i915))
3359                 batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3360
3361         /* WaClearSlmSpaceAtContextSwitch:bdw,chv */
3362         /* Actual scratch location is at 128 bytes offset */
3363         batch = gen8_emit_pipe_control(batch,
3364                                        PIPE_CONTROL_FLUSH_L3 |
3365                                        PIPE_CONTROL_STORE_DATA_INDEX |
3366                                        PIPE_CONTROL_CS_STALL |
3367                                        PIPE_CONTROL_QW_WRITE,
3368                                        LRC_PPHWSP_SCRATCH_ADDR);
3369
3370         *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3371
3372         /* Pad to end of cacheline */
3373         while ((unsigned long)batch % CACHELINE_BYTES)
3374                 *batch++ = MI_NOOP;
3375
3376         /*
3377          * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
3378          * execution depends on the length specified in terms of cache lines
3379          * in the register CTX_RCS_INDIRECT_CTX
3380          */
3381
3382         return batch;
3383 }
3384
3385 struct lri {
3386         i915_reg_t reg;
3387         u32 value;
3388 };
3389
3390 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
3391 {
3392         GEM_BUG_ON(!count || count > 63);
3393
3394         *batch++ = MI_LOAD_REGISTER_IMM(count);
3395         do {
3396                 *batch++ = i915_mmio_reg_offset(lri->reg);
3397                 *batch++ = lri->value;
3398         } while (lri++, --count);
3399         *batch++ = MI_NOOP;
3400
3401         return batch;
3402 }
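/*
 * emit_lri() packs up to 63 register/value pairs into a single
 * MI_LOAD_REGISTER_IMM; the limit presumably reflects the command's
 * length-field budget, and the trailing MI_NOOP pads the otherwise odd
 * dword count, presumably to keep the batch qword aligned.
 */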
3403
3404 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3405 {
3406         static const struct lri lri[] = {
3407                 /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
3408                 {
3409                         COMMON_SLICE_CHICKEN2,
3410                         __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
3411                                        0),
3412                 },
3413
3414                 /* BSpec: 11391 */
3415                 {
3416                         FF_SLICE_CHICKEN,
3417                         __MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
3418                                        FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
3419                 },
3420
3421                 /* BSpec: 11299 */
3422                 {
3423                         _3D_CHICKEN3,
3424                         __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
3425                                        _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
3426                 }
3427         };
3428
3429         *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3430
3431         /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
3432         batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3433
3434         /* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
3435         batch = gen8_emit_pipe_control(batch,
3436                                        PIPE_CONTROL_FLUSH_L3 |
3437                                        PIPE_CONTROL_STORE_DATA_INDEX |
3438                                        PIPE_CONTROL_CS_STALL |
3439                                        PIPE_CONTROL_QW_WRITE,
3440                                        LRC_PPHWSP_SCRATCH_ADDR);
3441
3442         batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
3443
3444         /* WaMediaPoolStateCmdInWABB:bxt,glk */
3445         if (HAS_POOLED_EU(engine->i915)) {
3446                 /*
3447                  * EU pool configuration is set up along with the golden context
3448                  * during context initialization. This value depends on
3449                  * device type (2x6 or 3x6) and needs to be updated based
3450                  * on which subslice is disabled, especially for 2x6
3451                  * devices; however, it is safe to load the default
3452                  * configuration of a 3x6 device instead of masking off
3453                  * the corresponding bits because HW ignores bits of a disabled
3454                  * subslice and drops down to the appropriate config. Please
3455                  * see render_state_setup() in i915_gem_render_state.c for
3456                  * possible configurations, to avoid duplication they are
3457                  * not shown here again.
3458                  */
3459                 *batch++ = GEN9_MEDIA_POOL_STATE;
3460                 *batch++ = GEN9_MEDIA_POOL_ENABLE;
3461                 *batch++ = 0x00777000;
3462                 *batch++ = 0;
3463                 *batch++ = 0;
3464                 *batch++ = 0;
3465         }
3466
3467         *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3468
3469         /* Pad to end of cacheline */
3470         while ((unsigned long)batch % CACHELINE_BYTES)
3471                 *batch++ = MI_NOOP;
3472
3473         return batch;
3474 }
3475
3476 static u32 *
3477 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3478 {
3479         int i;
3480
3481         /*
3482          * WaPipeControlBefore3DStateSamplePattern: cnl
3483          *
3484          * Ensure the engine is idle prior to programming a
3485          * 3DSTATE_SAMPLE_PATTERN during a context restore.
3486          */
3487         batch = gen8_emit_pipe_control(batch,
3488                                        PIPE_CONTROL_CS_STALL,
3489                                        0);
3490         /*
3491          * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
3492          * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
3493          * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
3494          * confusing. Since gen8_emit_pipe_control() already advances the
3495          * batch by 6 dwords, we advance the other 10 here, completing a
3496          * cacheline. It's not clear if the workaround requires this padding
3497          * before other commands, or if it's just the regular padding we would
3498          * already have for the workaround bb, so leave it here for now.
3499          */
3500         for (i = 0; i < 10; i++)
3501                 *batch++ = MI_NOOP;
3502
3503         /* Pad to end of cacheline */
3504         while ((unsigned long)batch % CACHELINE_BYTES)
3505                 *batch++ = MI_NOOP;
3506
3507         return batch;
3508 }
3509
3510 #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
3511
3512 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
3513 {
3514         struct drm_i915_gem_object *obj;
3515         struct i915_vma *vma;
3516         int err;
3517
3518         obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE);
3519         if (IS_ERR(obj))
3520                 return PTR_ERR(obj);
3521
3522         vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
3523         if (IS_ERR(vma)) {
3524                 err = PTR_ERR(vma);
3525                 goto err;
3526         }
3527
3528         err = i915_ggtt_pin(vma, 0, PIN_HIGH);
3529         if (err)
3530                 goto err;
3531
3532         engine->wa_ctx.vma = vma;
3533         return 0;
3534
3535 err:
3536         i915_gem_object_put(obj);
3537         return err;
3538 }
3539
3540 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine)
3541 {
3542         i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
3543 }
3544
3545 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
3546
3547 static int intel_init_workaround_bb(struct intel_engine_cs *engine)
3548 {
3549         struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
3550         struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx,
3551                                             &wa_ctx->per_ctx };
3552         wa_bb_func_t wa_bb_fn[2];
3553         struct page *page;
3554         void *batch, *batch_ptr;
3555         unsigned int i;
3556         int ret;
3557
3558         if (engine->class != RENDER_CLASS)
3559                 return 0;
3560
3561         switch (INTEL_GEN(engine->i915)) {
3562         case 12:
3563         case 11:
3564                 return 0;
3565         case 10:
3566                 wa_bb_fn[0] = gen10_init_indirectctx_bb;
3567                 wa_bb_fn[1] = NULL;
3568                 break;
3569         case 9:
3570                 wa_bb_fn[0] = gen9_init_indirectctx_bb;
3571                 wa_bb_fn[1] = NULL;
3572                 break;
3573         case 8:
3574                 wa_bb_fn[0] = gen8_init_indirectctx_bb;
3575                 wa_bb_fn[1] = NULL;
3576                 break;
3577         default:
3578                 MISSING_CASE(INTEL_GEN(engine->i915));
3579                 return 0;
3580         }
3581
3582         ret = lrc_setup_wa_ctx(engine);
3583         if (ret) {
3584                 drm_dbg(&engine->i915->drm,
3585                         "Failed to setup context WA page: %d\n", ret);
3586                 return ret;
3587         }
3588
3589         page = i915_gem_object_get_dirty_page(wa_ctx->vma->obj, 0);
3590         batch = batch_ptr = kmap_atomic(page);
3591
3592         /*
3593          * Emit the two workaround batch buffers, recording the offset from the
3594          * start of the workaround batch buffer object for each and their
3595          * respective sizes.
3596          */
3597         for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
3598                 wa_bb[i]->offset = batch_ptr - batch;
3599                 if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
3600                                                   CACHELINE_BYTES))) {
3601                         ret = -EINVAL;
3602                         break;
3603                 }
3604                 if (wa_bb_fn[i])
3605                         batch_ptr = wa_bb_fn[i](engine, batch_ptr);
3606                 wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
3607         }
3608
3609         BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
3610
3611         kunmap_atomic(batch);
3612         if (ret)
3613                 lrc_destroy_wa_ctx(engine);
3614
3615         return ret;
3616 }
3617
3618 static void enable_error_interrupt(struct intel_engine_cs *engine)
3619 {
3620         u32 status;
3621
3622         engine->execlists.error_interrupt = 0;
3623         ENGINE_WRITE(engine, RING_EMR, ~0u);
3624         ENGINE_WRITE(engine, RING_EIR, ~0u); /* clear all existing errors */
3625
3626         status = ENGINE_READ(engine, RING_ESR);
3627         if (unlikely(status)) {
3628                 drm_err(&engine->i915->drm,
3629                         "engine '%s' resumed still in error: %08x\n",
3630                         engine->name, status);
3631                 __intel_gt_reset(engine->gt, engine->mask);
3632         }
3633
3634         /*
3635          * On current gen8+, we have 2 signals to play with
3636          *
3637          * - I915_ERROR_INSTRUCTION (bit 0)
3638          *
3639          *    Generate an error if the command parser encounters an invalid
3640          *    instruction
3641          *
3642          *    This is a fatal error.
3643          *
3644          * - CP_PRIV (bit 2)
3645          *
3646          *    Generate an error on privilege violation (where the CP replaces
3647          *    the instruction with a no-op). This also fires for writes into
3648          *    read-only scratch pages.
3649          *
3650          *    This is a non-fatal error, parsing continues.
3651          *
3652          * * there are a few others defined for odd HW that we do not use
3653          *
3654          * Since CP_PRIV fires for cases where we have chosen to ignore the
3655          * error (as the HW is validating and suppressing the mistakes), we
3656          * only unmask the instruction error bit.
3657          */
3658         ENGINE_WRITE(engine, RING_EMR, ~I915_ERROR_INSTRUCTION);
3659 }
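/*
 * Note: the execlists.error_interrupt flag cleared here is presumably set
 * from the CS interrupt handler; execlists_submission_tasklet() above then
 * re-reads RING_ESR to confirm the error is still present before invoking
 * execlists_reset("CS error").
 */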
3660
3661 static void enable_execlists(struct intel_engine_cs *engine)
3662 {
3663         u32 mode;
3664
3665         assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL);
3666
3667         intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */
3668
3669         if (INTEL_GEN(engine->i915) >= 11)
3670                 mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE);
3671         else
3672                 mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE);
3673         ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode);
3674
3675         ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
3676
3677         ENGINE_WRITE_FW(engine,
3678                         RING_HWS_PGA,
3679                         i915_ggtt_offset(engine->status_page.vma));
3680         ENGINE_POSTING_READ(engine, RING_HWS_PGA);
3681
3682         enable_error_interrupt(engine);
3683
3684         engine->context_tag = 0;
3685 }
3686
3687 static bool unexpected_starting_state(struct intel_engine_cs *engine)
3688 {
3689         bool unexpected = false;
3690
3691         if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) {
3692                 drm_dbg(&engine->i915->drm,
3693                         "STOP_RING still set in RING_MI_MODE\n");
3694                 unexpected = true;
3695         }
3696
3697         return unexpected;
3698 }
3699
3700 static int execlists_resume(struct intel_engine_cs *engine)
3701 {
3702         intel_mocs_init_engine(engine);
3703
3704         intel_engine_reset_breadcrumbs(engine);
3705
3706         if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) {
3707                 struct drm_printer p = drm_debug_printer(__func__);
3708
3709                 intel_engine_dump(engine, &p, NULL);
3710         }
3711
3712         enable_execlists(engine);
3713
3714         return 0;
3715 }
3716
3717 static void execlists_reset_prepare(struct intel_engine_cs *engine)
3718 {
3719         struct intel_engine_execlists * const execlists = &engine->execlists;
3720         unsigned long flags;
3721
3722         ENGINE_TRACE(engine, "depth<-%d\n",
3723                      atomic_read(&execlists->tasklet.count));
3724
3725         /*
3726          * Prevent request submission to the hardware until we have
3727          * completed the reset in i915_gem_reset_finish(). If a request
3728          * is completed by one engine, it may then queue a request
3729          * to a second via its execlists->tasklet *just* as we are
3730          * calling engine->resume() and also writing the ELSP.
3731          * Turning off the execlists->tasklet until the reset is over
3732          * prevents the race.
3733          */
3734         __tasklet_disable_sync_once(&execlists->tasklet);
3735         GEM_BUG_ON(!reset_in_progress(execlists));
3736
3737         /* And flush any current direct submission. */
3738         spin_lock_irqsave(&engine->active.lock, flags);
3739         spin_unlock_irqrestore(&engine->active.lock, flags);
3740
3741         /*
3742          * We stop engines, otherwise we might get a failed reset and a
3743          * dead gpu (on elk). Also, a gpu as modern as kbl can suffer
3744          * from a system hang if a batchbuffer is progressing when
3745          * the reset is issued, regardless of the READY_TO_RESET ack.
3746          * Thus assume it is best to stop engines on all gens
3747          * where we have a gpu reset.
3748          *
3749          * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
3750          *
3751          * FIXME: Wa for more modern gens needs to be validated
3752          */
3753         ring_set_paused(engine, 1);
3754         intel_engine_stop_cs(engine);
3755 }
3756
3757 static void reset_csb_pointers(struct intel_engine_cs *engine)
3758 {
3759         struct intel_engine_execlists * const execlists = &engine->execlists;
3760         const unsigned int reset_value = execlists->csb_size - 1;
3761
3762         ring_set_paused(engine, 0);
3763
3764         /*
3765          * After a reset, the HW starts writing into CSB entry [0]. We
3766          * therefore have to set our HEAD pointer back one entry so that
3767          * the *first* entry we check is entry 0. To complicate this further,
3768          * as we don't wait for the first interrupt after reset, we have to
3769          * fake the HW write to point back to the last entry so that our
3770          * inline comparison of our cached head position against the last HW
3771          * write works even before the first interrupt.
3772          */
3773         execlists->csb_head = reset_value;
3774         WRITE_ONCE(*execlists->csb_write, reset_value);
3775         wmb(); /* Make sure this is visible to HW (paranoia?) */
3776
3777         /*
3778          * Sometimes Icelake forgets to reset its pointers on a GPU reset.
3779          * Bludgeon them with a mmio update to be sure.
3780          */
3781         ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
3782                      reset_value << 8 | reset_value);
3783         ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
3784
3785         invalidate_csb_entries(&execlists->csb_status[0],
3786                                &execlists->csb_status[reset_value]);
3787 }
3788
3789 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
3790 {
3791         int x;
3792
3793         x = lrc_ring_mi_mode(engine);
3794         if (x != -1) {
3795                 regs[x + 1] &= ~STOP_RING;
3796                 regs[x + 1] |= STOP_RING << 16;
3797         }
3798 }
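/*
 * Note: RING_MI_MODE is a masked register (the upper 16 bits select which
 * of the lower bits are written), so clearing STOP_RING in the value and
 * setting STOP_RING << 16 in the mask ensures the context restore actually
 * clears the bit in the image.
 */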
3799
3800 static void __execlists_reset_reg_state(const struct intel_context *ce,
3801                                         const struct intel_engine_cs *engine)
3802 {
3803         u32 *regs = ce->lrc_reg_state;
3804
3805         __reset_stop_ring(regs, engine);
3806 }
3807
3808 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
3809 {
3810         struct intel_engine_execlists * const execlists = &engine->execlists;
3811         struct intel_context *ce;
3812         struct i915_request *rq;
3813         u32 head;
3814
3815         mb(); /* paranoia: read the CSB pointers from after the reset */
3816         clflush(execlists->csb_write);
3817         mb();
3818
3819         process_csb(engine); /* drain preemption events */
3820
3821         /* Following the reset, we need to reload the CSB read/write pointers */
3822         reset_csb_pointers(engine);
3823
3824         /*
3825          * Save the currently executing context, even if we completed
3826          * its request, it was still running at the time of the
3827          * reset and will have been clobbered.
3828          */
3829         rq = execlists_active(execlists);
3830         if (!rq)
3831                 goto unwind;
3832
3833         ce = rq->context;
3834         GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
3835
3836         if (i915_request_completed(rq)) {
3837                 /* Idle context; tidy up the ring so we can restart afresh */
3838                 head = intel_ring_wrap(ce->ring, rq->tail);
3839                 goto out_replay;
3840         }
3841
3842         /* We still have requests in-flight; the engine should be active */
3843         GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
3844
3845         /* Context has requests still in-flight; it should not be idle! */
3846         GEM_BUG_ON(i915_active_is_idle(&ce->active));
3847
3848         rq = active_request(ce->timeline, rq);
3849         head = intel_ring_wrap(ce->ring, rq->head);
3850         GEM_BUG_ON(head == ce->ring->tail);
3851
3852         /*
3853          * If this request hasn't started yet, e.g. it is waiting on a
3854          * semaphore, we need to avoid skipping the request or else we
3855          * break the signaling chain. However, if the context is corrupt
3856          * the request will not restart and we will be stuck with a wedged
3857          * device. It is quite often the case that if we issue a reset
3858          * while the GPU is loading the context image, the context
3859          * image becomes corrupt.
3860          *
3861          * Otherwise, if we have not started yet, the request should replay
3862          * perfectly and we do not need to flag the result as being erroneous.
3863          */
3864         if (!i915_request_started(rq))
3865                 goto out_replay;
3866
3867         /*
3868          * If the request was innocent, we leave the request in the ELSP
3869          * and will try to replay it on restarting. The context image may
3870          * have been corrupted by the reset, in which case we may have
3871          * to service a new GPU hang, but more likely we can continue on
3872          * without impact.
3873          *
3874          * If the request was guilty, we presume the context is corrupt
3875          * and have to at least restore the RING register in the context
3876          * image back to the expected values to skip over the guilty request.
3877          */
3878         __i915_request_reset(rq, stalled);
3879         if (!stalled)
3880                 goto out_replay;
3881
3882         /*
3883          * We want a simple context + ring to execute the breadcrumb update.
3884          * We cannot rely on the context being intact across the GPU hang,
3885          * so clear it and rebuild just what we need for the breadcrumb.
3886          * All pending requests for this context will be zapped, and any
3887          * future request will be after userspace has had the opportunity
3888          * to recreate its own state.
3889          */
3890         GEM_BUG_ON(!intel_context_is_pinned(ce));
3891         restore_default_state(ce, engine);
3892
3893 out_replay:
3894         ENGINE_TRACE(engine, "replay {head:%04x, tail:%04x}\n",
3895                      head, ce->ring->tail);
3896         __execlists_reset_reg_state(ce, engine);
3897         __execlists_update_reg_state(ce, engine, head);
3898         ce->lrc_desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */
3899
3900 unwind:
3901         /* Push back any incomplete requests for replay after the reset. */
3902         cancel_port_requests(execlists);
3903         __unwind_incomplete_requests(engine);
3904 }
3905
3906 static void execlists_reset_rewind(struct intel_engine_cs *engine, bool stalled)
3907 {
3908         unsigned long flags;
3909
3910         ENGINE_TRACE(engine, "\n");
3911
3912         spin_lock_irqsave(&engine->active.lock, flags);
3913
3914         __execlists_reset(engine, stalled);
3915
3916         spin_unlock_irqrestore(&engine->active.lock, flags);
3917 }
3918
3919 static void nop_submission_tasklet(unsigned long data)
3920 {
3921         struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
3922
3923         /* The driver is wedged; don't process any more events. */
3924         WRITE_ONCE(engine->execlists.queue_priority_hint, INT_MIN);
3925 }
3926
3927 static void execlists_reset_cancel(struct intel_engine_cs *engine)
3928 {
3929         struct intel_engine_execlists * const execlists = &engine->execlists;
3930         struct i915_request *rq, *rn;
3931         struct rb_node *rb;
3932         unsigned long flags;
3933
3934         ENGINE_TRACE(engine, "\n");
3935
3936         /*
3937          * Before we call engine->cancel_requests(), we should have exclusive
3938          * access to the submission state. This is arranged for us by the
3939          * caller disabling the interrupt generation, the tasklet and other
3940          * threads that may then access the same state, giving us a free hand
3941          * to reset state. However, we still need to let lockdep be aware that
3942          * we know this state may be accessed in hardirq context, so we
3943          * disable the irq around this manipulation and we want to keep
3944          * the spinlock focused on its duties and not accidentally conflate
3945          * coverage to the submission's irq state. (Similarly, although we
3946          * shouldn't need to disable irq around the manipulation of the
3947          * submission's irq state, we also wish to remind ourselves that
3948          * it is irq state.)
3949          */
3950         spin_lock_irqsave(&engine->active.lock, flags);
3951
3952         __execlists_reset(engine, true);
3953
3954         /* Mark all executing requests as skipped. */
3955         list_for_each_entry(rq, &engine->active.requests, sched.link)
3956                 mark_eio(rq);
3957
3958         /* Flush the queued requests to the timeline list (for retiring). */
3959         while ((rb = rb_first_cached(&execlists->queue))) {
3960                 struct i915_priolist *p = to_priolist(rb);
3961                 int i;
3962
3963                 priolist_for_each_request_consume(rq, rn, p, i) {
3964                         mark_eio(rq);
3965                         __i915_request_submit(rq);
3966                 }
3967
3968                 rb_erase_cached(&p->node, &execlists->queue);
3969                 i915_priolist_free(p);
3970         }
3971
3972         /* On-hold requests will be flushed to timeline upon their release */
3973         list_for_each_entry(rq, &engine->active.hold, sched.link)
3974                 mark_eio(rq);
3975
3976         /* Cancel all attached virtual engines */
3977         while ((rb = rb_first_cached(&execlists->virtual))) {
3978                 struct virtual_engine *ve =
3979                         rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
3980
3981                 rb_erase_cached(rb, &execlists->virtual);
3982                 RB_CLEAR_NODE(rb);
3983
3984                 spin_lock(&ve->base.active.lock);
3985                 rq = fetch_and_zero(&ve->request);
3986                 if (rq) {
3987                         mark_eio(rq);
3988
3989                         rq->engine = engine;
3990                         __i915_request_submit(rq);
3991                         i915_request_put(rq);
3992
3993                         ve->base.execlists.queue_priority_hint = INT_MIN;
3994                 }
3995                 spin_unlock(&ve->base.active.lock);
3996         }
3997
3998         /* Remaining _unready_ requests will be nop'ed when submitted */
3999
4000         execlists->queue_priority_hint = INT_MIN;
4001         execlists->queue = RB_ROOT_CACHED;
4002
4003         GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet));
4004         execlists->tasklet.func = nop_submission_tasklet;
4005
4006         spin_unlock_irqrestore(&engine->active.lock, flags);
4007 }
4008
4009 static void execlists_reset_finish(struct intel_engine_cs *engine)
4010 {
4011         struct intel_engine_execlists * const execlists = &engine->execlists;
4012
4013         /*
4014          * After a GPU reset, we may have requests to replay. Do so now while
4015          * we still have the forcewake to be sure that the GPU is not allowed
4016          * to sleep before we restart and reload a context.
4017          */
4018         GEM_BUG_ON(!reset_in_progress(execlists));
4019         if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
4020                 execlists->tasklet.func(execlists->tasklet.data);
4021
4022         if (__tasklet_enable(&execlists->tasklet))
4023                 /* And kick in case we missed a new request submission. */
4024                 tasklet_hi_schedule(&execlists->tasklet);
4025         ENGINE_TRACE(engine, "depth->%d\n",
4026                      atomic_read(&execlists->tasklet.count));
4027 }
4028
4029 static int gen8_emit_bb_start_noarb(struct i915_request *rq,
4030                                     u64 offset, u32 len,
4031                                     const unsigned int flags)
4032 {
4033         u32 *cs;
4034
4035         cs = intel_ring_begin(rq, 4);
4036         if (IS_ERR(cs))
4037                 return PTR_ERR(cs);
4038
4039         /*
4040          * WaDisableCtxRestoreArbitration:bdw,chv
4041          *
4042          * We don't need to perform MI_ARB_ENABLE as often as we do (in
4043          * particular all the gen that do not need the w/a at all!), if we
4044          * took care to make sure that on every switch into this context
4045          * (both ordinary and for preemption) arbitration was enabled,
4046          * we would be fine.  However, for gen8 there is another w/a that
4047          * requires us to not preempt inside GPGPU execution, so we keep
4048          * arbitration disabled for gen8 batches. Arbitration will be
4049          * re-enabled before we close the request
4050          * (engine->emit_fini_breadcrumb).
4051          */
4052         *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
4053
4054         /* FIXME(BDW+): Address space and security selectors. */
4055         *cs++ = MI_BATCH_BUFFER_START_GEN8 |
4056                 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
4057         *cs++ = lower_32_bits(offset);
4058         *cs++ = upper_32_bits(offset);
4059
4060         intel_ring_advance(rq, cs);
4061
4062         return 0;
4063 }
4064
4065 static int gen8_emit_bb_start(struct i915_request *rq,
4066                               u64 offset, u32 len,
4067                               const unsigned int flags)
4068 {
4069         u32 *cs;
4070
4071         cs = intel_ring_begin(rq, 6);
4072         if (IS_ERR(cs))
4073                 return PTR_ERR(cs);
4074
4075         *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4076
4077         *cs++ = MI_BATCH_BUFFER_START_GEN8 |
4078                 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
4079         *cs++ = lower_32_bits(offset);
4080         *cs++ = upper_32_bits(offset);
4081
4082         *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
4083         *cs++ = MI_NOOP;
4084
4085         intel_ring_advance(rq, cs);
4086
4087         return 0;
4088 }
4089
4090 static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine)
4091 {
4092         ENGINE_WRITE(engine, RING_IMR,
4093                      ~(engine->irq_enable_mask | engine->irq_keep_mask));
4094         ENGINE_POSTING_READ(engine, RING_IMR);
4095 }
4096
4097 static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
4098 {
4099         ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
4100 }
4101
4102 static int gen8_emit_flush(struct i915_request *request, u32 mode)
4103 {
4104         u32 cmd, *cs;
4105
4106         cs = intel_ring_begin(request, 4);
4107         if (IS_ERR(cs))
4108                 return PTR_ERR(cs);
4109
4110         cmd = MI_FLUSH_DW + 1;
4111
4112         /* We always require a command barrier so that subsequent
4113          * commands, such as breadcrumb interrupts, are strictly ordered
4114          * wrt the contents of the write cache being flushed to memory
4115          * (and thus being coherent from the CPU).
4116          */
4117         cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
4118
4119         if (mode & EMIT_INVALIDATE) {
4120                 cmd |= MI_INVALIDATE_TLB;
4121                 if (request->engine->class == VIDEO_DECODE_CLASS)
4122                         cmd |= MI_INVALIDATE_BSD;
4123         }
4124
4125         *cs++ = cmd;
4126         *cs++ = LRC_PPHWSP_SCRATCH_ADDR;
4127         *cs++ = 0; /* upper addr */
4128         *cs++ = 0; /* value */
4129         intel_ring_advance(request, cs);
4130
4131         return 0;
4132 }
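/*
 * Note on MI_FLUSH_DW + 1 above: the + 1 presumably bumps the command's
 * dword-length field so that it covers the post-sync write emitted here
 * (address low/high plus value), giving four dwords in total.
 */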
4133
4134 static int gen8_emit_flush_render(struct i915_request *request,
4135                                   u32 mode)
4136 {
4137         bool vf_flush_wa = false, dc_flush_wa = false;
4138         u32 *cs, flags = 0;
4139         int len;
4140
4141         flags |= PIPE_CONTROL_CS_STALL;
4142
4143         if (mode & EMIT_FLUSH) {
4144                 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4145                 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4146                 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4147                 flags |= PIPE_CONTROL_FLUSH_ENABLE;
4148         }
4149
4150         if (mode & EMIT_INVALIDATE) {
4151                 flags |= PIPE_CONTROL_TLB_INVALIDATE;
4152                 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4153                 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4154                 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4155                 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4156                 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4157                 flags |= PIPE_CONTROL_QW_WRITE;
4158                 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4159
4160                 /*
4161                  * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
4162                  * pipe control.
4163                  */
4164                 if (IS_GEN(request->i915, 9))
4165                         vf_flush_wa = true;
4166
4167                 /* WaForGAMHang:kbl */
4168                 if (IS_KBL_REVID(request->i915, 0, KBL_REVID_B0))
4169                         dc_flush_wa = true;
4170         }
4171
4172         len = 6;
4173
4174         if (vf_flush_wa)
4175                 len += 6;
4176
4177         if (dc_flush_wa)
4178                 len += 12;
4179
4180         cs = intel_ring_begin(request, len);
4181         if (IS_ERR(cs))
4182                 return PTR_ERR(cs);
4183
4184         if (vf_flush_wa)
4185                 cs = gen8_emit_pipe_control(cs, 0, 0);
4186
4187         if (dc_flush_wa)
4188                 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
4189                                             0);
4190
4191         cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4192
4193         if (dc_flush_wa)
4194                 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);
4195
4196         intel_ring_advance(request, cs);
4197
4198         return 0;
4199 }
4200
4201 static int gen11_emit_flush_render(struct i915_request *request,
4202                                    u32 mode)
4203 {
4204         if (mode & EMIT_FLUSH) {
4205                 u32 *cs;
4206                 u32 flags = 0;
4207
4208                 flags |= PIPE_CONTROL_CS_STALL;
4209
4210                 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
4211                 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4212                 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4213                 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4214                 flags |= PIPE_CONTROL_FLUSH_ENABLE;
4215                 flags |= PIPE_CONTROL_QW_WRITE;
4216                 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4217
4218                 cs = intel_ring_begin(request, 6);
4219                 if (IS_ERR(cs))
4220                         return PTR_ERR(cs);
4221
4222                 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4223                 intel_ring_advance(request, cs);
4224         }
4225
4226         if (mode & EMIT_INVALIDATE) {
4227                 u32 *cs;
4228                 u32 flags = 0;
4229
4230                 flags |= PIPE_CONTROL_CS_STALL;
4231
4232                 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
4233                 flags |= PIPE_CONTROL_TLB_INVALIDATE;
4234                 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4235                 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4236                 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4237                 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4238                 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4239                 flags |= PIPE_CONTROL_QW_WRITE;
4240                 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4241
4242                 cs = intel_ring_begin(request, 6);
4243                 if (IS_ERR(cs))
4244                         return PTR_ERR(cs);
4245
4246                 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4247                 intel_ring_advance(request, cs);
4248         }
4249
4250         return 0;
4251 }
4252
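/*
 * On gen12, MI_ARB_CHECK doubles as the pre-parser control: bit 0 carries
 * the requested disable state and bit 8 appears to act as the write-enable
 * (mask) for it, following the usual masked-bit convention, so the hardware
 * only samples bit 0 when bit 8 is also set.
 */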
4253 static u32 preparser_disable(bool state)
4254 {
4255         return MI_ARB_CHECK | 1 << 8 | state;
4256 }
4257
4258 static int gen12_emit_flush_render(struct i915_request *request,
4259                                    u32 mode)
4260 {
4261         if (mode & EMIT_FLUSH) {
4262                 u32 flags = 0;
4263                 u32 *cs;
4264
4265                 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
4266                 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4267                 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4268                 /* Wa_1409600907:tgl */
4269                 flags |= PIPE_CONTROL_DEPTH_STALL;
4270                 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4271                 flags |= PIPE_CONTROL_FLUSH_ENABLE;
4272                 flags |= PIPE_CONTROL_HDC_PIPELINE_FLUSH;
4273
4274                 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4275                 flags |= PIPE_CONTROL_QW_WRITE;
4276
4277                 flags |= PIPE_CONTROL_CS_STALL;
4278
4279                 cs = intel_ring_begin(request, 6);
4280                 if (IS_ERR(cs))
4281                         return PTR_ERR(cs);
4282
4283                 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4284                 intel_ring_advance(request, cs);
4285         }
4286
4287         if (mode & EMIT_INVALIDATE) {
4288                 u32 flags = 0;
4289                 u32 *cs;
4290
4291                 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
4292                 flags |= PIPE_CONTROL_TLB_INVALIDATE;
4293                 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4294                 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4295                 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4296                 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4297                 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4298                 flags |= PIPE_CONTROL_L3_RO_CACHE_INVALIDATE;
4299
4300                 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4301                 flags |= PIPE_CONTROL_QW_WRITE;
4302
4303                 flags |= PIPE_CONTROL_CS_STALL;
4304
4305                 cs = intel_ring_begin(request, 8);
4306                 if (IS_ERR(cs))
4307                         return PTR_ERR(cs);
4308
4309                 /*
4310                  * Prevent the pre-parser from skipping past the TLB
4311                  * invalidate and loading a stale page for the batch
4312                  * buffer / request payload.
4313                  */
4314                 *cs++ = preparser_disable(true);
4315
4316                 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4317
4318                 *cs++ = preparser_disable(false);
4319                 intel_ring_advance(request, cs);
4320         }
4321
4322         return 0;
4323 }
4324
4325 /*
4326  * Reserve space for 2 NOOPs at the end of each request to be
4327  * used as a workaround for not being allowed to do lite
4328  * restore with HEAD==TAIL (WaIdleLiteRestore).
4329  */
4330 static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs)
4331 {
4332         /* Ensure there's always at least one preemption point per-request. */
4333         *cs++ = MI_ARB_CHECK;
4334         *cs++ = MI_NOOP;
4335         request->wa_tail = intel_ring_offset(request, cs);
4336
4337         return cs;
4338 }
4339
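/*
 * Emit an MI_SEMAPHORE_WAIT that polls the per-engine preemption semaphore
 * in the global GTT (intel_hws_preempt_address()) until the dword there
 * reads back as zero (SAD_EQ_SDD with a semaphore data value of 0), i.e.
 * until any pending preempt-to-busy request has been cleared and the
 * context may run past its breadcrumb.
 */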
4340 static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs)
4341 {
4342         *cs++ = MI_SEMAPHORE_WAIT |
4343                 MI_SEMAPHORE_GLOBAL_GTT |
4344                 MI_SEMAPHORE_POLL |
4345                 MI_SEMAPHORE_SAD_EQ_SDD;
4346         *cs++ = 0;
4347         *cs++ = intel_hws_preempt_address(request->engine);
4348         *cs++ = 0;
4349
4350         return cs;
4351 }
4352
4353 static __always_inline u32 *
4354 gen8_emit_fini_breadcrumb_footer(struct i915_request *request,
4355                                  u32 *cs)
4356 {
4357         *cs++ = MI_USER_INTERRUPT;
4358
4359         *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4360         if (intel_engine_has_semaphores(request->engine))
4361                 cs = emit_preempt_busywait(request, cs);
4362
4363         request->tail = intel_ring_offset(request, cs);
4364         assert_ring_tail_valid(request->ring, request->tail);
4365
4366         return gen8_emit_wa_tail(request, cs);
4367 }
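
/*
 * The ordering above is deliberate: the user interrupt is raised for the
 * seqno write emitted by the caller, arbitration (disabled at the end of
 * the batch by gen8_emit_bb_start()) is re-enabled, and only then is the
 * preemption busywait emitted, so a preempt-to-busy cycle holds the context
 * at this semaphore rather than somewhere mid-request.
 */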
4368
4369 static u32 *gen8_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
4370 {
4371         cs = gen8_emit_ggtt_write(cs,
4372                                   request->fence.seqno,
4373                                   i915_request_active_timeline(request)->hwsp_offset,
4374                                   0);
4375
4376         return gen8_emit_fini_breadcrumb_footer(request, cs);
4377 }
4378
4379 static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4380 {
4381         cs = gen8_emit_pipe_control(cs,
4382                                     PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4383                                     PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4384                                     PIPE_CONTROL_DC_FLUSH_ENABLE,
4385                                     0);
4386
4387         /* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */
4388         cs = gen8_emit_ggtt_write_rcs(cs,
4389                                       request->fence.seqno,
4390                                       i915_request_active_timeline(request)->hwsp_offset,
4391                                       PIPE_CONTROL_FLUSH_ENABLE |
4392                                       PIPE_CONTROL_CS_STALL);
4393
4394         return gen8_emit_fini_breadcrumb_footer(request, cs);
4395 }
4396
4397 static u32 *
4398 gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4399 {
4400         cs = gen8_emit_ggtt_write_rcs(cs,
4401                                       request->fence.seqno,
4402                                       i915_request_active_timeline(request)->hwsp_offset,
4403                                       PIPE_CONTROL_CS_STALL |
4404                                       PIPE_CONTROL_TILE_CACHE_FLUSH |
4405                                       PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4406                                       PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4407                                       PIPE_CONTROL_DC_FLUSH_ENABLE |
4408                                       PIPE_CONTROL_FLUSH_ENABLE);
4409
4410         return gen8_emit_fini_breadcrumb_footer(request, cs);
4411 }
4412
4413 /*
4414  * Note that the CS instruction pre-parser will not stall on the breadcrumb
4415  * flush and will continue pre-fetching the instructions after it before the
4416  * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at
4417  * BB_START/END instructions, so, even though we might pre-fetch the preamble
4418  * of the next request before the memory has been flushed, we're guaranteed that
4419  * we won't access the batch itself too early.
4420  * However, on gen12+ the parser can pre-fetch across the BB_START/END commands,
4421  * so, if the current request is modifying an instruction in the next request on
4422  * the same intel_context, we might pre-fetch and then execute the pre-update
4423  * instruction. To avoid this, the users of self-modifying code should either
4424  * disable the parser around the code emitting the memory writes, via a new flag
4425  * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For
4426  * the in-kernel use-cases we've opted to use a separate context, see
4427  * reloc_gpu() as an example.
4428  * All the above applies only to the instructions themselves. Non-inline data
4429  * used by the instructions is not pre-fetched.
4430  */
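
/*
 * A sketch of the first option above, purely illustrative (the in-kernel
 * users take the second option and emit such writes from a separate
 * context instead, see reloc_gpu()):
 *
 *	*cs++ = preparser_disable(true);
 *	... MI_STORE_DWORD_IMM writes that patch the next request ...
 *	*cs++ = preparser_disable(false);
 */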
4431
4432 static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs)
4433 {
4434         *cs++ = MI_SEMAPHORE_WAIT_TOKEN |
4435                 MI_SEMAPHORE_GLOBAL_GTT |
4436                 MI_SEMAPHORE_POLL |
4437                 MI_SEMAPHORE_SAD_EQ_SDD;
4438         *cs++ = 0;
4439         *cs++ = intel_hws_preempt_address(request->engine);
4440         *cs++ = 0;
4441         *cs++ = 0;
4442         *cs++ = MI_NOOP;
4443
4444         return cs;
4445 }
4446
4447 static __always_inline u32 *
4448 gen12_emit_fini_breadcrumb_footer(struct i915_request *request, u32 *cs)
4449 {
4450         *cs++ = MI_USER_INTERRUPT;
4451
4452         *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4453         if (intel_engine_has_semaphores(request->engine))
4454                 cs = gen12_emit_preempt_busywait(request, cs);
4455
4456         request->tail = intel_ring_offset(request, cs);
4457         assert_ring_tail_valid(request->ring, request->tail);
4458
4459         return gen8_emit_wa_tail(request, cs);
4460 }
4461
4462 static u32 *gen12_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
4463 {
4464         cs = gen8_emit_ggtt_write(cs,
4465                                   request->fence.seqno,
4466                                   i915_request_active_timeline(request)->hwsp_offset,
4467                                   0);
4468
4469         return gen12_emit_fini_breadcrumb_footer(request, cs);
4470 }
4471
4472 static u32 *
4473 gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4474 {
4475         cs = gen8_emit_ggtt_write_rcs(cs,
4476                                       request->fence.seqno,
4477                                       i915_request_active_timeline(request)->hwsp_offset,
4478                                       PIPE_CONTROL_CS_STALL |
4479                                       PIPE_CONTROL_TILE_CACHE_FLUSH |
4480                                       PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4481                                       PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4482                                       /* Wa_1409600907:tgl */
4483                                       PIPE_CONTROL_DEPTH_STALL |
4484                                       PIPE_CONTROL_DC_FLUSH_ENABLE |
4485                                       PIPE_CONTROL_FLUSH_ENABLE |
4486                                       PIPE_CONTROL_HDC_PIPELINE_FLUSH);
4487
4488         return gen12_emit_fini_breadcrumb_footer(request, cs);
4489 }
4490
4491 static void execlists_park(struct intel_engine_cs *engine)
4492 {
4493         cancel_timer(&engine->execlists.timer);
4494         cancel_timer(&engine->execlists.preempt);
4495 }
4496
4497 void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
4498 {
4499         engine->submit_request = execlists_submit_request;
4500         engine->schedule = i915_schedule;
4501         engine->execlists.tasklet.func = execlists_submission_tasklet;
4502
4503         engine->reset.prepare = execlists_reset_prepare;
4504         engine->reset.rewind = execlists_reset_rewind;
4505         engine->reset.cancel = execlists_reset_cancel;
4506         engine->reset.finish = execlists_reset_finish;
4507
4508         engine->park = execlists_park;
4509         engine->unpark = NULL;
4510
4511         engine->flags |= I915_ENGINE_SUPPORTS_STATS;
4512         if (!intel_vgpu_active(engine->i915)) {
4513                 engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
4514                 if (HAS_LOGICAL_RING_PREEMPTION(engine->i915))
4515                         engine->flags |= I915_ENGINE_HAS_PREEMPTION;
4516         }
4517
4518         if (INTEL_GEN(engine->i915) >= 12)
4519                 engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO;
4520
4521         if (intel_engine_has_preemption(engine))
4522                 engine->emit_bb_start = gen8_emit_bb_start;
4523         else
4524                 engine->emit_bb_start = gen8_emit_bb_start_noarb;
4525 }
4526
4527 static void execlists_shutdown(struct intel_engine_cs *engine)
4528 {
4529         /* Synchronise with residual timers and any softirq they raise */
4530         del_timer_sync(&engine->execlists.timer);
4531         del_timer_sync(&engine->execlists.preempt);
4532         tasklet_kill(&engine->execlists.tasklet);
4533 }
4534
4535 static void execlists_release(struct intel_engine_cs *engine)
4536 {
4537         execlists_shutdown(engine);
4538
4539         intel_engine_cleanup_common(engine);
4540         lrc_destroy_wa_ctx(engine);
4541 }
4542
4543 static void
4544 logical_ring_default_vfuncs(struct intel_engine_cs *engine)
4545 {
4546         /* Default vfuncs which can be overridden by each engine. */
4547
4548         engine->resume = execlists_resume;
4549
4550         engine->cops = &execlists_context_ops;
4551         engine->request_alloc = execlists_request_alloc;
4552
4553         engine->emit_flush = gen8_emit_flush;
4554         engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb;
4555         engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb;
4556         if (INTEL_GEN(engine->i915) >= 12)
4557                 engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb;
4558
4559         engine->set_default_submission = intel_execlists_set_default_submission;
4560
4561         if (INTEL_GEN(engine->i915) < 11) {
4562                 engine->irq_enable = gen8_logical_ring_enable_irq;
4563                 engine->irq_disable = gen8_logical_ring_disable_irq;
4564         } else {
4565                 /*
4566                  * TODO: On Gen11 interrupt masks need to be clear
4567                  * to allow C6 entry. Keep interrupts enabled
4568                  * and take the hit of generating extra interrupts
4569                  * until a more refined solution exists.
4570                  */
4571         }
4572 }
4573
4574 static inline void
4575 logical_ring_default_irqs(struct intel_engine_cs *engine)
4576 {
4577         unsigned int shift = 0;
4578
4579         if (INTEL_GEN(engine->i915) < 11) {
4580                 const u8 irq_shifts[] = {
4581                         [RCS0]  = GEN8_RCS_IRQ_SHIFT,
4582                         [BCS0]  = GEN8_BCS_IRQ_SHIFT,
4583                         [VCS0]  = GEN8_VCS0_IRQ_SHIFT,
4584                         [VCS1]  = GEN8_VCS1_IRQ_SHIFT,
4585                         [VECS0] = GEN8_VECS_IRQ_SHIFT,
4586                 };
4587
4588                 shift = irq_shifts[engine->id];
4589         }
4590
4591         engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
4592         engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
4593         engine->irq_keep_mask |= GT_CS_MASTER_ERROR_INTERRUPT << shift;
4594         engine->irq_keep_mask |= GT_WAIT_SEMAPHORE_INTERRUPT << shift;
4595 }
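
/*
 * These masks are consumed by gen8_logical_ring_enable_irq() and
 * gen8_logical_ring_disable_irq() above (pre-gen11): enabling unmasks both
 * sets in RING_IMR, while disabling re-masks everything except
 * irq_keep_mask, i.e. the context-switch, CS-error and semaphore-wait
 * interrupts the submission backend relies on even with user interrupts
 * off.
 */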
4596
4597 static void rcs_submission_override(struct intel_engine_cs *engine)
4598 {
4599         switch (INTEL_GEN(engine->i915)) {
4600         case 12:
4601                 engine->emit_flush = gen12_emit_flush_render;
4602                 engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs;
4603                 break;
4604         case 11:
4605                 engine->emit_flush = gen11_emit_flush_render;
4606                 engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs;
4607                 break;
4608         default:
4609                 engine->emit_flush = gen8_emit_flush_render;
4610                 engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs;
4611                 break;
4612         }
4613 }
4614
4615 int intel_execlists_submission_setup(struct intel_engine_cs *engine)
4616 {
4617         struct intel_engine_execlists * const execlists = &engine->execlists;
4618         struct drm_i915_private *i915 = engine->i915;
4619         struct intel_uncore *uncore = engine->uncore;
4620         u32 base = engine->mmio_base;
4621
4622         tasklet_init(&engine->execlists.tasklet,
4623                      execlists_submission_tasklet, (unsigned long)engine);
4624         timer_setup(&engine->execlists.timer, execlists_timeslice, 0);
4625         timer_setup(&engine->execlists.preempt, execlists_preempt, 0);
4626
4627         logical_ring_default_vfuncs(engine);
4628         logical_ring_default_irqs(engine);
4629
4630         if (engine->class == RENDER_CLASS)
4631                 rcs_submission_override(engine);
4632
4633         if (intel_init_workaround_bb(engine))
4634                 /*
4635                  * We continue even if we fail to initialize WA batch
4636                  * because we only expect rare glitches but nothing
4637                  * critical enough to prevent us from using the GPU.
4638                  */
4639                 drm_err(&i915->drm, "WA batch buffer initialization failed\n");
4640
4641         if (HAS_LOGICAL_RING_ELSQ(i915)) {
4642                 execlists->submit_reg = uncore->regs +
4643                         i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base));
4644                 execlists->ctrl_reg = uncore->regs +
4645                         i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base));
4646         } else {
4647                 execlists->submit_reg = uncore->regs +
4648                         i915_mmio_reg_offset(RING_ELSP(base));
4649         }
4650
4651         execlists->csb_status =
4652                 &engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX];
4653
4654         execlists->csb_write =
4655                 &engine->status_page.addr[intel_hws_csb_write_index(i915)];
4656
4657         if (INTEL_GEN(i915) < 11)
4658                 execlists->csb_size = GEN8_CSB_ENTRIES;
4659         else
4660                 execlists->csb_size = GEN11_CSB_ENTRIES;
4661
4662         reset_csb_pointers(engine);
4663
4664         /* Finally, take ownership and responsibility for cleanup! */
4665         engine->release = execlists_release;
4666
4667         return 0;
4668 }
4669
4670 static u32 intel_lr_indirect_ctx_offset(const struct intel_engine_cs *engine)
4671 {
4672         u32 indirect_ctx_offset;
4673
4674         switch (INTEL_GEN(engine->i915)) {
4675         default:
4676                 MISSING_CASE(INTEL_GEN(engine->i915));
4677                 /* fall through */
4678         case 12:
4679                 indirect_ctx_offset =
4680                         GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4681                 break;
4682         case 11:
4683                 indirect_ctx_offset =
4684                         GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4685                 break;
4686         case 10:
4687                 indirect_ctx_offset =
4688                         GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4689                 break;
4690         case 9:
4691                 indirect_ctx_offset =
4692                         GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4693                 break;
4694         case 8:
4695                 indirect_ctx_offset =
4696                         GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4697                 break;
4698         }
4699
4700         return indirect_ctx_offset;
4701 }
4702
4703
4704 static void init_common_reg_state(u32 * const regs,
4705                                   const struct intel_engine_cs *engine,
4706                                   const struct intel_ring *ring,
4707                                   bool inhibit)
4708 {
4709         u32 ctl;
4710
4711         ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
4712         ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
4713         if (inhibit)
4714                 ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
4715         if (INTEL_GEN(engine->i915) < 11)
4716                 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
4717                                            CTX_CTRL_RS_CTX_ENABLE);
4718         regs[CTX_CONTEXT_CONTROL] = ctl;
4719
4720         regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
4721         regs[CTX_TIMESTAMP] = 0;
4722 }
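
/*
 * CTX_CONTEXT_CONTROL is a masked register: _MASKED_BIT_ENABLE() sets both
 * the bit and its write-enable in the upper 16 bits, _MASKED_BIT_DISABLE()
 * sets only the write-enable. OR'ing in the bare
 * CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT for the inhibit case therefore relies
 * on the write-enable already being present from the disable above.
 */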
4723
4724 static void init_wa_bb_reg_state(u32 * const regs,
4725                                  const struct intel_engine_cs *engine,
4726                                  u32 pos_bb_per_ctx)
4727 {
4728         const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
4729
4730         if (wa_ctx->per_ctx.size) {
4731                 const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
4732
4733                 regs[pos_bb_per_ctx] =
4734                         (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
4735         }
4736
4737         if (wa_ctx->indirect_ctx.size) {
4738                 const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
4739
4740                 regs[pos_bb_per_ctx + 2] =
4741                         (ggtt_offset + wa_ctx->indirect_ctx.offset) |
4742                         (wa_ctx->indirect_ctx.size / CACHELINE_BYTES);
4743
4744                 regs[pos_bb_per_ctx + 4] =
4745                         intel_lr_indirect_ctx_offset(engine) << 6;
4746         }
4747 }
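
/*
 * pos_bb_per_ctx indexes the value slot of the per-context batch buffer
 * pointer within the MI_LOAD_REGISTER_IMM stream described in
 * execlists_init_reg_state() below; as each register occupies an
 * (offset, value) pair, the +2 and +4 above land on the value slots of the
 * next two registers: the indirect-ctx pointer and its offset.
 */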
4748
4749 static void init_ppgtt_reg_state(u32 *regs, const struct i915_ppgtt *ppgtt)
4750 {
4751         if (i915_vm_is_4lvl(&ppgtt->vm)) {
4752                 /*
4753                  * 64b PPGTT (48bit canonical): PDP0_DESCRIPTOR contains the
4754                  * base address of PML4; the other PDP descriptors are ignored.
4755                  */
4756                 ASSIGN_CTX_PML4(ppgtt, regs);
4757         } else {
4758                 ASSIGN_CTX_PDP(ppgtt, regs, 3);
4759                 ASSIGN_CTX_PDP(ppgtt, regs, 2);
4760                 ASSIGN_CTX_PDP(ppgtt, regs, 1);
4761                 ASSIGN_CTX_PDP(ppgtt, regs, 0);
4762         }
4763 }
4764
4765 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
4766 {
4767         if (i915_is_ggtt(vm))
4768                 return i915_vm_to_ggtt(vm)->alias;
4769         else
4770                 return i915_vm_to_ppgtt(vm);
4771 }
4772
4773 static void execlists_init_reg_state(u32 *regs,
4774                                      const struct intel_context *ce,
4775                                      const struct intel_engine_cs *engine,
4776                                      const struct intel_ring *ring,
4777                                      bool inhibit)
4778 {
4779         /*
4780          * A context is actually a big batch buffer with several
4781          * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
4782          * values we are setting here are only for the first context restore:
4783          * on a subsequent save, the GPU will recreate this batch buffer with new
4784          * values (including all the missing MI_LOAD_REGISTER_IMM commands that
4785          * we are not initializing here).
4786          *
4787          * Must keep consistent with virtual_update_register_offsets().
4788          */
4789         set_offsets(regs, reg_offsets(engine), engine, inhibit);
4790
4791         init_common_reg_state(regs, engine, ring, inhibit);
4792         init_ppgtt_reg_state(regs, vm_alias(ce->vm));
4793
4794         init_wa_bb_reg_state(regs, engine,
4795                              INTEL_GEN(engine->i915) >= 12 ?
4796                              GEN12_CTX_BB_PER_CTX_PTR :
4797                              CTX_BB_PER_CTX_PTR);
4798
4799         __reset_stop_ring(regs, engine);
4800 }
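
/*
 * Purely illustrative: the image produced by set_offsets() and the init_*()
 * helpers above is a command stream of roughly the following shape (the
 * exact registers, counts and LRI flags vary per gen and engine):
 *
 *	regs[n + 0] = MI_LOAD_REGISTER_IMM(N) | MI_LRI_FORCE_POSTED;
 *	regs[n + 1] = <mmio offset of the first register>;
 *	regs[n + 2] = <its initial value, e.g. from init_common_reg_state()>;
 *	regs[n + 3] = <mmio offset of the second register>;
 *	...
 */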
4801
4802 static int
4803 populate_lr_context(struct intel_context *ce,
4804                     struct drm_i915_gem_object *ctx_obj,
4805                     struct intel_engine_cs *engine,
4806                     struct intel_ring *ring)
4807 {
4808         bool inhibit = true;
4809         void *vaddr;
4810         int ret;
4811
4812         vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
4813         if (IS_ERR(vaddr)) {
4814                 ret = PTR_ERR(vaddr);
4815                 drm_dbg(&engine->i915->drm,
4816                         "Could not map object pages! (%d)\n", ret);
4817                 return ret;
4818         }
4819
4820         set_redzone(vaddr, engine);
4821
4822         if (engine->default_state) {
4823                 void *defaults;
4824
4825                 defaults = i915_gem_object_pin_map(engine->default_state,
4826                                                    I915_MAP_WB);
4827                 if (IS_ERR(defaults)) {
4828                         ret = PTR_ERR(defaults);
4829                         goto err_unpin_ctx;
4830                 }
4831
4832                 memcpy(vaddr, defaults, engine->context_size);
4833                 i915_gem_object_unpin_map(engine->default_state);
4834                 __set_bit(CONTEXT_VALID_BIT, &ce->flags);
4835                 inhibit = false;
4836         }
4837
4838         /* Clear the ppHWSP (inc. per-context counters) */
4839         memset(vaddr, 0, PAGE_SIZE);
4840
4841         /*
4842          * The second page of the context object contains some registers which
4843          * must be set up prior to the first execution.
4844          */
4845         execlists_init_reg_state(vaddr + LRC_STATE_PN * PAGE_SIZE,
4846                                  ce, engine, ring, inhibit);
4847
4848         ret = 0;
4849 err_unpin_ctx:
4850         __i915_gem_object_flush_map(ctx_obj, 0, engine->context_size);
4851         i915_gem_object_unpin_map(ctx_obj);
4852         return ret;
4853 }
4854
4855 static int __execlists_context_alloc(struct intel_context *ce,
4856                                      struct intel_engine_cs *engine)
4857 {
4858         struct drm_i915_gem_object *ctx_obj;
4859         struct intel_ring *ring;
4860         struct i915_vma *vma;
4861         u32 context_size;
4862         int ret;
4863
4864         GEM_BUG_ON(ce->state);
4865         context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
4866
4867         if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
4868                 context_size += I915_GTT_PAGE_SIZE; /* for redzone */
4869
4870         ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size);
4871         if (IS_ERR(ctx_obj))
4872                 return PTR_ERR(ctx_obj);
4873
4874         vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL);
4875         if (IS_ERR(vma)) {
4876                 ret = PTR_ERR(vma);
4877                 goto error_deref_obj;
4878         }
4879
4880         if (!ce->timeline) {
4881                 struct intel_timeline *tl;
4882                 struct i915_vma *hwsp;
4883
4884                 /*
4885                  * Use the static global HWSP for the kernel context, and
4886                  * a dynamically allocated cacheline for everyone else.
4887                  */
4888                 hwsp = NULL;
4889                 if (unlikely(intel_context_is_barrier(ce)))
4890                         hwsp = engine->status_page.vma;
4891
4892                 tl = intel_timeline_create(engine->gt, hwsp);
4893                 if (IS_ERR(tl)) {
4894                         ret = PTR_ERR(tl);
4895                         goto error_deref_obj;
4896                 }
4897
4898                 ce->timeline = tl;
4899         }
4900
4901         ring = intel_engine_create_ring(engine, (unsigned long)ce->ring);
4902         if (IS_ERR(ring)) {
4903                 ret = PTR_ERR(ring);
4904                 goto error_deref_obj;
4905         }
4906
4907         ret = populate_lr_context(ce, ctx_obj, engine, ring);
4908         if (ret) {
4909                 drm_dbg(&engine->i915->drm,
4910                         "Failed to populate LRC: %d\n", ret);
4911                 goto error_ring_free;
4912         }
4913
4914         ce->ring = ring;
4915         ce->state = vma;
4916
4917         return 0;
4918
4919 error_ring_free:
4920         intel_ring_put(ring);
4921 error_deref_obj:
4922         i915_gem_object_put(ctx_obj);
4923         return ret;
4924 }
4925
4926 static struct list_head *virtual_queue(struct virtual_engine *ve)
4927 {
4928         return &ve->base.execlists.default_priolist.requests[0];
4929 }
4930
4931 static void virtual_context_destroy(struct kref *kref)
4932 {
4933         struct virtual_engine *ve =
4934                 container_of(kref, typeof(*ve), context.ref);
4935         unsigned int n;
4936
4937         GEM_BUG_ON(!list_empty(virtual_queue(ve)));
4938         GEM_BUG_ON(ve->request);
4939         GEM_BUG_ON(ve->context.inflight);
4940
4941         for (n = 0; n < ve->num_siblings; n++) {
4942                 struct intel_engine_cs *sibling = ve->siblings[n];
4943                 struct rb_node *node = &ve->nodes[sibling->id].rb;
4944                 unsigned long flags;
4945
4946                 if (RB_EMPTY_NODE(node))
4947                         continue;
4948
4949                 spin_lock_irqsave(&sibling->active.lock, flags);
4950
4951                 /* Detachment is lazily performed in the execlists tasklet */
4952                 if (!RB_EMPTY_NODE(node))
4953                         rb_erase_cached(node, &sibling->execlists.virtual);
4954
4955                 spin_unlock_irqrestore(&sibling->active.lock, flags);
4956         }
4957         GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet));
4958
4959         if (ve->context.state)
4960                 __execlists_context_fini(&ve->context);
4961         intel_context_fini(&ve->context);
4962
4963         intel_engine_free_request_pool(&ve->base);
4964
4965         kfree(ve->bonds);
4966         kfree(ve);
4967 }
4968
4969 static void virtual_engine_initial_hint(struct virtual_engine *ve)
4970 {
4971         int swp;
4972
4973         /*
4974          * Pick a random sibling on starting to help spread the load around.
4975          *
4976          * New contexts are typically created with exactly the same order
4977          * of siblings, and often started in batches. Due to the way we iterate
4978          * the array of siblings when submitting requests, sibling[0] is
4979          * prioritised for dequeuing. If we make sure that sibling[0] is fairly
4980          * randomised across the system, we also help spread the load by the
4981          * first engine we inspect being different each time.
4982          *
4983          * NB: This does not force us to execute on this engine; it will just
4984          * typically be the first we inspect for submission.
4985          */
4986         swp = prandom_u32_max(ve->num_siblings);
4987         if (!swp)
4988                 return;
4989
4990         swap(ve->siblings[swp], ve->siblings[0]);
4991         if (!intel_engine_has_relative_mmio(ve->siblings[0]))
4992                 virtual_update_register_offsets(ve->context.lrc_reg_state,
4993                                                 ve->siblings[0]);
4994 }
4995
4996 static int virtual_context_alloc(struct intel_context *ce)
4997 {
4998         struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4999
5000         return __execlists_context_alloc(ce, ve->siblings[0]);
5001 }
5002
5003 static int virtual_context_pin(struct intel_context *ce)
5004 {
5005         struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5006         int err;
5007
5008         /* Note: we must use a real engine class for setting up reg state */
5009         err = __execlists_context_pin(ce, ve->siblings[0]);
5010         if (err)
5011                 return err;
5012
5013         virtual_engine_initial_hint(ve);
5014         return 0;
5015 }
5016
5017 static void virtual_context_enter(struct intel_context *ce)
5018 {
5019         struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5020         unsigned int n;
5021
5022         for (n = 0; n < ve->num_siblings; n++)
5023                 intel_engine_pm_get(ve->siblings[n]);
5024
5025         intel_timeline_enter(ce->timeline);
5026 }
5027
5028 static void virtual_context_exit(struct intel_context *ce)
5029 {
5030         struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5031         unsigned int n;
5032
5033         intel_timeline_exit(ce->timeline);
5034
5035         for (n = 0; n < ve->num_siblings; n++)
5036                 intel_engine_pm_put(ve->siblings[n]);
5037 }
5038
5039 static const struct intel_context_ops virtual_context_ops = {
5040         .alloc = virtual_context_alloc,
5041
5042         .pin = virtual_context_pin,
5043         .unpin = execlists_context_unpin,
5044
5045         .enter = virtual_context_enter,
5046         .exit = virtual_context_exit,
5047
5048         .destroy = virtual_context_destroy,
5049 };
5050
5051 static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve)
5052 {
5053         struct i915_request *rq;
5054         intel_engine_mask_t mask;
5055
5056         rq = READ_ONCE(ve->request);
5057         if (!rq)
5058                 return 0;
5059
5060         /* The rq is ready for submission; rq->execution_mask is now stable. */
5061         mask = rq->execution_mask;
5062         if (unlikely(!mask)) {
5063                 /* Invalid selection, submit to a random engine in error */
5064                 i915_request_set_error_once(rq, -ENODEV);
5065                 mask = ve->siblings[0]->mask;
5066         }
5067
5068         ENGINE_TRACE(&ve->base, "rq=%llx:%lld, mask=%x, prio=%d\n",
5069                      rq->fence.context, rq->fence.seqno,
5070                      mask, ve->base.execlists.queue_priority_hint);
5071
5072         return mask;
5073 }
5074
5075 static void virtual_submission_tasklet(unsigned long data)
5076 {
5077         struct virtual_engine * const ve = (struct virtual_engine *)data;
5078         const int prio = READ_ONCE(ve->base.execlists.queue_priority_hint);
5079         intel_engine_mask_t mask;
5080         unsigned int n;
5081
5082         rcu_read_lock();
5083         mask = virtual_submission_mask(ve);
5084         rcu_read_unlock();
5085         if (unlikely(!mask))
5086                 return;
5087
5088         local_irq_disable();
5089         for (n = 0; READ_ONCE(ve->request) && n < ve->num_siblings; n++) {
5090                 struct intel_engine_cs *sibling = ve->siblings[n];
5091                 struct ve_node * const node = &ve->nodes[sibling->id];
5092                 struct rb_node **parent, *rb;
5093                 bool first;
5094
5095                 if (unlikely(!(mask & sibling->mask))) {
5096                         if (!RB_EMPTY_NODE(&node->rb)) {
5097                                 spin_lock(&sibling->active.lock);
5098                                 rb_erase_cached(&node->rb,
5099                                                 &sibling->execlists.virtual);
5100                                 RB_CLEAR_NODE(&node->rb);
5101                                 spin_unlock(&sibling->active.lock);
5102                         }
5103                         continue;
5104                 }
5105
5106                 spin_lock(&sibling->active.lock);
5107
5108                 if (!RB_EMPTY_NODE(&node->rb)) {
5109                         /*
5110                          * Cheat and avoid rebalancing the tree if we can
5111                          * reuse this node in situ.
5112                          */
5113                         first = rb_first_cached(&sibling->execlists.virtual) ==
5114                                 &node->rb;
5115                         if (prio == node->prio || (prio > node->prio && first))
5116                                 goto submit_engine;
5117
5118                         rb_erase_cached(&node->rb, &sibling->execlists.virtual);
5119                 }
5120
5121                 rb = NULL;
5122                 first = true;
5123                 parent = &sibling->execlists.virtual.rb_root.rb_node;
5124                 while (*parent) {
5125                         struct ve_node *other;
5126
5127                         rb = *parent;
5128                         other = rb_entry(rb, typeof(*other), rb);
5129                         if (prio > other->prio) {
5130                                 parent = &rb->rb_left;
5131                         } else {
5132                                 parent = &rb->rb_right;
5133                                 first = false;
5134                         }
5135                 }
5136
5137                 rb_link_node(&node->rb, rb, parent);
5138                 rb_insert_color_cached(&node->rb,
5139                                        &sibling->execlists.virtual,
5140                                        first);
5141
5142 submit_engine:
5143                 GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
5144                 node->prio = prio;
5145                 if (first && prio > sibling->execlists.queue_priority_hint)
5146                         tasklet_hi_schedule(&sibling->execlists.tasklet);
5147
5148                 spin_unlock(&sibling->active.lock);
5149         }
5150         local_irq_enable();
5151 }
5152
5153 static void virtual_submit_request(struct i915_request *rq)
5154 {
5155         struct virtual_engine *ve = to_virtual_engine(rq->engine);
5156         struct i915_request *old;
5157         unsigned long flags;
5158
5159         ENGINE_TRACE(&ve->base, "rq=%llx:%lld\n",
5160                      rq->fence.context,
5161                      rq->fence.seqno);
5162
5163         GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);
5164
5165         spin_lock_irqsave(&ve->base.active.lock, flags);
5166
5167         old = ve->request;
5168         if (old) { /* background completion event from preempt-to-busy */
5169                 GEM_BUG_ON(!i915_request_completed(old));
5170                 __i915_request_submit(old);
5171                 i915_request_put(old);
5172         }
5173
5174         if (i915_request_completed(rq)) {
5175                 __i915_request_submit(rq);
5176
5177                 ve->base.execlists.queue_priority_hint = INT_MIN;
5178                 ve->request = NULL;
5179         } else {
5180                 ve->base.execlists.queue_priority_hint = rq_prio(rq);
5181                 ve->request = i915_request_get(rq);
5182
5183                 GEM_BUG_ON(!list_empty(virtual_queue(ve)));
5184                 list_move_tail(&rq->sched.link, virtual_queue(ve));
5185
5186                 tasklet_schedule(&ve->base.execlists.tasklet);
5187         }
5188
5189         spin_unlock_irqrestore(&ve->base.active.lock, flags);
5190 }
5191
5192 static struct ve_bond *
5193 virtual_find_bond(struct virtual_engine *ve,
5194                   const struct intel_engine_cs *master)
5195 {
5196         int i;
5197
5198         for (i = 0; i < ve->num_bonds; i++) {
5199                 if (ve->bonds[i].master == master)
5200                         return &ve->bonds[i];
5201         }
5202
5203         return NULL;
5204 }
5205
5206 static void
5207 virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
5208 {
5209         struct virtual_engine *ve = to_virtual_engine(rq->engine);
5210         intel_engine_mask_t allowed, exec;
5211         struct ve_bond *bond;
5212
5213         allowed = ~to_request(signal)->engine->mask;
5214
5215         bond = virtual_find_bond(ve, to_request(signal)->engine);
5216         if (bond)
5217                 allowed &= bond->sibling_mask;
5218
5219         /* Restrict the bonded request to run on only the available engines */
5220         exec = READ_ONCE(rq->execution_mask);
5221         while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed))
5222                 ;
5223
5224         /* Prevent the master from being re-run on the bonded engines */
5225         to_request(signal)->execution_mask &= ~allowed;
5226 }
5227
5228 struct intel_context *
5229 intel_execlists_create_virtual(struct intel_engine_cs **siblings,
5230                                unsigned int count)
5231 {
5232         struct virtual_engine *ve;
5233         unsigned int n;
5234         int err;
5235
5236         if (count == 0)
5237                 return ERR_PTR(-EINVAL);
5238
5239         if (count == 1)
5240                 return intel_context_create(siblings[0]);
5241
5242         ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
5243         if (!ve)
5244                 return ERR_PTR(-ENOMEM);
5245
5246         ve->base.i915 = siblings[0]->i915;
5247         ve->base.gt = siblings[0]->gt;
5248         ve->base.uncore = siblings[0]->uncore;
5249         ve->base.id = -1;
5250
5251         ve->base.class = OTHER_CLASS;
5252         ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
5253         ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
5254         ve->base.uabi_instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
5255
5256         /*
5257          * The decision on whether to submit a request using semaphores
5258          * depends on the saturated state of the engine. We only compute
5259          * this during HW submission of the request, and we need this
5260          * state to be globally applied to all requests being submitted
5261          * to this engine. Virtual engines encompass more than one physical
5262          * engine and so we cannot accurately tell in advance if one of those
5263          * engines is already saturated and so cannot afford to use a semaphore
5264          * and be pessimized in priority for doing so -- if we are the only
5265          * context using semaphores after all other clients have stopped, we
5266          * will be starved on the saturated system. Such a global switch for
5267          * semaphores is less than ideal, but alas is the current compromise.
5268          */
5269         ve->base.saturated = ALL_ENGINES;
5270
5271         snprintf(ve->base.name, sizeof(ve->base.name), "virtual");
5272
5273         intel_engine_init_active(&ve->base, ENGINE_VIRTUAL);
5274         intel_engine_init_breadcrumbs(&ve->base);
5275         intel_engine_init_execlists(&ve->base);
5276
5277         ve->base.cops = &virtual_context_ops;
5278         ve->base.request_alloc = execlists_request_alloc;
5279
5280         ve->base.schedule = i915_schedule;
5281         ve->base.submit_request = virtual_submit_request;
5282         ve->base.bond_execute = virtual_bond_execute;
5283
5284         INIT_LIST_HEAD(virtual_queue(ve));
5285         ve->base.execlists.queue_priority_hint = INT_MIN;
5286         tasklet_init(&ve->base.execlists.tasklet,
5287                      virtual_submission_tasklet,
5288                      (unsigned long)ve);
5289
5290         intel_context_init(&ve->context, &ve->base);
5291
5292         for (n = 0; n < count; n++) {
5293                 struct intel_engine_cs *sibling = siblings[n];
5294
5295                 GEM_BUG_ON(!is_power_of_2(sibling->mask));
5296                 if (sibling->mask & ve->base.mask) {
5297                         DRM_DEBUG("duplicate %s entry in load balancer\n",
5298                                   sibling->name);
5299                         err = -EINVAL;
5300                         goto err_put;
5301                 }
5302
5303                 /*
5304                  * The virtual engine implementation is tightly coupled to
5305                  * the execlists backend -- we push out requests directly
5306                  * into a tree inside each physical engine. We could support
5307                  * layering if we handle cloning of the requests and
5308                  * submitting a copy into each backend.
5309                  */
5310                 if (sibling->execlists.tasklet.func !=
5311                     execlists_submission_tasklet) {
5312                         err = -ENODEV;
5313                         goto err_put;
5314                 }
5315
5316                 GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
5317                 RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);
5318
5319                 ve->siblings[ve->num_siblings++] = sibling;
5320                 ve->base.mask |= sibling->mask;
5321
5322                 /*
5323                  * All physical engines must be compatible for their emission
5324                  * functions (as we build the instructions during request
5325                  * construction and do not alter them before submission
5326                  * on the physical engine). We use the engine class as a guide
5327                  * here, although that could be refined.
5328                  */
5329                 if (ve->base.class != OTHER_CLASS) {
5330                         if (ve->base.class != sibling->class) {
5331                                 DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n",
5332                                           sibling->class, ve->base.class);
5333                                 err = -EINVAL;
5334                                 goto err_put;
5335                         }
5336                         continue;
5337                 }
5338
5339                 ve->base.class = sibling->class;
5340                 ve->base.uabi_class = sibling->uabi_class;
5341                 snprintf(ve->base.name, sizeof(ve->base.name),
5342                          "v%dx%d", ve->base.class, count);
5343                 ve->base.context_size = sibling->context_size;
5344
5345                 ve->base.emit_bb_start = sibling->emit_bb_start;
5346                 ve->base.emit_flush = sibling->emit_flush;
5347                 ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
5348                 ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
5349                 ve->base.emit_fini_breadcrumb_dw =
5350                         sibling->emit_fini_breadcrumb_dw;
5351
5352                 ve->base.flags = sibling->flags;
5353         }
5354
5355         ve->base.flags |= I915_ENGINE_IS_VIRTUAL;
5356
5357         return &ve->context;
5358
5359 err_put:
5360         intel_context_put(&ve->context);
5361         return ERR_PTR(err);
5362 }
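
/*
 * A minimal usage sketch, illustrative only (error handling elided; vcs0
 * and vcs1 stand in for real physical engines of the same class):
 *
 *	struct intel_engine_cs *siblings[] = { vcs0, vcs1 };
 *	struct intel_context *ce;
 *
 *	ce = intel_execlists_create_virtual(siblings, ARRAY_SIZE(siblings));
 *	if (IS_ERR(ce))
 *		return PTR_ERR(ce);
 */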
5363
5364 struct intel_context *
5365 intel_execlists_clone_virtual(struct intel_engine_cs *src)
5366 {
5367         struct virtual_engine *se = to_virtual_engine(src);
5368         struct intel_context *dst;
5369
5370         dst = intel_execlists_create_virtual(se->siblings,
5371                                              se->num_siblings);
5372         if (IS_ERR(dst))
5373                 return dst;
5374
5375         if (se->num_bonds) {
5376                 struct virtual_engine *de = to_virtual_engine(dst->engine);
5377
5378                 de->bonds = kmemdup(se->bonds,
5379                                     sizeof(*se->bonds) * se->num_bonds,
5380                                     GFP_KERNEL);
5381                 if (!de->bonds) {
5382                         intel_context_put(dst);
5383                         return ERR_PTR(-ENOMEM);
5384                 }
5385
5386                 de->num_bonds = se->num_bonds;
5387         }
5388
5389         return dst;
5390 }
5391
5392 int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
5393                                      const struct intel_engine_cs *master,
5394                                      const struct intel_engine_cs *sibling)
5395 {
5396         struct virtual_engine *ve = to_virtual_engine(engine);
5397         struct ve_bond *bond;
5398         int n;
5399
5400         /* Sanity check the sibling is part of the virtual engine */
5401         for (n = 0; n < ve->num_siblings; n++)
5402                 if (sibling == ve->siblings[n])
5403                         break;
5404         if (n == ve->num_siblings)
5405                 return -EINVAL;
5406
5407         bond = virtual_find_bond(ve, master);
5408         if (bond) {
5409                 bond->sibling_mask |= sibling->mask;
5410                 return 0;
5411         }
5412
5413         bond = krealloc(ve->bonds,
5414                         sizeof(*bond) * (ve->num_bonds + 1),
5415                         GFP_KERNEL);
5416         if (!bond)
5417                 return -ENOMEM;
5418
5419         bond[ve->num_bonds].master = master;
5420         bond[ve->num_bonds].sibling_mask = sibling->mask;
5421
5422         ve->bonds = bond;
5423         ve->num_bonds++;
5424
5425         return 0;
5426 }
5427
5428 struct intel_engine_cs *
5429 intel_virtual_engine_get_sibling(struct intel_engine_cs *engine,
5430                                  unsigned int sibling)
5431 {
5432         struct virtual_engine *ve = to_virtual_engine(engine);
5433
5434         if (sibling >= ve->num_siblings)
5435                 return NULL;
5436
5437         return ve->siblings[sibling];
5438 }
5439
5440 void intel_execlists_show_requests(struct intel_engine_cs *engine,
5441                                    struct drm_printer *m,
5442                                    void (*show_request)(struct drm_printer *m,
5443                                                         struct i915_request *rq,
5444                                                         const char *prefix),
5445                                    unsigned int max)
5446 {
5447         const struct intel_engine_execlists *execlists = &engine->execlists;
5448         struct i915_request *rq, *last;
5449         unsigned long flags;
5450         unsigned int count;
5451         struct rb_node *rb;
5452
5453         spin_lock_irqsave(&engine->active.lock, flags);
5454
5455         last = NULL;
5456         count = 0;
5457         list_for_each_entry(rq, &engine->active.requests, sched.link) {
5458                 if (count++ < max - 1)
5459                         show_request(m, rq, "\t\tE ");
5460                 else
5461                         last = rq;
5462         }
5463         if (last) {
5464                 if (count > max) {
5465                         drm_printf(m,
5466                                    "\t\t...skipping %d executing requests...\n",
5467                                    count - max);
5468                 }
5469                 show_request(m, last, "\t\tE ");
5470         }
5471
5472         if (execlists->switch_priority_hint != INT_MIN)
5473                 drm_printf(m, "\t\tSwitch priority hint: %d\n",
5474                            READ_ONCE(execlists->switch_priority_hint));
5475         if (execlists->queue_priority_hint != INT_MIN)
5476                 drm_printf(m, "\t\tQueue priority hint: %d\n",
5477                            READ_ONCE(execlists->queue_priority_hint));
5478
5479         last = NULL;
5480         count = 0;
5481         for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
5482                 struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
5483                 int i;
5484
5485                 priolist_for_each_request(rq, p, i) {
5486                         if (count++ < max - 1)
5487                                 show_request(m, rq, "\t\tQ ");
5488                         else
5489                                 last = rq;
5490                 }
5491         }
5492         if (last) {
5493                 if (count > max) {
5494                         drm_printf(m,
5495                                    "\t\t...skipping %d queued requests...\n",
5496                                    count - max);
5497                 }
5498                 show_request(m, last, "\t\tQ ");
5499         }
5500
5501         last = NULL;
5502         count = 0;
5503         for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
5504                 struct virtual_engine *ve =
5505                         rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
5506                 struct i915_request *rq = READ_ONCE(ve->request);
5507
5508                 if (rq) {
5509                         if (count++ < max - 1)
5510                                 show_request(m, rq, "\t\tV ");
5511                         else
5512                                 last = rq;
5513                 }
5514         }
5515         if (last) {
5516                 if (count > max) {
5517                         drm_printf(m,
5518                                    "\t\t...skipping %d virtual requests...\n",
5519                                    count - max);
5520                 }
5521                 show_request(m, last, "\t\tV ");
5522         }
5523
5524         spin_unlock_irqrestore(&engine->active.lock, flags);
5525 }
5526
5527 void intel_lr_context_reset(struct intel_engine_cs *engine,
5528                             struct intel_context *ce,
5529                             u32 head,
5530                             bool scrub)
5531 {
5532         GEM_BUG_ON(!intel_context_is_pinned(ce));
5533
5534         /*
5535          * We want a simple context + ring to execute the breadcrumb update.
5536          * We cannot rely on the context being intact across the GPU hang,
5537          * so clear it and rebuild just what we need for the breadcrumb.
5538          * All pending requests for this context will be zapped, and any
5539          * future request will be after userspace has had the opportunity
5540          * to recreate its own state.
5541          */
5542         if (scrub)
5543                 restore_default_state(ce, engine);
5544
5545         /* Rerun the request; its payload has been neutered (if guilty). */
5546         __execlists_update_reg_state(ce, engine, head);
5547 }
5548
5549 bool
5550 intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine)
5551 {
5552         return engine->set_default_submission ==
5553                intel_execlists_set_default_submission;
5554 }
5555
5556 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
5557 #include "selftest_lrc.c"
5558 #endif