drivers/gpu/drm/i915/gt/intel_lrc.c

   1 /*
   2  * Copyright © 2014 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  *
  23  * Authors:
  24  *    Ben Widawsky <ben@bwidawsk.net>
  25  *    Michel Thierry <michel.thierry@intel.com>
  26  *    Thomas Daniel <thomas.daniel@intel.com>
  27  *    Oscar Mateo <oscar.mateo@intel.com>
  28  *
  29  */
  30
  31 /**
  32  * DOC: Logical Rings, Logical Ring Contexts and Execlists
  33  *
  34  * Motivation:
  35  * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
  36  * These expanded contexts enable a number of new abilities, especially
  37  * "Execlists" (also implemented in this file).
  38  *
  39  * One of the main differences with the legacy HW contexts is that logical
  40  * ring contexts incorporate many more things to the context's state, like
  41  * PDPs or ringbuffer control registers:
  42  *
  43  * The reason why PDPs are included in the context is straightforward: as
  44  * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
  45  * contained there mean you don't need to do a ppgtt->switch_mm yourself,
  46  * instead, the GPU will do it for you on the context switch.
  47  *
  48  * But, what about the ringbuffer control registers (head, tail, etc..)?
  49  * shouldn't we just need a set of those per engine command streamer? This is
  50  * where the name "Logical Rings" starts to make sense: by virtualizing the
  51  * rings, the engine cs shifts to a new "ring buffer" with every context
  52  * switch. When you want to submit a workload to the GPU you: A) choose your
  53  * context, B) find its appropriate virtualized ring, C) write commands to it
  54  * and then, finally, D) tell the GPU to switch to that context.
  55  *
  56  * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
  57  * to a contexts is via a context execution list, ergo "Execlists".
  58  *
  59  * LRC implementation:
  60  * Regarding the creation of contexts, we have:
  61  *
  62  * - One global default context.
  63  * - One local default context for each opened fd.
  64  * - One local extra context for each context create ioctl call.
  65  *
  66  * Now that ringbuffers belong per-context (and not per-engine, like before)
  67  * and that contexts are uniquely tied to a given engine (and not reusable,
  68  * like before) we need:
  69  *
  70  * - One ringbuffer per-engine inside each context.
  71  * - One backing object per-engine inside each context.
  72  *
  73  * The global default context starts its life with these new objects fully
  74  * allocated and populated. The local default context for each opened fd is
  75  * more complex, because we don't know at creation time which engine is going
  76  * to use them. To handle this, we have implemented a deferred creation of LR
  77  * contexts:
  78  *
  79  * The local context starts its life as a hollow or blank holder, that only
  80  * gets populated for a given engine once we receive an execbuffer. If later
  81  * on we receive another execbuffer ioctl for the same context but a different
  82  * engine, we allocate/populate a new ringbuffer and context backing object and
  83  * so on.
  84  *
  85  * Finally, regarding local contexts created using the ioctl call: as they are
  86  * only allowed with the render ring, we can allocate & populate them right
  87  * away (no need to defer anything, at least for now).
  88  *
  89  * Execlists implementation:
  90  * Execlists are the new method by which, on gen8+ hardware, workloads are
  91  * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
  92  * This method works as follows:
  93  *
  94  * When a request is committed, its commands (the BB start and any leading or
  95  * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
  96  * for the appropriate context. The tail pointer in the hardware context is not
  97  * updated at this time, but instead, kept by the driver in the ringbuffer
  98  * structure. A structure representing this request is added to a request queue
  99  * for the appropriate engine: this structure contains a copy of the context's
 100  * tail after the request was written to the ring buffer and a pointer to the
 101  * context itself.
 102  *
 103  * If the engine's request queue was empty before the request was added, the
 104  * queue is processed immediately. Otherwise the queue will be processed during
 105  * a context switch interrupt. In any case, elements on the queue will get sent
 106  * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
 107  * globally unique 20-bits submission ID.
 108  *
 109  * When execution of a request completes, the GPU updates the context status
 110  * buffer with a context complete event and generates a context switch interrupt.
 111  * During the interrupt handling, the driver examines the events in the buffer:
 112  * for each context complete event, if the announced ID matches that on the head
 113  * of the request queue, then that request is retired and removed from the queue.
 114  *
 115  * After processing, if any requests were retired and the queue is not empty
 116  * then a new execution list can be submitted. The two requests at the front of
 117  * the queue are next to be submitted but since a context may not occur twice in
 118  * an execution list, if subsequent requests have the same ID as the first then
 119  * the two requests must be combined. This is done simply by discarding requests
 120  * at the head of the queue until either only one requests is left (in which case
 121  * we use a NULL second context) or the first two requests have unique IDs.
 122  *
 123  * By always executing the first two requests in the queue the driver ensures
 124  * that the GPU is kept as busy as possible. In the case where a single context
 125  * completes but a second context is still executing, the request for this second
 126  * context will be at the head of the queue when we remove the first one. This
 127  * request will then be resubmitted along with a new request for a different context,
 128  * which will cause the hardware to continue executing the second request and queue
 129  * the new request (the GPU detects the condition of a context getting preempted
 130  * with the same context and optimizes the context switch flow by not doing
 131  * preemption, but just sampling the new tail pointer).
 132  *
 133  */
 134 #include <linux/interrupt.h>
 135
 136 #include "i915_drv.h"
 137 #include "i915_perf.h"
 138 #include "i915_trace.h"
 139 #include "i915_vgpu.h"
 140 #include "intel_context.h"
 141 #include "intel_engine_pm.h"
 142 #include "intel_gt.h"
 143 #include "intel_gt_pm.h"
 144 #include "intel_gt_requests.h"
 145 #include "intel_lrc_reg.h"
 146 #include "intel_mocs.h"
 147 #include "intel_reset.h"
 148 #include "intel_ring.h"
 149 #include "intel_workarounds.h"
 150
 151 #define RING_EXECLIST_QFULL             (1 << 0x2)
 152 #define RING_EXECLIST1_VALID            (1 << 0x3)
 153 #define RING_EXECLIST0_VALID            (1 << 0x4)
 154 #define RING_EXECLIST_ACTIVE_STATUS     (3 << 0xE)
 155 #define RING_EXECLIST1_ACTIVE           (1 << 0x11)
 156 #define RING_EXECLIST0_ACTIVE           (1 << 0x12)
 157
 158 #define GEN8_CTX_STATUS_IDLE_ACTIVE     (1 << 0)
 159 #define GEN8_CTX_STATUS_PREEMPTED       (1 << 1)
 160 #define GEN8_CTX_STATUS_ELEMENT_SWITCH  (1 << 2)
 161 #define GEN8_CTX_STATUS_ACTIVE_IDLE     (1 << 3)
 162 #define GEN8_CTX_STATUS_COMPLETE        (1 << 4)
 163 #define GEN8_CTX_STATUS_LITE_RESTORE    (1 << 15)
 164
 165 #define GEN8_CTX_STATUS_COMPLETED_MASK \
 166          (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)
 167
 168 #define CTX_DESC_FORCE_RESTORE BIT_ULL(2)
 169
 170 #define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE  (0x1) /* lower csb dword */
 171 #define GEN12_CTX_SWITCH_DETAIL(csb_dw) ((csb_dw) & 0xF) /* upper csb dword */
 172 #define GEN12_CSB_SW_CTX_ID_MASK                GENMASK(25, 15)
 173 #define GEN12_IDLE_CTX_ID               0x7FF
 174 #define GEN12_CSB_CTX_VALID(csb_dw) \
 175         (FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)
 176
 177 /* Typical size of the average request (2 pipecontrols and a MI_BB) */
 178 #define EXECLISTS_REQUEST_SIZE 64 /* bytes */
 179
 180 struct virtual_engine {
 181         struct intel_engine_cs base;
 182         struct intel_context context;
 183
 184         /*
 185          * We allow only a single request through the virtual engine at a time
 186          * (each request in the timeline waits for the completion fence of
 187          * the previous before being submitted). By restricting ourselves to
 188          * only submitting a single request, each request is placed on to a
 189          * physical to maximise load spreading (by virtue of the late greedy
 190          * scheduling -- each real engine takes the next available request
 191          * upon idling).
 192          */
 193         struct i915_request *request;
 194
 195         /*
 196          * We keep a rbtree of available virtual engines inside each physical
 197          * engine, sorted by priority. Here we preallocate the nodes we need
 198          * for the virtual engine, indexed by physical_engine->id.
 199          */
 200         struct ve_node {
 201                 struct rb_node rb;
 202                 int prio;
 203         } nodes[I915_NUM_ENGINES];
 204
 205         /*
 206          * Keep track of bonded pairs -- restrictions upon on our selection
 207          * of physical engines any particular request may be submitted to.
 208          * If we receive a submit-fence from a master engine, we will only
 209          * use one of sibling_mask physical engines.
 210          */
 211         struct ve_bond {
 212                 const struct intel_engine_cs *master;
 213                 intel_engine_mask_t sibling_mask;
 214         } *bonds;
 215         unsigned int num_bonds;
 216
 217         /* And finally, which physical engines this virtual engine maps onto. */
 218         unsigned int num_siblings;
 219         struct intel_engine_cs *siblings[0];
 220 };
 221
 222 static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine)
 223 {
 224         GEM_BUG_ON(!intel_engine_is_virtual(engine));
 225         return container_of(engine, struct virtual_engine, base);
 226 }
 227
 228 static int __execlists_context_alloc(struct intel_context *ce,
 229                                      struct intel_engine_cs *engine);
 230
 231 static void execlists_init_reg_state(u32 *reg_state,
 232                                      const struct intel_context *ce,
 233                                      const struct intel_engine_cs *engine,
 234                                      const struct intel_ring *ring,
 235                                      bool close);
 236 static void
 237 __execlists_update_reg_state(const struct intel_context *ce,
 238                              const struct intel_engine_cs *engine,
 239                              u32 head);
 240
 241 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
 242 {
 243         if (INTEL_GEN(engine->i915) >= 12)
 244                 return 0x60;
 245         else if (INTEL_GEN(engine->i915) >= 9)
 246                 return 0x54;
 247         else if (engine->class == RENDER_CLASS)
 248                 return 0x58;
 249         else
 250                 return -1;
 251 }
 252
 253 static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
 254 {
 255         if (INTEL_GEN(engine->i915) >= 12)
 256                 return 0x74;
 257         else if (INTEL_GEN(engine->i915) >= 9)
 258                 return 0x68;
 259         else if (engine->class == RENDER_CLASS)
 260                 return 0xd8;
 261         else
 262                 return -1;
 263 }
 264
 265 static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
 266 {
 267         if (INTEL_GEN(engine->i915) >= 12)
 268                 return 0x12;
 269         else if (INTEL_GEN(engine->i915) >= 9 || engine->class == RENDER_CLASS)
 270                 return 0x18;
 271         else
 272                 return -1;
 273 }
 274
 275 static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
 276 {
 277         int x;
 278
 279         x = lrc_ring_wa_bb_per_ctx(engine);
 280         if (x < 0)
 281                 return x;
 282
 283         return x + 2;
 284 }
 285
 286 static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
 287 {
 288         int x;
 289
 290         x = lrc_ring_indirect_ptr(engine);
 291         if (x < 0)
 292                 return x;
 293
 294         return x + 2;
 295 }
 296
 297 static u32
 298 lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
 299 {
 300         switch (INTEL_GEN(engine->i915)) {
 301         default:
 302                 MISSING_CASE(INTEL_GEN(engine->i915));
 303                 fallthrough;
 304         case 12:
 305                 return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
 306         case 11:
 307                 return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
 308         case 10:
 309                 return GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
 310         case 9:
 311                 return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
 312         case 8:
 313                 return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
 314         }
 315 }
 316
 317 static u32 intel_context_get_runtime(const struct intel_context *ce)
 318 {
 319         /*
 320          * We can use either ppHWSP[16] which is recorded before the context
 321          * switch (and so excludes the cost of context switches) or use the
 322          * value from the context image itself, which is saved/restored earlier
 323          * and so includes the cost of the save.
 324          */
 325         return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]);
 326 }
 327
 328 static void mark_eio(struct i915_request *rq)
 329 {
 330         if (i915_request_completed(rq))
 331                 return;
 332
 333         GEM_BUG_ON(i915_request_signaled(rq));
 334
 335         i915_request_set_error_once(rq, -EIO);
 336         i915_request_mark_complete(rq);
 337 }
 338
 339 static struct i915_request *
 340 active_request(const struct intel_timeline * const tl, struct i915_request *rq)
 341 {
 342         struct i915_request *active = rq;
 343
 344         rcu_read_lock();
 345         list_for_each_entry_continue_reverse(rq, &tl->requests, link) {
 346                 if (i915_request_completed(rq))
 347                         break;
 348
 349                 active = rq;
 350         }
 351         rcu_read_unlock();
 352
 353         return active;
 354 }
 355
 356 static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine)
 357 {
 358         return (i915_ggtt_offset(engine->status_page.vma) +
 359                 I915_GEM_HWS_PREEMPT_ADDR);
 360 }
 361
 362 static inline void
 363 ring_set_paused(const struct intel_engine_cs *engine, int state)
 364 {
 365         /*
 366          * We inspect HWS_PREEMPT with a semaphore inside
 367          * engine->emit_fini_breadcrumb. If the dword is true,
 368          * the ring is paused as the semaphore will busywait
 369          * until the dword is false.
 370          */
 371         engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state;
 372         if (state)
 373                 wmb();
 374 }
 375
 376 static inline struct i915_priolist *to_priolist(struct rb_node *rb)
 377 {
 378         return rb_entry(rb, struct i915_priolist, node);
 379 }
 380
 381 static inline int rq_prio(const struct i915_request *rq)
 382 {
 383         return READ_ONCE(rq->sched.attr.priority);
 384 }
 385
 386 static int effective_prio(const struct i915_request *rq)
 387 {
 388         int prio = rq_prio(rq);
 389
 390         /*
 391          * If this request is special and must not be interrupted at any
 392          * cost, so be it. Note we are only checking the most recent request
 393          * in the context and so may be masking an earlier vip request. It
 394          * is hoped that under the conditions where nopreempt is used, this
 395          * will not matter (i.e. all requests to that context will be
 396          * nopreempt for as long as desired).
 397          */
 398         if (i915_request_has_nopreempt(rq))
 399                 prio = I915_PRIORITY_UNPREEMPTABLE;
 400
 401         /*
 402          * On unwinding the active request, we give it a priority bump
 403          * if it has completed waiting on any semaphore. If we know that
 404          * the request has already started, we can prevent an unwanted
 405          * preempt-to-idle cycle by taking that into account now.
 406          */
 407         if (__i915_request_has_started(rq))
 408                 prio |= I915_PRIORITY_NOSEMAPHORE;
 409
 410         /* Restrict mere WAIT boosts from triggering preemption */
 411         BUILD_BUG_ON(__NO_PREEMPTION & ~I915_PRIORITY_MASK); /* only internal */
 412         return prio | __NO_PREEMPTION;
 413 }
 414
 415 static int queue_prio(const struct intel_engine_execlists *execlists)
 416 {
 417         struct i915_priolist *p;
 418         struct rb_node *rb;
 419
 420         rb = rb_first_cached(&execlists->queue);
 421         if (!rb)
 422                 return INT_MIN;
 423
 424         /*
 425          * As the priolist[] are inverted, with the highest priority in [0],
 426          * we have to flip the index value to become priority.
 427          */
 428         p = to_priolist(rb);
 429         return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used);
 430 }
 431
 432 static inline bool need_preempt(const struct intel_engine_cs *engine,
 433                                 const struct i915_request *rq,
 434                                 struct rb_node *rb)
 435 {
 436         int last_prio;
 437
 438         if (!intel_engine_has_semaphores(engine))
 439                 return false;
 440
 441         /*
 442          * Check if the current priority hint merits a preemption attempt.
 443          *
 444          * We record the highest value priority we saw during rescheduling
 445          * prior to this dequeue, therefore we know that if it is strictly
 446          * less than the current tail of ESLP[0], we do not need to force
 447          * a preempt-to-idle cycle.
 448          *
 449          * However, the priority hint is a mere hint that we may need to
 450          * preempt. If that hint is stale or we may be trying to preempt
 451          * ourselves, ignore the request.
 452          *
 453          * More naturally we would write
 454          *      prio >= max(0, last);
 455          * except that we wish to prevent triggering preemption at the same
 456          * priority level: the task that is running should remain running
 457          * to preserve FIFO ordering of dependencies.
 458          */
 459         last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1);
 460         if (engine->execlists.queue_priority_hint <= last_prio)
 461                 return false;
 462
 463         /*
 464          * Check against the first request in ELSP[1], it will, thanks to the
 465          * power of PI, be the highest priority of that context.
 466          */
 467         if (!list_is_last(&rq->sched.link, &engine->active.requests) &&
 468             rq_prio(list_next_entry(rq, sched.link)) > last_prio)
 469                 return true;
 470
 471         if (rb) {
 472                 struct virtual_engine *ve =
 473                         rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
 474                 bool preempt = false;
 475
 476                 if (engine == ve->siblings[0]) { /* only preempt one sibling */
 477                         struct i915_request *next;
 478
 479                         rcu_read_lock();
 480                         next = READ_ONCE(ve->request);
 481                         if (next)
 482                                 preempt = rq_prio(next) > last_prio;
 483                         rcu_read_unlock();
 484                 }
 485
 486                 if (preempt)
 487                         return preempt;
 488         }
 489
 490         /*
 491          * If the inflight context did not trigger the preemption, then maybe
 492          * it was the set of queued requests? Pick the highest priority in
 493          * the queue (the first active priolist) and see if it deserves to be
 494          * running instead of ELSP[0].
 495          *
 496          * The highest priority request in the queue can not be either
 497          * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
 498          * context, it's priority would not exceed ELSP[0] aka last_prio.
 499          */
 500         return queue_prio(&engine->execlists) > last_prio;
 501 }
 502
 503 __maybe_unused static inline bool
 504 assert_priority_queue(const struct i915_request *prev,
 505                       const struct i915_request *next)
 506 {
 507         /*
 508          * Without preemption, the prev may refer to the still active element
 509          * which we refuse to let go.
 510          *
 511          * Even with preemption, there are times when we think it is better not
 512          * to preempt and leave an ostensibly lower priority request in flight.
 513          */
 514         if (i915_request_is_active(prev))
 515                 return true;
 516
 517         return rq_prio(prev) >= rq_prio(next);
 518 }
 519
 520 /*
 521  * The context descriptor encodes various attributes of a context,
 522  * including its GTT address and some flags. Because it's fairly
 523  * expensive to calculate, we'll just do it once and cache the result,
 524  * which remains valid until the context is unpinned.
 525  *
 526  * This is what a descriptor looks like, from LSB to MSB::
 527  *
 528  *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
 529  *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
 530  *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
 531  *      bits 53-54:    mbz, reserved for use by hardware
 532  *      bits 55-63:    group ID, currently unused and set to 0
 533  *
 534  * Starting from Gen11, the upper dword of the descriptor has a new format:
 535  *
 536  *      bits 32-36:    reserved
 537  *      bits 37-47:    SW context ID
 538  *      bits 48:53:    engine instance
 539  *      bit 54:        mbz, reserved for use by hardware
 540  *      bits 55-60:    SW counter
 541  *      bits 61-63:    engine class
 542  *
 543  * engine info, SW context ID and SW counter need to form a unique number
 544  * (Context ID) per lrc.
 545  */
 546 static u64
 547 lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine)
 548 {
 549         u64 desc;
 550
 551         desc = INTEL_LEGACY_32B_CONTEXT;
 552         if (i915_vm_is_4lvl(ce->vm))
 553                 desc = INTEL_LEGACY_64B_CONTEXT;
 554         desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
 555
 556         desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
 557         if (IS_GEN(engine->i915, 8))
 558                 desc |= GEN8_CTX_L3LLC_COHERENT;
 559
 560         desc |= i915_ggtt_offset(ce->state); /* bits 12-31 */
 561         /*
 562          * The following 32bits are copied into the OA reports (dword 2).
 563          * Consider updating oa_get_render_ctx_id in i915_perf.c when changing
 564          * anything below.
 565          */
 566         if (INTEL_GEN(engine->i915) >= 11) {
 567                 desc |= (u64)engine->instance << GEN11_ENGINE_INSTANCE_SHIFT;
 568                                                                 /* bits 48-53 */
 569
 570                 desc |= (u64)engine->class << GEN11_ENGINE_CLASS_SHIFT;
 571                                                                 /* bits 61-63 */
 572         }
 573
 574         return desc;
 575 }
 576
 577 static inline unsigned int dword_in_page(void *addr)
 578 {
 579         return offset_in_page(addr) / sizeof(u32);
 580 }
 581
 582 static void set_offsets(u32 *regs,
 583                         const u8 *data,
 584                         const struct intel_engine_cs *engine,
 585                         bool clear)
 586 #define NOP(x) (BIT(7) | (x))
 587 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
 588 #define POSTED BIT(0)
 589 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
 590 #define REG16(x) \
 591         (((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
 592         (((x) >> 2) & 0x7f)
 593 #define END(x) 0, (x)
 594 {
 595         const u32 base = engine->mmio_base;
 596
 597         while (*data) {
 598                 u8 count, flags;
 599
 600                 if (*data & BIT(7)) { /* skip */
 601                         count = *data++ & ~BIT(7);
 602                         if (clear)
 603                                 memset32(regs, MI_NOOP, count);
 604                         regs += count;
 605                         continue;
 606                 }
 607
 608                 count = *data & 0x3f;
 609                 flags = *data >> 6;
 610                 data++;
 611
 612                 *regs = MI_LOAD_REGISTER_IMM(count);
 613                 if (flags & POSTED)
 614                         *regs |= MI_LRI_FORCE_POSTED;
 615                 if (INTEL_GEN(engine->i915) >= 11)
 616                         *regs |= MI_LRI_CS_MMIO;
 617                 regs++;
 618
 619                 GEM_BUG_ON(!count);
 620                 do {
 621                         u32 offset = 0;
 622                         u8 v;
 623
 624                         do {
 625                                 v = *data++;
 626                                 offset <<= 7;
 627                                 offset |= v & ~BIT(7);
 628                         } while (v & BIT(7));
 629
 630                         regs[0] = base + (offset << 2);
 631                         if (clear)
 632                                 regs[1] = 0;
 633                         regs += 2;
 634                 } while (--count);
 635         }
 636
 637         if (clear) {
 638                 u8 count = *++data;
 639
 640                 /* Clear past the tail for HW access */
 641                 GEM_BUG_ON(dword_in_page(regs) > count);
 642                 memset32(regs, MI_NOOP, count - dword_in_page(regs));
 643
 644                 /* Close the batch; used mainly by live_lrc_layout() */
 645                 *regs = MI_BATCH_BUFFER_END;
 646                 if (INTEL_GEN(engine->i915) >= 10)
 647                         *regs |= BIT(0);
 648         }
 649 }
 650
 651 static const u8 gen8_xcs_offsets[] = {
 652         NOP(1),
 653         LRI(11, 0),
 654         REG16(0x244),
 655         REG(0x034),
 656         REG(0x030),
 657         REG(0x038),
 658         REG(0x03c),
 659         REG(0x168),
 660         REG(0x140),
 661         REG(0x110),
 662         REG(0x11c),
 663         REG(0x114),
 664         REG(0x118),
 665
 666         NOP(9),
 667         LRI(9, 0),
 668         REG16(0x3a8),
 669         REG16(0x28c),
 670         REG16(0x288),
 671         REG16(0x284),
 672         REG16(0x280),
 673         REG16(0x27c),
 674         REG16(0x278),
 675         REG16(0x274),
 676         REG16(0x270),
 677
 678         NOP(13),
 679         LRI(2, 0),
 680         REG16(0x200),
 681         REG(0x028),
 682
 683         END(80)
 684 };
 685
 686 static const u8 gen9_xcs_offsets[] = {
 687         NOP(1),
 688         LRI(14, POSTED),
 689         REG16(0x244),
 690         REG(0x034),
 691         REG(0x030),
 692         REG(0x038),
 693         REG(0x03c),
 694         REG(0x168),
 695         REG(0x140),
 696         REG(0x110),
 697         REG(0x11c),
 698         REG(0x114),
 699         REG(0x118),
 700         REG(0x1c0),
 701         REG(0x1c4),
 702         REG(0x1c8),
 703
 704         NOP(3),
 705         LRI(9, POSTED),
 706         REG16(0x3a8),
 707         REG16(0x28c),
 708         REG16(0x288),
 709         REG16(0x284),
 710         REG16(0x280),
 711         REG16(0x27c),
 712         REG16(0x278),
 713         REG16(0x274),
 714         REG16(0x270),
 715
 716         NOP(13),
 717         LRI(1, POSTED),
 718         REG16(0x200),
 719
 720         NOP(13),
 721         LRI(44, POSTED),
 722         REG(0x028),
 723         REG(0x09c),
 724         REG(0x0c0),
 725         REG(0x178),
 726         REG(0x17c),
 727         REG16(0x358),
 728         REG(0x170),
 729         REG(0x150),
 730         REG(0x154),
 731         REG(0x158),
 732         REG16(0x41c),
 733         REG16(0x600),
 734         REG16(0x604),
 735         REG16(0x608),
 736         REG16(0x60c),
 737         REG16(0x610),
 738         REG16(0x614),
 739         REG16(0x618),
 740         REG16(0x61c),
 741         REG16(0x620),
 742         REG16(0x624),
 743         REG16(0x628),
 744         REG16(0x62c),
 745         REG16(0x630),
 746         REG16(0x634),
 747         REG16(0x638),
 748         REG16(0x63c),
 749         REG16(0x640),
 750         REG16(0x644),
 751         REG16(0x648),
 752         REG16(0x64c),
 753         REG16(0x650),
 754         REG16(0x654),
 755         REG16(0x658),
 756         REG16(0x65c),
 757         REG16(0x660),
 758         REG16(0x664),
 759         REG16(0x668),
 760         REG16(0x66c),
 761         REG16(0x670),
 762         REG16(0x674),
 763         REG16(0x678),
 764         REG16(0x67c),
 765         REG(0x068),
 766
 767         END(176)
 768 };
 769
 770 static const u8 gen12_xcs_offsets[] = {
 771         NOP(1),
 772         LRI(13, POSTED),
 773         REG16(0x244),
 774         REG(0x034),
 775         REG(0x030),
 776         REG(0x038),
 777         REG(0x03c),
 778         REG(0x168),
 779         REG(0x140),
 780         REG(0x110),
 781         REG(0x1c0),
 782         REG(0x1c4),
 783         REG(0x1c8),
 784         REG(0x180),
 785         REG16(0x2b4),
 786
 787         NOP(5),
 788         LRI(9, POSTED),
 789         REG16(0x3a8),
 790         REG16(0x28c),
 791         REG16(0x288),
 792         REG16(0x284),
 793         REG16(0x280),
 794         REG16(0x27c),
 795         REG16(0x278),
 796         REG16(0x274),
 797         REG16(0x270),
 798
 799         END(80)
 800 };
 801
 802 static const u8 gen8_rcs_offsets[] = {
 803         NOP(1),
 804         LRI(14, POSTED),
 805         REG16(0x244),
 806         REG(0x034),
 807         REG(0x030),
 808         REG(0x038),
 809         REG(0x03c),
 810         REG(0x168),
 811         REG(0x140),
 812         REG(0x110),
 813         REG(0x11c),
 814         REG(0x114),
 815         REG(0x118),
 816         REG(0x1c0),
 817         REG(0x1c4),
 818         REG(0x1c8),
 819
 820         NOP(3),
 821         LRI(9, POSTED),
 822         REG16(0x3a8),
 823         REG16(0x28c),
 824         REG16(0x288),
 825         REG16(0x284),
 826         REG16(0x280),
 827         REG16(0x27c),
 828         REG16(0x278),
 829         REG16(0x274),
 830         REG16(0x270),
 831
 832         NOP(13),
 833         LRI(1, 0),
 834         REG(0x0c8),
 835
 836         END(80)
 837 };
 838
 839 static const u8 gen9_rcs_offsets[] = {
 840         NOP(1),
 841         LRI(14, POSTED),
 842         REG16(0x244),
 843         REG(0x34),
 844         REG(0x30),
 845         REG(0x38),
 846         REG(0x3c),
 847         REG(0x168),
 848         REG(0x140),
 849         REG(0x110),
 850         REG(0x11c),
 851         REG(0x114),
 852         REG(0x118),
 853         REG(0x1c0),
 854         REG(0x1c4),
 855         REG(0x1c8),
 856
 857         NOP(3),
 858         LRI(9, POSTED),
 859         REG16(0x3a8),
 860         REG16(0x28c),
 861         REG16(0x288),
 862         REG16(0x284),
 863         REG16(0x280),
 864         REG16(0x27c),
 865         REG16(0x278),
 866         REG16(0x274),
 867         REG16(0x270),
 868
 869         NOP(13),
 870         LRI(1, 0),
 871         REG(0xc8),
 872
 873         NOP(13),
 874         LRI(44, POSTED),
 875         REG(0x28),
 876         REG(0x9c),
 877         REG(0xc0),
 878         REG(0x178),
 879         REG(0x17c),
 880         REG16(0x358),
 881         REG(0x170),
 882         REG(0x150),
 883         REG(0x154),
 884         REG(0x158),
 885         REG16(0x41c),
 886         REG16(0x600),
 887         REG16(0x604),
 888         REG16(0x608),
 889         REG16(0x60c),
 890         REG16(0x610),
 891         REG16(0x614),
 892         REG16(0x618),
 893         REG16(0x61c),
 894         REG16(0x620),
 895         REG16(0x624),
 896         REG16(0x628),
 897         REG16(0x62c),
 898         REG16(0x630),
 899         REG16(0x634),
 900         REG16(0x638),
 901         REG16(0x63c),
 902         REG16(0x640),
 903         REG16(0x644),
 904         REG16(0x648),
 905         REG16(0x64c),
 906         REG16(0x650),
 907         REG16(0x654),
 908         REG16(0x658),
 909         REG16(0x65c),
 910         REG16(0x660),
 911         REG16(0x664),
 912         REG16(0x668),
 913         REG16(0x66c),
 914         REG16(0x670),
 915         REG16(0x674),
 916         REG16(0x678),
 917         REG16(0x67c),
 918         REG(0x68),
 919
 920         END(176)
 921 };
 922
 923 static const u8 gen11_rcs_offsets[] = {
 924         NOP(1),
 925         LRI(15, POSTED),
 926         REG16(0x244),
 927         REG(0x034),
 928         REG(0x030),
 929         REG(0x038),
 930         REG(0x03c),
 931         REG(0x168),
 932         REG(0x140),
 933         REG(0x110),
 934         REG(0x11c),
 935         REG(0x114),
 936         REG(0x118),
 937         REG(0x1c0),
 938         REG(0x1c4),
 939         REG(0x1c8),
 940         REG(0x180),
 941
 942         NOP(1),
 943         LRI(9, POSTED),
 944         REG16(0x3a8),
 945         REG16(0x28c),
 946         REG16(0x288),
 947         REG16(0x284),
 948         REG16(0x280),
 949         REG16(0x27c),
 950         REG16(0x278),
 951         REG16(0x274),
 952         REG16(0x270),
 953
 954         LRI(1, POSTED),
 955         REG(0x1b0),
 956
 957         NOP(10),
 958         LRI(1, 0),
 959         REG(0x0c8),
 960
 961         END(80)
 962 };
 963
 964 static const u8 gen12_rcs_offsets[] = {
 965         NOP(1),
 966         LRI(13, POSTED),
 967         REG16(0x244),
 968         REG(0x034),
 969         REG(0x030),
 970         REG(0x038),
 971         REG(0x03c),
 972         REG(0x168),
 973         REG(0x140),
 974         REG(0x110),
 975         REG(0x1c0),
 976         REG(0x1c4),
 977         REG(0x1c8),
 978         REG(0x180),
 979         REG16(0x2b4),
 980
 981         NOP(5),
 982         LRI(9, POSTED),
 983         REG16(0x3a8),
 984         REG16(0x28c),
 985         REG16(0x288),
 986         REG16(0x284),
 987         REG16(0x280),
 988         REG16(0x27c),
 989         REG16(0x278),
 990         REG16(0x274),
 991         REG16(0x270),
 992
 993         LRI(3, POSTED),
 994         REG(0x1b0),
 995         REG16(0x5a8),
 996         REG16(0x5ac),
 997
 998         NOP(6),
 999         LRI(1, 0),
1000         REG(0x0c8),
1001
1002         END(80)
1003 };
1004
1005 #undef END
1006 #undef REG16
1007 #undef REG
1008 #undef LRI
1009 #undef NOP
1010
1011 static const u8 *reg_offsets(const struct intel_engine_cs *engine)
1012 {
1013         /*
1014          * The gen12+ lists only have the registers we program in the basic
1015          * default state. We rely on the context image using relative
1016          * addressing to automatic fixup the register state between the
1017          * physical engines for virtual engine.
1018          */
1019         GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 &&
1020                    !intel_engine_has_relative_mmio(engine));
1021
1022         if (engine->class == RENDER_CLASS) {
1023                 if (INTEL_GEN(engine->i915) >= 12)
1024                         return gen12_rcs_offsets;
1025                 else if (INTEL_GEN(engine->i915) >= 11)
1026                         return gen11_rcs_offsets;
1027                 else if (INTEL_GEN(engine->i915) >= 9)
1028                         return gen9_rcs_offsets;
1029                 else
1030                         return gen8_rcs_offsets;
1031         } else {
1032                 if (INTEL_GEN(engine->i915) >= 12)
1033                         return gen12_xcs_offsets;
1034                 else if (INTEL_GEN(engine->i915) >= 9)
1035                         return gen9_xcs_offsets;
1036                 else
1037                         return gen8_xcs_offsets;
1038         }
1039 }
1040
1041 static struct i915_request *
1042 __unwind_incomplete_requests(struct intel_engine_cs *engine)
1043 {
1044         struct i915_request *rq, *rn, *active = NULL;
1045         struct list_head *uninitialized_var(pl);
1046         int prio = I915_PRIORITY_INVALID;
1047
1048         lockdep_assert_held(&engine->active.lock);
1049
1050         list_for_each_entry_safe_reverse(rq, rn,
1051                                          &engine->active.requests,
1052                                          sched.link) {
1053                 if (i915_request_completed(rq))
1054                         continue; /* XXX */
1055
1056                 __i915_request_unsubmit(rq);
1057
1058                 /*
1059                  * Push the request back into the queue for later resubmission.
1060                  * If this request is not native to this physical engine (i.e.
1061                  * it came from a virtual source), push it back onto the virtual
1062                  * engine so that it can be moved across onto another physical
1063                  * engine as load dictates.
1064                  */
1065                 if (likely(rq->execution_mask == engine->mask)) {
1066                         GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
1067                         if (rq_prio(rq) != prio) {
1068                                 prio = rq_prio(rq);
1069                                 pl = i915_sched_lookup_priolist(engine, prio);
1070                         }
1071                         GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
1072
1073                         list_move(&rq->sched.link, pl);
1074                         set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
1075
1076                         active = rq;
1077                 } else {
1078                         struct intel_engine_cs *owner = rq->context->engine;
1079
1080                         /*
1081                          * Decouple the virtual breadcrumb before moving it
1082                          * back to the virtual engine -- we don't want the
1083                          * request to complete in the background and try
1084                          * and cancel the breadcrumb on the virtual engine
1085                          * (instead of the old engine where it is linked)!
1086                          */
1087                         if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
1088                                      &rq->fence.flags)) {
1089                                 spin_lock_nested(&rq->lock,
1090                                                  SINGLE_DEPTH_NESTING);
1091                                 i915_request_cancel_breadcrumb(rq);
1092                                 spin_unlock(&rq->lock);
1093                         }
1094                         WRITE_ONCE(rq->engine, owner);
1095                         owner->submit_request(rq);
1096                         active = NULL;
1097                 }
1098         }
1099
1100         return active;
1101 }
1102
1103 struct i915_request *
1104 execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
1105 {
1106         struct intel_engine_cs *engine =
1107                 container_of(execlists, typeof(*engine), execlists);
1108
1109         return __unwind_incomplete_requests(engine);
1110 }
1111
1112 static inline void
1113 execlists_context_status_change(struct i915_request *rq, unsigned long status)
1114 {
1115         /*
1116          * Only used when GVT-g is enabled now. When GVT-g is disabled,
1117          * The compiler should eliminate this function as dead-code.
1118          */
1119         if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
1120                 return;
1121
1122         atomic_notifier_call_chain(&rq->engine->context_status_notifier,
1123                                    status, rq);
1124 }
1125
1126 static void intel_engine_context_in(struct intel_engine_cs *engine)
1127 {
1128         unsigned long flags;
1129
1130         if (READ_ONCE(engine->stats.enabled) == 0)
1131                 return;
1132
1133         write_seqlock_irqsave(&engine->stats.lock, flags);
1134
1135         if (engine->stats.enabled > 0) {
1136                 if (engine->stats.active++ == 0)
1137                         engine->stats.start = ktime_get();
1138                 GEM_BUG_ON(engine->stats.active == 0);
1139         }
1140
1141         write_sequnlock_irqrestore(&engine->stats.lock, flags);
1142 }
1143
1144 static void intel_engine_context_out(struct intel_engine_cs *engine)
1145 {
1146         unsigned long flags;
1147
1148         if (READ_ONCE(engine->stats.enabled) == 0)
1149                 return;
1150
1151         write_seqlock_irqsave(&engine->stats.lock, flags);
1152
1153         if (engine->stats.enabled > 0) {
1154                 ktime_t last;
1155
1156                 if (engine->stats.active && --engine->stats.active == 0) {
1157                         /*
1158                          * Decrement the active context count and in case GPU
1159                          * is now idle add up to the running total.
1160                          */
1161                         last = ktime_sub(ktime_get(), engine->stats.start);
1162
1163                         engine->stats.total = ktime_add(engine->stats.total,
1164                                                         last);
1165                 } else if (engine->stats.active == 0) {
1166                         /*
1167                          * After turning on engine stats, context out might be
1168                          * the first event in which case we account from the
1169                          * time stats gathering was turned on.
1170                          */
1171                         last = ktime_sub(ktime_get(), engine->stats.enabled_at);
1172
1173                         engine->stats.total = ktime_add(engine->stats.total,
1174                                                         last);
1175                 }
1176         }
1177
1178         write_sequnlock_irqrestore(&engine->stats.lock, flags);
1179 }
1180
1181 static void
1182 execlists_check_context(const struct intel_context *ce,
1183                         const struct intel_engine_cs *engine)
1184 {
1185         const struct intel_ring *ring = ce->ring;
1186         u32 *regs = ce->lrc_reg_state;
1187         bool valid = true;
1188         int x;
1189
1190         if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1191                 pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1192                        engine->name,
1193                        regs[CTX_RING_START],
1194                        i915_ggtt_offset(ring->vma));
1195                 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1196                 valid = false;
1197         }
1198
1199         if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1200             (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1201                 pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1202                        engine->name,
1203                        regs[CTX_RING_CTL],
1204                        (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1205                 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1206                 valid = false;
1207         }
1208
1209         x = lrc_ring_mi_mode(engine);
1210         if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1211                 pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1212                        engine->name, regs[x + 1]);
1213                 regs[x + 1] &= ~STOP_RING;
1214                 regs[x + 1] |= STOP_RING << 16;
1215                 valid = false;
1216         }
1217
1218         WARN_ONCE(!valid, "Invalid lrc state found before submission\n");
1219 }
1220
1221 static void restore_default_state(struct intel_context *ce,
1222                                   struct intel_engine_cs *engine)
1223 {
1224         u32 *regs = ce->lrc_reg_state;
1225
1226         if (engine->pinned_default_state)
1227                 memcpy(regs, /* skip restoring the vanilla PPHWSP */
1228                        engine->pinned_default_state + LRC_STATE_OFFSET,
1229                        engine->context_size - PAGE_SIZE);
1230
1231         execlists_init_reg_state(regs, ce, engine, ce->ring, false);
1232         ce->runtime.last = intel_context_get_runtime(ce);
1233 }
1234
1235 static void reset_active(struct i915_request *rq,
1236                          struct intel_engine_cs *engine)
1237 {
1238         struct intel_context * const ce = rq->context;
1239         u32 head;
1240
1241         /*
1242          * The executing context has been cancelled. We want to prevent
1243          * further execution along this context and propagate the error on
1244          * to anything depending on its results.
1245          *
1246          * In __i915_request_submit(), we apply the -EIO and remove the
1247          * requests' payloads for any banned requests. But first, we must
1248          * rewind the context back to the start of the incomplete request so
1249          * that we do not jump back into the middle of the batch.
1250          *
1251          * We preserve the breadcrumbs and semaphores of the incomplete
1252          * requests so that inter-timeline dependencies (i.e other timelines)
1253          * remain correctly ordered. And we defer to __i915_request_submit()
1254          * so that all asynchronous waits are correctly handled.
1255          */
1256         ENGINE_TRACE(engine, "{ rq=%llx:%lld }\n",
1257                      rq->fence.context, rq->fence.seqno);
1258
1259         /* On resubmission of the active request, payload will be scrubbed */
1260         if (i915_request_completed(rq))
1261                 head = rq->tail;
1262         else
1263                 head = active_request(ce->timeline, rq)->head;
1264         head = intel_ring_wrap(ce->ring, head);
1265
1266         /* Scrub the context image to prevent replaying the previous batch */
1267         restore_default_state(ce, engine);
1268         __execlists_update_reg_state(ce, engine, head);
1269
1270         /* We've switched away, so this should be a no-op, but intent matters */
1271         ce->lrc_desc |= CTX_DESC_FORCE_RESTORE;
1272 }
1273
1274 static void st_update_runtime_underflow(struct intel_context *ce, s32 dt)
1275 {
1276 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1277         ce->runtime.num_underflow += dt < 0;
1278         ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt);
1279 #endif
1280 }
1281
1282 static void intel_context_update_runtime(struct intel_context *ce)
1283 {
1284         u32 old;
1285         s32 dt;
1286
1287         if (intel_context_is_barrier(ce))
1288                 return;
1289
1290         old = ce->runtime.last;
1291         ce->runtime.last = intel_context_get_runtime(ce);
1292         dt = ce->runtime.last - old;
1293
1294         if (unlikely(dt <= 0)) {
1295                 CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
1296                          old, ce->runtime.last, dt);
1297                 st_update_runtime_underflow(ce, dt);
1298                 return;
1299         }
1300
1301         ewma_runtime_add(&ce->runtime.avg, dt);
1302         ce->runtime.total += dt;
1303 }
1304
1305 static inline struct intel_engine_cs *
1306 __execlists_schedule_in(struct i915_request *rq)
1307 {
1308         struct intel_engine_cs * const engine = rq->engine;
1309         struct intel_context * const ce = rq->context;
1310
1311         intel_context_get(ce);
1312
1313         if (unlikely(intel_context_is_banned(ce)))
1314                 reset_active(rq, engine);
1315
1316         if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1317                 execlists_check_context(ce, engine);
1318
1319         ce->lrc_desc &= ~GENMASK_ULL(47, 37);
1320         if (ce->tag) {
1321                 /* Use a fixed tag for OA and friends */
1322                 ce->lrc_desc |= (u64)ce->tag << 32;
1323         } else {
1324                 /* We don't need a strict matching tag, just different values */
1325                 ce->lrc_desc |=
1326                         (u64)(++engine->context_tag % NUM_CONTEXT_TAG) <<
1327                         GEN11_SW_CTX_ID_SHIFT;
1328                 BUILD_BUG_ON(NUM_CONTEXT_TAG > GEN12_MAX_CONTEXT_HW_ID);
1329         }
1330
1331         __intel_gt_pm_get(engine->gt);
1332         execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
1333         intel_engine_context_in(engine);
1334
1335         return engine;
1336 }
1337
1338 static inline struct i915_request *
1339 execlists_schedule_in(struct i915_request *rq, int idx)
1340 {
1341         struct intel_context * const ce = rq->context;
1342         struct intel_engine_cs *old;
1343
1344         GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine));
1345         trace_i915_request_in(rq, idx);
1346
1347         old = READ_ONCE(ce->inflight);
1348         do {
1349                 if (!old) {
1350                         WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq));
1351                         break;
1352                 }
1353         } while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old)));
1354
1355         GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
1356         return i915_request_get(rq);
1357 }
1358
1359 static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
1360 {
1361         struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
1362         struct i915_request *next = READ_ONCE(ve->request);
1363
1364         if (next && next->execution_mask & ~rq->execution_mask)
1365                 tasklet_schedule(&ve->base.execlists.tasklet);
1366 }
1367
1368 static inline void
1369 __execlists_schedule_out(struct i915_request *rq,
1370                          struct intel_engine_cs * const engine)
1371 {
1372         struct intel_context * const ce = rq->context;
1373
1374         /*
1375          * NB process_csb() is not under the engine->active.lock and hence
1376          * schedule_out can race with schedule_in meaning that we should
1377          * refrain from doing non-trivial work here.
1378          */
1379
1380         /*
1381          * If we have just completed this context, the engine may now be
1382          * idle and we want to re-enter powersaving.
1383          */
1384         if (list_is_last_rcu(&rq->link, &ce->timeline->requests) &&
1385             i915_request_completed(rq))
1386                 intel_engine_add_retire(engine, ce->timeline);
1387
1388         intel_context_update_runtime(ce);
1389         intel_engine_context_out(engine);
1390         execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
1391         intel_gt_pm_put_async(engine->gt);
1392
1393         /*
1394          * If this is part of a virtual engine, its next request may
1395          * have been blocked waiting for access to the active context.
1396          * We have to kick all the siblings again in case we need to
1397          * switch (e.g. the next request is not runnable on this
1398          * engine). Hopefully, we will already have submitted the next
1399          * request before the tasklet runs and do not need to rebuild
1400          * each virtual tree and kick everyone again.
1401          */
1402         if (ce->engine != engine)
1403                 kick_siblings(rq, ce);
1404
1405         intel_context_put(ce);
1406 }
1407
1408 static inline void
1409 execlists_schedule_out(struct i915_request *rq)
1410 {
1411         struct intel_context * const ce = rq->context;
1412         struct intel_engine_cs *cur, *old;
1413
1414         trace_i915_request_out(rq);
1415
1416         old = READ_ONCE(ce->inflight);
1417         do
1418                 cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL;
1419         while (!try_cmpxchg(&ce->inflight, &old, cur));
1420         if (!cur)
1421                 __execlists_schedule_out(rq, old);
1422
1423         i915_request_put(rq);
1424 }
1425
1426 static u64 execlists_update_context(struct i915_request *rq)
1427 {
1428         struct intel_context *ce = rq->context;
1429         u64 desc = ce->lrc_desc;
1430         u32 tail, prev;
1431
1432         /*
1433          * WaIdleLiteRestore:bdw,skl
1434          *
1435          * We should never submit the context with the same RING_TAIL twice
1436          * just in case we submit an empty ring, which confuses the HW.
1437          *
1438          * We append a couple of NOOPs (gen8_emit_wa_tail) after the end of
1439          * the normal request to be able to always advance the RING_TAIL on
1440          * subsequent resubmissions (for lite restore). Should that fail us,
1441          * and we try and submit the same tail again, force the context
1442          * reload.
1443          *
1444          * If we need to return to a preempted context, we need to skip the
1445          * lite-restore and force it to reload the RING_TAIL. Otherwise, the
1446          * HW has a tendency to ignore us rewinding the TAIL to the end of
1447          * an earlier request.
1448          */
1449         tail = intel_ring_set_tail(rq->ring, rq->tail);
1450         prev = ce->lrc_reg_state[CTX_RING_TAIL];
1451         if (unlikely(intel_ring_direction(rq->ring, tail, prev) <= 0))
1452                 desc |= CTX_DESC_FORCE_RESTORE;
1453         ce->lrc_reg_state[CTX_RING_TAIL] = tail;
1454         rq->tail = rq->wa_tail;
1455
1456         /*
1457          * Make sure the context image is complete before we submit it to HW.
1458          *
1459          * Ostensibly, writes (including the WCB) should be flushed prior to
1460          * an uncached write such as our mmio register access, the empirical
1461          * evidence (esp. on Braswell) suggests that the WC write into memory
1462          * may not be visible to the HW prior to the completion of the UC
1463          * register write and that we may begin execution from the context
1464          * before its image is complete leading to invalid PD chasing.
1465          */
1466         wmb();
1467
1468         ce->lrc_desc &= ~CTX_DESC_FORCE_RESTORE;
1469         return desc;
1470 }
1471
1472 static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
1473 {
1474         if (execlists->ctrl_reg) {
1475                 writel(lower_32_bits(desc), execlists->submit_reg + port * 2);
1476                 writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1);
1477         } else {
1478                 writel(upper_32_bits(desc), execlists->submit_reg);
1479                 writel(lower_32_bits(desc), execlists->submit_reg);
1480         }
1481 }
1482
1483 static __maybe_unused char *
1484 dump_port(char *buf, int buflen, const char *prefix, struct i915_request *rq)
1485 {
1486         if (!rq)
1487                 return "";
1488
1489         snprintf(buf, buflen, "%s%llx:%lld%s prio %d",
1490                  prefix,
1491                  rq->fence.context, rq->fence.seqno,
1492                  i915_request_completed(rq) ? "!" :
1493                  i915_request_started(rq) ? "*" :
1494                  "",
1495                  rq_prio(rq));
1496
1497         return buf;
1498 }
1499
1500 static __maybe_unused void
1501 trace_ports(const struct intel_engine_execlists *execlists,
1502             const char *msg,
1503             struct i915_request * const *ports)
1504 {
1505         const struct intel_engine_cs *engine =
1506                 container_of(execlists, typeof(*engine), execlists);
1507         char __maybe_unused p0[40], p1[40];
1508
1509         if (!ports[0])
1510                 return;
1511
1512         ENGINE_TRACE(engine, "%s { %s%s }\n", msg,
1513                      dump_port(p0, sizeof(p0), "", ports[0]),
1514                      dump_port(p1, sizeof(p1), ", ", ports[1]));
1515 }
1516
1517 static inline bool
1518 reset_in_progress(const struct intel_engine_execlists *execlists)
1519 {
1520         return unlikely(!__tasklet_is_enabled(&execlists->tasklet));
1521 }
1522
1523 static __maybe_unused bool
1524 assert_pending_valid(const struct intel_engine_execlists *execlists,
1525                      const char *msg)
1526 {
1527         struct i915_request * const *port, *rq;
1528         struct intel_context *ce = NULL;
1529         bool sentinel = false;
1530
1531         trace_ports(execlists, msg, execlists->pending);
1532
1533         /* We may be messing around with the lists during reset, lalala */
1534         if (reset_in_progress(execlists))
1535                 return true;
1536
1537         if (!execlists->pending[0]) {
1538                 GEM_TRACE_ERR("Nothing pending for promotion!\n");
1539                 return false;
1540         }
1541
1542         if (execlists->pending[execlists_num_ports(execlists)]) {
1543                 GEM_TRACE_ERR("Excess pending[%d] for promotion!\n",
1544                               execlists_num_ports(execlists));
1545                 return false;
1546         }
1547
1548         for (port = execlists->pending; (rq = *port); port++) {
1549                 unsigned long flags;
1550                 bool ok = true;
1551
1552                 GEM_BUG_ON(!kref_read(&rq->fence.refcount));
1553                 GEM_BUG_ON(!i915_request_is_active(rq));
1554
1555                 if (ce == rq->context) {
1556                         GEM_TRACE_ERR("Dup context:%llx in pending[%zd]\n",
1557                                       ce->timeline->fence_context,
1558                                       port - execlists->pending);
1559                         return false;
1560                 }
1561                 ce = rq->context;
1562
1563                 /*
1564                  * Sentinels are supposed to be lonely so they flush the
1565                  * current exection off the HW. Check that they are the
1566                  * only request in the pending submission.
1567                  */
1568                 if (sentinel) {
1569                         GEM_TRACE_ERR("context:%llx after sentinel in pending[%zd]\n",
1570                                       ce->timeline->fence_context,
1571                                       port - execlists->pending);
1572                         return false;
1573                 }
1574
1575                 sentinel = i915_request_has_sentinel(rq);
1576                 if (sentinel && port != execlists->pending) {
1577                         GEM_TRACE_ERR("sentinel context:%llx not in prime position[%zd]\n",
1578                                       ce->timeline->fence_context,
1579                                       port - execlists->pending);
1580                         return false;
1581                 }
1582
1583                 /* Hold tightly onto the lock to prevent concurrent retires! */
1584                 if (!spin_trylock_irqsave(&rq->lock, flags))
1585                         continue;
1586
1587                 if (i915_request_completed(rq))
1588                         goto unlock;
1589
1590                 if (i915_active_is_idle(&ce->active) &&
1591                     !intel_context_is_barrier(ce)) {
1592                         GEM_TRACE_ERR("Inactive context:%llx in pending[%zd]\n",
1593                                       ce->timeline->fence_context,
1594                                       port - execlists->pending);
1595                         ok = false;
1596                         goto unlock;
1597                 }
1598
1599                 if (!i915_vma_is_pinned(ce->state)) {
1600                         GEM_TRACE_ERR("Unpinned context:%llx in pending[%zd]\n",
1601                                       ce->timeline->fence_context,
1602                                       port - execlists->pending);
1603                         ok = false;
1604                         goto unlock;
1605                 }
1606
1607                 if (!i915_vma_is_pinned(ce->ring->vma)) {
1608                         GEM_TRACE_ERR("Unpinned ring:%llx in pending[%zd]\n",
1609                                       ce->timeline->fence_context,
1610                                       port - execlists->pending);
1611                         ok = false;
1612                         goto unlock;
1613                 }
1614
1615 unlock:
1616                 spin_unlock_irqrestore(&rq->lock, flags);
1617                 if (!ok)
1618                         return false;
1619         }
1620
1621         return ce;
1622 }
1623
1624 static void execlists_submit_ports(struct intel_engine_cs *engine)
1625 {
1626         struct intel_engine_execlists *execlists = &engine->execlists;
1627         unsigned int n;
1628
1629         GEM_BUG_ON(!assert_pending_valid(execlists, "submit"));
1630
1631         /*
1632          * We can skip acquiring intel_runtime_pm_get() here as it was taken
1633          * on our behalf by the request (see i915_gem_mark_busy()) and it will
1634          * not be relinquished until the device is idle (see
1635          * i915_gem_idle_work_handler()). As a precaution, we make sure
1636          * that all ELSP are drained i.e. we have processed the CSB,
1637          * before allowing ourselves to idle and calling intel_runtime_pm_put().
1638          */
1639         GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
1640
1641         /*
1642          * ELSQ note: the submit queue is not cleared after being submitted
1643          * to the HW so we need to make sure we always clean it up. This is
1644          * currently ensured by the fact that we always write the same number
1645          * of elsq entries, keep this in mind before changing the loop below.
1646          */
1647         for (n = execlists_num_ports(execlists); n--; ) {
1648                 struct i915_request *rq = execlists->pending[n];
1649
1650                 write_desc(execlists,
1651                            rq ? execlists_update_context(rq) : 0,
1652                            n);
1653         }
1654
1655         /* we need to manually load the submit queue */
1656         if (execlists->ctrl_reg)
1657                 writel(EL_CTRL_LOAD, execlists->ctrl_reg);
1658 }
1659
1660 static bool ctx_single_port_submission(const struct intel_context *ce)
1661 {
1662         return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
1663                 intel_context_force_single_submission(ce));
1664 }
1665
1666 static bool can_merge_ctx(const struct intel_context *prev,
1667                           const struct intel_context *next)
1668 {
1669         if (prev != next)
1670                 return false;
1671
1672         if (ctx_single_port_submission(prev))
1673                 return false;
1674
1675         return true;
1676 }
1677
1678 static unsigned long i915_request_flags(const struct i915_request *rq)
1679 {
1680         return READ_ONCE(rq->fence.flags);
1681 }
1682
1683 static bool can_merge_rq(const struct i915_request *prev,
1684                          const struct i915_request *next)
1685 {
1686         GEM_BUG_ON(prev == next);
1687         GEM_BUG_ON(!assert_priority_queue(prev, next));
1688
1689         /*
1690          * We do not submit known completed requests. Therefore if the next
1691          * request is already completed, we can pretend to merge it in
1692          * with the previous context (and we will skip updating the ELSP
1693          * and tracking). Thus hopefully keeping the ELSP full with active
1694          * contexts, despite the best efforts of preempt-to-busy to confuse
1695          * us.
1696          */
1697         if (i915_request_completed(next))
1698                 return true;
1699
1700         if (unlikely((i915_request_flags(prev) ^ i915_request_flags(next)) &
1701                      (BIT(I915_FENCE_FLAG_NOPREEMPT) |
1702                       BIT(I915_FENCE_FLAG_SENTINEL))))
1703                 return false;
1704
1705         if (!can_merge_ctx(prev->context, next->context))
1706                 return false;
1707
1708         GEM_BUG_ON(i915_seqno_passed(prev->fence.seqno, next->fence.seqno));
1709         return true;
1710 }
1711
1712 static void virtual_update_register_offsets(u32 *regs,
1713                                             struct intel_engine_cs *engine)
1714 {
1715         set_offsets(regs, reg_offsets(engine), engine, false);
1716 }
1717
1718 static bool virtual_matches(const struct virtual_engine *ve,
1719                             const struct i915_request *rq,
1720                             const struct intel_engine_cs *engine)
1721 {
1722         const struct intel_engine_cs *inflight;
1723
1724         if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */
1725                 return false;
1726
1727         /*
1728          * We track when the HW has completed saving the context image
1729          * (i.e. when we have seen the final CS event switching out of
1730          * the context) and must not overwrite the context image before
1731          * then. This restricts us to only using the active engine
1732          * while the previous virtualized request is inflight (so
1733          * we reuse the register offsets). This is a very small
1734          * hystersis on the greedy seelction algorithm.
1735          */
1736         inflight = intel_context_inflight(&ve->context);
1737         if (inflight && inflight != engine)
1738                 return false;
1739
1740         return true;
1741 }
1742
1743 static void virtual_xfer_breadcrumbs(struct virtual_engine *ve,
1744                                      struct i915_request *rq)
1745 {
1746         struct intel_engine_cs *old = ve->siblings[0];
1747
1748         /* All unattached (rq->engine == old) must already be completed */
1749
1750         spin_lock(&old->breadcrumbs.irq_lock);
1751         if (!list_empty(&ve->context.signal_link)) {
1752                 list_del_init(&ve->context.signal_link);
1753
1754                 /*
1755                  * We cannot acquire the new engine->breadcrumbs.irq_lock
1756                  * (as we are holding a breadcrumbs.irq_lock already),
1757                  * so attach this request to the signaler on submission.
1758                  * The queued irq_work will occur when we finally drop
1759                  * the engine->active.lock after dequeue.
1760                  */
1761                 set_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &rq->fence.flags);
1762
1763                 /* Also transfer the pending irq_work for the old breadcrumb. */
1764                 intel_engine_signal_breadcrumbs(rq->engine);
1765         }
1766         spin_unlock(&old->breadcrumbs.irq_lock);
1767 }
1768
1769 #define for_each_waiter(p__, rq__) \
1770         list_for_each_entry_lockless(p__, \
1771                                      &(rq__)->sched.waiters_list, \
1772                                      wait_link)
1773
1774 #define for_each_signaler(p__, rq__) \
1775         list_for_each_entry_rcu(p__, \
1776                                 &(rq__)->sched.signalers_list, \
1777                                 signal_link)
1778
1779 static void defer_request(struct i915_request *rq, struct list_head * const pl)
1780 {
1781         LIST_HEAD(list);
1782
1783         /*
1784          * We want to move the interrupted request to the back of
1785          * the round-robin list (i.e. its priority level), but
1786          * in doing so, we must then move all requests that were in
1787          * flight and were waiting for the interrupted request to
1788          * be run after it again.
1789          */
1790         do {
1791                 struct i915_dependency *p;
1792
1793                 GEM_BUG_ON(i915_request_is_active(rq));
1794                 list_move_tail(&rq->sched.link, pl);
1795
1796                 for_each_waiter(p, rq) {
1797                         struct i915_request *w =
1798                                 container_of(p->waiter, typeof(*w), sched);
1799
1800                         /* Leave semaphores spinning on the other engines */
1801                         if (w->engine != rq->engine)
1802                                 continue;
1803
1804                         /* No waiter should start before its signaler */
1805                         GEM_BUG_ON(i915_request_started(w) &&
1806                                    !i915_request_completed(rq));
1807
1808                         GEM_BUG_ON(i915_request_is_active(w));
1809                         if (!i915_request_is_ready(w))
1810                                 continue;
1811
1812                         if (rq_prio(w) < rq_prio(rq))
1813                                 continue;
1814
1815                         GEM_BUG_ON(rq_prio(w) > rq_prio(rq));
1816                         list_move_tail(&w->sched.link, &list);
1817                 }
1818
1819                 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
1820         } while (rq);
1821 }
1822
1823 static void defer_active(struct intel_engine_cs *engine)
1824 {
1825         struct i915_request *rq;
1826
1827         rq = __unwind_incomplete_requests(engine);
1828         if (!rq)
1829                 return;
1830
1831         defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq)));
1832 }
1833
1834 static bool
1835 need_timeslice(const struct intel_engine_cs *engine,
1836                const struct i915_request *rq)
1837 {
1838         int hint;
1839
1840         if (!intel_engine_has_timeslices(engine))
1841                 return false;
1842
1843         hint = engine->execlists.queue_priority_hint;
1844         if (!list_is_last(&rq->sched.link, &engine->active.requests))
1845                 hint = max(hint, rq_prio(list_next_entry(rq, sched.link)));
1846
1847         return hint >= effective_prio(rq);
1848 }
1849
1850 static bool
1851 timeslice_yield(const struct intel_engine_execlists *el,
1852                 const struct i915_request *rq)
1853 {
1854         /*
1855          * Once bitten, forever smitten!
1856          *
1857          * If the active context ever busy-waited on a semaphore,
1858          * it will be treated as a hog until the end of its timeslice (i.e.
1859          * until it is scheduled out and replaced by a new submission,
1860          * possibly even its own lite-restore). The HW only sends an interrupt
1861          * on the first miss, and we do know if that semaphore has been
1862          * signaled, or even if it is now stuck on another semaphore. Play
1863          * safe, yield if it might be stuck -- it will be given a fresh
1864          * timeslice in the near future.
1865          */
1866         return upper_32_bits(rq->context->lrc_desc) == READ_ONCE(el->yield);
1867 }
1868
1869 static bool
1870 timeslice_expired(const struct intel_engine_execlists *el,
1871                   const struct i915_request *rq)
1872 {
1873         return timer_expired(&el->timer) || timeslice_yield(el, rq);
1874 }
1875
1876 static int
1877 switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq)
1878 {
1879         if (list_is_last(&rq->sched.link, &engine->active.requests))
1880                 return INT_MIN;
1881
1882         return rq_prio(list_next_entry(rq, sched.link));
1883 }
1884
1885 static inline unsigned long
1886 timeslice(const struct intel_engine_cs *engine)
1887 {
1888         return READ_ONCE(engine->props.timeslice_duration_ms);
1889 }
1890
1891 static unsigned long active_timeslice(const struct intel_engine_cs *engine)
1892 {
1893         const struct intel_engine_execlists *execlists = &engine->execlists;
1894         const struct i915_request *rq = *execlists->active;
1895
1896         if (!rq || i915_request_completed(rq))
1897                 return 0;
1898
1899         if (READ_ONCE(execlists->switch_priority_hint) < effective_prio(rq))
1900                 return 0;
1901
1902         return timeslice(engine);
1903 }
1904
1905 static void set_timeslice(struct intel_engine_cs *engine)
1906 {
1907         unsigned long duration;
1908
1909         if (!intel_engine_has_timeslices(engine))
1910                 return;
1911
1912         duration = active_timeslice(engine);
1913         ENGINE_TRACE(engine, "bump timeslicing, interval:%lu", duration);
1914
1915         set_timer_ms(&engine->execlists.timer, duration);
1916 }
1917
1918 static void start_timeslice(struct intel_engine_cs *engine)
1919 {
1920         struct intel_engine_execlists *execlists = &engine->execlists;
1921         const int prio = queue_prio(execlists);
1922         unsigned long duration;
1923
1924         if (!intel_engine_has_timeslices(engine))
1925                 return;
1926
1927         WRITE_ONCE(execlists->switch_priority_hint, prio);
1928         if (prio == INT_MIN)
1929                 return;
1930
1931         if (timer_pending(&execlists->timer))
1932                 return;
1933
1934         duration = timeslice(engine);
1935         ENGINE_TRACE(engine,
1936                      "start timeslicing, prio:%d, interval:%lu",
1937                      prio, duration);
1938
1939         set_timer_ms(&execlists->timer, duration);
1940 }
1941
1942 static void record_preemption(struct intel_engine_execlists *execlists)
1943 {
1944         (void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++);
1945 }
1946
1947 static unsigned long active_preempt_timeout(struct intel_engine_cs *engine,
1948                                             const struct i915_request *rq)
1949 {
1950         if (!rq)
1951                 return 0;
1952
1953         /* Force a fast reset for terminated contexts (ignoring sysfs!) */
1954         if (unlikely(intel_context_is_banned(rq->context)))
1955                 return 1;
1956
1957         return READ_ONCE(engine->props.preempt_timeout_ms);
1958 }
1959
1960 static void set_preempt_timeout(struct intel_engine_cs *engine,
1961                                 const struct i915_request *rq)
1962 {
1963         if (!intel_engine_has_preempt_reset(engine))
1964                 return;
1965
1966         set_timer_ms(&engine->execlists.preempt,
1967                      active_preempt_timeout(engine, rq));
1968 }
1969
1970 static inline void clear_ports(struct i915_request **ports, int count)
1971 {
1972         memset_p((void **)ports, NULL, count);
1973 }
1974
1975 static void execlists_dequeue(struct intel_engine_cs *engine)
1976 {
1977         struct intel_engine_execlists * const execlists = &engine->execlists;
1978         struct i915_request **port = execlists->pending;
1979         struct i915_request ** const last_port = port + execlists->port_mask;
1980         struct i915_request * const *active;
1981         struct i915_request *last;
1982         struct rb_node *rb;
1983         bool submit = false;
1984
1985         /*
1986          * Hardware submission is through 2 ports. Conceptually each port
1987          * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
1988          * static for a context, and unique to each, so we only execute
1989          * requests belonging to a single context from each ring. RING_HEAD
1990          * is maintained by the CS in the context image, it marks the place
1991          * where it got up to last time, and through RING_TAIL we tell the CS
1992          * where we want to execute up to this time.
1993          *
1994          * In this list the requests are in order of execution. Consecutive
1995          * requests from the same context are adjacent in the ringbuffer. We
1996          * can combine these requests into a single RING_TAIL update:
1997          *
1998          *              RING_HEAD...req1...req2
1999          *                                    ^- RING_TAIL
2000          * since to execute req2 the CS must first execute req1.
2001          *
2002          * Our goal then is to point each port to the end of a consecutive
2003          * sequence of requests as being the most optimal (fewest wake ups
2004          * and context switches) submission.
2005          */
2006
2007         for (rb = rb_first_cached(&execlists->virtual); rb; ) {
2008                 struct virtual_engine *ve =
2009                         rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
2010                 struct i915_request *rq = READ_ONCE(ve->request);
2011
2012                 if (!rq) { /* lazily cleanup after another engine handled rq */
2013                         rb_erase_cached(rb, &execlists->virtual);
2014                         RB_CLEAR_NODE(rb);
2015                         rb = rb_first_cached(&execlists->virtual);
2016                         continue;
2017                 }
2018
2019                 if (!virtual_matches(ve, rq, engine)) {
2020                         rb = rb_next(rb);
2021                         continue;
2022                 }
2023
2024                 break;
2025         }
2026
2027         /*
2028          * If the queue is higher priority than the last
2029          * request in the currently active context, submit afresh.
2030          * We will resubmit again afterwards in case we need to split
2031          * the active context to interject the preemption request,
2032          * i.e. we will retrigger preemption following the ack in case
2033          * of trouble.
2034          */
2035         active = READ_ONCE(execlists->active);
2036
2037         /*
2038          * In theory we can skip over completed contexts that have not
2039          * yet been processed by events (as those events are in flight):
2040          *
2041          * while ((last = *active) && i915_request_completed(last))
2042          *      active++;
2043          *
2044          * However, the GPU cannot handle this as it will ultimately
2045          * find itself trying to jump back into a context it has just
2046          * completed and barf.
2047          */
2048
2049         if ((last = *active)) {
2050                 if (need_preempt(engine, last, rb)) {
2051                         if (i915_request_completed(last)) {
2052                                 tasklet_hi_schedule(&execlists->tasklet);
2053                                 return;
2054                         }
2055
2056                         ENGINE_TRACE(engine,
2057                                      "preempting last=%llx:%lld, prio=%d, hint=%d\n",
2058                                      last->fence.context,
2059                                      last->fence.seqno,
2060                                      last->sched.attr.priority,
2061                                      execlists->queue_priority_hint);
2062                         record_preemption(execlists);
2063
2064                         /*
2065                          * Don't let the RING_HEAD advance past the breadcrumb
2066                          * as we unwind (and until we resubmit) so that we do
2067                          * not accidentally tell it to go backwards.
2068                          */
2069                         ring_set_paused(engine, 1);
2070
2071                         /*
2072                          * Note that we have not stopped the GPU at this point,
2073                          * so we are unwinding the incomplete requests as they
2074                          * remain inflight and so by the time we do complete
2075                          * the preemption, some of the unwound requests may
2076                          * complete!
2077                          */
2078                         __unwind_incomplete_requests(engine);
2079
2080                         last = NULL;
2081                 } else if (need_timeslice(engine, last) &&
2082                            timeslice_expired(execlists, last)) {
2083                         if (i915_request_completed(last)) {
2084                                 tasklet_hi_schedule(&execlists->tasklet);
2085                                 return;
2086                         }
2087
2088                         ENGINE_TRACE(engine,
2089                                      "expired last=%llx:%lld, prio=%d, hint=%d, yield?=%s\n",
2090                                      last->fence.context,
2091                                      last->fence.seqno,
2092                                      last->sched.attr.priority,
2093                                      execlists->queue_priority_hint,
2094                                      yesno(timeslice_yield(execlists, last)));
2095
2096                         ring_set_paused(engine, 1);
2097                         defer_active(engine);
2098
2099                         /*
2100                          * Unlike for preemption, if we rewind and continue
2101                          * executing the same context as previously active,
2102                          * the order of execution will remain the same and
2103                          * the tail will only advance. We do not need to
2104                          * force a full context restore, as a lite-restore
2105                          * is sufficient to resample the monotonic TAIL.
2106                          *
2107                          * If we switch to any other context, similarly we
2108                          * will not rewind TAIL of current context, and
2109                          * normal save/restore will preserve state and allow
2110                          * us to later continue executing the same request.
2111                          */
2112                         last = NULL;
2113                 } else {
2114                         /*
2115                          * Otherwise if we already have a request pending
2116                          * for execution after the current one, we can
2117                          * just wait until the next CS event before
2118                          * queuing more. In either case we will force a
2119                          * lite-restore preemption event, but if we wait
2120                          * we hopefully coalesce several updates into a single
2121                          * submission.
2122                          */
2123                         if (!list_is_last(&last->sched.link,
2124                                           &engine->active.requests)) {
2125                                 /*
2126                                  * Even if ELSP[1] is occupied and not worthy
2127                                  * of timeslices, our queue might be.
2128                                  */
2129                                 start_timeslice(engine);
2130                                 return;
2131                         }
2132                 }
2133         }
2134
2135         while (rb) { /* XXX virtual is always taking precedence */
2136                 struct virtual_engine *ve =
2137                         rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
2138                 struct i915_request *rq;
2139
2140                 spin_lock(&ve->base.active.lock);
2141
2142                 rq = ve->request;
2143                 if (unlikely(!rq)) { /* lost the race to a sibling */
2144                         spin_unlock(&ve->base.active.lock);
2145                         rb_erase_cached(rb, &execlists->virtual);
2146                         RB_CLEAR_NODE(rb);
2147                         rb = rb_first_cached(&execlists->virtual);
2148                         continue;
2149                 }
2150
2151                 GEM_BUG_ON(rq != ve->request);
2152                 GEM_BUG_ON(rq->engine != &ve->base);
2153                 GEM_BUG_ON(rq->context != &ve->context);
2154
2155                 if (rq_prio(rq) >= queue_prio(execlists)) {
2156                         if (!virtual_matches(ve, rq, engine)) {
2157                                 spin_unlock(&ve->base.active.lock);
2158                                 rb = rb_next(rb);
2159                                 continue;
2160                         }
2161
2162                         if (last && !can_merge_rq(last, rq)) {
2163                                 spin_unlock(&ve->base.active.lock);
2164                                 start_timeslice(engine);
2165                                 return; /* leave this for another sibling */
2166                         }
2167
2168                         ENGINE_TRACE(engine,
2169                                      "virtual rq=%llx:%lld%s, new engine? %s\n",
2170                                      rq->fence.context,
2171                                      rq->fence.seqno,
2172                                      i915_request_completed(rq) ? "!" :
2173                                      i915_request_started(rq) ? "*" :
2174                                      "",
2175                                      yesno(engine != ve->siblings[0]));
2176
2177                         WRITE_ONCE(ve->request, NULL);
2178                         WRITE_ONCE(ve->base.execlists.queue_priority_hint,
2179                                    INT_MIN);
2180                         rb_erase_cached(rb, &execlists->virtual);
2181                         RB_CLEAR_NODE(rb);
2182
2183                         GEM_BUG_ON(!(rq->execution_mask & engine->mask));
2184                         WRITE_ONCE(rq->engine, engine);
2185
2186                         if (engine != ve->siblings[0]) {
2187                                 u32 *regs = ve->context.lrc_reg_state;
2188                                 unsigned int n;
2189
2190                                 GEM_BUG_ON(READ_ONCE(ve->context.inflight));
2191
2192                                 if (!intel_engine_has_relative_mmio(engine))
2193                                         virtual_update_register_offsets(regs,
2194                                                                         engine);
2195
2196                                 if (!list_empty(&ve->context.signals))
2197                                         virtual_xfer_breadcrumbs(ve, rq);
2198
2199                                 /*
2200                                  * Move the bound engine to the top of the list
2201                                  * for future execution. We then kick this
2202                                  * tasklet first before checking others, so that
2203                                  * we preferentially reuse this set of bound
2204                                  * registers.
2205                                  */
2206                                 for (n = 1; n < ve->num_siblings; n++) {
2207                                         if (ve->siblings[n] == engine) {
2208                                                 swap(ve->siblings[n],
2209                                                      ve->siblings[0]);
2210                                                 break;
2211                                         }
2212                                 }
2213
2214                                 GEM_BUG_ON(ve->siblings[0] != engine);
2215                         }
2216
2217                         if (__i915_request_submit(rq)) {
2218                                 submit = true;
2219                                 last = rq;
2220                         }
2221                         i915_request_put(rq);
2222
2223                         /*
2224                          * Hmm, we have a bunch of virtual engine requests,
2225                          * but the first one was already completed (thanks
2226                          * preempt-to-busy!). Keep looking at the veng queue
2227                          * until we have no more relevant requests (i.e.
2228                          * the normal submit queue has higher priority).
2229                          */
2230                         if (!submit) {
2231                                 spin_unlock(&ve->base.active.lock);
2232                                 rb = rb_first_cached(&execlists->virtual);
2233                                 continue;
2234                         }
2235                 }
2236
2237                 spin_unlock(&ve->base.active.lock);
2238                 break;
2239         }
2240
2241         while ((rb = rb_first_cached(&execlists->queue))) {
2242                 struct i915_priolist *p = to_priolist(rb);
2243                 struct i915_request *rq, *rn;
2244                 int i;
2245
2246                 priolist_for_each_request_consume(rq, rn, p, i) {
2247                         bool merge = true;
2248
2249                         /*
2250                          * Can we combine this request with the current port?
2251                          * It has to be the same context/ringbuffer and not
2252                          * have any exceptions (e.g. GVT saying never to
2253                          * combine contexts).
2254                          *
2255                          * If we can combine the requests, we can execute both
2256                          * by updating the RING_TAIL to point to the end of the
2257                          * second request, and so we never need to tell the
2258                          * hardware about the first.
2259                          */
2260                         if (last && !can_merge_rq(last, rq)) {
2261                                 /*
2262                                  * If we are on the second port and cannot
2263                                  * combine this request with the last, then we
2264                                  * are done.
2265                                  */
2266                                 if (port == last_port)
2267                                         goto done;
2268
2269                                 /*
2270                                  * We must not populate both ELSP[] with the
2271                                  * same LRCA, i.e. we must submit 2 different
2272                                  * contexts if we submit 2 ELSP.
2273                                  */
2274                                 if (last->context == rq->context)
2275                                         goto done;
2276
2277                                 if (i915_request_has_sentinel(last))
2278                                         goto done;
2279
2280                                 /*
2281                                  * If GVT overrides us we only ever submit
2282                                  * port[0], leaving port[1] empty. Note that we
2283                                  * also have to be careful that we don't queue
2284                                  * the same context (even though a different
2285                                  * request) to the second port.
2286                                  */
2287                                 if (ctx_single_port_submission(last->context) ||
2288                                     ctx_single_port_submission(rq->context))
2289                                         goto done;
2290
2291                                 merge = false;
2292                         }
2293
2294                         if (__i915_request_submit(rq)) {
2295                                 if (!merge) {
2296                                         *port = execlists_schedule_in(last, port - execlists->pending);
2297                                         port++;
2298                                         last = NULL;
2299                                 }
2300
2301                                 GEM_BUG_ON(last &&
2302                                            !can_merge_ctx(last->context,
2303                                                           rq->context));
2304                                 GEM_BUG_ON(last &&
2305                                            i915_seqno_passed(last->fence.seqno,
2306                                                              rq->fence.seqno));
2307
2308                                 submit = true;
2309                                 last = rq;
2310                         }
2311                 }
2312
2313                 rb_erase_cached(&p->node, &execlists->queue);
2314                 i915_priolist_free(p);
2315         }
2316
2317 done:
2318         /*
2319          * Here be a bit of magic! Or sleight-of-hand, whichever you prefer.
2320          *
2321          * We choose the priority hint such that if we add a request of greater
2322          * priority than this, we kick the submission tasklet to decide on
2323          * the right order of submitting the requests to hardware. We must
2324          * also be prepared to reorder requests as they are in-flight on the
2325          * HW. We derive the priority hint then as the first "hole" in
2326          * the HW submission ports and if there are no available slots,
2327          * the priority of the lowest executing request, i.e. last.
2328          *
2329          * When we do receive a higher priority request ready to run from the
2330          * user, see queue_request(), the priority hint is bumped to that
2331          * request triggering preemption on the next dequeue (or subsequent
2332          * interrupt for secondary ports).
2333          */
2334         execlists->queue_priority_hint = queue_prio(execlists);
2335
2336         if (submit) {
2337                 *port = execlists_schedule_in(last, port - execlists->pending);
2338                 execlists->switch_priority_hint =
2339                         switch_prio(engine, *execlists->pending);
2340
2341                 /*
2342                  * Skip if we ended up with exactly the same set of requests,
2343                  * e.g. trying to timeslice a pair of ordered contexts
2344                  */
2345                 if (!memcmp(active, execlists->pending,
2346                             (port - execlists->pending + 1) * sizeof(*port))) {
2347                         do
2348                                 execlists_schedule_out(fetch_and_zero(port));
2349                         while (port-- != execlists->pending);
2350
2351                         goto skip_submit;
2352                 }
2353                 clear_ports(port + 1, last_port - port);
2354
2355                 WRITE_ONCE(execlists->yield, -1);
2356                 execlists_submit_ports(engine);
2357                 set_preempt_timeout(engine, *active);
2358         } else {
2359 skip_submit:
2360                 ring_set_paused(engine, 0);
2361         }
2362 }
2363
2364 static void
2365 cancel_port_requests(struct intel_engine_execlists * const execlists)
2366 {
2367         struct i915_request * const *port;
2368
2369         for (port = execlists->pending; *port; port++)
2370                 execlists_schedule_out(*port);
2371         clear_ports(execlists->pending, ARRAY_SIZE(execlists->pending));
2372
2373         /* Mark the end of active before we overwrite *active */
2374         for (port = xchg(&execlists->active, execlists->pending); *port; port++)
2375                 execlists_schedule_out(*port);
2376         clear_ports(execlists->inflight, ARRAY_SIZE(execlists->inflight));
2377
2378         smp_wmb(); /* complete the seqlock for execlists_active() */
2379         WRITE_ONCE(execlists->active, execlists->inflight);
2380 }
2381
2382 static inline void
2383 invalidate_csb_entries(const u32 *first, const u32 *last)
2384 {
2385         clflush((void *)first);
2386         clflush((void *)last);
2387 }
2388
2389 /*
2390  * Starting with Gen12, the status has a new format:
2391  *
2392  *     bit  0:     switched to new queue
2393  *     bit  1:     reserved
2394  *     bit  2:     semaphore wait mode (poll or signal), only valid when
2395  *                 switch detail is set to "wait on semaphore"
2396  *     bits 3-5:   engine class
2397  *     bits 6-11:  engine instance
2398  *     bits 12-14: reserved
2399  *     bits 15-25: sw context id of the lrc the GT switched to
2400  *     bits 26-31: sw counter of the lrc the GT switched to
2401  *     bits 32-35: context switch detail
2402  *                  - 0: ctx complete
2403  *                  - 1: wait on sync flip
2404  *                  - 2: wait on vblank
2405  *                  - 3: wait on scanline
2406  *                  - 4: wait on semaphore
2407  *                  - 5: context preempted (not on SEMAPHORE_WAIT or
2408  *                       WAIT_FOR_EVENT)
2409  *     bit  36:    reserved
2410  *     bits 37-43: wait detail (for switch detail 1 to 4)
2411  *     bits 44-46: reserved
2412  *     bits 47-57: sw context id of the lrc the GT switched away from
2413  *     bits 58-63: sw counter of the lrc the GT switched away from
2414  */
2415 static inline bool
2416 gen12_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
2417 {
2418         u32 lower_dw = csb[0];
2419         u32 upper_dw = csb[1];
2420         bool ctx_to_valid = GEN12_CSB_CTX_VALID(lower_dw);
2421         bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_dw);
2422         bool new_queue = lower_dw & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE;
2423
2424         /*
2425          * The context switch detail is not guaranteed to be 5 when a preemption
2426          * occurs, so we can't just check for that. The check below works for
2427          * all the cases we care about, including preemptions of WAIT
2428          * instructions and lite-restore. Preempt-to-idle via the CTRL register
2429          * would require some extra handling, but we don't support that.
2430          */
2431         if (!ctx_away_valid || new_queue) {
2432                 GEM_BUG_ON(!ctx_to_valid);
2433                 return true;
2434         }
2435
2436         /*
2437          * switch detail = 5 is covered by the case above and we do not expect a
2438          * context switch on an unsuccessful wait instruction since we always
2439          * use polling mode.
2440          */
2441         GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_dw));
2442         return false;
2443 }
2444
2445 static inline bool
2446 gen8_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
2447 {
2448         return *csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED);
2449 }
2450
2451 static void process_csb(struct intel_engine_cs *engine)
2452 {
2453         struct intel_engine_execlists * const execlists = &engine->execlists;
2454         const u32 * const buf = execlists->csb_status;
2455         const u8 num_entries = execlists->csb_size;
2456         u8 head, tail;
2457
2458         /*
2459          * As we modify our execlists state tracking we require exclusive
2460          * access. Either we are inside the tasklet, or the tasklet is disabled
2461          * and we assume that is only inside the reset paths and so serialised.
2462          */
2463         GEM_BUG_ON(!tasklet_is_locked(&execlists->tasklet) &&
2464                    !reset_in_progress(execlists));
2465         GEM_BUG_ON(!intel_engine_in_execlists_submission_mode(engine));
2466
2467         /*
2468          * Note that csb_write, csb_status may be either in HWSP or mmio.
2469          * When reading from the csb_write mmio register, we have to be
2470          * careful to only use the GEN8_CSB_WRITE_PTR portion, which is
2471          * the low 4bits. As it happens we know the next 4bits are always
2472          * zero and so we can simply masked off the low u8 of the register
2473          * and treat it identically to reading from the HWSP (without having
2474          * to use explicit shifting and masking, and probably bifurcating
2475          * the code to handle the legacy mmio read).
2476          */
2477         head = execlists->csb_head;
2478         tail = READ_ONCE(*execlists->csb_write);
2479         if (unlikely(head == tail))
2480                 return;
2481
2482         /*
2483          * Hopefully paired with a wmb() in HW!
2484          *
2485          * We must complete the read of the write pointer before any reads
2486          * from the CSB, so that we do not see stale values. Without an rmb
2487          * (lfence) the HW may speculatively perform the CSB[] reads *before*
2488          * we perform the READ_ONCE(*csb_write).
2489          */
2490         rmb();
2491
2492         ENGINE_TRACE(engine, "cs-irq head=%d, tail=%d\n", head, tail);
2493         do {
2494                 bool promote;
2495
2496                 if (++head == num_entries)
2497                         head = 0;
2498
2499                 /*
2500                  * We are flying near dragons again.
2501                  *
2502                  * We hold a reference to the request in execlist_port[]
2503                  * but no more than that. We are operating in softirq
2504                  * context and so cannot hold any mutex or sleep. That
2505                  * prevents us stopping the requests we are processing
2506                  * in port[] from being retired simultaneously (the
2507                  * breadcrumb will be complete before we see the
2508                  * context-switch). As we only hold the reference to the
2509                  * request, any pointer chasing underneath the request
2510                  * is subject to a potential use-after-free. Thus we
2511                  * store all of the bookkeeping within port[] as
2512                  * required, and avoid using unguarded pointers beneath
2513                  * request itself. The same applies to the atomic
2514                  * status notifier.
2515                  */
2516
2517                 ENGINE_TRACE(engine, "csb[%d]: status=0x%08x:0x%08x\n",
2518                              head, buf[2 * head + 0], buf[2 * head + 1]);
2519
2520                 if (INTEL_GEN(engine->i915) >= 12)
2521                         promote = gen12_csb_parse(execlists, buf + 2 * head);
2522                 else
2523                         promote = gen8_csb_parse(execlists, buf + 2 * head);
2524                 if (promote) {
2525                         struct i915_request * const *old = execlists->active;
2526
2527                         ring_set_paused(engine, 0);
2528
2529                         /* Point active to the new ELSP; prevent overwriting */
2530                         WRITE_ONCE(execlists->active, execlists->pending);
2531                         smp_wmb(); /* notify execlists_active() */
2532
2533                         /* cancel old inflight, prepare for switch */
2534                         trace_ports(execlists, "preempted", old);
2535                         while (*old)
2536                                 execlists_schedule_out(*old++);
2537
2538                         /* switch pending to inflight */
2539                         GEM_BUG_ON(!assert_pending_valid(execlists, "promote"));
2540                         memcpy(execlists->inflight,
2541                                execlists->pending,
2542                                execlists_num_ports(execlists) *
2543                                sizeof(*execlists->pending));
2544                         smp_wmb(); /* complete the seqlock */
2545                         WRITE_ONCE(execlists->active, execlists->inflight);
2546
2547                         WRITE_ONCE(execlists->pending[0], NULL);
2548                 } else {
2549                         GEM_BUG_ON(!*execlists->active);
2550
2551                         /* port0 completed, advanced to port1 */
2552                         trace_ports(execlists, "completed", execlists->active);
2553
2554                         /*
2555                          * We rely on the hardware being strongly
2556                          * ordered, that the breadcrumb write is
2557                          * coherent (visible from the CPU) before the
2558                          * user interrupt is processed. One might assume
2559                          * that the breadcrumb write being before the
2560                          * user interrupt and the CS event for the context
2561                          * switch would therefore be before the CS event
2562                          * itself...
2563                          */
2564                         if (GEM_SHOW_DEBUG() &&
2565                             !i915_request_completed(*execlists->active)) {
2566                                 struct i915_request *rq = *execlists->active;
2567                                 const u32 *regs __maybe_unused =
2568                                         rq->context->lrc_reg_state;
2569
2570                                 ENGINE_TRACE(engine,
2571                                              "context completed before request!\n");
2572                                 ENGINE_TRACE(engine,
2573                                              "ring:{start:0x%08x, head:%04x, tail:%04x, ctl:%08x, mode:%08x}\n",
2574                                              ENGINE_READ(engine, RING_START),
2575                                              ENGINE_READ(engine, RING_HEAD) & HEAD_ADDR,
2576                                              ENGINE_READ(engine, RING_TAIL) & TAIL_ADDR,
2577                                              ENGINE_READ(engine, RING_CTL),
2578                                              ENGINE_READ(engine, RING_MI_MODE));
2579                                 ENGINE_TRACE(engine,
2580                                              "rq:{start:%08x, head:%04x, tail:%04x, seqno:%llx:%d, hwsp:%d}, ",
2581                                              i915_ggtt_offset(rq->ring->vma),
2582                                              rq->head, rq->tail,
2583                                              rq->fence.context,
2584                                              lower_32_bits(rq->fence.seqno),
2585                                              hwsp_seqno(rq));
2586                                 ENGINE_TRACE(engine,
2587                                              "ctx:{start:%08x, head:%04x, tail:%04x}, ",
2588                                              regs[CTX_RING_START],
2589                                              regs[CTX_RING_HEAD],
2590                                              regs[CTX_RING_TAIL]);
2591                         }
2592
2593                         execlists_schedule_out(*execlists->active++);
2594
2595                         GEM_BUG_ON(execlists->active - execlists->inflight >
2596                                    execlists_num_ports(execlists));
2597                 }
2598         } while (head != tail);
2599
2600         execlists->csb_head = head;
2601         set_timeslice(engine);
2602
2603         /*
2604          * Gen11 has proven to fail wrt global observation point between
2605          * entry and tail update, failing on the ordering and thus
2606          * we see an old entry in the context status buffer.
2607          *
2608          * Forcibly evict out entries for the next gpu csb update,
2609          * to increase the odds that we get a fresh entries with non
2610          * working hardware. The cost for doing so comes out mostly with
2611          * the wash as hardware, working or not, will need to do the
2612          * invalidation before.
2613          */
2614         invalidate_csb_entries(&buf[0], &buf[num_entries - 1]);
2615 }
2616
2617 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
2618 {
2619         lockdep_assert_held(&engine->active.lock);
2620         if (!READ_ONCE(engine->execlists.pending[0])) {
2621                 rcu_read_lock(); /* protect peeking at execlists->active */
2622                 execlists_dequeue(engine);
2623                 rcu_read_unlock();
2624         }
2625 }
2626
2627 static void __execlists_hold(struct i915_request *rq)
2628 {
2629         LIST_HEAD(list);
2630
2631         do {
2632                 struct i915_dependency *p;
2633
2634                 if (i915_request_is_active(rq))
2635                         __i915_request_unsubmit(rq);
2636
2637                 clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2638                 list_move_tail(&rq->sched.link, &rq->engine->active.hold);
2639                 i915_request_set_hold(rq);
2640                 RQ_TRACE(rq, "on hold\n");
2641
2642                 for_each_waiter(p, rq) {
2643                         struct i915_request *w =
2644                                 container_of(p->waiter, typeof(*w), sched);
2645
2646                         /* Leave semaphores spinning on the other engines */
2647                         if (w->engine != rq->engine)
2648                                 continue;
2649
2650                         if (!i915_request_is_ready(w))
2651                                 continue;
2652
2653                         if (i915_request_completed(w))
2654                                 continue;
2655
2656                         if (i915_request_on_hold(w))
2657                                 continue;
2658
2659                         list_move_tail(&w->sched.link, &list);
2660                 }
2661
2662                 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2663         } while (rq);
2664 }
2665
2666 static bool execlists_hold(struct intel_engine_cs *engine,
2667                            struct i915_request *rq)
2668 {
2669         spin_lock_irq(&engine->active.lock);
2670
2671         if (i915_request_completed(rq)) { /* too late! */
2672                 rq = NULL;
2673                 goto unlock;
2674         }
2675
2676         if (rq->engine != engine) { /* preempted virtual engine */
2677                 struct virtual_engine *ve = to_virtual_engine(rq->engine);
2678
2679                 /*
2680                  * intel_context_inflight() is only protected by virtue
2681                  * of process_csb() being called only by the tasklet (or
2682                  * directly from inside reset while the tasklet is suspended).
2683                  * Assert that neither of those are allowed to run while we
2684                  * poke at the request queues.
2685                  */
2686                 GEM_BUG_ON(!reset_in_progress(&engine->execlists));
2687
2688                 /*
2689                  * An unsubmitted request along a virtual engine will
2690                  * remain on the active (this) engine until we are able
2691                  * to process the context switch away (and so mark the
2692                  * context as no longer in flight). That cannot have happened
2693                  * yet, otherwise we would not be hanging!
2694                  */
2695                 spin_lock(&ve->base.active.lock);
2696                 GEM_BUG_ON(intel_context_inflight(rq->context) != engine);
2697                 GEM_BUG_ON(ve->request != rq);
2698                 ve->request = NULL;
2699                 spin_unlock(&ve->base.active.lock);
2700                 i915_request_put(rq);
2701
2702                 rq->engine = engine;
2703         }
2704
2705         /*
2706          * Transfer this request onto the hold queue to prevent it
2707          * being resumbitted to HW (and potentially completed) before we have
2708          * released it. Since we may have already submitted following
2709          * requests, we need to remove those as well.
2710          */
2711         GEM_BUG_ON(i915_request_on_hold(rq));
2712         GEM_BUG_ON(rq->engine != engine);
2713         __execlists_hold(rq);
2714         GEM_BUG_ON(list_empty(&engine->active.hold));
2715
2716 unlock:
2717         spin_unlock_irq(&engine->active.lock);
2718         return rq;
2719 }
2720
2721 static bool hold_request(const struct i915_request *rq)
2722 {
2723         struct i915_dependency *p;
2724         bool result = false;
2725
2726         /*
2727          * If one of our ancestors is on hold, we must also be on hold,
2728          * otherwise we will bypass it and execute before it.
2729          */
2730         rcu_read_lock();
2731         for_each_signaler(p, rq) {
2732                 const struct i915_request *s =
2733                         container_of(p->signaler, typeof(*s), sched);
2734
2735                 if (s->engine != rq->engine)
2736                         continue;
2737
2738                 result = i915_request_on_hold(s);
2739                 if (result)
2740                         break;
2741         }
2742         rcu_read_unlock();
2743
2744         return result;
2745 }
2746
2747 static void __execlists_unhold(struct i915_request *rq)
2748 {
2749         LIST_HEAD(list);
2750
2751         do {
2752                 struct i915_dependency *p;
2753
2754                 RQ_TRACE(rq, "hold release\n");
2755
2756                 GEM_BUG_ON(!i915_request_on_hold(rq));
2757                 GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit));
2758
2759                 i915_request_clear_hold(rq);
2760                 list_move_tail(&rq->sched.link,
2761                                i915_sched_lookup_priolist(rq->engine,
2762                                                           rq_prio(rq)));
2763                 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2764
2765                 /* Also release any children on this engine that are ready */
2766                 for_each_waiter(p, rq) {
2767                         struct i915_request *w =
2768                                 container_of(p->waiter, typeof(*w), sched);
2769
2770                         /* Propagate any change in error status */
2771                         if (rq->fence.error)
2772                                 i915_request_set_error_once(w, rq->fence.error);
2773
2774                         if (w->engine != rq->engine)
2775                                 continue;
2776
2777                         if (!i915_request_on_hold(w))
2778                                 continue;
2779
2780                         /* Check that no other parents are also on hold */
2781                         if (hold_request(w))
2782                                 continue;
2783
2784                         list_move_tail(&w->sched.link, &list);
2785                 }
2786
2787                 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2788         } while (rq);
2789 }
2790
2791 static void execlists_unhold(struct intel_engine_cs *engine,
2792                              struct i915_request *rq)
2793 {
2794         spin_lock_irq(&engine->active.lock);
2795
2796         /*
2797          * Move this request back to the priority queue, and all of its
2798          * children and grandchildren that were suspended along with it.
2799          */
2800         __execlists_unhold(rq);
2801
2802         if (rq_prio(rq) > engine->execlists.queue_priority_hint) {
2803                 engine->execlists.queue_priority_hint = rq_prio(rq);
2804                 tasklet_hi_schedule(&engine->execlists.tasklet);
2805         }
2806
2807         spin_unlock_irq(&engine->active.lock);
2808 }
2809
2810 struct execlists_capture {
2811         struct work_struct work;
2812         struct i915_request *rq;
2813         struct i915_gpu_coredump *error;
2814 };
2815
2816 static void execlists_capture_work(struct work_struct *work)
2817 {
2818         struct execlists_capture *cap = container_of(work, typeof(*cap), work);
2819         const gfp_t gfp = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN;
2820         struct intel_engine_cs *engine = cap->rq->engine;
2821         struct intel_gt_coredump *gt = cap->error->gt;
2822         struct intel_engine_capture_vma *vma;
2823
2824         /* Compress all the objects attached to the request, slow! */
2825         vma = intel_engine_coredump_add_request(gt->engine, cap->rq, gfp);
2826         if (vma) {
2827                 struct i915_vma_compress *compress =
2828                         i915_vma_capture_prepare(gt);
2829
2830                 intel_engine_coredump_add_vma(gt->engine, vma, compress);
2831                 i915_vma_capture_finish(gt, compress);
2832         }
2833
2834         gt->simulated = gt->engine->simulated;
2835         cap->error->simulated = gt->simulated;
2836
2837         /* Publish the error state, and announce it to the world */
2838         i915_error_state_store(cap->error);
2839         i915_gpu_coredump_put(cap->error);
2840
2841         /* Return this request and all that depend upon it for signaling */
2842         execlists_unhold(engine, cap->rq);
2843         i915_request_put(cap->rq);
2844
2845         kfree(cap);
2846 }
2847
2848 static struct execlists_capture *capture_regs(struct intel_engine_cs *engine)
2849 {
2850         const gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
2851         struct execlists_capture *cap;
2852
2853         cap = kmalloc(sizeof(*cap), gfp);
2854         if (!cap)
2855                 return NULL;
2856
2857         cap->error = i915_gpu_coredump_alloc(engine->i915, gfp);
2858         if (!cap->error)
2859                 goto err_cap;
2860
2861         cap->error->gt = intel_gt_coredump_alloc(engine->gt, gfp);
2862         if (!cap->error->gt)
2863                 goto err_gpu;
2864
2865         cap->error->gt->engine = intel_engine_coredump_alloc(engine, gfp);
2866         if (!cap->error->gt->engine)
2867                 goto err_gt;
2868
2869         return cap;
2870
2871 err_gt:
2872         kfree(cap->error->gt);
2873 err_gpu:
2874         kfree(cap->error);
2875 err_cap:
2876         kfree(cap);
2877         return NULL;
2878 }
2879
2880 static struct i915_request *
2881 active_context(struct intel_engine_cs *engine, u32 ccid)
2882 {
2883         const struct intel_engine_execlists * const el = &engine->execlists;
2884         struct i915_request * const *port, *rq;
2885
2886         /*
2887          * Use the most recent result from process_csb(), but just in case
2888          * we trigger an error (via interrupt) before the first CS event has
2889          * been written, peek at the next submission.
2890          */
2891
2892         for (port = el->active; (rq = *port); port++) {
2893                 if (upper_32_bits(rq->context->lrc_desc) == ccid) {
2894                         ENGINE_TRACE(engine,
2895                                      "ccid found at active:%zd\n",
2896                                      port - el->active);
2897                         return rq;
2898                 }
2899         }
2900
2901         for (port = el->pending; (rq = *port); port++) {
2902                 if (upper_32_bits(rq->context->lrc_desc) == ccid) {
2903                         ENGINE_TRACE(engine,
2904                                      "ccid found at pending:%zd\n",
2905                                      port - el->pending);
2906                         return rq;
2907                 }
2908         }
2909
2910         ENGINE_TRACE(engine, "ccid:%x not found\n", ccid);
2911         return NULL;
2912 }
2913
2914 static u32 active_ccid(struct intel_engine_cs *engine)
2915 {
2916         return ENGINE_READ_FW(engine, RING_EXECLIST_STATUS_HI);
2917 }
2918
2919 static bool execlists_capture(struct intel_engine_cs *engine)
2920 {
2921         struct execlists_capture *cap;
2922
2923         if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR))
2924                 return true;
2925
2926         /*
2927          * We need to _quickly_ capture the engine state before we reset.
2928          * We are inside an atomic section (softirq) here and we are delaying
2929          * the forced preemption event.
2930          */
2931         cap = capture_regs(engine);
2932         if (!cap)
2933                 return true;
2934
2935         spin_lock_irq(&engine->active.lock);
2936         cap->rq = active_context(engine, active_ccid(engine));
2937         if (cap->rq) {
2938                 cap->rq = active_request(cap->rq->context->timeline, cap->rq);
2939                 cap->rq = i915_request_get_rcu(cap->rq);
2940         }
2941         spin_unlock_irq(&engine->active.lock);
2942         if (!cap->rq)
2943                 goto err_free;
2944
2945         /*
2946          * Remove the request from the execlists queue, and take ownership
2947          * of the request. We pass it to our worker who will _slowly_ compress
2948          * all the pages the _user_ requested for debugging their batch, after
2949          * which we return it to the queue for signaling.
2950          *
2951          * By removing them from the execlists queue, we also remove the
2952          * requests from being processed by __unwind_incomplete_requests()
2953          * during the intel_engine_reset(), and so they will *not* be replayed
2954          * afterwards.
2955          *
2956          * Note that because we have not yet reset the engine at this point,
2957          * it is possible for the request that we have identified as being
2958          * guilty, did in fact complete and we will then hit an arbitration
2959          * point allowing the outstanding preemption to succeed. The likelihood
2960          * of that is very low (as capturing of the engine registers should be
2961          * fast enough to run inside an irq-off atomic section!), so we will
2962          * simply hold that request accountable for being non-preemptible
2963          * long enough to force the reset.
2964          */
2965         if (!execlists_hold(engine, cap->rq))
2966                 goto err_rq;
2967
2968         INIT_WORK(&cap->work, execlists_capture_work);
2969         schedule_work(&cap->work);
2970         return true;
2971
2972 err_rq:
2973         i915_request_put(cap->rq);
2974 err_free:
2975         i915_gpu_coredump_put(cap->error);
2976         kfree(cap);
2977         return false;
2978 }
2979
2980 static void execlists_reset(struct intel_engine_cs *engine, const char *msg)
2981 {
2982         const unsigned int bit = I915_RESET_ENGINE + engine->id;
2983         unsigned long *lock = &engine->gt->reset.flags;
2984
2985         if (!intel_has_reset_engine(engine->gt))
2986                 return;
2987
2988         if (test_and_set_bit(bit, lock))
2989                 return;
2990
2991         ENGINE_TRACE(engine, "reset for %s\n", msg);
2992
2993         /* Mark this tasklet as disabled to avoid waiting for it to complete */
2994         tasklet_disable_nosync(&engine->execlists.tasklet);
2995
2996         ring_set_paused(engine, 1); /* Freeze the current request in place */
2997         if (execlists_capture(engine))
2998                 intel_engine_reset(engine, msg);
2999         else
3000                 ring_set_paused(engine, 0);
3001
3002         tasklet_enable(&engine->execlists.tasklet);
3003         clear_and_wake_up_bit(bit, lock);
3004 }
3005
3006 static bool preempt_timeout(const struct intel_engine_cs *const engine)
3007 {
3008         const struct timer_list *t = &engine->execlists.preempt;
3009
3010         if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT)
3011                 return false;
3012
3013         if (!timer_expired(t))
3014                 return false;
3015
3016         return READ_ONCE(engine->execlists.pending[0]);
3017 }
3018
3019 /*
3020  * Check the unread Context Status Buffers and manage the submission of new
3021  * contexts to the ELSP accordingly.
3022  */
3023 static void execlists_submission_tasklet(unsigned long data)
3024 {
3025         struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
3026         bool timeout = preempt_timeout(engine);
3027
3028         process_csb(engine);
3029
3030         if (unlikely(READ_ONCE(engine->execlists.error_interrupt))) {
3031                 engine->execlists.error_interrupt = 0;
3032                 if (ENGINE_READ(engine, RING_ESR)) /* confirm the error */
3033                         execlists_reset(engine, "CS error");
3034         }
3035
3036         if (!READ_ONCE(engine->execlists.pending[0]) || timeout) {
3037                 unsigned long flags;
3038
3039                 spin_lock_irqsave(&engine->active.lock, flags);
3040                 __execlists_submission_tasklet(engine);
3041                 spin_unlock_irqrestore(&engine->active.lock, flags);
3042
3043                 /* Recheck after serialising with direct-submission */
3044                 if (unlikely(timeout && preempt_timeout(engine)))
3045                         execlists_reset(engine, "preemption time out");
3046         }
3047 }
3048
3049 static void __execlists_kick(struct intel_engine_execlists *execlists)
3050 {
3051         /* Kick the tasklet for some interrupt coalescing and reset handling */
3052         tasklet_hi_schedule(&execlists->tasklet);
3053 }
3054
3055 #define execlists_kick(t, member) \
3056         __execlists_kick(container_of(t, struct intel_engine_execlists, member))
3057
3058 static void execlists_timeslice(struct timer_list *timer)
3059 {
3060         execlists_kick(timer, timer);
3061 }
3062
3063 static void execlists_preempt(struct timer_list *timer)
3064 {
3065         execlists_kick(timer, preempt);
3066 }
3067
3068 static void queue_request(struct intel_engine_cs *engine,
3069                           struct i915_request *rq)
3070 {
3071         GEM_BUG_ON(!list_empty(&rq->sched.link));
3072         list_add_tail(&rq->sched.link,
3073                       i915_sched_lookup_priolist(engine, rq_prio(rq)));
3074         set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
3075 }
3076
3077 static void __submit_queue_imm(struct intel_engine_cs *engine)
3078 {
3079         struct intel_engine_execlists * const execlists = &engine->execlists;
3080
3081         if (reset_in_progress(execlists))
3082                 return; /* defer until we restart the engine following reset */
3083
3084         /* Hopefully we clear execlists->pending[] to let us through */
3085         if (READ_ONCE(execlists->pending[0]) &&
3086             tasklet_trylock(&execlists->tasklet)) {
3087                 process_csb(engine);
3088                 tasklet_unlock(&execlists->tasklet);
3089         }
3090
3091         __execlists_submission_tasklet(engine);
3092 }
3093
3094 static void submit_queue(struct intel_engine_cs *engine,
3095                          const struct i915_request *rq)
3096 {
3097         struct intel_engine_execlists *execlists = &engine->execlists;
3098
3099         if (rq_prio(rq) <= execlists->queue_priority_hint)
3100                 return;
3101
3102         execlists->queue_priority_hint = rq_prio(rq);
3103         __submit_queue_imm(engine);
3104 }
3105
3106 static bool ancestor_on_hold(const struct intel_engine_cs *engine,
3107                              const struct i915_request *rq)
3108 {
3109         GEM_BUG_ON(i915_request_on_hold(rq));
3110         return !list_empty(&engine->active.hold) && hold_request(rq);
3111 }
3112
3113 static void execlists_submit_request(struct i915_request *request)
3114 {
3115         struct intel_engine_cs *engine = request->engine;
3116         unsigned long flags;
3117
3118         /* Will be called from irq-context when using foreign fences. */
3119         spin_lock_irqsave(&engine->active.lock, flags);
3120
3121         if (unlikely(ancestor_on_hold(engine, request))) {
3122                 RQ_TRACE(request, "ancestor on hold\n");
3123                 list_add_tail(&request->sched.link, &engine->active.hold);
3124                 i915_request_set_hold(request);
3125         } else {
3126                 queue_request(engine, request);
3127
3128                 GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
3129                 GEM_BUG_ON(list_empty(&request->sched.link));
3130
3131                 submit_queue(engine, request);
3132         }
3133
3134         spin_unlock_irqrestore(&engine->active.lock, flags);
3135 }
3136
3137 static void __execlists_context_fini(struct intel_context *ce)
3138 {
3139         intel_ring_put(ce->ring);
3140         i915_vma_put(ce->state);
3141 }
3142
3143 static void execlists_context_destroy(struct kref *kref)
3144 {
3145         struct intel_context *ce = container_of(kref, typeof(*ce), ref);
3146
3147         GEM_BUG_ON(!i915_active_is_idle(&ce->active));
3148         GEM_BUG_ON(intel_context_is_pinned(ce));
3149
3150         if (ce->state)
3151                 __execlists_context_fini(ce);
3152
3153         intel_context_fini(ce);
3154         intel_context_free(ce);
3155 }
3156
3157 static void
3158 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
3159 {
3160         if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
3161                 return;
3162
3163         vaddr += engine->context_size;
3164
3165         memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
3166 }
3167
3168 static void
3169 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
3170 {
3171         if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
3172                 return;
3173
3174         vaddr += engine->context_size;
3175
3176         if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
3177                 drm_err_once(&engine->i915->drm,
3178                              "%s context redzone overwritten!\n",
3179                              engine->name);
3180 }
3181
3182 static void execlists_context_unpin(struct intel_context *ce)
3183 {
3184         check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
3185                       ce->engine);
3186
3187         i915_gem_object_unpin_map(ce->state->obj);
3188 }
3189
3190 static void
3191 __execlists_update_reg_state(const struct intel_context *ce,
3192                              const struct intel_engine_cs *engine,
3193                              u32 head)
3194 {
3195         struct intel_ring *ring = ce->ring;
3196         u32 *regs = ce->lrc_reg_state;
3197
3198         GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
3199         GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
3200
3201         regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
3202         regs[CTX_RING_HEAD] = head;
3203         regs[CTX_RING_TAIL] = ring->tail;
3204         regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
3205
3206         /* RPCS */
3207         if (engine->class == RENDER_CLASS) {
3208                 regs[CTX_R_PWR_CLK_STATE] =
3209                         intel_sseu_make_rpcs(engine->i915, &ce->sseu);
3210
3211                 i915_oa_init_reg_state(ce, engine);
3212         }
3213 }
3214
3215 static int
3216 __execlists_context_pin(struct intel_context *ce,
3217                         struct intel_engine_cs *engine)
3218 {
3219         void *vaddr;
3220
3221         GEM_BUG_ON(!ce->state);
3222         GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
3223
3224         vaddr = i915_gem_object_pin_map(ce->state->obj,
3225                                         i915_coherent_map_type(engine->i915) |
3226                                         I915_MAP_OVERRIDE);
3227         if (IS_ERR(vaddr))
3228                 return PTR_ERR(vaddr);
3229
3230         ce->lrc_desc = lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE;
3231         ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;
3232         __execlists_update_reg_state(ce, engine, ce->ring->tail);
3233
3234         return 0;
3235 }
3236
3237 static int execlists_context_pin(struct intel_context *ce)
3238 {
3239         return __execlists_context_pin(ce, ce->engine);
3240 }
3241
3242 static int execlists_context_alloc(struct intel_context *ce)
3243 {
3244         return __execlists_context_alloc(ce, ce->engine);
3245 }
3246
3247 static void execlists_context_reset(struct intel_context *ce)
3248 {
3249         CE_TRACE(ce, "reset\n");
3250         GEM_BUG_ON(!intel_context_is_pinned(ce));
3251
3252         intel_ring_reset(ce->ring, ce->ring->emit);
3253
3254         /* Scrub away the garbage */
3255         execlists_init_reg_state(ce->lrc_reg_state,
3256                                  ce, ce->engine, ce->ring, true);
3257         __execlists_update_reg_state(ce, ce->engine, ce->ring->tail);
3258
3259         ce->lrc_desc |= CTX_DESC_FORCE_RESTORE;
3260 }
3261
3262 static const struct intel_context_ops execlists_context_ops = {
3263         .alloc = execlists_context_alloc,
3264
3265         .pin = execlists_context_pin,
3266         .unpin = execlists_context_unpin,
3267
3268         .enter = intel_context_enter_engine,
3269         .exit = intel_context_exit_engine,
3270
3271         .reset = execlists_context_reset,
3272         .destroy = execlists_context_destroy,
3273 };
3274
3275 static int gen8_emit_init_breadcrumb(struct i915_request *rq)
3276 {
3277         u32 *cs;
3278
3279         if (!i915_request_timeline(rq)->has_initial_breadcrumb)
3280                 return 0;
3281
3282         cs = intel_ring_begin(rq, 6);
3283         if (IS_ERR(cs))
3284                 return PTR_ERR(cs);
3285
3286         /*
3287          * Check if we have been preempted before we even get started.
3288          *
3289          * After this point i915_request_started() reports true, even if
3290          * we get preempted and so are no longer running.
3291          */
3292         *cs++ = MI_ARB_CHECK;
3293         *cs++ = MI_NOOP;
3294
3295         *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
3296         *cs++ = i915_request_timeline(rq)->hwsp_offset;
3297         *cs++ = 0;
3298         *cs++ = rq->fence.seqno - 1;
3299
3300         intel_ring_advance(rq, cs);
3301
3302         /* Record the updated position of the request's payload */
3303         rq->infix = intel_ring_offset(rq, cs);
3304
3305         return 0;
3306 }
3307
3308 static int execlists_request_alloc(struct i915_request *request)
3309 {
3310         int ret;
3311
3312         GEM_BUG_ON(!intel_context_is_pinned(request->context));
3313
3314         /*
3315          * Flush enough space to reduce the likelihood of waiting after
3316          * we start building the request - in which case we will just
3317          * have to repeat work.
3318          */
3319         request->reserved_space += EXECLISTS_REQUEST_SIZE;
3320
3321         /*
3322          * Note that after this point, we have committed to using
3323          * this request as it is being used to both track the
3324          * state of engine initialisation and liveness of the
3325          * golden renderstate above. Think twice before you try
3326          * to cancel/unwind this request now.
3327          */
3328
3329         /* Unconditionally invalidate GPU caches and TLBs. */
3330         ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
3331         if (ret)
3332                 return ret;
3333
3334         request->reserved_space -= EXECLISTS_REQUEST_SIZE;
3335         return 0;
3336 }
3337
3338 /*
3339  * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
3340  * PIPE_CONTROL instruction. This is required for the flush to happen correctly
3341  * but there is a slight complication as this is applied in WA batch where the
3342  * values are only initialized once so we cannot take register value at the
3343  * beginning and reuse it further; hence we save its value to memory, upload a
3344  * constant value with bit21 set and then we restore it back with the saved value.
3345  * To simplify the WA, a constant value is formed by using the default value
3346  * of this register. This shouldn't be a problem because we are only modifying
3347  * it for a short period and this batch in non-premptible. We can ofcourse
3348  * use additional instructions that read the actual value of the register
3349  * at that time and set our bit of interest but it makes the WA complicated.
3350  *
3351  * This WA is also required for Gen9 so extracting as a function avoids
3352  * code duplication.
3353  */
3354 static u32 *
3355 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
3356 {
3357         /* NB no one else is allowed to scribble over scratch + 256! */
3358         *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3359         *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3360         *batch++ = intel_gt_scratch_offset(engine->gt,
3361                                            INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3362         *batch++ = 0;
3363
3364         *batch++ = MI_LOAD_REGISTER_IMM(1);
3365         *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3366         *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
3367
3368         batch = gen8_emit_pipe_control(batch,
3369                                        PIPE_CONTROL_CS_STALL |
3370                                        PIPE_CONTROL_DC_FLUSH_ENABLE,
3371                                        0);
3372
3373         *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3374         *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3375         *batch++ = intel_gt_scratch_offset(engine->gt,
3376                                            INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3377         *batch++ = 0;
3378
3379         return batch;
3380 }
3381
3382 /*
3383  * Typically we only have one indirect_ctx and per_ctx batch buffer which are
3384  * initialized at the beginning and shared across all contexts but this field
3385  * helps us to have multiple batches at different offsets and select them based
3386  * on a criteria. At the moment this batch always start at the beginning of the page
3387  * and at this point we don't have multiple wa_ctx batch buffers.
3388  *
3389  * The number of WA applied are not known at the beginning; we use this field
3390  * to return the no of DWORDS written.
3391  *
3392  * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
3393  * so it adds NOOPs as padding to make it cacheline aligned.
3394  * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together
3395  * makes a complete batch buffer.
3396  */
3397 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3398 {
3399         /* WaDisableCtxRestoreArbitration:bdw,chv */
3400         *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3401
3402         /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
3403         if (IS_BROADWELL(engine->i915))
3404                 batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3405
3406         /* WaClearSlmSpaceAtContextSwitch:bdw,chv */
3407         /* Actual scratch location is at 128 bytes offset */
3408         batch = gen8_emit_pipe_control(batch,
3409                                        PIPE_CONTROL_FLUSH_L3 |
3410                                        PIPE_CONTROL_STORE_DATA_INDEX |
3411                                        PIPE_CONTROL_CS_STALL |
3412                                        PIPE_CONTROL_QW_WRITE,
3413                                        LRC_PPHWSP_SCRATCH_ADDR);
3414
3415         *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3416
3417         /* Pad to end of cacheline */
3418         while ((unsigned long)batch % CACHELINE_BYTES)
3419                 *batch++ = MI_NOOP;
3420
3421         /*
3422          * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
3423          * execution depends on the length specified in terms of cache lines
3424          * in the register CTX_RCS_INDIRECT_CTX
3425          */
3426
3427         return batch;
3428 }
3429
3430 struct lri {
3431         i915_reg_t reg;
3432         u32 value;
3433 };
3434
3435 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
3436 {
3437         GEM_BUG_ON(!count || count > 63);
3438
3439         *batch++ = MI_LOAD_REGISTER_IMM(count);
3440         do {
3441                 *batch++ = i915_mmio_reg_offset(lri->reg);
3442                 *batch++ = lri->value;
3443         } while (lri++, --count);
3444         *batch++ = MI_NOOP;
3445
3446         return batch;
3447 }
3448
3449 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3450 {
3451         static const struct lri lri[] = {
3452                 /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
3453                 {
3454                         COMMON_SLICE_CHICKEN2,
3455                         __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
3456                                        0),
3457                 },
3458
3459                 /* BSpec: 11391 */
3460                 {
3461                         FF_SLICE_CHICKEN,
3462                         __MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
3463                                        FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
3464                 },
3465
3466                 /* BSpec: 11299 */
3467                 {
3468                         _3D_CHICKEN3,
3469                         __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
3470                                        _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
3471                 }
3472         };
3473
3474         *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3475
3476         /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
3477         batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3478
3479         /* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
3480         batch = gen8_emit_pipe_control(batch,
3481                                        PIPE_CONTROL_FLUSH_L3 |
3482                                        PIPE_CONTROL_STORE_DATA_INDEX |
3483                                        PIPE_CONTROL_CS_STALL |
3484                                        PIPE_CONTROL_QW_WRITE,
3485                                        LRC_PPHWSP_SCRATCH_ADDR);
3486
3487         batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
3488
3489         /* WaMediaPoolStateCmdInWABB:bxt,glk */
3490         if (HAS_POOLED_EU(engine->i915)) {
3491                 /*
3492                  * EU pool configuration is setup along with golden context
3493                  * during context initialization. This value depends on
3494                  * device type (2x6 or 3x6) and needs to be updated based
3495                  * on which subslice is disabled especially for 2x6
3496                  * devices, however it is safe to load default
3497                  * configuration of 3x6 device instead of masking off
3498                  * corresponding bits because HW ignores bits of a disabled
3499                  * subslice and drops down to appropriate config. Please
3500                  * see render_state_setup() in i915_gem_render_state.c for
3501                  * possible configurations, to avoid duplication they are
3502                  * not shown here again.
3503                  */
3504                 *batch++ = GEN9_MEDIA_POOL_STATE;
3505                 *batch++ = GEN9_MEDIA_POOL_ENABLE;
3506                 *batch++ = 0x00777000;
3507                 *batch++ = 0;
3508                 *batch++ = 0;
3509                 *batch++ = 0;
3510         }
3511
3512         *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3513
3514         /* Pad to end of cacheline */
3515         while ((unsigned long)batch % CACHELINE_BYTES)
3516                 *batch++ = MI_NOOP;
3517
3518         return batch;
3519 }
3520
3521 static u32 *
3522 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3523 {
3524         int i;
3525
3526         /*
3527          * WaPipeControlBefore3DStateSamplePattern: cnl
3528          *
3529          * Ensure the engine is idle prior to programming a
3530          * 3DSTATE_SAMPLE_PATTERN during a context restore.
3531          */
3532         batch = gen8_emit_pipe_control(batch,
3533                                        PIPE_CONTROL_CS_STALL,
3534                                        0);
3535         /*
3536          * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
3537          * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
3538          * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
3539          * confusing. Since gen8_emit_pipe_control() already advances the
3540          * batch by 6 dwords, we advance the other 10 here, completing a
3541          * cacheline. It's not clear if the workaround requires this padding
3542          * before other commands, or if it's just the regular padding we would
3543          * already have for the workaround bb, so leave it here for now.
3544          */
3545         for (i = 0; i < 10; i++)
3546                 *batch++ = MI_NOOP;
3547
3548         /* Pad to end of cacheline */
3549         while ((unsigned long)batch % CACHELINE_BYTES)
3550                 *batch++ = MI_NOOP;
3551
3552         return batch;
3553 }
3554
3555 #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
3556
3557 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
3558 {
3559         struct drm_i915_gem_object *obj;
3560         struct i915_vma *vma;
3561         int err;
3562
3563         obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE);
3564         if (IS_ERR(obj))
3565                 return PTR_ERR(obj);
3566
3567         vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
3568         if (IS_ERR(vma)) {
3569                 err = PTR_ERR(vma);
3570                 goto err;
3571         }
3572
3573         err = i915_ggtt_pin(vma, 0, PIN_HIGH);
3574         if (err)
3575                 goto err;
3576
3577         engine->wa_ctx.vma = vma;
3578         return 0;
3579
3580 err:
3581         i915_gem_object_put(obj);
3582         return err;
3583 }
3584
3585 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine)
3586 {
3587         i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
3588 }
3589
3590 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
3591
3592 static int intel_init_workaround_bb(struct intel_engine_cs *engine)
3593 {
3594         struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
3595         struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx,
3596                                             &wa_ctx->per_ctx };
3597         wa_bb_func_t wa_bb_fn[2];
3598         struct page *page;
3599         void *batch, *batch_ptr;
3600         unsigned int i;
3601         int ret;
3602
3603         if (engine->class != RENDER_CLASS)
3604                 return 0;
3605
3606         switch (INTEL_GEN(engine->i915)) {
3607         case 12:
3608         case 11:
3609                 return 0;
3610         case 10:
3611                 wa_bb_fn[0] = gen10_init_indirectctx_bb;
3612                 wa_bb_fn[1] = NULL;
3613                 break;
3614         case 9:
3615                 wa_bb_fn[0] = gen9_init_indirectctx_bb;
3616                 wa_bb_fn[1] = NULL;
3617                 break;
3618         case 8:
3619                 wa_bb_fn[0] = gen8_init_indirectctx_bb;
3620                 wa_bb_fn[1] = NULL;
3621                 break;
3622         default:
3623                 MISSING_CASE(INTEL_GEN(engine->i915));
3624                 return 0;
3625         }
3626
3627         ret = lrc_setup_wa_ctx(engine);
3628         if (ret) {
3629                 drm_dbg(&engine->i915->drm,
3630                         "Failed to setup context WA page: %d\n", ret);
3631                 return ret;
3632         }
3633
3634         page = i915_gem_object_get_dirty_page(wa_ctx->vma->obj, 0);
3635         batch = batch_ptr = kmap_atomic(page);
3636
3637         /*
3638          * Emit the two workaround batch buffers, recording the offset from the
3639          * start of the workaround batch buffer object for each and their
3640          * respective sizes.
3641          */
3642         for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
3643                 wa_bb[i]->offset = batch_ptr - batch;
3644                 if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
3645                                                   CACHELINE_BYTES))) {
3646                         ret = -EINVAL;
3647                         break;
3648                 }
3649                 if (wa_bb_fn[i])
3650                         batch_ptr = wa_bb_fn[i](engine, batch_ptr);
3651                 wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
3652         }
3653
3654         BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
3655
3656         kunmap_atomic(batch);
3657         if (ret)
3658                 lrc_destroy_wa_ctx(engine);
3659
3660         return ret;
3661 }
3662
3663 static void reset_csb_pointers(struct intel_engine_cs *engine)
3664 {
3665         struct intel_engine_execlists * const execlists = &engine->execlists;
3666         const unsigned int reset_value = execlists->csb_size - 1;
3667
3668         ring_set_paused(engine, 0);
3669
3670         /*
3671          * After a reset, the HW starts writing into CSB entry [0]. We
3672          * therefore have to set our HEAD pointer back one entry so that
3673          * the *first* entry we check is entry 0. To complicate this further,
3674          * as we don't wait for the first interrupt after reset, we have to
3675          * fake the HW write to point back to the last entry so that our
3676          * inline comparison of our cached head position against the last HW
3677          * write works even before the first interrupt.
3678          */
3679         execlists->csb_head = reset_value;
3680         WRITE_ONCE(*execlists->csb_write, reset_value);
3681         wmb(); /* Make sure this is visible to HW (paranoia?) */
3682
3683         /*
3684          * Sometimes Icelake forgets to reset its pointers on a GPU reset.
3685          * Bludgeon them with a mmio update to be sure.
3686          */
3687         ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
3688                      reset_value << 8 | reset_value);
3689         ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
3690
3691         invalidate_csb_entries(&execlists->csb_status[0],
3692                                &execlists->csb_status[reset_value]);
3693 }
3694
3695 static void execlists_sanitize(struct intel_engine_cs *engine)
3696 {
3697         /*
3698          * Poison residual state on resume, in case the suspend didn't!
3699          *
3700          * We have to assume that across suspend/resume (or other loss
3701          * of control) that the contents of our pinned buffers has been
3702          * lost, replaced by garbage. Since this doesn't always happen,
3703          * let's poison such state so that we more quickly spot when
3704          * we falsely assume it has been preserved.
3705          */
3706         if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
3707                 memset(engine->status_page.addr, POISON_INUSE, PAGE_SIZE);
3708
3709         reset_csb_pointers(engine);
3710
3711         /*
3712          * The kernel_context HWSP is stored in the status_page. As above,
3713          * that may be lost on resume/initialisation, and so we need to
3714          * reset the value in the HWSP.
3715          */
3716         intel_timeline_reset_seqno(engine->kernel_context->timeline);
3717 }
3718
3719 static void enable_error_interrupt(struct intel_engine_cs *engine)
3720 {
3721         u32 status;
3722
3723         engine->execlists.error_interrupt = 0;
3724         ENGINE_WRITE(engine, RING_EMR, ~0u);
3725         ENGINE_WRITE(engine, RING_EIR, ~0u); /* clear all existing errors */
3726
3727         status = ENGINE_READ(engine, RING_ESR);
3728         if (unlikely(status)) {
3729                 drm_err(&engine->i915->drm,
3730                         "engine '%s' resumed still in error: %08x\n",
3731                         engine->name, status);
3732                 __intel_gt_reset(engine->gt, engine->mask);
3733         }
3734
3735         /*
3736          * On current gen8+, we have 2 signals to play with
3737          *
3738          * - I915_ERROR_INSTUCTION (bit 0)
3739          *
3740          *    Generate an error if the command parser encounters an invalid
3741          *    instruction
3742          *
3743          *    This is a fatal error.
3744          *
3745          * - CP_PRIV (bit 2)
3746          *
3747          *    Generate an error on privilege violation (where the CP replaces
3748          *    the instruction with a no-op). This also fires for writes into
3749          *    read-only scratch pages.
3750          *
3751          *    This is a non-fatal error, parsing continues.
3752          *
3753          * * there are a few others defined for odd HW that we do not use
3754          *
3755          * Since CP_PRIV fires for cases where we have chosen to ignore the
3756          * error (as the HW is validating and suppressing the mistakes), we
3757          * only unmask the instruction error bit.
3758          */
3759         ENGINE_WRITE(engine, RING_EMR, ~I915_ERROR_INSTRUCTION);
3760 }
3761
3762 static void enable_execlists(struct intel_engine_cs *engine)
3763 {
3764         u32 mode;
3765
3766         assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL);
3767
3768         intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */
3769
3770         if (INTEL_GEN(engine->i915) >= 11)
3771                 mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE);
3772         else
3773                 mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE);
3774         ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode);
3775
3776         ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
3777
3778         ENGINE_WRITE_FW(engine,
3779                         RING_HWS_PGA,
3780                         i915_ggtt_offset(engine->status_page.vma));
3781         ENGINE_POSTING_READ(engine, RING_HWS_PGA);
3782
3783         enable_error_interrupt(engine);
3784
3785         engine->context_tag = 0;
3786 }
3787
3788 static bool unexpected_starting_state(struct intel_engine_cs *engine)
3789 {
3790         bool unexpected = false;
3791
3792         if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) {
3793                 drm_dbg(&engine->i915->drm,
3794                         "STOP_RING still set in RING_MI_MODE\n");
3795                 unexpected = true;
3796         }
3797
3798         return unexpected;
3799 }
3800
3801 static int execlists_resume(struct intel_engine_cs *engine)
3802 {
3803         intel_mocs_init_engine(engine);
3804
3805         intel_engine_reset_breadcrumbs(engine);
3806
3807         if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) {
3808                 struct drm_printer p = drm_debug_printer(__func__);
3809
3810                 intel_engine_dump(engine, &p, NULL);
3811         }
3812
3813         enable_execlists(engine);
3814
3815         return 0;
3816 }
3817
3818 static void execlists_reset_prepare(struct intel_engine_cs *engine)
3819 {
3820         struct intel_engine_execlists * const execlists = &engine->execlists;
3821         unsigned long flags;
3822
3823         ENGINE_TRACE(engine, "depth<-%d\n",
3824                      atomic_read(&execlists->tasklet.count));
3825
3826         /*
3827          * Prevent request submission to the hardware until we have
3828          * completed the reset in i915_gem_reset_finish(). If a request
3829          * is completed by one engine, it may then queue a request
3830          * to a second via its execlists->tasklet *just* as we are
3831          * calling engine->resume() and also writing the ELSP.
3832          * Turning off the execlists->tasklet until the reset is over
3833          * prevents the race.
3834          */
3835         __tasklet_disable_sync_once(&execlists->tasklet);
3836         GEM_BUG_ON(!reset_in_progress(execlists));
3837
3838         /* And flush any current direct submission. */
3839         spin_lock_irqsave(&engine->active.lock, flags);
3840         spin_unlock_irqrestore(&engine->active.lock, flags);
3841
3842         /*
3843          * We stop engines, otherwise we might get failed reset and a
3844          * dead gpu (on elk). Also as modern gpu as kbl can suffer
3845          * from system hang if batchbuffer is progressing when
3846          * the reset is issued, regardless of READY_TO_RESET ack.
3847          * Thus assume it is best to stop engines on all gens
3848          * where we have a gpu reset.
3849          *
3850          * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
3851          *
3852          * FIXME: Wa for more modern gens needs to be validated
3853          */
3854         ring_set_paused(engine, 1);
3855         intel_engine_stop_cs(engine);
3856 }
3857
3858 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
3859 {
3860         int x;
3861
3862         x = lrc_ring_mi_mode(engine);
3863         if (x != -1) {
3864                 regs[x + 1] &= ~STOP_RING;
3865                 regs[x + 1] |= STOP_RING << 16;
3866         }
3867 }
3868
3869 static void __execlists_reset_reg_state(const struct intel_context *ce,
3870                                         const struct intel_engine_cs *engine)
3871 {
3872         u32 *regs = ce->lrc_reg_state;
3873
3874         __reset_stop_ring(regs, engine);
3875 }
3876
3877 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
3878 {
3879         struct intel_engine_execlists * const execlists = &engine->execlists;
3880         struct intel_context *ce;
3881         struct i915_request *rq;
3882         u32 head;
3883
3884         mb(); /* paranoia: read the CSB pointers from after the reset */
3885         clflush(execlists->csb_write);
3886         mb();
3887
3888         process_csb(engine); /* drain preemption events */
3889
3890         /* Following the reset, we need to reload the CSB read/write pointers */
3891         reset_csb_pointers(engine);
3892
3893         /*
3894          * Save the currently executing context, even if we completed
3895          * its request, it was still running at the time of the
3896          * reset and will have been clobbered.
3897          */
3898         rq = execlists_active(execlists);
3899         if (!rq)
3900                 goto unwind;
3901
3902         ce = rq->context;
3903         GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
3904
3905         if (i915_request_completed(rq)) {
3906                 /* Idle context; tidy up the ring so we can restart afresh */
3907                 head = intel_ring_wrap(ce->ring, rq->tail);
3908                 goto out_replay;
3909         }
3910
3911         /* We still have requests in-flight; the engine should be active */
3912         GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
3913
3914         /* Context has requests still in-flight; it should not be idle! */
3915         GEM_BUG_ON(i915_active_is_idle(&ce->active));
3916
3917         rq = active_request(ce->timeline, rq);
3918         head = intel_ring_wrap(ce->ring, rq->head);
3919         GEM_BUG_ON(head == ce->ring->tail);
3920
3921         /*
3922          * If this request hasn't started yet, e.g. it is waiting on a
3923          * semaphore, we need to avoid skipping the request or else we
3924          * break the signaling chain. However, if the context is corrupt
3925          * the request will not restart and we will be stuck with a wedged
3926          * device. It is quite often the case that if we issue a reset
3927          * while the GPU is loading the context image, that the context
3928          * image becomes corrupt.
3929          *
3930          * Otherwise, if we have not started yet, the request should replay
3931          * perfectly and we do not need to flag the result as being erroneous.
3932          */
3933         if (!i915_request_started(rq))
3934                 goto out_replay;
3935
3936         /*
3937          * If the request was innocent, we leave the request in the ELSP
3938          * and will try to replay it on restarting. The context image may
3939          * have been corrupted by the reset, in which case we may have
3940          * to service a new GPU hang, but more likely we can continue on
3941          * without impact.
3942          *
3943          * If the request was guilty, we presume the context is corrupt
3944          * and have to at least restore the RING register in the context
3945          * image back to the expected values to skip over the guilty request.
3946          */
3947         __i915_request_reset(rq, stalled);
3948         if (!stalled)
3949                 goto out_replay;
3950
3951         /*
3952          * We want a simple context + ring to execute the breadcrumb update.
3953          * We cannot rely on the context being intact across the GPU hang,
3954          * so clear it and rebuild just what we need for the breadcrumb.
3955          * All pending requests for this context will be zapped, and any
3956          * future request will be after userspace has had the opportunity
3957          * to recreate its own state.
3958          */
3959         GEM_BUG_ON(!intel_context_is_pinned(ce));
3960         restore_default_state(ce, engine);
3961
3962 out_replay:
3963         ENGINE_TRACE(engine, "replay {head:%04x, tail:%04x}\n",
3964                      head, ce->ring->tail);
3965         __execlists_reset_reg_state(ce, engine);
3966         __execlists_update_reg_state(ce, engine, head);
3967         ce->lrc_desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */
3968
3969 unwind:
3970         /* Push back any incomplete requests for replay after the reset. */
3971         cancel_port_requests(execlists);
3972         __unwind_incomplete_requests(engine);
3973 }
3974
3975 static void execlists_reset_rewind(struct intel_engine_cs *engine, bool stalled)
3976 {
3977         unsigned long flags;
3978
3979         ENGINE_TRACE(engine, "\n");
3980
3981         spin_lock_irqsave(&engine->active.lock, flags);
3982
3983         __execlists_reset(engine, stalled);
3984
3985         spin_unlock_irqrestore(&engine->active.lock, flags);
3986 }
3987
3988 static void nop_submission_tasklet(unsigned long data)
3989 {
3990         struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
3991
3992         /* The driver is wedged; don't process any more events. */
3993         WRITE_ONCE(engine->execlists.queue_priority_hint, INT_MIN);
3994 }
3995
3996 static void execlists_reset_cancel(struct intel_engine_cs *engine)
3997 {
3998         struct intel_engine_execlists * const execlists = &engine->execlists;
3999         struct i915_request *rq, *rn;
4000         struct rb_node *rb;
4001         unsigned long flags;
4002
4003         ENGINE_TRACE(engine, "\n");
4004
4005         /*
4006          * Before we call engine->cancel_requests(), we should have exclusive
4007          * access to the submission state. This is arranged for us by the
4008          * caller disabling the interrupt generation, the tasklet and other
4009          * threads that may then access the same state, giving us a free hand
4010          * to reset state. However, we still need to let lockdep be aware that
4011          * we know this state may be accessed in hardirq context, so we
4012          * disable the irq around this manipulation and we want to keep
4013          * the spinlock focused on its duties and not accidentally conflate
4014          * coverage to the submission's irq state. (Similarly, although we
4015          * shouldn't need to disable irq around the manipulation of the
4016          * submission's irq state, we also wish to remind ourselves that
4017          * it is irq state.)
4018          */
4019         spin_lock_irqsave(&engine->active.lock, flags);
4020
4021         __execlists_reset(engine, true);
4022
4023         /* Mark all executing requests as skipped. */
4024         list_for_each_entry(rq, &engine->active.requests, sched.link)
4025                 mark_eio(rq);
4026
4027         /* Flush the queued requests to the timeline list (for retiring). */
4028         while ((rb = rb_first_cached(&execlists->queue))) {
4029                 struct i915_priolist *p = to_priolist(rb);
4030                 int i;
4031
4032                 priolist_for_each_request_consume(rq, rn, p, i) {
4033                         mark_eio(rq);
4034                         __i915_request_submit(rq);
4035                 }
4036
4037                 rb_erase_cached(&p->node, &execlists->queue);
4038                 i915_priolist_free(p);
4039         }
4040
4041         /* On-hold requests will be flushed to timeline upon their release */
4042         list_for_each_entry(rq, &engine->active.hold, sched.link)
4043                 mark_eio(rq);
4044
4045         /* Cancel all attached virtual engines */
4046         while ((rb = rb_first_cached(&execlists->virtual))) {
4047                 struct virtual_engine *ve =
4048                         rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
4049
4050                 rb_erase_cached(rb, &execlists->virtual);
4051                 RB_CLEAR_NODE(rb);
4052
4053                 spin_lock(&ve->base.active.lock);
4054                 rq = fetch_and_zero(&ve->request);
4055                 if (rq) {
4056                         mark_eio(rq);
4057
4058                         rq->engine = engine;
4059                         __i915_request_submit(rq);
4060                         i915_request_put(rq);
4061
4062                         ve->base.execlists.queue_priority_hint = INT_MIN;
4063                 }
4064                 spin_unlock(&ve->base.active.lock);
4065         }
4066
4067         /* Remaining _unready_ requests will be nop'ed when submitted */
4068
4069         execlists->queue_priority_hint = INT_MIN;
4070         execlists->queue = RB_ROOT_CACHED;
4071
4072         GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet));
4073         execlists->tasklet.func = nop_submission_tasklet;
4074
4075         spin_unlock_irqrestore(&engine->active.lock, flags);
4076 }
4077
4078 static void execlists_reset_finish(struct intel_engine_cs *engine)
4079 {
4080         struct intel_engine_execlists * const execlists = &engine->execlists;
4081
4082         /*
4083          * After a GPU reset, we may have requests to replay. Do so now while
4084          * we still have the forcewake to be sure that the GPU is not allowed
4085          * to sleep before we restart and reload a context.
4086          */
4087         GEM_BUG_ON(!reset_in_progress(execlists));
4088         if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
4089                 execlists->tasklet.func(execlists->tasklet.data);
4090
4091         if (__tasklet_enable(&execlists->tasklet))
4092                 /* And kick in case we missed a new request submission. */
4093                 tasklet_hi_schedule(&execlists->tasklet);
4094         ENGINE_TRACE(engine, "depth->%d\n",
4095                      atomic_read(&execlists->tasklet.count));
4096 }
4097
4098 static int gen8_emit_bb_start_noarb(struct i915_request *rq,
4099                                     u64 offset, u32 len,
4100                                     const unsigned int flags)
4101 {
4102         u32 *cs;
4103
4104         cs = intel_ring_begin(rq, 4);
4105         if (IS_ERR(cs))
4106                 return PTR_ERR(cs);
4107
4108         /*
4109          * WaDisableCtxRestoreArbitration:bdw,chv
4110          *
4111          * We don't need to perform MI_ARB_ENABLE as often as we do (in
4112          * particular all the gen that do not need the w/a at all!), if we
4113          * took care to make sure that on every switch into this context
4114          * (both ordinary and for preemption) that arbitrartion was enabled
4115          * we would be fine.  However, for gen8 there is another w/a that
4116          * requires us to not preempt inside GPGPU execution, so we keep
4117          * arbitration disabled for gen8 batches. Arbitration will be
4118          * re-enabled before we close the request
4119          * (engine->emit_fini_breadcrumb).
4120          */
4121         *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
4122
4123         /* FIXME(BDW+): Address space and security selectors. */
4124         *cs++ = MI_BATCH_BUFFER_START_GEN8 |
4125                 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
4126         *cs++ = lower_32_bits(offset);
4127         *cs++ = upper_32_bits(offset);
4128
4129         intel_ring_advance(rq, cs);
4130
4131         return 0;
4132 }
4133
4134 static int gen8_emit_bb_start(struct i915_request *rq,
4135                               u64 offset, u32 len,
4136                               const unsigned int flags)
4137 {
4138         u32 *cs;
4139
4140         cs = intel_ring_begin(rq, 6);
4141         if (IS_ERR(cs))
4142                 return PTR_ERR(cs);
4143
4144         *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4145
4146         *cs++ = MI_BATCH_BUFFER_START_GEN8 |
4147                 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
4148         *cs++ = lower_32_bits(offset);
4149         *cs++ = upper_32_bits(offset);
4150
4151         *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
4152         *cs++ = MI_NOOP;
4153
4154         intel_ring_advance(rq, cs);
4155
4156         return 0;
4157 }
4158
4159 static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine)
4160 {
4161         ENGINE_WRITE(engine, RING_IMR,
4162                      ~(engine->irq_enable_mask | engine->irq_keep_mask));
4163         ENGINE_POSTING_READ(engine, RING_IMR);
4164 }
4165
4166 static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
4167 {
4168         ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
4169 }
4170
4171 static int gen8_emit_flush(struct i915_request *request, u32 mode)
4172 {
4173         u32 cmd, *cs;
4174
4175         cs = intel_ring_begin(request, 4);
4176         if (IS_ERR(cs))
4177                 return PTR_ERR(cs);
4178
4179         cmd = MI_FLUSH_DW + 1;
4180
4181         /* We always require a command barrier so that subsequent
4182          * commands, such as breadcrumb interrupts, are strictly ordered
4183          * wrt the contents of the write cache being flushed to memory
4184          * (and thus being coherent from the CPU).
4185          */
4186         cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
4187
4188         if (mode & EMIT_INVALIDATE) {
4189                 cmd |= MI_INVALIDATE_TLB;
4190                 if (request->engine->class == VIDEO_DECODE_CLASS)
4191                         cmd |= MI_INVALIDATE_BSD;
4192         }
4193
4194         *cs++ = cmd;
4195         *cs++ = LRC_PPHWSP_SCRATCH_ADDR;
4196         *cs++ = 0; /* upper addr */
4197         *cs++ = 0; /* value */
4198         intel_ring_advance(request, cs);
4199
4200         return 0;
4201 }
4202
4203 static int gen8_emit_flush_render(struct i915_request *request,
4204                                   u32 mode)
4205 {
4206         bool vf_flush_wa = false, dc_flush_wa = false;
4207         u32 *cs, flags = 0;
4208         int len;
4209
4210         flags |= PIPE_CONTROL_CS_STALL;
4211
4212         if (mode & EMIT_FLUSH) {
4213                 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4214                 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4215                 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4216                 flags |= PIPE_CONTROL_FLUSH_ENABLE;
4217         }
4218
4219         if (mode & EMIT_INVALIDATE) {
4220                 flags |= PIPE_CONTROL_TLB_INVALIDATE;
4221                 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4222                 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4223                 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4224                 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4225                 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4226                 flags |= PIPE_CONTROL_QW_WRITE;
4227                 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4228
4229                 /*
4230                  * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
4231                  * pipe control.
4232                  */
4233                 if (IS_GEN(request->i915, 9))
4234                         vf_flush_wa = true;
4235
4236                 /* WaForGAMHang:kbl */
4237                 if (IS_KBL_REVID(request->i915, 0, KBL_REVID_B0))
4238                         dc_flush_wa = true;
4239         }
4240
4241         len = 6;
4242
4243         if (vf_flush_wa)
4244                 len += 6;
4245
4246         if (dc_flush_wa)
4247                 len += 12;
4248
4249         cs = intel_ring_begin(request, len);
4250         if (IS_ERR(cs))
4251                 return PTR_ERR(cs);
4252
4253         if (vf_flush_wa)
4254                 cs = gen8_emit_pipe_control(cs, 0, 0);
4255
4256         if (dc_flush_wa)
4257                 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
4258                                             0);
4259
4260         cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4261
4262         if (dc_flush_wa)
4263                 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);
4264
4265         intel_ring_advance(request, cs);
4266
4267         return 0;
4268 }
4269
4270 static int gen11_emit_flush_render(struct i915_request *request,
4271                                    u32 mode)
4272 {
4273         if (mode & EMIT_FLUSH) {
4274                 u32 *cs;
4275                 u32 flags = 0;
4276
4277                 flags |= PIPE_CONTROL_CS_STALL;
4278
4279                 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
4280                 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4281                 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4282                 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4283                 flags |= PIPE_CONTROL_FLUSH_ENABLE;
4284                 flags |= PIPE_CONTROL_QW_WRITE;
4285                 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4286
4287                 cs = intel_ring_begin(request, 6);
4288                 if (IS_ERR(cs))
4289                         return PTR_ERR(cs);
4290
4291                 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4292                 intel_ring_advance(request, cs);
4293         }
4294
4295         if (mode & EMIT_INVALIDATE) {
4296                 u32 *cs;
4297                 u32 flags = 0;
4298
4299                 flags |= PIPE_CONTROL_CS_STALL;
4300
4301                 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
4302                 flags |= PIPE_CONTROL_TLB_INVALIDATE;
4303                 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4304                 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4305                 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4306                 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4307                 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4308                 flags |= PIPE_CONTROL_QW_WRITE;
4309                 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4310
4311                 cs = intel_ring_begin(request, 6);
4312                 if (IS_ERR(cs))
4313                         return PTR_ERR(cs);
4314
4315                 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4316                 intel_ring_advance(request, cs);
4317         }
4318
4319         return 0;
4320 }
4321
4322 static u32 preparser_disable(bool state)
4323 {
4324         return MI_ARB_CHECK | 1 << 8 | state;
4325 }
4326
4327 static int gen12_emit_flush_render(struct i915_request *request,
4328                                    u32 mode)
4329 {
4330         if (mode & EMIT_FLUSH) {
4331                 u32 flags = 0;
4332                 u32 *cs;
4333
4334                 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
4335                 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4336                 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4337                 /* Wa_1409600907:tgl */
4338                 flags |= PIPE_CONTROL_DEPTH_STALL;
4339                 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4340                 flags |= PIPE_CONTROL_FLUSH_ENABLE;
4341                 flags |= PIPE_CONTROL_HDC_PIPELINE_FLUSH;
4342
4343                 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4344                 flags |= PIPE_CONTROL_QW_WRITE;
4345
4346                 flags |= PIPE_CONTROL_CS_STALL;
4347
4348                 cs = intel_ring_begin(request, 6);
4349                 if (IS_ERR(cs))
4350                         return PTR_ERR(cs);
4351
4352                 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4353                 intel_ring_advance(request, cs);
4354         }
4355
4356         if (mode & EMIT_INVALIDATE) {
4357                 u32 flags = 0;
4358                 u32 *cs;
4359
4360                 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
4361                 flags |= PIPE_CONTROL_TLB_INVALIDATE;
4362                 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4363                 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4364                 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4365                 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4366                 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4367                 flags |= PIPE_CONTROL_L3_RO_CACHE_INVALIDATE;
4368
4369                 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4370                 flags |= PIPE_CONTROL_QW_WRITE;
4371
4372                 flags |= PIPE_CONTROL_CS_STALL;
4373
4374                 cs = intel_ring_begin(request, 8);
4375                 if (IS_ERR(cs))
4376                         return PTR_ERR(cs);
4377
4378                 /*
4379                  * Prevent the pre-parser from skipping past the TLB
4380                  * invalidate and loading a stale page for the batch
4381                  * buffer / request payload.
4382                  */
4383                 *cs++ = preparser_disable(true);
4384
4385                 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4386
4387                 *cs++ = preparser_disable(false);
4388                 intel_ring_advance(request, cs);
4389         }
4390
4391         return 0;
4392 }
4393
4394 /*
4395  * Reserve space for 2 NOOPs at the end of each request to be
4396  * used as a workaround for not being allowed to do lite
4397  * restore with HEAD==TAIL (WaIdleLiteRestore).
4398  */
4399 static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs)
4400 {
4401         /* Ensure there's always at least one preemption point per-request. */
4402         *cs++ = MI_ARB_CHECK;
4403         *cs++ = MI_NOOP;
4404         request->wa_tail = intel_ring_offset(request, cs);
4405
4406         return cs;
4407 }
4408
4409 static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs)
4410 {
4411         *cs++ = MI_SEMAPHORE_WAIT |
4412                 MI_SEMAPHORE_GLOBAL_GTT |
4413                 MI_SEMAPHORE_POLL |
4414                 MI_SEMAPHORE_SAD_EQ_SDD;
4415         *cs++ = 0;
4416         *cs++ = intel_hws_preempt_address(request->engine);
4417         *cs++ = 0;
4418
4419         return cs;
4420 }
4421
4422 static __always_inline u32*
4423 gen8_emit_fini_breadcrumb_footer(struct i915_request *request,
4424                                  u32 *cs)
4425 {
4426         *cs++ = MI_USER_INTERRUPT;
4427
4428         *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4429         if (intel_engine_has_semaphores(request->engine))
4430                 cs = emit_preempt_busywait(request, cs);
4431
4432         request->tail = intel_ring_offset(request, cs);
4433         assert_ring_tail_valid(request->ring, request->tail);
4434
4435         return gen8_emit_wa_tail(request, cs);
4436 }
4437
4438 static u32 *gen8_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
4439 {
4440         cs = gen8_emit_ggtt_write(cs,
4441                                   request->fence.seqno,
4442                                   i915_request_active_timeline(request)->hwsp_offset,
4443                                   0);
4444
4445         return gen8_emit_fini_breadcrumb_footer(request, cs);
4446 }
4447
4448 static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4449 {
4450         cs = gen8_emit_pipe_control(cs,
4451                                     PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4452                                     PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4453                                     PIPE_CONTROL_DC_FLUSH_ENABLE,
4454                                     0);
4455
4456         /* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */
4457         cs = gen8_emit_ggtt_write_rcs(cs,
4458                                       request->fence.seqno,
4459                                       i915_request_active_timeline(request)->hwsp_offset,
4460                                       PIPE_CONTROL_FLUSH_ENABLE |
4461                                       PIPE_CONTROL_CS_STALL);
4462
4463         return gen8_emit_fini_breadcrumb_footer(request, cs);
4464 }
4465
4466 static u32 *
4467 gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4468 {
4469         cs = gen8_emit_ggtt_write_rcs(cs,
4470                                       request->fence.seqno,
4471                                       i915_request_active_timeline(request)->hwsp_offset,
4472                                       PIPE_CONTROL_CS_STALL |
4473                                       PIPE_CONTROL_TILE_CACHE_FLUSH |
4474                                       PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4475                                       PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4476                                       PIPE_CONTROL_DC_FLUSH_ENABLE |
4477                                       PIPE_CONTROL_FLUSH_ENABLE);
4478
4479         return gen8_emit_fini_breadcrumb_footer(request, cs);
4480 }
4481
4482 /*
4483  * Note that the CS instruction pre-parser will not stall on the breadcrumb
4484  * flush and will continue pre-fetching the instructions after it before the
4485  * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at
4486  * BB_START/END instructions, so, even though we might pre-fetch the pre-amble
4487  * of the next request before the memory has been flushed, we're guaranteed that
4488  * we won't access the batch itself too early.
4489  * However, on gen12+ the parser can pre-fetch across the BB_START/END commands,
4490  * so, if the current request is modifying an instruction in the next request on
4491  * the same intel_context, we might pre-fetch and then execute the pre-update
4492  * instruction. To avoid this, the users of self-modifying code should either
4493  * disable the parser around the code emitting the memory writes, via a new flag
4494  * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For
4495  * the in-kernel use-cases we've opted to use a separate context, see
4496  * reloc_gpu() as an example.
4497  * All the above applies only to the instructions themselves. Non-inline data
4498  * used by the instructions is not pre-fetched.
4499  */
4500
4501 static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs)
4502 {
4503         *cs++ = MI_SEMAPHORE_WAIT_TOKEN |
4504                 MI_SEMAPHORE_GLOBAL_GTT |
4505                 MI_SEMAPHORE_POLL |
4506                 MI_SEMAPHORE_SAD_EQ_SDD;
4507         *cs++ = 0;
4508         *cs++ = intel_hws_preempt_address(request->engine);
4509         *cs++ = 0;
4510         *cs++ = 0;
4511         *cs++ = MI_NOOP;
4512
4513         return cs;
4514 }
4515
4516 static __always_inline u32*
4517 gen12_emit_fini_breadcrumb_footer(struct i915_request *request, u32 *cs)
4518 {
4519         *cs++ = MI_USER_INTERRUPT;
4520
4521         *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4522         if (intel_engine_has_semaphores(request->engine))
4523                 cs = gen12_emit_preempt_busywait(request, cs);
4524
4525         request->tail = intel_ring_offset(request, cs);
4526         assert_ring_tail_valid(request->ring, request->tail);
4527
4528         return gen8_emit_wa_tail(request, cs);
4529 }
4530
4531 static u32 *gen12_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
4532 {
4533         cs = gen8_emit_ggtt_write(cs,
4534                                   request->fence.seqno,
4535                                   i915_request_active_timeline(request)->hwsp_offset,
4536                                   0);
4537
4538         return gen12_emit_fini_breadcrumb_footer(request, cs);
4539 }
4540
4541 static u32 *
4542 gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4543 {
4544         cs = gen8_emit_ggtt_write_rcs(cs,
4545                                       request->fence.seqno,
4546                                       i915_request_active_timeline(request)->hwsp_offset,
4547                                       PIPE_CONTROL_CS_STALL |
4548                                       PIPE_CONTROL_TILE_CACHE_FLUSH |
4549                                       PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4550                                       PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4551                                       /* Wa_1409600907:tgl */
4552                                       PIPE_CONTROL_DEPTH_STALL |
4553                                       PIPE_CONTROL_DC_FLUSH_ENABLE |
4554                                       PIPE_CONTROL_FLUSH_ENABLE |
4555                                       PIPE_CONTROL_HDC_PIPELINE_FLUSH);
4556
4557         return gen12_emit_fini_breadcrumb_footer(request, cs);
4558 }
4559
4560 static void execlists_park(struct intel_engine_cs *engine)
4561 {
4562         cancel_timer(&engine->execlists.timer);
4563         cancel_timer(&engine->execlists.preempt);
4564 }
4565
4566 void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
4567 {
4568         engine->submit_request = execlists_submit_request;
4569         engine->schedule = i915_schedule;
4570         engine->execlists.tasklet.func = execlists_submission_tasklet;
4571
4572         engine->reset.prepare = execlists_reset_prepare;
4573         engine->reset.rewind = execlists_reset_rewind;
4574         engine->reset.cancel = execlists_reset_cancel;
4575         engine->reset.finish = execlists_reset_finish;
4576
4577         engine->park = execlists_park;
4578         engine->unpark = NULL;
4579
4580         engine->flags |= I915_ENGINE_SUPPORTS_STATS;
4581         if (!intel_vgpu_active(engine->i915)) {
4582                 engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
4583                 if (HAS_LOGICAL_RING_PREEMPTION(engine->i915))
4584                         engine->flags |= I915_ENGINE_HAS_PREEMPTION;
4585         }
4586
4587         if (INTEL_GEN(engine->i915) >= 12)
4588                 engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO;
4589
4590         if (intel_engine_has_preemption(engine))
4591                 engine->emit_bb_start = gen8_emit_bb_start;
4592         else
4593                 engine->emit_bb_start = gen8_emit_bb_start_noarb;
4594 }
4595
4596 static void execlists_shutdown(struct intel_engine_cs *engine)
4597 {
4598         /* Synchronise with residual timers and any softirq they raise */
4599         del_timer_sync(&engine->execlists.timer);
4600         del_timer_sync(&engine->execlists.preempt);
4601         tasklet_kill(&engine->execlists.tasklet);
4602 }
4603
4604 static void execlists_release(struct intel_engine_cs *engine)
4605 {
4606         engine->sanitize = NULL; /* no longer in control, nothing to sanitize */
4607
4608         execlists_shutdown(engine);
4609
4610         intel_engine_cleanup_common(engine);
4611         lrc_destroy_wa_ctx(engine);
4612 }
4613
4614 static void
4615 logical_ring_default_vfuncs(struct intel_engine_cs *engine)
4616 {
4617         /* Default vfuncs which can be overriden by each engine. */
4618
4619         engine->resume = execlists_resume;
4620
4621         engine->cops = &execlists_context_ops;
4622         engine->request_alloc = execlists_request_alloc;
4623
4624         engine->emit_flush = gen8_emit_flush;
4625         engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb;
4626         engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb;
4627         if (INTEL_GEN(engine->i915) >= 12)
4628                 engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb;
4629
4630         engine->set_default_submission = intel_execlists_set_default_submission;
4631
4632         if (INTEL_GEN(engine->i915) < 11) {
4633                 engine->irq_enable = gen8_logical_ring_enable_irq;
4634                 engine->irq_disable = gen8_logical_ring_disable_irq;
4635         } else {
4636                 /*
4637                  * TODO: On Gen11 interrupt masks need to be clear
4638                  * to allow C6 entry. Keep interrupts enabled at
4639                  * and take the hit of generating extra interrupts
4640                  * until a more refined solution exists.
4641                  */
4642         }
4643 }
4644
4645 static inline void
4646 logical_ring_default_irqs(struct intel_engine_cs *engine)
4647 {
4648         unsigned int shift = 0;
4649
4650         if (INTEL_GEN(engine->i915) < 11) {
4651                 const u8 irq_shifts[] = {
4652                         [RCS0]  = GEN8_RCS_IRQ_SHIFT,
4653                         [BCS0]  = GEN8_BCS_IRQ_SHIFT,
4654                         [VCS0]  = GEN8_VCS0_IRQ_SHIFT,
4655                         [VCS1]  = GEN8_VCS1_IRQ_SHIFT,
4656                         [VECS0] = GEN8_VECS_IRQ_SHIFT,
4657                 };
4658
4659                 shift = irq_shifts[engine->id];
4660         }
4661
4662         engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
4663         engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
4664         engine->irq_keep_mask |= GT_CS_MASTER_ERROR_INTERRUPT << shift;
4665         engine->irq_keep_mask |= GT_WAIT_SEMAPHORE_INTERRUPT << shift;
4666 }
4667
4668 static void rcs_submission_override(struct intel_engine_cs *engine)
4669 {
4670         switch (INTEL_GEN(engine->i915)) {
4671         case 12:
4672                 engine->emit_flush = gen12_emit_flush_render;
4673                 engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs;
4674                 break;
4675         case 11:
4676                 engine->emit_flush = gen11_emit_flush_render;
4677                 engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs;
4678                 break;
4679         default:
4680                 engine->emit_flush = gen8_emit_flush_render;
4681                 engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs;
4682                 break;
4683         }
4684 }
4685
4686 int intel_execlists_submission_setup(struct intel_engine_cs *engine)
4687 {
4688         struct intel_engine_execlists * const execlists = &engine->execlists;
4689         struct drm_i915_private *i915 = engine->i915;
4690         struct intel_uncore *uncore = engine->uncore;
4691         u32 base = engine->mmio_base;
4692
4693         tasklet_init(&engine->execlists.tasklet,
4694                      execlists_submission_tasklet, (unsigned long)engine);
4695         timer_setup(&engine->execlists.timer, execlists_timeslice, 0);
4696         timer_setup(&engine->execlists.preempt, execlists_preempt, 0);
4697
4698         logical_ring_default_vfuncs(engine);
4699         logical_ring_default_irqs(engine);
4700
4701         if (engine->class == RENDER_CLASS)
4702                 rcs_submission_override(engine);
4703
4704         if (intel_init_workaround_bb(engine))
4705                 /*
4706                  * We continue even if we fail to initialize WA batch
4707                  * because we only expect rare glitches but nothing
4708                  * critical to prevent us from using GPU
4709                  */
4710                 drm_err(&i915->drm, "WA batch buffer initialization failed\n");
4711
4712         if (HAS_LOGICAL_RING_ELSQ(i915)) {
4713                 execlists->submit_reg = uncore->regs +
4714                         i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base));
4715                 execlists->ctrl_reg = uncore->regs +
4716                         i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base));
4717         } else {
4718                 execlists->submit_reg = uncore->regs +
4719                         i915_mmio_reg_offset(RING_ELSP(base));
4720         }
4721
4722         execlists->csb_status =
4723                 &engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX];
4724
4725         execlists->csb_write =
4726                 &engine->status_page.addr[intel_hws_csb_write_index(i915)];
4727
4728         if (INTEL_GEN(i915) < 11)
4729                 execlists->csb_size = GEN8_CSB_ENTRIES;
4730         else
4731                 execlists->csb_size = GEN11_CSB_ENTRIES;
4732
4733         /* Finally, take ownership and responsibility for cleanup! */
4734         engine->sanitize = execlists_sanitize;
4735         engine->release = execlists_release;
4736
4737         return 0;
4738 }
4739
4740
4741 static void init_common_reg_state(u32 * const regs,
4742                                   const struct intel_engine_cs *engine,
4743                                   const struct intel_ring *ring,
4744                                   bool inhibit)
4745 {
4746         u32 ctl;
4747
4748         ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
4749         ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
4750         if (inhibit)
4751                 ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
4752         if (INTEL_GEN(engine->i915) < 11)
4753                 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
4754                                            CTX_CTRL_RS_CTX_ENABLE);
4755         regs[CTX_CONTEXT_CONTROL] = ctl;
4756
4757         regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
4758         regs[CTX_TIMESTAMP] = 0;
4759 }
4760
4761 static void init_wa_bb_reg_state(u32 * const regs,
4762                                  const struct intel_engine_cs *engine)
4763 {
4764         const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
4765
4766         if (wa_ctx->per_ctx.size) {
4767                 const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
4768
4769                 GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
4770                 regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
4771                         (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
4772         }
4773
4774         if (wa_ctx->indirect_ctx.size) {
4775                 const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
4776
4777                 GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
4778                 regs[lrc_ring_indirect_ptr(engine) + 1] =
4779                         (ggtt_offset + wa_ctx->indirect_ctx.offset) |
4780                         (wa_ctx->indirect_ctx.size / CACHELINE_BYTES);
4781
4782                 GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
4783                 regs[lrc_ring_indirect_offset(engine) + 1] =
4784                         lrc_ring_indirect_offset_default(engine) << 6;
4785         }
4786 }
4787
4788 static void init_ppgtt_reg_state(u32 *regs, const struct i915_ppgtt *ppgtt)
4789 {
4790         if (i915_vm_is_4lvl(&ppgtt->vm)) {
4791                 /* 64b PPGTT (48bit canonical)
4792                  * PDP0_DESCRIPTOR contains the base address to PML4 and
4793                  * other PDP Descriptors are ignored.
4794                  */
4795                 ASSIGN_CTX_PML4(ppgtt, regs);
4796         } else {
4797                 ASSIGN_CTX_PDP(ppgtt, regs, 3);
4798                 ASSIGN_CTX_PDP(ppgtt, regs, 2);
4799                 ASSIGN_CTX_PDP(ppgtt, regs, 1);
4800                 ASSIGN_CTX_PDP(ppgtt, regs, 0);
4801         }
4802 }
4803
4804 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
4805 {
4806         if (i915_is_ggtt(vm))
4807                 return i915_vm_to_ggtt(vm)->alias;
4808         else
4809                 return i915_vm_to_ppgtt(vm);
4810 }
4811
4812 static void execlists_init_reg_state(u32 *regs,
4813                                      const struct intel_context *ce,
4814                                      const struct intel_engine_cs *engine,
4815                                      const struct intel_ring *ring,
4816                                      bool inhibit)
4817 {
4818         /*
4819          * A context is actually a big batch buffer with several
4820          * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
4821          * values we are setting here are only for the first context restore:
4822          * on a subsequent save, the GPU will recreate this batchbuffer with new
4823          * values (including all the missing MI_LOAD_REGISTER_IMM commands that
4824          * we are not initializing here).
4825          *
4826          * Must keep consistent with virtual_update_register_offsets().
4827          */
4828         set_offsets(regs, reg_offsets(engine), engine, inhibit);
4829
4830         init_common_reg_state(regs, engine, ring, inhibit);
4831         init_ppgtt_reg_state(regs, vm_alias(ce->vm));
4832
4833         init_wa_bb_reg_state(regs, engine);
4834
4835         __reset_stop_ring(regs, engine);
4836 }
4837
4838 static int
4839 populate_lr_context(struct intel_context *ce,
4840                     struct drm_i915_gem_object *ctx_obj,
4841                     struct intel_engine_cs *engine,
4842                     struct intel_ring *ring)
4843 {
4844         bool inhibit = true;
4845         void *vaddr;
4846         int ret;
4847
4848         vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
4849         if (IS_ERR(vaddr)) {
4850                 ret = PTR_ERR(vaddr);
4851                 drm_dbg(&engine->i915->drm,
4852                         "Could not map object pages! (%d)\n", ret);
4853                 return ret;
4854         }
4855
4856         set_redzone(vaddr, engine);
4857
4858         if (engine->default_state) {
4859                 void *defaults;
4860
4861                 defaults = i915_gem_object_pin_map(engine->default_state,
4862                                                    I915_MAP_WB);
4863                 if (IS_ERR(defaults)) {
4864                         ret = PTR_ERR(defaults);
4865                         goto err_unpin_ctx;
4866                 }
4867
4868                 memcpy(vaddr, defaults, engine->context_size);
4869                 i915_gem_object_unpin_map(engine->default_state);
4870                 __set_bit(CONTEXT_VALID_BIT, &ce->flags);
4871                 inhibit = false;
4872         }
4873
4874         /* Clear the ppHWSP (inc. per-context counters) */
4875         memset(vaddr, 0, PAGE_SIZE);
4876
4877         /*
4878          * The second page of the context object contains some registers which
4879          * must be set up prior to the first execution.
4880          */
4881         execlists_init_reg_state(vaddr + LRC_STATE_OFFSET,
4882                                  ce, engine, ring, inhibit);
4883
4884         ret = 0;
4885 err_unpin_ctx:
4886         __i915_gem_object_flush_map(ctx_obj, 0, engine->context_size);
4887         i915_gem_object_unpin_map(ctx_obj);
4888         return ret;
4889 }
4890
4891 static int __execlists_context_alloc(struct intel_context *ce,
4892                                      struct intel_engine_cs *engine)
4893 {
4894         struct drm_i915_gem_object *ctx_obj;
4895         struct intel_ring *ring;
4896         struct i915_vma *vma;
4897         u32 context_size;
4898         int ret;
4899
4900         GEM_BUG_ON(ce->state);
4901         context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
4902
4903         if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
4904                 context_size += I915_GTT_PAGE_SIZE; /* for redzone */
4905
4906         ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size);
4907         if (IS_ERR(ctx_obj))
4908                 return PTR_ERR(ctx_obj);
4909
4910         vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL);
4911         if (IS_ERR(vma)) {
4912                 ret = PTR_ERR(vma);
4913                 goto error_deref_obj;
4914         }
4915
4916         if (!ce->timeline) {
4917                 struct intel_timeline *tl;
4918                 struct i915_vma *hwsp;
4919
4920                 /*
4921                  * Use the static global HWSP for the kernel context, and
4922                  * a dynamically allocated cacheline for everyone else.
4923                  */
4924                 hwsp = NULL;
4925                 if (unlikely(intel_context_is_barrier(ce)))
4926                         hwsp = engine->status_page.vma;
4927
4928                 tl = intel_timeline_create(engine->gt, hwsp);
4929                 if (IS_ERR(tl)) {
4930                         ret = PTR_ERR(tl);
4931                         goto error_deref_obj;
4932                 }
4933
4934                 ce->timeline = tl;
4935         }
4936
4937         ring = intel_engine_create_ring(engine, (unsigned long)ce->ring);
4938         if (IS_ERR(ring)) {
4939                 ret = PTR_ERR(ring);
4940                 goto error_deref_obj;
4941         }
4942
4943         ret = populate_lr_context(ce, ctx_obj, engine, ring);
4944         if (ret) {
4945                 drm_dbg(&engine->i915->drm,
4946                         "Failed to populate LRC: %d\n", ret);
4947                 goto error_ring_free;
4948         }
4949
4950         ce->ring = ring;
4951         ce->state = vma;
4952
4953         return 0;
4954
4955 error_ring_free:
4956         intel_ring_put(ring);
4957 error_deref_obj:
4958         i915_gem_object_put(ctx_obj);
4959         return ret;
4960 }
4961
4962 static struct list_head *virtual_queue(struct virtual_engine *ve)
4963 {
4964         return &ve->base.execlists.default_priolist.requests[0];
4965 }
4966
4967 static void virtual_context_destroy(struct kref *kref)
4968 {
4969         struct virtual_engine *ve =
4970                 container_of(kref, typeof(*ve), context.ref);
4971         unsigned int n;
4972
4973         GEM_BUG_ON(!list_empty(virtual_queue(ve)));
4974         GEM_BUG_ON(ve->request);
4975         GEM_BUG_ON(ve->context.inflight);
4976
4977         for (n = 0; n < ve->num_siblings; n++) {
4978                 struct intel_engine_cs *sibling = ve->siblings[n];
4979                 struct rb_node *node = &ve->nodes[sibling->id].rb;
4980                 unsigned long flags;
4981
4982                 if (RB_EMPTY_NODE(node))
4983                         continue;
4984
4985                 spin_lock_irqsave(&sibling->active.lock, flags);
4986
4987                 /* Detachment is lazily performed in the execlists tasklet */
4988                 if (!RB_EMPTY_NODE(node))
4989                         rb_erase_cached(node, &sibling->execlists.virtual);
4990
4991                 spin_unlock_irqrestore(&sibling->active.lock, flags);
4992         }
4993         GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet));
4994
4995         if (ve->context.state)
4996                 __execlists_context_fini(&ve->context);
4997         intel_context_fini(&ve->context);
4998
4999         intel_engine_free_request_pool(&ve->base);
5000
5001         kfree(ve->bonds);
5002         kfree(ve);
5003 }
5004
5005 static void virtual_engine_initial_hint(struct virtual_engine *ve)
5006 {
5007         int swp;
5008
5009         /*
5010          * Pick a random sibling on starting to help spread the load around.
5011          *
5012          * New contexts are typically created with exactly the same order
5013          * of siblings, and often started in batches. Due to the way we iterate
5014          * the array of sibling when submitting requests, sibling[0] is
5015          * prioritised for dequeuing. If we make sure that sibling[0] is fairly
5016          * randomised across the system, we also help spread the load by the
5017          * first engine we inspect being different each time.
5018          *
5019          * NB This does not force us to execute on this engine, it will just
5020          * typically be the first we inspect for submission.
5021          */
5022         swp = prandom_u32_max(ve->num_siblings);
5023         if (!swp)
5024                 return;
5025
5026         swap(ve->siblings[swp], ve->siblings[0]);
5027         if (!intel_engine_has_relative_mmio(ve->siblings[0]))
5028                 virtual_update_register_offsets(ve->context.lrc_reg_state,
5029                                                 ve->siblings[0]);
5030 }
5031
5032 static int virtual_context_alloc(struct intel_context *ce)
5033 {
5034         struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5035
5036         return __execlists_context_alloc(ce, ve->siblings[0]);
5037 }
5038
5039 static int virtual_context_pin(struct intel_context *ce)
5040 {
5041         struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5042         int err;
5043
5044         /* Note: we must use a real engine class for setting up reg state */
5045         err = __execlists_context_pin(ce, ve->siblings[0]);
5046         if (err)
5047                 return err;
5048
5049         virtual_engine_initial_hint(ve);
5050         return 0;
5051 }
5052
5053 static void virtual_context_enter(struct intel_context *ce)
5054 {
5055         struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5056         unsigned int n;
5057
5058         for (n = 0; n < ve->num_siblings; n++)
5059                 intel_engine_pm_get(ve->siblings[n]);
5060
5061         intel_timeline_enter(ce->timeline);
5062 }
5063
5064 static void virtual_context_exit(struct intel_context *ce)
5065 {
5066         struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5067         unsigned int n;
5068
5069         intel_timeline_exit(ce->timeline);
5070
5071         for (n = 0; n < ve->num_siblings; n++)
5072                 intel_engine_pm_put(ve->siblings[n]);
5073 }
5074
5075 static const struct intel_context_ops virtual_context_ops = {
5076         .alloc = virtual_context_alloc,
5077
5078         .pin = virtual_context_pin,
5079         .unpin = execlists_context_unpin,
5080
5081         .enter = virtual_context_enter,
5082         .exit = virtual_context_exit,
5083
5084         .destroy = virtual_context_destroy,
5085 };
5086
5087 static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve)
5088 {
5089         struct i915_request *rq;
5090         intel_engine_mask_t mask;
5091
5092         rq = READ_ONCE(ve->request);
5093         if (!rq)
5094                 return 0;
5095
5096         /* The rq is ready for submission; rq->execution_mask is now stable. */
5097         mask = rq->execution_mask;
5098         if (unlikely(!mask)) {
5099                 /* Invalid selection, submit to a random engine in error */
5100                 i915_request_set_error_once(rq, -ENODEV);
5101                 mask = ve->siblings[0]->mask;
5102         }
5103
5104         ENGINE_TRACE(&ve->base, "rq=%llx:%lld, mask=%x, prio=%d\n",
5105                      rq->fence.context, rq->fence.seqno,
5106                      mask, ve->base.execlists.queue_priority_hint);
5107
5108         return mask;
5109 }
5110
5111 static void virtual_submission_tasklet(unsigned long data)
5112 {
5113         struct virtual_engine * const ve = (struct virtual_engine *)data;
5114         const int prio = READ_ONCE(ve->base.execlists.queue_priority_hint);
5115         intel_engine_mask_t mask;
5116         unsigned int n;
5117
5118         rcu_read_lock();
5119         mask = virtual_submission_mask(ve);
5120         rcu_read_unlock();
5121         if (unlikely(!mask))
5122                 return;
5123
5124         local_irq_disable();
5125         for (n = 0; n < ve->num_siblings; n++) {
5126                 struct intel_engine_cs *sibling = READ_ONCE(ve->siblings[n]);
5127                 struct ve_node * const node = &ve->nodes[sibling->id];
5128                 struct rb_node **parent, *rb;
5129                 bool first;
5130
5131                 if (!READ_ONCE(ve->request))
5132                         break; /* already handled by a sibling's tasklet */
5133
5134                 if (unlikely(!(mask & sibling->mask))) {
5135                         if (!RB_EMPTY_NODE(&node->rb)) {
5136                                 spin_lock(&sibling->active.lock);
5137                                 rb_erase_cached(&node->rb,
5138                                                 &sibling->execlists.virtual);
5139                                 RB_CLEAR_NODE(&node->rb);
5140                                 spin_unlock(&sibling->active.lock);
5141                         }
5142                         continue;
5143                 }
5144
5145                 spin_lock(&sibling->active.lock);
5146
5147                 if (!RB_EMPTY_NODE(&node->rb)) {
5148                         /*
5149                          * Cheat and avoid rebalancing the tree if we can
5150                          * reuse this node in situ.
5151                          */
5152                         first = rb_first_cached(&sibling->execlists.virtual) ==
5153                                 &node->rb;
5154                         if (prio == node->prio || (prio > node->prio && first))
5155                                 goto submit_engine;
5156
5157                         rb_erase_cached(&node->rb, &sibling->execlists.virtual);
5158                 }
5159
5160                 rb = NULL;
5161                 first = true;
5162                 parent = &sibling->execlists.virtual.rb_root.rb_node;
5163                 while (*parent) {
5164                         struct ve_node *other;
5165
5166                         rb = *parent;
5167                         other = rb_entry(rb, typeof(*other), rb);
5168                         if (prio > other->prio) {
5169                                 parent = &rb->rb_left;
5170                         } else {
5171                                 parent = &rb->rb_right;
5172                                 first = false;
5173                         }
5174                 }
5175
5176                 rb_link_node(&node->rb, rb, parent);
5177                 rb_insert_color_cached(&node->rb,
5178                                        &sibling->execlists.virtual,
5179                                        first);
5180
5181 submit_engine:
5182                 GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
5183                 node->prio = prio;
5184                 if (first && prio > sibling->execlists.queue_priority_hint)
5185                         tasklet_hi_schedule(&sibling->execlists.tasklet);
5186
5187                 spin_unlock(&sibling->active.lock);
5188         }
5189         local_irq_enable();
5190 }
5191
5192 static void virtual_submit_request(struct i915_request *rq)
5193 {
5194         struct virtual_engine *ve = to_virtual_engine(rq->engine);
5195         struct i915_request *old;
5196         unsigned long flags;
5197
5198         ENGINE_TRACE(&ve->base, "rq=%llx:%lld\n",
5199                      rq->fence.context,
5200                      rq->fence.seqno);
5201
5202         GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);
5203
5204         spin_lock_irqsave(&ve->base.active.lock, flags);
5205
5206         old = ve->request;
5207         if (old) { /* background completion event from preempt-to-busy */
5208                 GEM_BUG_ON(!i915_request_completed(old));
5209                 __i915_request_submit(old);
5210                 i915_request_put(old);
5211         }
5212
5213         if (i915_request_completed(rq)) {
5214                 __i915_request_submit(rq);
5215
5216                 ve->base.execlists.queue_priority_hint = INT_MIN;
5217                 ve->request = NULL;
5218         } else {
5219                 ve->base.execlists.queue_priority_hint = rq_prio(rq);
5220                 ve->request = i915_request_get(rq);
5221
5222                 GEM_BUG_ON(!list_empty(virtual_queue(ve)));
5223                 list_move_tail(&rq->sched.link, virtual_queue(ve));
5224
5225                 tasklet_schedule(&ve->base.execlists.tasklet);
5226         }
5227
5228         spin_unlock_irqrestore(&ve->base.active.lock, flags);
5229 }
5230
5231 static struct ve_bond *
5232 virtual_find_bond(struct virtual_engine *ve,
5233                   const struct intel_engine_cs *master)
5234 {
5235         int i;
5236
5237         for (i = 0; i < ve->num_bonds; i++) {
5238                 if (ve->bonds[i].master == master)
5239                         return &ve->bonds[i];
5240         }
5241
5242         return NULL;
5243 }
5244
5245 static void
5246 virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
5247 {
5248         struct virtual_engine *ve = to_virtual_engine(rq->engine);
5249         intel_engine_mask_t allowed, exec;
5250         struct ve_bond *bond;
5251
5252         allowed = ~to_request(signal)->engine->mask;
5253
5254         bond = virtual_find_bond(ve, to_request(signal)->engine);
5255         if (bond)
5256                 allowed &= bond->sibling_mask;
5257
5258         /* Restrict the bonded request to run on only the available engines */
5259         exec = READ_ONCE(rq->execution_mask);
5260         while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed))
5261                 ;
5262
5263         /* Prevent the master from being re-run on the bonded engines */
5264         to_request(signal)->execution_mask &= ~allowed;
5265 }
5266
5267 struct intel_context *
5268 intel_execlists_create_virtual(struct intel_engine_cs **siblings,
5269                                unsigned int count)
5270 {
5271         struct virtual_engine *ve;
5272         unsigned int n;
5273         int err;
5274
5275         if (count == 0)
5276                 return ERR_PTR(-EINVAL);
5277
5278         if (count == 1)
5279                 return intel_context_create(siblings[0]);
5280
5281         ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
5282         if (!ve)
5283                 return ERR_PTR(-ENOMEM);
5284
5285         ve->base.i915 = siblings[0]->i915;
5286         ve->base.gt = siblings[0]->gt;
5287         ve->base.uncore = siblings[0]->uncore;
5288         ve->base.id = -1;
5289
5290         ve->base.class = OTHER_CLASS;
5291         ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
5292         ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
5293         ve->base.uabi_instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
5294
5295         /*
5296          * The decision on whether to submit a request using semaphores
5297          * depends on the saturated state of the engine. We only compute
5298          * this during HW submission of the request, and we need for this
5299          * state to be globally applied to all requests being submitted
5300          * to this engine. Virtual engines encompass more than one physical
5301          * engine and so we cannot accurately tell in advance if one of those
5302          * engines is already saturated and so cannot afford to use a semaphore
5303          * and be pessimized in priority for doing so -- if we are the only
5304          * context using semaphores after all other clients have stopped, we
5305          * will be starved on the saturated system. Such a global switch for
5306          * semaphores is less than ideal, but alas is the current compromise.
5307          */
5308         ve->base.saturated = ALL_ENGINES;
5309
5310         snprintf(ve->base.name, sizeof(ve->base.name), "virtual");
5311
5312         intel_engine_init_active(&ve->base, ENGINE_VIRTUAL);
5313         intel_engine_init_breadcrumbs(&ve->base);
5314         intel_engine_init_execlists(&ve->base);
5315
5316         ve->base.cops = &virtual_context_ops;
5317         ve->base.request_alloc = execlists_request_alloc;
5318
5319         ve->base.schedule = i915_schedule;
5320         ve->base.submit_request = virtual_submit_request;
5321         ve->base.bond_execute = virtual_bond_execute;
5322
5323         INIT_LIST_HEAD(virtual_queue(ve));
5324         ve->base.execlists.queue_priority_hint = INT_MIN;
5325         tasklet_init(&ve->base.execlists.tasklet,
5326                      virtual_submission_tasklet,
5327                      (unsigned long)ve);
5328
5329         intel_context_init(&ve->context, &ve->base);
5330
5331         for (n = 0; n < count; n++) {
5332                 struct intel_engine_cs *sibling = siblings[n];
5333
5334                 GEM_BUG_ON(!is_power_of_2(sibling->mask));
5335                 if (sibling->mask & ve->base.mask) {
5336                         DRM_DEBUG("duplicate %s entry in load balancer\n",
5337                                   sibling->name);
5338                         err = -EINVAL;
5339                         goto err_put;
5340                 }
5341
5342                 /*
5343                  * The virtual engine implementation is tightly coupled to
5344                  * the execlists backend -- we push out request directly
5345                  * into a tree inside each physical engine. We could support
5346                  * layering if we handle cloning of the requests and
5347                  * submitting a copy into each backend.
5348                  */
5349                 if (sibling->execlists.tasklet.func !=
5350                     execlists_submission_tasklet) {
5351                         err = -ENODEV;
5352                         goto err_put;
5353                 }
5354
5355                 GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
5356                 RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);
5357
5358                 ve->siblings[ve->num_siblings++] = sibling;
5359                 ve->base.mask |= sibling->mask;
5360
5361                 /*
5362                  * All physical engines must be compatible for their emission
5363                  * functions (as we build the instructions during request
5364                  * construction and do not alter them before submission
5365                  * on the physical engine). We use the engine class as a guide
5366                  * here, although that could be refined.
5367                  */
5368                 if (ve->base.class != OTHER_CLASS) {
5369                         if (ve->base.class != sibling->class) {
5370                                 DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n",
5371                                           sibling->class, ve->base.class);
5372                                 err = -EINVAL;
5373                                 goto err_put;
5374                         }
5375                         continue;
5376                 }
5377
5378                 ve->base.class = sibling->class;
5379                 ve->base.uabi_class = sibling->uabi_class;
5380                 snprintf(ve->base.name, sizeof(ve->base.name),
5381                          "v%dx%d", ve->base.class, count);
5382                 ve->base.context_size = sibling->context_size;
5383
5384                 ve->base.emit_bb_start = sibling->emit_bb_start;
5385                 ve->base.emit_flush = sibling->emit_flush;
5386                 ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
5387                 ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
5388                 ve->base.emit_fini_breadcrumb_dw =
5389                         sibling->emit_fini_breadcrumb_dw;
5390
5391                 ve->base.flags = sibling->flags;
5392         }
5393
5394         ve->base.flags |= I915_ENGINE_IS_VIRTUAL;
5395
5396         return &ve->context;
5397
5398 err_put:
5399         intel_context_put(&ve->context);
5400         return ERR_PTR(err);
5401 }
5402
5403 struct intel_context *
5404 intel_execlists_clone_virtual(struct intel_engine_cs *src)
5405 {
5406         struct virtual_engine *se = to_virtual_engine(src);
5407         struct intel_context *dst;
5408
5409         dst = intel_execlists_create_virtual(se->siblings,
5410                                              se->num_siblings);
5411         if (IS_ERR(dst))
5412                 return dst;
5413
5414         if (se->num_bonds) {
5415                 struct virtual_engine *de = to_virtual_engine(dst->engine);
5416
5417                 de->bonds = kmemdup(se->bonds,
5418                                     sizeof(*se->bonds) * se->num_bonds,
5419                                     GFP_KERNEL);
5420                 if (!de->bonds) {
5421                         intel_context_put(dst);
5422                         return ERR_PTR(-ENOMEM);
5423                 }
5424
5425                 de->num_bonds = se->num_bonds;
5426         }
5427
5428         return dst;
5429 }
5430
5431 int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
5432                                      const struct intel_engine_cs *master,
5433                                      const struct intel_engine_cs *sibling)
5434 {
5435         struct virtual_engine *ve = to_virtual_engine(engine);
5436         struct ve_bond *bond;
5437         int n;
5438
5439         /* Sanity check the sibling is part of the virtual engine */
5440         for (n = 0; n < ve->num_siblings; n++)
5441                 if (sibling == ve->siblings[n])
5442                         break;
5443         if (n == ve->num_siblings)
5444                 return -EINVAL;
5445
5446         bond = virtual_find_bond(ve, master);
5447         if (bond) {
5448                 bond->sibling_mask |= sibling->mask;
5449                 return 0;
5450         }
5451
5452         bond = krealloc(ve->bonds,
5453                         sizeof(*bond) * (ve->num_bonds + 1),
5454                         GFP_KERNEL);
5455         if (!bond)
5456                 return -ENOMEM;
5457
5458         bond[ve->num_bonds].master = master;
5459         bond[ve->num_bonds].sibling_mask = sibling->mask;
5460
5461         ve->bonds = bond;
5462         ve->num_bonds++;
5463
5464         return 0;
5465 }
5466
5467 struct intel_engine_cs *
5468 intel_virtual_engine_get_sibling(struct intel_engine_cs *engine,
5469                                  unsigned int sibling)
5470 {
5471         struct virtual_engine *ve = to_virtual_engine(engine);
5472
5473         if (sibling >= ve->num_siblings)
5474                 return NULL;
5475
5476         return ve->siblings[sibling];
5477 }
5478
5479 void intel_execlists_show_requests(struct intel_engine_cs *engine,
5480                                    struct drm_printer *m,
5481                                    void (*show_request)(struct drm_printer *m,
5482                                                         struct i915_request *rq,
5483                                                         const char *prefix),
5484                                    unsigned int max)
5485 {
5486         const struct intel_engine_execlists *execlists = &engine->execlists;
5487         struct i915_request *rq, *last;
5488         unsigned long flags;
5489         unsigned int count;
5490         struct rb_node *rb;
5491
5492         spin_lock_irqsave(&engine->active.lock, flags);
5493
5494         last = NULL;
5495         count = 0;
5496         list_for_each_entry(rq, &engine->active.requests, sched.link) {
5497                 if (count++ < max - 1)
5498                         show_request(m, rq, "\t\tE ");
5499                 else
5500                         last = rq;
5501         }
5502         if (last) {
5503                 if (count > max) {
5504                         drm_printf(m,
5505                                    "\t\t...skipping %d executing requests...\n",
5506                                    count - max);
5507                 }
5508                 show_request(m, last, "\t\tE ");
5509         }
5510
5511         if (execlists->switch_priority_hint != INT_MIN)
5512                 drm_printf(m, "\t\tSwitch priority hint: %d\n",
5513                            READ_ONCE(execlists->switch_priority_hint));
5514         if (execlists->queue_priority_hint != INT_MIN)
5515                 drm_printf(m, "\t\tQueue priority hint: %d\n",
5516                            READ_ONCE(execlists->queue_priority_hint));
5517
5518         last = NULL;
5519         count = 0;
5520         for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
5521                 struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
5522                 int i;
5523
5524                 priolist_for_each_request(rq, p, i) {
5525                         if (count++ < max - 1)
5526                                 show_request(m, rq, "\t\tQ ");
5527                         else
5528                                 last = rq;
5529                 }
5530         }
5531         if (last) {
5532                 if (count > max) {
5533                         drm_printf(m,
5534                                    "\t\t...skipping %d queued requests...\n",
5535                                    count - max);
5536                 }
5537                 show_request(m, last, "\t\tQ ");
5538         }
5539
5540         last = NULL;
5541         count = 0;
5542         for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
5543                 struct virtual_engine *ve =
5544                         rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
5545                 struct i915_request *rq = READ_ONCE(ve->request);
5546
5547                 if (rq) {
5548                         if (count++ < max - 1)
5549                                 show_request(m, rq, "\t\tV ");
5550                         else
5551                                 last = rq;
5552                 }
5553         }
5554         if (last) {
5555                 if (count > max) {
5556                         drm_printf(m,
5557                                    "\t\t...skipping %d virtual requests...\n",
5558                                    count - max);
5559                 }
5560                 show_request(m, last, "\t\tV ");
5561         }
5562
5563         spin_unlock_irqrestore(&engine->active.lock, flags);
5564 }
5565
5566 void intel_lr_context_reset(struct intel_engine_cs *engine,
5567                             struct intel_context *ce,
5568                             u32 head,
5569                             bool scrub)
5570 {
5571         GEM_BUG_ON(!intel_context_is_pinned(ce));
5572
5573         /*
5574          * We want a simple context + ring to execute the breadcrumb update.
5575          * We cannot rely on the context being intact across the GPU hang,
5576          * so clear it and rebuild just what we need for the breadcrumb.
5577          * All pending requests for this context will be zapped, and any
5578          * future request will be after userspace has had the opportunity
5579          * to recreate its own state.
5580          */
5581         if (scrub)
5582                 restore_default_state(ce, engine);
5583
5584         /* Rerun the request; its payload has been neutered (if guilty). */
5585         __execlists_update_reg_state(ce, engine, head);
5586 }
5587
5588 bool
5589 intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine)
5590 {
5591         return engine->set_default_submission ==
5592                intel_execlists_set_default_submission;
5593 }
5594
5595 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
5596 #include "selftest_lrc.c"
5597 #endif