drivers/gpu/drm/i915/gt/intel_lrc.c

   1 /*
   2  * Copyright © 2014 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  *
  23  * Authors:
  24  *    Ben Widawsky <ben@bwidawsk.net>
  25  *    Michel Thierry <michel.thierry@intel.com>
  26  *    Thomas Daniel <thomas.daniel@intel.com>
  27  *    Oscar Mateo <oscar.mateo@intel.com>
  28  *
  29  */
  30
  31 /**
  32  * DOC: Logical Rings, Logical Ring Contexts and Execlists
  33  *
  34  * Motivation:
  35  * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
  36  * These expanded contexts enable a number of new abilities, especially
  37  * "Execlists" (also implemented in this file).
  38  *
  39  * One of the main differences with the legacy HW contexts is that logical
  40  * ring contexts incorporate many more things to the context's state, like
  41  * PDPs or ringbuffer control registers:
  42  *
  43  * The reason why PDPs are included in the context is straightforward: as
  44  * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
  45  * contained there mean you don't need to do a ppgtt->switch_mm yourself,
  46  * instead, the GPU will do it for you on the context switch.
  47  *
  48  * But, what about the ringbuffer control registers (head, tail, etc..)?
  49  * shouldn't we just need a set of those per engine command streamer? This is
  50  * where the name "Logical Rings" starts to make sense: by virtualizing the
  51  * rings, the engine cs shifts to a new "ring buffer" with every context
  52  * switch. When you want to submit a workload to the GPU you: A) choose your
  53  * context, B) find its appropriate virtualized ring, C) write commands to it
  54  * and then, finally, D) tell the GPU to switch to that context.
  55  *
  56  * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
  57  * to a contexts is via a context execution list, ergo "Execlists".
  58  *
  59  * LRC implementation:
  60  * Regarding the creation of contexts, we have:
  61  *
  62  * - One global default context.
  63  * - One local default context for each opened fd.
  64  * - One local extra context for each context create ioctl call.
  65  *
  66  * Now that ringbuffers belong per-context (and not per-engine, like before)
  67  * and that contexts are uniquely tied to a given engine (and not reusable,
  68  * like before) we need:
  69  *
  70  * - One ringbuffer per-engine inside each context.
  71  * - One backing object per-engine inside each context.
  72  *
  73  * The global default context starts its life with these new objects fully
  74  * allocated and populated. The local default context for each opened fd is
  75  * more complex, because we don't know at creation time which engine is going
  76  * to use them. To handle this, we have implemented a deferred creation of LR
  77  * contexts:
  78  *
  79  * The local context starts its life as a hollow or blank holder, that only
  80  * gets populated for a given engine once we receive an execbuffer. If later
  81  * on we receive another execbuffer ioctl for the same context but a different
  82  * engine, we allocate/populate a new ringbuffer and context backing object and
  83  * so on.
  84  *
  85  * Finally, regarding local contexts created using the ioctl call: as they are
  86  * only allowed with the render ring, we can allocate & populate them right
  87  * away (no need to defer anything, at least for now).
  88  *
  89  * Execlists implementation:
  90  * Execlists are the new method by which, on gen8+ hardware, workloads are
  91  * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
  92  * This method works as follows:
  93  *
  94  * When a request is committed, its commands (the BB start and any leading or
  95  * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
  96  * for the appropriate context. The tail pointer in the hardware context is not
  97  * updated at this time, but instead, kept by the driver in the ringbuffer
  98  * structure. A structure representing this request is added to a request queue
  99  * for the appropriate engine: this structure contains a copy of the context's
 100  * tail after the request was written to the ring buffer and a pointer to the
 101  * context itself.
 102  *
 103  * If the engine's request queue was empty before the request was added, the
 104  * queue is processed immediately. Otherwise the queue will be processed during
 105  * a context switch interrupt. In any case, elements on the queue will get sent
 106  * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
 107  * globally unique 20-bits submission ID.
 108  *
 109  * When execution of a request completes, the GPU updates the context status
 110  * buffer with a context complete event and generates a context switch interrupt.
 111  * During the interrupt handling, the driver examines the events in the buffer:
 112  * for each context complete event, if the announced ID matches that on the head
 113  * of the request queue, then that request is retired and removed from the queue.
 114  *
 115  * After processing, if any requests were retired and the queue is not empty
 116  * then a new execution list can be submitted. The two requests at the front of
 117  * the queue are next to be submitted but since a context may not occur twice in
 118  * an execution list, if subsequent requests have the same ID as the first then
 119  * the two requests must be combined. This is done simply by discarding requests
 120  * at the head of the queue until either only one requests is left (in which case
 121  * we use a NULL second context) or the first two requests have unique IDs.
 122  *
 123  * By always executing the first two requests in the queue the driver ensures
 124  * that the GPU is kept as busy as possible. In the case where a single context
 125  * completes but a second context is still executing, the request for this second
 126  * context will be at the head of the queue when we remove the first one. This
 127  * request will then be resubmitted along with a new request for a different context,
 128  * which will cause the hardware to continue executing the second request and queue
 129  * the new request (the GPU detects the condition of a context getting preempted
 130  * with the same context and optimizes the context switch flow by not doing
 131  * preemption, but just sampling the new tail pointer).
 132  *
 133  */
 134 #include <linux/interrupt.h>
 135
 136 #include "i915_drv.h"
 137 #include "i915_perf.h"
 138 #include "i915_trace.h"
 139 #include "i915_vgpu.h"
 140 #include "intel_breadcrumbs.h"
 141 #include "intel_context.h"
 142 #include "intel_engine_pm.h"
 143 #include "intel_gt.h"
 144 #include "intel_gt_pm.h"
 145 #include "intel_gt_requests.h"
 146 #include "intel_lrc_reg.h"
 147 #include "intel_mocs.h"
 148 #include "intel_reset.h"
 149 #include "intel_ring.h"
 150 #include "intel_workarounds.h"
 151 #include "shmem_utils.h"
 152
 153 #define RING_EXECLIST_QFULL             (1 << 0x2)
 154 #define RING_EXECLIST1_VALID            (1 << 0x3)
 155 #define RING_EXECLIST0_VALID            (1 << 0x4)
 156 #define RING_EXECLIST_ACTIVE_STATUS     (3 << 0xE)
 157 #define RING_EXECLIST1_ACTIVE           (1 << 0x11)
 158 #define RING_EXECLIST0_ACTIVE           (1 << 0x12)
 159
 160 #define GEN8_CTX_STATUS_IDLE_ACTIVE     (1 << 0)
 161 #define GEN8_CTX_STATUS_PREEMPTED       (1 << 1)
 162 #define GEN8_CTX_STATUS_ELEMENT_SWITCH  (1 << 2)
 163 #define GEN8_CTX_STATUS_ACTIVE_IDLE     (1 << 3)
 164 #define GEN8_CTX_STATUS_COMPLETE        (1 << 4)
 165 #define GEN8_CTX_STATUS_LITE_RESTORE    (1 << 15)
 166
 167 #define GEN8_CTX_STATUS_COMPLETED_MASK \
 168          (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)
 169
 170 #define CTX_DESC_FORCE_RESTORE BIT_ULL(2)
 171
 172 #define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE  (0x1) /* lower csb dword */
 173 #define GEN12_CTX_SWITCH_DETAIL(csb_dw) ((csb_dw) & 0xF) /* upper csb dword */
 174 #define GEN12_CSB_SW_CTX_ID_MASK                GENMASK(25, 15)
 175 #define GEN12_IDLE_CTX_ID               0x7FF
 176 #define GEN12_CSB_CTX_VALID(csb_dw) \
 177         (FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)
 178
 179 /* Typical size of the average request (2 pipecontrols and a MI_BB) */
 180 #define EXECLISTS_REQUEST_SIZE 64 /* bytes */
 181
 182 struct virtual_engine {
 183         struct intel_engine_cs base;
 184         struct intel_context context;
 185
 186         /*
 187          * We allow only a single request through the virtual engine at a time
 188          * (each request in the timeline waits for the completion fence of
 189          * the previous before being submitted). By restricting ourselves to
 190          * only submitting a single request, each request is placed on to a
 191          * physical to maximise load spreading (by virtue of the late greedy
 192          * scheduling -- each real engine takes the next available request
 193          * upon idling).
 194          */
 195         struct i915_request *request;
 196
 197         /*
 198          * We keep a rbtree of available virtual engines inside each physical
 199          * engine, sorted by priority. Here we preallocate the nodes we need
 200          * for the virtual engine, indexed by physical_engine->id.
 201          */
 202         struct ve_node {
 203                 struct rb_node rb;
 204                 int prio;
 205         } nodes[I915_NUM_ENGINES];
 206
 207         /*
 208          * Keep track of bonded pairs -- restrictions upon on our selection
 209          * of physical engines any particular request may be submitted to.
 210          * If we receive a submit-fence from a master engine, we will only
 211          * use one of sibling_mask physical engines.
 212          */
 213         struct ve_bond {
 214                 const struct intel_engine_cs *master;
 215                 intel_engine_mask_t sibling_mask;
 216         } *bonds;
 217         unsigned int num_bonds;
 218
 219         /* And finally, which physical engines this virtual engine maps onto. */
 220         unsigned int num_siblings;
 221         struct intel_engine_cs *siblings[];
 222 };
 223
 224 static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine)
 225 {
 226         GEM_BUG_ON(!intel_engine_is_virtual(engine));
 227         return container_of(engine, struct virtual_engine, base);
 228 }
 229
 230 static int __execlists_context_alloc(struct intel_context *ce,
 231                                      struct intel_engine_cs *engine);
 232
 233 static void execlists_init_reg_state(u32 *reg_state,
 234                                      const struct intel_context *ce,
 235                                      const struct intel_engine_cs *engine,
 236                                      const struct intel_ring *ring,
 237                                      bool close);
 238 static void
 239 __execlists_update_reg_state(const struct intel_context *ce,
 240                              const struct intel_engine_cs *engine,
 241                              u32 head);
 242
 243 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
 244 {
 245         if (INTEL_GEN(engine->i915) >= 12)
 246                 return 0x60;
 247         else if (INTEL_GEN(engine->i915) >= 9)
 248                 return 0x54;
 249         else if (engine->class == RENDER_CLASS)
 250                 return 0x58;
 251         else
 252                 return -1;
 253 }
 254
 255 static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
 256 {
 257         if (INTEL_GEN(engine->i915) >= 12)
 258                 return 0x74;
 259         else if (INTEL_GEN(engine->i915) >= 9)
 260                 return 0x68;
 261         else if (engine->class == RENDER_CLASS)
 262                 return 0xd8;
 263         else
 264                 return -1;
 265 }
 266
 267 static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
 268 {
 269         if (INTEL_GEN(engine->i915) >= 12)
 270                 return 0x12;
 271         else if (INTEL_GEN(engine->i915) >= 9 || engine->class == RENDER_CLASS)
 272                 return 0x18;
 273         else
 274                 return -1;
 275 }
 276
 277 static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
 278 {
 279         int x;
 280
 281         x = lrc_ring_wa_bb_per_ctx(engine);
 282         if (x < 0)
 283                 return x;
 284
 285         return x + 2;
 286 }
 287
 288 static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
 289 {
 290         int x;
 291
 292         x = lrc_ring_indirect_ptr(engine);
 293         if (x < 0)
 294                 return x;
 295
 296         return x + 2;
 297 }
 298
 299 static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
 300 {
 301         if (engine->class != RENDER_CLASS)
 302                 return -1;
 303
 304         if (INTEL_GEN(engine->i915) >= 12)
 305                 return 0xb6;
 306         else if (INTEL_GEN(engine->i915) >= 11)
 307                 return 0xaa;
 308         else
 309                 return -1;
 310 }
 311
 312 static u32
 313 lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
 314 {
 315         switch (INTEL_GEN(engine->i915)) {
 316         default:
 317                 MISSING_CASE(INTEL_GEN(engine->i915));
 318                 fallthrough;
 319         case 12:
 320                 return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
 321         case 11:
 322                 return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
 323         case 10:
 324                 return GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
 325         case 9:
 326                 return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
 327         case 8:
 328                 return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
 329         }
 330 }
 331
 332 static void
 333 lrc_ring_setup_indirect_ctx(u32 *regs,
 334                             const struct intel_engine_cs *engine,
 335                             u32 ctx_bb_ggtt_addr,
 336                             u32 size)
 337 {
 338         GEM_BUG_ON(!size);
 339         GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
 340         GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
 341         regs[lrc_ring_indirect_ptr(engine) + 1] =
 342                 ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);
 343
 344         GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
 345         regs[lrc_ring_indirect_offset(engine) + 1] =
 346                 lrc_ring_indirect_offset_default(engine) << 6;
 347 }
 348
 349 static u32 intel_context_get_runtime(const struct intel_context *ce)
 350 {
 351         /*
 352          * We can use either ppHWSP[16] which is recorded before the context
 353          * switch (and so excludes the cost of context switches) or use the
 354          * value from the context image itself, which is saved/restored earlier
 355          * and so includes the cost of the save.
 356          */
 357         return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]);
 358 }
 359
 360 static void mark_eio(struct i915_request *rq)
 361 {
 362         if (i915_request_completed(rq))
 363                 return;
 364
 365         GEM_BUG_ON(i915_request_signaled(rq));
 366
 367         i915_request_set_error_once(rq, -EIO);
 368         i915_request_mark_complete(rq);
 369 }
 370
 371 static struct i915_request *
 372 active_request(const struct intel_timeline * const tl, struct i915_request *rq)
 373 {
 374         struct i915_request *active = rq;
 375
 376         rcu_read_lock();
 377         list_for_each_entry_continue_reverse(rq, &tl->requests, link) {
 378                 if (i915_request_completed(rq))
 379                         break;
 380
 381                 active = rq;
 382         }
 383         rcu_read_unlock();
 384
 385         return active;
 386 }
 387
 388 static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine)
 389 {
 390         return (i915_ggtt_offset(engine->status_page.vma) +
 391                 I915_GEM_HWS_PREEMPT_ADDR);
 392 }
 393
 394 static inline void
 395 ring_set_paused(const struct intel_engine_cs *engine, int state)
 396 {
 397         /*
 398          * We inspect HWS_PREEMPT with a semaphore inside
 399          * engine->emit_fini_breadcrumb. If the dword is true,
 400          * the ring is paused as the semaphore will busywait
 401          * until the dword is false.
 402          */
 403         engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state;
 404         if (state)
 405                 wmb();
 406 }
 407
 408 static inline struct i915_priolist *to_priolist(struct rb_node *rb)
 409 {
 410         return rb_entry(rb, struct i915_priolist, node);
 411 }
 412
 413 static inline int rq_prio(const struct i915_request *rq)
 414 {
 415         return READ_ONCE(rq->sched.attr.priority);
 416 }
 417
 418 static int effective_prio(const struct i915_request *rq)
 419 {
 420         int prio = rq_prio(rq);
 421
 422         /*
 423          * If this request is special and must not be interrupted at any
 424          * cost, so be it. Note we are only checking the most recent request
 425          * in the context and so may be masking an earlier vip request. It
 426          * is hoped that under the conditions where nopreempt is used, this
 427          * will not matter (i.e. all requests to that context will be
 428          * nopreempt for as long as desired).
 429          */
 430         if (i915_request_has_nopreempt(rq))
 431                 prio = I915_PRIORITY_UNPREEMPTABLE;
 432
 433         return prio;
 434 }
 435
 436 static int queue_prio(const struct intel_engine_execlists *execlists)
 437 {
 438         struct i915_priolist *p;
 439         struct rb_node *rb;
 440
 441         rb = rb_first_cached(&execlists->queue);
 442         if (!rb)
 443                 return INT_MIN;
 444
 445         /*
 446          * As the priolist[] are inverted, with the highest priority in [0],
 447          * we have to flip the index value to become priority.
 448          */
 449         p = to_priolist(rb);
 450         if (!I915_USER_PRIORITY_SHIFT)
 451                 return p->priority;
 452
 453         return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used);
 454 }
 455
 456 static inline bool need_preempt(const struct intel_engine_cs *engine,
 457                                 const struct i915_request *rq,
 458                                 struct rb_node *rb)
 459 {
 460         int last_prio;
 461
 462         if (!intel_engine_has_semaphores(engine))
 463                 return false;
 464
 465         /*
 466          * Check if the current priority hint merits a preemption attempt.
 467          *
 468          * We record the highest value priority we saw during rescheduling
 469          * prior to this dequeue, therefore we know that if it is strictly
 470          * less than the current tail of ESLP[0], we do not need to force
 471          * a preempt-to-idle cycle.
 472          *
 473          * However, the priority hint is a mere hint that we may need to
 474          * preempt. If that hint is stale or we may be trying to preempt
 475          * ourselves, ignore the request.
 476          *
 477          * More naturally we would write
 478          *      prio >= max(0, last);
 479          * except that we wish to prevent triggering preemption at the same
 480          * priority level: the task that is running should remain running
 481          * to preserve FIFO ordering of dependencies.
 482          */
 483         last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1);
 484         if (engine->execlists.queue_priority_hint <= last_prio)
 485                 return false;
 486
 487         /*
 488          * Check against the first request in ELSP[1], it will, thanks to the
 489          * power of PI, be the highest priority of that context.
 490          */
 491         if (!list_is_last(&rq->sched.link, &engine->active.requests) &&
 492             rq_prio(list_next_entry(rq, sched.link)) > last_prio)
 493                 return true;
 494
 495         if (rb) {
 496                 struct virtual_engine *ve =
 497                         rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
 498                 bool preempt = false;
 499
 500                 if (engine == ve->siblings[0]) { /* only preempt one sibling */
 501                         struct i915_request *next;
 502
 503                         rcu_read_lock();
 504                         next = READ_ONCE(ve->request);
 505                         if (next)
 506                                 preempt = rq_prio(next) > last_prio;
 507                         rcu_read_unlock();
 508                 }
 509
 510                 if (preempt)
 511                         return preempt;
 512         }
 513
 514         /*
 515          * If the inflight context did not trigger the preemption, then maybe
 516          * it was the set of queued requests? Pick the highest priority in
 517          * the queue (the first active priolist) and see if it deserves to be
 518          * running instead of ELSP[0].
 519          *
 520          * The highest priority request in the queue can not be either
 521          * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
 522          * context, it's priority would not exceed ELSP[0] aka last_prio.
 523          */
 524         return queue_prio(&engine->execlists) > last_prio;
 525 }
 526
 527 __maybe_unused static inline bool
 528 assert_priority_queue(const struct i915_request *prev,
 529                       const struct i915_request *next)
 530 {
 531         /*
 532          * Without preemption, the prev may refer to the still active element
 533          * which we refuse to let go.
 534          *
 535          * Even with preemption, there are times when we think it is better not
 536          * to preempt and leave an ostensibly lower priority request in flight.
 537          */
 538         if (i915_request_is_active(prev))
 539                 return true;
 540
 541         return rq_prio(prev) >= rq_prio(next);
 542 }
 543
 544 /*
 545  * The context descriptor encodes various attributes of a context,
 546  * including its GTT address and some flags. Because it's fairly
 547  * expensive to calculate, we'll just do it once and cache the result,
 548  * which remains valid until the context is unpinned.
 549  *
 550  * This is what a descriptor looks like, from LSB to MSB::
 551  *
 552  *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
 553  *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
 554  *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
 555  *      bits 53-54:    mbz, reserved for use by hardware
 556  *      bits 55-63:    group ID, currently unused and set to 0
 557  *
 558  * Starting from Gen11, the upper dword of the descriptor has a new format:
 559  *
 560  *      bits 32-36:    reserved
 561  *      bits 37-47:    SW context ID
 562  *      bits 48:53:    engine instance
 563  *      bit 54:        mbz, reserved for use by hardware
 564  *      bits 55-60:    SW counter
 565  *      bits 61-63:    engine class
 566  *
 567  * engine info, SW context ID and SW counter need to form a unique number
 568  * (Context ID) per lrc.
 569  */
 570 static u32
 571 lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine)
 572 {
 573         u32 desc;
 574
 575         desc = INTEL_LEGACY_32B_CONTEXT;
 576         if (i915_vm_is_4lvl(ce->vm))
 577                 desc = INTEL_LEGACY_64B_CONTEXT;
 578         desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
 579
 580         desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
 581         if (IS_GEN(engine->i915, 8))
 582                 desc |= GEN8_CTX_L3LLC_COHERENT;
 583
 584         return i915_ggtt_offset(ce->state) | desc;
 585 }
 586
 587 static inline unsigned int dword_in_page(void *addr)
 588 {
 589         return offset_in_page(addr) / sizeof(u32);
 590 }
 591
 592 static void set_offsets(u32 *regs,
 593                         const u8 *data,
 594                         const struct intel_engine_cs *engine,
 595                         bool clear)
 596 #define NOP(x) (BIT(7) | (x))
 597 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
 598 #define POSTED BIT(0)
 599 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
 600 #define REG16(x) \
 601         (((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
 602         (((x) >> 2) & 0x7f)
 603 #define END(total_state_size) 0, (total_state_size)
 604 {
 605         const u32 base = engine->mmio_base;
 606
 607         while (*data) {
 608                 u8 count, flags;
 609
 610                 if (*data & BIT(7)) { /* skip */
 611                         count = *data++ & ~BIT(7);
 612                         if (clear)
 613                                 memset32(regs, MI_NOOP, count);
 614                         regs += count;
 615                         continue;
 616                 }
 617
 618                 count = *data & 0x3f;
 619                 flags = *data >> 6;
 620                 data++;
 621
 622                 *regs = MI_LOAD_REGISTER_IMM(count);
 623                 if (flags & POSTED)
 624                         *regs |= MI_LRI_FORCE_POSTED;
 625                 if (INTEL_GEN(engine->i915) >= 11)
 626                         *regs |= MI_LRI_LRM_CS_MMIO;
 627                 regs++;
 628
 629                 GEM_BUG_ON(!count);
 630                 do {
 631                         u32 offset = 0;
 632                         u8 v;
 633
 634                         do {
 635                                 v = *data++;
 636                                 offset <<= 7;
 637                                 offset |= v & ~BIT(7);
 638                         } while (v & BIT(7));
 639
 640                         regs[0] = base + (offset << 2);
 641                         if (clear)
 642                                 regs[1] = 0;
 643                         regs += 2;
 644                 } while (--count);
 645         }
 646
 647         if (clear) {
 648                 u8 count = *++data;
 649
 650                 /* Clear past the tail for HW access */
 651                 GEM_BUG_ON(dword_in_page(regs) > count);
 652                 memset32(regs, MI_NOOP, count - dword_in_page(regs));
 653
 654                 /* Close the batch; used mainly by live_lrc_layout() */
 655                 *regs = MI_BATCH_BUFFER_END;
 656                 if (INTEL_GEN(engine->i915) >= 10)
 657                         *regs |= BIT(0);
 658         }
 659 }
 660
 661 static const u8 gen8_xcs_offsets[] = {
 662         NOP(1),
 663         LRI(11, 0),
 664         REG16(0x244),
 665         REG(0x034),
 666         REG(0x030),
 667         REG(0x038),
 668         REG(0x03c),
 669         REG(0x168),
 670         REG(0x140),
 671         REG(0x110),
 672         REG(0x11c),
 673         REG(0x114),
 674         REG(0x118),
 675
 676         NOP(9),
 677         LRI(9, 0),
 678         REG16(0x3a8),
 679         REG16(0x28c),
 680         REG16(0x288),
 681         REG16(0x284),
 682         REG16(0x280),
 683         REG16(0x27c),
 684         REG16(0x278),
 685         REG16(0x274),
 686         REG16(0x270),
 687
 688         NOP(13),
 689         LRI(2, 0),
 690         REG16(0x200),
 691         REG(0x028),
 692
 693         END(80)
 694 };
 695
 696 static const u8 gen9_xcs_offsets[] = {
 697         NOP(1),
 698         LRI(14, POSTED),
 699         REG16(0x244),
 700         REG(0x034),
 701         REG(0x030),
 702         REG(0x038),
 703         REG(0x03c),
 704         REG(0x168),
 705         REG(0x140),
 706         REG(0x110),
 707         REG(0x11c),
 708         REG(0x114),
 709         REG(0x118),
 710         REG(0x1c0),
 711         REG(0x1c4),
 712         REG(0x1c8),
 713
 714         NOP(3),
 715         LRI(9, POSTED),
 716         REG16(0x3a8),
 717         REG16(0x28c),
 718         REG16(0x288),
 719         REG16(0x284),
 720         REG16(0x280),
 721         REG16(0x27c),
 722         REG16(0x278),
 723         REG16(0x274),
 724         REG16(0x270),
 725
 726         NOP(13),
 727         LRI(1, POSTED),
 728         REG16(0x200),
 729
 730         NOP(13),
 731         LRI(44, POSTED),
 732         REG(0x028),
 733         REG(0x09c),
 734         REG(0x0c0),
 735         REG(0x178),
 736         REG(0x17c),
 737         REG16(0x358),
 738         REG(0x170),
 739         REG(0x150),
 740         REG(0x154),
 741         REG(0x158),
 742         REG16(0x41c),
 743         REG16(0x600),
 744         REG16(0x604),
 745         REG16(0x608),
 746         REG16(0x60c),
 747         REG16(0x610),
 748         REG16(0x614),
 749         REG16(0x618),
 750         REG16(0x61c),
 751         REG16(0x620),
 752         REG16(0x624),
 753         REG16(0x628),
 754         REG16(0x62c),
 755         REG16(0x630),
 756         REG16(0x634),
 757         REG16(0x638),
 758         REG16(0x63c),
 759         REG16(0x640),
 760         REG16(0x644),
 761         REG16(0x648),
 762         REG16(0x64c),
 763         REG16(0x650),
 764         REG16(0x654),
 765         REG16(0x658),
 766         REG16(0x65c),
 767         REG16(0x660),
 768         REG16(0x664),
 769         REG16(0x668),
 770         REG16(0x66c),
 771         REG16(0x670),
 772         REG16(0x674),
 773         REG16(0x678),
 774         REG16(0x67c),
 775         REG(0x068),
 776
 777         END(176)
 778 };
 779
 780 static const u8 gen12_xcs_offsets[] = {
 781         NOP(1),
 782         LRI(13, POSTED),
 783         REG16(0x244),
 784         REG(0x034),
 785         REG(0x030),
 786         REG(0x038),
 787         REG(0x03c),
 788         REG(0x168),
 789         REG(0x140),
 790         REG(0x110),
 791         REG(0x1c0),
 792         REG(0x1c4),
 793         REG(0x1c8),
 794         REG(0x180),
 795         REG16(0x2b4),
 796
 797         NOP(5),
 798         LRI(9, POSTED),
 799         REG16(0x3a8),
 800         REG16(0x28c),
 801         REG16(0x288),
 802         REG16(0x284),
 803         REG16(0x280),
 804         REG16(0x27c),
 805         REG16(0x278),
 806         REG16(0x274),
 807         REG16(0x270),
 808
 809         END(80)
 810 };
 811
 812 static const u8 gen8_rcs_offsets[] = {
 813         NOP(1),
 814         LRI(14, POSTED),
 815         REG16(0x244),
 816         REG(0x034),
 817         REG(0x030),
 818         REG(0x038),
 819         REG(0x03c),
 820         REG(0x168),
 821         REG(0x140),
 822         REG(0x110),
 823         REG(0x11c),
 824         REG(0x114),
 825         REG(0x118),
 826         REG(0x1c0),
 827         REG(0x1c4),
 828         REG(0x1c8),
 829
 830         NOP(3),
 831         LRI(9, POSTED),
 832         REG16(0x3a8),
 833         REG16(0x28c),
 834         REG16(0x288),
 835         REG16(0x284),
 836         REG16(0x280),
 837         REG16(0x27c),
 838         REG16(0x278),
 839         REG16(0x274),
 840         REG16(0x270),
 841
 842         NOP(13),
 843         LRI(1, 0),
 844         REG(0x0c8),
 845
 846         END(80)
 847 };
 848
 849 static const u8 gen9_rcs_offsets[] = {
 850         NOP(1),
 851         LRI(14, POSTED),
 852         REG16(0x244),
 853         REG(0x34),
 854         REG(0x30),
 855         REG(0x38),
 856         REG(0x3c),
 857         REG(0x168),
 858         REG(0x140),
 859         REG(0x110),
 860         REG(0x11c),
 861         REG(0x114),
 862         REG(0x118),
 863         REG(0x1c0),
 864         REG(0x1c4),
 865         REG(0x1c8),
 866
 867         NOP(3),
 868         LRI(9, POSTED),
 869         REG16(0x3a8),
 870         REG16(0x28c),
 871         REG16(0x288),
 872         REG16(0x284),
 873         REG16(0x280),
 874         REG16(0x27c),
 875         REG16(0x278),
 876         REG16(0x274),
 877         REG16(0x270),
 878
 879         NOP(13),
 880         LRI(1, 0),
 881         REG(0xc8),
 882
 883         NOP(13),
 884         LRI(44, POSTED),
 885         REG(0x28),
 886         REG(0x9c),
 887         REG(0xc0),
 888         REG(0x178),
 889         REG(0x17c),
 890         REG16(0x358),
 891         REG(0x170),
 892         REG(0x150),
 893         REG(0x154),
 894         REG(0x158),
 895         REG16(0x41c),
 896         REG16(0x600),
 897         REG16(0x604),
 898         REG16(0x608),
 899         REG16(0x60c),
 900         REG16(0x610),
 901         REG16(0x614),
 902         REG16(0x618),
 903         REG16(0x61c),
 904         REG16(0x620),
 905         REG16(0x624),
 906         REG16(0x628),
 907         REG16(0x62c),
 908         REG16(0x630),
 909         REG16(0x634),
 910         REG16(0x638),
 911         REG16(0x63c),
 912         REG16(0x640),
 913         REG16(0x644),
 914         REG16(0x648),
 915         REG16(0x64c),
 916         REG16(0x650),
 917         REG16(0x654),
 918         REG16(0x658),
 919         REG16(0x65c),
 920         REG16(0x660),
 921         REG16(0x664),
 922         REG16(0x668),
 923         REG16(0x66c),
 924         REG16(0x670),
 925         REG16(0x674),
 926         REG16(0x678),
 927         REG16(0x67c),
 928         REG(0x68),
 929
 930         END(176)
 931 };
 932
 933 static const u8 gen11_rcs_offsets[] = {
 934         NOP(1),
 935         LRI(15, POSTED),
 936         REG16(0x244),
 937         REG(0x034),
 938         REG(0x030),
 939         REG(0x038),
 940         REG(0x03c),
 941         REG(0x168),
 942         REG(0x140),
 943         REG(0x110),
 944         REG(0x11c),
 945         REG(0x114),
 946         REG(0x118),
 947         REG(0x1c0),
 948         REG(0x1c4),
 949         REG(0x1c8),
 950         REG(0x180),
 951
 952         NOP(1),
 953         LRI(9, POSTED),
 954         REG16(0x3a8),
 955         REG16(0x28c),
 956         REG16(0x288),
 957         REG16(0x284),
 958         REG16(0x280),
 959         REG16(0x27c),
 960         REG16(0x278),
 961         REG16(0x274),
 962         REG16(0x270),
 963
 964         LRI(1, POSTED),
 965         REG(0x1b0),
 966
 967         NOP(10),
 968         LRI(1, 0),
 969         REG(0x0c8),
 970
 971         END(80)
 972 };
 973
 974 static const u8 gen12_rcs_offsets[] = {
 975         NOP(1),
 976         LRI(13, POSTED),
 977         REG16(0x244),
 978         REG(0x034),
 979         REG(0x030),
 980         REG(0x038),
 981         REG(0x03c),
 982         REG(0x168),
 983         REG(0x140),
 984         REG(0x110),
 985         REG(0x1c0),
 986         REG(0x1c4),
 987         REG(0x1c8),
 988         REG(0x180),
 989         REG16(0x2b4),
 990
 991         NOP(5),
 992         LRI(9, POSTED),
 993         REG16(0x3a8),
 994         REG16(0x28c),
 995         REG16(0x288),
 996         REG16(0x284),
 997         REG16(0x280),
 998         REG16(0x27c),
 999         REG16(0x278),
1000         REG16(0x274),
1001         REG16(0x270),
1002
1003         LRI(3, POSTED),
1004         REG(0x1b0),
1005         REG16(0x5a8),
1006         REG16(0x5ac),
1007
1008         NOP(6),
1009         LRI(1, 0),
1010         REG(0x0c8),
1011         NOP(3 + 9 + 1),
1012
1013         LRI(51, POSTED),
1014         REG16(0x588),
1015         REG16(0x588),
1016         REG16(0x588),
1017         REG16(0x588),
1018         REG16(0x588),
1019         REG16(0x588),
1020         REG(0x028),
1021         REG(0x09c),
1022         REG(0x0c0),
1023         REG(0x178),
1024         REG(0x17c),
1025         REG16(0x358),
1026         REG(0x170),
1027         REG(0x150),
1028         REG(0x154),
1029         REG(0x158),
1030         REG16(0x41c),
1031         REG16(0x600),
1032         REG16(0x604),
1033         REG16(0x608),
1034         REG16(0x60c),
1035         REG16(0x610),
1036         REG16(0x614),
1037         REG16(0x618),
1038         REG16(0x61c),
1039         REG16(0x620),
1040         REG16(0x624),
1041         REG16(0x628),
1042         REG16(0x62c),
1043         REG16(0x630),
1044         REG16(0x634),
1045         REG16(0x638),
1046         REG16(0x63c),
1047         REG16(0x640),
1048         REG16(0x644),
1049         REG16(0x648),
1050         REG16(0x64c),
1051         REG16(0x650),
1052         REG16(0x654),
1053         REG16(0x658),
1054         REG16(0x65c),
1055         REG16(0x660),
1056         REG16(0x664),
1057         REG16(0x668),
1058         REG16(0x66c),
1059         REG16(0x670),
1060         REG16(0x674),
1061         REG16(0x678),
1062         REG16(0x67c),
1063         REG(0x068),
1064         REG(0x084),
1065         NOP(1),
1066
1067         END(192)
1068 };
1069
1070 #undef END
1071 #undef REG16
1072 #undef REG
1073 #undef LRI
1074 #undef NOP
1075
1076 static const u8 *reg_offsets(const struct intel_engine_cs *engine)
1077 {
1078         /*
1079          * The gen12+ lists only have the registers we program in the basic
1080          * default state. We rely on the context image using relative
1081          * addressing to automatic fixup the register state between the
1082          * physical engines for virtual engine.
1083          */
1084         GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 &&
1085                    !intel_engine_has_relative_mmio(engine));
1086
1087         if (engine->class == RENDER_CLASS) {
1088                 if (INTEL_GEN(engine->i915) >= 12)
1089                         return gen12_rcs_offsets;
1090                 else if (INTEL_GEN(engine->i915) >= 11)
1091                         return gen11_rcs_offsets;
1092                 else if (INTEL_GEN(engine->i915) >= 9)
1093                         return gen9_rcs_offsets;
1094                 else
1095                         return gen8_rcs_offsets;
1096         } else {
1097                 if (INTEL_GEN(engine->i915) >= 12)
1098                         return gen12_xcs_offsets;
1099                 else if (INTEL_GEN(engine->i915) >= 9)
1100                         return gen9_xcs_offsets;
1101                 else
1102                         return gen8_xcs_offsets;
1103         }
1104 }
1105
1106 static struct i915_request *
1107 __unwind_incomplete_requests(struct intel_engine_cs *engine)
1108 {
1109         struct i915_request *rq, *rn, *active = NULL;
1110         struct list_head *pl;
1111         int prio = I915_PRIORITY_INVALID;
1112
1113         lockdep_assert_held(&engine->active.lock);
1114
1115         list_for_each_entry_safe_reverse(rq, rn,
1116                                          &engine->active.requests,
1117                                          sched.link) {
1118                 if (i915_request_completed(rq))
1119                         continue; /* XXX */
1120
1121                 __i915_request_unsubmit(rq);
1122
1123                 /*
1124                  * Push the request back into the queue for later resubmission.
1125                  * If this request is not native to this physical engine (i.e.
1126                  * it came from a virtual source), push it back onto the virtual
1127                  * engine so that it can be moved across onto another physical
1128                  * engine as load dictates.
1129                  */
1130                 if (likely(rq->execution_mask == engine->mask)) {
1131                         GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
1132                         if (rq_prio(rq) != prio) {
1133                                 prio = rq_prio(rq);
1134                                 pl = i915_sched_lookup_priolist(engine, prio);
1135                         }
1136                         GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
1137
1138                         list_move(&rq->sched.link, pl);
1139                         set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
1140
1141                         /* Check in case we rollback so far we wrap [size/2] */
1142                         if (intel_ring_direction(rq->ring,
1143                                                  rq->tail,
1144                                                  rq->ring->tail + 8) > 0)
1145                                 rq->context->lrc.desc |= CTX_DESC_FORCE_RESTORE;
1146
1147                         active = rq;
1148                 } else {
1149                         struct intel_engine_cs *owner = rq->context->engine;
1150
1151                         WRITE_ONCE(rq->engine, owner);
1152                         owner->submit_request(rq);
1153                         active = NULL;
1154                 }
1155         }
1156
1157         return active;
1158 }
1159
1160 struct i915_request *
1161 execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
1162 {
1163         struct intel_engine_cs *engine =
1164                 container_of(execlists, typeof(*engine), execlists);
1165
1166         return __unwind_incomplete_requests(engine);
1167 }
1168
1169 static inline void
1170 execlists_context_status_change(struct i915_request *rq, unsigned long status)
1171 {
1172         /*
1173          * Only used when GVT-g is enabled now. When GVT-g is disabled,
1174          * The compiler should eliminate this function as dead-code.
1175          */
1176         if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
1177                 return;
1178
1179         atomic_notifier_call_chain(&rq->engine->context_status_notifier,
1180                                    status, rq);
1181 }
1182
1183 static void intel_engine_context_in(struct intel_engine_cs *engine)
1184 {
1185         unsigned long flags;
1186
1187         if (atomic_add_unless(&engine->stats.active, 1, 0))
1188                 return;
1189
1190         write_seqlock_irqsave(&engine->stats.lock, flags);
1191         if (!atomic_add_unless(&engine->stats.active, 1, 0)) {
1192                 engine->stats.start = ktime_get();
1193                 atomic_inc(&engine->stats.active);
1194         }
1195         write_sequnlock_irqrestore(&engine->stats.lock, flags);
1196 }
1197
1198 static void intel_engine_context_out(struct intel_engine_cs *engine)
1199 {
1200         unsigned long flags;
1201
1202         GEM_BUG_ON(!atomic_read(&engine->stats.active));
1203
1204         if (atomic_add_unless(&engine->stats.active, -1, 1))
1205                 return;
1206
1207         write_seqlock_irqsave(&engine->stats.lock, flags);
1208         if (atomic_dec_and_test(&engine->stats.active)) {
1209                 engine->stats.total =
1210                         ktime_add(engine->stats.total,
1211                                   ktime_sub(ktime_get(), engine->stats.start));
1212         }
1213         write_sequnlock_irqrestore(&engine->stats.lock, flags);
1214 }
1215
1216 static void
1217 execlists_check_context(const struct intel_context *ce,
1218                         const struct intel_engine_cs *engine)
1219 {
1220         const struct intel_ring *ring = ce->ring;
1221         u32 *regs = ce->lrc_reg_state;
1222         bool valid = true;
1223         int x;
1224
1225         if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1226                 pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1227                        engine->name,
1228                        regs[CTX_RING_START],
1229                        i915_ggtt_offset(ring->vma));
1230                 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1231                 valid = false;
1232         }
1233
1234         if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1235             (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1236                 pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1237                        engine->name,
1238                        regs[CTX_RING_CTL],
1239                        (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1240                 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1241                 valid = false;
1242         }
1243
1244         x = lrc_ring_mi_mode(engine);
1245         if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1246                 pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1247                        engine->name, regs[x + 1]);
1248                 regs[x + 1] &= ~STOP_RING;
1249                 regs[x + 1] |= STOP_RING << 16;
1250                 valid = false;
1251         }
1252
1253         WARN_ONCE(!valid, "Invalid lrc state found before submission\n");
1254 }
1255
1256 static void restore_default_state(struct intel_context *ce,
1257                                   struct intel_engine_cs *engine)
1258 {
1259         u32 *regs;
1260
1261         regs = memset(ce->lrc_reg_state, 0, engine->context_size - PAGE_SIZE);
1262         execlists_init_reg_state(regs, ce, engine, ce->ring, true);
1263
1264         ce->runtime.last = intel_context_get_runtime(ce);
1265 }
1266
1267 static void reset_active(struct i915_request *rq,
1268                          struct intel_engine_cs *engine)
1269 {
1270         struct intel_context * const ce = rq->context;
1271         u32 head;
1272
1273         /*
1274          * The executing context has been cancelled. We want to prevent
1275          * further execution along this context and propagate the error on
1276          * to anything depending on its results.
1277          *
1278          * In __i915_request_submit(), we apply the -EIO and remove the
1279          * requests' payloads for any banned requests. But first, we must
1280          * rewind the context back to the start of the incomplete request so
1281          * that we do not jump back into the middle of the batch.
1282          *
1283          * We preserve the breadcrumbs and semaphores of the incomplete
1284          * requests so that inter-timeline dependencies (i.e other timelines)
1285          * remain correctly ordered. And we defer to __i915_request_submit()
1286          * so that all asynchronous waits are correctly handled.
1287          */
1288         ENGINE_TRACE(engine, "{ rq=%llx:%lld }\n",
1289                      rq->fence.context, rq->fence.seqno);
1290
1291         /* On resubmission of the active request, payload will be scrubbed */
1292         if (i915_request_completed(rq))
1293                 head = rq->tail;
1294         else
1295                 head = active_request(ce->timeline, rq)->head;
1296         head = intel_ring_wrap(ce->ring, head);
1297
1298         /* Scrub the context image to prevent replaying the previous batch */
1299         restore_default_state(ce, engine);
1300         __execlists_update_reg_state(ce, engine, head);
1301
1302         /* We've switched away, so this should be a no-op, but intent matters */
1303         ce->lrc.desc |= CTX_DESC_FORCE_RESTORE;
1304 }
1305
1306 static void st_update_runtime_underflow(struct intel_context *ce, s32 dt)
1307 {
1308 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1309         ce->runtime.num_underflow += dt < 0;
1310         ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt);
1311 #endif
1312 }
1313
1314 static void intel_context_update_runtime(struct intel_context *ce)
1315 {
1316         u32 old;
1317         s32 dt;
1318
1319         if (intel_context_is_barrier(ce))
1320                 return;
1321
1322         old = ce->runtime.last;
1323         ce->runtime.last = intel_context_get_runtime(ce);
1324         dt = ce->runtime.last - old;
1325
1326         if (unlikely(dt <= 0)) {
1327                 CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
1328                          old, ce->runtime.last, dt);
1329                 st_update_runtime_underflow(ce, dt);
1330                 return;
1331         }
1332
1333         ewma_runtime_add(&ce->runtime.avg, dt);
1334         ce->runtime.total += dt;
1335 }
1336
1337 static inline struct intel_engine_cs *
1338 __execlists_schedule_in(struct i915_request *rq)
1339 {
1340         struct intel_engine_cs * const engine = rq->engine;
1341         struct intel_context * const ce = rq->context;
1342
1343         intel_context_get(ce);
1344
1345         if (unlikely(intel_context_is_banned(ce)))
1346                 reset_active(rq, engine);
1347
1348         if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1349                 execlists_check_context(ce, engine);
1350
1351         if (ce->tag) {
1352                 /* Use a fixed tag for OA and friends */
1353                 GEM_BUG_ON(ce->tag <= BITS_PER_LONG);
1354                 ce->lrc.ccid = ce->tag;
1355         } else {
1356                 /* We don't need a strict matching tag, just different values */
1357                 unsigned int tag = ffs(READ_ONCE(engine->context_tag));
1358
1359                 GEM_BUG_ON(tag == 0 || tag >= BITS_PER_LONG);
1360                 clear_bit(tag - 1, &engine->context_tag);
1361                 ce->lrc.ccid = tag << (GEN11_SW_CTX_ID_SHIFT - 32);
1362
1363                 BUILD_BUG_ON(BITS_PER_LONG > GEN12_MAX_CONTEXT_HW_ID);
1364         }
1365
1366         ce->lrc.ccid |= engine->execlists.ccid;
1367
1368         __intel_gt_pm_get(engine->gt);
1369         if (engine->fw_domain && !atomic_fetch_inc(&engine->fw_active))
1370                 intel_uncore_forcewake_get(engine->uncore, engine->fw_domain);
1371         execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
1372         intel_engine_context_in(engine);
1373
1374         return engine;
1375 }
1376
1377 static inline struct i915_request *
1378 execlists_schedule_in(struct i915_request *rq, int idx)
1379 {
1380         struct intel_context * const ce = rq->context;
1381         struct intel_engine_cs *old;
1382
1383         GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine));
1384         trace_i915_request_in(rq, idx);
1385
1386         old = READ_ONCE(ce->inflight);
1387         do {
1388                 if (!old) {
1389                         WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq));
1390                         break;
1391                 }
1392         } while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old)));
1393
1394         GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
1395         return i915_request_get(rq);
1396 }
1397
1398 static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
1399 {
1400         struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
1401         struct i915_request *next = READ_ONCE(ve->request);
1402
1403         if (next == rq || (next && next->execution_mask & ~rq->execution_mask))
1404                 tasklet_hi_schedule(&ve->base.execlists.tasklet);
1405 }
1406
1407 static inline void
1408 __execlists_schedule_out(struct i915_request *rq,
1409                          struct intel_engine_cs * const engine,
1410                          unsigned int ccid)
1411 {
1412         struct intel_context * const ce = rq->context;
1413
1414         /*
1415          * NB process_csb() is not under the engine->active.lock and hence
1416          * schedule_out can race with schedule_in meaning that we should
1417          * refrain from doing non-trivial work here.
1418          */
1419
1420         /*
1421          * If we have just completed this context, the engine may now be
1422          * idle and we want to re-enter powersaving.
1423          */
1424         if (list_is_last_rcu(&rq->link, &ce->timeline->requests) &&
1425             i915_request_completed(rq))
1426                 intel_engine_add_retire(engine, ce->timeline);
1427
1428         ccid >>= GEN11_SW_CTX_ID_SHIFT - 32;
1429         ccid &= GEN12_MAX_CONTEXT_HW_ID;
1430         if (ccid < BITS_PER_LONG) {
1431                 GEM_BUG_ON(ccid == 0);
1432                 GEM_BUG_ON(test_bit(ccid - 1, &engine->context_tag));
1433                 set_bit(ccid - 1, &engine->context_tag);
1434         }
1435
1436         intel_context_update_runtime(ce);
1437         intel_engine_context_out(engine);
1438         execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
1439         if (engine->fw_domain && !atomic_dec_return(&engine->fw_active))
1440                 intel_uncore_forcewake_put(engine->uncore, engine->fw_domain);
1441         intel_gt_pm_put_async(engine->gt);
1442
1443         /*
1444          * If this is part of a virtual engine, its next request may
1445          * have been blocked waiting for access to the active context.
1446          * We have to kick all the siblings again in case we need to
1447          * switch (e.g. the next request is not runnable on this
1448          * engine). Hopefully, we will already have submitted the next
1449          * request before the tasklet runs and do not need to rebuild
1450          * each virtual tree and kick everyone again.
1451          */
1452         if (ce->engine != engine)
1453                 kick_siblings(rq, ce);
1454
1455         intel_context_put(ce);
1456 }
1457
1458 static inline void
1459 execlists_schedule_out(struct i915_request *rq)
1460 {
1461         struct intel_context * const ce = rq->context;
1462         struct intel_engine_cs *cur, *old;
1463         u32 ccid;
1464
1465         trace_i915_request_out(rq);
1466
1467         ccid = rq->context->lrc.ccid;
1468         old = READ_ONCE(ce->inflight);
1469         do
1470                 cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL;
1471         while (!try_cmpxchg(&ce->inflight, &old, cur));
1472         if (!cur)
1473                 __execlists_schedule_out(rq, old, ccid);
1474
1475         i915_request_put(rq);
1476 }
1477
1478 static u64 execlists_update_context(struct i915_request *rq)
1479 {
1480         struct intel_context *ce = rq->context;
1481         u64 desc = ce->lrc.desc;
1482         u32 tail, prev;
1483
1484         /*
1485          * WaIdleLiteRestore:bdw,skl
1486          *
1487          * We should never submit the context with the same RING_TAIL twice
1488          * just in case we submit an empty ring, which confuses the HW.
1489          *
1490          * We append a couple of NOOPs (gen8_emit_wa_tail) after the end of
1491          * the normal request to be able to always advance the RING_TAIL on
1492          * subsequent resubmissions (for lite restore). Should that fail us,
1493          * and we try and submit the same tail again, force the context
1494          * reload.
1495          *
1496          * If we need to return to a preempted context, we need to skip the
1497          * lite-restore and force it to reload the RING_TAIL. Otherwise, the
1498          * HW has a tendency to ignore us rewinding the TAIL to the end of
1499          * an earlier request.
1500          */
1501         GEM_BUG_ON(ce->lrc_reg_state[CTX_RING_TAIL] != rq->ring->tail);
1502         prev = rq->ring->tail;
1503         tail = intel_ring_set_tail(rq->ring, rq->tail);
1504         if (unlikely(intel_ring_direction(rq->ring, tail, prev) <= 0))
1505                 desc |= CTX_DESC_FORCE_RESTORE;
1506         ce->lrc_reg_state[CTX_RING_TAIL] = tail;
1507         rq->tail = rq->wa_tail;
1508
1509         /*
1510          * Make sure the context image is complete before we submit it to HW.
1511          *
1512          * Ostensibly, writes (including the WCB) should be flushed prior to
1513          * an uncached write such as our mmio register access, the empirical
1514          * evidence (esp. on Braswell) suggests that the WC write into memory
1515          * may not be visible to the HW prior to the completion of the UC
1516          * register write and that we may begin execution from the context
1517          * before its image is complete leading to invalid PD chasing.
1518          */
1519         wmb();
1520
1521         ce->lrc.desc &= ~CTX_DESC_FORCE_RESTORE;
1522         return desc;
1523 }
1524
1525 static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
1526 {
1527         if (execlists->ctrl_reg) {
1528                 writel(lower_32_bits(desc), execlists->submit_reg + port * 2);
1529                 writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1);
1530         } else {
1531                 writel(upper_32_bits(desc), execlists->submit_reg);
1532                 writel(lower_32_bits(desc), execlists->submit_reg);
1533         }
1534 }
1535
1536 static __maybe_unused char *
1537 dump_port(char *buf, int buflen, const char *prefix, struct i915_request *rq)
1538 {
1539         if (!rq)
1540                 return "";
1541
1542         snprintf(buf, buflen, "%sccid:%x %llx:%lld%s prio %d",
1543                  prefix,
1544                  rq->context->lrc.ccid,
1545                  rq->fence.context, rq->fence.seqno,
1546                  i915_request_completed(rq) ? "!" :
1547                  i915_request_started(rq) ? "*" :
1548                  "",
1549                  rq_prio(rq));
1550
1551         return buf;
1552 }
1553
1554 static __maybe_unused void
1555 trace_ports(const struct intel_engine_execlists *execlists,
1556             const char *msg,
1557             struct i915_request * const *ports)
1558 {
1559         const struct intel_engine_cs *engine =
1560                 container_of(execlists, typeof(*engine), execlists);
1561         char __maybe_unused p0[40], p1[40];
1562
1563         if (!ports[0])
1564                 return;
1565
1566         ENGINE_TRACE(engine, "%s { %s%s }\n", msg,
1567                      dump_port(p0, sizeof(p0), "", ports[0]),
1568                      dump_port(p1, sizeof(p1), ", ", ports[1]));
1569 }
1570
1571 static inline bool
1572 reset_in_progress(const struct intel_engine_execlists *execlists)
1573 {
1574         return unlikely(!__tasklet_is_enabled(&execlists->tasklet));
1575 }
1576
1577 static __maybe_unused bool
1578 assert_pending_valid(const struct intel_engine_execlists *execlists,
1579                      const char *msg)
1580 {
1581         struct intel_engine_cs *engine =
1582                 container_of(execlists, typeof(*engine), execlists);
1583         struct i915_request * const *port, *rq;
1584         struct intel_context *ce = NULL;
1585         bool sentinel = false;
1586         u32 ccid = -1;
1587
1588         trace_ports(execlists, msg, execlists->pending);
1589
1590         /* We may be messing around with the lists during reset, lalala */
1591         if (reset_in_progress(execlists))
1592                 return true;
1593
1594         if (!execlists->pending[0]) {
1595                 GEM_TRACE_ERR("%s: Nothing pending for promotion!\n",
1596                               engine->name);
1597                 return false;
1598         }
1599
1600         if (execlists->pending[execlists_num_ports(execlists)]) {
1601                 GEM_TRACE_ERR("%s: Excess pending[%d] for promotion!\n",
1602                               engine->name, execlists_num_ports(execlists));
1603                 return false;
1604         }
1605
1606         for (port = execlists->pending; (rq = *port); port++) {
1607                 unsigned long flags;
1608                 bool ok = true;
1609
1610                 GEM_BUG_ON(!kref_read(&rq->fence.refcount));
1611                 GEM_BUG_ON(!i915_request_is_active(rq));
1612
1613                 if (ce == rq->context) {
1614                         GEM_TRACE_ERR("%s: Dup context:%llx in pending[%zd]\n",
1615                                       engine->name,
1616                                       ce->timeline->fence_context,
1617                                       port - execlists->pending);
1618                         return false;
1619                 }
1620                 ce = rq->context;
1621
1622                 if (ccid == ce->lrc.ccid) {
1623                         GEM_TRACE_ERR("%s: Dup ccid:%x context:%llx in pending[%zd]\n",
1624                                       engine->name,
1625                                       ccid, ce->timeline->fence_context,
1626                                       port - execlists->pending);
1627                         return false;
1628                 }
1629                 ccid = ce->lrc.ccid;
1630
1631                 /*
1632                  * Sentinels are supposed to be the last request so they flush
1633                  * the current execution off the HW. Check that they are the only
1634                  * request in the pending submission.
1635                  */
1636                 if (sentinel) {
1637                         GEM_TRACE_ERR("%s: context:%llx after sentinel in pending[%zd]\n",
1638                                       engine->name,
1639                                       ce->timeline->fence_context,
1640                                       port - execlists->pending);
1641                         return false;
1642                 }
1643                 sentinel = i915_request_has_sentinel(rq);
1644
1645                 /* Hold tightly onto the lock to prevent concurrent retires! */
1646                 if (!spin_trylock_irqsave(&rq->lock, flags))
1647                         continue;
1648
1649                 if (i915_request_completed(rq))
1650                         goto unlock;
1651
1652                 if (i915_active_is_idle(&ce->active) &&
1653                     !intel_context_is_barrier(ce)) {
1654                         GEM_TRACE_ERR("%s: Inactive context:%llx in pending[%zd]\n",
1655                                       engine->name,
1656                                       ce->timeline->fence_context,
1657                                       port - execlists->pending);
1658                         ok = false;
1659                         goto unlock;
1660                 }
1661
1662                 if (!i915_vma_is_pinned(ce->state)) {
1663                         GEM_TRACE_ERR("%s: Unpinned context:%llx in pending[%zd]\n",
1664                                       engine->name,
1665                                       ce->timeline->fence_context,
1666                                       port - execlists->pending);
1667                         ok = false;
1668                         goto unlock;
1669                 }
1670
1671                 if (!i915_vma_is_pinned(ce->ring->vma)) {
1672                         GEM_TRACE_ERR("%s: Unpinned ring:%llx in pending[%zd]\n",
1673                                       engine->name,
1674                                       ce->timeline->fence_context,
1675                                       port - execlists->pending);
1676                         ok = false;
1677                         goto unlock;
1678                 }
1679
1680 unlock:
1681                 spin_unlock_irqrestore(&rq->lock, flags);
1682                 if (!ok)
1683                         return false;
1684         }
1685
1686         return ce;
1687 }
1688
1689 static void execlists_submit_ports(struct intel_engine_cs *engine)
1690 {
1691         struct intel_engine_execlists *execlists = &engine->execlists;
1692         unsigned int n;
1693
1694         GEM_BUG_ON(!assert_pending_valid(execlists, "submit"));
1695
1696         /*
1697          * We can skip acquiring intel_runtime_pm_get() here as it was taken
1698          * on our behalf by the request (see i915_gem_mark_busy()) and it will
1699          * not be relinquished until the device is idle (see
1700          * i915_gem_idle_work_handler()). As a precaution, we make sure
1701          * that all ELSP are drained i.e. we have processed the CSB,
1702          * before allowing ourselves to idle and calling intel_runtime_pm_put().
1703          */
1704         GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
1705
1706         /*
1707          * ELSQ note: the submit queue is not cleared after being submitted
1708          * to the HW so we need to make sure we always clean it up. This is
1709          * currently ensured by the fact that we always write the same number
1710          * of elsq entries, keep this in mind before changing the loop below.
1711          */
1712         for (n = execlists_num_ports(execlists); n--; ) {
1713                 struct i915_request *rq = execlists->pending[n];
1714
1715                 write_desc(execlists,
1716                            rq ? execlists_update_context(rq) : 0,
1717                            n);
1718         }
1719
1720         /* we need to manually load the submit queue */
1721         if (execlists->ctrl_reg)
1722                 writel(EL_CTRL_LOAD, execlists->ctrl_reg);
1723 }
1724
1725 static bool ctx_single_port_submission(const struct intel_context *ce)
1726 {
1727         return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
1728                 intel_context_force_single_submission(ce));
1729 }
1730
1731 static bool can_merge_ctx(const struct intel_context *prev,
1732                           const struct intel_context *next)
1733 {
1734         if (prev != next)
1735                 return false;
1736
1737         if (ctx_single_port_submission(prev))
1738                 return false;
1739
1740         return true;
1741 }
1742
1743 static unsigned long i915_request_flags(const struct i915_request *rq)
1744 {
1745         return READ_ONCE(rq->fence.flags);
1746 }
1747
1748 static bool can_merge_rq(const struct i915_request *prev,
1749                          const struct i915_request *next)
1750 {
1751         GEM_BUG_ON(prev == next);
1752         GEM_BUG_ON(!assert_priority_queue(prev, next));
1753
1754         /*
1755          * We do not submit known completed requests. Therefore if the next
1756          * request is already completed, we can pretend to merge it in
1757          * with the previous context (and we will skip updating the ELSP
1758          * and tracking). Thus hopefully keeping the ELSP full with active
1759          * contexts, despite the best efforts of preempt-to-busy to confuse
1760          * us.
1761          */
1762         if (i915_request_completed(next))
1763                 return true;
1764
1765         if (unlikely((i915_request_flags(prev) ^ i915_request_flags(next)) &
1766                      (BIT(I915_FENCE_FLAG_NOPREEMPT) |
1767                       BIT(I915_FENCE_FLAG_SENTINEL))))
1768                 return false;
1769
1770         if (!can_merge_ctx(prev->context, next->context))
1771                 return false;
1772
1773         GEM_BUG_ON(i915_seqno_passed(prev->fence.seqno, next->fence.seqno));
1774         return true;
1775 }
1776
1777 static void virtual_update_register_offsets(u32 *regs,
1778                                             struct intel_engine_cs *engine)
1779 {
1780         set_offsets(regs, reg_offsets(engine), engine, false);
1781 }
1782
1783 static bool virtual_matches(const struct virtual_engine *ve,
1784                             const struct i915_request *rq,
1785                             const struct intel_engine_cs *engine)
1786 {
1787         const struct intel_engine_cs *inflight;
1788
1789         if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */
1790                 return false;
1791
1792         /*
1793          * We track when the HW has completed saving the context image
1794          * (i.e. when we have seen the final CS event switching out of
1795          * the context) and must not overwrite the context image before
1796          * then. This restricts us to only using the active engine
1797          * while the previous virtualized request is inflight (so
1798          * we reuse the register offsets). This is a very small
1799          * hystersis on the greedy seelction algorithm.
1800          */
1801         inflight = intel_context_inflight(&ve->context);
1802         if (inflight && inflight != engine)
1803                 return false;
1804
1805         return true;
1806 }
1807
1808 static void virtual_xfer_context(struct virtual_engine *ve,
1809                                  struct intel_engine_cs *engine)
1810 {
1811         unsigned int n;
1812
1813         if (likely(engine == ve->siblings[0]))
1814                 return;
1815
1816         GEM_BUG_ON(READ_ONCE(ve->context.inflight));
1817         if (!intel_engine_has_relative_mmio(engine))
1818                 virtual_update_register_offsets(ve->context.lrc_reg_state,
1819                                                 engine);
1820
1821         /*
1822          * Move the bound engine to the top of the list for
1823          * future execution. We then kick this tasklet first
1824          * before checking others, so that we preferentially
1825          * reuse this set of bound registers.
1826          */
1827         for (n = 1; n < ve->num_siblings; n++) {
1828                 if (ve->siblings[n] == engine) {
1829                         swap(ve->siblings[n], ve->siblings[0]);
1830                         break;
1831                 }
1832         }
1833 }
1834
1835 #define for_each_waiter(p__, rq__) \
1836         list_for_each_entry_lockless(p__, \
1837                                      &(rq__)->sched.waiters_list, \
1838                                      wait_link)
1839
1840 #define for_each_signaler(p__, rq__) \
1841         list_for_each_entry_rcu(p__, \
1842                                 &(rq__)->sched.signalers_list, \
1843                                 signal_link)
1844
1845 static void defer_request(struct i915_request *rq, struct list_head * const pl)
1846 {
1847         LIST_HEAD(list);
1848
1849         /*
1850          * We want to move the interrupted request to the back of
1851          * the round-robin list (i.e. its priority level), but
1852          * in doing so, we must then move all requests that were in
1853          * flight and were waiting for the interrupted request to
1854          * be run after it again.
1855          */
1856         do {
1857                 struct i915_dependency *p;
1858
1859                 GEM_BUG_ON(i915_request_is_active(rq));
1860                 list_move_tail(&rq->sched.link, pl);
1861
1862                 for_each_waiter(p, rq) {
1863                         struct i915_request *w =
1864                                 container_of(p->waiter, typeof(*w), sched);
1865
1866                         if (p->flags & I915_DEPENDENCY_WEAK)
1867                                 continue;
1868
1869                         /* Leave semaphores spinning on the other engines */
1870                         if (w->engine != rq->engine)
1871                                 continue;
1872
1873                         /* No waiter should start before its signaler */
1874                         GEM_BUG_ON(i915_request_has_initial_breadcrumb(w) &&
1875                                    i915_request_started(w) &&
1876                                    !i915_request_completed(rq));
1877
1878                         GEM_BUG_ON(i915_request_is_active(w));
1879                         if (!i915_request_is_ready(w))
1880                                 continue;
1881
1882                         if (rq_prio(w) < rq_prio(rq))
1883                                 continue;
1884
1885                         GEM_BUG_ON(rq_prio(w) > rq_prio(rq));
1886                         list_move_tail(&w->sched.link, &list);
1887                 }
1888
1889                 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
1890         } while (rq);
1891 }
1892
1893 static void defer_active(struct intel_engine_cs *engine)
1894 {
1895         struct i915_request *rq;
1896
1897         rq = __unwind_incomplete_requests(engine);
1898         if (!rq)
1899                 return;
1900
1901         defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq)));
1902 }
1903
1904 static bool
1905 need_timeslice(const struct intel_engine_cs *engine,
1906                const struct i915_request *rq,
1907                const struct rb_node *rb)
1908 {
1909         int hint;
1910
1911         if (!intel_engine_has_timeslices(engine))
1912                 return false;
1913
1914         hint = engine->execlists.queue_priority_hint;
1915
1916         if (rb) {
1917                 const struct virtual_engine *ve =
1918                         rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1919                 const struct intel_engine_cs *inflight =
1920                         intel_context_inflight(&ve->context);
1921
1922                 if (!inflight || inflight == engine) {
1923                         struct i915_request *next;
1924
1925                         rcu_read_lock();
1926                         next = READ_ONCE(ve->request);
1927                         if (next)
1928                                 hint = max(hint, rq_prio(next));
1929                         rcu_read_unlock();
1930                 }
1931         }
1932
1933         if (!list_is_last(&rq->sched.link, &engine->active.requests))
1934                 hint = max(hint, rq_prio(list_next_entry(rq, sched.link)));
1935
1936         GEM_BUG_ON(hint >= I915_PRIORITY_UNPREEMPTABLE);
1937         return hint >= effective_prio(rq);
1938 }
1939
1940 static bool
1941 timeslice_yield(const struct intel_engine_execlists *el,
1942                 const struct i915_request *rq)
1943 {
1944         /*
1945          * Once bitten, forever smitten!
1946          *
1947          * If the active context ever busy-waited on a semaphore,
1948          * it will be treated as a hog until the end of its timeslice (i.e.
1949          * until it is scheduled out and replaced by a new submission,
1950          * possibly even its own lite-restore). The HW only sends an interrupt
1951          * on the first miss, and we do know if that semaphore has been
1952          * signaled, or even if it is now stuck on another semaphore. Play
1953          * safe, yield if it might be stuck -- it will be given a fresh
1954          * timeslice in the near future.
1955          */
1956         return rq->context->lrc.ccid == READ_ONCE(el->yield);
1957 }
1958
1959 static bool
1960 timeslice_expired(const struct intel_engine_execlists *el,
1961                   const struct i915_request *rq)
1962 {
1963         return timer_expired(&el->timer) || timeslice_yield(el, rq);
1964 }
1965
1966 static int
1967 switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq)
1968 {
1969         if (list_is_last(&rq->sched.link, &engine->active.requests))
1970                 return engine->execlists.queue_priority_hint;
1971
1972         return rq_prio(list_next_entry(rq, sched.link));
1973 }
1974
1975 static inline unsigned long
1976 timeslice(const struct intel_engine_cs *engine)
1977 {
1978         return READ_ONCE(engine->props.timeslice_duration_ms);
1979 }
1980
1981 static unsigned long active_timeslice(const struct intel_engine_cs *engine)
1982 {
1983         const struct intel_engine_execlists *execlists = &engine->execlists;
1984         const struct i915_request *rq = *execlists->active;
1985
1986         if (!rq || i915_request_completed(rq))
1987                 return 0;
1988
1989         if (READ_ONCE(execlists->switch_priority_hint) < effective_prio(rq))
1990                 return 0;
1991
1992         return timeslice(engine);
1993 }
1994
1995 static void set_timeslice(struct intel_engine_cs *engine)
1996 {
1997         unsigned long duration;
1998
1999         if (!intel_engine_has_timeslices(engine))
2000                 return;
2001
2002         duration = active_timeslice(engine);
2003         ENGINE_TRACE(engine, "bump timeslicing, interval:%lu", duration);
2004
2005         set_timer_ms(&engine->execlists.timer, duration);
2006 }
2007
2008 static void start_timeslice(struct intel_engine_cs *engine, int prio)
2009 {
2010         struct intel_engine_execlists *execlists = &engine->execlists;
2011         unsigned long duration;
2012
2013         if (!intel_engine_has_timeslices(engine))
2014                 return;
2015
2016         WRITE_ONCE(execlists->switch_priority_hint, prio);
2017         if (prio == INT_MIN)
2018                 return;
2019
2020         if (timer_pending(&execlists->timer))
2021                 return;
2022
2023         duration = timeslice(engine);
2024         ENGINE_TRACE(engine,
2025                      "start timeslicing, prio:%d, interval:%lu",
2026                      prio, duration);
2027
2028         set_timer_ms(&execlists->timer, duration);
2029 }
2030
2031 static void record_preemption(struct intel_engine_execlists *execlists)
2032 {
2033         (void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++);
2034 }
2035
2036 static unsigned long active_preempt_timeout(struct intel_engine_cs *engine,
2037                                             const struct i915_request *rq)
2038 {
2039         if (!rq)
2040                 return 0;
2041
2042         /* Force a fast reset for terminated contexts (ignoring sysfs!) */
2043         if (unlikely(intel_context_is_banned(rq->context)))
2044                 return 1;
2045
2046         return READ_ONCE(engine->props.preempt_timeout_ms);
2047 }
2048
2049 static void set_preempt_timeout(struct intel_engine_cs *engine,
2050                                 const struct i915_request *rq)
2051 {
2052         if (!intel_engine_has_preempt_reset(engine))
2053                 return;
2054
2055         set_timer_ms(&engine->execlists.preempt,
2056                      active_preempt_timeout(engine, rq));
2057 }
2058
2059 static inline void clear_ports(struct i915_request **ports, int count)
2060 {
2061         memset_p((void **)ports, NULL, count);
2062 }
2063
2064 static inline void
2065 copy_ports(struct i915_request **dst, struct i915_request **src, int count)
2066 {
2067         /* A memcpy_p() would be very useful here! */
2068         while (count--)
2069                 WRITE_ONCE(*dst++, *src++); /* avoid write tearing */
2070 }
2071
2072 static void execlists_dequeue(struct intel_engine_cs *engine)
2073 {
2074         struct intel_engine_execlists * const execlists = &engine->execlists;
2075         struct i915_request **port = execlists->pending;
2076         struct i915_request ** const last_port = port + execlists->port_mask;
2077         struct i915_request * const *active;
2078         struct i915_request *last;
2079         struct rb_node *rb;
2080         bool submit = false;
2081
2082         /*
2083          * Hardware submission is through 2 ports. Conceptually each port
2084          * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
2085          * static for a context, and unique to each, so we only execute
2086          * requests belonging to a single context from each ring. RING_HEAD
2087          * is maintained by the CS in the context image, it marks the place
2088          * where it got up to last time, and through RING_TAIL we tell the CS
2089          * where we want to execute up to this time.
2090          *
2091          * In this list the requests are in order of execution. Consecutive
2092          * requests from the same context are adjacent in the ringbuffer. We
2093          * can combine these requests into a single RING_TAIL update:
2094          *
2095          *              RING_HEAD...req1...req2
2096          *                                    ^- RING_TAIL
2097          * since to execute req2 the CS must first execute req1.
2098          *
2099          * Our goal then is to point each port to the end of a consecutive
2100          * sequence of requests as being the most optimal (fewest wake ups
2101          * and context switches) submission.
2102          */
2103
2104         for (rb = rb_first_cached(&execlists->virtual); rb; ) {
2105                 struct virtual_engine *ve =
2106                         rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
2107                 struct i915_request *rq = READ_ONCE(ve->request);
2108
2109                 if (!rq) { /* lazily cleanup after another engine handled rq */
2110                         rb_erase_cached(rb, &execlists->virtual);
2111                         RB_CLEAR_NODE(rb);
2112                         rb = rb_first_cached(&execlists->virtual);
2113                         continue;
2114                 }
2115
2116                 if (!virtual_matches(ve, rq, engine)) {
2117                         rb = rb_next(rb);
2118                         continue;
2119                 }
2120
2121                 break;
2122         }
2123
2124         /*
2125          * If the queue is higher priority than the last
2126          * request in the currently active context, submit afresh.
2127          * We will resubmit again afterwards in case we need to split
2128          * the active context to interject the preemption request,
2129          * i.e. we will retrigger preemption following the ack in case
2130          * of trouble.
2131          */
2132         active = READ_ONCE(execlists->active);
2133
2134         /*
2135          * In theory we can skip over completed contexts that have not
2136          * yet been processed by events (as those events are in flight):
2137          *
2138          * while ((last = *active) && i915_request_completed(last))
2139          *      active++;
2140          *
2141          * However, the GPU cannot handle this as it will ultimately
2142          * find itself trying to jump back into a context it has just
2143          * completed and barf.
2144          */
2145
2146         if ((last = *active)) {
2147                 if (need_preempt(engine, last, rb)) {
2148                         if (i915_request_completed(last)) {
2149                                 tasklet_hi_schedule(&execlists->tasklet);
2150                                 return;
2151                         }
2152
2153                         ENGINE_TRACE(engine,
2154                                      "preempting last=%llx:%lld, prio=%d, hint=%d\n",
2155                                      last->fence.context,
2156                                      last->fence.seqno,
2157                                      last->sched.attr.priority,
2158                                      execlists->queue_priority_hint);
2159                         record_preemption(execlists);
2160
2161                         /*
2162                          * Don't let the RING_HEAD advance past the breadcrumb
2163                          * as we unwind (and until we resubmit) so that we do
2164                          * not accidentally tell it to go backwards.
2165                          */
2166                         ring_set_paused(engine, 1);
2167
2168                         /*
2169                          * Note that we have not stopped the GPU at this point,
2170                          * so we are unwinding the incomplete requests as they
2171                          * remain inflight and so by the time we do complete
2172                          * the preemption, some of the unwound requests may
2173                          * complete!
2174                          */
2175                         __unwind_incomplete_requests(engine);
2176
2177                         last = NULL;
2178                 } else if (need_timeslice(engine, last, rb) &&
2179                            timeslice_expired(execlists, last)) {
2180                         if (i915_request_completed(last)) {
2181                                 tasklet_hi_schedule(&execlists->tasklet);
2182                                 return;
2183                         }
2184
2185                         ENGINE_TRACE(engine,
2186                                      "expired last=%llx:%lld, prio=%d, hint=%d, yield?=%s\n",
2187                                      last->fence.context,
2188                                      last->fence.seqno,
2189                                      last->sched.attr.priority,
2190                                      execlists->queue_priority_hint,
2191                                      yesno(timeslice_yield(execlists, last)));
2192
2193                         ring_set_paused(engine, 1);
2194                         defer_active(engine);
2195
2196                         /*
2197                          * Unlike for preemption, if we rewind and continue
2198                          * executing the same context as previously active,
2199                          * the order of execution will remain the same and
2200                          * the tail will only advance. We do not need to
2201                          * force a full context restore, as a lite-restore
2202                          * is sufficient to resample the monotonic TAIL.
2203                          *
2204                          * If we switch to any other context, similarly we
2205                          * will not rewind TAIL of current context, and
2206                          * normal save/restore will preserve state and allow
2207                          * us to later continue executing the same request.
2208                          */
2209                         last = NULL;
2210                 } else {
2211                         /*
2212                          * Otherwise if we already have a request pending
2213                          * for execution after the current one, we can
2214                          * just wait until the next CS event before
2215                          * queuing more. In either case we will force a
2216                          * lite-restore preemption event, but if we wait
2217                          * we hopefully coalesce several updates into a single
2218                          * submission.
2219                          */
2220                         if (!list_is_last(&last->sched.link,
2221                                           &engine->active.requests)) {
2222                                 /*
2223                                  * Even if ELSP[1] is occupied and not worthy
2224                                  * of timeslices, our queue might be.
2225                                  */
2226                                 start_timeslice(engine, queue_prio(execlists));
2227                                 return;
2228                         }
2229                 }
2230         }
2231
2232         while (rb) { /* XXX virtual is always taking precedence */
2233                 struct virtual_engine *ve =
2234                         rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
2235                 struct i915_request *rq;
2236
2237                 spin_lock(&ve->base.active.lock);
2238
2239                 rq = ve->request;
2240                 if (unlikely(!rq)) { /* lost the race to a sibling */
2241                         spin_unlock(&ve->base.active.lock);
2242                         rb_erase_cached(rb, &execlists->virtual);
2243                         RB_CLEAR_NODE(rb);
2244                         rb = rb_first_cached(&execlists->virtual);
2245                         continue;
2246                 }
2247
2248                 GEM_BUG_ON(rq != ve->request);
2249                 GEM_BUG_ON(rq->engine != &ve->base);
2250                 GEM_BUG_ON(rq->context != &ve->context);
2251
2252                 if (rq_prio(rq) >= queue_prio(execlists)) {
2253                         if (!virtual_matches(ve, rq, engine)) {
2254                                 spin_unlock(&ve->base.active.lock);
2255                                 rb = rb_next(rb);
2256                                 continue;
2257                         }
2258
2259                         if (last && !can_merge_rq(last, rq)) {
2260                                 spin_unlock(&ve->base.active.lock);
2261                                 start_timeslice(engine, rq_prio(rq));
2262                                 return; /* leave this for another sibling */
2263                         }
2264
2265                         ENGINE_TRACE(engine,
2266                                      "virtual rq=%llx:%lld%s, new engine? %s\n",
2267                                      rq->fence.context,
2268                                      rq->fence.seqno,
2269                                      i915_request_completed(rq) ? "!" :
2270                                      i915_request_started(rq) ? "*" :
2271                                      "",
2272                                      yesno(engine != ve->siblings[0]));
2273
2274                         WRITE_ONCE(ve->request, NULL);
2275                         WRITE_ONCE(ve->base.execlists.queue_priority_hint,
2276                                    INT_MIN);
2277                         rb_erase_cached(rb, &execlists->virtual);
2278                         RB_CLEAR_NODE(rb);
2279
2280                         GEM_BUG_ON(!(rq->execution_mask & engine->mask));
2281                         WRITE_ONCE(rq->engine, engine);
2282
2283                         if (__i915_request_submit(rq)) {
2284                                 /*
2285                                  * Only after we confirm that we will submit
2286                                  * this request (i.e. it has not already
2287                                  * completed), do we want to update the context.
2288                                  *
2289                                  * This serves two purposes. It avoids
2290                                  * unnecessary work if we are resubmitting an
2291                                  * already completed request after timeslicing.
2292                                  * But more importantly, it prevents us altering
2293                                  * ve->siblings[] on an idle context, where
2294                                  * we may be using ve->siblings[] in
2295                                  * virtual_context_enter / virtual_context_exit.
2296                                  */
2297                                 virtual_xfer_context(ve, engine);
2298                                 GEM_BUG_ON(ve->siblings[0] != engine);
2299
2300                                 submit = true;
2301                                 last = rq;
2302                         }
2303                         i915_request_put(rq);
2304
2305                         /*
2306                          * Hmm, we have a bunch of virtual engine requests,
2307                          * but the first one was already completed (thanks
2308                          * preempt-to-busy!). Keep looking at the veng queue
2309                          * until we have no more relevant requests (i.e.
2310                          * the normal submit queue has higher priority).
2311                          */
2312                         if (!submit) {
2313                                 spin_unlock(&ve->base.active.lock);
2314                                 rb = rb_first_cached(&execlists->virtual);
2315                                 continue;
2316                         }
2317                 }
2318
2319                 spin_unlock(&ve->base.active.lock);
2320                 break;
2321         }
2322
2323         while ((rb = rb_first_cached(&execlists->queue))) {
2324                 struct i915_priolist *p = to_priolist(rb);
2325                 struct i915_request *rq, *rn;
2326                 int i;
2327
2328                 priolist_for_each_request_consume(rq, rn, p, i) {
2329                         bool merge = true;
2330
2331                         /*
2332                          * Can we combine this request with the current port?
2333                          * It has to be the same context/ringbuffer and not
2334                          * have any exceptions (e.g. GVT saying never to
2335                          * combine contexts).
2336                          *
2337                          * If we can combine the requests, we can execute both
2338                          * by updating the RING_TAIL to point to the end of the
2339                          * second request, and so we never need to tell the
2340                          * hardware about the first.
2341                          */
2342                         if (last && !can_merge_rq(last, rq)) {
2343                                 /*
2344                                  * If we are on the second port and cannot
2345                                  * combine this request with the last, then we
2346                                  * are done.
2347                                  */
2348                                 if (port == last_port)
2349                                         goto done;
2350
2351                                 /*
2352                                  * We must not populate both ELSP[] with the
2353                                  * same LRCA, i.e. we must submit 2 different
2354                                  * contexts if we submit 2 ELSP.
2355                                  */
2356                                 if (last->context == rq->context)
2357                                         goto done;
2358
2359                                 if (i915_request_has_sentinel(last))
2360                                         goto done;
2361
2362                                 /*
2363                                  * If GVT overrides us we only ever submit
2364                                  * port[0], leaving port[1] empty. Note that we
2365                                  * also have to be careful that we don't queue
2366                                  * the same context (even though a different
2367                                  * request) to the second port.
2368                                  */
2369                                 if (ctx_single_port_submission(last->context) ||
2370                                     ctx_single_port_submission(rq->context))
2371                                         goto done;
2372
2373                                 merge = false;
2374                         }
2375
2376                         if (__i915_request_submit(rq)) {
2377                                 if (!merge) {
2378                                         *port = execlists_schedule_in(last, port - execlists->pending);
2379                                         port++;
2380                                         last = NULL;
2381                                 }
2382
2383                                 GEM_BUG_ON(last &&
2384                                            !can_merge_ctx(last->context,
2385                                                           rq->context));
2386                                 GEM_BUG_ON(last &&
2387                                            i915_seqno_passed(last->fence.seqno,
2388                                                              rq->fence.seqno));
2389
2390                                 submit = true;
2391                                 last = rq;
2392                         }
2393                 }
2394
2395                 rb_erase_cached(&p->node, &execlists->queue);
2396                 i915_priolist_free(p);
2397         }
2398
2399 done:
2400         /*
2401          * Here be a bit of magic! Or sleight-of-hand, whichever you prefer.
2402          *
2403          * We choose the priority hint such that if we add a request of greater
2404          * priority than this, we kick the submission tasklet to decide on
2405          * the right order of submitting the requests to hardware. We must
2406          * also be prepared to reorder requests as they are in-flight on the
2407          * HW. We derive the priority hint then as the first "hole" in
2408          * the HW submission ports and if there are no available slots,
2409          * the priority of the lowest executing request, i.e. last.
2410          *
2411          * When we do receive a higher priority request ready to run from the
2412          * user, see queue_request(), the priority hint is bumped to that
2413          * request triggering preemption on the next dequeue (or subsequent
2414          * interrupt for secondary ports).
2415          */
2416         execlists->queue_priority_hint = queue_prio(execlists);
2417
2418         if (submit) {
2419                 *port = execlists_schedule_in(last, port - execlists->pending);
2420                 execlists->switch_priority_hint =
2421                         switch_prio(engine, *execlists->pending);
2422
2423                 /*
2424                  * Skip if we ended up with exactly the same set of requests,
2425                  * e.g. trying to timeslice a pair of ordered contexts
2426                  */
2427                 if (!memcmp(active, execlists->pending,
2428                             (port - execlists->pending + 1) * sizeof(*port))) {
2429                         do
2430                                 execlists_schedule_out(fetch_and_zero(port));
2431                         while (port-- != execlists->pending);
2432
2433                         goto skip_submit;
2434                 }
2435                 clear_ports(port + 1, last_port - port);
2436
2437                 WRITE_ONCE(execlists->yield, -1);
2438                 set_preempt_timeout(engine, *active);
2439                 execlists_submit_ports(engine);
2440         } else {
2441                 start_timeslice(engine, execlists->queue_priority_hint);
2442 skip_submit:
2443                 ring_set_paused(engine, 0);
2444         }
2445 }
2446
2447 static void
2448 cancel_port_requests(struct intel_engine_execlists * const execlists)
2449 {
2450         struct i915_request * const *port;
2451
2452         for (port = execlists->pending; *port; port++)
2453                 execlists_schedule_out(*port);
2454         clear_ports(execlists->pending, ARRAY_SIZE(execlists->pending));
2455
2456         /* Mark the end of active before we overwrite *active */
2457         for (port = xchg(&execlists->active, execlists->pending); *port; port++)
2458                 execlists_schedule_out(*port);
2459         clear_ports(execlists->inflight, ARRAY_SIZE(execlists->inflight));
2460
2461         smp_wmb(); /* complete the seqlock for execlists_active() */
2462         WRITE_ONCE(execlists->active, execlists->inflight);
2463 }
2464
2465 static inline void
2466 invalidate_csb_entries(const u64 *first, const u64 *last)
2467 {
2468         clflush((void *)first);
2469         clflush((void *)last);
2470 }
2471
2472 /*
2473  * Starting with Gen12, the status has a new format:
2474  *
2475  *     bit  0:     switched to new queue
2476  *     bit  1:     reserved
2477  *     bit  2:     semaphore wait mode (poll or signal), only valid when
2478  *                 switch detail is set to "wait on semaphore"
2479  *     bits 3-5:   engine class
2480  *     bits 6-11:  engine instance
2481  *     bits 12-14: reserved
2482  *     bits 15-25: sw context id of the lrc the GT switched to
2483  *     bits 26-31: sw counter of the lrc the GT switched to
2484  *     bits 32-35: context switch detail
2485  *                  - 0: ctx complete
2486  *                  - 1: wait on sync flip
2487  *                  - 2: wait on vblank
2488  *                  - 3: wait on scanline
2489  *                  - 4: wait on semaphore
2490  *                  - 5: context preempted (not on SEMAPHORE_WAIT or
2491  *                       WAIT_FOR_EVENT)
2492  *     bit  36:    reserved
2493  *     bits 37-43: wait detail (for switch detail 1 to 4)
2494  *     bits 44-46: reserved
2495  *     bits 47-57: sw context id of the lrc the GT switched away from
2496  *     bits 58-63: sw counter of the lrc the GT switched away from
2497  */
2498 static inline bool gen12_csb_parse(const u64 *csb)
2499 {
2500         bool ctx_away_valid;
2501         bool new_queue;
2502         u64 entry;
2503
2504         /* HSD#22011248461 */
2505         entry = READ_ONCE(*csb);
2506         if (unlikely(entry == -1)) {
2507                 preempt_disable();
2508                 if (wait_for_atomic_us((entry = READ_ONCE(*csb)) != -1, 50))
2509                         GEM_WARN_ON("50us CSB timeout");
2510                 preempt_enable();
2511         }
2512         WRITE_ONCE(*(u64 *)csb, -1);
2513
2514         ctx_away_valid = GEN12_CSB_CTX_VALID(upper_32_bits(entry));
2515         new_queue =
2516                 lower_32_bits(entry) & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE;
2517
2518         /*
2519          * The context switch detail is not guaranteed to be 5 when a preemption
2520          * occurs, so we can't just check for that. The check below works for
2521          * all the cases we care about, including preemptions of WAIT
2522          * instructions and lite-restore. Preempt-to-idle via the CTRL register
2523          * would require some extra handling, but we don't support that.
2524          */
2525         if (!ctx_away_valid || new_queue) {
2526                 GEM_BUG_ON(!GEN12_CSB_CTX_VALID(lower_32_bits(entry)));
2527                 return true;
2528         }
2529
2530         /*
2531          * switch detail = 5 is covered by the case above and we do not expect a
2532          * context switch on an unsuccessful wait instruction since we always
2533          * use polling mode.
2534          */
2535         GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_32_bits(entry)));
2536         return false;
2537 }
2538
2539 static inline bool gen8_csb_parse(const u64 *csb)
2540 {
2541         return *csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED);
2542 }
2543
2544 static void process_csb(struct intel_engine_cs *engine)
2545 {
2546         struct intel_engine_execlists * const execlists = &engine->execlists;
2547         const u64 * const buf = execlists->csb_status;
2548         const u8 num_entries = execlists->csb_size;
2549         u8 head, tail;
2550
2551         /*
2552          * As we modify our execlists state tracking we require exclusive
2553          * access. Either we are inside the tasklet, or the tasklet is disabled
2554          * and we assume that is only inside the reset paths and so serialised.
2555          */
2556         GEM_BUG_ON(!tasklet_is_locked(&execlists->tasklet) &&
2557                    !reset_in_progress(execlists));
2558         GEM_BUG_ON(!intel_engine_in_execlists_submission_mode(engine));
2559
2560         /*
2561          * Note that csb_write, csb_status may be either in HWSP or mmio.
2562          * When reading from the csb_write mmio register, we have to be
2563          * careful to only use the GEN8_CSB_WRITE_PTR portion, which is
2564          * the low 4bits. As it happens we know the next 4bits are always
2565          * zero and so we can simply masked off the low u8 of the register
2566          * and treat it identically to reading from the HWSP (without having
2567          * to use explicit shifting and masking, and probably bifurcating
2568          * the code to handle the legacy mmio read).
2569          */
2570         head = execlists->csb_head;
2571         tail = READ_ONCE(*execlists->csb_write);
2572         if (unlikely(head == tail))
2573                 return;
2574
2575         /*
2576          * We will consume all events from HW, or at least pretend to.
2577          *
2578          * The sequence of events from the HW is deterministic, and derived
2579          * from our writes to the ELSP, with a smidgen of variability for
2580          * the arrival of the asynchronous requests wrt to the inflight
2581          * execution. If the HW sends an event that does not correspond with
2582          * the one we are expecting, we have to abandon all hope as we lose
2583          * all tracking of what the engine is actually executing. We will
2584          * only detect we are out of sequence with the HW when we get an
2585          * 'impossible' event because we have already drained our own
2586          * preemption/promotion queue. If this occurs, we know that we likely
2587          * lost track of execution earlier and must unwind and restart, the
2588          * simplest way is by stop processing the event queue and force the
2589          * engine to reset.
2590          */
2591         execlists->csb_head = tail;
2592         ENGINE_TRACE(engine, "cs-irq head=%d, tail=%d\n", head, tail);
2593
2594         /*
2595          * Hopefully paired with a wmb() in HW!
2596          *
2597          * We must complete the read of the write pointer before any reads
2598          * from the CSB, so that we do not see stale values. Without an rmb
2599          * (lfence) the HW may speculatively perform the CSB[] reads *before*
2600          * we perform the READ_ONCE(*csb_write).
2601          */
2602         rmb();
2603         do {
2604                 bool promote;
2605
2606                 if (++head == num_entries)
2607                         head = 0;
2608
2609                 /*
2610                  * We are flying near dragons again.
2611                  *
2612                  * We hold a reference to the request in execlist_port[]
2613                  * but no more than that. We are operating in softirq
2614                  * context and so cannot hold any mutex or sleep. That
2615                  * prevents us stopping the requests we are processing
2616                  * in port[] from being retired simultaneously (the
2617                  * breadcrumb will be complete before we see the
2618                  * context-switch). As we only hold the reference to the
2619                  * request, any pointer chasing underneath the request
2620                  * is subject to a potential use-after-free. Thus we
2621                  * store all of the bookkeeping within port[] as
2622                  * required, and avoid using unguarded pointers beneath
2623                  * request itself. The same applies to the atomic
2624                  * status notifier.
2625                  */
2626
2627                 ENGINE_TRACE(engine, "csb[%d]: status=0x%08x:0x%08x\n",
2628                              head,
2629                              upper_32_bits(buf[head]),
2630                              lower_32_bits(buf[head]));
2631
2632                 if (INTEL_GEN(engine->i915) >= 12)
2633                         promote = gen12_csb_parse(buf + head);
2634                 else
2635                         promote = gen8_csb_parse(buf + head);
2636                 if (promote) {
2637                         struct i915_request * const *old = execlists->active;
2638
2639                         if (GEM_WARN_ON(!*execlists->pending)) {
2640                                 execlists->error_interrupt |= ERROR_CSB;
2641                                 break;
2642                         }
2643
2644                         ring_set_paused(engine, 0);
2645
2646                         /* Point active to the new ELSP; prevent overwriting */
2647                         WRITE_ONCE(execlists->active, execlists->pending);
2648                         smp_wmb(); /* notify execlists_active() */
2649
2650                         /* cancel old inflight, prepare for switch */
2651                         trace_ports(execlists, "preempted", old);
2652                         while (*old)
2653                                 execlists_schedule_out(*old++);
2654
2655                         /* switch pending to inflight */
2656                         GEM_BUG_ON(!assert_pending_valid(execlists, "promote"));
2657                         copy_ports(execlists->inflight,
2658                                    execlists->pending,
2659                                    execlists_num_ports(execlists));
2660                         smp_wmb(); /* complete the seqlock */
2661                         WRITE_ONCE(execlists->active, execlists->inflight);
2662
2663                         /* XXX Magic delay for tgl */
2664                         ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
2665
2666                         WRITE_ONCE(execlists->pending[0], NULL);
2667                 } else {
2668                         if (GEM_WARN_ON(!*execlists->active)) {
2669                                 execlists->error_interrupt |= ERROR_CSB;
2670                                 break;
2671                         }
2672
2673                         /* port0 completed, advanced to port1 */
2674                         trace_ports(execlists, "completed", execlists->active);
2675
2676                         /*
2677                          * We rely on the hardware being strongly
2678                          * ordered, that the breadcrumb write is
2679                          * coherent (visible from the CPU) before the
2680                          * user interrupt is processed. One might assume
2681                          * that the breadcrumb write being before the
2682                          * user interrupt and the CS event for the context
2683                          * switch would therefore be before the CS event
2684                          * itself...
2685                          */
2686                         if (GEM_SHOW_DEBUG() &&
2687                             !i915_request_completed(*execlists->active)) {
2688                                 struct i915_request *rq = *execlists->active;
2689                                 const u32 *regs __maybe_unused =
2690                                         rq->context->lrc_reg_state;
2691
2692                                 ENGINE_TRACE(engine,
2693                                              "context completed before request!\n");
2694                                 ENGINE_TRACE(engine,
2695                                              "ring:{start:0x%08x, head:%04x, tail:%04x, ctl:%08x, mode:%08x}\n",
2696                                              ENGINE_READ(engine, RING_START),
2697                                              ENGINE_READ(engine, RING_HEAD) & HEAD_ADDR,
2698                                              ENGINE_READ(engine, RING_TAIL) & TAIL_ADDR,
2699                                              ENGINE_READ(engine, RING_CTL),
2700                                              ENGINE_READ(engine, RING_MI_MODE));
2701                                 ENGINE_TRACE(engine,
2702                                              "rq:{start:%08x, head:%04x, tail:%04x, seqno:%llx:%d, hwsp:%d}, ",
2703                                              i915_ggtt_offset(rq->ring->vma),
2704                                              rq->head, rq->tail,
2705                                              rq->fence.context,
2706                                              lower_32_bits(rq->fence.seqno),
2707                                              hwsp_seqno(rq));
2708                                 ENGINE_TRACE(engine,
2709                                              "ctx:{start:%08x, head:%04x, tail:%04x}, ",
2710                                              regs[CTX_RING_START],
2711                                              regs[CTX_RING_HEAD],
2712                                              regs[CTX_RING_TAIL]);
2713                         }
2714
2715                         execlists_schedule_out(*execlists->active++);
2716
2717                         GEM_BUG_ON(execlists->active - execlists->inflight >
2718                                    execlists_num_ports(execlists));
2719                 }
2720         } while (head != tail);
2721
2722         set_timeslice(engine);
2723
2724         /*
2725          * Gen11 has proven to fail wrt global observation point between
2726          * entry and tail update, failing on the ordering and thus
2727          * we see an old entry in the context status buffer.
2728          *
2729          * Forcibly evict out entries for the next gpu csb update,
2730          * to increase the odds that we get a fresh entries with non
2731          * working hardware. The cost for doing so comes out mostly with
2732          * the wash as hardware, working or not, will need to do the
2733          * invalidation before.
2734          */
2735         invalidate_csb_entries(&buf[0], &buf[num_entries - 1]);
2736 }
2737
2738 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
2739 {
2740         lockdep_assert_held(&engine->active.lock);
2741         if (!READ_ONCE(engine->execlists.pending[0])) {
2742                 rcu_read_lock(); /* protect peeking at execlists->active */
2743                 execlists_dequeue(engine);
2744                 rcu_read_unlock();
2745         }
2746 }
2747
2748 static void __execlists_hold(struct i915_request *rq)
2749 {
2750         LIST_HEAD(list);
2751
2752         do {
2753                 struct i915_dependency *p;
2754
2755                 if (i915_request_is_active(rq))
2756                         __i915_request_unsubmit(rq);
2757
2758                 clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2759                 list_move_tail(&rq->sched.link, &rq->engine->active.hold);
2760                 i915_request_set_hold(rq);
2761                 RQ_TRACE(rq, "on hold\n");
2762
2763                 for_each_waiter(p, rq) {
2764                         struct i915_request *w =
2765                                 container_of(p->waiter, typeof(*w), sched);
2766
2767                         /* Leave semaphores spinning on the other engines */
2768                         if (w->engine != rq->engine)
2769                                 continue;
2770
2771                         if (!i915_request_is_ready(w))
2772                                 continue;
2773
2774                         if (i915_request_completed(w))
2775                                 continue;
2776
2777                         if (i915_request_on_hold(w))
2778                                 continue;
2779
2780                         list_move_tail(&w->sched.link, &list);
2781                 }
2782
2783                 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2784         } while (rq);
2785 }
2786
2787 static bool execlists_hold(struct intel_engine_cs *engine,
2788                            struct i915_request *rq)
2789 {
2790         spin_lock_irq(&engine->active.lock);
2791
2792         if (i915_request_completed(rq)) { /* too late! */
2793                 rq = NULL;
2794                 goto unlock;
2795         }
2796
2797         if (rq->engine != engine) { /* preempted virtual engine */
2798                 struct virtual_engine *ve = to_virtual_engine(rq->engine);
2799
2800                 /*
2801                  * intel_context_inflight() is only protected by virtue
2802                  * of process_csb() being called only by the tasklet (or
2803                  * directly from inside reset while the tasklet is suspended).
2804                  * Assert that neither of those are allowed to run while we
2805                  * poke at the request queues.
2806                  */
2807                 GEM_BUG_ON(!reset_in_progress(&engine->execlists));
2808
2809                 /*
2810                  * An unsubmitted request along a virtual engine will
2811                  * remain on the active (this) engine until we are able
2812                  * to process the context switch away (and so mark the
2813                  * context as no longer in flight). That cannot have happened
2814                  * yet, otherwise we would not be hanging!
2815                  */
2816                 spin_lock(&ve->base.active.lock);
2817                 GEM_BUG_ON(intel_context_inflight(rq->context) != engine);
2818                 GEM_BUG_ON(ve->request != rq);
2819                 ve->request = NULL;
2820                 spin_unlock(&ve->base.active.lock);
2821                 i915_request_put(rq);
2822
2823                 rq->engine = engine;
2824         }
2825
2826         /*
2827          * Transfer this request onto the hold queue to prevent it
2828          * being resumbitted to HW (and potentially completed) before we have
2829          * released it. Since we may have already submitted following
2830          * requests, we need to remove those as well.
2831          */
2832         GEM_BUG_ON(i915_request_on_hold(rq));
2833         GEM_BUG_ON(rq->engine != engine);
2834         __execlists_hold(rq);
2835         GEM_BUG_ON(list_empty(&engine->active.hold));
2836
2837 unlock:
2838         spin_unlock_irq(&engine->active.lock);
2839         return rq;
2840 }
2841
2842 static bool hold_request(const struct i915_request *rq)
2843 {
2844         struct i915_dependency *p;
2845         bool result = false;
2846
2847         /*
2848          * If one of our ancestors is on hold, we must also be on hold,
2849          * otherwise we will bypass it and execute before it.
2850          */
2851         rcu_read_lock();
2852         for_each_signaler(p, rq) {
2853                 const struct i915_request *s =
2854                         container_of(p->signaler, typeof(*s), sched);
2855
2856                 if (s->engine != rq->engine)
2857                         continue;
2858
2859                 result = i915_request_on_hold(s);
2860                 if (result)
2861                         break;
2862         }
2863         rcu_read_unlock();
2864
2865         return result;
2866 }
2867
2868 static void __execlists_unhold(struct i915_request *rq)
2869 {
2870         LIST_HEAD(list);
2871
2872         do {
2873                 struct i915_dependency *p;
2874
2875                 RQ_TRACE(rq, "hold release\n");
2876
2877                 GEM_BUG_ON(!i915_request_on_hold(rq));
2878                 GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit));
2879
2880                 i915_request_clear_hold(rq);
2881                 list_move_tail(&rq->sched.link,
2882                                i915_sched_lookup_priolist(rq->engine,
2883                                                           rq_prio(rq)));
2884                 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2885
2886                 /* Also release any children on this engine that are ready */
2887                 for_each_waiter(p, rq) {
2888                         struct i915_request *w =
2889                                 container_of(p->waiter, typeof(*w), sched);
2890
2891                         /* Propagate any change in error status */
2892                         if (rq->fence.error)
2893                                 i915_request_set_error_once(w, rq->fence.error);
2894
2895                         if (w->engine != rq->engine)
2896                                 continue;
2897
2898                         if (!i915_request_on_hold(w))
2899                                 continue;
2900
2901                         /* Check that no other parents are also on hold */
2902                         if (hold_request(w))
2903                                 continue;
2904
2905                         list_move_tail(&w->sched.link, &list);
2906                 }
2907
2908                 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2909         } while (rq);
2910 }
2911
2912 static void execlists_unhold(struct intel_engine_cs *engine,
2913                              struct i915_request *rq)
2914 {
2915         spin_lock_irq(&engine->active.lock);
2916
2917         /*
2918          * Move this request back to the priority queue, and all of its
2919          * children and grandchildren that were suspended along with it.
2920          */
2921         __execlists_unhold(rq);
2922
2923         if (rq_prio(rq) > engine->execlists.queue_priority_hint) {
2924                 engine->execlists.queue_priority_hint = rq_prio(rq);
2925                 tasklet_hi_schedule(&engine->execlists.tasklet);
2926         }
2927
2928         spin_unlock_irq(&engine->active.lock);
2929 }
2930
2931 struct execlists_capture {
2932         struct work_struct work;
2933         struct i915_request *rq;
2934         struct i915_gpu_coredump *error;
2935 };
2936
2937 static void execlists_capture_work(struct work_struct *work)
2938 {
2939         struct execlists_capture *cap = container_of(work, typeof(*cap), work);
2940         const gfp_t gfp = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN;
2941         struct intel_engine_cs *engine = cap->rq->engine;
2942         struct intel_gt_coredump *gt = cap->error->gt;
2943         struct intel_engine_capture_vma *vma;
2944
2945         /* Compress all the objects attached to the request, slow! */
2946         vma = intel_engine_coredump_add_request(gt->engine, cap->rq, gfp);
2947         if (vma) {
2948                 struct i915_vma_compress *compress =
2949                         i915_vma_capture_prepare(gt);
2950
2951                 intel_engine_coredump_add_vma(gt->engine, vma, compress);
2952                 i915_vma_capture_finish(gt, compress);
2953         }
2954
2955         gt->simulated = gt->engine->simulated;
2956         cap->error->simulated = gt->simulated;
2957
2958         /* Publish the error state, and announce it to the world */
2959         i915_error_state_store(cap->error);
2960         i915_gpu_coredump_put(cap->error);
2961
2962         /* Return this request and all that depend upon it for signaling */
2963         execlists_unhold(engine, cap->rq);
2964         i915_request_put(cap->rq);
2965
2966         kfree(cap);
2967 }
2968
2969 static struct execlists_capture *capture_regs(struct intel_engine_cs *engine)
2970 {
2971         const gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
2972         struct execlists_capture *cap;
2973
2974         cap = kmalloc(sizeof(*cap), gfp);
2975         if (!cap)
2976                 return NULL;
2977
2978         cap->error = i915_gpu_coredump_alloc(engine->i915, gfp);
2979         if (!cap->error)
2980                 goto err_cap;
2981
2982         cap->error->gt = intel_gt_coredump_alloc(engine->gt, gfp);
2983         if (!cap->error->gt)
2984                 goto err_gpu;
2985
2986         cap->error->gt->engine = intel_engine_coredump_alloc(engine, gfp);
2987         if (!cap->error->gt->engine)
2988                 goto err_gt;
2989
2990         return cap;
2991
2992 err_gt:
2993         kfree(cap->error->gt);
2994 err_gpu:
2995         kfree(cap->error);
2996 err_cap:
2997         kfree(cap);
2998         return NULL;
2999 }
3000
3001 static struct i915_request *
3002 active_context(struct intel_engine_cs *engine, u32 ccid)
3003 {
3004         const struct intel_engine_execlists * const el = &engine->execlists;
3005         struct i915_request * const *port, *rq;
3006
3007         /*
3008          * Use the most recent result from process_csb(), but just in case
3009          * we trigger an error (via interrupt) before the first CS event has
3010          * been written, peek at the next submission.
3011          */
3012
3013         for (port = el->active; (rq = *port); port++) {
3014                 if (rq->context->lrc.ccid == ccid) {
3015                         ENGINE_TRACE(engine,
3016                                      "ccid found at active:%zd\n",
3017                                      port - el->active);
3018                         return rq;
3019                 }
3020         }
3021
3022         for (port = el->pending; (rq = *port); port++) {
3023                 if (rq->context->lrc.ccid == ccid) {
3024                         ENGINE_TRACE(engine,
3025                                      "ccid found at pending:%zd\n",
3026                                      port - el->pending);
3027                         return rq;
3028                 }
3029         }
3030
3031         ENGINE_TRACE(engine, "ccid:%x not found\n", ccid);
3032         return NULL;
3033 }
3034
3035 static u32 active_ccid(struct intel_engine_cs *engine)
3036 {
3037         return ENGINE_READ_FW(engine, RING_EXECLIST_STATUS_HI);
3038 }
3039
3040 static void execlists_capture(struct intel_engine_cs *engine)
3041 {
3042         struct execlists_capture *cap;
3043
3044         if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR))
3045                 return;
3046
3047         /*
3048          * We need to _quickly_ capture the engine state before we reset.
3049          * We are inside an atomic section (softirq) here and we are delaying
3050          * the forced preemption event.
3051          */
3052         cap = capture_regs(engine);
3053         if (!cap)
3054                 return;
3055
3056         spin_lock_irq(&engine->active.lock);
3057         cap->rq = active_context(engine, active_ccid(engine));
3058         if (cap->rq) {
3059                 cap->rq = active_request(cap->rq->context->timeline, cap->rq);
3060                 cap->rq = i915_request_get_rcu(cap->rq);
3061         }
3062         spin_unlock_irq(&engine->active.lock);
3063         if (!cap->rq)
3064                 goto err_free;
3065
3066         /*
3067          * Remove the request from the execlists queue, and take ownership
3068          * of the request. We pass it to our worker who will _slowly_ compress
3069          * all the pages the _user_ requested for debugging their batch, after
3070          * which we return it to the queue for signaling.
3071          *
3072          * By removing them from the execlists queue, we also remove the
3073          * requests from being processed by __unwind_incomplete_requests()
3074          * during the intel_engine_reset(), and so they will *not* be replayed
3075          * afterwards.
3076          *
3077          * Note that because we have not yet reset the engine at this point,
3078          * it is possible for the request that we have identified as being
3079          * guilty, did in fact complete and we will then hit an arbitration
3080          * point allowing the outstanding preemption to succeed. The likelihood
3081          * of that is very low (as capturing of the engine registers should be
3082          * fast enough to run inside an irq-off atomic section!), so we will
3083          * simply hold that request accountable for being non-preemptible
3084          * long enough to force the reset.
3085          */
3086         if (!execlists_hold(engine, cap->rq))
3087                 goto err_rq;
3088
3089         INIT_WORK(&cap->work, execlists_capture_work);
3090         schedule_work(&cap->work);
3091         return;
3092
3093 err_rq:
3094         i915_request_put(cap->rq);
3095 err_free:
3096         i915_gpu_coredump_put(cap->error);
3097         kfree(cap);
3098 }
3099
3100 static void execlists_reset(struct intel_engine_cs *engine, const char *msg)
3101 {
3102         const unsigned int bit = I915_RESET_ENGINE + engine->id;
3103         unsigned long *lock = &engine->gt->reset.flags;
3104
3105         if (!intel_has_reset_engine(engine->gt))
3106                 return;
3107
3108         if (test_and_set_bit(bit, lock))
3109                 return;
3110
3111         ENGINE_TRACE(engine, "reset for %s\n", msg);
3112
3113         /* Mark this tasklet as disabled to avoid waiting for it to complete */
3114         tasklet_disable_nosync(&engine->execlists.tasklet);
3115
3116         ring_set_paused(engine, 1); /* Freeze the current request in place */
3117         execlists_capture(engine);
3118         intel_engine_reset(engine, msg);
3119
3120         tasklet_enable(&engine->execlists.tasklet);
3121         clear_and_wake_up_bit(bit, lock);
3122 }
3123
3124 static bool preempt_timeout(const struct intel_engine_cs *const engine)
3125 {
3126         const struct timer_list *t = &engine->execlists.preempt;
3127
3128         if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT)
3129                 return false;
3130
3131         if (!timer_expired(t))
3132                 return false;
3133
3134         return READ_ONCE(engine->execlists.pending[0]);
3135 }
3136
3137 /*
3138  * Check the unread Context Status Buffers and manage the submission of new
3139  * contexts to the ELSP accordingly.
3140  */
3141 static void execlists_submission_tasklet(unsigned long data)
3142 {
3143         struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
3144         bool timeout = preempt_timeout(engine);
3145
3146         process_csb(engine);
3147
3148         if (unlikely(READ_ONCE(engine->execlists.error_interrupt))) {
3149                 const char *msg;
3150
3151                 /* Generate the error message in priority wrt to the user! */
3152                 if (engine->execlists.error_interrupt & GENMASK(15, 0))
3153                         msg = "CS error"; /* thrown by a user payload */
3154                 else if (engine->execlists.error_interrupt & ERROR_CSB)
3155                         msg = "invalid CSB event";
3156                 else
3157                         msg = "internal error";
3158
3159                 engine->execlists.error_interrupt = 0;
3160                 execlists_reset(engine, msg);
3161         }
3162
3163         if (!READ_ONCE(engine->execlists.pending[0]) || timeout) {
3164                 unsigned long flags;
3165
3166                 spin_lock_irqsave(&engine->active.lock, flags);
3167                 __execlists_submission_tasklet(engine);
3168                 spin_unlock_irqrestore(&engine->active.lock, flags);
3169
3170                 /* Recheck after serialising with direct-submission */
3171                 if (unlikely(timeout && preempt_timeout(engine)))
3172                         execlists_reset(engine, "preemption time out");
3173         }
3174 }
3175
3176 static void __execlists_kick(struct intel_engine_execlists *execlists)
3177 {
3178         /* Kick the tasklet for some interrupt coalescing and reset handling */
3179         tasklet_hi_schedule(&execlists->tasklet);
3180 }
3181
3182 #define execlists_kick(t, member) \
3183         __execlists_kick(container_of(t, struct intel_engine_execlists, member))
3184
3185 static void execlists_timeslice(struct timer_list *timer)
3186 {
3187         execlists_kick(timer, timer);
3188 }
3189
3190 static void execlists_preempt(struct timer_list *timer)
3191 {
3192         execlists_kick(timer, preempt);
3193 }
3194
3195 static void queue_request(struct intel_engine_cs *engine,
3196                           struct i915_request *rq)
3197 {
3198         GEM_BUG_ON(!list_empty(&rq->sched.link));
3199         list_add_tail(&rq->sched.link,
3200                       i915_sched_lookup_priolist(engine, rq_prio(rq)));
3201         set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
3202 }
3203
3204 static void __submit_queue_imm(struct intel_engine_cs *engine)
3205 {
3206         struct intel_engine_execlists * const execlists = &engine->execlists;
3207
3208         if (reset_in_progress(execlists))
3209                 return; /* defer until we restart the engine following reset */
3210
3211         __execlists_submission_tasklet(engine);
3212 }
3213
3214 static void submit_queue(struct intel_engine_cs *engine,
3215                          const struct i915_request *rq)
3216 {
3217         struct intel_engine_execlists *execlists = &engine->execlists;
3218
3219         if (rq_prio(rq) <= execlists->queue_priority_hint)
3220                 return;
3221
3222         execlists->queue_priority_hint = rq_prio(rq);
3223         __submit_queue_imm(engine);
3224 }
3225
3226 static bool ancestor_on_hold(const struct intel_engine_cs *engine,
3227                              const struct i915_request *rq)
3228 {
3229         GEM_BUG_ON(i915_request_on_hold(rq));
3230         return !list_empty(&engine->active.hold) && hold_request(rq);
3231 }
3232
3233 static void flush_csb(struct intel_engine_cs *engine)
3234 {
3235         struct intel_engine_execlists *el = &engine->execlists;
3236
3237         if (READ_ONCE(el->pending[0]) && tasklet_trylock(&el->tasklet)) {
3238                 if (!reset_in_progress(el))
3239                         process_csb(engine);
3240                 tasklet_unlock(&el->tasklet);
3241         }
3242 }
3243
3244 static void execlists_submit_request(struct i915_request *request)
3245 {
3246         struct intel_engine_cs *engine = request->engine;
3247         unsigned long flags;
3248
3249         /* Hopefully we clear execlists->pending[] to let us through */
3250         flush_csb(engine);
3251
3252         /* Will be called from irq-context when using foreign fences. */
3253         spin_lock_irqsave(&engine->active.lock, flags);
3254
3255         if (unlikely(ancestor_on_hold(engine, request))) {
3256                 RQ_TRACE(request, "ancestor on hold\n");
3257                 list_add_tail(&request->sched.link, &engine->active.hold);
3258                 i915_request_set_hold(request);
3259         } else {
3260                 queue_request(engine, request);
3261
3262                 GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
3263                 GEM_BUG_ON(list_empty(&request->sched.link));
3264
3265                 submit_queue(engine, request);
3266         }
3267
3268         spin_unlock_irqrestore(&engine->active.lock, flags);
3269 }
3270
3271 static void __execlists_context_fini(struct intel_context *ce)
3272 {
3273         intel_ring_put(ce->ring);
3274         i915_vma_put(ce->state);
3275 }
3276
3277 static void execlists_context_destroy(struct kref *kref)
3278 {
3279         struct intel_context *ce = container_of(kref, typeof(*ce), ref);
3280
3281         GEM_BUG_ON(!i915_active_is_idle(&ce->active));
3282         GEM_BUG_ON(intel_context_is_pinned(ce));
3283
3284         if (ce->state)
3285                 __execlists_context_fini(ce);
3286
3287         intel_context_fini(ce);
3288         intel_context_free(ce);
3289 }
3290
3291 static void
3292 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
3293 {
3294         if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
3295                 return;
3296
3297         vaddr += engine->context_size;
3298
3299         memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
3300 }
3301
3302 static void
3303 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
3304 {
3305         if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
3306                 return;
3307
3308         vaddr += engine->context_size;
3309
3310         if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
3311                 drm_err_once(&engine->i915->drm,
3312                              "%s context redzone overwritten!\n",
3313                              engine->name);
3314 }
3315
3316 static void execlists_context_unpin(struct intel_context *ce)
3317 {
3318         check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
3319                       ce->engine);
3320 }
3321
3322 static void execlists_context_post_unpin(struct intel_context *ce)
3323 {
3324         i915_gem_object_unpin_map(ce->state->obj);
3325 }
3326
3327 static u32 *
3328 gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
3329 {
3330         *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
3331                 MI_SRM_LRM_GLOBAL_GTT |
3332                 MI_LRI_LRM_CS_MMIO;
3333         *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3334         *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
3335                 CTX_TIMESTAMP * sizeof(u32);
3336         *cs++ = 0;
3337
3338         *cs++ = MI_LOAD_REGISTER_REG |
3339                 MI_LRR_SOURCE_CS_MMIO |
3340                 MI_LRI_LRM_CS_MMIO;
3341         *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3342         *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
3343
3344         *cs++ = MI_LOAD_REGISTER_REG |
3345                 MI_LRR_SOURCE_CS_MMIO |
3346                 MI_LRI_LRM_CS_MMIO;
3347         *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3348         *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
3349
3350         return cs;
3351 }
3352
3353 static u32 *
3354 gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
3355 {
3356         GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);
3357
3358         *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
3359                 MI_SRM_LRM_GLOBAL_GTT |
3360                 MI_LRI_LRM_CS_MMIO;
3361         *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3362         *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
3363                 (lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
3364         *cs++ = 0;
3365
3366         return cs;
3367 }
3368
3369 static u32 *
3370 gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
3371 {
3372         GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);
3373
3374         *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
3375                 MI_SRM_LRM_GLOBAL_GTT |
3376                 MI_LRI_LRM_CS_MMIO;
3377         *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3378         *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
3379                 (lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
3380         *cs++ = 0;
3381
3382         *cs++ = MI_LOAD_REGISTER_REG |
3383                 MI_LRR_SOURCE_CS_MMIO |
3384                 MI_LRI_LRM_CS_MMIO;
3385         *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3386         *cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));
3387
3388         return cs;
3389 }
3390
3391 static u32 *
3392 gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
3393 {
3394         cs = gen12_emit_timestamp_wa(ce, cs);
3395         cs = gen12_emit_cmd_buf_wa(ce, cs);
3396         cs = gen12_emit_restore_scratch(ce, cs);
3397
3398         return cs;
3399 }
3400
3401 static u32 *
3402 gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
3403 {
3404         cs = gen12_emit_timestamp_wa(ce, cs);
3405         cs = gen12_emit_restore_scratch(ce, cs);
3406
3407         return cs;
3408 }
3409
3410 static inline u32 context_wa_bb_offset(const struct intel_context *ce)
3411 {
3412         return PAGE_SIZE * ce->wa_bb_page;
3413 }
3414
3415 static u32 *context_indirect_bb(const struct intel_context *ce)
3416 {
3417         void *ptr;
3418
3419         GEM_BUG_ON(!ce->wa_bb_page);
3420
3421         ptr = ce->lrc_reg_state;
3422         ptr -= LRC_STATE_OFFSET; /* back to start of context image */
3423         ptr += context_wa_bb_offset(ce);
3424
3425         return ptr;
3426 }
3427
3428 static void
3429 setup_indirect_ctx_bb(const struct intel_context *ce,
3430                       const struct intel_engine_cs *engine,
3431                       u32 *(*emit)(const struct intel_context *, u32 *))
3432 {
3433         u32 * const start = context_indirect_bb(ce);
3434         u32 *cs;
3435
3436         cs = emit(ce, start);
3437         GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
3438         while ((unsigned long)cs % CACHELINE_BYTES)
3439                 *cs++ = MI_NOOP;
3440
3441         lrc_ring_setup_indirect_ctx(ce->lrc_reg_state, engine,
3442                                     i915_ggtt_offset(ce->state) +
3443                                     context_wa_bb_offset(ce),
3444                                     (cs - start) * sizeof(*cs));
3445 }
3446
3447 static void
3448 __execlists_update_reg_state(const struct intel_context *ce,
3449                              const struct intel_engine_cs *engine,
3450                              u32 head)
3451 {
3452         struct intel_ring *ring = ce->ring;
3453         u32 *regs = ce->lrc_reg_state;
3454
3455         GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
3456         GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
3457
3458         regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
3459         regs[CTX_RING_HEAD] = head;
3460         regs[CTX_RING_TAIL] = ring->tail;
3461         regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
3462
3463         /* RPCS */
3464         if (engine->class == RENDER_CLASS) {
3465                 regs[CTX_R_PWR_CLK_STATE] =
3466                         intel_sseu_make_rpcs(engine->gt, &ce->sseu);
3467
3468                 i915_oa_init_reg_state(ce, engine);
3469         }
3470
3471         if (ce->wa_bb_page) {
3472                 u32 *(*fn)(const struct intel_context *ce, u32 *cs);
3473
3474                 fn = gen12_emit_indirect_ctx_xcs;
3475                 if (ce->engine->class == RENDER_CLASS)
3476                         fn = gen12_emit_indirect_ctx_rcs;
3477
3478                 /* Mutually exclusive wrt to global indirect bb */
3479                 GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
3480                 setup_indirect_ctx_bb(ce, engine, fn);
3481         }
3482 }
3483
3484 static int
3485 execlists_context_pre_pin(struct intel_context *ce,
3486                           struct i915_gem_ww_ctx *ww, void **vaddr)
3487 {
3488         GEM_BUG_ON(!ce->state);
3489         GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
3490
3491         *vaddr = i915_gem_object_pin_map(ce->state->obj,
3492                                         i915_coherent_map_type(ce->engine->i915) |
3493                                         I915_MAP_OVERRIDE);
3494
3495         return PTR_ERR_OR_ZERO(*vaddr);
3496 }
3497
3498 static int
3499 __execlists_context_pin(struct intel_context *ce,
3500                         struct intel_engine_cs *engine,
3501                         void *vaddr)
3502 {
3503         ce->lrc.lrca = lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE;
3504         ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;
3505         __execlists_update_reg_state(ce, engine, ce->ring->tail);
3506
3507         return 0;
3508 }
3509
3510 static int execlists_context_pin(struct intel_context *ce, void *vaddr)
3511 {
3512         return __execlists_context_pin(ce, ce->engine, vaddr);
3513 }
3514
3515 static int execlists_context_alloc(struct intel_context *ce)
3516 {
3517         return __execlists_context_alloc(ce, ce->engine);
3518 }
3519
3520 static void execlists_context_reset(struct intel_context *ce)
3521 {
3522         CE_TRACE(ce, "reset\n");
3523         GEM_BUG_ON(!intel_context_is_pinned(ce));
3524
3525         intel_ring_reset(ce->ring, ce->ring->emit);
3526
3527         /* Scrub away the garbage */
3528         execlists_init_reg_state(ce->lrc_reg_state,
3529                                  ce, ce->engine, ce->ring, true);
3530         __execlists_update_reg_state(ce, ce->engine, ce->ring->tail);
3531
3532         ce->lrc.desc |= CTX_DESC_FORCE_RESTORE;
3533 }
3534
3535 static const struct intel_context_ops execlists_context_ops = {
3536         .alloc = execlists_context_alloc,
3537
3538         .pre_pin = execlists_context_pre_pin,
3539         .pin = execlists_context_pin,
3540         .unpin = execlists_context_unpin,
3541         .post_unpin = execlists_context_post_unpin,
3542
3543         .enter = intel_context_enter_engine,
3544         .exit = intel_context_exit_engine,
3545
3546         .reset = execlists_context_reset,
3547         .destroy = execlists_context_destroy,
3548 };
3549
3550 static u32 hwsp_offset(const struct i915_request *rq)
3551 {
3552         const struct intel_timeline_cacheline *cl;
3553
3554         /* Before the request is executed, the timeline/cachline is fixed */
3555
3556         cl = rcu_dereference_protected(rq->hwsp_cacheline, 1);
3557         if (cl)
3558                 return cl->ggtt_offset;
3559
3560         return rcu_dereference_protected(rq->timeline, 1)->hwsp_offset;
3561 }
3562
3563 static int gen8_emit_init_breadcrumb(struct i915_request *rq)
3564 {
3565         u32 *cs;
3566
3567         GEM_BUG_ON(i915_request_has_initial_breadcrumb(rq));
3568         if (!i915_request_timeline(rq)->has_initial_breadcrumb)
3569                 return 0;
3570
3571         cs = intel_ring_begin(rq, 6);
3572         if (IS_ERR(cs))
3573                 return PTR_ERR(cs);
3574
3575         /*
3576          * Check if we have been preempted before we even get started.
3577          *
3578          * After this point i915_request_started() reports true, even if
3579          * we get preempted and so are no longer running.
3580          */
3581         *cs++ = MI_ARB_CHECK;
3582         *cs++ = MI_NOOP;
3583
3584         *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
3585         *cs++ = hwsp_offset(rq);
3586         *cs++ = 0;
3587         *cs++ = rq->fence.seqno - 1;
3588
3589         intel_ring_advance(rq, cs);
3590
3591         /* Record the updated position of the request's payload */
3592         rq->infix = intel_ring_offset(rq, cs);
3593
3594         __set_bit(I915_FENCE_FLAG_INITIAL_BREADCRUMB, &rq->fence.flags);
3595
3596         return 0;
3597 }
3598
3599 static int emit_pdps(struct i915_request *rq)
3600 {
3601         const struct intel_engine_cs * const engine = rq->engine;
3602         struct i915_ppgtt * const ppgtt = i915_vm_to_ppgtt(rq->context->vm);
3603         int err, i;
3604         u32 *cs;
3605
3606         GEM_BUG_ON(intel_vgpu_active(rq->engine->i915));
3607
3608         /*
3609          * Beware ye of the dragons, this sequence is magic!
3610          *
3611          * Small changes to this sequence can cause anything from
3612          * GPU hangs to forcewake errors and machine lockups!
3613          */
3614
3615         /* Flush any residual operations from the context load */
3616         err = engine->emit_flush(rq, EMIT_FLUSH);
3617         if (err)
3618                 return err;
3619
3620         /* Magic required to prevent forcewake errors! */
3621         err = engine->emit_flush(rq, EMIT_INVALIDATE);
3622         if (err)
3623                 return err;
3624
3625         cs = intel_ring_begin(rq, 4 * GEN8_3LVL_PDPES + 2);
3626         if (IS_ERR(cs))
3627                 return PTR_ERR(cs);
3628
3629         /* Ensure the LRI have landed before we invalidate & continue */
3630         *cs++ = MI_LOAD_REGISTER_IMM(2 * GEN8_3LVL_PDPES) | MI_LRI_FORCE_POSTED;
3631         for (i = GEN8_3LVL_PDPES; i--; ) {
3632                 const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i);
3633                 u32 base = engine->mmio_base;
3634
3635                 *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, i));
3636                 *cs++ = upper_32_bits(pd_daddr);
3637                 *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, i));
3638                 *cs++ = lower_32_bits(pd_daddr);
3639         }
3640         *cs++ = MI_NOOP;
3641
3642         intel_ring_advance(rq, cs);
3643
3644         return 0;
3645 }
3646
3647 static int execlists_request_alloc(struct i915_request *request)
3648 {
3649         int ret;
3650
3651         GEM_BUG_ON(!intel_context_is_pinned(request->context));
3652
3653         /*
3654          * Flush enough space to reduce the likelihood of waiting after
3655          * we start building the request - in which case we will just
3656          * have to repeat work.
3657          */
3658         request->reserved_space += EXECLISTS_REQUEST_SIZE;
3659
3660         /*
3661          * Note that after this point, we have committed to using
3662          * this request as it is being used to both track the
3663          * state of engine initialisation and liveness of the
3664          * golden renderstate above. Think twice before you try
3665          * to cancel/unwind this request now.
3666          */
3667
3668         if (!i915_vm_is_4lvl(request->context->vm)) {
3669                 ret = emit_pdps(request);
3670                 if (ret)
3671                         return ret;
3672         }
3673
3674         /* Unconditionally invalidate GPU caches and TLBs. */
3675         ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
3676         if (ret)
3677                 return ret;
3678
3679         request->reserved_space -= EXECLISTS_REQUEST_SIZE;
3680         return 0;
3681 }
3682
3683 /*
3684  * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
3685  * PIPE_CONTROL instruction. This is required for the flush to happen correctly
3686  * but there is a slight complication as this is applied in WA batch where the
3687  * values are only initialized once so we cannot take register value at the
3688  * beginning and reuse it further; hence we save its value to memory, upload a
3689  * constant value with bit21 set and then we restore it back with the saved value.
3690  * To simplify the WA, a constant value is formed by using the default value
3691  * of this register. This shouldn't be a problem because we are only modifying
3692  * it for a short period and this batch in non-premptible. We can ofcourse
3693  * use additional instructions that read the actual value of the register
3694  * at that time and set our bit of interest but it makes the WA complicated.
3695  *
3696  * This WA is also required for Gen9 so extracting as a function avoids
3697  * code duplication.
3698  */
3699 static u32 *
3700 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
3701 {
3702         /* NB no one else is allowed to scribble over scratch + 256! */
3703         *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3704         *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3705         *batch++ = intel_gt_scratch_offset(engine->gt,
3706                                            INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3707         *batch++ = 0;
3708
3709         *batch++ = MI_LOAD_REGISTER_IMM(1);
3710         *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3711         *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
3712
3713         batch = gen8_emit_pipe_control(batch,
3714                                        PIPE_CONTROL_CS_STALL |
3715                                        PIPE_CONTROL_DC_FLUSH_ENABLE,
3716                                        0);
3717
3718         *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3719         *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3720         *batch++ = intel_gt_scratch_offset(engine->gt,
3721                                            INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3722         *batch++ = 0;
3723
3724         return batch;
3725 }
3726
3727 /*
3728  * Typically we only have one indirect_ctx and per_ctx batch buffer which are
3729  * initialized at the beginning and shared across all contexts but this field
3730  * helps us to have multiple batches at different offsets and select them based
3731  * on a criteria. At the moment this batch always start at the beginning of the page
3732  * and at this point we don't have multiple wa_ctx batch buffers.
3733  *
3734  * The number of WA applied are not known at the beginning; we use this field
3735  * to return the no of DWORDS written.
3736  *
3737  * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
3738  * so it adds NOOPs as padding to make it cacheline aligned.
3739  * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together
3740  * makes a complete batch buffer.
3741  */
3742 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3743 {
3744         /* WaDisableCtxRestoreArbitration:bdw,chv */
3745         *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3746
3747         /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
3748         if (IS_BROADWELL(engine->i915))
3749                 batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3750
3751         /* WaClearSlmSpaceAtContextSwitch:bdw,chv */
3752         /* Actual scratch location is at 128 bytes offset */
3753         batch = gen8_emit_pipe_control(batch,
3754                                        PIPE_CONTROL_FLUSH_L3 |
3755                                        PIPE_CONTROL_STORE_DATA_INDEX |
3756                                        PIPE_CONTROL_CS_STALL |
3757                                        PIPE_CONTROL_QW_WRITE,
3758                                        LRC_PPHWSP_SCRATCH_ADDR);
3759
3760         *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3761
3762         /* Pad to end of cacheline */
3763         while ((unsigned long)batch % CACHELINE_BYTES)
3764                 *batch++ = MI_NOOP;
3765
3766         /*
3767          * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
3768          * execution depends on the length specified in terms of cache lines
3769          * in the register CTX_RCS_INDIRECT_CTX
3770          */
3771
3772         return batch;
3773 }
3774
3775 struct lri {
3776         i915_reg_t reg;
3777         u32 value;
3778 };
3779
3780 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
3781 {
3782         GEM_BUG_ON(!count || count > 63);
3783
3784         *batch++ = MI_LOAD_REGISTER_IMM(count);
3785         do {
3786                 *batch++ = i915_mmio_reg_offset(lri->reg);
3787                 *batch++ = lri->value;
3788         } while (lri++, --count);
3789         *batch++ = MI_NOOP;
3790
3791         return batch;
3792 }
3793
3794 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3795 {
3796         static const struct lri lri[] = {
3797                 /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
3798                 {
3799                         COMMON_SLICE_CHICKEN2,
3800                         __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
3801                                        0),
3802                 },
3803
3804                 /* BSpec: 11391 */
3805                 {
3806                         FF_SLICE_CHICKEN,
3807                         __MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
3808                                        FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
3809                 },
3810
3811                 /* BSpec: 11299 */
3812                 {
3813                         _3D_CHICKEN3,
3814                         __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
3815                                        _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
3816                 }
3817         };
3818
3819         *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3820
3821         /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
3822         batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3823
3824         /* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
3825         batch = gen8_emit_pipe_control(batch,
3826                                        PIPE_CONTROL_FLUSH_L3 |
3827                                        PIPE_CONTROL_STORE_DATA_INDEX |
3828                                        PIPE_CONTROL_CS_STALL |
3829                                        PIPE_CONTROL_QW_WRITE,
3830                                        LRC_PPHWSP_SCRATCH_ADDR);
3831
3832         batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
3833
3834         /* WaMediaPoolStateCmdInWABB:bxt,glk */
3835         if (HAS_POOLED_EU(engine->i915)) {
3836                 /*
3837                  * EU pool configuration is setup along with golden context
3838                  * during context initialization. This value depends on
3839                  * device type (2x6 or 3x6) and needs to be updated based
3840                  * on which subslice is disabled especially for 2x6
3841                  * devices, however it is safe to load default
3842                  * configuration of 3x6 device instead of masking off
3843                  * corresponding bits because HW ignores bits of a disabled
3844                  * subslice and drops down to appropriate config. Please
3845                  * see render_state_setup() in i915_gem_render_state.c for
3846                  * possible configurations, to avoid duplication they are
3847                  * not shown here again.
3848                  */
3849                 *batch++ = GEN9_MEDIA_POOL_STATE;
3850                 *batch++ = GEN9_MEDIA_POOL_ENABLE;
3851                 *batch++ = 0x00777000;
3852                 *batch++ = 0;
3853                 *batch++ = 0;
3854                 *batch++ = 0;
3855         }
3856
3857         *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3858
3859         /* Pad to end of cacheline */
3860         while ((unsigned long)batch % CACHELINE_BYTES)
3861                 *batch++ = MI_NOOP;
3862
3863         return batch;
3864 }
3865
3866 static u32 *
3867 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3868 {
3869         int i;
3870
3871         /*
3872          * WaPipeControlBefore3DStateSamplePattern: cnl
3873          *
3874          * Ensure the engine is idle prior to programming a
3875          * 3DSTATE_SAMPLE_PATTERN during a context restore.
3876          */
3877         batch = gen8_emit_pipe_control(batch,
3878                                        PIPE_CONTROL_CS_STALL,
3879                                        0);
3880         /*
3881          * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
3882          * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
3883          * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
3884          * confusing. Since gen8_emit_pipe_control() already advances the
3885          * batch by 6 dwords, we advance the other 10 here, completing a
3886          * cacheline. It's not clear if the workaround requires this padding
3887          * before other commands, or if it's just the regular padding we would
3888          * already have for the workaround bb, so leave it here for now.
3889          */
3890         for (i = 0; i < 10; i++)
3891                 *batch++ = MI_NOOP;
3892
3893         /* Pad to end of cacheline */
3894         while ((unsigned long)batch % CACHELINE_BYTES)
3895                 *batch++ = MI_NOOP;
3896
3897         return batch;
3898 }
3899
3900 #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
3901
3902 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
3903 {
3904         struct drm_i915_gem_object *obj;
3905         struct i915_vma *vma;
3906         int err;
3907
3908         obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE);
3909         if (IS_ERR(obj))
3910                 return PTR_ERR(obj);
3911
3912         vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
3913         if (IS_ERR(vma)) {
3914                 err = PTR_ERR(vma);
3915                 goto err;
3916         }
3917
3918         err = i915_ggtt_pin(vma, NULL, 0, PIN_HIGH);
3919         if (err)
3920                 goto err;
3921
3922         engine->wa_ctx.vma = vma;
3923         return 0;
3924
3925 err:
3926         i915_gem_object_put(obj);
3927         return err;
3928 }
3929
3930 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine)
3931 {
3932         i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
3933 }
3934
3935 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
3936
3937 static int intel_init_workaround_bb(struct intel_engine_cs *engine)
3938 {
3939         struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
3940         struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx,
3941                                             &wa_ctx->per_ctx };
3942         wa_bb_func_t wa_bb_fn[2];
3943         void *batch, *batch_ptr;
3944         unsigned int i;
3945         int ret;
3946
3947         if (engine->class != RENDER_CLASS)
3948                 return 0;
3949
3950         switch (INTEL_GEN(engine->i915)) {
3951         case 12:
3952         case 11:
3953                 return 0;
3954         case 10:
3955                 wa_bb_fn[0] = gen10_init_indirectctx_bb;
3956                 wa_bb_fn[1] = NULL;
3957                 break;
3958         case 9:
3959                 wa_bb_fn[0] = gen9_init_indirectctx_bb;
3960                 wa_bb_fn[1] = NULL;
3961                 break;
3962         case 8:
3963                 wa_bb_fn[0] = gen8_init_indirectctx_bb;
3964                 wa_bb_fn[1] = NULL;
3965                 break;
3966         default:
3967                 MISSING_CASE(INTEL_GEN(engine->i915));
3968                 return 0;
3969         }
3970
3971         ret = lrc_setup_wa_ctx(engine);
3972         if (ret) {
3973                 drm_dbg(&engine->i915->drm,
3974                         "Failed to setup context WA page: %d\n", ret);
3975                 return ret;
3976         }
3977
3978         batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);
3979
3980         /*
3981          * Emit the two workaround batch buffers, recording the offset from the
3982          * start of the workaround batch buffer object for each and their
3983          * respective sizes.
3984          */
3985         batch_ptr = batch;
3986         for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
3987                 wa_bb[i]->offset = batch_ptr - batch;
3988                 if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
3989                                                   CACHELINE_BYTES))) {
3990                         ret = -EINVAL;
3991                         break;
3992                 }
3993                 if (wa_bb_fn[i])
3994                         batch_ptr = wa_bb_fn[i](engine, batch_ptr);
3995                 wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
3996         }
3997         GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
3998
3999         __i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
4000         __i915_gem_object_release_map(wa_ctx->vma->obj);
4001         if (ret)
4002                 lrc_destroy_wa_ctx(engine);
4003
4004         return ret;
4005 }
4006
4007 static void reset_csb_pointers(struct intel_engine_cs *engine)
4008 {
4009         struct intel_engine_execlists * const execlists = &engine->execlists;
4010         const unsigned int reset_value = execlists->csb_size - 1;
4011
4012         ring_set_paused(engine, 0);
4013
4014         /*
4015          * Sometimes Icelake forgets to reset its pointers on a GPU reset.
4016          * Bludgeon them with a mmio update to be sure.
4017          */
4018         ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
4019                      0xffff << 16 | reset_value << 8 | reset_value);
4020         ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
4021
4022         /*
4023          * After a reset, the HW starts writing into CSB entry [0]. We
4024          * therefore have to set our HEAD pointer back one entry so that
4025          * the *first* entry we check is entry 0. To complicate this further,
4026          * as we don't wait for the first interrupt after reset, we have to
4027          * fake the HW write to point back to the last entry so that our
4028          * inline comparison of our cached head position against the last HW
4029          * write works even before the first interrupt.
4030          */
4031         execlists->csb_head = reset_value;
4032         WRITE_ONCE(*execlists->csb_write, reset_value);
4033         wmb(); /* Make sure this is visible to HW (paranoia?) */
4034
4035         /* Check that the GPU does indeed update the CSB entries! */
4036         memset(execlists->csb_status, -1, (reset_value + 1) * sizeof(u64));
4037         invalidate_csb_entries(&execlists->csb_status[0],
4038                                &execlists->csb_status[reset_value]);
4039
4040         /* Once more for luck and our trusty paranoia */
4041         ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
4042                      0xffff << 16 | reset_value << 8 | reset_value);
4043         ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
4044
4045         GEM_BUG_ON(READ_ONCE(*execlists->csb_write) != reset_value);
4046 }
4047
4048 static void execlists_sanitize(struct intel_engine_cs *engine)
4049 {
4050         /*
4051          * Poison residual state on resume, in case the suspend didn't!
4052          *
4053          * We have to assume that across suspend/resume (or other loss
4054          * of control) that the contents of our pinned buffers has been
4055          * lost, replaced by garbage. Since this doesn't always happen,
4056          * let's poison such state so that we more quickly spot when
4057          * we falsely assume it has been preserved.
4058          */
4059         if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
4060                 memset(engine->status_page.addr, POISON_INUSE, PAGE_SIZE);
4061
4062         reset_csb_pointers(engine);
4063
4064         /*
4065          * The kernel_context HWSP is stored in the status_page. As above,
4066          * that may be lost on resume/initialisation, and so we need to
4067          * reset the value in the HWSP.
4068          */
4069         intel_timeline_reset_seqno(engine->kernel_context->timeline);
4070
4071         /* And scrub the dirty cachelines for the HWSP */
4072         clflush_cache_range(engine->status_page.addr, PAGE_SIZE);
4073 }
4074
4075 static void enable_error_interrupt(struct intel_engine_cs *engine)
4076 {
4077         u32 status;
4078
4079         engine->execlists.error_interrupt = 0;
4080         ENGINE_WRITE(engine, RING_EMR, ~0u);
4081         ENGINE_WRITE(engine, RING_EIR, ~0u); /* clear all existing errors */
4082
4083         status = ENGINE_READ(engine, RING_ESR);
4084         if (unlikely(status)) {
4085                 drm_err(&engine->i915->drm,
4086                         "engine '%s' resumed still in error: %08x\n",
4087                         engine->name, status);
4088                 __intel_gt_reset(engine->gt, engine->mask);
4089         }
4090
4091         /*
4092          * On current gen8+, we have 2 signals to play with
4093          *
4094          * - I915_ERROR_INSTUCTION (bit 0)
4095          *
4096          *    Generate an error if the command parser encounters an invalid
4097          *    instruction
4098          *
4099          *    This is a fatal error.
4100          *
4101          * - CP_PRIV (bit 2)
4102          *
4103          *    Generate an error on privilege violation (where the CP replaces
4104          *    the instruction with a no-op). This also fires for writes into
4105          *    read-only scratch pages.
4106          *
4107          *    This is a non-fatal error, parsing continues.
4108          *
4109          * * there are a few others defined for odd HW that we do not use
4110          *
4111          * Since CP_PRIV fires for cases where we have chosen to ignore the
4112          * error (as the HW is validating and suppressing the mistakes), we
4113          * only unmask the instruction error bit.
4114          */
4115         ENGINE_WRITE(engine, RING_EMR, ~I915_ERROR_INSTRUCTION);
4116 }
4117
4118 static void enable_execlists(struct intel_engine_cs *engine)
4119 {
4120         u32 mode;
4121
4122         assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL);
4123
4124         intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */
4125
4126         if (INTEL_GEN(engine->i915) >= 11)
4127                 mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE);
4128         else
4129                 mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE);
4130         ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode);
4131
4132         ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
4133
4134         ENGINE_WRITE_FW(engine,
4135                         RING_HWS_PGA,
4136                         i915_ggtt_offset(engine->status_page.vma));
4137         ENGINE_POSTING_READ(engine, RING_HWS_PGA);
4138
4139         enable_error_interrupt(engine);
4140
4141         engine->context_tag = GENMASK(BITS_PER_LONG - 2, 0);
4142 }
4143
4144 static bool unexpected_starting_state(struct intel_engine_cs *engine)
4145 {
4146         bool unexpected = false;
4147
4148         if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) {
4149                 drm_dbg(&engine->i915->drm,
4150                         "STOP_RING still set in RING_MI_MODE\n");
4151                 unexpected = true;
4152         }
4153
4154         return unexpected;
4155 }
4156
4157 static int execlists_resume(struct intel_engine_cs *engine)
4158 {
4159         intel_mocs_init_engine(engine);
4160
4161         intel_breadcrumbs_reset(engine->breadcrumbs);
4162
4163         if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) {
4164                 struct drm_printer p = drm_debug_printer(__func__);
4165
4166                 intel_engine_dump(engine, &p, NULL);
4167         }
4168
4169         enable_execlists(engine);
4170
4171         return 0;
4172 }
4173
4174 static void execlists_reset_prepare(struct intel_engine_cs *engine)
4175 {
4176         struct intel_engine_execlists * const execlists = &engine->execlists;
4177         unsigned long flags;
4178
4179         ENGINE_TRACE(engine, "depth<-%d\n",
4180                      atomic_read(&execlists->tasklet.count));
4181
4182         /*
4183          * Prevent request submission to the hardware until we have
4184          * completed the reset in i915_gem_reset_finish(). If a request
4185          * is completed by one engine, it may then queue a request
4186          * to a second via its execlists->tasklet *just* as we are
4187          * calling engine->resume() and also writing the ELSP.
4188          * Turning off the execlists->tasklet until the reset is over
4189          * prevents the race.
4190          */
4191         __tasklet_disable_sync_once(&execlists->tasklet);
4192         GEM_BUG_ON(!reset_in_progress(execlists));
4193
4194         /* And flush any current direct submission. */
4195         spin_lock_irqsave(&engine->active.lock, flags);
4196         spin_unlock_irqrestore(&engine->active.lock, flags);
4197
4198         /*
4199          * We stop engines, otherwise we might get failed reset and a
4200          * dead gpu (on elk). Also as modern gpu as kbl can suffer
4201          * from system hang if batchbuffer is progressing when
4202          * the reset is issued, regardless of READY_TO_RESET ack.
4203          * Thus assume it is best to stop engines on all gens
4204          * where we have a gpu reset.
4205          *
4206          * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
4207          *
4208          * FIXME: Wa for more modern gens needs to be validated
4209          */
4210         ring_set_paused(engine, 1);
4211         intel_engine_stop_cs(engine);
4212
4213         engine->execlists.reset_ccid = active_ccid(engine);
4214 }
4215
4216 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
4217 {
4218         int x;
4219
4220         x = lrc_ring_mi_mode(engine);
4221         if (x != -1) {
4222                 regs[x + 1] &= ~STOP_RING;
4223                 regs[x + 1] |= STOP_RING << 16;
4224         }
4225 }
4226
4227 static void __execlists_reset_reg_state(const struct intel_context *ce,
4228                                         const struct intel_engine_cs *engine)
4229 {
4230         u32 *regs = ce->lrc_reg_state;
4231
4232         __reset_stop_ring(regs, engine);
4233 }
4234
4235 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
4236 {
4237         struct intel_engine_execlists * const execlists = &engine->execlists;
4238         struct intel_context *ce;
4239         struct i915_request *rq;
4240         u32 head;
4241
4242         mb(); /* paranoia: read the CSB pointers from after the reset */
4243         clflush(execlists->csb_write);
4244         mb();
4245
4246         process_csb(engine); /* drain preemption events */
4247
4248         /* Following the reset, we need to reload the CSB read/write pointers */
4249         reset_csb_pointers(engine);
4250
4251         /*
4252          * Save the currently executing context, even if we completed
4253          * its request, it was still running at the time of the
4254          * reset and will have been clobbered.
4255          */
4256         rq = active_context(engine, engine->execlists.reset_ccid);
4257         if (!rq)
4258                 goto unwind;
4259
4260         ce = rq->context;
4261         GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
4262
4263         if (i915_request_completed(rq)) {
4264                 /* Idle context; tidy up the ring so we can restart afresh */
4265                 head = intel_ring_wrap(ce->ring, rq->tail);
4266                 goto out_replay;
4267         }
4268
4269         /* We still have requests in-flight; the engine should be active */
4270         GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
4271
4272         /* Context has requests still in-flight; it should not be idle! */
4273         GEM_BUG_ON(i915_active_is_idle(&ce->active));
4274
4275         rq = active_request(ce->timeline, rq);
4276         head = intel_ring_wrap(ce->ring, rq->head);
4277         GEM_BUG_ON(head == ce->ring->tail);
4278
4279         /*
4280          * If this request hasn't started yet, e.g. it is waiting on a
4281          * semaphore, we need to avoid skipping the request or else we
4282          * break the signaling chain. However, if the context is corrupt
4283          * the request will not restart and we will be stuck with a wedged
4284          * device. It is quite often the case that if we issue a reset
4285          * while the GPU is loading the context image, that the context
4286          * image becomes corrupt.
4287          *
4288          * Otherwise, if we have not started yet, the request should replay
4289          * perfectly and we do not need to flag the result as being erroneous.
4290          */
4291         if (!i915_request_started(rq))
4292                 goto out_replay;
4293
4294         /*
4295          * If the request was innocent, we leave the request in the ELSP
4296          * and will try to replay it on restarting. The context image may
4297          * have been corrupted by the reset, in which case we may have
4298          * to service a new GPU hang, but more likely we can continue on
4299          * without impact.
4300          *
4301          * If the request was guilty, we presume the context is corrupt
4302          * and have to at least restore the RING register in the context
4303          * image back to the expected values to skip over the guilty request.
4304          */
4305         __i915_request_reset(rq, stalled);
4306
4307         /*
4308          * We want a simple context + ring to execute the breadcrumb update.
4309          * We cannot rely on the context being intact across the GPU hang,
4310          * so clear it and rebuild just what we need for the breadcrumb.
4311          * All pending requests for this context will be zapped, and any
4312          * future request will be after userspace has had the opportunity
4313          * to recreate its own state.
4314          */
4315 out_replay:
4316         ENGINE_TRACE(engine, "replay {head:%04x, tail:%04x}\n",
4317                      head, ce->ring->tail);
4318         __execlists_reset_reg_state(ce, engine);
4319         __execlists_update_reg_state(ce, engine, head);
4320         ce->lrc.desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */
4321
4322 unwind:
4323         /* Push back any incomplete requests for replay after the reset. */
4324         cancel_port_requests(execlists);
4325         __unwind_incomplete_requests(engine);
4326 }
4327
4328 static void execlists_reset_rewind(struct intel_engine_cs *engine, bool stalled)
4329 {
4330         unsigned long flags;
4331
4332         ENGINE_TRACE(engine, "\n");
4333
4334         spin_lock_irqsave(&engine->active.lock, flags);
4335
4336         __execlists_reset(engine, stalled);
4337
4338         spin_unlock_irqrestore(&engine->active.lock, flags);
4339 }
4340
4341 static void nop_submission_tasklet(unsigned long data)
4342 {
4343         struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
4344
4345         /* The driver is wedged; don't process any more events. */
4346         WRITE_ONCE(engine->execlists.queue_priority_hint, INT_MIN);
4347 }
4348
4349 static void execlists_reset_cancel(struct intel_engine_cs *engine)
4350 {
4351         struct intel_engine_execlists * const execlists = &engine->execlists;
4352         struct i915_request *rq, *rn;
4353         struct rb_node *rb;
4354         unsigned long flags;
4355
4356         ENGINE_TRACE(engine, "\n");
4357
4358         /*
4359          * Before we call engine->cancel_requests(), we should have exclusive
4360          * access to the submission state. This is arranged for us by the
4361          * caller disabling the interrupt generation, the tasklet and other
4362          * threads that may then access the same state, giving us a free hand
4363          * to reset state. However, we still need to let lockdep be aware that
4364          * we know this state may be accessed in hardirq context, so we
4365          * disable the irq around this manipulation and we want to keep
4366          * the spinlock focused on its duties and not accidentally conflate
4367          * coverage to the submission's irq state. (Similarly, although we
4368          * shouldn't need to disable irq around the manipulation of the
4369          * submission's irq state, we also wish to remind ourselves that
4370          * it is irq state.)
4371          */
4372         spin_lock_irqsave(&engine->active.lock, flags);
4373
4374         __execlists_reset(engine, true);
4375
4376         /* Mark all executing requests as skipped. */
4377         list_for_each_entry(rq, &engine->active.requests, sched.link)
4378                 mark_eio(rq);
4379
4380         /* Flush the queued requests to the timeline list (for retiring). */
4381         while ((rb = rb_first_cached(&execlists->queue))) {
4382                 struct i915_priolist *p = to_priolist(rb);
4383                 int i;
4384
4385                 priolist_for_each_request_consume(rq, rn, p, i) {
4386                         mark_eio(rq);
4387                         __i915_request_submit(rq);
4388                 }
4389
4390                 rb_erase_cached(&p->node, &execlists->queue);
4391                 i915_priolist_free(p);
4392         }
4393
4394         /* On-hold requests will be flushed to timeline upon their release */
4395         list_for_each_entry(rq, &engine->active.hold, sched.link)
4396                 mark_eio(rq);
4397
4398         /* Cancel all attached virtual engines */
4399         while ((rb = rb_first_cached(&execlists->virtual))) {
4400                 struct virtual_engine *ve =
4401                         rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
4402
4403                 rb_erase_cached(rb, &execlists->virtual);
4404                 RB_CLEAR_NODE(rb);
4405
4406                 spin_lock(&ve->base.active.lock);
4407                 rq = fetch_and_zero(&ve->request);
4408                 if (rq) {
4409                         mark_eio(rq);
4410
4411                         rq->engine = engine;
4412                         __i915_request_submit(rq);
4413                         i915_request_put(rq);
4414
4415                         ve->base.execlists.queue_priority_hint = INT_MIN;
4416                 }
4417                 spin_unlock(&ve->base.active.lock);
4418         }
4419
4420         /* Remaining _unready_ requests will be nop'ed when submitted */
4421
4422         execlists->queue_priority_hint = INT_MIN;
4423         execlists->queue = RB_ROOT_CACHED;
4424
4425         GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet));
4426         execlists->tasklet.func = nop_submission_tasklet;
4427
4428         spin_unlock_irqrestore(&engine->active.lock, flags);
4429 }
4430
4431 static void execlists_reset_finish(struct intel_engine_cs *engine)
4432 {
4433         struct intel_engine_execlists * const execlists = &engine->execlists;
4434
4435         /*
4436          * After a GPU reset, we may have requests to replay. Do so now while
4437          * we still have the forcewake to be sure that the GPU is not allowed
4438          * to sleep before we restart and reload a context.
4439          */
4440         GEM_BUG_ON(!reset_in_progress(execlists));
4441         if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
4442                 execlists->tasklet.func(execlists->tasklet.data);
4443
4444         if (__tasklet_enable(&execlists->tasklet))
4445                 /* And kick in case we missed a new request submission. */
4446                 tasklet_hi_schedule(&execlists->tasklet);
4447         ENGINE_TRACE(engine, "depth->%d\n",
4448                      atomic_read(&execlists->tasklet.count));
4449 }
4450
4451 static int gen8_emit_bb_start_noarb(struct i915_request *rq,
4452                                     u64 offset, u32 len,
4453                                     const unsigned int flags)
4454 {
4455         u32 *cs;
4456
4457         cs = intel_ring_begin(rq, 4);
4458         if (IS_ERR(cs))
4459                 return PTR_ERR(cs);
4460
4461         /*
4462          * WaDisableCtxRestoreArbitration:bdw,chv
4463          *
4464          * We don't need to perform MI_ARB_ENABLE as often as we do (in
4465          * particular all the gen that do not need the w/a at all!), if we
4466          * took care to make sure that on every switch into this context
4467          * (both ordinary and for preemption) that arbitrartion was enabled
4468          * we would be fine.  However, for gen8 there is another w/a that
4469          * requires us to not preempt inside GPGPU execution, so we keep
4470          * arbitration disabled for gen8 batches. Arbitration will be
4471          * re-enabled before we close the request
4472          * (engine->emit_fini_breadcrumb).
4473          */
4474         *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
4475
4476         /* FIXME(BDW+): Address space and security selectors. */
4477         *cs++ = MI_BATCH_BUFFER_START_GEN8 |
4478                 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
4479         *cs++ = lower_32_bits(offset);
4480         *cs++ = upper_32_bits(offset);
4481
4482         intel_ring_advance(rq, cs);
4483
4484         return 0;
4485 }
4486
4487 static int gen8_emit_bb_start(struct i915_request *rq,
4488                               u64 offset, u32 len,
4489                               const unsigned int flags)
4490 {
4491         u32 *cs;
4492
4493         cs = intel_ring_begin(rq, 6);
4494         if (IS_ERR(cs))
4495                 return PTR_ERR(cs);
4496
4497         *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4498
4499         *cs++ = MI_BATCH_BUFFER_START_GEN8 |
4500                 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
4501         *cs++ = lower_32_bits(offset);
4502         *cs++ = upper_32_bits(offset);
4503
4504         *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
4505         *cs++ = MI_NOOP;
4506
4507         intel_ring_advance(rq, cs);
4508
4509         return 0;
4510 }
4511
4512 static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine)
4513 {
4514         ENGINE_WRITE(engine, RING_IMR,
4515                      ~(engine->irq_enable_mask | engine->irq_keep_mask));
4516         ENGINE_POSTING_READ(engine, RING_IMR);
4517 }
4518
4519 static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
4520 {
4521         ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
4522 }
4523
4524 static int gen8_emit_flush(struct i915_request *request, u32 mode)
4525 {
4526         u32 cmd, *cs;
4527
4528         cs = intel_ring_begin(request, 4);
4529         if (IS_ERR(cs))
4530                 return PTR_ERR(cs);
4531
4532         cmd = MI_FLUSH_DW + 1;
4533
4534         /* We always require a command barrier so that subsequent
4535          * commands, such as breadcrumb interrupts, are strictly ordered
4536          * wrt the contents of the write cache being flushed to memory
4537          * (and thus being coherent from the CPU).
4538          */
4539         cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
4540
4541         if (mode & EMIT_INVALIDATE) {
4542                 cmd |= MI_INVALIDATE_TLB;
4543                 if (request->engine->class == VIDEO_DECODE_CLASS)
4544                         cmd |= MI_INVALIDATE_BSD;
4545         }
4546
4547         *cs++ = cmd;
4548         *cs++ = LRC_PPHWSP_SCRATCH_ADDR;
4549         *cs++ = 0; /* upper addr */
4550         *cs++ = 0; /* value */
4551         intel_ring_advance(request, cs);
4552
4553         return 0;
4554 }
4555
4556 static int gen8_emit_flush_render(struct i915_request *request,
4557                                   u32 mode)
4558 {
4559         bool vf_flush_wa = false, dc_flush_wa = false;
4560         u32 *cs, flags = 0;
4561         int len;
4562
4563         flags |= PIPE_CONTROL_CS_STALL;
4564
4565         if (mode & EMIT_FLUSH) {
4566                 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4567                 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4568                 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4569                 flags |= PIPE_CONTROL_FLUSH_ENABLE;
4570         }
4571
4572         if (mode & EMIT_INVALIDATE) {
4573                 flags |= PIPE_CONTROL_TLB_INVALIDATE;
4574                 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4575                 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4576                 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4577                 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4578                 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4579                 flags |= PIPE_CONTROL_QW_WRITE;
4580                 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4581
4582                 /*
4583                  * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
4584                  * pipe control.
4585                  */
4586                 if (IS_GEN(request->engine->i915, 9))
4587                         vf_flush_wa = true;
4588
4589                 /* WaForGAMHang:kbl */
4590                 if (IS_KBL_GT_REVID(request->engine->i915, 0, KBL_REVID_B0))
4591                         dc_flush_wa = true;
4592         }
4593
4594         len = 6;
4595
4596         if (vf_flush_wa)
4597                 len += 6;
4598
4599         if (dc_flush_wa)
4600                 len += 12;
4601
4602         cs = intel_ring_begin(request, len);
4603         if (IS_ERR(cs))
4604                 return PTR_ERR(cs);
4605
4606         if (vf_flush_wa)
4607                 cs = gen8_emit_pipe_control(cs, 0, 0);
4608
4609         if (dc_flush_wa)
4610                 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
4611                                             0);
4612
4613         cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4614
4615         if (dc_flush_wa)
4616                 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);
4617
4618         intel_ring_advance(request, cs);
4619
4620         return 0;
4621 }
4622
4623 static int gen11_emit_flush_render(struct i915_request *request,
4624                                    u32 mode)
4625 {
4626         if (mode & EMIT_FLUSH) {
4627                 u32 *cs;
4628                 u32 flags = 0;
4629
4630                 flags |= PIPE_CONTROL_CS_STALL;
4631
4632                 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
4633                 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4634                 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4635                 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4636                 flags |= PIPE_CONTROL_FLUSH_ENABLE;
4637                 flags |= PIPE_CONTROL_QW_WRITE;
4638                 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4639
4640                 cs = intel_ring_begin(request, 6);
4641                 if (IS_ERR(cs))
4642                         return PTR_ERR(cs);
4643
4644                 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4645                 intel_ring_advance(request, cs);
4646         }
4647
4648         if (mode & EMIT_INVALIDATE) {
4649                 u32 *cs;
4650                 u32 flags = 0;
4651
4652                 flags |= PIPE_CONTROL_CS_STALL;
4653
4654                 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
4655                 flags |= PIPE_CONTROL_TLB_INVALIDATE;
4656                 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4657                 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4658                 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4659                 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4660                 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4661                 flags |= PIPE_CONTROL_QW_WRITE;
4662                 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4663
4664                 cs = intel_ring_begin(request, 6);
4665                 if (IS_ERR(cs))
4666                         return PTR_ERR(cs);
4667
4668                 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4669                 intel_ring_advance(request, cs);
4670         }
4671
4672         return 0;
4673 }
4674
4675 static u32 preparser_disable(bool state)
4676 {
4677         return MI_ARB_CHECK | 1 << 8 | state;
4678 }
4679
4680 static i915_reg_t aux_inv_reg(const struct intel_engine_cs *engine)
4681 {
4682         static const i915_reg_t vd[] = {
4683                 GEN12_VD0_AUX_NV,
4684                 GEN12_VD1_AUX_NV,
4685                 GEN12_VD2_AUX_NV,
4686                 GEN12_VD3_AUX_NV,
4687         };
4688
4689         static const i915_reg_t ve[] = {
4690                 GEN12_VE0_AUX_NV,
4691                 GEN12_VE1_AUX_NV,
4692         };
4693
4694         if (engine->class == VIDEO_DECODE_CLASS)
4695                 return vd[engine->instance];
4696
4697         if (engine->class == VIDEO_ENHANCEMENT_CLASS)
4698                 return ve[engine->instance];
4699
4700         GEM_BUG_ON("unknown aux_inv_reg\n");
4701
4702         return INVALID_MMIO_REG;
4703 }
4704
4705 static u32 *
4706 gen12_emit_aux_table_inv(const i915_reg_t inv_reg, u32 *cs)
4707 {
4708         *cs++ = MI_LOAD_REGISTER_IMM(1);
4709         *cs++ = i915_mmio_reg_offset(inv_reg);
4710         *cs++ = AUX_INV;
4711         *cs++ = MI_NOOP;
4712
4713         return cs;
4714 }
4715
4716 static int gen12_emit_flush_render(struct i915_request *request,
4717                                    u32 mode)
4718 {
4719         if (mode & EMIT_FLUSH) {
4720                 u32 flags = 0;
4721                 u32 *cs;
4722
4723                 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
4724                 flags |= PIPE_CONTROL_FLUSH_L3;
4725                 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4726                 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4727                 /* Wa_1409600907:tgl */
4728                 flags |= PIPE_CONTROL_DEPTH_STALL;
4729                 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4730                 flags |= PIPE_CONTROL_FLUSH_ENABLE;
4731
4732                 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4733                 flags |= PIPE_CONTROL_QW_WRITE;
4734
4735                 flags |= PIPE_CONTROL_CS_STALL;
4736
4737                 cs = intel_ring_begin(request, 6);
4738                 if (IS_ERR(cs))
4739                         return PTR_ERR(cs);
4740
4741                 cs = gen12_emit_pipe_control(cs,
4742                                              PIPE_CONTROL0_HDC_PIPELINE_FLUSH,
4743                                              flags, LRC_PPHWSP_SCRATCH_ADDR);
4744                 intel_ring_advance(request, cs);
4745         }
4746
4747         if (mode & EMIT_INVALIDATE) {
4748                 u32 flags = 0;
4749                 u32 *cs;
4750
4751                 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
4752                 flags |= PIPE_CONTROL_TLB_INVALIDATE;
4753                 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4754                 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4755                 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4756                 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4757                 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4758
4759                 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4760                 flags |= PIPE_CONTROL_QW_WRITE;
4761
4762                 flags |= PIPE_CONTROL_CS_STALL;
4763
4764                 cs = intel_ring_begin(request, 8 + 4);
4765                 if (IS_ERR(cs))
4766                         return PTR_ERR(cs);
4767
4768                 /*
4769                  * Prevent the pre-parser from skipping past the TLB
4770                  * invalidate and loading a stale page for the batch
4771                  * buffer / request payload.
4772                  */
4773                 *cs++ = preparser_disable(true);
4774
4775                 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4776
4777                 /* hsdes: 1809175790 */
4778                 cs = gen12_emit_aux_table_inv(GEN12_GFX_CCS_AUX_NV, cs);
4779
4780                 *cs++ = preparser_disable(false);
4781                 intel_ring_advance(request, cs);
4782         }
4783
4784         return 0;
4785 }
4786
4787 static int gen12_emit_flush(struct i915_request *request, u32 mode)
4788 {
4789         intel_engine_mask_t aux_inv = 0;
4790         u32 cmd, *cs;
4791
4792         cmd = 4;
4793         if (mode & EMIT_INVALIDATE)
4794                 cmd += 2;
4795         if (mode & EMIT_INVALIDATE)
4796                 aux_inv = request->engine->mask & ~BIT(BCS0);
4797         if (aux_inv)
4798                 cmd += 2 * hweight8(aux_inv) + 2;
4799
4800         cs = intel_ring_begin(request, cmd);
4801         if (IS_ERR(cs))
4802                 return PTR_ERR(cs);
4803
4804         if (mode & EMIT_INVALIDATE)
4805                 *cs++ = preparser_disable(true);
4806
4807         cmd = MI_FLUSH_DW + 1;
4808
4809         /* We always require a command barrier so that subsequent
4810          * commands, such as breadcrumb interrupts, are strictly ordered
4811          * wrt the contents of the write cache being flushed to memory
4812          * (and thus being coherent from the CPU).
4813          */
4814         cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
4815
4816         if (mode & EMIT_INVALIDATE) {
4817                 cmd |= MI_INVALIDATE_TLB;
4818                 if (request->engine->class == VIDEO_DECODE_CLASS)
4819                         cmd |= MI_INVALIDATE_BSD;
4820         }
4821
4822         *cs++ = cmd;
4823         *cs++ = LRC_PPHWSP_SCRATCH_ADDR;
4824         *cs++ = 0; /* upper addr */
4825         *cs++ = 0; /* value */
4826
4827         if (aux_inv) { /* hsdes: 1809175790 */
4828                 struct intel_engine_cs *engine;
4829                 unsigned int tmp;
4830
4831                 *cs++ = MI_LOAD_REGISTER_IMM(hweight8(aux_inv));
4832                 for_each_engine_masked(engine, request->engine->gt,
4833                                        aux_inv, tmp) {
4834                         *cs++ = i915_mmio_reg_offset(aux_inv_reg(engine));
4835                         *cs++ = AUX_INV;
4836                 }
4837                 *cs++ = MI_NOOP;
4838         }
4839
4840         if (mode & EMIT_INVALIDATE)
4841                 *cs++ = preparser_disable(false);
4842
4843         intel_ring_advance(request, cs);
4844
4845         return 0;
4846 }
4847
4848 static void assert_request_valid(struct i915_request *rq)
4849 {
4850         struct intel_ring *ring __maybe_unused = rq->ring;
4851
4852         /* Can we unwind this request without appearing to go forwards? */
4853         GEM_BUG_ON(intel_ring_direction(ring, rq->wa_tail, rq->head) <= 0);
4854 }
4855
4856 /*
4857  * Reserve space for 2 NOOPs at the end of each request to be
4858  * used as a workaround for not being allowed to do lite
4859  * restore with HEAD==TAIL (WaIdleLiteRestore).
4860  */
4861 static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs)
4862 {
4863         /* Ensure there's always at least one preemption point per-request. */
4864         *cs++ = MI_ARB_CHECK;
4865         *cs++ = MI_NOOP;
4866         request->wa_tail = intel_ring_offset(request, cs);
4867
4868         /* Check that entire request is less than half the ring */
4869         assert_request_valid(request);
4870
4871         return cs;
4872 }
4873
4874 static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs)
4875 {
4876         *cs++ = MI_SEMAPHORE_WAIT |
4877                 MI_SEMAPHORE_GLOBAL_GTT |
4878                 MI_SEMAPHORE_POLL |
4879                 MI_SEMAPHORE_SAD_EQ_SDD;
4880         *cs++ = 0;
4881         *cs++ = intel_hws_preempt_address(request->engine);
4882         *cs++ = 0;
4883
4884         return cs;
4885 }
4886
4887 static __always_inline u32*
4888 gen8_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs)
4889 {
4890         *cs++ = MI_USER_INTERRUPT;
4891
4892         *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4893         if (intel_engine_has_semaphores(request->engine))
4894                 cs = emit_preempt_busywait(request, cs);
4895
4896         request->tail = intel_ring_offset(request, cs);
4897         assert_ring_tail_valid(request->ring, request->tail);
4898
4899         return gen8_emit_wa_tail(request, cs);
4900 }
4901
4902 static u32 *emit_xcs_breadcrumb(struct i915_request *rq, u32 *cs)
4903 {
4904         return gen8_emit_ggtt_write(cs, rq->fence.seqno, hwsp_offset(rq), 0);
4905 }
4906
4907 static u32 *gen8_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs)
4908 {
4909         return gen8_emit_fini_breadcrumb_tail(rq, emit_xcs_breadcrumb(rq, cs));
4910 }
4911
4912 static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4913 {
4914         cs = gen8_emit_pipe_control(cs,
4915                                     PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4916                                     PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4917                                     PIPE_CONTROL_DC_FLUSH_ENABLE,
4918                                     0);
4919
4920         /* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */
4921         cs = gen8_emit_ggtt_write_rcs(cs,
4922                                       request->fence.seqno,
4923                                       hwsp_offset(request),
4924                                       PIPE_CONTROL_FLUSH_ENABLE |
4925                                       PIPE_CONTROL_CS_STALL);
4926
4927         return gen8_emit_fini_breadcrumb_tail(request, cs);
4928 }
4929
4930 static u32 *
4931 gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4932 {
4933         cs = gen8_emit_ggtt_write_rcs(cs,
4934                                       request->fence.seqno,
4935                                       hwsp_offset(request),
4936                                       PIPE_CONTROL_CS_STALL |
4937                                       PIPE_CONTROL_TILE_CACHE_FLUSH |
4938                                       PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4939                                       PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4940                                       PIPE_CONTROL_DC_FLUSH_ENABLE |
4941                                       PIPE_CONTROL_FLUSH_ENABLE);
4942
4943         return gen8_emit_fini_breadcrumb_tail(request, cs);
4944 }
4945
4946 /*
4947  * Note that the CS instruction pre-parser will not stall on the breadcrumb
4948  * flush and will continue pre-fetching the instructions after it before the
4949  * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at
4950  * BB_START/END instructions, so, even though we might pre-fetch the pre-amble
4951  * of the next request before the memory has been flushed, we're guaranteed that
4952  * we won't access the batch itself too early.
4953  * However, on gen12+ the parser can pre-fetch across the BB_START/END commands,
4954  * so, if the current request is modifying an instruction in the next request on
4955  * the same intel_context, we might pre-fetch and then execute the pre-update
4956  * instruction. To avoid this, the users of self-modifying code should either
4957  * disable the parser around the code emitting the memory writes, via a new flag
4958  * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For
4959  * the in-kernel use-cases we've opted to use a separate context, see
4960  * reloc_gpu() as an example.
4961  * All the above applies only to the instructions themselves. Non-inline data
4962  * used by the instructions is not pre-fetched.
4963  */
4964
4965 static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs)
4966 {
4967         *cs++ = MI_SEMAPHORE_WAIT_TOKEN |
4968                 MI_SEMAPHORE_GLOBAL_GTT |
4969                 MI_SEMAPHORE_POLL |
4970                 MI_SEMAPHORE_SAD_EQ_SDD;
4971         *cs++ = 0;
4972         *cs++ = intel_hws_preempt_address(request->engine);
4973         *cs++ = 0;
4974         *cs++ = 0;
4975         *cs++ = MI_NOOP;
4976
4977         return cs;
4978 }
4979
4980 static __always_inline u32*
4981 gen12_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs)
4982 {
4983         *cs++ = MI_USER_INTERRUPT;
4984
4985         *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4986         if (intel_engine_has_semaphores(request->engine))
4987                 cs = gen12_emit_preempt_busywait(request, cs);
4988
4989         request->tail = intel_ring_offset(request, cs);
4990         assert_ring_tail_valid(request->ring, request->tail);
4991
4992         return gen8_emit_wa_tail(request, cs);
4993 }
4994
4995 static u32 *gen12_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs)
4996 {
4997         /* XXX Stalling flush before seqno write; post-sync not */
4998         cs = emit_xcs_breadcrumb(rq, __gen8_emit_flush_dw(cs, 0, 0, 0));
4999         return gen12_emit_fini_breadcrumb_tail(rq, cs);
5000 }
5001
5002 static u32 *
5003 gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
5004 {
5005         cs = gen12_emit_ggtt_write_rcs(cs,
5006                                        request->fence.seqno,
5007                                        hwsp_offset(request),
5008                                        PIPE_CONTROL0_HDC_PIPELINE_FLUSH,
5009                                        PIPE_CONTROL_CS_STALL |
5010                                        PIPE_CONTROL_TILE_CACHE_FLUSH |
5011                                        PIPE_CONTROL_FLUSH_L3 |
5012                                        PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
5013                                        PIPE_CONTROL_DEPTH_CACHE_FLUSH |
5014                                        /* Wa_1409600907:tgl */
5015                                        PIPE_CONTROL_DEPTH_STALL |
5016                                        PIPE_CONTROL_DC_FLUSH_ENABLE |
5017                                        PIPE_CONTROL_FLUSH_ENABLE);
5018
5019         return gen12_emit_fini_breadcrumb_tail(request, cs);
5020 }
5021
5022 static void execlists_park(struct intel_engine_cs *engine)
5023 {
5024         cancel_timer(&engine->execlists.timer);
5025         cancel_timer(&engine->execlists.preempt);
5026 }
5027
5028 void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
5029 {
5030         engine->submit_request = execlists_submit_request;
5031         engine->schedule = i915_schedule;
5032         engine->execlists.tasklet.func = execlists_submission_tasklet;
5033
5034         engine->reset.prepare = execlists_reset_prepare;
5035         engine->reset.rewind = execlists_reset_rewind;
5036         engine->reset.cancel = execlists_reset_cancel;
5037         engine->reset.finish = execlists_reset_finish;
5038
5039         engine->park = execlists_park;
5040         engine->unpark = NULL;
5041
5042         engine->flags |= I915_ENGINE_SUPPORTS_STATS;
5043         if (!intel_vgpu_active(engine->i915)) {
5044                 engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
5045                 if (HAS_LOGICAL_RING_PREEMPTION(engine->i915)) {
5046                         engine->flags |= I915_ENGINE_HAS_PREEMPTION;
5047                         if (IS_ACTIVE(CONFIG_DRM_I915_TIMESLICE_DURATION))
5048                                 engine->flags |= I915_ENGINE_HAS_TIMESLICES;
5049                 }
5050         }
5051
5052         if (INTEL_GEN(engine->i915) >= 12)
5053                 engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO;
5054
5055         if (intel_engine_has_preemption(engine))
5056                 engine->emit_bb_start = gen8_emit_bb_start;
5057         else
5058                 engine->emit_bb_start = gen8_emit_bb_start_noarb;
5059 }
5060
5061 static void execlists_shutdown(struct intel_engine_cs *engine)
5062 {
5063         /* Synchronise with residual timers and any softirq they raise */
5064         del_timer_sync(&engine->execlists.timer);
5065         del_timer_sync(&engine->execlists.preempt);
5066         tasklet_kill(&engine->execlists.tasklet);
5067 }
5068
5069 static void execlists_release(struct intel_engine_cs *engine)
5070 {
5071         engine->sanitize = NULL; /* no longer in control, nothing to sanitize */
5072
5073         execlists_shutdown(engine);
5074
5075         intel_engine_cleanup_common(engine);
5076         lrc_destroy_wa_ctx(engine);
5077 }
5078
5079 static void
5080 logical_ring_default_vfuncs(struct intel_engine_cs *engine)
5081 {
5082         /* Default vfuncs which can be overriden by each engine. */
5083
5084         engine->resume = execlists_resume;
5085
5086         engine->cops = &execlists_context_ops;
5087         engine->request_alloc = execlists_request_alloc;
5088
5089         engine->emit_flush = gen8_emit_flush;
5090         engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb;
5091         engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb;
5092         if (INTEL_GEN(engine->i915) >= 12) {
5093                 engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb;
5094                 engine->emit_flush = gen12_emit_flush;
5095         }
5096         engine->set_default_submission = intel_execlists_set_default_submission;
5097
5098         if (INTEL_GEN(engine->i915) < 11) {
5099                 engine->irq_enable = gen8_logical_ring_enable_irq;
5100                 engine->irq_disable = gen8_logical_ring_disable_irq;
5101         } else {
5102                 /*
5103                  * TODO: On Gen11 interrupt masks need to be clear
5104                  * to allow C6 entry. Keep interrupts enabled at
5105                  * and take the hit of generating extra interrupts
5106                  * until a more refined solution exists.
5107                  */
5108         }
5109 }
5110
5111 static inline void
5112 logical_ring_default_irqs(struct intel_engine_cs *engine)
5113 {
5114         unsigned int shift = 0;
5115
5116         if (INTEL_GEN(engine->i915) < 11) {
5117                 const u8 irq_shifts[] = {
5118                         [RCS0]  = GEN8_RCS_IRQ_SHIFT,
5119                         [BCS0]  = GEN8_BCS_IRQ_SHIFT,
5120                         [VCS0]  = GEN8_VCS0_IRQ_SHIFT,
5121                         [VCS1]  = GEN8_VCS1_IRQ_SHIFT,
5122                         [VECS0] = GEN8_VECS_IRQ_SHIFT,
5123                 };
5124
5125                 shift = irq_shifts[engine->id];
5126         }
5127
5128         engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
5129         engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
5130         engine->irq_keep_mask |= GT_CS_MASTER_ERROR_INTERRUPT << shift;
5131         engine->irq_keep_mask |= GT_WAIT_SEMAPHORE_INTERRUPT << shift;
5132 }
5133
5134 static void rcs_submission_override(struct intel_engine_cs *engine)
5135 {
5136         switch (INTEL_GEN(engine->i915)) {
5137         case 12:
5138                 engine->emit_flush = gen12_emit_flush_render;
5139                 engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs;
5140                 break;
5141         case 11:
5142                 engine->emit_flush = gen11_emit_flush_render;
5143                 engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs;
5144                 break;
5145         default:
5146                 engine->emit_flush = gen8_emit_flush_render;
5147                 engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs;
5148                 break;
5149         }
5150 }
5151
5152 int intel_execlists_submission_setup(struct intel_engine_cs *engine)
5153 {
5154         struct intel_engine_execlists * const execlists = &engine->execlists;
5155         struct drm_i915_private *i915 = engine->i915;
5156         struct intel_uncore *uncore = engine->uncore;
5157         u32 base = engine->mmio_base;
5158
5159         tasklet_init(&engine->execlists.tasklet,
5160                      execlists_submission_tasklet, (unsigned long)engine);
5161         timer_setup(&engine->execlists.timer, execlists_timeslice, 0);
5162         timer_setup(&engine->execlists.preempt, execlists_preempt, 0);
5163
5164         logical_ring_default_vfuncs(engine);
5165         logical_ring_default_irqs(engine);
5166
5167         if (engine->class == RENDER_CLASS)
5168                 rcs_submission_override(engine);
5169
5170         if (intel_init_workaround_bb(engine))
5171                 /*
5172                  * We continue even if we fail to initialize WA batch
5173                  * because we only expect rare glitches but nothing
5174                  * critical to prevent us from using GPU
5175                  */
5176                 drm_err(&i915->drm, "WA batch buffer initialization failed\n");
5177
5178         if (HAS_LOGICAL_RING_ELSQ(i915)) {
5179                 execlists->submit_reg = uncore->regs +
5180                         i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base));
5181                 execlists->ctrl_reg = uncore->regs +
5182                         i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base));
5183         } else {
5184                 execlists->submit_reg = uncore->regs +
5185                         i915_mmio_reg_offset(RING_ELSP(base));
5186         }
5187
5188         execlists->csb_status =
5189                 (u64 *)&engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX];
5190
5191         execlists->csb_write =
5192                 &engine->status_page.addr[intel_hws_csb_write_index(i915)];
5193
5194         if (INTEL_GEN(i915) < 11)
5195                 execlists->csb_size = GEN8_CSB_ENTRIES;
5196         else
5197                 execlists->csb_size = GEN11_CSB_ENTRIES;
5198
5199         if (INTEL_GEN(engine->i915) >= 11) {
5200                 execlists->ccid |= engine->instance << (GEN11_ENGINE_INSTANCE_SHIFT - 32);
5201                 execlists->ccid |= engine->class << (GEN11_ENGINE_CLASS_SHIFT - 32);
5202         }
5203
5204         /* Finally, take ownership and responsibility for cleanup! */
5205         engine->sanitize = execlists_sanitize;
5206         engine->release = execlists_release;
5207
5208         return 0;
5209 }
5210
5211 static void init_common_reg_state(u32 * const regs,
5212                                   const struct intel_engine_cs *engine,
5213                                   const struct intel_ring *ring,
5214                                   bool inhibit)
5215 {
5216         u32 ctl;
5217
5218         ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
5219         ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
5220         if (inhibit)
5221                 ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
5222         if (INTEL_GEN(engine->i915) < 11)
5223                 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
5224                                            CTX_CTRL_RS_CTX_ENABLE);
5225         regs[CTX_CONTEXT_CONTROL] = ctl;
5226
5227         regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
5228         regs[CTX_TIMESTAMP] = 0;
5229 }
5230
5231 static void init_wa_bb_reg_state(u32 * const regs,
5232                                  const struct intel_engine_cs *engine)
5233 {
5234         const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
5235
5236         if (wa_ctx->per_ctx.size) {
5237                 const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
5238
5239                 GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
5240                 regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
5241                         (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
5242         }
5243
5244         if (wa_ctx->indirect_ctx.size) {
5245                 lrc_ring_setup_indirect_ctx(regs, engine,
5246                                             i915_ggtt_offset(wa_ctx->vma) +
5247                                             wa_ctx->indirect_ctx.offset,
5248                                             wa_ctx->indirect_ctx.size);
5249         }
5250 }
5251
5252 static void init_ppgtt_reg_state(u32 *regs, const struct i915_ppgtt *ppgtt)
5253 {
5254         if (i915_vm_is_4lvl(&ppgtt->vm)) {
5255                 /* 64b PPGTT (48bit canonical)
5256                  * PDP0_DESCRIPTOR contains the base address to PML4 and
5257                  * other PDP Descriptors are ignored.
5258                  */
5259                 ASSIGN_CTX_PML4(ppgtt, regs);
5260         } else {
5261                 ASSIGN_CTX_PDP(ppgtt, regs, 3);
5262                 ASSIGN_CTX_PDP(ppgtt, regs, 2);
5263                 ASSIGN_CTX_PDP(ppgtt, regs, 1);
5264                 ASSIGN_CTX_PDP(ppgtt, regs, 0);
5265         }
5266 }
5267
5268 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
5269 {
5270         if (i915_is_ggtt(vm))
5271                 return i915_vm_to_ggtt(vm)->alias;
5272         else
5273                 return i915_vm_to_ppgtt(vm);
5274 }
5275
5276 static void execlists_init_reg_state(u32 *regs,
5277                                      const struct intel_context *ce,
5278                                      const struct intel_engine_cs *engine,
5279                                      const struct intel_ring *ring,
5280                                      bool inhibit)
5281 {
5282         /*
5283          * A context is actually a big batch buffer with several
5284          * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
5285          * values we are setting here are only for the first context restore:
5286          * on a subsequent save, the GPU will recreate this batchbuffer with new
5287          * values (including all the missing MI_LOAD_REGISTER_IMM commands that
5288          * we are not initializing here).
5289          *
5290          * Must keep consistent with virtual_update_register_offsets().
5291          */
5292         set_offsets(regs, reg_offsets(engine), engine, inhibit);
5293
5294         init_common_reg_state(regs, engine, ring, inhibit);
5295         init_ppgtt_reg_state(regs, vm_alias(ce->vm));
5296
5297         init_wa_bb_reg_state(regs, engine);
5298
5299         __reset_stop_ring(regs, engine);
5300 }
5301
5302 static int
5303 populate_lr_context(struct intel_context *ce,
5304                     struct drm_i915_gem_object *ctx_obj,
5305                     struct intel_engine_cs *engine,
5306                     struct intel_ring *ring)
5307 {
5308         bool inhibit = true;
5309         void *vaddr;
5310
5311         vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
5312         if (IS_ERR(vaddr)) {
5313                 drm_dbg(&engine->i915->drm, "Could not map object pages!\n");
5314                 return PTR_ERR(vaddr);
5315         }
5316
5317         set_redzone(vaddr, engine);
5318
5319         if (engine->default_state) {
5320                 shmem_read(engine->default_state, 0,
5321                            vaddr, engine->context_size);
5322                 __set_bit(CONTEXT_VALID_BIT, &ce->flags);
5323                 inhibit = false;
5324         }
5325
5326         /* Clear the ppHWSP (inc. per-context counters) */
5327         memset(vaddr, 0, PAGE_SIZE);
5328
5329         /*
5330          * The second page of the context object contains some registers which
5331          * must be set up prior to the first execution.
5332          */
5333         execlists_init_reg_state(vaddr + LRC_STATE_OFFSET,
5334                                  ce, engine, ring, inhibit);
5335
5336         __i915_gem_object_flush_map(ctx_obj, 0, engine->context_size);
5337         i915_gem_object_unpin_map(ctx_obj);
5338         return 0;
5339 }
5340
5341 static struct intel_timeline *pinned_timeline(struct intel_context *ce)
5342 {
5343         struct intel_timeline *tl = fetch_and_zero(&ce->timeline);
5344
5345         return intel_timeline_create_from_engine(ce->engine,
5346                                                  page_unmask_bits(tl));
5347 }
5348
5349 static int __execlists_context_alloc(struct intel_context *ce,
5350                                      struct intel_engine_cs *engine)
5351 {
5352         struct drm_i915_gem_object *ctx_obj;
5353         struct intel_ring *ring;
5354         struct i915_vma *vma;
5355         u32 context_size;
5356         int ret;
5357
5358         GEM_BUG_ON(ce->state);
5359         context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
5360
5361         if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
5362                 context_size += I915_GTT_PAGE_SIZE; /* for redzone */
5363
5364         if (INTEL_GEN(engine->i915) == 12) {
5365                 ce->wa_bb_page = context_size / PAGE_SIZE;
5366                 context_size += PAGE_SIZE;
5367         }
5368
5369         ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size);
5370         if (IS_ERR(ctx_obj))
5371                 return PTR_ERR(ctx_obj);
5372
5373         vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL);
5374         if (IS_ERR(vma)) {
5375                 ret = PTR_ERR(vma);
5376                 goto error_deref_obj;
5377         }
5378
5379         if (!page_mask_bits(ce->timeline)) {
5380                 struct intel_timeline *tl;
5381
5382                 /*
5383                  * Use the static global HWSP for the kernel context, and
5384                  * a dynamically allocated cacheline for everyone else.
5385                  */
5386                 if (unlikely(ce->timeline))
5387                         tl = pinned_timeline(ce);
5388                 else
5389                         tl = intel_timeline_create(engine->gt);
5390                 if (IS_ERR(tl)) {
5391                         ret = PTR_ERR(tl);
5392                         goto error_deref_obj;
5393                 }
5394
5395                 ce->timeline = tl;
5396         }
5397
5398         ring = intel_engine_create_ring(engine, (unsigned long)ce->ring);
5399         if (IS_ERR(ring)) {
5400                 ret = PTR_ERR(ring);
5401                 goto error_deref_obj;
5402         }
5403
5404         ret = populate_lr_context(ce, ctx_obj, engine, ring);
5405         if (ret) {
5406                 drm_dbg(&engine->i915->drm,
5407                         "Failed to populate LRC: %d\n", ret);
5408                 goto error_ring_free;
5409         }
5410
5411         ce->ring = ring;
5412         ce->state = vma;
5413
5414         return 0;
5415
5416 error_ring_free:
5417         intel_ring_put(ring);
5418 error_deref_obj:
5419         i915_gem_object_put(ctx_obj);
5420         return ret;
5421 }
5422
5423 static struct list_head *virtual_queue(struct virtual_engine *ve)
5424 {
5425         return &ve->base.execlists.default_priolist.requests[0];
5426 }
5427
5428 static void virtual_context_destroy(struct kref *kref)
5429 {
5430         struct virtual_engine *ve =
5431                 container_of(kref, typeof(*ve), context.ref);
5432         unsigned int n;
5433
5434         GEM_BUG_ON(!list_empty(virtual_queue(ve)));
5435         GEM_BUG_ON(ve->request);
5436         GEM_BUG_ON(ve->context.inflight);
5437
5438         for (n = 0; n < ve->num_siblings; n++) {
5439                 struct intel_engine_cs *sibling = ve->siblings[n];
5440                 struct rb_node *node = &ve->nodes[sibling->id].rb;
5441                 unsigned long flags;
5442
5443                 if (RB_EMPTY_NODE(node))
5444                         continue;
5445
5446                 spin_lock_irqsave(&sibling->active.lock, flags);
5447
5448                 /* Detachment is lazily performed in the execlists tasklet */
5449                 if (!RB_EMPTY_NODE(node))
5450                         rb_erase_cached(node, &sibling->execlists.virtual);
5451
5452                 spin_unlock_irqrestore(&sibling->active.lock, flags);
5453         }
5454         GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet));
5455
5456         if (ve->context.state)
5457                 __execlists_context_fini(&ve->context);
5458         intel_context_fini(&ve->context);
5459
5460         intel_engine_free_request_pool(&ve->base);
5461
5462         kfree(ve->bonds);
5463         kfree(ve);
5464 }
5465
5466 static void virtual_engine_initial_hint(struct virtual_engine *ve)
5467 {
5468         int swp;
5469
5470         /*
5471          * Pick a random sibling on starting to help spread the load around.
5472          *
5473          * New contexts are typically created with exactly the same order
5474          * of siblings, and often started in batches. Due to the way we iterate
5475          * the array of sibling when submitting requests, sibling[0] is
5476          * prioritised for dequeuing. If we make sure that sibling[0] is fairly
5477          * randomised across the system, we also help spread the load by the
5478          * first engine we inspect being different each time.
5479          *
5480          * NB This does not force us to execute on this engine, it will just
5481          * typically be the first we inspect for submission.
5482          */
5483         swp = prandom_u32_max(ve->num_siblings);
5484         if (swp)
5485                 swap(ve->siblings[swp], ve->siblings[0]);
5486 }
5487
5488 static int virtual_context_alloc(struct intel_context *ce)
5489 {
5490         struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5491
5492         return __execlists_context_alloc(ce, ve->siblings[0]);
5493 }
5494
5495 static int virtual_context_pin(struct intel_context *ce, void *vaddr)
5496 {
5497         struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5498
5499         /* Note: we must use a real engine class for setting up reg state */
5500         return __execlists_context_pin(ce, ve->siblings[0], vaddr);
5501 }
5502
5503 static void virtual_context_enter(struct intel_context *ce)
5504 {
5505         struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5506         unsigned int n;
5507
5508         for (n = 0; n < ve->num_siblings; n++)
5509                 intel_engine_pm_get(ve->siblings[n]);
5510
5511         intel_timeline_enter(ce->timeline);
5512 }
5513
5514 static void virtual_context_exit(struct intel_context *ce)
5515 {
5516         struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5517         unsigned int n;
5518
5519         intel_timeline_exit(ce->timeline);
5520
5521         for (n = 0; n < ve->num_siblings; n++)
5522                 intel_engine_pm_put(ve->siblings[n]);
5523 }
5524
5525 static const struct intel_context_ops virtual_context_ops = {
5526         .alloc = virtual_context_alloc,
5527
5528         .pre_pin = execlists_context_pre_pin,
5529         .pin = virtual_context_pin,
5530         .unpin = execlists_context_unpin,
5531         .post_unpin = execlists_context_post_unpin,
5532
5533         .enter = virtual_context_enter,
5534         .exit = virtual_context_exit,
5535
5536         .destroy = virtual_context_destroy,
5537 };
5538
5539 static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve)
5540 {
5541         struct i915_request *rq;
5542         intel_engine_mask_t mask;
5543
5544         rq = READ_ONCE(ve->request);
5545         if (!rq)
5546                 return 0;
5547
5548         /* The rq is ready for submission; rq->execution_mask is now stable. */
5549         mask = rq->execution_mask;
5550         if (unlikely(!mask)) {
5551                 /* Invalid selection, submit to a random engine in error */
5552                 i915_request_set_error_once(rq, -ENODEV);
5553                 mask = ve->siblings[0]->mask;
5554         }
5555
5556         ENGINE_TRACE(&ve->base, "rq=%llx:%lld, mask=%x, prio=%d\n",
5557                      rq->fence.context, rq->fence.seqno,
5558                      mask, ve->base.execlists.queue_priority_hint);
5559
5560         return mask;
5561 }
5562
5563 static void virtual_submission_tasklet(unsigned long data)
5564 {
5565         struct virtual_engine * const ve = (struct virtual_engine *)data;
5566         const int prio = READ_ONCE(ve->base.execlists.queue_priority_hint);
5567         intel_engine_mask_t mask;
5568         unsigned int n;
5569
5570         rcu_read_lock();
5571         mask = virtual_submission_mask(ve);
5572         rcu_read_unlock();
5573         if (unlikely(!mask))
5574                 return;
5575
5576         local_irq_disable();
5577         for (n = 0; n < ve->num_siblings; n++) {
5578                 struct intel_engine_cs *sibling = READ_ONCE(ve->siblings[n]);
5579                 struct ve_node * const node = &ve->nodes[sibling->id];
5580                 struct rb_node **parent, *rb;
5581                 bool first;
5582
5583                 if (!READ_ONCE(ve->request))
5584                         break; /* already handled by a sibling's tasklet */
5585
5586                 if (unlikely(!(mask & sibling->mask))) {
5587                         if (!RB_EMPTY_NODE(&node->rb)) {
5588                                 spin_lock(&sibling->active.lock);
5589                                 rb_erase_cached(&node->rb,
5590                                                 &sibling->execlists.virtual);
5591                                 RB_CLEAR_NODE(&node->rb);
5592                                 spin_unlock(&sibling->active.lock);
5593                         }
5594                         continue;
5595                 }
5596
5597                 spin_lock(&sibling->active.lock);
5598
5599                 if (!RB_EMPTY_NODE(&node->rb)) {
5600                         /*
5601                          * Cheat and avoid rebalancing the tree if we can
5602                          * reuse this node in situ.
5603                          */
5604                         first = rb_first_cached(&sibling->execlists.virtual) ==
5605                                 &node->rb;
5606                         if (prio == node->prio || (prio > node->prio && first))
5607                                 goto submit_engine;
5608
5609                         rb_erase_cached(&node->rb, &sibling->execlists.virtual);
5610                 }
5611
5612                 rb = NULL;
5613                 first = true;
5614                 parent = &sibling->execlists.virtual.rb_root.rb_node;
5615                 while (*parent) {
5616                         struct ve_node *other;
5617
5618                         rb = *parent;
5619                         other = rb_entry(rb, typeof(*other), rb);
5620                         if (prio > other->prio) {
5621                                 parent = &rb->rb_left;
5622                         } else {
5623                                 parent = &rb->rb_right;
5624                                 first = false;
5625                         }
5626                 }
5627
5628                 rb_link_node(&node->rb, rb, parent);
5629                 rb_insert_color_cached(&node->rb,
5630                                        &sibling->execlists.virtual,
5631                                        first);
5632
5633 submit_engine:
5634                 GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
5635                 node->prio = prio;
5636                 if (first && prio > sibling->execlists.queue_priority_hint)
5637                         tasklet_hi_schedule(&sibling->execlists.tasklet);
5638
5639                 spin_unlock(&sibling->active.lock);
5640         }
5641         local_irq_enable();
5642 }
5643
5644 static void virtual_submit_request(struct i915_request *rq)
5645 {
5646         struct virtual_engine *ve = to_virtual_engine(rq->engine);
5647         struct i915_request *old;
5648         unsigned long flags;
5649
5650         ENGINE_TRACE(&ve->base, "rq=%llx:%lld\n",
5651                      rq->fence.context,
5652                      rq->fence.seqno);
5653
5654         GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);
5655
5656         spin_lock_irqsave(&ve->base.active.lock, flags);
5657
5658         old = ve->request;
5659         if (old) { /* background completion event from preempt-to-busy */
5660                 GEM_BUG_ON(!i915_request_completed(old));
5661                 __i915_request_submit(old);
5662                 i915_request_put(old);
5663         }
5664
5665         if (i915_request_completed(rq)) {
5666                 __i915_request_submit(rq);
5667
5668                 ve->base.execlists.queue_priority_hint = INT_MIN;
5669                 ve->request = NULL;
5670         } else {
5671                 ve->base.execlists.queue_priority_hint = rq_prio(rq);
5672                 ve->request = i915_request_get(rq);
5673
5674                 GEM_BUG_ON(!list_empty(virtual_queue(ve)));
5675                 list_move_tail(&rq->sched.link, virtual_queue(ve));
5676
5677                 tasklet_hi_schedule(&ve->base.execlists.tasklet);
5678         }
5679
5680         spin_unlock_irqrestore(&ve->base.active.lock, flags);
5681 }
5682
5683 static struct ve_bond *
5684 virtual_find_bond(struct virtual_engine *ve,
5685                   const struct intel_engine_cs *master)
5686 {
5687         int i;
5688
5689         for (i = 0; i < ve->num_bonds; i++) {
5690                 if (ve->bonds[i].master == master)
5691                         return &ve->bonds[i];
5692         }
5693
5694         return NULL;
5695 }
5696
5697 static void
5698 virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
5699 {
5700         struct virtual_engine *ve = to_virtual_engine(rq->engine);
5701         intel_engine_mask_t allowed, exec;
5702         struct ve_bond *bond;
5703
5704         allowed = ~to_request(signal)->engine->mask;
5705
5706         bond = virtual_find_bond(ve, to_request(signal)->engine);
5707         if (bond)
5708                 allowed &= bond->sibling_mask;
5709
5710         /* Restrict the bonded request to run on only the available engines */
5711         exec = READ_ONCE(rq->execution_mask);
5712         while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed))
5713                 ;
5714
5715         /* Prevent the master from being re-run on the bonded engines */
5716         to_request(signal)->execution_mask &= ~allowed;
5717 }
5718
5719 struct intel_context *
5720 intel_execlists_create_virtual(struct intel_engine_cs **siblings,
5721                                unsigned int count)
5722 {
5723         struct virtual_engine *ve;
5724         unsigned int n;
5725         int err;
5726
5727         if (count == 0)
5728                 return ERR_PTR(-EINVAL);
5729
5730         if (count == 1)
5731                 return intel_context_create(siblings[0]);
5732
5733         ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
5734         if (!ve)
5735                 return ERR_PTR(-ENOMEM);
5736
5737         ve->base.i915 = siblings[0]->i915;
5738         ve->base.gt = siblings[0]->gt;
5739         ve->base.uncore = siblings[0]->uncore;
5740         ve->base.id = -1;
5741
5742         ve->base.class = OTHER_CLASS;
5743         ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
5744         ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
5745         ve->base.uabi_instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
5746
5747         /*
5748          * The decision on whether to submit a request using semaphores
5749          * depends on the saturated state of the engine. We only compute
5750          * this during HW submission of the request, and we need for this
5751          * state to be globally applied to all requests being submitted
5752          * to this engine. Virtual engines encompass more than one physical
5753          * engine and so we cannot accurately tell in advance if one of those
5754          * engines is already saturated and so cannot afford to use a semaphore
5755          * and be pessimized in priority for doing so -- if we are the only
5756          * context using semaphores after all other clients have stopped, we
5757          * will be starved on the saturated system. Such a global switch for
5758          * semaphores is less than ideal, but alas is the current compromise.
5759          */
5760         ve->base.saturated = ALL_ENGINES;
5761
5762         snprintf(ve->base.name, sizeof(ve->base.name), "virtual");
5763
5764         intel_engine_init_active(&ve->base, ENGINE_VIRTUAL);
5765         intel_engine_init_execlists(&ve->base);
5766
5767         ve->base.cops = &virtual_context_ops;
5768         ve->base.request_alloc = execlists_request_alloc;
5769
5770         ve->base.schedule = i915_schedule;
5771         ve->base.submit_request = virtual_submit_request;
5772         ve->base.bond_execute = virtual_bond_execute;
5773
5774         INIT_LIST_HEAD(virtual_queue(ve));
5775         ve->base.execlists.queue_priority_hint = INT_MIN;
5776         tasklet_init(&ve->base.execlists.tasklet,
5777                      virtual_submission_tasklet,
5778                      (unsigned long)ve);
5779
5780         intel_context_init(&ve->context, &ve->base);
5781
5782         ve->base.breadcrumbs = intel_breadcrumbs_create(NULL);
5783         if (!ve->base.breadcrumbs) {
5784                 err = -ENOMEM;
5785                 goto err_put;
5786         }
5787
5788         for (n = 0; n < count; n++) {
5789                 struct intel_engine_cs *sibling = siblings[n];
5790
5791                 GEM_BUG_ON(!is_power_of_2(sibling->mask));
5792                 if (sibling->mask & ve->base.mask) {
5793                         DRM_DEBUG("duplicate %s entry in load balancer\n",
5794                                   sibling->name);
5795                         err = -EINVAL;
5796                         goto err_put;
5797                 }
5798
5799                 /*
5800                  * The virtual engine implementation is tightly coupled to
5801                  * the execlists backend -- we push out request directly
5802                  * into a tree inside each physical engine. We could support
5803                  * layering if we handle cloning of the requests and
5804                  * submitting a copy into each backend.
5805                  */
5806                 if (sibling->execlists.tasklet.func !=
5807                     execlists_submission_tasklet) {
5808                         err = -ENODEV;
5809                         goto err_put;
5810                 }
5811
5812                 GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
5813                 RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);
5814
5815                 ve->siblings[ve->num_siblings++] = sibling;
5816                 ve->base.mask |= sibling->mask;
5817
5818                 /*
5819                  * All physical engines must be compatible for their emission
5820                  * functions (as we build the instructions during request
5821                  * construction and do not alter them before submission
5822                  * on the physical engine). We use the engine class as a guide
5823                  * here, although that could be refined.
5824                  */
5825                 if (ve->base.class != OTHER_CLASS) {
5826                         if (ve->base.class != sibling->class) {
5827                                 DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n",
5828                                           sibling->class, ve->base.class);
5829                                 err = -EINVAL;
5830                                 goto err_put;
5831                         }
5832                         continue;
5833                 }
5834
5835                 ve->base.class = sibling->class;
5836                 ve->base.uabi_class = sibling->uabi_class;
5837                 snprintf(ve->base.name, sizeof(ve->base.name),
5838                          "v%dx%d", ve->base.class, count);
5839                 ve->base.context_size = sibling->context_size;
5840
5841                 ve->base.emit_bb_start = sibling->emit_bb_start;
5842                 ve->base.emit_flush = sibling->emit_flush;
5843                 ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
5844                 ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
5845                 ve->base.emit_fini_breadcrumb_dw =
5846                         sibling->emit_fini_breadcrumb_dw;
5847
5848                 ve->base.flags = sibling->flags;
5849         }
5850
5851         ve->base.flags |= I915_ENGINE_IS_VIRTUAL;
5852
5853         virtual_engine_initial_hint(ve);
5854         return &ve->context;
5855
5856 err_put:
5857         intel_context_put(&ve->context);
5858         return ERR_PTR(err);
5859 }
5860
5861 struct intel_context *
5862 intel_execlists_clone_virtual(struct intel_engine_cs *src)
5863 {
5864         struct virtual_engine *se = to_virtual_engine(src);
5865         struct intel_context *dst;
5866
5867         dst = intel_execlists_create_virtual(se->siblings,
5868                                              se->num_siblings);
5869         if (IS_ERR(dst))
5870                 return dst;
5871
5872         if (se->num_bonds) {
5873                 struct virtual_engine *de = to_virtual_engine(dst->engine);
5874
5875                 de->bonds = kmemdup(se->bonds,
5876                                     sizeof(*se->bonds) * se->num_bonds,
5877                                     GFP_KERNEL);
5878                 if (!de->bonds) {
5879                         intel_context_put(dst);
5880                         return ERR_PTR(-ENOMEM);
5881                 }
5882
5883                 de->num_bonds = se->num_bonds;
5884         }
5885
5886         return dst;
5887 }
5888
5889 int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
5890                                      const struct intel_engine_cs *master,
5891                                      const struct intel_engine_cs *sibling)
5892 {
5893         struct virtual_engine *ve = to_virtual_engine(engine);
5894         struct ve_bond *bond;
5895         int n;
5896
5897         /* Sanity check the sibling is part of the virtual engine */
5898         for (n = 0; n < ve->num_siblings; n++)
5899                 if (sibling == ve->siblings[n])
5900                         break;
5901         if (n == ve->num_siblings)
5902                 return -EINVAL;
5903
5904         bond = virtual_find_bond(ve, master);
5905         if (bond) {
5906                 bond->sibling_mask |= sibling->mask;
5907                 return 0;
5908         }
5909
5910         bond = krealloc(ve->bonds,
5911                         sizeof(*bond) * (ve->num_bonds + 1),
5912                         GFP_KERNEL);
5913         if (!bond)
5914                 return -ENOMEM;
5915
5916         bond[ve->num_bonds].master = master;
5917         bond[ve->num_bonds].sibling_mask = sibling->mask;
5918
5919         ve->bonds = bond;
5920         ve->num_bonds++;
5921
5922         return 0;
5923 }
5924
5925 struct intel_engine_cs *
5926 intel_virtual_engine_get_sibling(struct intel_engine_cs *engine,
5927                                  unsigned int sibling)
5928 {
5929         struct virtual_engine *ve = to_virtual_engine(engine);
5930
5931         if (sibling >= ve->num_siblings)
5932                 return NULL;
5933
5934         return ve->siblings[sibling];
5935 }
5936
5937 void intel_execlists_show_requests(struct intel_engine_cs *engine,
5938                                    struct drm_printer *m,
5939                                    void (*show_request)(struct drm_printer *m,
5940                                                         struct i915_request *rq,
5941                                                         const char *prefix),
5942                                    unsigned int max)
5943 {
5944         const struct intel_engine_execlists *execlists = &engine->execlists;
5945         struct i915_request *rq, *last;
5946         unsigned long flags;
5947         unsigned int count;
5948         struct rb_node *rb;
5949
5950         spin_lock_irqsave(&engine->active.lock, flags);
5951
5952         last = NULL;
5953         count = 0;
5954         list_for_each_entry(rq, &engine->active.requests, sched.link) {
5955                 if (count++ < max - 1)
5956                         show_request(m, rq, "\t\tE ");
5957                 else
5958                         last = rq;
5959         }
5960         if (last) {
5961                 if (count > max) {
5962                         drm_printf(m,
5963                                    "\t\t...skipping %d executing requests...\n",
5964                                    count - max);
5965                 }
5966                 show_request(m, last, "\t\tE ");
5967         }
5968
5969         if (execlists->switch_priority_hint != INT_MIN)
5970                 drm_printf(m, "\t\tSwitch priority hint: %d\n",
5971                            READ_ONCE(execlists->switch_priority_hint));
5972         if (execlists->queue_priority_hint != INT_MIN)
5973                 drm_printf(m, "\t\tQueue priority hint: %d\n",
5974                            READ_ONCE(execlists->queue_priority_hint));
5975
5976         last = NULL;
5977         count = 0;
5978         for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
5979                 struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
5980                 int i;
5981
5982                 priolist_for_each_request(rq, p, i) {
5983                         if (count++ < max - 1)
5984                                 show_request(m, rq, "\t\tQ ");
5985                         else
5986                                 last = rq;
5987                 }
5988         }
5989         if (last) {
5990                 if (count > max) {
5991                         drm_printf(m,
5992                                    "\t\t...skipping %d queued requests...\n",
5993                                    count - max);
5994                 }
5995                 show_request(m, last, "\t\tQ ");
5996         }
5997
5998         last = NULL;
5999         count = 0;
6000         for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
6001                 struct virtual_engine *ve =
6002                         rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
6003                 struct i915_request *rq = READ_ONCE(ve->request);
6004
6005                 if (rq) {
6006                         if (count++ < max - 1)
6007                                 show_request(m, rq, "\t\tV ");
6008                         else
6009                                 last = rq;
6010                 }
6011         }
6012         if (last) {
6013                 if (count > max) {
6014                         drm_printf(m,
6015                                    "\t\t...skipping %d virtual requests...\n",
6016                                    count - max);
6017                 }
6018                 show_request(m, last, "\t\tV ");
6019         }
6020
6021         spin_unlock_irqrestore(&engine->active.lock, flags);
6022 }
6023
6024 void intel_lr_context_reset(struct intel_engine_cs *engine,
6025                             struct intel_context *ce,
6026                             u32 head,
6027                             bool scrub)
6028 {
6029         GEM_BUG_ON(!intel_context_is_pinned(ce));
6030
6031         /*
6032          * We want a simple context + ring to execute the breadcrumb update.
6033          * We cannot rely on the context being intact across the GPU hang,
6034          * so clear it and rebuild just what we need for the breadcrumb.
6035          * All pending requests for this context will be zapped, and any
6036          * future request will be after userspace has had the opportunity
6037          * to recreate its own state.
6038          */
6039         if (scrub)
6040                 restore_default_state(ce, engine);
6041
6042         /* Rerun the request; its payload has been neutered (if guilty). */
6043         __execlists_update_reg_state(ce, engine, head);
6044 }
6045
6046 bool
6047 intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine)
6048 {
6049         return engine->set_default_submission ==
6050                intel_execlists_set_default_submission;
6051 }
6052
6053 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
6054 #include "selftest_lrc.c"
6055 #endif