drivers/gpu/drm/i915/gt/intel_lrc.c
1 /*
2  * Copyright © 2014 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * Authors:
24  *    Ben Widawsky <ben@bwidawsk.net>
25  *    Michel Thierry <michel.thierry@intel.com>
26  *    Thomas Daniel <thomas.daniel@intel.com>
27  *    Oscar Mateo <oscar.mateo@intel.com>
28  *
29  */
30
31 /**
32  * DOC: Logical Rings, Logical Ring Contexts and Execlists
33  *
34  * Motivation:
35  * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
36  * These expanded contexts enable a number of new abilities, especially
37  * "Execlists" (also implemented in this file).
38  *
39  * One of the main differences from the legacy HW contexts is that logical
40  * ring contexts incorporate many more things into the context's state, like
41  * PDPs or ringbuffer control registers:
42  *
43  * The reason why PDPs are included in the context is straightforward: as
44  * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
45  * contained there means you don't need to do a ppgtt->switch_mm yourself;
46  * instead, the GPU will do it for you on the context switch.
47  *
48  * But what about the ringbuffer control registers (head, tail, etc.)?
49  * Shouldn't a single set of those per engine command streamer suffice? This is
50  * where the name "Logical Rings" starts to make sense: by virtualizing the
51  * rings, the engine cs shifts to a new "ring buffer" with every context
52  * switch. When you want to submit a workload to the GPU you: A) choose your
53  * context, B) find its appropriate virtualized ring, C) write commands to it
54  * and then, finally, D) tell the GPU to switch to that context.
55  *
56  * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
57  * to a context is via a context execution list, ergo "Execlists".
58  *
59  * LRC implementation:
60  * Regarding the creation of contexts, we have:
61  *
62  * - One global default context.
63  * - One local default context for each opened fd.
64  * - One local extra context for each context create ioctl call.
65  *
66  * Now that ringbuffers belong per-context (and not per-engine, like before)
67  * and that contexts are uniquely tied to a given engine (and not reusable,
68  * like before) we need:
69  *
70  * - One ringbuffer per-engine inside each context.
71  * - One backing object per-engine inside each context.
72  *
73  * The global default context starts its life with these new objects fully
74  * allocated and populated. The local default context for each opened fd is
75  * more complex, because we don't know at creation time which engine is going
76  * to use it. To handle this, we have implemented a deferred creation of LR
77  * contexts:
78  *
79  * The local context starts its life as a hollow or blank holder that only
80  * gets populated for a given engine once we receive an execbuffer. If later
81  * on we receive another execbuffer ioctl for the same context but a different
82  * engine, we allocate/populate a new ringbuffer and context backing object and
83  * so on.
84  *
85  * Finally, regarding local contexts created using the ioctl call: as they are
86  * only allowed with the render ring, we can allocate & populate them right
87  * away (no need to defer anything, at least for now).
88  *
89  * Execlists implementation:
90  * Execlists are the new method by which, on gen8+ hardware, workloads are
91  * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
92  * This method works as follows:
93  *
94  * When a request is committed, its commands (the BB start and any leading or
95  * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
96  * for the appropriate context. The tail pointer in the hardware context is not
97  * updated at this time, but instead, kept by the driver in the ringbuffer
98  * structure. A structure representing this request is added to a request queue
99  * for the appropriate engine: this structure contains a copy of the context's
100  * tail after the request was written to the ring buffer and a pointer to the
101  * context itself.
102  *
103  * If the engine's request queue was empty before the request was added, the
104  * queue is processed immediately. Otherwise the queue will be processed during
105  * a context switch interrupt. In any case, elements on the queue will get sent
106  * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
107  * globally unique 20-bit submission ID.
108  *
109  * When execution of a request completes, the GPU updates the context status
110  * buffer with a context complete event and generates a context switch interrupt.
111  * During the interrupt handling, the driver examines the events in the buffer:
112  * for each context complete event, if the announced ID matches that on the head
113  * of the request queue, then that request is retired and removed from the queue.
114  *
115  * After processing, if any requests were retired and the queue is not empty
116  * then a new execution list can be submitted. The two requests at the front of
117  * the queue are next to be submitted but since a context may not occur twice in
118  * an execution list, if subsequent requests have the same ID as the first then
119  * the two requests must be combined. This is done simply by discarding requests
120  * at the head of the queue until either only one request is left (in which case
121  * we use a NULL second context) or the first two requests have unique IDs.
122  *
123  * By always executing the first two requests in the queue the driver ensures
124  * that the GPU is kept as busy as possible. In the case where a single context
125  * completes but a second context is still executing, the request for this second
126  * context will be at the head of the queue when we remove the first one. This
127  * request will then be resubmitted along with a new request for a different context,
128  * which will cause the hardware to continue executing the second request and queue
129  * the new request (the GPU detects the condition of a context getting preempted
130  * with the same context and optimizes the context switch flow by not doing
131  * preemption, but just sampling the new tail pointer).
132  *
133  */
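/*
 * A minimal sketch of the pairing rule described above (illustrative only;
 * demo_rq and pick_pair() are hypothetical, the driver's real dequeue logic
 * lives later in this file):
 *
 *      struct demo_rq { int ctx_id; };
 *
 *      // Pick ELSP[0]/ELSP[1] from a FIFO of n requests without letting the
 *      // same context appear twice: requests sharing the head context are
 *      // coalesced into a single submission (only the final tail matters).
 *      static void pick_pair(const struct demo_rq *q, int n,
 *                            const struct demo_rq **e0, const struct demo_rq **e1)
 *      {
 *              int i = 0;
 *
 *              *e0 = *e1 = NULL;
 *              if (!n)
 *                      return;
 *
 *              *e0 = &q[i++];
 *              while (i < n && q[i].ctx_id == (*e0)->ctx_id)
 *                      *e0 = &q[i++];
 *              if (i < n)
 *                      *e1 = &q[i];
 *      }
 *
 * For example, a queue of contexts { A, A, B, C } submits the coalesced A in
 * ELSP[0] and B in ELSP[1]; a queue of { A, A } submits A with a NULL second
 * port.
 */
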
134 #include <linux/interrupt.h>
135
136 #include "i915_drv.h"
137 #include "i915_perf.h"
138 #include "i915_trace.h"
139 #include "i915_vgpu.h"
140 #include "intel_context.h"
141 #include "intel_engine_pm.h"
142 #include "intel_gt.h"
143 #include "intel_gt_pm.h"
144 #include "intel_gt_requests.h"
145 #include "intel_lrc_reg.h"
146 #include "intel_mocs.h"
147 #include "intel_reset.h"
148 #include "intel_ring.h"
149 #include "intel_workarounds.h"
150
151 #define RING_EXECLIST_QFULL             (1 << 0x2)
152 #define RING_EXECLIST1_VALID            (1 << 0x3)
153 #define RING_EXECLIST0_VALID            (1 << 0x4)
154 #define RING_EXECLIST_ACTIVE_STATUS     (3 << 0xE)
155 #define RING_EXECLIST1_ACTIVE           (1 << 0x11)
156 #define RING_EXECLIST0_ACTIVE           (1 << 0x12)
157
158 #define GEN8_CTX_STATUS_IDLE_ACTIVE     (1 << 0)
159 #define GEN8_CTX_STATUS_PREEMPTED       (1 << 1)
160 #define GEN8_CTX_STATUS_ELEMENT_SWITCH  (1 << 2)
161 #define GEN8_CTX_STATUS_ACTIVE_IDLE     (1 << 3)
162 #define GEN8_CTX_STATUS_COMPLETE        (1 << 4)
163 #define GEN8_CTX_STATUS_LITE_RESTORE    (1 << 15)
164
165 #define GEN8_CTX_STATUS_COMPLETED_MASK \
166          (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)
167
168 #define CTX_DESC_FORCE_RESTORE BIT_ULL(2)
169
170 #define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE  (0x1) /* lower csb dword */
171 #define GEN12_CTX_SWITCH_DETAIL(csb_dw) ((csb_dw) & 0xF) /* upper csb dword */
172 #define GEN12_CSB_SW_CTX_ID_MASK                GENMASK(25, 15)
173 #define GEN12_IDLE_CTX_ID               0x7FF
174 #define GEN12_CSB_CTX_VALID(csb_dw) \
175         (FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)
176
177 /* Typical size of the average request (2 pipecontrols and a MI_BB) */
178 #define EXECLISTS_REQUEST_SIZE 64 /* bytes */
179
180 struct virtual_engine {
181         struct intel_engine_cs base;
182         struct intel_context context;
183
184         /*
185          * We allow only a single request through the virtual engine at a time
186          * (each request in the timeline waits for the completion fence of
187          * the previous before being submitted). By restricting ourselves to
188          * only submitting a single request, each request is placed on to a
189          * physical engine to maximise load spreading (by virtue of the late greedy
190          * scheduling -- each real engine takes the next available request
191          * upon idling).
192          */
193         struct i915_request *request;
194
195         /*
196          * We keep an rbtree of available virtual engines inside each physical
197          * engine, sorted by priority. Here we preallocate the nodes we need
198          * for the virtual engine, indexed by physical_engine->id.
199          */
200         struct ve_node {
201                 struct rb_node rb;
202                 int prio;
203         } nodes[I915_NUM_ENGINES];
204
205         /*
206          * Keep track of bonded pairs -- restrictions upon our selection
207          * of physical engines any particular request may be submitted to.
208          * If we receive a submit-fence from a master engine, we will only
209          * use one of the sibling_mask physical engines.
210          */
211         struct ve_bond {
212                 const struct intel_engine_cs *master;
213                 intel_engine_mask_t sibling_mask;
214         } *bonds;
215         unsigned int num_bonds;
216
217         /* And finally, which physical engines this virtual engine maps onto. */
218         unsigned int num_siblings;
219         struct intel_engine_cs *siblings[0];
220 };
221
222 static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine)
223 {
224         GEM_BUG_ON(!intel_engine_is_virtual(engine));
225         return container_of(engine, struct virtual_engine, base);
226 }
227
228 static int __execlists_context_alloc(struct intel_context *ce,
229                                      struct intel_engine_cs *engine);
230
231 static void execlists_init_reg_state(u32 *reg_state,
232                                      const struct intel_context *ce,
233                                      const struct intel_engine_cs *engine,
234                                      const struct intel_ring *ring,
235                                      bool close);
236 static void
237 __execlists_update_reg_state(const struct intel_context *ce,
238                              const struct intel_engine_cs *engine,
239                              u32 head);
240
241 static void mark_eio(struct i915_request *rq)
242 {
243         if (i915_request_completed(rq))
244                 return;
245
246         GEM_BUG_ON(i915_request_signaled(rq));
247
248         i915_request_set_error_once(rq, -EIO);
249         i915_request_mark_complete(rq);
250 }
251
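/*
 * Walk backwards from @rq along its timeline to find the oldest request
 * that has not yet completed, i.e. the point from which execution must
 * resume for this context.
 */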
252 static struct i915_request *
253 active_request(const struct intel_timeline * const tl, struct i915_request *rq)
254 {
255         struct i915_request *active = rq;
256
257         rcu_read_lock();
258         list_for_each_entry_continue_reverse(rq, &tl->requests, link) {
259                 if (i915_request_completed(rq))
260                         break;
261
262                 active = rq;
263         }
264         rcu_read_unlock();
265
266         return active;
267 }
268
269 static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine)
270 {
271         return (i915_ggtt_offset(engine->status_page.vma) +
272                 I915_GEM_HWS_PREEMPT_ADDR);
273 }
274
275 static inline void
276 ring_set_paused(const struct intel_engine_cs *engine, int state)
277 {
278         /*
279          * We inspect HWS_PREEMPT with a semaphore inside
280          * engine->emit_fini_breadcrumb. If the dword is true,
281          * the ring is paused as the semaphore will busywait
282          * until the dword is false.
283          */
284         engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state;
285         if (state)
286                 wmb();
287 }
288
289 static inline struct i915_priolist *to_priolist(struct rb_node *rb)
290 {
291         return rb_entry(rb, struct i915_priolist, node);
292 }
293
294 static inline int rq_prio(const struct i915_request *rq)
295 {
296         return READ_ONCE(rq->sched.attr.priority);
297 }
298
299 static int effective_prio(const struct i915_request *rq)
300 {
301         int prio = rq_prio(rq);
302
303         /*
304          * If this request is special and must not be interrupted at any
305          * cost, so be it. Note we are only checking the most recent request
306          * in the context and so may be masking an earlier vip request. It
307          * is hoped that under the conditions where nopreempt is used, this
308          * will not matter (i.e. all requests to that context will be
309          * nopreempt for as long as desired).
310          */
311         if (i915_request_has_nopreempt(rq))
312                 prio = I915_PRIORITY_UNPREEMPTABLE;
313
314         /*
315          * On unwinding the active request, we give it a priority bump
316          * if it has completed waiting on any semaphore. If we know that
317          * the request has already started, we can prevent an unwanted
318          * preempt-to-idle cycle by taking that into account now.
319          */
320         if (__i915_request_has_started(rq))
321                 prio |= I915_PRIORITY_NOSEMAPHORE;
322
323         /* Restrict mere WAIT boosts from triggering preemption */
324         BUILD_BUG_ON(__NO_PREEMPTION & ~I915_PRIORITY_MASK); /* only internal */
325         return prio | __NO_PREEMPTION;
326 }
327
328 static int queue_prio(const struct intel_engine_execlists *execlists)
329 {
330         struct i915_priolist *p;
331         struct rb_node *rb;
332
333         rb = rb_first_cached(&execlists->queue);
334         if (!rb)
335                 return INT_MIN;
336
337         /*
338          * As the priolist[] are inverted, with the highest priority in [0],
339          * we have to flip the index value to become priority.
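          * For instance, if only the first bit of p->used is set, ffs() == 1
          * and the result is ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - 1,
          * i.e. the very top of that user priority band.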
340          */
341         p = to_priolist(rb);
342         return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used);
343 }
344
345 static inline bool need_preempt(const struct intel_engine_cs *engine,
346                                 const struct i915_request *rq,
347                                 struct rb_node *rb)
348 {
349         int last_prio;
350
351         if (!intel_engine_has_semaphores(engine))
352                 return false;
353
354         /*
355          * Check if the current priority hint merits a preemption attempt.
356          *
357          * We record the highest value priority we saw during rescheduling
358          * prior to this dequeue, therefore we know that if it is strictly
359          * less than the current tail of ELSP[0], we do not need to force
360          * a preempt-to-idle cycle.
361          *
362          * However, the priority hint is a mere hint that we may need to
363          * preempt. If that hint is stale or we may be trying to preempt
364          * ourselves, ignore the request.
365          *
366          * More naturally we would write
367          *      prio >= max(0, last);
368          * except that we wish to prevent triggering preemption at the same
369          * priority level: the task that is running should remain running
370          * to preserve FIFO ordering of dependencies.
371          */
372         last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1);
373         if (engine->execlists.queue_priority_hint <= last_prio)
374                 return false;
375
376         /*
377          * Check against the first request in ELSP[1], it will, thanks to the
378          * power of PI, be the highest priority of that context.
379          */
380         if (!list_is_last(&rq->sched.link, &engine->active.requests) &&
381             rq_prio(list_next_entry(rq, sched.link)) > last_prio)
382                 return true;
383
384         if (rb) {
385                 struct virtual_engine *ve =
386                         rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
387                 bool preempt = false;
388
389                 if (engine == ve->siblings[0]) { /* only preempt one sibling */
390                         struct i915_request *next;
391
392                         rcu_read_lock();
393                         next = READ_ONCE(ve->request);
394                         if (next)
395                                 preempt = rq_prio(next) > last_prio;
396                         rcu_read_unlock();
397                 }
398
399                 if (preempt)
400                         return preempt;
401         }
402
403         /*
404          * If the inflight context did not trigger the preemption, then maybe
405          * it was the set of queued requests? Pick the highest priority in
406          * the queue (the first active priolist) and see if it deserves to be
407          * running instead of ELSP[0].
408          *
409          * The highest priority request in the queue cannot be either
410          * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
411          * context, its priority would not exceed ELSP[0] aka last_prio.
412          */
413         return queue_prio(&engine->execlists) > last_prio;
414 }
415
416 __maybe_unused static inline bool
417 assert_priority_queue(const struct i915_request *prev,
418                       const struct i915_request *next)
419 {
420         /*
421          * Without preemption, the prev may refer to the still active element
422          * which we refuse to let go.
423          *
424          * Even with preemption, there are times when we think it is better not
425          * to preempt and leave an ostensibly lower priority request in flight.
426          */
427         if (i915_request_is_active(prev))
428                 return true;
429
430         return rq_prio(prev) >= rq_prio(next);
431 }
432
433 /*
434  * The context descriptor encodes various attributes of a context,
435  * including its GTT address and some flags. Because it's fairly
436  * expensive to calculate, we'll just do it once and cache the result,
437  * which remains valid until the context is unpinned.
438  *
439  * This is what a descriptor looks like, from LSB to MSB::
440  *
441  *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
442  *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
443  *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
444  *      bits 53-54:    mbz, reserved for use by hardware
445  *      bits 55-63:    group ID, currently unused and set to 0
446  *
447  * Starting from Gen11, the upper dword of the descriptor has a new format:
448  *
449  *      bits 32-36:    reserved
450  *      bits 37-47:    SW context ID
451  *      bits 48-53:    engine instance
452  *      bit 54:        mbz, reserved for use by hardware
453  *      bits 55-60:    SW counter
454  *      bits 61-63:    engine class
455  *
456  * engine info, SW context ID and SW counter need to form a unique number
457  * (Context ID) per lrc.
458  */
459 static u32
460 lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine)
461 {
462         u32 desc;
463
464         desc = INTEL_LEGACY_32B_CONTEXT;
465         if (i915_vm_is_4lvl(ce->vm))
466                 desc = INTEL_LEGACY_64B_CONTEXT;
467         desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
468
469         desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
470         if (IS_GEN(engine->i915, 8))
471                 desc |= GEN8_CTX_L3LLC_COHERENT;
472
473         return i915_ggtt_offset(ce->state) | desc;
474 }
475
476 static inline unsigned int dword_in_page(void *addr)
477 {
478         return offset_in_page(addr) / sizeof(u32);
479 }
480
481 static void set_offsets(u32 *regs,
482                         const u8 *data,
483                         const struct intel_engine_cs *engine,
484                         bool clear)
485 #define NOP(x) (BIT(7) | (x))
486 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
487 #define POSTED BIT(0)
488 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
489 #define REG16(x) \
490         (((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
491         (((x) >> 2) & 0x7f)
492 #define END(x) 0, (x)
493 {
494         const u32 base = engine->mmio_base;
495
496         while (*data) {
497                 u8 count, flags;
498
499                 if (*data & BIT(7)) { /* skip */
500                         count = *data++ & ~BIT(7);
501                         if (clear)
502                                 memset32(regs, MI_NOOP, count);
503                         regs += count;
504                         continue;
505                 }
506
507                 count = *data & 0x3f;
508                 flags = *data >> 6;
509                 data++;
510
511                 *regs = MI_LOAD_REGISTER_IMM(count);
512                 if (flags & POSTED)
513                         *regs |= MI_LRI_FORCE_POSTED;
514                 if (INTEL_GEN(engine->i915) >= 11)
515                         *regs |= MI_LRI_CS_MMIO;
516                 regs++;
517
518                 GEM_BUG_ON(!count);
519                 do {
520                         u32 offset = 0;
521                         u8 v;
522
523                         do {
524                                 v = *data++;
525                                 offset <<= 7;
526                                 offset |= v & ~BIT(7);
527                         } while (v & BIT(7));
528
529                         regs[0] = base + (offset << 2);
530                         if (clear)
531                                 regs[1] = 0;
532                         regs += 2;
533                 } while (--count);
534         }
535
536         if (clear) {
537                 u8 count = *++data;
538
539                 /* Clear past the tail for HW access */
540                 GEM_BUG_ON(dword_in_page(regs) > count);
541                 memset32(regs, MI_NOOP, count - dword_in_page(regs));
542
543                 /* Close the batch; used mainly by live_lrc_layout() */
544                 *regs = MI_BATCH_BUFFER_END;
545                 if (INTEL_GEN(engine->i915) >= 10)
546                         *regs |= BIT(0);
547         }
548 }
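/*
 * Worked example of the table encoding consumed by set_offsets() above
 * (illustrative only): the entry sequence
 *
 *      NOP(1), LRI(2, POSTED), REG16(0x244), REG(0x034), END(80)
 *
 * packs into the bytes { 0x81, 0x42, 0x81, 0x11, 0x0d, 0x00, 0x50 } and
 * decodes as: skip one dword, emit MI_LOAD_REGISTER_IMM(2) with
 * MI_LRI_FORCE_POSTED (plus MI_LRI_CS_MMIO on Gen11+), then the offsets
 * mmio_base + 0x244 and mmio_base + 0x034 for the two register/value pairs,
 * and finally terminate; when clearing, the remainder of the 80-dword state
 * is noop-padded and the batch closed with MI_BATCH_BUFFER_END.
 */
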
549
550 static const u8 gen8_xcs_offsets[] = {
551         NOP(1),
552         LRI(11, 0),
553         REG16(0x244),
554         REG(0x034),
555         REG(0x030),
556         REG(0x038),
557         REG(0x03c),
558         REG(0x168),
559         REG(0x140),
560         REG(0x110),
561         REG(0x11c),
562         REG(0x114),
563         REG(0x118),
564
565         NOP(9),
566         LRI(9, 0),
567         REG16(0x3a8),
568         REG16(0x28c),
569         REG16(0x288),
570         REG16(0x284),
571         REG16(0x280),
572         REG16(0x27c),
573         REG16(0x278),
574         REG16(0x274),
575         REG16(0x270),
576
577         NOP(13),
578         LRI(2, 0),
579         REG16(0x200),
580         REG(0x028),
581
582         END(80)
583 };
584
585 static const u8 gen9_xcs_offsets[] = {
586         NOP(1),
587         LRI(14, POSTED),
588         REG16(0x244),
589         REG(0x034),
590         REG(0x030),
591         REG(0x038),
592         REG(0x03c),
593         REG(0x168),
594         REG(0x140),
595         REG(0x110),
596         REG(0x11c),
597         REG(0x114),
598         REG(0x118),
599         REG(0x1c0),
600         REG(0x1c4),
601         REG(0x1c8),
602
603         NOP(3),
604         LRI(9, POSTED),
605         REG16(0x3a8),
606         REG16(0x28c),
607         REG16(0x288),
608         REG16(0x284),
609         REG16(0x280),
610         REG16(0x27c),
611         REG16(0x278),
612         REG16(0x274),
613         REG16(0x270),
614
615         NOP(13),
616         LRI(1, POSTED),
617         REG16(0x200),
618
619         NOP(13),
620         LRI(44, POSTED),
621         REG(0x028),
622         REG(0x09c),
623         REG(0x0c0),
624         REG(0x178),
625         REG(0x17c),
626         REG16(0x358),
627         REG(0x170),
628         REG(0x150),
629         REG(0x154),
630         REG(0x158),
631         REG16(0x41c),
632         REG16(0x600),
633         REG16(0x604),
634         REG16(0x608),
635         REG16(0x60c),
636         REG16(0x610),
637         REG16(0x614),
638         REG16(0x618),
639         REG16(0x61c),
640         REG16(0x620),
641         REG16(0x624),
642         REG16(0x628),
643         REG16(0x62c),
644         REG16(0x630),
645         REG16(0x634),
646         REG16(0x638),
647         REG16(0x63c),
648         REG16(0x640),
649         REG16(0x644),
650         REG16(0x648),
651         REG16(0x64c),
652         REG16(0x650),
653         REG16(0x654),
654         REG16(0x658),
655         REG16(0x65c),
656         REG16(0x660),
657         REG16(0x664),
658         REG16(0x668),
659         REG16(0x66c),
660         REG16(0x670),
661         REG16(0x674),
662         REG16(0x678),
663         REG16(0x67c),
664         REG(0x068),
665
666         END(176)
667 };
668
669 static const u8 gen12_xcs_offsets[] = {
670         NOP(1),
671         LRI(13, POSTED),
672         REG16(0x244),
673         REG(0x034),
674         REG(0x030),
675         REG(0x038),
676         REG(0x03c),
677         REG(0x168),
678         REG(0x140),
679         REG(0x110),
680         REG(0x1c0),
681         REG(0x1c4),
682         REG(0x1c8),
683         REG(0x180),
684         REG16(0x2b4),
685
686         NOP(5),
687         LRI(9, POSTED),
688         REG16(0x3a8),
689         REG16(0x28c),
690         REG16(0x288),
691         REG16(0x284),
692         REG16(0x280),
693         REG16(0x27c),
694         REG16(0x278),
695         REG16(0x274),
696         REG16(0x270),
697
698         END(80)
699 };
700
701 static const u8 gen8_rcs_offsets[] = {
702         NOP(1),
703         LRI(14, POSTED),
704         REG16(0x244),
705         REG(0x034),
706         REG(0x030),
707         REG(0x038),
708         REG(0x03c),
709         REG(0x168),
710         REG(0x140),
711         REG(0x110),
712         REG(0x11c),
713         REG(0x114),
714         REG(0x118),
715         REG(0x1c0),
716         REG(0x1c4),
717         REG(0x1c8),
718
719         NOP(3),
720         LRI(9, POSTED),
721         REG16(0x3a8),
722         REG16(0x28c),
723         REG16(0x288),
724         REG16(0x284),
725         REG16(0x280),
726         REG16(0x27c),
727         REG16(0x278),
728         REG16(0x274),
729         REG16(0x270),
730
731         NOP(13),
732         LRI(1, 0),
733         REG(0x0c8),
734
735         END(80)
736 };
737
738 static const u8 gen9_rcs_offsets[] = {
739         NOP(1),
740         LRI(14, POSTED),
741         REG16(0x244),
742         REG(0x34),
743         REG(0x30),
744         REG(0x38),
745         REG(0x3c),
746         REG(0x168),
747         REG(0x140),
748         REG(0x110),
749         REG(0x11c),
750         REG(0x114),
751         REG(0x118),
752         REG(0x1c0),
753         REG(0x1c4),
754         REG(0x1c8),
755
756         NOP(3),
757         LRI(9, POSTED),
758         REG16(0x3a8),
759         REG16(0x28c),
760         REG16(0x288),
761         REG16(0x284),
762         REG16(0x280),
763         REG16(0x27c),
764         REG16(0x278),
765         REG16(0x274),
766         REG16(0x270),
767
768         NOP(13),
769         LRI(1, 0),
770         REG(0xc8),
771
772         NOP(13),
773         LRI(44, POSTED),
774         REG(0x28),
775         REG(0x9c),
776         REG(0xc0),
777         REG(0x178),
778         REG(0x17c),
779         REG16(0x358),
780         REG(0x170),
781         REG(0x150),
782         REG(0x154),
783         REG(0x158),
784         REG16(0x41c),
785         REG16(0x600),
786         REG16(0x604),
787         REG16(0x608),
788         REG16(0x60c),
789         REG16(0x610),
790         REG16(0x614),
791         REG16(0x618),
792         REG16(0x61c),
793         REG16(0x620),
794         REG16(0x624),
795         REG16(0x628),
796         REG16(0x62c),
797         REG16(0x630),
798         REG16(0x634),
799         REG16(0x638),
800         REG16(0x63c),
801         REG16(0x640),
802         REG16(0x644),
803         REG16(0x648),
804         REG16(0x64c),
805         REG16(0x650),
806         REG16(0x654),
807         REG16(0x658),
808         REG16(0x65c),
809         REG16(0x660),
810         REG16(0x664),
811         REG16(0x668),
812         REG16(0x66c),
813         REG16(0x670),
814         REG16(0x674),
815         REG16(0x678),
816         REG16(0x67c),
817         REG(0x68),
818
819         END(176)
820 };
821
822 static const u8 gen11_rcs_offsets[] = {
823         NOP(1),
824         LRI(15, POSTED),
825         REG16(0x244),
826         REG(0x034),
827         REG(0x030),
828         REG(0x038),
829         REG(0x03c),
830         REG(0x168),
831         REG(0x140),
832         REG(0x110),
833         REG(0x11c),
834         REG(0x114),
835         REG(0x118),
836         REG(0x1c0),
837         REG(0x1c4),
838         REG(0x1c8),
839         REG(0x180),
840
841         NOP(1),
842         LRI(9, POSTED),
843         REG16(0x3a8),
844         REG16(0x28c),
845         REG16(0x288),
846         REG16(0x284),
847         REG16(0x280),
848         REG16(0x27c),
849         REG16(0x278),
850         REG16(0x274),
851         REG16(0x270),
852
853         LRI(1, POSTED),
854         REG(0x1b0),
855
856         NOP(10),
857         LRI(1, 0),
858         REG(0x0c8),
859
860         END(80)
861 };
862
863 static const u8 gen12_rcs_offsets[] = {
864         NOP(1),
865         LRI(13, POSTED),
866         REG16(0x244),
867         REG(0x034),
868         REG(0x030),
869         REG(0x038),
870         REG(0x03c),
871         REG(0x168),
872         REG(0x140),
873         REG(0x110),
874         REG(0x1c0),
875         REG(0x1c4),
876         REG(0x1c8),
877         REG(0x180),
878         REG16(0x2b4),
879
880         NOP(5),
881         LRI(9, POSTED),
882         REG16(0x3a8),
883         REG16(0x28c),
884         REG16(0x288),
885         REG16(0x284),
886         REG16(0x280),
887         REG16(0x27c),
888         REG16(0x278),
889         REG16(0x274),
890         REG16(0x270),
891
892         LRI(3, POSTED),
893         REG(0x1b0),
894         REG16(0x5a8),
895         REG16(0x5ac),
896
897         NOP(6),
898         LRI(1, 0),
899         REG(0x0c8),
900
901         END(80)
902 };
903
904 #undef END
905 #undef REG16
906 #undef REG
907 #undef LRI
908 #undef NOP
909
910 static const u8 *reg_offsets(const struct intel_engine_cs *engine)
911 {
912         /*
913          * The gen12+ lists only have the registers we program in the basic
914          * default state. We rely on the context image using relative
915          * addressing to automatically fix up the register state between the
916          * physical engines for the virtual engine.
917          */
918         GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 &&
919                    !intel_engine_has_relative_mmio(engine));
920
921         if (engine->class == RENDER_CLASS) {
922                 if (INTEL_GEN(engine->i915) >= 12)
923                         return gen12_rcs_offsets;
924                 else if (INTEL_GEN(engine->i915) >= 11)
925                         return gen11_rcs_offsets;
926                 else if (INTEL_GEN(engine->i915) >= 9)
927                         return gen9_rcs_offsets;
928                 else
929                         return gen8_rcs_offsets;
930         } else {
931                 if (INTEL_GEN(engine->i915) >= 12)
932                         return gen12_xcs_offsets;
933                 else if (INTEL_GEN(engine->i915) >= 9)
934                         return gen9_xcs_offsets;
935                 else
936                         return gen8_xcs_offsets;
937         }
938 }
939
940 static struct i915_request *
941 __unwind_incomplete_requests(struct intel_engine_cs *engine)
942 {
943         struct i915_request *rq, *rn, *active = NULL;
944         struct list_head *uninitialized_var(pl);
945         int prio = I915_PRIORITY_INVALID;
946
947         lockdep_assert_held(&engine->active.lock);
948
949         list_for_each_entry_safe_reverse(rq, rn,
950                                          &engine->active.requests,
951                                          sched.link) {
952                 if (i915_request_completed(rq))
953                         continue; /* XXX */
954
955                 __i915_request_unsubmit(rq);
956
957                 /*
958                  * Push the request back into the queue for later resubmission.
959                  * If this request is not native to this physical engine (i.e.
960                  * it came from a virtual source), push it back onto the virtual
961                  * engine so that it can be moved across onto another physical
962                  * engine as load dictates.
963                  */
964                 if (likely(rq->execution_mask == engine->mask)) {
965                         GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
966                         if (rq_prio(rq) != prio) {
967                                 prio = rq_prio(rq);
968                                 pl = i915_sched_lookup_priolist(engine, prio);
969                         }
970                         GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
971
972                         list_move(&rq->sched.link, pl);
973                         set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
974
975                         active = rq;
976                 } else {
977                         struct intel_engine_cs *owner = rq->context->engine;
978
979                         /*
980                          * Decouple the virtual breadcrumb before moving it
981                          * back to the virtual engine -- we don't want the
982                          * request to complete in the background and try
983                          * request to complete in the background and try to
984                          * cancel the breadcrumb on the virtual engine
985                          */
986                         if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
987                                      &rq->fence.flags)) {
988                                 spin_lock_nested(&rq->lock,
989                                                  SINGLE_DEPTH_NESTING);
990                                 i915_request_cancel_breadcrumb(rq);
991                                 spin_unlock(&rq->lock);
992                         }
993                         WRITE_ONCE(rq->engine, owner);
994                         owner->submit_request(rq);
995                         active = NULL;
996                 }
997         }
998
999         return active;
1000 }
1001
1002 struct i915_request *
1003 execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
1004 {
1005         struct intel_engine_cs *engine =
1006                 container_of(execlists, typeof(*engine), execlists);
1007
1008         return __unwind_incomplete_requests(engine);
1009 }
1010
1011 static inline void
1012 execlists_context_status_change(struct i915_request *rq, unsigned long status)
1013 {
1014         /*
1015          * Currently only used when GVT-g is enabled. When GVT-g is disabled,
1016          * the compiler should eliminate this function as dead code.
1017          */
1018         if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
1019                 return;
1020
1021         atomic_notifier_call_chain(&rq->engine->context_status_notifier,
1022                                    status, rq);
1023 }
1024
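/*
 * Engine busyness accounting: stats.active counts the contexts currently on
 * the HW. The busy clock starts when the first context is scheduled in and
 * the elapsed time is folded into stats.total when the last one is scheduled
 * out (see intel_engine_context_out() below).
 */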
1025 static void intel_engine_context_in(struct intel_engine_cs *engine)
1026 {
1027         unsigned long flags;
1028
1029         if (READ_ONCE(engine->stats.enabled) == 0)
1030                 return;
1031
1032         write_seqlock_irqsave(&engine->stats.lock, flags);
1033
1034         if (engine->stats.enabled > 0) {
1035                 if (engine->stats.active++ == 0)
1036                         engine->stats.start = ktime_get();
1037                 GEM_BUG_ON(engine->stats.active == 0);
1038         }
1039
1040         write_sequnlock_irqrestore(&engine->stats.lock, flags);
1041 }
1042
1043 static void intel_engine_context_out(struct intel_engine_cs *engine)
1044 {
1045         unsigned long flags;
1046
1047         if (READ_ONCE(engine->stats.enabled) == 0)
1048                 return;
1049
1050         write_seqlock_irqsave(&engine->stats.lock, flags);
1051
1052         if (engine->stats.enabled > 0) {
1053                 ktime_t last;
1054
1055                 if (engine->stats.active && --engine->stats.active == 0) {
1056                         /*
1057                          * Decrement the active context count and, in case the GPU
1058                          * is now idle, add the elapsed time to the running total.
1059                          */
1060                         last = ktime_sub(ktime_get(), engine->stats.start);
1061
1062                         engine->stats.total = ktime_add(engine->stats.total,
1063                                                         last);
1064                 } else if (engine->stats.active == 0) {
1065                         /*
1066                          * After turning on engine stats, context out might be
1067                          * the first event in which case we account from the
1068                          * time stats gathering was turned on.
1069                          */
1070                         last = ktime_sub(ktime_get(), engine->stats.enabled_at);
1071
1072                         engine->stats.total = ktime_add(engine->stats.total,
1073                                                         last);
1074                 }
1075         }
1076
1077         write_sequnlock_irqrestore(&engine->stats.lock, flags);
1078 }
1079
1080 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
1081 {
1082         if (INTEL_GEN(engine->i915) >= 12)
1083                 return 0x60;
1084         else if (INTEL_GEN(engine->i915) >= 9)
1085                 return 0x54;
1086         else if (engine->class == RENDER_CLASS)
1087                 return 0x58;
1088         else
1089                 return -1;
1090 }
1091
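/*
 * Debug-build sanity check: verify that the context image we are about to
 * hand to the HW still contains plausible ring registers, repairing the
 * image and warning if it does not.
 */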
1092 static void
1093 execlists_check_context(const struct intel_context *ce,
1094                         const struct intel_engine_cs *engine)
1095 {
1096         const struct intel_ring *ring = ce->ring;
1097         u32 *regs = ce->lrc_reg_state;
1098         bool valid = true;
1099         int x;
1100
1101         if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1102                 pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1103                        engine->name,
1104                        regs[CTX_RING_START],
1105                        i915_ggtt_offset(ring->vma));
1106                 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1107                 valid = false;
1108         }
1109
1110         if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1111             (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1112                 pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1113                        engine->name,
1114                        regs[CTX_RING_CTL],
1115                        (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1116                 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1117                 valid = false;
1118         }
1119
1120         x = lrc_ring_mi_mode(engine);
1121         if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1122                 pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1123                        engine->name, regs[x + 1]);
1124                 regs[x + 1] &= ~STOP_RING;
1125                 regs[x + 1] |= STOP_RING << 16;
1126                 valid = false;
1127         }
1128
1129         WARN_ONCE(!valid, "Invalid lrc state found before submission\n");
1130 }
1131
1132 static void restore_default_state(struct intel_context *ce,
1133                                   struct intel_engine_cs *engine)
1134 {
1135         u32 *regs = ce->lrc_reg_state;
1136
1137         if (engine->pinned_default_state)
1138                 memcpy(regs, /* skip restoring the vanilla PPHWSP */
1139                        engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
1140                        engine->context_size - PAGE_SIZE);
1141
1142         execlists_init_reg_state(regs, ce, engine, ce->ring, false);
1143 }
1144
1145 static void reset_active(struct i915_request *rq,
1146                          struct intel_engine_cs *engine)
1147 {
1148         struct intel_context * const ce = rq->context;
1149         u32 head;
1150
1151         /*
1152          * The executing context has been cancelled. We want to prevent
1153          * further execution along this context and propagate the error on
1154          * to anything depending on its results.
1155          *
1156          * In __i915_request_submit(), we apply the -EIO and remove the
1157          * requests' payloads for any banned requests. But first, we must
1158          * rewind the context back to the start of the incomplete request so
1159          * that we do not jump back into the middle of the batch.
1160          *
1161          * We preserve the breadcrumbs and semaphores of the incomplete
1162          * requests so that inter-timeline dependencies (i.e other timelines)
1163          * remain correctly ordered. And we defer to __i915_request_submit()
1164          * so that all asynchronous waits are correctly handled.
1165          */
1166         ENGINE_TRACE(engine, "{ rq=%llx:%lld }\n",
1167                      rq->fence.context, rq->fence.seqno);
1168
1169         /* On resubmission of the active request, payload will be scrubbed */
1170         if (i915_request_completed(rq))
1171                 head = rq->tail;
1172         else
1173                 head = active_request(ce->timeline, rq)->head;
1174         head = intel_ring_wrap(ce->ring, head);
1175
1176         /* Scrub the context image to prevent replaying the previous batch */
1177         restore_default_state(ce, engine);
1178         __execlists_update_reg_state(ce, engine, head);
1179
1180         /* We've switched away, so this should be a no-op, but intent matters */
1181         ce->lrc.desc |= CTX_DESC_FORCE_RESTORE;
1182 }
1183
1184 static u32 intel_context_get_runtime(const struct intel_context *ce)
1185 {
1186         /*
1187          * We can use either ppHWSP[16] which is recorded before the context
1188          * switch (and so excludes the cost of context switches), or the
1189          * value from the context image itself, which is saved/restored earlier
1190          * and so includes the cost of the save.
1191          */
1192         return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]);
1193 }
1194
1195 static void st_update_runtime_underflow(struct intel_context *ce, s32 dt)
1196 {
1197 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1198         ce->runtime.num_underflow += dt < 0;
1199         ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt);
1200 #endif
1201 }
1202
1203 static void intel_context_update_runtime(struct intel_context *ce)
1204 {
1205         u32 old;
1206         s32 dt;
1207
1208         if (intel_context_is_barrier(ce))
1209                 return;
1210
1211         old = ce->runtime.last;
1212         ce->runtime.last = intel_context_get_runtime(ce);
1213         dt = ce->runtime.last - old;
1214
1215         if (unlikely(dt <= 0)) {
1216                 CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
1217                          old, ce->runtime.last, dt);
1218                 st_update_runtime_underflow(ce, dt);
1219                 return;
1220         }
1221
1222         ewma_runtime_add(&ce->runtime.avg, dt);
1223         ce->runtime.total += dt;
1224 }
1225
1226 static inline struct intel_engine_cs *
1227 __execlists_schedule_in(struct i915_request *rq)
1228 {
1229         struct intel_engine_cs * const engine = rq->engine;
1230         struct intel_context * const ce = rq->context;
1231
1232         intel_context_get(ce);
1233
1234         if (unlikely(intel_context_is_banned(ce)))
1235                 reset_active(rq, engine);
1236
1237         if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1238                 execlists_check_context(ce, engine);
1239
1240         if (ce->tag) {
1241                 /* Use a fixed tag for OA and friends */
1242                 GEM_BUG_ON(ce->tag <= BITS_PER_LONG);
1243                 ce->lrc.ccid = ce->tag;
1244         } else {
1245                 /* We don't need a strict matching tag, just different values */
1246                 unsigned int tag = ffs(engine->context_tag);
1247
1248                 GEM_BUG_ON(tag == 0 || tag >= BITS_PER_LONG);
1249                 clear_bit(tag - 1, &engine->context_tag);
1250                 ce->lrc.ccid = tag << (GEN11_SW_CTX_ID_SHIFT - 32);
1251
1252                 BUILD_BUG_ON(BITS_PER_LONG > GEN12_MAX_CONTEXT_HW_ID);
1253         }
1254
1255         ce->lrc.ccid |= engine->execlists.ccid;
1256
1257         __intel_gt_pm_get(engine->gt);
1258         execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
1259         intel_engine_context_in(engine);
1260
1261         return engine;
1262 }
1263
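/*
 * Note that ce->inflight packs the engine on which the context is running
 * together with a small occupancy count (tracking the ELSP ports that still
 * reference it) into the low bits of the pointer, hence the
 * ptr_inc()/ptr_dec() juggling here and in execlists_schedule_out().
 */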
1264 static inline struct i915_request *
1265 execlists_schedule_in(struct i915_request *rq, int idx)
1266 {
1267         struct intel_context * const ce = rq->context;
1268         struct intel_engine_cs *old;
1269
1270         GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine));
1271         trace_i915_request_in(rq, idx);
1272
1273         old = READ_ONCE(ce->inflight);
1274         do {
1275                 if (!old) {
1276                         WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq));
1277                         break;
1278                 }
1279         } while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old)));
1280
1281         GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
1282         return i915_request_get(rq);
1283 }
1284
1285 static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
1286 {
1287         struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
1288         struct i915_request *next = READ_ONCE(ve->request);
1289
1290         if (next && next->execution_mask & ~rq->execution_mask)
1291                 tasklet_schedule(&ve->base.execlists.tasklet);
1292 }
1293
1294 static inline void
1295 __execlists_schedule_out(struct i915_request *rq,
1296                          struct intel_engine_cs * const engine,
1297                          unsigned int ccid)
1298 {
1299         struct intel_context * const ce = rq->context;
1300
1301         /*
1302          * NB process_csb() is not under the engine->active.lock and hence
1303          * schedule_out can race with schedule_in meaning that we should
1304          * refrain from doing non-trivial work here.
1305          */
1306
1307         /*
1308          * If we have just completed this context, the engine may now be
1309          * idle and we want to re-enter powersaving.
1310          */
1311         if (list_is_last_rcu(&rq->link, &ce->timeline->requests) &&
1312             i915_request_completed(rq))
1313                 intel_engine_add_retire(engine, ce->timeline);
1314
1315         ccid >>= GEN11_SW_CTX_ID_SHIFT - 32;
1316         ccid &= GEN12_MAX_CONTEXT_HW_ID;
1317         if (ccid < BITS_PER_LONG) {
1318                 GEM_BUG_ON(ccid == 0);
1319                 GEM_BUG_ON(test_bit(ccid - 1, &engine->context_tag));
1320                 set_bit(ccid - 1, &engine->context_tag);
1321         }
1322
1323         intel_context_update_runtime(ce);
1324         intel_engine_context_out(engine);
1325         execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
1326         intel_gt_pm_put_async(engine->gt);
1327
1328         /*
1329          * If this is part of a virtual engine, its next request may
1330          * have been blocked waiting for access to the active context.
1331          * We have to kick all the siblings again in case we need to
1332          * switch (e.g. the next request is not runnable on this
1333          * engine). Hopefully, we will already have submitted the next
1334          * request before the tasklet runs and do not need to rebuild
1335          * each virtual tree and kick everyone again.
1336          */
1337         if (ce->engine != engine)
1338                 kick_siblings(rq, ce);
1339
1340         intel_context_put(ce);
1341 }
1342
1343 static inline void
1344 execlists_schedule_out(struct i915_request *rq)
1345 {
1346         struct intel_context * const ce = rq->context;
1347         struct intel_engine_cs *cur, *old;
1348         u32 ccid;
1349
1350         trace_i915_request_out(rq);
1351
1352         ccid = rq->context->lrc.ccid;
1353         old = READ_ONCE(ce->inflight);
1354         do
1355                 cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL;
1356         while (!try_cmpxchg(&ce->inflight, &old, cur));
1357         if (!cur)
1358                 __execlists_schedule_out(rq, old, ccid);
1359
1360         i915_request_put(rq);
1361 }
1362
1363 static u64 execlists_update_context(struct i915_request *rq)
1364 {
1365         struct intel_context *ce = rq->context;
1366         u64 desc = ce->lrc.desc;
1367         u32 tail, prev;
1368
1369         /*
1370          * WaIdleLiteRestore:bdw,skl
1371          *
1372          * We should never submit the context with the same RING_TAIL twice
1373          * just in case we submit an empty ring, which confuses the HW.
1374          *
1375          * We append a couple of NOOPs (gen8_emit_wa_tail) after the end of
1376          * the normal request to be able to always advance the RING_TAIL on
1377          * subsequent resubmissions (for lite restore). Should that fail us,
1378          * and we try and submit the same tail again, force the context
1379          * reload.
1380          *
1381          * If we need to return to a preempted context, we need to skip the
1382          * lite-restore and force it to reload the RING_TAIL. Otherwise, the
1383          * HW has a tendency to ignore us rewinding the TAIL to the end of
1384          * an earlier request.
1385          */
1386         tail = intel_ring_set_tail(rq->ring, rq->tail);
1387         prev = ce->lrc_reg_state[CTX_RING_TAIL];
1388         if (unlikely(intel_ring_direction(rq->ring, tail, prev) <= 0))
1389                 desc |= CTX_DESC_FORCE_RESTORE;
1390         ce->lrc_reg_state[CTX_RING_TAIL] = tail;
1391         rq->tail = rq->wa_tail;
1392
1393         /*
1394          * Make sure the context image is complete before we submit it to HW.
1395          *
1396          * Ostensibly, writes (including the WCB) should be flushed prior to
1397          * an uncached write such as our mmio register access, but the empirical
1398          * evidence (esp. on Braswell) suggests that the WC write into memory
1399          * may not be visible to the HW prior to the completion of the UC
1400          * register write and that we may begin execution from the context
1401          * before its image is complete, leading to invalid PD chasing.
1402          */
1403         wmb();
1404
1405         ce->lrc.desc &= ~CTX_DESC_FORCE_RESTORE;
1406         return desc;
1407 }
1408
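/*
 * Write a context descriptor to the submission port: via the ELSQ when a
 * control register is present (Gen11+), otherwise via the legacy ELSP, which
 * expects the upper dword first followed by the lower dword.
 */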
1409 static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
1410 {
1411         if (execlists->ctrl_reg) {
1412                 writel(lower_32_bits(desc), execlists->submit_reg + port * 2);
1413                 writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1);
1414         } else {
1415                 writel(upper_32_bits(desc), execlists->submit_reg);
1416                 writel(lower_32_bits(desc), execlists->submit_reg);
1417         }
1418 }
1419
1420 static __maybe_unused void
1421 trace_ports(const struct intel_engine_execlists *execlists,
1422             const char *msg,
1423             struct i915_request * const *ports)
1424 {
1425         const struct intel_engine_cs *engine =
1426                 container_of(execlists, typeof(*engine), execlists);
1427
1428         if (!ports[0])
1429                 return;
1430
1431         ENGINE_TRACE(engine, "%s { %llx:%lld%s, %llx:%lld }\n", msg,
1432                      ports[0]->fence.context,
1433                      ports[0]->fence.seqno,
1434                      i915_request_completed(ports[0]) ? "!" :
1435                      i915_request_started(ports[0]) ? "*" :
1436                      "",
1437                      ports[1] ? ports[1]->fence.context : 0,
1438                      ports[1] ? ports[1]->fence.seqno : 0);
1439 }
1440
1441 static inline bool
1442 reset_in_progress(const struct intel_engine_execlists *execlists)
1443 {
1444         return unlikely(!__tasklet_is_enabled(&execlists->tasklet));
1445 }
1446
1447 static __maybe_unused bool
1448 assert_pending_valid(const struct intel_engine_execlists *execlists,
1449                      const char *msg)
1450 {
1451         struct i915_request * const *port, *rq;
1452         struct intel_context *ce = NULL;
1453         bool sentinel = false;
1454
1455         trace_ports(execlists, msg, execlists->pending);
1456
1457         /* We may be messing around with the lists during reset, lalala */
1458         if (reset_in_progress(execlists))
1459                 return true;
1460
1461         if (!execlists->pending[0]) {
1462                 GEM_TRACE_ERR("Nothing pending for promotion!\n");
1463                 return false;
1464         }
1465
1466         if (execlists->pending[execlists_num_ports(execlists)]) {
1467                 GEM_TRACE_ERR("Excess pending[%d] for promotion!\n",
1468                               execlists_num_ports(execlists));
1469                 return false;
1470         }
1471
1472         for (port = execlists->pending; (rq = *port); port++) {
1473                 unsigned long flags;
1474                 bool ok = true;
1475
1476                 GEM_BUG_ON(!kref_read(&rq->fence.refcount));
1477                 GEM_BUG_ON(!i915_request_is_active(rq));
1478
1479                 if (ce == rq->context) {
1480                         GEM_TRACE_ERR("Dup context:%llx in pending[%zd]\n",
1481                                       ce->timeline->fence_context,
1482                                       port - execlists->pending);
1483                         return false;
1484                 }
1485                 ce = rq->context;
1486
1487                 /*
1488                  * Sentinels are supposed to be lonely, so they flush the
1489                  * current execution off the HW. Check that they are the
1490                  * only request in the pending submission.
1491                  */
1492                 if (sentinel) {
1493                         GEM_TRACE_ERR("context:%llx after sentinel in pending[%zd]\n",
1494                                       ce->timeline->fence_context,
1495                                       port - execlists->pending);
1496                         return false;
1497                 }
1498
1499                 sentinel = i915_request_has_sentinel(rq);
1500                 if (sentinel && port != execlists->pending) {
1501                         GEM_TRACE_ERR("sentinel context:%llx not in prime position[%zd]\n",
1502                                       ce->timeline->fence_context,
1503                                       port - execlists->pending);
1504                         return false;
1505                 }
1506
1507                 /* Hold tightly onto the lock to prevent concurrent retires! */
1508                 if (!spin_trylock_irqsave(&rq->lock, flags))
1509                         continue;
1510
1511                 if (i915_request_completed(rq))
1512                         goto unlock;
1513
1514                 if (i915_active_is_idle(&ce->active) &&
1515                     !intel_context_is_barrier(ce)) {
1516                         GEM_TRACE_ERR("Inactive context:%llx in pending[%zd]\n",
1517                                       ce->timeline->fence_context,
1518                                       port - execlists->pending);
1519                         ok = false;
1520                         goto unlock;
1521                 }
1522
1523                 if (!i915_vma_is_pinned(ce->state)) {
1524                         GEM_TRACE_ERR("Unpinned context:%llx in pending[%zd]\n",
1525                                       ce->timeline->fence_context,
1526                                       port - execlists->pending);
1527                         ok = false;
1528                         goto unlock;
1529                 }
1530
1531                 if (!i915_vma_is_pinned(ce->ring->vma)) {
1532                         GEM_TRACE_ERR("Unpinned ring:%llx in pending[%zd]\n",
1533                                       ce->timeline->fence_context,
1534                                       port - execlists->pending);
1535                         ok = false;
1536                         goto unlock;
1537                 }
1538
1539 unlock:
1540                 spin_unlock_irqrestore(&rq->lock, flags);
1541                 if (!ok)
1542                         return false;
1543         }
1544
1545         return ce;
1546 }
1547
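/*
 * Write the pending[] context descriptors into the ELSP (or, on newer
 * hardware, into the ELSQ submit queue followed by a load via the control
 * register), highest port first, handing the new submission to the HW.
 */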
1548 static void execlists_submit_ports(struct intel_engine_cs *engine)
1549 {
1550         struct intel_engine_execlists *execlists = &engine->execlists;
1551         unsigned int n;
1552
1553         GEM_BUG_ON(!assert_pending_valid(execlists, "submit"));
1554
1555         /*
1556          * We can skip acquiring intel_runtime_pm_get() here as it was taken
1557          * on our behalf by the request (see i915_gem_mark_busy()) and it will
1558          * not be relinquished until the device is idle (see
1559          * i915_gem_idle_work_handler()). As a precaution, we make sure
1560          * that all ELSP are drained, i.e. we have processed the CSB,
1561          * before allowing ourselves to idle and calling intel_runtime_pm_put().
1562          */
1563         GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
1564
1565         /*
1566          * ELSQ note: the submit queue is not cleared after being submitted
1567          * to the HW so we need to make sure we always clean it up. This is
1568          * currently ensured by the fact that we always write the same number
1569          * of elsq entries, keep this in mind before changing the loop below.
1570          */
1571         for (n = execlists_num_ports(execlists); n--; ) {
1572                 struct i915_request *rq = execlists->pending[n];
1573
1574                 write_desc(execlists,
1575                            rq ? execlists_update_context(rq) : 0,
1576                            n);
1577         }
1578
1579         /* we need to manually load the submit queue */
1580         if (execlists->ctrl_reg)
1581                 writel(EL_CTRL_LOAD, execlists->ctrl_reg);
1582 }
1583
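/*
 * Under GVT-g, certain contexts are flagged for single-port submission and
 * must occupy ELSP[0] on their own, never sharing a port or being paired
 * with a second context.
 */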
1584 static bool ctx_single_port_submission(const struct intel_context *ce)
1585 {
1586         return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
1587                 intel_context_force_single_submission(ce));
1588 }
1589
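/*
 * Two requests may share a single ELSP port only if they belong to the
 * same context and that context is not restricted to single-port
 * submission (see above).
 */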
1590 static bool can_merge_ctx(const struct intel_context *prev,
1591                           const struct intel_context *next)
1592 {
1593         if (prev != next)
1594                 return false;
1595
1596         if (ctx_single_port_submission(prev))
1597                 return false;
1598
1599         return true;
1600 }
1601
1602 static unsigned long i915_request_flags(const struct i915_request *rq)
1603 {
1604         return READ_ONCE(rq->fence.flags);
1605 }
1606
1607 static bool can_merge_rq(const struct i915_request *prev,
1608                          const struct i915_request *next)
1609 {
1610         GEM_BUG_ON(prev == next);
1611         GEM_BUG_ON(!assert_priority_queue(prev, next));
1612
1613         /*
1614          * We do not submit known completed requests. Therefore if the next
1615          * request is already completed, we can pretend to merge it in
1616          * with the previous context (and we will skip updating the ELSP
1617          * and tracking). Thus hopefully keeping the ELSP full with active
1618          * contexts, despite the best efforts of preempt-to-busy to confuse
1619          * us.
1620          */
1621         if (i915_request_completed(next))
1622                 return true;
1623
1624         if (unlikely((i915_request_flags(prev) ^ i915_request_flags(next)) &
1625                      (BIT(I915_FENCE_FLAG_NOPREEMPT) |
1626                       BIT(I915_FENCE_FLAG_SENTINEL))))
1627                 return false;
1628
1629         if (!can_merge_ctx(prev->context, next->context))
1630                 return false;
1631
1632         GEM_BUG_ON(i915_seqno_passed(prev->fence.seqno, next->fence.seqno));
1633         return true;
1634 }
1635
1636 static void virtual_update_register_offsets(u32 *regs,
1637                                             struct intel_engine_cs *engine)
1638 {
1639         set_offsets(regs, reg_offsets(engine), engine, false);
1640 }
1641
1642 static bool virtual_matches(const struct virtual_engine *ve,
1643                             const struct i915_request *rq,
1644                             const struct intel_engine_cs *engine)
1645 {
1646         const struct intel_engine_cs *inflight;
1647
1648         if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */
1649                 return false;
1650
1651         /*
1652          * We track when the HW has completed saving the context image
1653          * (i.e. when we have seen the final CS event switching out of
1654          * the context) and must not overwrite the context image before
1655          * then. This restricts us to only using the active engine
1656          * while the previous virtualized request is inflight (so
1657          * we reuse the register offsets). This is a very small
1658                  * hysteresis on the greedy selection algorithm.
1659          */
1660         inflight = intel_context_inflight(&ve->context);
1661         if (inflight && inflight != engine)
1662                 return false;
1663
1664         return true;
1665 }
1666
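/*
 * The virtual context was listening for breadcrumbs on its previous
 * sibling; as the request moves to a new engine, detach it from the old
 * signal list and flag the request so that signaling is re-enabled upon
 * submission to the new engine.
 */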
1667 static void virtual_xfer_breadcrumbs(struct virtual_engine *ve,
1668                                      struct i915_request *rq)
1669 {
1670         struct intel_engine_cs *old = ve->siblings[0];
1671
1672         /* All unattached (rq->engine == old) must already be completed */
1673
1674         spin_lock(&old->breadcrumbs.irq_lock);
1675         if (!list_empty(&ve->context.signal_link)) {
1676                 list_del_init(&ve->context.signal_link);
1677
1678                 /*
1679                  * We cannot acquire the new engine->breadcrumbs.irq_lock
1680                  * (as we are holding a breadcrumbs.irq_lock already),
1681                  * so attach this request to the signaler on submission.
1682                  * The queued irq_work will occur when we finally drop
1683                  * the engine->active.lock after dequeue.
1684                  */
1685                 set_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &rq->fence.flags);
1686
1687                 /* Also transfer the pending irq_work for the old breadcrumb. */
1688                 intel_engine_signal_breadcrumbs(rq->engine);
1689         }
1690         spin_unlock(&old->breadcrumbs.irq_lock);
1691 }
1692
1693 #define for_each_waiter(p__, rq__) \
1694         list_for_each_entry_lockless(p__, \
1695                                      &(rq__)->sched.waiters_list, \
1696                                      wait_link)
1697
1698 #define for_each_signaler(p__, rq__) \
1699         list_for_each_entry_rcu(p__, \
1700                                 &(rq__)->sched.signalers_list, \
1701                                 signal_link)
1702
1703 static void defer_request(struct i915_request *rq, struct list_head * const pl)
1704 {
1705         LIST_HEAD(list);
1706
1707         /*
1708          * We want to move the interrupted request to the back of
1709          * the round-robin list (i.e. its priority level), but
1710          * in doing so, we must then move all requests that were in
1711          * flight and were waiting for the interrupted request to
1712          * be run after it again.
1713          */
1714         do {
1715                 struct i915_dependency *p;
1716
1717                 GEM_BUG_ON(i915_request_is_active(rq));
1718                 list_move_tail(&rq->sched.link, pl);
1719
1720                 for_each_waiter(p, rq) {
1721                         struct i915_request *w =
1722                                 container_of(p->waiter, typeof(*w), sched);
1723
1724                         if (p->flags & I915_DEPENDENCY_WEAK)
1725                                 continue;
1726
1727                         /* Leave semaphores spinning on the other engines */
1728                         if (w->engine != rq->engine)
1729                                 continue;
1730
1731                         /* No waiter should start before its signaler */
1732                         GEM_BUG_ON(i915_request_started(w) &&
1733                                    !i915_request_completed(rq));
1734
1735                         GEM_BUG_ON(i915_request_is_active(w));
1736                         if (!i915_request_is_ready(w))
1737                                 continue;
1738
1739                         if (rq_prio(w) < rq_prio(rq))
1740                                 continue;
1741
1742                         GEM_BUG_ON(rq_prio(w) > rq_prio(rq));
1743                         list_move_tail(&w->sched.link, &list);
1744                 }
1745
1746                 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
1747         } while (rq);
1748 }
1749
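/*
 * On timeslice expiry, unwind the incomplete active requests and push them
 * (together with any waiters that must follow them) to the back of their
 * priority level, so that the next queued context gets a turn on the HW.
 */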
1750 static void defer_active(struct intel_engine_cs *engine)
1751 {
1752         struct i915_request *rq;
1753
1754         rq = __unwind_incomplete_requests(engine);
1755         if (!rq)
1756                 return;
1757
1758         defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq)));
1759 }
1760
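/*
 * A timeslice is needed when something of equal or higher effective
 * priority than the active request is waiting, either in the priority
 * queue or as the next request along engine->active.requests.
 */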
1761 static bool
1762 need_timeslice(const struct intel_engine_cs *engine,
1763                const struct i915_request *rq)
1764 {
1765         int hint;
1766
1767         if (!intel_engine_has_timeslices(engine))
1768                 return false;
1769
1770         hint = engine->execlists.queue_priority_hint;
1771         if (!list_is_last(&rq->sched.link, &engine->active.requests))
1772                 hint = max(hint, rq_prio(list_next_entry(rq, sched.link)));
1773
1774         return hint >= effective_prio(rq);
1775 }
1776
1777 static bool
1778 timeslice_yield(const struct intel_engine_execlists *el,
1779                 const struct i915_request *rq)
1780 {
1781         /*
1782          * Once bitten, forever smitten!
1783          *
1784          * If the active context ever busy-waited on a semaphore,
1785          * it will be treated as a hog until the end of its timeslice (i.e.
1786          * until it is scheduled out and replaced by a new submission,
1787          * possibly even its own lite-restore). The HW only sends an interrupt
1788          * on the first miss, and we do not know if that semaphore has been
1789          * signaled, or even if it is now stuck on another semaphore. Play
1790          * safe, yield if it might be stuck -- it will be given a fresh
1791          * timeslice in the near future.
1792          */
1793         return rq->context->lrc.ccid == READ_ONCE(el->yield);
1794 }
1795
1796 static bool
1797 timeslice_expired(const struct intel_engine_execlists *el,
1798                   const struct i915_request *rq)
1799 {
1800         return timer_expired(&el->timer) || timeslice_yield(el, rq);
1801 }
1802
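/*
 * Priority of the request that will run on this engine after @rq, or
 * INT_MIN if @rq is the last request currently submitted.
 */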
1803 static int
1804 switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq)
1805 {
1806         if (list_is_last(&rq->sched.link, &engine->active.requests))
1807                 return INT_MIN;
1808
1809         return rq_prio(list_next_entry(rq, sched.link));
1810 }
1811
1812 static inline unsigned long
1813 timeslice(const struct intel_engine_cs *engine)
1814 {
1815         return READ_ONCE(engine->props.timeslice_duration_ms);
1816 }
1817
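/*
 * Return the timeslice duration to program for the currently active
 * request, or 0 (no timeslicing required) if the slot is empty, the
 * request has completed, or nothing of sufficient priority is waiting
 * behind it.
 */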
1818 static unsigned long active_timeslice(const struct intel_engine_cs *engine)
1819 {
1820         const struct intel_engine_execlists *execlists = &engine->execlists;
1821         const struct i915_request *rq = *execlists->active;
1822
1823         if (!rq || i915_request_completed(rq))
1824                 return 0;
1825
1826         if (READ_ONCE(execlists->switch_priority_hint) < effective_prio(rq))
1827                 return 0;
1828
1829         return timeslice(engine);
1830 }
1831
1832 static void set_timeslice(struct intel_engine_cs *engine)
1833 {
1834         if (!intel_engine_has_timeslices(engine))
1835                 return;
1836
1837         set_timer_ms(&engine->execlists.timer, active_timeslice(engine));
1838 }
1839
1840 static void start_timeslice(struct intel_engine_cs *engine)
1841 {
1842         struct intel_engine_execlists *execlists = &engine->execlists;
1843         int prio = queue_prio(execlists);
1844
1845         WRITE_ONCE(execlists->switch_priority_hint, prio);
1846         if (prio == INT_MIN)
1847                 return;
1848
1849         if (timer_pending(&execlists->timer))
1850                 return;
1851
1852         set_timer_ms(&execlists->timer, timeslice(engine));
1853 }
1854
1855 static void record_preemption(struct intel_engine_execlists *execlists)
1856 {
1857         (void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++);
1858 }
1859
1860 static unsigned long active_preempt_timeout(struct intel_engine_cs *engine,
1861                                             const struct i915_request *rq)
1862 {
1863         if (!rq)
1864                 return 0;
1865
1866         /* Force a fast reset for terminated contexts (ignoring sysfs!) */
1867         if (unlikely(intel_context_is_banned(rq->context)))
1868                 return 1;
1869
1870         return READ_ONCE(engine->props.preempt_timeout_ms);
1871 }
1872
1873 static void set_preempt_timeout(struct intel_engine_cs *engine,
1874                                 const struct i915_request *rq)
1875 {
1876         if (!intel_engine_has_preempt_reset(engine))
1877                 return;
1878
1879         set_timer_ms(&engine->execlists.preempt,
1880                      active_preempt_timeout(engine, rq));
1881 }
1882
1883 static inline void clear_ports(struct i915_request **ports, int count)
1884 {
1885         memset_p((void **)ports, NULL, count);
1886 }
1887
1888 static void execlists_dequeue(struct intel_engine_cs *engine)
1889 {
1890         struct intel_engine_execlists * const execlists = &engine->execlists;
1891         struct i915_request **port = execlists->pending;
1892         struct i915_request ** const last_port = port + execlists->port_mask;
1893         struct i915_request * const *active;
1894         struct i915_request *last;
1895         struct rb_node *rb;
1896         bool submit = false;
1897
1898         /*
1899          * Hardware submission is through 2 ports. Conceptually each port
1900          * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
1901          * static for a context, and unique to each, so we only execute
1902          * requests belonging to a single context from each ring. RING_HEAD
1903          * is maintained by the CS in the context image; it marks the place
1904          * where it got up to last time, and through RING_TAIL we tell the CS
1905          * where we want to execute up to this time.
1906          *
1907          * In this list the requests are in order of execution. Consecutive
1908          * requests from the same context are adjacent in the ringbuffer. We
1909          * can combine these requests into a single RING_TAIL update:
1910          *
1911          *              RING_HEAD...req1...req2
1912          *                                    ^- RING_TAIL
1913          * since to execute req2 the CS must first execute req1.
1914          *
1915          * Our goal then is to point each port to the end of a consecutive
1916          * sequence of requests as being the most optimal (fewest wake ups
1917          * and context switches) submission.
1918          */
1919
1920         for (rb = rb_first_cached(&execlists->virtual); rb; ) {
1921                 struct virtual_engine *ve =
1922                         rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1923                 struct i915_request *rq = READ_ONCE(ve->request);
1924
1925                 if (!rq) { /* lazily cleanup after another engine handled rq */
1926                         rb_erase_cached(rb, &execlists->virtual);
1927                         RB_CLEAR_NODE(rb);
1928                         rb = rb_first_cached(&execlists->virtual);
1929                         continue;
1930                 }
1931
1932                 if (!virtual_matches(ve, rq, engine)) {
1933                         rb = rb_next(rb);
1934                         continue;
1935                 }
1936
1937                 break;
1938         }
1939
1940         /*
1941          * If the queue is higher priority than the last
1942          * request in the currently active context, submit afresh.
1943          * We will resubmit again afterwards in case we need to split
1944          * the active context to interject the preemption request,
1945          * i.e. we will retrigger preemption following the ack in case
1946          * of trouble.
1947          */
1948         active = READ_ONCE(execlists->active);
1949         while ((last = *active) && i915_request_completed(last))
1950                 active++;
1951
1952         if (last) {
1953                 if (need_preempt(engine, last, rb)) {
1954                         ENGINE_TRACE(engine,
1955                                      "preempting last=%llx:%lld, prio=%d, hint=%d\n",
1956                                      last->fence.context,
1957                                      last->fence.seqno,
1958                                      last->sched.attr.priority,
1959                                      execlists->queue_priority_hint);
1960                         record_preemption(execlists);
1961
1962                         /*
1963                          * Don't let the RING_HEAD advance past the breadcrumb
1964                          * as we unwind (and until we resubmit) so that we do
1965                          * not accidentally tell it to go backwards.
1966                          */
1967                         ring_set_paused(engine, 1);
1968
1969                         /*
1970                          * Note that we have not stopped the GPU at this point,
1971                          * so we are unwinding the incomplete requests as they
1972                          * remain inflight and so by the time we do complete
1973                          * the preemption, some of the unwound requests may
1974                          * complete!
1975                          */
1976                         __unwind_incomplete_requests(engine);
1977
1978                         last = NULL;
1979                 } else if (need_timeslice(engine, last) &&
1980                            timeslice_expired(execlists, last)) {
1981                         ENGINE_TRACE(engine,
1982                                      "expired last=%llx:%lld, prio=%d, hint=%d, yield?=%s\n",
1983                                      last->fence.context,
1984                                      last->fence.seqno,
1985                                      last->sched.attr.priority,
1986                                      execlists->queue_priority_hint,
1987                                      yesno(timeslice_yield(execlists, last)));
1988
1989                         ring_set_paused(engine, 1);
1990                         defer_active(engine);
1991
1992                         /*
1993                          * Unlike for preemption, if we rewind and continue
1994                          * executing the same context as previously active,
1995                          * the order of execution will remain the same and
1996                          * the tail will only advance. We do not need to
1997                          * force a full context restore, as a lite-restore
1998                          * is sufficient to resample the monotonic TAIL.
1999                          *
2000                          * If we switch to any other context, similarly we
2001                          * will not rewind TAIL of current context, and
2002                          * normal save/restore will preserve state and allow
2003                          * us to later continue executing the same request.
2004                          */
2005                         last = NULL;
2006                 } else {
2007                         /*
2008                          * Otherwise if we already have a request pending
2009                          * for execution after the current one, we can
2010                          * just wait until the next CS event before
2011                          * queuing more. In either case we will force a
2012                          * lite-restore preemption event, but if we wait
2013                          * we hopefully coalesce several updates into a single
2014                          * submission.
2015                          */
2016                         if (!list_is_last(&last->sched.link,
2017                                           &engine->active.requests)) {
2018                                 /*
2019                                  * Even if ELSP[1] is occupied and not worthy
2020                                  * of timeslices, our queue might be.
2021                                  */
2022                                 start_timeslice(engine);
2023                                 return;
2024                         }
2025                 }
2026         }
2027
2028         while (rb) { /* XXX virtual is always taking precedence */
2029                 struct virtual_engine *ve =
2030                         rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
2031                 struct i915_request *rq;
2032
2033                 spin_lock(&ve->base.active.lock);
2034
2035                 rq = ve->request;
2036                 if (unlikely(!rq)) { /* lost the race to a sibling */
2037                         spin_unlock(&ve->base.active.lock);
2038                         rb_erase_cached(rb, &execlists->virtual);
2039                         RB_CLEAR_NODE(rb);
2040                         rb = rb_first_cached(&execlists->virtual);
2041                         continue;
2042                 }
2043
2044                 GEM_BUG_ON(rq != ve->request);
2045                 GEM_BUG_ON(rq->engine != &ve->base);
2046                 GEM_BUG_ON(rq->context != &ve->context);
2047
2048                 if (rq_prio(rq) >= queue_prio(execlists)) {
2049                         if (!virtual_matches(ve, rq, engine)) {
2050                                 spin_unlock(&ve->base.active.lock);
2051                                 rb = rb_next(rb);
2052                                 continue;
2053                         }
2054
2055                         if (last && !can_merge_rq(last, rq)) {
2056                                 spin_unlock(&ve->base.active.lock);
2057                                 start_timeslice(engine);
2058                                 return; /* leave this for another sibling */
2059                         }
2060
2061                         ENGINE_TRACE(engine,
2062                                      "virtual rq=%llx:%lld%s, new engine? %s\n",
2063                                      rq->fence.context,
2064                                      rq->fence.seqno,
2065                                      i915_request_completed(rq) ? "!" :
2066                                      i915_request_started(rq) ? "*" :
2067                                      "",
2068                                      yesno(engine != ve->siblings[0]));
2069
2070                         WRITE_ONCE(ve->request, NULL);
2071                         WRITE_ONCE(ve->base.execlists.queue_priority_hint,
2072                                    INT_MIN);
2073                         rb_erase_cached(rb, &execlists->virtual);
2074                         RB_CLEAR_NODE(rb);
2075
2076                         GEM_BUG_ON(!(rq->execution_mask & engine->mask));
2077                         WRITE_ONCE(rq->engine, engine);
2078
2079                         if (engine != ve->siblings[0]) {
2080                                 u32 *regs = ve->context.lrc_reg_state;
2081                                 unsigned int n;
2082
2083                                 GEM_BUG_ON(READ_ONCE(ve->context.inflight));
2084
2085                                 if (!intel_engine_has_relative_mmio(engine))
2086                                         virtual_update_register_offsets(regs,
2087                                                                         engine);
2088
2089                                 if (!list_empty(&ve->context.signals))
2090                                         virtual_xfer_breadcrumbs(ve, rq);
2091
2092                                 /*
2093                                  * Move the bound engine to the top of the list
2094                                  * for future execution. We then kick this
2095                                  * tasklet first before checking others, so that
2096                                  * we preferentially reuse this set of bound
2097                                  * registers.
2098                                  */
2099                                 for (n = 1; n < ve->num_siblings; n++) {
2100                                         if (ve->siblings[n] == engine) {
2101                                                 swap(ve->siblings[n],
2102                                                      ve->siblings[0]);
2103                                                 break;
2104                                         }
2105                                 }
2106
2107                                 GEM_BUG_ON(ve->siblings[0] != engine);
2108                         }
2109
2110                         if (__i915_request_submit(rq)) {
2111                                 submit = true;
2112                                 last = rq;
2113                         }
2114                         i915_request_put(rq);
2115
2116                         /*
2117                          * Hmm, we have a bunch of virtual engine requests,
2118                          * but the first one was already completed (thanks
2119                          * preempt-to-busy!). Keep looking at the virtual engine queue
2120                          * until we have no more relevant requests (i.e.
2121                          * the normal submit queue has higher priority).
2122                          */
2123                         if (!submit) {
2124                                 spin_unlock(&ve->base.active.lock);
2125                                 rb = rb_first_cached(&execlists->virtual);
2126                                 continue;
2127                         }
2128                 }
2129
2130                 spin_unlock(&ve->base.active.lock);
2131                 break;
2132         }
2133
2134         while ((rb = rb_first_cached(&execlists->queue))) {
2135                 struct i915_priolist *p = to_priolist(rb);
2136                 struct i915_request *rq, *rn;
2137                 int i;
2138
2139                 priolist_for_each_request_consume(rq, rn, p, i) {
2140                         bool merge = true;
2141
2142                         /*
2143                          * Can we combine this request with the current port?
2144                          * It has to be the same context/ringbuffer and not
2145                          * have any exceptions (e.g. GVT saying never to
2146                          * combine contexts).
2147                          *
2148                          * If we can combine the requests, we can execute both
2149                          * by updating the RING_TAIL to point to the end of the
2150                          * second request, and so we never need to tell the
2151                          * hardware about the first.
2152                          */
2153                         if (last && !can_merge_rq(last, rq)) {
2154                                 /*
2155                                  * If we are on the second port and cannot
2156                                  * combine this request with the last, then we
2157                                  * are done.
2158                                  */
2159                                 if (port == last_port)
2160                                         goto done;
2161
2162                                 /*
2163                                  * We must not populate both ELSP[] with the
2164                                  * same LRCA, i.e. we must submit 2 different
2165                                  * contexts if we submit 2 ELSP.
2166                                  */
2167                                 if (last->context == rq->context)
2168                                         goto done;
2169
2170                                 if (i915_request_has_sentinel(last))
2171                                         goto done;
2172
2173                                 /*
2174                                  * If GVT overrides us we only ever submit
2175                                  * port[0], leaving port[1] empty. Note that we
2176                                  * also have to be careful that we don't queue
2177                                  * the same context (even though a different
2178                                  * request) to the second port.
2179                                  */
2180                                 if (ctx_single_port_submission(last->context) ||
2181                                     ctx_single_port_submission(rq->context))
2182                                         goto done;
2183
2184                                 merge = false;
2185                         }
2186
2187                         if (__i915_request_submit(rq)) {
2188                                 if (!merge) {
2189                                         *port = execlists_schedule_in(last, port - execlists->pending);
2190                                         port++;
2191                                         last = NULL;
2192                                 }
2193
2194                                 GEM_BUG_ON(last &&
2195                                            !can_merge_ctx(last->context,
2196                                                           rq->context));
2197                                 GEM_BUG_ON(last &&
2198                                            i915_seqno_passed(last->fence.seqno,
2199                                                              rq->fence.seqno));
2200
2201                                 submit = true;
2202                                 last = rq;
2203                         }
2204                 }
2205
2206                 rb_erase_cached(&p->node, &execlists->queue);
2207                 i915_priolist_free(p);
2208         }
2209
2210 done:
2211         /*
2212          * Here be a bit of magic! Or sleight-of-hand, whichever you prefer.
2213          *
2214          * We choose the priority hint such that if we add a request of greater
2215          * priority than this, we kick the submission tasklet to decide on
2216          * the right order of submitting the requests to hardware. We must
2217          * also be prepared to reorder requests as they are in-flight on the
2218          * HW. We derive the priority hint then as the first "hole" in
2219          * the HW submission ports and if there are no available slots,
2220          * the priority of the lowest executing request, i.e. last.
2221          *
2222          * When we do receive a higher priority request ready to run from the
2223          * user, see queue_request(), the priority hint is bumped to that
2224          * request triggering preemption on the next dequeue (or subsequent
2225          * interrupt for secondary ports).
2226          */
2227         execlists->queue_priority_hint = queue_prio(execlists);
2228
2229         if (submit) {
2230                 *port = execlists_schedule_in(last, port - execlists->pending);
2231                 execlists->switch_priority_hint =
2232                         switch_prio(engine, *execlists->pending);
2233
2234                 /*
2235                  * Skip if we ended up with exactly the same set of requests,
2236                  * e.g. trying to timeslice a pair of ordered contexts
2237                  */
2238                 if (!memcmp(active, execlists->pending,
2239                             (port - execlists->pending + 1) * sizeof(*port))) {
2240                         do
2241                                 execlists_schedule_out(fetch_and_zero(port));
2242                         while (port-- != execlists->pending);
2243
2244                         goto skip_submit;
2245                 }
2246                 clear_ports(port + 1, last_port - port);
2247
2248                 WRITE_ONCE(execlists->yield, -1);
2249                 execlists_submit_ports(engine);
2250                 set_preempt_timeout(engine, *active);
2251         } else {
2252 skip_submit:
2253                 ring_set_paused(engine, 0);
2254         }
2255 }
2256
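/*
 * Drop all requests currently tracked in pending[] and inflight[] (used
 * during reset), then point execlists->active back at the emptied
 * inflight[] array.
 */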
2257 static void
2258 cancel_port_requests(struct intel_engine_execlists * const execlists)
2259 {
2260         struct i915_request * const *port;
2261
2262         for (port = execlists->pending; *port; port++)
2263                 execlists_schedule_out(*port);
2264         clear_ports(execlists->pending, ARRAY_SIZE(execlists->pending));
2265
2266         /* Mark the end of active before we overwrite *active */
2267         for (port = xchg(&execlists->active, execlists->pending); *port; port++)
2268                 execlists_schedule_out(*port);
2269         clear_ports(execlists->inflight, ARRAY_SIZE(execlists->inflight));
2270
2271         smp_wmb(); /* complete the seqlock for execlists_active() */
2272         WRITE_ONCE(execlists->active, execlists->inflight);
2273 }
2274
2275 static inline void
2276 invalidate_csb_entries(const u32 *first, const u32 *last)
2277 {
2278         clflush((void *)first);
2279         clflush((void *)last);
2280 }
2281
2282 /*
2283  * Starting with Gen12, the status has a new format:
2284  *
2285  *     bit  0:     switched to new queue
2286  *     bit  1:     reserved
2287  *     bit  2:     semaphore wait mode (poll or signal), only valid when
2288  *                 switch detail is set to "wait on semaphore"
2289  *     bits 3-5:   engine class
2290  *     bits 6-11:  engine instance
2291  *     bits 12-14: reserved
2292  *     bits 15-25: sw context id of the lrc the GT switched to
2293  *     bits 26-31: sw counter of the lrc the GT switched to
2294  *     bits 32-35: context switch detail
2295  *                  - 0: ctx complete
2296  *                  - 1: wait on sync flip
2297  *                  - 2: wait on vblank
2298  *                  - 3: wait on scanline
2299  *                  - 4: wait on semaphore
2300  *                  - 5: context preempted (not on SEMAPHORE_WAIT or
2301  *                       WAIT_FOR_EVENT)
2302  *     bit  36:    reserved
2303  *     bits 37-43: wait detail (for switch detail 1 to 4)
2304  *     bits 44-46: reserved
2305  *     bits 47-57: sw context id of the lrc the GT switched away from
2306  *     bits 58-63: sw counter of the lrc the GT switched away from
2307  */
2308 static inline bool
2309 gen12_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
2310 {
2311         u32 lower_dw = csb[0];
2312         u32 upper_dw = csb[1];
2313         bool ctx_to_valid = GEN12_CSB_CTX_VALID(lower_dw);
2314         bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_dw);
2315         bool new_queue = lower_dw & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE;
2316
2317         /*
2318          * The context switch detail is not guaranteed to be 5 when a preemption
2319          * occurs, so we can't just check for that. The check below works for
2320          * all the cases we care about, including preemptions of WAIT
2321          * instructions and lite-restore. Preempt-to-idle via the CTRL register
2322          * would require some extra handling, but we don't support that.
2323          */
2324         if (!ctx_away_valid || new_queue) {
2325                 GEM_BUG_ON(!ctx_to_valid);
2326                 return true;
2327         }
2328
2329         /*
2330          * switch detail = 5 is covered by the case above and we do not expect a
2331          * context switch on an unsuccessful wait instruction since we always
2332          * use polling mode.
2333          */
2334         GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_dw));
2335         return false;
2336 }
2337
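/*
 * Pre-Gen12 CSB format: either an idle->active transition or a preemption
 * event means the pending[] submission has been promoted onto the HW.
 */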
2338 static inline bool
2339 gen8_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
2340 {
2341         return *csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED);
2342 }
2343
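/*
 * Drain the context status buffer (CSB) events written by the HW,
 * promoting pending[] to inflight[] on a context switch and scheduling out
 * completed ports, keeping our execlists->active tracking in lockstep with
 * the hardware.
 */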
2344 static void process_csb(struct intel_engine_cs *engine)
2345 {
2346         struct intel_engine_execlists * const execlists = &engine->execlists;
2347         const u32 * const buf = execlists->csb_status;
2348         const u8 num_entries = execlists->csb_size;
2349         u8 head, tail;
2350
2351         /*
2352          * As we modify our execlists state tracking we require exclusive
2353          * access. Either we are inside the tasklet, or the tasklet is disabled
2354          * and we assume that is only inside the reset paths and so serialised.
2355          */
2356         GEM_BUG_ON(!tasklet_is_locked(&execlists->tasklet) &&
2357                    !reset_in_progress(execlists));
2358         GEM_BUG_ON(!intel_engine_in_execlists_submission_mode(engine));
2359
2360         /*
2361          * Note that csb_write, csb_status may be either in HWSP or mmio.
2362          * When reading from the csb_write mmio register, we have to be
2363          * careful to only use the GEN8_CSB_WRITE_PTR portion, which is
2364          * the low 4 bits. As it happens we know the next 4 bits are always
2365          * zero and so we can simply mask off the low u8 of the register
2366          * and treat it identically to reading from the HWSP (without having
2367          * to use explicit shifting and masking, and probably bifurcating
2368          * the code to handle the legacy mmio read).
2369          */
2370         head = execlists->csb_head;
2371         tail = READ_ONCE(*execlists->csb_write);
2372         if (unlikely(head == tail))
2373                 return;
2374
2375         /*
2376          * Hopefully paired with a wmb() in HW!
2377          *
2378          * We must complete the read of the write pointer before any reads
2379          * from the CSB, so that we do not see stale values. Without an rmb
2380          * (lfence) the HW may speculatively perform the CSB[] reads *before*
2381          * we perform the READ_ONCE(*csb_write).
2382          */
2383         rmb();
2384
2385         ENGINE_TRACE(engine, "cs-irq head=%d, tail=%d\n", head, tail);
2386         do {
2387                 bool promote;
2388
2389                 if (++head == num_entries)
2390                         head = 0;
2391
2392                 /*
2393                  * We are flying near dragons again.
2394                  *
2395                  * We hold a reference to the request in execlist_port[]
2396                  * but no more than that. We are operating in softirq
2397                  * context and so cannot hold any mutex or sleep. That
2398                  * means we cannot stop the requests we are processing
2399                  * in port[] from being retired simultaneously (the
2400                  * breadcrumb will be complete before we see the
2401                  * context-switch). As we only hold the reference to the
2402                  * request, any pointer chasing underneath the request
2403                  * is subject to a potential use-after-free. Thus we
2404                  * store all of the bookkeeping within port[] as
2405                  * required, and avoid using unguarded pointers beneath
2406                  * request itself. The same applies to the atomic
2407                  * status notifier.
2408                  */
2409
2410                 ENGINE_TRACE(engine, "csb[%d]: status=0x%08x:0x%08x\n",
2411                              head, buf[2 * head + 0], buf[2 * head + 1]);
2412
2413                 if (INTEL_GEN(engine->i915) >= 12)
2414                         promote = gen12_csb_parse(execlists, buf + 2 * head);
2415                 else
2416                         promote = gen8_csb_parse(execlists, buf + 2 * head);
2417                 if (promote) {
2418                         struct i915_request * const *old = execlists->active;
2419
2420                         GEM_BUG_ON(!assert_pending_valid(execlists, "promote"));
2421
2422                         ring_set_paused(engine, 0);
2423
2424                         /* Point active to the new ELSP; prevent overwriting */
2425                         WRITE_ONCE(execlists->active, execlists->pending);
2426                         smp_wmb(); /* notify execlists_active() */
2427
2428                         /* cancel old inflight, prepare for switch */
2429                         trace_ports(execlists, "preempted", old);
2430                         while (*old)
2431                                 execlists_schedule_out(*old++);
2432
2433                         /* switch pending to inflight */
2434                         memcpy(execlists->inflight,
2435                                execlists->pending,
2436                                execlists_num_ports(execlists) *
2437                                sizeof(*execlists->pending));
2438                         smp_wmb(); /* complete the seqlock */
2439                         WRITE_ONCE(execlists->active, execlists->inflight);
2440
2441                         WRITE_ONCE(execlists->pending[0], NULL);
2442                 } else {
2443                         GEM_BUG_ON(!*execlists->active);
2444
2445                         /* port0 completed, advanced to port1 */
2446                         trace_ports(execlists, "completed", execlists->active);
2447
2448                         /*
2449                          * We rely on the hardware being strongly
2450                          * ordered, that the breadcrumb write is
2451                          * coherent (visible from the CPU) before the
2452                          * user interrupt and CSB is processed.
2453                          */
2454                         if (GEM_SHOW_DEBUG() &&
2455                             !i915_request_completed(*execlists->active) &&
2456                             !reset_in_progress(execlists)) {
2457                                 struct i915_request *rq __maybe_unused =
2458                                         *execlists->active;
2459                                 const u32 *regs __maybe_unused =
2460                                         rq->context->lrc_reg_state;
2461
2462                                 ENGINE_TRACE(engine,
2463                                              "ring:{start:0x%08x, head:%04x, tail:%04x, ctl:%08x, mode:%08x}\n",
2464                                              ENGINE_READ(engine, RING_START),
2465                                              ENGINE_READ(engine, RING_HEAD) & HEAD_ADDR,
2466                                              ENGINE_READ(engine, RING_TAIL) & TAIL_ADDR,
2467                                              ENGINE_READ(engine, RING_CTL),
2468                                              ENGINE_READ(engine, RING_MI_MODE));
2469                                 ENGINE_TRACE(engine,
2470                                              "rq:{start:%08x, head:%04x, tail:%04x, seqno:%llx:%d, hwsp:%d}, ",
2471                                              i915_ggtt_offset(rq->ring->vma),
2472                                              rq->head, rq->tail,
2473                                              rq->fence.context,
2474                                              lower_32_bits(rq->fence.seqno),
2475                                              hwsp_seqno(rq));
2476                                 ENGINE_TRACE(engine,
2477                                              "ctx:{start:%08x, head:%04x, tail:%04x}, ",
2478                                              regs[CTX_RING_START],
2479                                              regs[CTX_RING_HEAD],
2480                                              regs[CTX_RING_TAIL]);
2481
2482                                 GEM_BUG_ON("context completed before request");
2483                         }
2484
2485                         execlists_schedule_out(*execlists->active++);
2486
2487                         GEM_BUG_ON(execlists->active - execlists->inflight >
2488                                    execlists_num_ports(execlists));
2489                 }
2490         } while (head != tail);
2491
2492         execlists->csb_head = head;
2493         set_timeslice(engine);
2494
2495         /*
2496          * Gen11 has proven to fail wrt global observation point between
2497          * entry and tail update, failing on the ordering and thus
2498          * we see an old entry in the context status buffer.
2499          *
2500          * Forcibly evict out entries for the next gpu csb update,
2501          * to increase the odds that we get fresh entries despite the
2502          * non-working hardware. The cost of doing so mostly comes out in
2503          * the wash, as the hardware, working or not, will need to do the
2504          * invalidation anyway.
2505          */
2506         invalidate_csb_entries(&buf[0], &buf[num_entries - 1]);
2507 }
2508
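/*
 * With engine->active.lock held, only dequeue more work once the previous
 * ELSP write has been consumed by the HW, i.e. pending[0] has been cleared
 * by process_csb().
 */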
2509 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
2510 {
2511         lockdep_assert_held(&engine->active.lock);
2512         if (!READ_ONCE(engine->execlists.pending[0])) {
2513                 rcu_read_lock(); /* protect peeking at execlists->active */
2514                 execlists_dequeue(engine);
2515                 rcu_read_unlock();
2516         }
2517 }
2518
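/*
 * Suspend @rq, and every ready waiter of it on this engine, by moving them
 * from the scheduler lists onto engine->active.hold, unsubmitting any that
 * had already been passed to the HW.
 */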
2519 static void __execlists_hold(struct i915_request *rq)
2520 {
2521         LIST_HEAD(list);
2522
2523         do {
2524                 struct i915_dependency *p;
2525
2526                 if (i915_request_is_active(rq))
2527                         __i915_request_unsubmit(rq);
2528
2529                 clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2530                 list_move_tail(&rq->sched.link, &rq->engine->active.hold);
2531                 i915_request_set_hold(rq);
2532                 RQ_TRACE(rq, "on hold\n");
2533
2534                 for_each_waiter(p, rq) {
2535                         struct i915_request *w =
2536                                 container_of(p->waiter, typeof(*w), sched);
2537
2538                         /* Leave semaphores spinning on the other engines */
2539                         if (w->engine != rq->engine)
2540                                 continue;
2541
2542                         if (!i915_request_is_ready(w))
2543                                 continue;
2544
2545                         if (i915_request_completed(w))
2546                                 continue;
2547
2548                         if (i915_request_on_hold(w))
2549                                 continue;
2550
2551                         list_move_tail(&w->sched.link, &list);
2552                 }
2553
2554                 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2555         } while (rq);
2556 }
2557
2558 static bool execlists_hold(struct intel_engine_cs *engine,
2559                            struct i915_request *rq)
2560 {
2561         spin_lock_irq(&engine->active.lock);
2562
2563         if (i915_request_completed(rq)) { /* too late! */
2564                 rq = NULL;
2565                 goto unlock;
2566         }
2567
2568         if (rq->engine != engine) { /* preempted virtual engine */
2569                 struct virtual_engine *ve = to_virtual_engine(rq->engine);
2570
2571                 /*
2572                  * intel_context_inflight() is only protected by virtue
2573                  * of process_csb() being called only by the tasklet (or
2574                  * directly from inside reset while the tasklet is suspended).
2575                  * Assert that neither of those are allowed to run while we
2576                  * poke at the request queues.
2577                  */
2578                 GEM_BUG_ON(!reset_in_progress(&engine->execlists));
2579
2580                 /*
2581                  * An unsubmitted request along a virtual engine will
2582                  * remain on the active (this) engine until we are able
2583                  * to process the context switch away (and so mark the
2584                  * context as no longer in flight). That cannot have happened
2585                  * yet, otherwise we would not be hanging!
2586                  */
2587                 spin_lock(&ve->base.active.lock);
2588                 GEM_BUG_ON(intel_context_inflight(rq->context) != engine);
2589                 GEM_BUG_ON(ve->request != rq);
2590                 ve->request = NULL;
2591                 spin_unlock(&ve->base.active.lock);
2592                 i915_request_put(rq);
2593
2594                 rq->engine = engine;
2595         }
2596
2597         /*
2598          * Transfer this request onto the hold queue to prevent it
2599          * being resubmitted to HW (and potentially completed) before we have
2600          * released it. Since we may have already submitted following
2601          * requests, we need to remove those as well.
2602          */
2603         GEM_BUG_ON(i915_request_on_hold(rq));
2604         GEM_BUG_ON(rq->engine != engine);
2605         __execlists_hold(rq);
2606         GEM_BUG_ON(list_empty(&engine->active.hold));
2607
2608 unlock:
2609         spin_unlock_irq(&engine->active.lock);
2610         return rq;
2611 }
2612
2613 static bool hold_request(const struct i915_request *rq)
2614 {
2615         struct i915_dependency *p;
2616         bool result = false;
2617
2618         /*
2619          * If one of our ancestors is on hold, we must also be on hold,
2620          * otherwise we will bypass it and execute before it.
2621          */
2622         rcu_read_lock();
2623         for_each_signaler(p, rq) {
2624                 const struct i915_request *s =
2625                         container_of(p->signaler, typeof(*s), sched);
2626
2627                 if (s->engine != rq->engine)
2628                         continue;
2629
2630                 result = i915_request_on_hold(s);
2631                 if (result)
2632                         break;
2633         }
2634         rcu_read_unlock();
2635
2636         return result;
2637 }
2638
2639 static void __execlists_unhold(struct i915_request *rq)
2640 {
2641         LIST_HEAD(list);
2642
2643         do {
2644                 struct i915_dependency *p;
2645
2646                 RQ_TRACE(rq, "hold release\n");
2647
2648                 GEM_BUG_ON(!i915_request_on_hold(rq));
2649                 GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit));
2650
2651                 i915_request_clear_hold(rq);
2652                 list_move_tail(&rq->sched.link,
2653                                i915_sched_lookup_priolist(rq->engine,
2654                                                           rq_prio(rq)));
2655                 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2656
2657                 /* Also release any children on this engine that are ready */
2658                 for_each_waiter(p, rq) {
2659                         struct i915_request *w =
2660                                 container_of(p->waiter, typeof(*w), sched);
2661
2662                         /* Propagate any change in error status */
2663                         if (rq->fence.error)
2664                                 i915_request_set_error_once(w, rq->fence.error);
2665
2666                         if (w->engine != rq->engine)
2667                                 continue;
2668
2669                         if (!i915_request_on_hold(w))
2670                                 continue;
2671
2672                         /* Check that no other parents are also on hold */
2673                         if (hold_request(w))
2674                                 continue;
2675
2676                         list_move_tail(&w->sched.link, &list);
2677                 }
2678
2679                 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2680         } while (rq);
2681 }
2682
2683 static void execlists_unhold(struct intel_engine_cs *engine,
2684                              struct i915_request *rq)
2685 {
2686         spin_lock_irq(&engine->active.lock);
2687
2688         /*
2689          * Move this request back to the priority queue, and all of its
2690          * children and grandchildren that were suspended along with it.
2691          */
2692         __execlists_unhold(rq);
2693
2694         if (rq_prio(rq) > engine->execlists.queue_priority_hint) {
2695                 engine->execlists.queue_priority_hint = rq_prio(rq);
2696                 tasklet_hi_schedule(&engine->execlists.tasklet);
2697         }
2698
2699         spin_unlock_irq(&engine->active.lock);
2700 }
2701
2702 struct execlists_capture {
2703         struct work_struct work;
2704         struct i915_request *rq;
2705         struct i915_gpu_coredump *error;
2706 };
2707
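/*
 * Deferred (process context) part of the error capture for a forced
 * preemption timeout: compress the objects attached to the request,
 * publish the error state, and then release the request we held back from
 * signaling.
 */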
2708 static void execlists_capture_work(struct work_struct *work)
2709 {
2710         struct execlists_capture *cap = container_of(work, typeof(*cap), work);
2711         const gfp_t gfp = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN;
2712         struct intel_engine_cs *engine = cap->rq->engine;
2713         struct intel_gt_coredump *gt = cap->error->gt;
2714         struct intel_engine_capture_vma *vma;
2715
2716         /* Compress all the objects attached to the request, slow! */
2717         vma = intel_engine_coredump_add_request(gt->engine, cap->rq, gfp);
2718         if (vma) {
2719                 struct i915_vma_compress *compress =
2720                         i915_vma_capture_prepare(gt);
2721
2722                 intel_engine_coredump_add_vma(gt->engine, vma, compress);
2723                 i915_vma_capture_finish(gt, compress);
2724         }
2725
2726         gt->simulated = gt->engine->simulated;
2727         cap->error->simulated = gt->simulated;
2728
2729         /* Publish the error state, and announce it to the world */
2730         i915_error_state_store(cap->error);
2731         i915_gpu_coredump_put(cap->error);
2732
2733         /* Return this request and all that depend upon it for signaling */
2734         execlists_unhold(engine, cap->rq);
2735         i915_request_put(cap->rq);
2736
2737         kfree(cap);
2738 }
2739
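/*
 * Allocate the coredump containers with GFP_ATOMIC, as we are still inside
 * the atomic (forced preemption) path; the expensive capture itself is
 * deferred to execlists_capture_work().
 */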
2740 static struct execlists_capture *capture_regs(struct intel_engine_cs *engine)
2741 {
2742         const gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
2743         struct execlists_capture *cap;
2744
2745         cap = kmalloc(sizeof(*cap), gfp);
2746         if (!cap)
2747                 return NULL;
2748
2749         cap->error = i915_gpu_coredump_alloc(engine->i915, gfp);
2750         if (!cap->error)
2751                 goto err_cap;
2752
2753         cap->error->gt = intel_gt_coredump_alloc(engine->gt, gfp);
2754         if (!cap->error->gt)
2755                 goto err_gpu;
2756
2757         cap->error->gt->engine = intel_engine_coredump_alloc(engine, gfp);
2758         if (!cap->error->gt->engine)
2759                 goto err_gt;
2760
2761         return cap;
2762
2763 err_gt:
2764         kfree(cap->error->gt);
2765 err_gpu:
2766         kfree(cap->error);
2767 err_cap:
2768         kfree(cap);
2769         return NULL;
2770 }
2771
2772 static bool execlists_capture(struct intel_engine_cs *engine)
2773 {
2774         struct execlists_capture *cap;
2775
2776         if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR))
2777                 return true;
2778
2779         /*
2780          * We need to _quickly_ capture the engine state before we reset.
2781          * We are inside an atomic section (softirq) here and we are delaying
2782          * the forced preemption event.
2783          */
2784         cap = capture_regs(engine);
2785         if (!cap)
2786                 return true;
2787
2788         spin_lock_irq(&engine->active.lock);
2789         cap->rq = execlists_active(&engine->execlists);
2790         if (cap->rq) {
2791                 cap->rq = active_request(cap->rq->context->timeline, cap->rq);
2792                 cap->rq = i915_request_get_rcu(cap->rq);
2793         }
2794         spin_unlock_irq(&engine->active.lock);
2795         if (!cap->rq)
2796                 goto err_free;
2797
2798         /*
2799          * Remove the request from the execlists queue, and take ownership
2800          * of the request. We pass it to our worker who will _slowly_ compress
2801          * all the pages the _user_ requested for debugging their batch, after
2802          * which we return it to the queue for signaling.
2803          *
2804          * By removing them from the execlists queue, we also remove the
2805          * requests from being processed by __unwind_incomplete_requests()
2806          * during the intel_engine_reset(), and so they will *not* be replayed
2807          * afterwards.
2808          *
2809          * Note that because we have not yet reset the engine at this point,
2810          * it is possible that the request we have identified as being
2811          * guilty did in fact complete, and we will then hit an arbitration
2812          * point allowing the outstanding preemption to succeed. The likelihood
2813          * of that is very low (as capturing of the engine registers should be
2814          * fast enough to run inside an irq-off atomic section!), so we will
2815          * simply hold that request accountable for being non-preemptible
2816          * long enough to force the reset.
2817          */
2818         if (!execlists_hold(engine, cap->rq))
2819                 goto err_rq;
2820
2821         INIT_WORK(&cap->work, execlists_capture_work);
2822         schedule_work(&cap->work);
2823         return true;
2824
2825 err_rq:
2826         i915_request_put(cap->rq);
2827 err_free:
2828         i915_gpu_coredump_put(cap->error);
2829         kfree(cap);
2830         return false;
2831 }
2832
2833 static void execlists_reset(struct intel_engine_cs *engine, const char *msg)
2834 {
2835         const unsigned int bit = I915_RESET_ENGINE + engine->id;
2836         unsigned long *lock = &engine->gt->reset.flags;
2837
2838         if (!intel_has_reset_engine(engine->gt))
2839                 return;
2840
2841         if (test_and_set_bit(bit, lock))
2842                 return;
2843
2844         ENGINE_TRACE(engine, "reset for %s\n", msg);
2845
2846         /* Mark this tasklet as disabled to avoid waiting for it to complete */
2847         tasklet_disable_nosync(&engine->execlists.tasklet);
2848
2849         ring_set_paused(engine, 1); /* Freeze the current request in place */
2850         if (execlists_capture(engine))
2851                 intel_engine_reset(engine, msg);
2852         else
2853                 ring_set_paused(engine, 0);
2854
2855         tasklet_enable(&engine->execlists.tasklet);
2856         clear_and_wake_up_bit(bit, lock);
2857 }
2858
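/*
 * A preemption timeout only matters if the timer has expired while the
 * HW has still not acknowledged the pending ELSP submission.
 */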
2859 static bool preempt_timeout(const struct intel_engine_cs *const engine)
2860 {
2861         const struct timer_list *t = &engine->execlists.preempt;
2862
2863         if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT)
2864                 return false;
2865
2866         if (!timer_expired(t))
2867                 return false;
2868
2869         return READ_ONCE(engine->execlists.pending[0]);
2870 }
2871
2872 /*
2873  * Check the unread Context Status Buffers and manage the submission of new
2874  * contexts to the ELSP accordingly.
2875  */
2876 static void execlists_submission_tasklet(unsigned long data)
2877 {
2878         struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
2879         bool timeout = preempt_timeout(engine);
2880
2881         process_csb(engine);
2882
2883         if (unlikely(READ_ONCE(engine->execlists.error_interrupt))) {
2884                 engine->execlists.error_interrupt = 0;
2885                 if (ENGINE_READ(engine, RING_ESR)) /* confirm the error */
2886                         execlists_reset(engine, "CS error");
2887         }
2888
2889         if (!READ_ONCE(engine->execlists.pending[0]) || timeout) {
2890                 unsigned long flags;
2891
2892                 spin_lock_irqsave(&engine->active.lock, flags);
2893                 __execlists_submission_tasklet(engine);
2894                 spin_unlock_irqrestore(&engine->active.lock, flags);
2895
2896                 /* Recheck after serialising with direct-submission */
2897                 if (unlikely(timeout && preempt_timeout(engine)))
2898                         execlists_reset(engine, "preemption time out");
2899         }
2900 }
2901
2902 static void __execlists_kick(struct intel_engine_execlists *execlists)
2903 {
2904         /* Kick the tasklet for some interrupt coalescing and reset handling */
2905         tasklet_hi_schedule(&execlists->tasklet);
2906 }
2907
2908 #define execlists_kick(t, member) \
2909         __execlists_kick(container_of(t, struct intel_engine_execlists, member))
2910
2911 static void execlists_timeslice(struct timer_list *timer)
2912 {
2913         execlists_kick(timer, timer);
2914 }
2915
2916 static void execlists_preempt(struct timer_list *timer)
2917 {
2918         execlists_kick(timer, preempt);
2919 }
2920
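/* Add a ready request to the priority queue for the submission tasklet */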
2921 static void queue_request(struct intel_engine_cs *engine,
2922                           struct i915_request *rq)
2923 {
2924         GEM_BUG_ON(!list_empty(&rq->sched.link));
2925         list_add_tail(&rq->sched.link,
2926                       i915_sched_lookup_priolist(engine, rq_prio(rq)));
2927         set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2928 }
2929
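/*
 * Prefer direct submission from the caller's context. If a reset is in
 * progress we defer until the engine is restarted; if the tasklet has
 * been replaced (e.g. after wedging) we just kick it instead.
 */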
2930 static void __submit_queue_imm(struct intel_engine_cs *engine)
2931 {
2932         struct intel_engine_execlists * const execlists = &engine->execlists;
2933
2934         if (reset_in_progress(execlists))
2935                 return; /* defer until we restart the engine following reset */
2936
2937         if (execlists->tasklet.func == execlists_submission_tasklet)
2938                 __execlists_submission_tasklet(engine);
2939         else
2940                 tasklet_hi_schedule(&execlists->tasklet);
2941 }
2942
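/*
 * Only kick the submission backend if this request raises the engine's
 * queue_priority_hint, i.e. it may need to jump ahead of whatever is
 * already queued.
 */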
2943 static void submit_queue(struct intel_engine_cs *engine,
2944                          const struct i915_request *rq)
2945 {
2946         struct intel_engine_execlists *execlists = &engine->execlists;
2947
2948         if (rq_prio(rq) <= execlists->queue_priority_hint)
2949                 return;
2950
2951         execlists->queue_priority_hint = rq_prio(rq);
2952         __submit_queue_imm(engine);
2953 }
2954
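/*
 * Cheap check first: only walk the request's dependencies (hold_request())
 * if anything at all is currently on the engine's hold list.
 */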
2955 static bool ancestor_on_hold(const struct intel_engine_cs *engine,
2956                              const struct i915_request *rq)
2957 {
2958         GEM_BUG_ON(i915_request_on_hold(rq));
2959         return !list_empty(&engine->active.hold) && hold_request(rq);
2960 }
2961
2962 static void execlists_submit_request(struct i915_request *request)
2963 {
2964         struct intel_engine_cs *engine = request->engine;
2965         unsigned long flags;
2966
2967         /* Will be called from irq-context when using foreign fences. */
2968         spin_lock_irqsave(&engine->active.lock, flags);
2969
2970         if (unlikely(ancestor_on_hold(engine, request))) {
2971                 RQ_TRACE(request, "ancestor on hold\n");
2972                 list_add_tail(&request->sched.link, &engine->active.hold);
2973                 i915_request_set_hold(request);
2974         } else {
2975                 queue_request(engine, request);
2976
2977                 GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
2978                 GEM_BUG_ON(list_empty(&request->sched.link));
2979
2980                 submit_queue(engine, request);
2981         }
2982
2983         spin_unlock_irqrestore(&engine->active.lock, flags);
2984 }
2985
2986 static void __execlists_context_fini(struct intel_context *ce)
2987 {
2988         intel_ring_put(ce->ring);
2989         i915_vma_put(ce->state);
2990 }
2991
2992 static void execlists_context_destroy(struct kref *kref)
2993 {
2994         struct intel_context *ce = container_of(kref, typeof(*ce), ref);
2995
2996         GEM_BUG_ON(!i915_active_is_idle(&ce->active));
2997         GEM_BUG_ON(intel_context_is_pinned(ce));
2998
2999         if (ce->state)
3000                 __execlists_context_fini(ce);
3001
3002         intel_context_fini(ce);
3003         intel_context_free(ce);
3004 }
3005
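/*
 * For debug builds, poison the space after the context image so that
 * check_redzone() can detect anything writing past the end of the
 * context state on unpin.
 */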
3006 static void
3007 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
3008 {
3009         if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
3010                 return;
3011
3012         vaddr += engine->context_size;
3013
3014         memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
3015 }
3016
3017 static void
3018 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
3019 {
3020         if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
3021                 return;
3022
3023         vaddr += engine->context_size;
3024
3025         if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
3026                 dev_err_once(engine->i915->drm.dev,
3027                              "%s context redzone overwritten!\n",
3028                              engine->name);
3029 }
3030
3031 static void execlists_context_unpin(struct intel_context *ce)
3032 {
3033         check_redzone((void *)ce->lrc_reg_state - LRC_STATE_PN * PAGE_SIZE,
3034                       ce->engine);
3035
3036         i915_gem_object_unpin_map(ce->state->obj);
3037 }
3038
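/*
 * Refresh the ring registers (start, head, tail, ctl) stored in the
 * context image, plus the RPCS/OA state for the render engine, so the
 * next context restore resumes from the current ring position.
 */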
3039 static void
3040 __execlists_update_reg_state(const struct intel_context *ce,
3041                              const struct intel_engine_cs *engine,
3042                              u32 head)
3043 {
3044         struct intel_ring *ring = ce->ring;
3045         u32 *regs = ce->lrc_reg_state;
3046
3047         GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
3048         GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
3049
3050         regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
3051         regs[CTX_RING_HEAD] = head;
3052         regs[CTX_RING_TAIL] = ring->tail;
3053         regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
3054
3055         /* RPCS */
3056         if (engine->class == RENDER_CLASS) {
3057                 regs[CTX_R_PWR_CLK_STATE] =
3058                         intel_sseu_make_rpcs(engine->i915, &ce->sseu);
3059
3060                 i915_oa_init_reg_state(ce, engine);
3061         }
3062 }
3063
3064 static int
3065 __execlists_context_pin(struct intel_context *ce,
3066                         struct intel_engine_cs *engine)
3067 {
3068         void *vaddr;
3069
3070         GEM_BUG_ON(!ce->state);
3071         GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
3072
3073         vaddr = i915_gem_object_pin_map(ce->state->obj,
3074                                         i915_coherent_map_type(engine->i915) |
3075                                         I915_MAP_OVERRIDE);
3076         if (IS_ERR(vaddr))
3077                 return PTR_ERR(vaddr);
3078
3079         ce->lrc.lrca = lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE;
3080         ce->lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE;
3081         __execlists_update_reg_state(ce, engine, ce->ring->tail);
3082
3083         return 0;
3084 }
3085
3086 static int execlists_context_pin(struct intel_context *ce)
3087 {
3088         return __execlists_context_pin(ce, ce->engine);
3089 }
3090
3091 static int execlists_context_alloc(struct intel_context *ce)
3092 {
3093         return __execlists_context_alloc(ce, ce->engine);
3094 }
3095
3096 static void execlists_context_reset(struct intel_context *ce)
3097 {
3098         CE_TRACE(ce, "reset\n");
3099         GEM_BUG_ON(!intel_context_is_pinned(ce));
3100
3101         intel_ring_reset(ce->ring, ce->ring->emit);
3102
3103         /* Scrub away the garbage */
3104         execlists_init_reg_state(ce->lrc_reg_state,
3105                                  ce, ce->engine, ce->ring, true);
3106         __execlists_update_reg_state(ce, ce->engine, ce->ring->tail);
3107
3108         ce->lrc.desc |= CTX_DESC_FORCE_RESTORE;
3109 }
3110
3111 static const struct intel_context_ops execlists_context_ops = {
3112         .alloc = execlists_context_alloc,
3113
3114         .pin = execlists_context_pin,
3115         .unpin = execlists_context_unpin,
3116
3117         .enter = intel_context_enter_engine,
3118         .exit = intel_context_exit_engine,
3119
3120         .reset = execlists_context_reset,
3121         .destroy = execlists_context_destroy,
3122 };
3123
3124 static int gen8_emit_init_breadcrumb(struct i915_request *rq)
3125 {
3126         u32 *cs;
3127
3128         if (!i915_request_timeline(rq)->has_initial_breadcrumb)
3129                 return 0;
3130
3131         cs = intel_ring_begin(rq, 6);
3132         if (IS_ERR(cs))
3133                 return PTR_ERR(cs);
3134
3135         /*
3136          * Check if we have been preempted before we even get started.
3137          *
3138          * After this point i915_request_started() reports true, even if
3139          * we get preempted and so are no longer running.
3140          */
3141         *cs++ = MI_ARB_CHECK;
3142         *cs++ = MI_NOOP;
3143
3144         *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
3145         *cs++ = i915_request_timeline(rq)->hwsp_offset;
3146         *cs++ = 0;
3147         *cs++ = rq->fence.seqno - 1;
3148
3149         intel_ring_advance(rq, cs);
3150
3151         /* Record the updated position of the request's payload */
3152         rq->infix = intel_ring_offset(rq, cs);
3153
3154         return 0;
3155 }
3156
3157 static int execlists_request_alloc(struct i915_request *request)
3158 {
3159         int ret;
3160
3161         GEM_BUG_ON(!intel_context_is_pinned(request->context));
3162
3163         /*
3164          * Flush enough space to reduce the likelihood of waiting after
3165          * we start building the request - in which case we will just
3166          * have to repeat work.
3167          */
3168         request->reserved_space += EXECLISTS_REQUEST_SIZE;
3169
3170         /*
3171          * Note that after this point, we have committed to using
3172          * this request as it is being used to both track the
3173          * state of engine initialisation and liveness of the
3174          * golden renderstate above. Think twice before you try
3175          * to cancel/unwind this request now.
3176          */
3177
3178         /* Unconditionally invalidate GPU caches and TLBs. */
3179         ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
3180         if (ret)
3181                 return ret;
3182
3183         request->reserved_space -= EXECLISTS_REQUEST_SIZE;
3184         return 0;
3185 }
3186
3187 /*
3188  * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
3189  * PIPE_CONTROL instruction. This is required for the flush to happen correctly
3190  * but there is a slight complication as this is applied in the WA batch where the
3191  * values are only initialized once, so we cannot read the register value at the
3192  * beginning and reuse it further; hence we save its value to memory, upload a
3193  * constant value with bit21 set and then restore it with the saved value.
3194  * To simplify the WA, a constant value is formed by using the default value
3195  * of this register. This shouldn't be a problem because we are only modifying
3196  * it for a short period and this batch is non-preemptible. We can of course
3197  * use additional instructions that read the actual value of the register
3198  * at that time and set our bit of interest but it makes the WA complicated.
3199  *
3200  * This WA is also required for Gen9 so extracting as a function avoids
3201  * code duplication.
3202  */
3203 static u32 *
3204 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
3205 {
3206         /* NB no one else is allowed to scribble over scratch + 256! */
3207         *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3208         *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3209         *batch++ = intel_gt_scratch_offset(engine->gt,
3210                                            INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3211         *batch++ = 0;
3212
3213         *batch++ = MI_LOAD_REGISTER_IMM(1);
3214         *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3215         *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
3216
3217         batch = gen8_emit_pipe_control(batch,
3218                                        PIPE_CONTROL_CS_STALL |
3219                                        PIPE_CONTROL_DC_FLUSH_ENABLE,
3220                                        0);
3221
3222         *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3223         *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3224         *batch++ = intel_gt_scratch_offset(engine->gt,
3225                                            INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3226         *batch++ = 0;
3227
3228         return batch;
3229 }
3230
3231 /*
3232  * Typically we only have one indirect_ctx and per_ctx batch buffer which are
3233  * initialized at the beginning and shared across all contexts, but this field
3234  * helps us to have multiple batches at different offsets and select them based
3235  * on some criteria. At the moment this batch always starts at the beginning of the page
3236  * and at this point we don't have multiple wa_ctx batch buffers.
3237  *
3238  * The number of WAs applied is not known at the beginning; we use this field
3239  * to return the number of DWORDs written.
3240  *
3241  * Note that this batch does not contain MI_BATCH_BUFFER_END,
3242  * so it adds NOOPs as padding to make it cacheline aligned.
3243  * MI_BATCH_BUFFER_END will be added to the per-ctx batch and both of them together
3244  * make a complete batch buffer.
3245  */
3246 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3247 {
3248         /* WaDisableCtxRestoreArbitration:bdw,chv */
3249         *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3250
3251         /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
3252         if (IS_BROADWELL(engine->i915))
3253                 batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3254
3255         /* WaClearSlmSpaceAtContextSwitch:bdw,chv */
3256         /* Actual scratch location is at 128 bytes offset */
3257         batch = gen8_emit_pipe_control(batch,
3258                                        PIPE_CONTROL_FLUSH_L3 |
3259                                        PIPE_CONTROL_STORE_DATA_INDEX |
3260                                        PIPE_CONTROL_CS_STALL |
3261                                        PIPE_CONTROL_QW_WRITE,
3262                                        LRC_PPHWSP_SCRATCH_ADDR);
3263
3264         *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3265
3266         /* Pad to end of cacheline */
3267         while ((unsigned long)batch % CACHELINE_BYTES)
3268                 *batch++ = MI_NOOP;
3269
3270         /*
3271          * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
3272          * execution depends on the length specified in terms of cache lines
3273          * in the register CTX_RCS_INDIRECT_CTX
3274          */
3275
3276         return batch;
3277 }
3278
3279 struct lri {
3280         i915_reg_t reg;
3281         u32 value;
3282 };
3283
3284 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
3285 {
3286         GEM_BUG_ON(!count || count > 63);
3287
3288         *batch++ = MI_LOAD_REGISTER_IMM(count);
3289         do {
3290                 *batch++ = i915_mmio_reg_offset(lri->reg);
3291                 *batch++ = lri->value;
3292         } while (lri++, --count);
3293         *batch++ = MI_NOOP;
3294
3295         return batch;
3296 }
3297
3298 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3299 {
3300         static const struct lri lri[] = {
3301                 /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
3302                 {
3303                         COMMON_SLICE_CHICKEN2,
3304                         __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
3305                                        0),
3306                 },
3307
3308                 /* BSpec: 11391 */
3309                 {
3310                         FF_SLICE_CHICKEN,
3311                         __MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
3312                                        FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
3313                 },
3314
3315                 /* BSpec: 11299 */
3316                 {
3317                         _3D_CHICKEN3,
3318                         __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
3319                                        _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
3320                 }
3321         };
3322
3323         *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3324
3325         /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
3326         batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3327
3328         /* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
3329         batch = gen8_emit_pipe_control(batch,
3330                                        PIPE_CONTROL_FLUSH_L3 |
3331                                        PIPE_CONTROL_STORE_DATA_INDEX |
3332                                        PIPE_CONTROL_CS_STALL |
3333                                        PIPE_CONTROL_QW_WRITE,
3334                                        LRC_PPHWSP_SCRATCH_ADDR);
3335
3336         batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
3337
3338         /* WaMediaPoolStateCmdInWABB:bxt,glk */
3339         if (HAS_POOLED_EU(engine->i915)) {
3340                 /*
3341                  * EU pool configuration is set up along with the golden context
3342                  * during context initialization. This value depends on the
3343                  * device type (2x6 or 3x6) and needs to be updated based
3344                  * on which subslice is disabled, especially for 2x6
3345                  * devices. However, it is safe to load the default
3346                  * configuration of a 3x6 device instead of masking off the
3347                  * corresponding bits, because the HW ignores bits of a disabled
3348                  * subslice and drops down to the appropriate config. Please
3349                  * see render_state_setup() in i915_gem_render_state.c for
3350                  * possible configurations; to avoid duplication they are
3351                  * not shown here again.
3352                  */
3353                 *batch++ = GEN9_MEDIA_POOL_STATE;
3354                 *batch++ = GEN9_MEDIA_POOL_ENABLE;
3355                 *batch++ = 0x00777000;
3356                 *batch++ = 0;
3357                 *batch++ = 0;
3358                 *batch++ = 0;
3359         }
3360
3361         *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3362
3363         /* Pad to end of cacheline */
3364         while ((unsigned long)batch % CACHELINE_BYTES)
3365                 *batch++ = MI_NOOP;
3366
3367         return batch;
3368 }
3369
3370 static u32 *
3371 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3372 {
3373         int i;
3374
3375         /*
3376          * WaPipeControlBefore3DStateSamplePattern: cnl
3377          *
3378          * Ensure the engine is idle prior to programming a
3379          * 3DSTATE_SAMPLE_PATTERN during a context restore.
3380          */
3381         batch = gen8_emit_pipe_control(batch,
3382                                        PIPE_CONTROL_CS_STALL,
3383                                        0);
3384         /*
3385          * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
3386          * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
3387          * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
3388          * confusing. Since gen8_emit_pipe_control() already advances the
3389          * batch by 6 dwords, we advance the other 10 here, completing a
3390          * cacheline. It's not clear if the workaround requires this padding
3391          * before other commands, or if it's just the regular padding we would
3392          * already have for the workaround bb, so leave it here for now.
3393          */
3394         for (i = 0; i < 10; i++)
3395                 *batch++ = MI_NOOP;
3396
3397         /* Pad to end of cacheline */
3398         while ((unsigned long)batch % CACHELINE_BYTES)
3399                 *batch++ = MI_NOOP;
3400
3401         return batch;
3402 }
3403
3404 #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
3405
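/*
 * Allocate and pin a single page in the GGTT to hold the per-engine
 * workaround batch buffers (indirect_ctx and per_ctx).
 */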
3406 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
3407 {
3408         struct drm_i915_gem_object *obj;
3409         struct i915_vma *vma;
3410         int err;
3411
3412         obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE);
3413         if (IS_ERR(obj))
3414                 return PTR_ERR(obj);
3415
3416         vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
3417         if (IS_ERR(vma)) {
3418                 err = PTR_ERR(vma);
3419                 goto err;
3420         }
3421
3422         err = i915_ggtt_pin(vma, 0, PIN_HIGH);
3423         if (err)
3424                 goto err;
3425
3426         engine->wa_ctx.vma = vma;
3427         return 0;
3428
3429 err:
3430         i915_gem_object_put(obj);
3431         return err;
3432 }
3433
3434 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine)
3435 {
3436         i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
3437 }
3438
3439 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
3440
3441 static int intel_init_workaround_bb(struct intel_engine_cs *engine)
3442 {
3443         struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
3444         struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx,
3445                                             &wa_ctx->per_ctx };
3446         wa_bb_func_t wa_bb_fn[2];
3447         struct page *page;
3448         void *batch, *batch_ptr;
3449         unsigned int i;
3450         int ret;
3451
3452         if (engine->class != RENDER_CLASS)
3453                 return 0;
3454
3455         switch (INTEL_GEN(engine->i915)) {
3456         case 12:
3457         case 11:
3458                 return 0;
3459         case 10:
3460                 wa_bb_fn[0] = gen10_init_indirectctx_bb;
3461                 wa_bb_fn[1] = NULL;
3462                 break;
3463         case 9:
3464                 wa_bb_fn[0] = gen9_init_indirectctx_bb;
3465                 wa_bb_fn[1] = NULL;
3466                 break;
3467         case 8:
3468                 wa_bb_fn[0] = gen8_init_indirectctx_bb;
3469                 wa_bb_fn[1] = NULL;
3470                 break;
3471         default:
3472                 MISSING_CASE(INTEL_GEN(engine->i915));
3473                 return 0;
3474         }
3475
3476         ret = lrc_setup_wa_ctx(engine);
3477         if (ret) {
3478                 DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret);
3479                 return ret;
3480         }
3481
3482         page = i915_gem_object_get_dirty_page(wa_ctx->vma->obj, 0);
3483         batch = batch_ptr = kmap_atomic(page);
3484
3485         /*
3486          * Emit the two workaround batch buffers, recording the offset from the
3487          * start of the workaround batch buffer object for each and their
3488          * respective sizes.
3489          */
3490         for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
3491                 wa_bb[i]->offset = batch_ptr - batch;
3492                 if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
3493                                                   CACHELINE_BYTES))) {
3494                         ret = -EINVAL;
3495                         break;
3496                 }
3497                 if (wa_bb_fn[i])
3498                         batch_ptr = wa_bb_fn[i](engine, batch_ptr);
3499                 wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
3500         }
3501
3502         BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
3503
3504         kunmap_atomic(batch);
3505         if (ret)
3506                 lrc_destroy_wa_ctx(engine);
3507
3508         return ret;
3509 }
3510
3511 static void enable_error_interrupt(struct intel_engine_cs *engine)
3512 {
3513         u32 status;
3514
3515         engine->execlists.error_interrupt = 0;
3516         ENGINE_WRITE(engine, RING_EMR, ~0u);
3517         ENGINE_WRITE(engine, RING_EIR, ~0u); /* clear all existing errors */
3518
3519         status = ENGINE_READ(engine, RING_ESR);
3520         if (unlikely(status)) {
3521                 dev_err(engine->i915->drm.dev,
3522                         "engine '%s' resumed still in error: %08x\n",
3523                         engine->name, status);
3524                 __intel_gt_reset(engine->gt, engine->mask);
3525         }
3526
3527         /*
3528          * On current gen8+, we have 2 signals to play with
3529          *
3530          * - I915_ERROR_INSTRUCTION (bit 0)
3531          *
3532          *    Generate an error if the command parser encounters an invalid
3533          *    instruction
3534          *
3535          *    This is a fatal error.
3536          *
3537          * - CP_PRIV (bit 2)
3538          *
3539          *    Generate an error on privilege violation (where the CP replaces
3540          *    the instruction with a no-op). This also fires for writes into
3541          *    read-only scratch pages.
3542          *
3543          *    This is a non-fatal error, parsing continues.
3544          *
3545          * - There are a few others defined for odd HW that we do not use
3546          *
3547          * Since CP_PRIV fires for cases where we have chosen to ignore the
3548          * error (as the HW is validating and suppressing the mistakes), we
3549          * only unmask the instruction error bit.
3550          */
3551         ENGINE_WRITE(engine, RING_EMR, ~I915_ERROR_INSTRUCTION);
3552 }
3553
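/*
 * Program the engine for execlists submission: select the appropriate
 * submission mode for the gen, clear STOP_RING, point RING_HWS_PGA at
 * our status page and unmask the error interrupts we care about.
 */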
3554 static void enable_execlists(struct intel_engine_cs *engine)
3555 {
3556         u32 mode;
3557
3558         assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL);
3559
3560         intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */
3561
3562         if (INTEL_GEN(engine->i915) >= 11)
3563                 mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE);
3564         else
3565                 mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE);
3566         ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode);
3567
3568         ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
3569
3570         ENGINE_WRITE_FW(engine,
3571                         RING_HWS_PGA,
3572                         i915_ggtt_offset(engine->status_page.vma));
3573         ENGINE_POSTING_READ(engine, RING_HWS_PGA);
3574
3575         enable_error_interrupt(engine);
3576
3577         engine->context_tag = GENMASK(BITS_PER_LONG - 2, 0);
3578 }
3579
3580 static bool unexpected_starting_state(struct intel_engine_cs *engine)
3581 {
3582         bool unexpected = false;
3583
3584         if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) {
3585                 DRM_DEBUG_DRIVER("STOP_RING still set in RING_MI_MODE\n");
3586                 unexpected = true;
3587         }
3588
3589         return unexpected;
3590 }
3591
3592 static int execlists_resume(struct intel_engine_cs *engine)
3593 {
3594         intel_mocs_init_engine(engine);
3595
3596         intel_engine_reset_breadcrumbs(engine);
3597
3598         if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) {
3599                 struct drm_printer p = drm_debug_printer(__func__);
3600
3601                 intel_engine_dump(engine, &p, NULL);
3602         }
3603
3604         enable_execlists(engine);
3605
3606         return 0;
3607 }
3608
3609 static void execlists_reset_prepare(struct intel_engine_cs *engine)
3610 {
3611         struct intel_engine_execlists * const execlists = &engine->execlists;
3612         unsigned long flags;
3613
3614         ENGINE_TRACE(engine, "depth<-%d\n",
3615                      atomic_read(&execlists->tasklet.count));
3616
3617         /*
3618          * Prevent request submission to the hardware until we have
3619          * completed the reset in i915_gem_reset_finish(). If a request
3620          * is completed by one engine, it may then queue a request
3621          * to a second via its execlists->tasklet *just* as we are
3622          * calling engine->resume() and also writing the ELSP.
3623          * Turning off the execlists->tasklet until the reset is over
3624          * prevents the race.
3625          */
3626         __tasklet_disable_sync_once(&execlists->tasklet);
3627         GEM_BUG_ON(!reset_in_progress(execlists));
3628
3629         /* And flush any current direct submission. */
3630         spin_lock_irqsave(&engine->active.lock, flags);
3631         spin_unlock_irqrestore(&engine->active.lock, flags);
3632
3633         /*
3634          * We stop engines, otherwise we might get a failed reset and a
3635          * dead gpu (on elk). Also a gpu as modern as kbl can suffer
3636          * from a system hang if a batchbuffer is progressing when
3637          * the reset is issued, regardless of the READY_TO_RESET ack.
3638          * Thus we assume it is best to stop the engines on all gens
3639          * where we have a gpu reset.
3640          *
3641          * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
3642          *
3643          * FIXME: Wa for more modern gens needs to be validated
3644          */
3645         intel_engine_stop_cs(engine);
3646 }
3647
3648 static void reset_csb_pointers(struct intel_engine_cs *engine)
3649 {
3650         struct intel_engine_execlists * const execlists = &engine->execlists;
3651         const unsigned int reset_value = execlists->csb_size - 1;
3652
3653         ring_set_paused(engine, 0);
3654
3655         /*
3656          * After a reset, the HW starts writing into CSB entry [0]. We
3657          * therefore have to set our HEAD pointer back one entry so that
3658          * the *first* entry we check is entry 0. To complicate this further,
3659          * as we don't wait for the first interrupt after reset, we have to
3660          * fake the HW write to point back to the last entry so that our
3661          * inline comparison of our cached head position against the last HW
3662          * write works even before the first interrupt.
3663          */
3664         execlists->csb_head = reset_value;
3665         WRITE_ONCE(*execlists->csb_write, reset_value);
3666         wmb(); /* Make sure this is visible to HW (paranoia?) */
3667
3668         /*
3669          * Sometimes Icelake forgets to reset its pointers on a GPU reset.
3670          * Bludgeon them with a mmio update to be sure.
3671          */
3672         ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
3673                      reset_value << 8 | reset_value);
3674         ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
3675
3676         invalidate_csb_entries(&execlists->csb_status[0],
3677                                &execlists->csb_status[reset_value]);
3678 }
3679
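/*
 * Clear the persistent STOP_RING bit in the context image's RING_MI_MODE
 * (via a masked write) so that the engine can run again after the reset.
 */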
3680 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
3681 {
3682         int x;
3683
3684         x = lrc_ring_mi_mode(engine);
3685         if (x != -1) {
3686                 regs[x + 1] &= ~STOP_RING;
3687                 regs[x + 1] |= STOP_RING << 16;
3688         }
3689 }
3690
3691 static void __execlists_reset_reg_state(const struct intel_context *ce,
3692                                         const struct intel_engine_cs *engine)
3693 {
3694         u32 *regs = ce->lrc_reg_state;
3695
3696         __reset_stop_ring(regs, engine);
3697 }
3698
3699 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
3700 {
3701         struct intel_engine_execlists * const execlists = &engine->execlists;
3702         struct intel_context *ce;
3703         struct i915_request *rq;
3704         u32 head;
3705
3706         mb(); /* paranoia: read the CSB pointers from after the reset */
3707         clflush(execlists->csb_write);
3708         mb();
3709
3710         process_csb(engine); /* drain preemption events */
3711
3712         /* Following the reset, we need to reload the CSB read/write pointers */
3713         reset_csb_pointers(engine);
3714
3715         /*
3716          * Save the currently executing context, even if we completed
3717          * its request; it was still running at the time of the
3718          * reset and will have been clobbered.
3719          */
3720         rq = execlists_active(execlists);
3721         if (!rq)
3722                 goto unwind;
3723
3724         ce = rq->context;
3725         GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
3726
3727         if (i915_request_completed(rq)) {
3728                 /* Idle context; tidy up the ring so we can restart afresh */
3729                 head = intel_ring_wrap(ce->ring, rq->tail);
3730                 goto out_replay;
3731         }
3732
3733         /* We still have requests in-flight; the engine should be active */
3734         GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
3735
3736         /* Context has requests still in-flight; it should not be idle! */
3737         GEM_BUG_ON(i915_active_is_idle(&ce->active));
3738
3739         rq = active_request(ce->timeline, rq);
3740         head = intel_ring_wrap(ce->ring, rq->head);
3741         GEM_BUG_ON(head == ce->ring->tail);
3742
3743         /*
3744          * If this request hasn't started yet, e.g. it is waiting on a
3745          * semaphore, we need to avoid skipping the request or else we
3746          * break the signaling chain. However, if the context is corrupt
3747          * the request will not restart and we will be stuck with a wedged
3748          * device. It is quite often the case that if we issue a reset
3749          * while the GPU is loading the context image, the context
3750          * image becomes corrupt.
3751          *
3752          * Otherwise, if we have not started yet, the request should replay
3753          * perfectly and we do not need to flag the result as being erroneous.
3754          */
3755         if (!i915_request_started(rq))
3756                 goto out_replay;
3757
3758         /*
3759          * If the request was innocent, we leave the request in the ELSP
3760          * and will try to replay it on restarting. The context image may
3761          * have been corrupted by the reset, in which case we may have
3762          * to service a new GPU hang, but more likely we can continue on
3763          * without impact.
3764          *
3765          * If the request was guilty, we presume the context is corrupt
3766          * and have to at least restore the RING register in the context
3767          * image back to the expected values to skip over the guilty request.
3768          */
3769         __i915_request_reset(rq, stalled);
3770         if (!stalled)
3771                 goto out_replay;
3772
3773         /*
3774          * We want a simple context + ring to execute the breadcrumb update.
3775          * We cannot rely on the context being intact across the GPU hang,
3776          * so clear it and rebuild just what we need for the breadcrumb.
3777          * All pending requests for this context will be zapped, and any
3778          * future request will be after userspace has had the opportunity
3779          * to recreate its own state.
3780          */
3781         GEM_BUG_ON(!intel_context_is_pinned(ce));
3782         restore_default_state(ce, engine);
3783
3784 out_replay:
3785         ENGINE_TRACE(engine, "replay {head:%04x, tail:%04x}\n",
3786                      head, ce->ring->tail);
3787         __execlists_reset_reg_state(ce, engine);
3788         __execlists_update_reg_state(ce, engine, head);
3789         ce->lrc.desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */
3790
3791 unwind:
3792         /* Push back any incomplete requests for replay after the reset. */
3793         cancel_port_requests(execlists);
3794         __unwind_incomplete_requests(engine);
3795 }
3796
3797 static void execlists_reset_rewind(struct intel_engine_cs *engine, bool stalled)
3798 {
3799         unsigned long flags;
3800
3801         ENGINE_TRACE(engine, "\n");
3802
3803         spin_lock_irqsave(&engine->active.lock, flags);
3804
3805         __execlists_reset(engine, stalled);
3806
3807         spin_unlock_irqrestore(&engine->active.lock, flags);
3808 }
3809
3810 static void nop_submission_tasklet(unsigned long data)
3811 {
3812         struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
3813
3814         /* The driver is wedged; don't process any more events. */
3815         WRITE_ONCE(engine->execlists.queue_priority_hint, INT_MIN);
3816 }
3817
3818 static void execlists_reset_cancel(struct intel_engine_cs *engine)
3819 {
3820         struct intel_engine_execlists * const execlists = &engine->execlists;
3821         struct i915_request *rq, *rn;
3822         struct rb_node *rb;
3823         unsigned long flags;
3824
3825         ENGINE_TRACE(engine, "\n");
3826
3827         /*
3828          * Before we call engine->cancel_requests(), we should have exclusive
3829          * access to the submission state. This is arranged for us by the
3830          * caller disabling the interrupt generation, the tasklet and other
3831          * threads that may then access the same state, giving us a free hand
3832          * to reset state. However, we still need to let lockdep be aware that
3833          * we know this state may be accessed in hardirq context, so we
3834          * disable the irq around this manipulation and we want to keep
3835          * the spinlock focused on its duties and not accidentally conflate
3836          * coverage to the submission's irq state. (Similarly, although we
3837          * shouldn't need to disable irq around the manipulation of the
3838          * submission's irq state, we also wish to remind ourselves that
3839          * it is irq state.)
3840          */
3841         spin_lock_irqsave(&engine->active.lock, flags);
3842
3843         __execlists_reset(engine, true);
3844
3845         /* Mark all executing requests as skipped. */
3846         list_for_each_entry(rq, &engine->active.requests, sched.link)
3847                 mark_eio(rq);
3848
3849         /* Flush the queued requests to the timeline list (for retiring). */
3850         while ((rb = rb_first_cached(&execlists->queue))) {
3851                 struct i915_priolist *p = to_priolist(rb);
3852                 int i;
3853
3854                 priolist_for_each_request_consume(rq, rn, p, i) {
3855                         mark_eio(rq);
3856                         __i915_request_submit(rq);
3857                 }
3858
3859                 rb_erase_cached(&p->node, &execlists->queue);
3860                 i915_priolist_free(p);
3861         }
3862
3863         /* On-hold requests will be flushed to timeline upon their release */
3864         list_for_each_entry(rq, &engine->active.hold, sched.link)
3865                 mark_eio(rq);
3866
3867         /* Cancel all attached virtual engines */
3868         while ((rb = rb_first_cached(&execlists->virtual))) {
3869                 struct virtual_engine *ve =
3870                         rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
3871
3872                 rb_erase_cached(rb, &execlists->virtual);
3873                 RB_CLEAR_NODE(rb);
3874
3875                 spin_lock(&ve->base.active.lock);
3876                 rq = fetch_and_zero(&ve->request);
3877                 if (rq) {
3878                         mark_eio(rq);
3879
3880                         rq->engine = engine;
3881                         __i915_request_submit(rq);
3882                         i915_request_put(rq);
3883
3884                         ve->base.execlists.queue_priority_hint = INT_MIN;
3885                 }
3886                 spin_unlock(&ve->base.active.lock);
3887         }
3888
3889         /* Remaining _unready_ requests will be nop'ed when submitted */
3890
3891         execlists->queue_priority_hint = INT_MIN;
3892         execlists->queue = RB_ROOT_CACHED;
3893
3894         GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet));
3895         execlists->tasklet.func = nop_submission_tasklet;
3896
3897         spin_unlock_irqrestore(&engine->active.lock, flags);
3898 }
3899
3900 static void execlists_reset_finish(struct intel_engine_cs *engine)
3901 {
3902         struct intel_engine_execlists * const execlists = &engine->execlists;
3903
3904         /*
3905          * After a GPU reset, we may have requests to replay. Do so now while
3906          * we still have the forcewake to be sure that the GPU is not allowed
3907          * to sleep before we restart and reload a context.
3908          */
3909         GEM_BUG_ON(!reset_in_progress(execlists));
3910         if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
3911                 execlists->tasklet.func(execlists->tasklet.data);
3912
3913         if (__tasklet_enable(&execlists->tasklet))
3914                 /* And kick in case we missed a new request submission. */
3915                 tasklet_hi_schedule(&execlists->tasklet);
3916         ENGINE_TRACE(engine, "depth->%d\n",
3917                      atomic_read(&execlists->tasklet.count));
3918 }
3919
3920 static int gen8_emit_bb_start_noarb(struct i915_request *rq,
3921                                     u64 offset, u32 len,
3922                                     const unsigned int flags)
3923 {
3924         u32 *cs;
3925
3926         cs = intel_ring_begin(rq, 4);
3927         if (IS_ERR(cs))
3928                 return PTR_ERR(cs);
3929
3930         /*
3931          * WaDisableCtxRestoreArbitration:bdw,chv
3932          *
3933          * We would not need to perform MI_ARB_ENABLE as often as we do (in
3934          * particular on all the gens that do not need the w/a at all!) if we
3935          * took care to make sure that, on every switch into this context
3936          * (both ordinary and for preemption), arbitration was enabled;
3937          * then we would be fine.  However, for gen8 there is another w/a that
3938          * requires us to not preempt inside GPGPU execution, so we keep
3939          * arbitration disabled for gen8 batches. Arbitration will be
3940          * re-enabled before we close the request
3941          * (engine->emit_fini_breadcrumb).
3942          */
3943         *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3944
3945         /* FIXME(BDW+): Address space and security selectors. */
3946         *cs++ = MI_BATCH_BUFFER_START_GEN8 |
3947                 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
3948         *cs++ = lower_32_bits(offset);
3949         *cs++ = upper_32_bits(offset);
3950
3951         intel_ring_advance(rq, cs);
3952
3953         return 0;
3954 }
3955
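/*
 * As gen8_emit_bb_start_noarb(), but with arbitration enabled around the
 * batch so that it may be preempted mid-execution.
 */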
3956 static int gen8_emit_bb_start(struct i915_request *rq,
3957                               u64 offset, u32 len,
3958                               const unsigned int flags)
3959 {
3960         u32 *cs;
3961
3962         cs = intel_ring_begin(rq, 6);
3963         if (IS_ERR(cs))
3964                 return PTR_ERR(cs);
3965
3966         *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3967
3968         *cs++ = MI_BATCH_BUFFER_START_GEN8 |
3969                 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
3970         *cs++ = lower_32_bits(offset);
3971         *cs++ = upper_32_bits(offset);
3972
3973         *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3974         *cs++ = MI_NOOP;
3975
3976         intel_ring_advance(rq, cs);
3977
3978         return 0;
3979 }
3980
3981 static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine)
3982 {
3983         ENGINE_WRITE(engine, RING_IMR,
3984                      ~(engine->irq_enable_mask | engine->irq_keep_mask));
3985         ENGINE_POSTING_READ(engine, RING_IMR);
3986 }
3987
3988 static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
3989 {
3990         ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
3991 }
3992
3993 static int gen8_emit_flush(struct i915_request *request, u32 mode)
3994 {
3995         u32 cmd, *cs;
3996
3997         cs = intel_ring_begin(request, 4);
3998         if (IS_ERR(cs))
3999                 return PTR_ERR(cs);
4000
4001         cmd = MI_FLUSH_DW + 1;
4002
4003         /*
4004          * We always require a command barrier so that subsequent commands,
4005          * such as breadcrumb interrupts, are strictly ordered wrt the contents
4006          * of the write cache being flushed to memory (and thus coherent from the CPU).
4007          */
4008         cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
4009
4010         if (mode & EMIT_INVALIDATE) {
4011                 cmd |= MI_INVALIDATE_TLB;
4012                 if (request->engine->class == VIDEO_DECODE_CLASS)
4013                         cmd |= MI_INVALIDATE_BSD;
4014         }
4015
4016         *cs++ = cmd;
4017         *cs++ = LRC_PPHWSP_SCRATCH_ADDR;
4018         *cs++ = 0; /* upper addr */
4019         *cs++ = 0; /* value */
4020         intel_ring_advance(request, cs);
4021
4022         return 0;
4023 }
4024
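/*
 * Full flush/invalidate for the render engine via PIPE_CONTROL, including
 * the gen9 VF-cache workaround (a preceding null PIPE_CONTROL) and the
 * kbl WaForGAMHang DC-flush bracketing, both of which lengthen the
 * emission.
 */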
4025 static int gen8_emit_flush_render(struct i915_request *request,
4026                                   u32 mode)
4027 {
4028         bool vf_flush_wa = false, dc_flush_wa = false;
4029         u32 *cs, flags = 0;
4030         int len;
4031
4032         flags |= PIPE_CONTROL_CS_STALL;
4033
4034         if (mode & EMIT_FLUSH) {
4035                 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4036                 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4037                 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4038                 flags |= PIPE_CONTROL_FLUSH_ENABLE;
4039         }
4040
4041         if (mode & EMIT_INVALIDATE) {
4042                 flags |= PIPE_CONTROL_TLB_INVALIDATE;
4043                 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4044                 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4045                 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4046                 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4047                 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4048                 flags |= PIPE_CONTROL_QW_WRITE;
4049                 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4050
4051                 /*
4052                  * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
4053                  * pipe control.
4054                  */
4055                 if (IS_GEN(request->i915, 9))
4056                         vf_flush_wa = true;
4057
4058                 /* WaForGAMHang:kbl */
4059                 if (IS_KBL_REVID(request->i915, 0, KBL_REVID_B0))
4060                         dc_flush_wa = true;
4061         }
4062
4063         len = 6;
4064
4065         if (vf_flush_wa)
4066                 len += 6;
4067
4068         if (dc_flush_wa)
4069                 len += 12;
4070
4071         cs = intel_ring_begin(request, len);
4072         if (IS_ERR(cs))
4073                 return PTR_ERR(cs);
4074
4075         if (vf_flush_wa)
4076                 cs = gen8_emit_pipe_control(cs, 0, 0);
4077
4078         if (dc_flush_wa)
4079                 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
4080                                             0);
4081
4082         cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4083
4084         if (dc_flush_wa)
4085                 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);
4086
4087         intel_ring_advance(request, cs);
4088
4089         return 0;
4090 }
4091
4092 static int gen11_emit_flush_render(struct i915_request *request,
4093                                    u32 mode)
4094 {
4095         if (mode & EMIT_FLUSH) {
4096                 u32 *cs;
4097                 u32 flags = 0;
4098
4099                 flags |= PIPE_CONTROL_CS_STALL;
4100
4101                 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
4102                 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4103                 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4104                 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4105                 flags |= PIPE_CONTROL_FLUSH_ENABLE;
4106                 flags |= PIPE_CONTROL_QW_WRITE;
4107                 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4108
4109                 cs = intel_ring_begin(request, 6);
4110                 if (IS_ERR(cs))
4111                         return PTR_ERR(cs);
4112
4113                 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4114                 intel_ring_advance(request, cs);
4115         }
4116
4117         if (mode & EMIT_INVALIDATE) {
4118                 u32 *cs;
4119                 u32 flags = 0;
4120
4121                 flags |= PIPE_CONTROL_CS_STALL;
4122
4123                 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
4124                 flags |= PIPE_CONTROL_TLB_INVALIDATE;
4125                 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4126                 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4127                 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4128                 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4129                 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4130                 flags |= PIPE_CONTROL_QW_WRITE;
4131                 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4132
4133                 cs = intel_ring_begin(request, 6);
4134                 if (IS_ERR(cs))
4135                         return PTR_ERR(cs);
4136
4137                 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4138                 intel_ring_advance(request, cs);
4139         }
4140
4141         return 0;
4142 }
4143
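/*
 * Gen12 overloads MI_ARB_CHECK to toggle the command pre-parser; the
 * extra bits encode the requested pre-parser state. Used below to
 * bracket the TLB invalidation in gen12_emit_flush_render().
 */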
4144 static u32 preparser_disable(bool state)
4145 {
4146         return MI_ARB_CHECK | 1 << 8 | state;
4147 }
4148
4149 static int gen12_emit_flush_render(struct i915_request *request,
4150                                    u32 mode)
4151 {
4152         if (mode & EMIT_FLUSH) {
4153                 u32 flags = 0;
4154                 u32 *cs;
4155
4156                 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
4157                 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4158                 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4159                 /* Wa_1409600907:tgl */
4160                 flags |= PIPE_CONTROL_DEPTH_STALL;
4161                 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4162                 flags |= PIPE_CONTROL_FLUSH_ENABLE;
4163                 flags |= PIPE_CONTROL_HDC_PIPELINE_FLUSH;
4164
4165                 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4166                 flags |= PIPE_CONTROL_QW_WRITE;
4167
4168                 flags |= PIPE_CONTROL_CS_STALL;
4169
4170                 cs = intel_ring_begin(request, 6);
4171                 if (IS_ERR(cs))
4172                         return PTR_ERR(cs);
4173
4174                 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4175                 intel_ring_advance(request, cs);
4176         }
4177
4178         if (mode & EMIT_INVALIDATE) {
4179                 u32 flags = 0;
4180                 u32 *cs;
4181
4182                 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
4183                 flags |= PIPE_CONTROL_TLB_INVALIDATE;
4184                 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4185                 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4186                 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4187                 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4188                 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4189                 flags |= PIPE_CONTROL_L3_RO_CACHE_INVALIDATE;
4190
4191                 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4192                 flags |= PIPE_CONTROL_QW_WRITE;
4193
4194                 flags |= PIPE_CONTROL_CS_STALL;
4195
4196                 cs = intel_ring_begin(request, 8);
4197                 if (IS_ERR(cs))
4198                         return PTR_ERR(cs);
4199
4200                 /*
4201                  * Prevent the pre-parser from skipping past the TLB
4202                  * invalidate and loading a stale page for the batch
4203                  * buffer / request payload.
4204                  */
4205                 *cs++ = preparser_disable(true);
4206
4207                 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4208
4209                 *cs++ = preparser_disable(false);
4210                 intel_ring_advance(request, cs);
4211         }
4212
4213         return 0;
4214 }
4215
4216 /*
4217  * Reserve space for 2 NOOPs at the end of each request to be
4218  * used as a workaround for not being allowed to do lite
4219  * restore with HEAD==TAIL (WaIdleLiteRestore).
4220  */
4221 static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs)
4222 {
4223         /* Ensure there's always at least one preemption point per-request. */
4224         *cs++ = MI_ARB_CHECK;
4225         *cs++ = MI_NOOP;
4226         request->wa_tail = intel_ring_offset(request, cs);
4227
4228         return cs;
4229 }
4230
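/*
 * Busy-wait on the preemption semaphore in the HWSP: with SAD_EQ_SDD the
 * wait completes once the dword at intel_hws_preempt_address() reads zero.
 * While the driver holds that dword non-zero (see ring_set_paused()),
 * completed requests stall here, giving us a clean preemption point just
 * after the breadcrumb for preempt-to-busy.
 */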
4231 static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs)
4232 {
4233         *cs++ = MI_SEMAPHORE_WAIT |
4234                 MI_SEMAPHORE_GLOBAL_GTT |
4235                 MI_SEMAPHORE_POLL |
4236                 MI_SEMAPHORE_SAD_EQ_SDD;
4237         *cs++ = 0;
4238         *cs++ = intel_hws_preempt_address(request->engine);
4239         *cs++ = 0;
4240
4241         return cs;
4242 }
4243
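/*
 * Common epilogue for the fini breadcrumbs: raise the user interrupt,
 * re-enable MI arbitration and, if the engine uses semaphores, emit the
 * preemption busywait; then record the final request tail and append the
 * WaIdleLiteRestore tail beyond it.
 */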
4244 static __always_inline u32*
4245 gen8_emit_fini_breadcrumb_footer(struct i915_request *request,
4246                                  u32 *cs)
4247 {
4248         *cs++ = MI_USER_INTERRUPT;
4249
4250         *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4251         if (intel_engine_has_semaphores(request->engine))
4252                 cs = emit_preempt_busywait(request, cs);
4253
4254         request->tail = intel_ring_offset(request, cs);
4255         assert_ring_tail_valid(request->ring, request->tail);
4256
4257         return gen8_emit_wa_tail(request, cs);
4258 }
4259
4260 static u32 *gen8_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
4261 {
4262         cs = gen8_emit_ggtt_write(cs,
4263                                   request->fence.seqno,
4264                                   i915_request_active_timeline(request)->hwsp_offset,
4265                                   0);
4266
4267         return gen8_emit_fini_breadcrumb_footer(request, cs);
4268 }
4269
4270 static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4271 {
4272         cs = gen8_emit_pipe_control(cs,
4273                                     PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4274                                     PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4275                                     PIPE_CONTROL_DC_FLUSH_ENABLE,
4276                                     0);
4277
4278         /* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */
4279         cs = gen8_emit_ggtt_write_rcs(cs,
4280                                       request->fence.seqno,
4281                                       i915_request_active_timeline(request)->hwsp_offset,
4282                                       PIPE_CONTROL_FLUSH_ENABLE |
4283                                       PIPE_CONTROL_CS_STALL);
4284
4285         return gen8_emit_fini_breadcrumb_footer(request, cs);
4286 }
4287
4288 static u32 *
4289 gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4290 {
4291         cs = gen8_emit_ggtt_write_rcs(cs,
4292                                       request->fence.seqno,
4293                                       i915_request_active_timeline(request)->hwsp_offset,
4294                                       PIPE_CONTROL_CS_STALL |
4295                                       PIPE_CONTROL_TILE_CACHE_FLUSH |
4296                                       PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4297                                       PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4298                                       PIPE_CONTROL_DC_FLUSH_ENABLE |
4299                                       PIPE_CONTROL_FLUSH_ENABLE);
4300
4301         return gen8_emit_fini_breadcrumb_footer(request, cs);
4302 }
4303
4304 /*
4305  * Note that the CS instruction pre-parser will not stall on the breadcrumb
4306  * flush and will continue pre-fetching the instructions after it before the
4307  * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at
4308  * BB_START/END instructions, so even though we might pre-fetch the preamble
4309  * of the next request before the memory has been flushed, we're guaranteed that
4310  * we won't access the batch itself too early.
4311  * However, on gen12+ the parser can pre-fetch across the BB_START/END commands,
4312  * so if the current request is modifying an instruction in the next request on
4313  * the same intel_context, we might pre-fetch and then execute the pre-update
4314  * instruction. To avoid this, the users of self-modifying code should either
4315  * disable the parser around the code emitting the memory writes, via a new flag
4316  * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For
4317  * the in-kernel use-cases we've opted to use a separate context, see
4318  * reloc_gpu() as an example.
4319  * All the above applies only to the instructions themselves. Non-inline data
4320  * used by the instructions is not pre-fetched.
4321  */
4322
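/*
 * Gen12 variant of the preemption busywait using the token form of
 * MI_SEMAPHORE_WAIT; the trailing MI_NOOP keeps the emission an even
 * number of dwords.
 */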
4323 static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs)
4324 {
4325         *cs++ = MI_SEMAPHORE_WAIT_TOKEN |
4326                 MI_SEMAPHORE_GLOBAL_GTT |
4327                 MI_SEMAPHORE_POLL |
4328                 MI_SEMAPHORE_SAD_EQ_SDD;
4329         *cs++ = 0;
4330         *cs++ = intel_hws_preempt_address(request->engine);
4331         *cs++ = 0;
4332         *cs++ = 0;
4333         *cs++ = MI_NOOP;
4334
4335         return cs;
4336 }
4337
4338 static __always_inline u32*
4339 gen12_emit_fini_breadcrumb_footer(struct i915_request *request, u32 *cs)
4340 {
4341         *cs++ = MI_USER_INTERRUPT;
4342
4343         *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4344         if (intel_engine_has_semaphores(request->engine))
4345                 cs = gen12_emit_preempt_busywait(request, cs);
4346
4347         request->tail = intel_ring_offset(request, cs);
4348         assert_ring_tail_valid(request->ring, request->tail);
4349
4350         return gen8_emit_wa_tail(request, cs);
4351 }
4352
4353 static u32 *gen12_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
4354 {
4355         cs = gen8_emit_ggtt_write(cs,
4356                                   request->fence.seqno,
4357                                   i915_request_active_timeline(request)->hwsp_offset,
4358                                   0);
4359
4360         return gen12_emit_fini_breadcrumb_footer(request, cs);
4361 }
4362
4363 static u32 *
4364 gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4365 {
4366         cs = gen8_emit_ggtt_write_rcs(cs,
4367                                       request->fence.seqno,
4368                                       i915_request_active_timeline(request)->hwsp_offset,
4369                                       PIPE_CONTROL_CS_STALL |
4370                                       PIPE_CONTROL_TILE_CACHE_FLUSH |
4371                                       PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4372                                       PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4373                                       /* Wa_1409600907:tgl */
4374                                       PIPE_CONTROL_DEPTH_STALL |
4375                                       PIPE_CONTROL_DC_FLUSH_ENABLE |
4376                                       PIPE_CONTROL_FLUSH_ENABLE |
4377                                       PIPE_CONTROL_HDC_PIPELINE_FLUSH);
4378
4379         return gen12_emit_fini_breadcrumb_footer(request, cs);
4380 }
4381
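/*
 * The engine is about to idle (park): cancel the timeslice and
 * preempt-timeout timers.
 */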
4382 static void execlists_park(struct intel_engine_cs *engine)
4383 {
4384         cancel_timer(&engine->execlists.timer);
4385         cancel_timer(&engine->execlists.preempt);
4386 }
4387
4388 void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
4389 {
4390         engine->submit_request = execlists_submit_request;
4391         engine->schedule = i915_schedule;
4392         engine->execlists.tasklet.func = execlists_submission_tasklet;
4393
4394         engine->reset.prepare = execlists_reset_prepare;
4395         engine->reset.rewind = execlists_reset_rewind;
4396         engine->reset.cancel = execlists_reset_cancel;
4397         engine->reset.finish = execlists_reset_finish;
4398
4399         engine->park = execlists_park;
4400         engine->unpark = NULL;
4401
4402         engine->flags |= I915_ENGINE_SUPPORTS_STATS;
4403         if (!intel_vgpu_active(engine->i915)) {
4404                 engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
4405                 if (HAS_LOGICAL_RING_PREEMPTION(engine->i915)) {
4406                         engine->flags |= I915_ENGINE_HAS_PREEMPTION;
4407                         if (IS_ACTIVE(CONFIG_DRM_I915_TIMESLICE_DURATION))
4408                                 engine->flags |= I915_ENGINE_HAS_TIMESLICES;
4409                 }
4410         }
4411
4412         if (INTEL_GEN(engine->i915) >= 12)
4413                 engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO;
4414
4415         if (intel_engine_has_preemption(engine))
4416                 engine->emit_bb_start = gen8_emit_bb_start;
4417         else
4418                 engine->emit_bb_start = gen8_emit_bb_start_noarb;
4419 }
4420
4421 static void execlists_shutdown(struct intel_engine_cs *engine)
4422 {
4423         /* Synchronise with residual timers and any softirq they raise */
4424         del_timer_sync(&engine->execlists.timer);
4425         del_timer_sync(&engine->execlists.preempt);
4426         tasklet_kill(&engine->execlists.tasklet);
4427 }
4428
4429 static void execlists_release(struct intel_engine_cs *engine)
4430 {
4431         execlists_shutdown(engine);
4432
4433         intel_engine_cleanup_common(engine);
4434         lrc_destroy_wa_ctx(engine);
4435 }
4436
4437 static void
4438 logical_ring_default_vfuncs(struct intel_engine_cs *engine)
4439 {
4440         /* Default vfuncs which can be overridden by each engine. */
4441
4442         engine->resume = execlists_resume;
4443
4444         engine->cops = &execlists_context_ops;
4445         engine->request_alloc = execlists_request_alloc;
4446
4447         engine->emit_flush = gen8_emit_flush;
4448         engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb;
4449         engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb;
4450         if (INTEL_GEN(engine->i915) >= 12)
4451                 engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb;
4452
4453         engine->set_default_submission = intel_execlists_set_default_submission;
4454
4455         if (INTEL_GEN(engine->i915) < 11) {
4456                 engine->irq_enable = gen8_logical_ring_enable_irq;
4457                 engine->irq_disable = gen8_logical_ring_disable_irq;
4458         } else {
4459                 /*
4460                  * TODO: On Gen11 the interrupt masks need to be clear
4461                  * to allow C6 entry. Keep interrupts enabled and take
4462                  * the hit of generating extra interrupts until a more
4463                  * refined solution exists.
4464                  */
4465         }
4466 }
4467
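/*
 * Prior to gen11 the engines share GT interrupt registers, with each
 * engine's bits packed at a fixed per-engine shift; gen11+ provides
 * per-engine interrupt registers, so no shift is required.
 */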
4468 static inline void
4469 logical_ring_default_irqs(struct intel_engine_cs *engine)
4470 {
4471         unsigned int shift = 0;
4472
4473         if (INTEL_GEN(engine->i915) < 11) {
4474                 const u8 irq_shifts[] = {
4475                         [RCS0]  = GEN8_RCS_IRQ_SHIFT,
4476                         [BCS0]  = GEN8_BCS_IRQ_SHIFT,
4477                         [VCS0]  = GEN8_VCS0_IRQ_SHIFT,
4478                         [VCS1]  = GEN8_VCS1_IRQ_SHIFT,
4479                         [VECS0] = GEN8_VECS_IRQ_SHIFT,
4480                 };
4481
4482                 shift = irq_shifts[engine->id];
4483         }
4484
4485         engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
4486         engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
4487         engine->irq_keep_mask |= GT_CS_MASTER_ERROR_INTERRUPT << shift;
4488         engine->irq_keep_mask |= GT_WAIT_SEMAPHORE_INTERRUPT << shift;
4489 }
4490
4491 static void rcs_submission_override(struct intel_engine_cs *engine)
4492 {
4493         switch (INTEL_GEN(engine->i915)) {
4494         case 12:
4495                 engine->emit_flush = gen12_emit_flush_render;
4496                 engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs;
4497                 break;
4498         case 11:
4499                 engine->emit_flush = gen11_emit_flush_render;
4500                 engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs;
4501                 break;
4502         default:
4503                 engine->emit_flush = gen8_emit_flush_render;
4504                 engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs;
4505                 break;
4506         }
4507 }
4508
4509 int intel_execlists_submission_setup(struct intel_engine_cs *engine)
4510 {
4511         struct intel_engine_execlists * const execlists = &engine->execlists;
4512         struct drm_i915_private *i915 = engine->i915;
4513         struct intel_uncore *uncore = engine->uncore;
4514         u32 base = engine->mmio_base;
4515
4516         tasklet_init(&engine->execlists.tasklet,
4517                      execlists_submission_tasklet, (unsigned long)engine);
4518         timer_setup(&engine->execlists.timer, execlists_timeslice, 0);
4519         timer_setup(&engine->execlists.preempt, execlists_preempt, 0);
4520
4521         logical_ring_default_vfuncs(engine);
4522         logical_ring_default_irqs(engine);
4523
4524         if (engine->class == RENDER_CLASS)
4525                 rcs_submission_override(engine);
4526
4527         if (intel_init_workaround_bb(engine))
4528                 /*
4529                  * We continue even if we fail to initialize the WA batch
4530                  * because we only expect rare glitches, nothing critical
4531                  * enough to prevent us from using the GPU.
4532                  */
4533                 DRM_ERROR("WA batch buffer initialization failed\n");
4534
4535         if (HAS_LOGICAL_RING_ELSQ(i915)) {
4536                 execlists->submit_reg = uncore->regs +
4537                         i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base));
4538                 execlists->ctrl_reg = uncore->regs +
4539                         i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base));
4540         } else {
4541                 execlists->submit_reg = uncore->regs +
4542                         i915_mmio_reg_offset(RING_ELSP(base));
4543         }
4544
4545         execlists->csb_status =
4546                 &engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX];
4547
4548         execlists->csb_write =
4549                 &engine->status_page.addr[intel_hws_csb_write_index(i915)];
4550
4551         if (INTEL_GEN(i915) < 11)
4552                 execlists->csb_size = GEN8_CSB_ENTRIES;
4553         else
4554                 execlists->csb_size = GEN11_CSB_ENTRIES;
4555
4556         if (INTEL_GEN(engine->i915) >= 11) {
4557                 execlists->ccid |= engine->instance << (GEN11_ENGINE_INSTANCE_SHIFT - 32);
4558                 execlists->ccid |= engine->class << (GEN11_ENGINE_CLASS_SHIFT - 32);
4559         }
4560
4561         reset_csb_pointers(engine);
4562
4563         /* Finally, take ownership and responsibility for cleanup! */
4564         engine->release = execlists_release;
4565
4566         return 0;
4567 }
4568
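/*
 * Per-gen default value for the RCS indirect-context-offset field; it is
 * shifted into place next to the indirect context (WA batch) pointer by
 * init_wa_bb_reg_state() below.
 */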
4569 static u32 intel_lr_indirect_ctx_offset(const struct intel_engine_cs *engine)
4570 {
4571         u32 indirect_ctx_offset;
4572
4573         switch (INTEL_GEN(engine->i915)) {
4574         default:
4575                 MISSING_CASE(INTEL_GEN(engine->i915));
4576                 /* fall through */
4577         case 12:
4578                 indirect_ctx_offset =
4579                         GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4580                 break;
4581         case 11:
4582                 indirect_ctx_offset =
4583                         GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4584                 break;
4585         case 10:
4586                 indirect_ctx_offset =
4587                         GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4588                 break;
4589         case 9:
4590                 indirect_ctx_offset =
4591                         GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4592                 break;
4593         case 8:
4594                 indirect_ctx_offset =
4595                         GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4596                 break;
4597         }
4598
4599         return indirect_ctx_offset;
4600 }
4601
4603 static void init_common_reg_state(u32 * const regs,
4604                                   const struct intel_engine_cs *engine,
4605                                   const struct intel_ring *ring,
4606                                   bool inhibit)
4607 {
4608         u32 ctl;
4609
4610         ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
4611         ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
4612         if (inhibit)
4613                 ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
4614         if (INTEL_GEN(engine->i915) < 11)
4615                 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
4616                                            CTX_CTRL_RS_CTX_ENABLE);
4617         regs[CTX_CONTEXT_CONTROL] = ctl;
4618
4619         regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
4620 }
4621
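/*
 * Point the per-context and indirect-context batch buffer registers at
 * the engine's workaround batch (wa_ctx): bit 0 marks the per-ctx pointer
 * as valid, while the indirect pointer carries the batch size in
 * cachelines in its low bits.
 */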
4622 static void init_wa_bb_reg_state(u32 * const regs,
4623                                  const struct intel_engine_cs *engine,
4624                                  u32 pos_bb_per_ctx)
4625 {
4626         const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
4627
4628         if (wa_ctx->per_ctx.size) {
4629                 const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
4630
4631                 regs[pos_bb_per_ctx] =
4632                         (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
4633         }
4634
4635         if (wa_ctx->indirect_ctx.size) {
4636                 const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
4637
4638                 regs[pos_bb_per_ctx + 2] =
4639                         (ggtt_offset + wa_ctx->indirect_ctx.offset) |
4640                         (wa_ctx->indirect_ctx.size / CACHELINE_BYTES);
4641
4642                 regs[pos_bb_per_ctx + 4] =
4643                         intel_lr_indirect_ctx_offset(engine) << 6;
4644         }
4645 }
4646
4647 static void init_ppgtt_reg_state(u32 *regs, const struct i915_ppgtt *ppgtt)
4648 {
4649         if (i915_vm_is_4lvl(&ppgtt->vm)) {
4650                 /* 64b PPGTT (48bit canonical)
4651                  * PDP0_DESCRIPTOR contains the base address of the PML4;
4652                  * other PDP Descriptors are ignored.
4653                  */
4654                 ASSIGN_CTX_PML4(ppgtt, regs);
4655         } else {
4656                 ASSIGN_CTX_PDP(ppgtt, regs, 3);
4657                 ASSIGN_CTX_PDP(ppgtt, regs, 2);
4658                 ASSIGN_CTX_PDP(ppgtt, regs, 1);
4659                 ASSIGN_CTX_PDP(ppgtt, regs, 0);
4660         }
4661 }
4662
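/*
 * If the context is bound directly to the GGTT (aliasing-ppgtt mode), the
 * PPGTT state comes from the alias hanging off the GGTT; otherwise the
 * context's vm is already a full ppgtt.
 */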
4663 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
4664 {
4665         if (i915_is_ggtt(vm))
4666                 return i915_vm_to_ggtt(vm)->alias;
4667         else
4668                 return i915_vm_to_ppgtt(vm);
4669 }
4670
4671 static void execlists_init_reg_state(u32 *regs,
4672                                      const struct intel_context *ce,
4673                                      const struct intel_engine_cs *engine,
4674                                      const struct intel_ring *ring,
4675                                      bool inhibit)
4676 {
4677         /*
4678          * A context is actually a big batch buffer with several
4679          * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
4680          * values we are setting here are only for the first context restore:
4681          * on a subsequent save, the GPU will recreate this batchbuffer with new
4682          * values (including all the missing MI_LOAD_REGISTER_IMM commands that
4683          * we are not initializing here).
4684          *
4685          * Must keep consistent with virtual_update_register_offsets().
4686          */
4687         set_offsets(regs, reg_offsets(engine), engine, inhibit);
4688
4689         init_common_reg_state(regs, engine, ring, inhibit);
4690         init_ppgtt_reg_state(regs, vm_alias(ce->vm));
4691
4692         init_wa_bb_reg_state(regs, engine,
4693                              INTEL_GEN(engine->i915) >= 12 ?
4694                              GEN12_CTX_BB_PER_CTX_PTR :
4695                              CTX_BB_PER_CTX_PTR);
4696
4697         __reset_stop_ring(regs, engine);
4698 }
4699
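/*
 * Fill a freshly allocated context image: copy the engine's default state
 * if we have one (marking the context valid), clear the ppHWSP and then
 * write the initial register state for the first restore.
 */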
4700 static int
4701 populate_lr_context(struct intel_context *ce,
4702                     struct drm_i915_gem_object *ctx_obj,
4703                     struct intel_engine_cs *engine,
4704                     struct intel_ring *ring)
4705 {
4706         bool inhibit = true;
4707         void *vaddr;
4708         int ret;
4709
4710         vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
4711         if (IS_ERR(vaddr)) {
4712                 ret = PTR_ERR(vaddr);
4713                 DRM_DEBUG_DRIVER("Could not map object pages! (%d)\n", ret);
4714                 return ret;
4715         }
4716
4717         set_redzone(vaddr, engine);
4718
4719         if (engine->default_state) {
4720                 void *defaults;
4721
4722                 defaults = i915_gem_object_pin_map(engine->default_state,
4723                                                    I915_MAP_WB);
4724                 if (IS_ERR(defaults)) {
4725                         ret = PTR_ERR(defaults);
4726                         goto err_unpin_ctx;
4727                 }
4728
4729                 memcpy(vaddr, defaults, engine->context_size);
4730                 i915_gem_object_unpin_map(engine->default_state);
4731                 __set_bit(CONTEXT_VALID_BIT, &ce->flags);
4732                 inhibit = false;
4733         }
4734
4735         /* Clear the ppHWSP (inc. per-context counters) */
4736         memset(vaddr, 0, PAGE_SIZE);
4737
4738         /*
4739          * The second page of the context object contains some registers which
4740          * must be set up prior to the first execution.
4741          */
4742         execlists_init_reg_state(vaddr + LRC_STATE_PN * PAGE_SIZE,
4743                                  ce, engine, ring, inhibit);
4744
4745         ret = 0;
4746 err_unpin_ctx:
4747         __i915_gem_object_flush_map(ctx_obj, 0, engine->context_size);
4748         i915_gem_object_unpin_map(ctx_obj);
4749         return ret;
4750 }
4751
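/*
 * Allocate the backing store for a context: the context image (plus a
 * redzone page when debugging), its GGTT vma, a timeline (using the global
 * HWSP for barrier/kernel contexts) and the ring, then populate the image.
 */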
4752 static int __execlists_context_alloc(struct intel_context *ce,
4753                                      struct intel_engine_cs *engine)
4754 {
4755         struct drm_i915_gem_object *ctx_obj;
4756         struct intel_ring *ring;
4757         struct i915_vma *vma;
4758         u32 context_size;
4759         int ret;
4760
4761         GEM_BUG_ON(ce->state);
4762         context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
4763
4764         if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
4765                 context_size += I915_GTT_PAGE_SIZE; /* for redzone */
4766
4767         ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size);
4768         if (IS_ERR(ctx_obj))
4769                 return PTR_ERR(ctx_obj);
4770
4771         vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL);
4772         if (IS_ERR(vma)) {
4773                 ret = PTR_ERR(vma);
4774                 goto error_deref_obj;
4775         }
4776
4777         if (!ce->timeline) {
4778                 struct intel_timeline *tl;
4779                 struct i915_vma *hwsp;
4780
4781                 /*
4782                  * Use the static global HWSP for the kernel context, and
4783                  * a dynamically allocated cacheline for everyone else.
4784                  */
4785                 hwsp = NULL;
4786                 if (unlikely(intel_context_is_barrier(ce)))
4787                         hwsp = engine->status_page.vma;
4788
4789                 tl = intel_timeline_create(engine->gt, hwsp);
4790                 if (IS_ERR(tl)) {
4791                         ret = PTR_ERR(tl);
4792                         goto error_deref_obj;
4793                 }
4794
4795                 ce->timeline = tl;
4796         }
4797
4798         ring = intel_engine_create_ring(engine, (unsigned long)ce->ring);
4799         if (IS_ERR(ring)) {
4800                 ret = PTR_ERR(ring);
4801                 goto error_deref_obj;
4802         }
4803
4804         ret = populate_lr_context(ce, ctx_obj, engine, ring);
4805         if (ret) {
4806                 DRM_DEBUG_DRIVER("Failed to populate LRC: %d\n", ret);
4807                 goto error_ring_free;
4808         }
4809
4810         ce->ring = ring;
4811         ce->state = vma;
4812
4813         return 0;
4814
4815 error_ring_free:
4816         intel_ring_put(ring);
4817 error_deref_obj:
4818         i915_gem_object_put(ctx_obj);
4819         return ret;
4820 }
4821
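/*
 * The virtual engine keeps its single pending request on the first request
 * list of its (otherwise unused) default priolist while waiting for the
 * submission tasklet to hand it over to a sibling.
 */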
4822 static struct list_head *virtual_queue(struct virtual_engine *ve)
4823 {
4824         return &ve->base.execlists.default_priolist.requests[0];
4825 }
4826
4827 static void virtual_context_destroy(struct kref *kref)
4828 {
4829         struct virtual_engine *ve =
4830                 container_of(kref, typeof(*ve), context.ref);
4831         unsigned int n;
4832
4833         GEM_BUG_ON(!list_empty(virtual_queue(ve)));
4834         GEM_BUG_ON(ve->request);
4835         GEM_BUG_ON(ve->context.inflight);
4836
4837         for (n = 0; n < ve->num_siblings; n++) {
4838                 struct intel_engine_cs *sibling = ve->siblings[n];
4839                 struct rb_node *node = &ve->nodes[sibling->id].rb;
4840                 unsigned long flags;
4841
4842                 if (RB_EMPTY_NODE(node))
4843                         continue;
4844
4845                 spin_lock_irqsave(&sibling->active.lock, flags);
4846
4847                 /* Detachment is lazily performed in the execlists tasklet */
4848                 if (!RB_EMPTY_NODE(node))
4849                         rb_erase_cached(node, &sibling->execlists.virtual);
4850
4851                 spin_unlock_irqrestore(&sibling->active.lock, flags);
4852         }
4853         GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet));
4854
4855         if (ve->context.state)
4856                 __execlists_context_fini(&ve->context);
4857         intel_context_fini(&ve->context);
4858
4859         kfree(ve->bonds);
4860         kfree(ve);
4861 }
4862
4863 static void virtual_engine_initial_hint(struct virtual_engine *ve)
4864 {
4865         int swp;
4866
4867         /*
4868          * Pick a random sibling on starting to help spread the load around.
4869          *
4870          * New contexts are typically created with exactly the same order
4871          * of siblings, and often started in batches. Due to the way we iterate
4872          * the array of siblings when submitting requests, sibling[0] is
4873          * prioritised for dequeuing. If we make sure that sibling[0] is fairly
4874          * randomised across the system, we also help spread the load by the
4875          * first engine we inspect being different each time.
4876          *
4877          * NB This does not force us to execute on this engine, it will just
4878          * typically be the first we inspect for submission.
4879          */
4880         swp = prandom_u32_max(ve->num_siblings);
4881         if (!swp)
4882                 return;
4883
4884         swap(ve->siblings[swp], ve->siblings[0]);
4885         if (!intel_engine_has_relative_mmio(ve->siblings[0]))
4886                 virtual_update_register_offsets(ve->context.lrc_reg_state,
4887                                                 ve->siblings[0]);
4888 }
4889
4890 static int virtual_context_alloc(struct intel_context *ce)
4891 {
4892         struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4893
4894         return __execlists_context_alloc(ce, ve->siblings[0]);
4895 }
4896
4897 static int virtual_context_pin(struct intel_context *ce)
4898 {
4899         struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4900         int err;
4901
4902         /* Note: we must use a real engine class for setting up reg state */
4903         err = __execlists_context_pin(ce, ve->siblings[0]);
4904         if (err)
4905                 return err;
4906
4907         virtual_engine_initial_hint(ve);
4908         return 0;
4909 }
4910
4911 static void virtual_context_enter(struct intel_context *ce)
4912 {
4913         struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4914         unsigned int n;
4915
4916         for (n = 0; n < ve->num_siblings; n++)
4917                 intel_engine_pm_get(ve->siblings[n]);
4918
4919         intel_timeline_enter(ce->timeline);
4920 }
4921
4922 static void virtual_context_exit(struct intel_context *ce)
4923 {
4924         struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4925         unsigned int n;
4926
4927         intel_timeline_exit(ce->timeline);
4928
4929         for (n = 0; n < ve->num_siblings; n++)
4930                 intel_engine_pm_put(ve->siblings[n]);
4931 }
4932
4933 static const struct intel_context_ops virtual_context_ops = {
4934         .alloc = virtual_context_alloc,
4935
4936         .pin = virtual_context_pin,
4937         .unpin = execlists_context_unpin,
4938
4939         .enter = virtual_context_enter,
4940         .exit = virtual_context_exit,
4941
4942         .destroy = virtual_context_destroy,
4943 };
4944
4945 static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve)
4946 {
4947         struct i915_request *rq;
4948         intel_engine_mask_t mask;
4949
4950         rq = READ_ONCE(ve->request);
4951         if (!rq)
4952                 return 0;
4953
4954         /* The rq is ready for submission; rq->execution_mask is now stable. */
4955         mask = rq->execution_mask;
4956         if (unlikely(!mask)) {
4957                 /* Invalid selection, submit to a random engine in error */
4958                 i915_request_set_error_once(rq, -ENODEV);
4959                 mask = ve->siblings[0]->mask;
4960         }
4961
4962         ENGINE_TRACE(&ve->base, "rq=%llx:%lld, mask=%x, prio=%d\n",
4963                      rq->fence.context, rq->fence.seqno,
4964                      mask, ve->base.execlists.queue_priority_hint);
4965
4966         return mask;
4967 }
4968
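/*
 * Offer the pending virtual request to every sibling allowed by its
 * execution mask: (re)insert this virtual engine's node into each
 * sibling's rbtree of virtual requests, ordered by priority, and kick the
 * sibling's tasklet if we just became its best virtual candidate.
 * Siblings that are no longer eligible have their node removed.
 */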
4969 static void virtual_submission_tasklet(unsigned long data)
4970 {
4971         struct virtual_engine * const ve = (struct virtual_engine *)data;
4972         const int prio = READ_ONCE(ve->base.execlists.queue_priority_hint);
4973         intel_engine_mask_t mask;
4974         unsigned int n;
4975
4976         rcu_read_lock();
4977         mask = virtual_submission_mask(ve);
4978         rcu_read_unlock();
4979         if (unlikely(!mask))
4980                 return;
4981
4982         local_irq_disable();
4983         for (n = 0; READ_ONCE(ve->request) && n < ve->num_siblings; n++) {
4984                 struct intel_engine_cs *sibling = ve->siblings[n];
4985                 struct ve_node * const node = &ve->nodes[sibling->id];
4986                 struct rb_node **parent, *rb;
4987                 bool first;
4988
4989                 if (unlikely(!(mask & sibling->mask))) {
4990                         if (!RB_EMPTY_NODE(&node->rb)) {
4991                                 spin_lock(&sibling->active.lock);
4992                                 rb_erase_cached(&node->rb,
4993                                                 &sibling->execlists.virtual);
4994                                 RB_CLEAR_NODE(&node->rb);
4995                                 spin_unlock(&sibling->active.lock);
4996                         }
4997                         continue;
4998                 }
4999
5000                 spin_lock(&sibling->active.lock);
5001
5002                 if (!RB_EMPTY_NODE(&node->rb)) {
5003                         /*
5004                          * Cheat and avoid rebalancing the tree if we can
5005                          * reuse this node in situ.
5006                          */
5007                         first = rb_first_cached(&sibling->execlists.virtual) ==
5008                                 &node->rb;
5009                         if (prio == node->prio || (prio > node->prio && first))
5010                                 goto submit_engine;
5011
5012                         rb_erase_cached(&node->rb, &sibling->execlists.virtual);
5013                 }
5014
5015                 rb = NULL;
5016                 first = true;
5017                 parent = &sibling->execlists.virtual.rb_root.rb_node;
5018                 while (*parent) {
5019                         struct ve_node *other;
5020
5021                         rb = *parent;
5022                         other = rb_entry(rb, typeof(*other), rb);
5023                         if (prio > other->prio) {
5024                                 parent = &rb->rb_left;
5025                         } else {
5026                                 parent = &rb->rb_right;
5027                                 first = false;
5028                         }
5029                 }
5030
5031                 rb_link_node(&node->rb, rb, parent);
5032                 rb_insert_color_cached(&node->rb,
5033                                        &sibling->execlists.virtual,
5034                                        first);
5035
5036 submit_engine:
5037                 GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
5038                 node->prio = prio;
5039                 if (first && prio > sibling->execlists.queue_priority_hint) {
5040                         sibling->execlists.queue_priority_hint = prio;
5041                         tasklet_hi_schedule(&sibling->execlists.tasklet);
5042                 }
5043
5044                 spin_unlock(&sibling->active.lock);
5045         }
5046         local_irq_enable();
5047 }
5048
5049 static void virtual_submit_request(struct i915_request *rq)
5050 {
5051         struct virtual_engine *ve = to_virtual_engine(rq->engine);
5052         struct i915_request *old;
5053         unsigned long flags;
5054
5055         ENGINE_TRACE(&ve->base, "rq=%llx:%lld\n",
5056                      rq->fence.context,
5057                      rq->fence.seqno);
5058
5059         GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);
5060
5061         spin_lock_irqsave(&ve->base.active.lock, flags);
5062
5063         old = ve->request;
5064         if (old) { /* background completion event from preempt-to-busy */
5065                 GEM_BUG_ON(!i915_request_completed(old));
5066                 __i915_request_submit(old);
5067                 i915_request_put(old);
5068         }
5069
5070         if (i915_request_completed(rq)) {
5071                 __i915_request_submit(rq);
5072
5073                 ve->base.execlists.queue_priority_hint = INT_MIN;
5074                 ve->request = NULL;
5075         } else {
5076                 ve->base.execlists.queue_priority_hint = rq_prio(rq);
5077                 ve->request = i915_request_get(rq);
5078
5079                 GEM_BUG_ON(!list_empty(virtual_queue(ve)));
5080                 list_move_tail(&rq->sched.link, virtual_queue(ve));
5081
5082                 tasklet_schedule(&ve->base.execlists.tasklet);
5083         }
5084
5085         spin_unlock_irqrestore(&ve->base.active.lock, flags);
5086 }
5087
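/* Look up the bond (allowed sibling mask) registered for @master, if any. */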
5088 static struct ve_bond *
5089 virtual_find_bond(struct virtual_engine *ve,
5090                   const struct intel_engine_cs *master)
5091 {
5092         int i;
5093
5094         for (i = 0; i < ve->num_bonds; i++) {
5095                 if (ve->bonds[i].master == master)
5096                         return &ve->bonds[i];
5097         }
5098
5099         return NULL;
5100 }
5101
5102 static void
5103 virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
5104 {
5105         struct virtual_engine *ve = to_virtual_engine(rq->engine);
5106         intel_engine_mask_t allowed, exec;
5107         struct ve_bond *bond;
5108
5109         allowed = ~to_request(signal)->engine->mask;
5110
5111         bond = virtual_find_bond(ve, to_request(signal)->engine);
5112         if (bond)
5113                 allowed &= bond->sibling_mask;
5114
5115         /* Restrict the bonded request to run on only the available engines */
5116         exec = READ_ONCE(rq->execution_mask);
5117         while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed))
5118                 ;
5119
5120         /* Prevent the master from being re-run on the bonded engines */
5121         to_request(signal)->execution_mask &= ~allowed;
5122 }
5123
5124 struct intel_context *
5125 intel_execlists_create_virtual(struct intel_engine_cs **siblings,
5126                                unsigned int count)
5127 {
5128         struct virtual_engine *ve;
5129         unsigned int n;
5130         int err;
5131
5132         if (count == 0)
5133                 return ERR_PTR(-EINVAL);
5134
5135         if (count == 1)
5136                 return intel_context_create(siblings[0]);
5137
5138         ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
5139         if (!ve)
5140                 return ERR_PTR(-ENOMEM);
5141
5142         ve->base.i915 = siblings[0]->i915;
5143         ve->base.gt = siblings[0]->gt;
5144         ve->base.uncore = siblings[0]->uncore;
5145         ve->base.id = -1;
5146
5147         ve->base.class = OTHER_CLASS;
5148         ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
5149         ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
5150         ve->base.uabi_instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
5151
5152         /*
5153          * The decision on whether to submit a request using semaphores
5154          * depends on the saturated state of the engine. We only compute
5155          * this during HW submission of the request, and we need this
5156          * state to be globally applied to all requests being submitted
5157          * to this engine. Virtual engines encompass more than one physical
5158          * engine and so we cannot accurately tell in advance if one of those
5159          * engines is already saturated and so cannot afford to use a semaphore
5160          * and be pessimized in priority for doing so -- if we are the only
5161          * context using semaphores after all other clients have stopped, we
5162          * will be starved on the saturated system. Such a global switch for
5163          * semaphores is less than ideal, but alas is the current compromise.
5164          */
5165         ve->base.saturated = ALL_ENGINES;
5166
5167         snprintf(ve->base.name, sizeof(ve->base.name), "virtual");
5168
5169         intel_engine_init_active(&ve->base, ENGINE_VIRTUAL);
5170         intel_engine_init_breadcrumbs(&ve->base);
5171         intel_engine_init_execlists(&ve->base);
5172
5173         ve->base.cops = &virtual_context_ops;
5174         ve->base.request_alloc = execlists_request_alloc;
5175
5176         ve->base.schedule = i915_schedule;
5177         ve->base.submit_request = virtual_submit_request;
5178         ve->base.bond_execute = virtual_bond_execute;
5179
5180         INIT_LIST_HEAD(virtual_queue(ve));
5181         ve->base.execlists.queue_priority_hint = INT_MIN;
5182         tasklet_init(&ve->base.execlists.tasklet,
5183                      virtual_submission_tasklet,
5184                      (unsigned long)ve);
5185
5186         intel_context_init(&ve->context, &ve->base);
5187
5188         for (n = 0; n < count; n++) {
5189                 struct intel_engine_cs *sibling = siblings[n];
5190
5191                 GEM_BUG_ON(!is_power_of_2(sibling->mask));
5192                 if (sibling->mask & ve->base.mask) {
5193                         DRM_DEBUG("duplicate %s entry in load balancer\n",
5194                                   sibling->name);
5195                         err = -EINVAL;
5196                         goto err_put;
5197                 }
5198
5199                 /*
5200                  * The virtual engine implementation is tightly coupled to
5201                  * the execlists backend -- we push requests directly
5202                  * into a tree inside each physical engine. We could support
5203                  * layering if we handle cloning of the requests and
5204                  * submitting a copy into each backend.
5205                  */
5206                 if (sibling->execlists.tasklet.func !=
5207                     execlists_submission_tasklet) {
5208                         err = -ENODEV;
5209                         goto err_put;
5210                 }
5211
5212                 GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
5213                 RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);
5214
5215                 ve->siblings[ve->num_siblings++] = sibling;
5216                 ve->base.mask |= sibling->mask;
5217
5218                 /*
5219                  * All physical engines must be compatible for their emission
5220                  * functions (as we build the instructions during request
5221                  * construction and do not alter them before submission
5222                  * on the physical engine). We use the engine class as a guide
5223                  * here, although that could be refined.
5224                  */
5225                 if (ve->base.class != OTHER_CLASS) {
5226                         if (ve->base.class != sibling->class) {
5227                                 DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n",
5228                                           sibling->class, ve->base.class);
5229                                 err = -EINVAL;
5230                                 goto err_put;
5231                         }
5232                         continue;
5233                 }
5234
5235                 ve->base.class = sibling->class;
5236                 ve->base.uabi_class = sibling->uabi_class;
5237                 snprintf(ve->base.name, sizeof(ve->base.name),
5238                          "v%dx%d", ve->base.class, count);
5239                 ve->base.context_size = sibling->context_size;
5240
5241                 ve->base.emit_bb_start = sibling->emit_bb_start;
5242                 ve->base.emit_flush = sibling->emit_flush;
5243                 ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
5244                 ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
5245                 ve->base.emit_fini_breadcrumb_dw =
5246                         sibling->emit_fini_breadcrumb_dw;
5247
5248                 ve->base.flags = sibling->flags;
5249         }
5250
5251         ve->base.flags |= I915_ENGINE_IS_VIRTUAL;
5252
5253         return &ve->context;
5254
5255 err_put:
5256         intel_context_put(&ve->context);
5257         return ERR_PTR(err);
5258 }
5259
5260 struct intel_context *
5261 intel_execlists_clone_virtual(struct intel_engine_cs *src)
5262 {
5263         struct virtual_engine *se = to_virtual_engine(src);
5264         struct intel_context *dst;
5265
5266         dst = intel_execlists_create_virtual(se->siblings,
5267                                              se->num_siblings);
5268         if (IS_ERR(dst))
5269                 return dst;
5270
5271         if (se->num_bonds) {
5272                 struct virtual_engine *de = to_virtual_engine(dst->engine);
5273
5274                 de->bonds = kmemdup(se->bonds,
5275                                     sizeof(*se->bonds) * se->num_bonds,
5276                                     GFP_KERNEL);
5277                 if (!de->bonds) {
5278                         intel_context_put(dst);
5279                         return ERR_PTR(-ENOMEM);
5280                 }
5281
5282                 de->num_bonds = se->num_bonds;
5283         }
5284
5285         return dst;
5286 }
5287
5288 int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
5289                                      const struct intel_engine_cs *master,
5290                                      const struct intel_engine_cs *sibling)
5291 {
5292         struct virtual_engine *ve = to_virtual_engine(engine);
5293         struct ve_bond *bond;
5294         int n;
5295
5296         /* Sanity check the sibling is part of the virtual engine */
5297         for (n = 0; n < ve->num_siblings; n++)
5298                 if (sibling == ve->siblings[n])
5299                         break;
5300         if (n == ve->num_siblings)
5301                 return -EINVAL;
5302
5303         bond = virtual_find_bond(ve, master);
5304         if (bond) {
5305                 bond->sibling_mask |= sibling->mask;
5306                 return 0;
5307         }
5308
5309         bond = krealloc(ve->bonds,
5310                         sizeof(*bond) * (ve->num_bonds + 1),
5311                         GFP_KERNEL);
5312         if (!bond)
5313                 return -ENOMEM;
5314
5315         bond[ve->num_bonds].master = master;
5316         bond[ve->num_bonds].sibling_mask = sibling->mask;
5317
5318         ve->bonds = bond;
5319         ve->num_bonds++;
5320
5321         return 0;
5322 }
5323
5324 struct intel_engine_cs *
5325 intel_virtual_engine_get_sibling(struct intel_engine_cs *engine,
5326                                  unsigned int sibling)
5327 {
5328         struct virtual_engine *ve = to_virtual_engine(engine);
5329
5330         if (sibling >= ve->num_siblings)
5331                 return NULL;
5332
5333         return ve->siblings[sibling];
5334 }
5335
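/*
 * Debug helper: print up to @max requests from each of the executing,
 * queued and virtual lists using the caller-supplied @show_request.
 */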
5336 void intel_execlists_show_requests(struct intel_engine_cs *engine,
5337                                    struct drm_printer *m,
5338                                    void (*show_request)(struct drm_printer *m,
5339                                                         struct i915_request *rq,
5340                                                         const char *prefix),
5341                                    unsigned int max)
5342 {
5343         const struct intel_engine_execlists *execlists = &engine->execlists;
5344         struct i915_request *rq, *last;
5345         unsigned long flags;
5346         unsigned int count;
5347         struct rb_node *rb;
5348
5349         spin_lock_irqsave(&engine->active.lock, flags);
5350
5351         last = NULL;
5352         count = 0;
5353         list_for_each_entry(rq, &engine->active.requests, sched.link) {
5354                 if (count++ < max - 1)
5355                         show_request(m, rq, "\t\tE ");
5356                 else
5357                         last = rq;
5358         }
5359         if (last) {
5360                 if (count > max) {
5361                         drm_printf(m,
5362                                    "\t\t...skipping %d executing requests...\n",
5363                                    count - max);
5364                 }
5365                 show_request(m, last, "\t\tE ");
5366         }
5367
5368         if (execlists->switch_priority_hint != INT_MIN)
5369                 drm_printf(m, "\t\tSwitch priority hint: %d\n",
5370                            READ_ONCE(execlists->switch_priority_hint));
5371         if (execlists->queue_priority_hint != INT_MIN)
5372                 drm_printf(m, "\t\tQueue priority hint: %d\n",
5373                            READ_ONCE(execlists->queue_priority_hint));
5374
5375         last = NULL;
5376         count = 0;
5377         for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
5378                 struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
5379                 int i;
5380
5381                 priolist_for_each_request(rq, p, i) {
5382                         if (count++ < max - 1)
5383                                 show_request(m, rq, "\t\tQ ");
5384                         else
5385                                 last = rq;
5386                 }
5387         }
5388         if (last) {
5389                 if (count > max) {
5390                         drm_printf(m,
5391                                    "\t\t...skipping %d queued requests...\n",
5392                                    count - max);
5393                 }
5394                 show_request(m, last, "\t\tQ ");
5395         }
5396
5397         last = NULL;
5398         count = 0;
5399         for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
5400                 struct virtual_engine *ve =
5401                         rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
5402                 struct i915_request *rq = READ_ONCE(ve->request);
5403
5404                 if (rq) {
5405                         if (count++ < max - 1)
5406                                 show_request(m, rq, "\t\tV ");
5407                         else
5408                                 last = rq;
5409                 }
5410         }
5411         if (last) {
5412                 if (count > max) {
5413                         drm_printf(m,
5414                                    "\t\t...skipping %d virtual requests...\n",
5415                                    count - max);
5416                 }
5417                 show_request(m, last, "\t\tV ");
5418         }
5419
5420         spin_unlock_irqrestore(&engine->active.lock, flags);
5421 }
5422
5423 void intel_lr_context_reset(struct intel_engine_cs *engine,
5424                             struct intel_context *ce,
5425                             u32 head,
5426                             bool scrub)
5427 {
5428         GEM_BUG_ON(!intel_context_is_pinned(ce));
5429
5430         /*
5431          * We want a simple context + ring to execute the breadcrumb update.
5432          * We cannot rely on the context being intact across the GPU hang,
5433          * so clear it and rebuild just what we need for the breadcrumb.
5434          * All pending requests for this context will be zapped, and any
5435          * future request will be after userspace has had the opportunity
5436          * to recreate its own state.
5437          */
5438         if (scrub)
5439                 restore_default_state(ce, engine);
5440
5441         /* Rerun the request; its payload has been neutered (if guilty). */
5442         __execlists_update_reg_state(ce, engine, head);
5443 }
5444
5445 bool
5446 intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine)
5447 {
5448         return engine->set_default_submission ==
5449                intel_execlists_set_default_submission;
5450 }
5451
5452 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
5453 #include "selftest_lrc.c"
5454 #endif