linux-2.6-microblaze.git / drivers/gpu/drm/i915/gt/intel_lrc.c
1 /*
2  * Copyright © 2014 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * Authors:
24  *    Ben Widawsky <ben@bwidawsk.net>
25  *    Michel Thierry <michel.thierry@intel.com>
26  *    Thomas Daniel <thomas.daniel@intel.com>
27  *    Oscar Mateo <oscar.mateo@intel.com>
28  *
29  */
30
31 /**
32  * DOC: Logical Rings, Logical Ring Contexts and Execlists
33  *
34  * Motivation:
35  * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
36  * These expanded contexts enable a number of new abilities, especially
37  * "Execlists" (also implemented in this file).
38  *
39  * One of the main differences with the legacy HW contexts is that logical
40  * ring contexts incorporate many more things into the context's state, like
41  * PDPs or ringbuffer control registers:
42  *
43  * The reason why PDPs are included in the context is straightforward: as
44  * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
45  * contained there means you don't need to do a ppgtt->switch_mm yourself;
46  * instead, the GPU will do it for you on the context switch.
47  *
48  * But what about the ringbuffer control registers (head, tail, etc.)?
49  * Shouldn't we just need one set of those per engine command streamer? This is
50  * where the name "Logical Rings" starts to make sense: by virtualizing the
51  * rings, the engine cs shifts to a new "ring buffer" with every context
52  * switch. When you want to submit a workload to the GPU you: A) choose your
53  * context, B) find its appropriate virtualized ring, C) write commands to it
54  * and then, finally, D) tell the GPU to switch to that context.
55  *
56  * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
57  * to a context is via a context execution list, ergo "Execlists".
58  *
59  * LRC implementation:
60  * Regarding the creation of contexts, we have:
61  *
62  * - One global default context.
63  * - One local default context for each opened fd.
64  * - One local extra context for each context create ioctl call.
65  *
66  * Now that ringbuffers belong per-context (and not per-engine, like before)
67  * and that contexts are uniquely tied to a given engine (and not reusable,
68  * like before) we need:
69  *
70  * - One ringbuffer per-engine inside each context.
71  * - One backing object per-engine inside each context.
72  *
73  * The global default context starts its life with these new objects fully
74  * allocated and populated. The local default context for each opened fd is
75  * more complex, because we don't know at creation time which engine is going
76  * to use them. To handle this, we have implemented a deferred creation of LR
77  * contexts:
78  *
79  * The local context starts its life as a hollow or blank holder, that only
80  * gets populated for a given engine once we receive an execbuffer. If later
81  * on we receive another execbuffer ioctl for the same context but a different
82  * engine, we allocate/populate a new ringbuffer and context backing object and
83  * so on.
84  *
85  * Finally, regarding local contexts created using the ioctl call: as they are
86  * only allowed with the render ring, we can allocate & populate them right
87  * away (no need to defer anything, at least for now).
88  *
89  * Execlists implementation:
90  * Execlists are the new method by which, on gen8+ hardware, workloads are
91  * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
92  * This method works as follows:
93  *
94  * When a request is committed, its commands (the BB start and any leading or
95  * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
96  * for the appropriate context. The tail pointer in the hardware context is not
97  * updated at this time but is instead kept by the driver in the ringbuffer
98  * structure. A structure representing this request is added to a request queue
99  * for the appropriate engine: this structure contains a copy of the context's
100  * tail after the request was written to the ring buffer and a pointer to the
101  * context itself.
102  *
103  * If the engine's request queue was empty before the request was added, the
104  * queue is processed immediately. Otherwise the queue will be processed during
105  * a context switch interrupt. In any case, elements on the queue will get sent
106  * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
107  * globally unique 20-bit submission ID.
108  *
109  * When execution of a request completes, the GPU updates the context status
110  * buffer with a context complete event and generates a context switch interrupt.
111  * During the interrupt handling, the driver examines the events in the buffer:
112  * for each context complete event, if the announced ID matches that on the head
113  * of the request queue, then that request is retired and removed from the queue.
114  *
115  * After processing, if any requests were retired and the queue is not empty
116  * then a new execution list can be submitted. The two requests at the front of
117  * the queue are next to be submitted but since a context may not occur twice in
118  * an execution list, if subsequent requests have the same ID as the first then
119  * the two requests must be combined. This is done simply by discarding requests
120  * at the head of the queue until either only one request is left (in which case
121  * we use a NULL second context) or the first two requests have unique IDs.
122  *
123  * By always executing the first two requests in the queue the driver ensures
124  * that the GPU is kept as busy as possible. In the case where a single context
125  * completes but a second context is still executing, the request for this second
126  * context will be at the head of the queue when we remove the first one. This
127  * request will then be resubmitted along with a new request for a different context,
128  * which will cause the hardware to continue executing the second request and queue
129  * the new request (the GPU detects the condition of a context getting preempted
130  * with the same context and optimizes the context switch flow by not doing
131  * preemption, but just sampling the new tail pointer).
132  *
133  */
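
/*
 * Illustrative sketch (editorial addition, not part of the driver): the
 * ELSP pairing rule described above, written out over a plain FIFO of
 * requests. The type and helper below are hypothetical stand-ins for the
 * real request/context structures.
 */
#if 0
struct example_request {
	unsigned int ctx_id;	/* unique ID of the owning context */
	unsigned int tail;	/* ring tail recorded for this request */
};

static void example_pick_elsp_pair(struct example_request *fifo, int count,
				   struct example_request **elsp0,
				   struct example_request **elsp1)
{
	int i = 0;

	*elsp0 = *elsp1 = NULL;
	if (!count)
		return;

	/* The request at the head of the queue always takes ELSP[0]. */
	*elsp0 = &fifo[i++];

	/*
	 * A context may not appear twice in one execution list: later
	 * requests for the same context are folded into ELSP[0] by taking
	 * their (more recent) tail instead.
	 */
	while (i < count && fifo[i].ctx_id == (*elsp0)->ctx_id)
		(*elsp0)->tail = fifo[i++].tail;

	/* Whatever is left (if anything) becomes ELSP[1]. */
	if (i < count)
		*elsp1 = &fifo[i];
}
#endif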
134 #include <linux/interrupt.h>
135
136 #include "i915_drv.h"
137 #include "i915_perf.h"
138 #include "i915_trace.h"
139 #include "i915_vgpu.h"
140 #include "intel_context.h"
141 #include "intel_engine_pm.h"
142 #include "intel_gt.h"
143 #include "intel_gt_pm.h"
144 #include "intel_gt_requests.h"
145 #include "intel_lrc_reg.h"
146 #include "intel_mocs.h"
147 #include "intel_reset.h"
148 #include "intel_ring.h"
149 #include "intel_workarounds.h"
150
151 #define RING_EXECLIST_QFULL             (1 << 0x2)
152 #define RING_EXECLIST1_VALID            (1 << 0x3)
153 #define RING_EXECLIST0_VALID            (1 << 0x4)
154 #define RING_EXECLIST_ACTIVE_STATUS     (3 << 0xE)
155 #define RING_EXECLIST1_ACTIVE           (1 << 0x11)
156 #define RING_EXECLIST0_ACTIVE           (1 << 0x12)
157
158 #define GEN8_CTX_STATUS_IDLE_ACTIVE     (1 << 0)
159 #define GEN8_CTX_STATUS_PREEMPTED       (1 << 1)
160 #define GEN8_CTX_STATUS_ELEMENT_SWITCH  (1 << 2)
161 #define GEN8_CTX_STATUS_ACTIVE_IDLE     (1 << 3)
162 #define GEN8_CTX_STATUS_COMPLETE        (1 << 4)
163 #define GEN8_CTX_STATUS_LITE_RESTORE    (1 << 15)
164
165 #define GEN8_CTX_STATUS_COMPLETED_MASK \
166          (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)
167
168 #define CTX_DESC_FORCE_RESTORE BIT_ULL(2)
169
170 #define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE  (0x1) /* lower csb dword */
171 #define GEN12_CTX_SWITCH_DETAIL(csb_dw) ((csb_dw) & 0xF) /* upper csb dword */
172 #define GEN12_CSB_SW_CTX_ID_MASK                GENMASK(25, 15)
173 #define GEN12_IDLE_CTX_ID               0x7FF
174 #define GEN12_CSB_CTX_VALID(csb_dw) \
175         (FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)
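
/*
 * Illustrative sketch (editorial addition): how the fields above can be
 * pulled out of a gen12 CSB event, which is reported as a pair of dwords.
 * This only shows the field extraction; the real consumers of these macros
 * live further down in this file.
 */
#if 0
static bool example_csb_away_ctx_valid(u32 lower_dw, u32 upper_dw)
{
	/* SW context ID of the context being switched away from. */
	u32 away_id = FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, upper_dw);
	/* Reason code for the context switch, in the upper dword. */
	u32 detail = GEN12_CTX_SWITCH_DETAIL(upper_dw);
	/* Set in the lower dword when the engine switched to a new queue. */
	bool new_queue = lower_dw & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE;

	(void)detail;
	(void)new_queue;

	return away_id != GEN12_IDLE_CTX_ID; /* i.e. GEN12_CSB_CTX_VALID() */
}
#endif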
176
177 /* Typical size of the average request (2 pipecontrols and a MI_BB) */
178 #define EXECLISTS_REQUEST_SIZE 64 /* bytes */
179
180 struct virtual_engine {
181         struct intel_engine_cs base;
182         struct intel_context context;
183
184         /*
185          * We allow only a single request through the virtual engine at a time
186          * (each request in the timeline waits for the completion fence of
187          * the previous before being submitted). By restricting ourselves to
188          * only submitting a single request, each request is placed on to a
189          * physical engine to maximise load spreading (by virtue of the late greedy
190          * scheduling -- each real engine takes the next available request
191          * upon idling).
192          */
193         struct i915_request *request;
194
195         /*
196          * We keep a rbtree of available virtual engines inside each physical
197          * engine, sorted by priority. Here we preallocate the nodes we need
198          * for the virtual engine, indexed by physical_engine->id.
199          */
200         struct ve_node {
201                 struct rb_node rb;
202                 int prio;
203         } nodes[I915_NUM_ENGINES];
204
205         /*
206          * Keep track of bonded pairs -- restrictions upon our selection
207          * of physical engines any particular request may be submitted to.
208          * If we receive a submit-fence from a master engine, we will only
209          * use one of the sibling_mask physical engines.
210          */
211         struct ve_bond {
212                 const struct intel_engine_cs *master;
213                 intel_engine_mask_t sibling_mask;
214         } *bonds;
215         unsigned int num_bonds;
216
217         /* And finally, which physical engines this virtual engine maps onto. */
218         unsigned int num_siblings;
219         struct intel_engine_cs *siblings[0];
220 };
221
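/*
 * Illustrative sketch (editorial addition): siblings[] above is a trailing
 * flexible array, so a virtual engine wrapping N physical engines is
 * allocated as a single block sized for N pointers. The helper name below
 * is hypothetical; the real constructor appears later in this file.
 */
#if 0
static struct virtual_engine *example_alloc_virtual(unsigned int count)
{
	struct virtual_engine *ve;

	ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
	if (!ve)
		return NULL;

	ve->num_siblings = count;	/* siblings[] is filled in by the caller */
	return ve;
}
#endif
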
222 static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine)
223 {
224         GEM_BUG_ON(!intel_engine_is_virtual(engine));
225         return container_of(engine, struct virtual_engine, base);
226 }
227
228 static int __execlists_context_alloc(struct intel_context *ce,
229                                      struct intel_engine_cs *engine);
230
231 static void execlists_init_reg_state(u32 *reg_state,
232                                      const struct intel_context *ce,
233                                      const struct intel_engine_cs *engine,
234                                      const struct intel_ring *ring,
235                                      bool close);
236 static void
237 __execlists_update_reg_state(const struct intel_context *ce,
238                              const struct intel_engine_cs *engine,
239                              u32 head);
240
241 static u32 intel_context_get_runtime(const struct intel_context *ce)
242 {
243         /*
244          * We can use either ppHWSP[16] which is recorded before the context
245          * switch (and so excludes the cost of context switches) or use the
246          * value from the context image itself, which is saved/restored earlier
247          * and so includes the cost of the save.
248          */
249         return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]);
250 }
251
252 static void mark_eio(struct i915_request *rq)
253 {
254         if (i915_request_completed(rq))
255                 return;
256
257         GEM_BUG_ON(i915_request_signaled(rq));
258
259         i915_request_set_error_once(rq, -EIO);
260         i915_request_mark_complete(rq);
261 }
262
263 static struct i915_request *
264 active_request(const struct intel_timeline * const tl, struct i915_request *rq)
265 {
266         struct i915_request *active = rq;
267
268         rcu_read_lock();
269         list_for_each_entry_continue_reverse(rq, &tl->requests, link) {
270                 if (i915_request_completed(rq))
271                         break;
272
273                 active = rq;
274         }
275         rcu_read_unlock();
276
277         return active;
278 }
279
280 static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine)
281 {
282         return (i915_ggtt_offset(engine->status_page.vma) +
283                 I915_GEM_HWS_PREEMPT_ADDR);
284 }
285
286 static inline void
287 ring_set_paused(const struct intel_engine_cs *engine, int state)
288 {
289         /*
290          * We inspect HWS_PREEMPT with a semaphore inside
291          * engine->emit_fini_breadcrumb. If the dword is true,
292          * the ring is paused as the semaphore will busywait
293          * until the dword is false.
294          */
295         engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state;
296         if (state)
297                 wmb();
298 }
299
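/*
 * Illustrative sketch (editorial addition): the busywait mentioned above is
 * realised by emitting a MI_SEMAPHORE_WAIT that polls the HWS_PREEMPT dword
 * until it reads back as zero. Roughly (flag names from
 * intel_gpu_commands.h; the exact emission lives in the fini-breadcrumb
 * helpers later in this file):
 */
#if 0
static u32 *example_emit_preempt_busywait(struct i915_request *rq, u32 *cs)
{
	*cs++ = MI_SEMAPHORE_WAIT |
		MI_SEMAPHORE_GLOBAL_GTT |
		MI_SEMAPHORE_POLL |
		MI_SEMAPHORE_SAD_EQ_SDD;
	*cs++ = 0; /* wait until the HWS_PREEMPT dword equals 0 again */
	*cs++ = intel_hws_preempt_address(rq->engine);
	*cs++ = 0;

	return cs;
}
#endif
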
300 static inline struct i915_priolist *to_priolist(struct rb_node *rb)
301 {
302         return rb_entry(rb, struct i915_priolist, node);
303 }
304
305 static inline int rq_prio(const struct i915_request *rq)
306 {
307         return READ_ONCE(rq->sched.attr.priority);
308 }
309
310 static int effective_prio(const struct i915_request *rq)
311 {
312         int prio = rq_prio(rq);
313
314         /*
315          * If this request is special and must not be interrupted at any
316          * cost, so be it. Note we are only checking the most recent request
317          * in the context and so may be masking an earlier vip request. It
318          * is hoped that under the conditions where nopreempt is used, this
319          * will not matter (i.e. all requests to that context will be
320          * nopreempt for as long as desired).
321          */
322         if (i915_request_has_nopreempt(rq))
323                 prio = I915_PRIORITY_UNPREEMPTABLE;
324
325         /*
326          * On unwinding the active request, we give it a priority bump
327          * if it has completed waiting on any semaphore. If we know that
328          * the request has already started, we can prevent an unwanted
329          * preempt-to-idle cycle by taking that into account now.
330          */
331         if (__i915_request_has_started(rq))
332                 prio |= I915_PRIORITY_NOSEMAPHORE;
333
334         /* Restrict mere WAIT boosts from triggering preemption */
335         BUILD_BUG_ON(__NO_PREEMPTION & ~I915_PRIORITY_MASK); /* only internal */
336         return prio | __NO_PREEMPTION;
337 }
338
339 static int queue_prio(const struct intel_engine_execlists *execlists)
340 {
341         struct i915_priolist *p;
342         struct rb_node *rb;
343
344         rb = rb_first_cached(&execlists->queue);
345         if (!rb)
346                 return INT_MIN;
347
348         /*
349          * As the priolist[] is inverted, with the highest priority in [0],
350          * we have to flip the index value to recover the priority.
351          */
352         p = to_priolist(rb);
353         return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used);
354 }
355
356 static inline bool need_preempt(const struct intel_engine_cs *engine,
357                                 const struct i915_request *rq,
358                                 struct rb_node *rb)
359 {
360         int last_prio;
361
362         if (!intel_engine_has_semaphores(engine))
363                 return false;
364
365         /*
366          * Check if the current priority hint merits a preemption attempt.
367          *
368          * We record the highest value priority we saw during rescheduling
369          * prior to this dequeue, therefore we know that if it is strictly
370          * less than the current tail of ELSP[0], we do not need to force
371          * a preempt-to-idle cycle.
372          *
373          * However, the priority hint is a mere hint that we may need to
374          * preempt. If that hint is stale or we may be trying to preempt
375          * ourselves, ignore the request.
376          *
377          * More naturally we would write
378          *      prio >= max(0, last);
379          * except that we wish to prevent triggering preemption at the same
380          * priority level: the task that is running should remain running
381          * to preserve FIFO ordering of dependencies.
382          */
383         last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1);
384         if (engine->execlists.queue_priority_hint <= last_prio)
385                 return false;
386
387         /*
388          * Check against the first request in ELSP[1], it will, thanks to the
389          * power of PI, be the highest priority of that context.
390          */
391         if (!list_is_last(&rq->sched.link, &engine->active.requests) &&
392             rq_prio(list_next_entry(rq, sched.link)) > last_prio)
393                 return true;
394
395         if (rb) {
396                 struct virtual_engine *ve =
397                         rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
398                 bool preempt = false;
399
400                 if (engine == ve->siblings[0]) { /* only preempt one sibling */
401                         struct i915_request *next;
402
403                         rcu_read_lock();
404                         next = READ_ONCE(ve->request);
405                         if (next)
406                                 preempt = rq_prio(next) > last_prio;
407                         rcu_read_unlock();
408                 }
409
410                 if (preempt)
411                         return preempt;
412         }
413
414         /*
415          * If the inflight context did not trigger the preemption, then maybe
416          * it was the set of queued requests? Pick the highest priority in
417          * the queue (the first active priolist) and see if it deserves to be
418          * running instead of ELSP[0].
419          *
420          * The highest priority request in the queue cannot be either
421          * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
422          * context, its priority would not exceed ELSP[0] aka last_prio.
423          */
424         return queue_prio(&engine->execlists) > last_prio;
425 }
426
427 __maybe_unused static inline bool
428 assert_priority_queue(const struct i915_request *prev,
429                       const struct i915_request *next)
430 {
431         /*
432          * Without preemption, the prev may refer to the still active element
433          * which we refuse to let go.
434          *
435          * Even with preemption, there are times when we think it is better not
436          * to preempt and leave an ostensibly lower priority request in flight.
437          */
438         if (i915_request_is_active(prev))
439                 return true;
440
441         return rq_prio(prev) >= rq_prio(next);
442 }
443
444 /*
445  * The context descriptor encodes various attributes of a context,
446  * including its GTT address and some flags. Because it's fairly
447  * expensive to calculate, we'll just do it once and cache the result,
448  * which remains valid until the context is unpinned.
449  *
450  * This is what a descriptor looks like, from LSB to MSB::
451  *
452  *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
453  *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
454  *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
455  *      bits 53-54:    mbz, reserved for use by hardware
456  *      bits 55-63:    group ID, currently unused and set to 0
457  *
458  * Starting from Gen11, the upper dword of the descriptor has a new format:
459  *
460  *      bits 32-36:    reserved
461  *      bits 37-47:    SW context ID
462  *      bits 48-53:    engine instance
463  *      bit 54:        mbz, reserved for use by hardware
464  *      bits 55-60:    SW counter
465  *      bits 61-63:    engine class
466  *
467  * engine info, SW context ID and SW counter need to form a unique number
468  * (Context ID) per lrc.
469  */
470 static u64
471 lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine)
472 {
473         u64 desc;
474
475         desc = INTEL_LEGACY_32B_CONTEXT;
476         if (i915_vm_is_4lvl(ce->vm))
477                 desc = INTEL_LEGACY_64B_CONTEXT;
478         desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
479
480         desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
481         if (IS_GEN(engine->i915, 8))
482                 desc |= GEN8_CTX_L3LLC_COHERENT;
483
484         desc |= i915_ggtt_offset(ce->state); /* bits 12-31 */
485         /*
486          * The following 32 bits are copied into the OA reports (dword 2).
487          * Consider updating oa_get_render_ctx_id in i915_perf.c when changing
488          * anything below.
489          */
490         if (INTEL_GEN(engine->i915) >= 11) {
491                 desc |= (u64)engine->instance << GEN11_ENGINE_INSTANCE_SHIFT;
492                                                                 /* bits 48-53 */
493
494                 desc |= (u64)engine->class << GEN11_ENGINE_CLASS_SHIFT;
495                                                                 /* bits 61-63 */
496         }
497
498         return desc;
499 }
500
501 static inline unsigned int dword_in_page(void *addr)
502 {
503         return offset_in_page(addr) / sizeof(u32);
504 }
505
506 static void set_offsets(u32 *regs,
507                         const u8 *data,
508                         const struct intel_engine_cs *engine,
509                         bool clear)
510 #define NOP(x) (BIT(7) | (x))
511 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
512 #define POSTED BIT(0)
513 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
514 #define REG16(x) \
515         (((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
516         (((x) >> 2) & 0x7f)
517 #define END(x) 0, (x)
518 {
519         const u32 base = engine->mmio_base;
520
521         while (*data) {
522                 u8 count, flags;
523
524                 if (*data & BIT(7)) { /* skip */
525                         count = *data++ & ~BIT(7);
526                         if (clear)
527                                 memset32(regs, MI_NOOP, count);
528                         regs += count;
529                         continue;
530                 }
531
532                 count = *data & 0x3f;
533                 flags = *data >> 6;
534                 data++;
535
536                 *regs = MI_LOAD_REGISTER_IMM(count);
537                 if (flags & POSTED)
538                         *regs |= MI_LRI_FORCE_POSTED;
539                 if (INTEL_GEN(engine->i915) >= 11)
540                         *regs |= MI_LRI_CS_MMIO;
541                 regs++;
542
543                 GEM_BUG_ON(!count);
544                 do {
545                         u32 offset = 0;
546                         u8 v;
547
548                         do {
549                                 v = *data++;
550                                 offset <<= 7;
551                                 offset |= v & ~BIT(7);
552                         } while (v & BIT(7));
553
554                         regs[0] = base + (offset << 2);
555                         if (clear)
556                                 regs[1] = 0;
557                         regs += 2;
558                 } while (--count);
559         }
560
561         if (clear) {
562                 u8 count = *++data;
563
564                 /* Clear past the tail for HW access */
565                 GEM_BUG_ON(dword_in_page(regs) > count);
566                 memset32(regs, MI_NOOP, count - dword_in_page(regs));
567
568                 /* Close the batch; used mainly by live_lrc_layout() */
569                 *regs = MI_BATCH_BUFFER_END;
570                 if (INTEL_GEN(engine->i915) >= 10)
571                         *regs |= BIT(0);
572         }
573 }
574
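/*
 * Worked example (editorial addition) of the byte encoding consumed by
 * set_offsets() above, using the start of gen8_xcs_offsets[] below:
 *
 *	NOP(1)       -> 0x81		skip one dword (MI_NOOP when clearing)
 *	LRI(11, 0)   -> 0x0b		MI_LOAD_REGISTER_IMM(11), not posted
 *	REG16(0x244) -> 0x81, 0x11	register offset 0x244 (two bytes)
 *	REG(0x034)   -> 0x0d		register offset 0x034 (one byte)
 *	...
 *	END(x)       -> 0x00, x		terminator, followed by the dword index
 *					used to bound the MI_NOOP padding and
 *					close of the state when clearing
 */
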
575 static const u8 gen8_xcs_offsets[] = {
576         NOP(1),
577         LRI(11, 0),
578         REG16(0x244),
579         REG(0x034),
580         REG(0x030),
581         REG(0x038),
582         REG(0x03c),
583         REG(0x168),
584         REG(0x140),
585         REG(0x110),
586         REG(0x11c),
587         REG(0x114),
588         REG(0x118),
589
590         NOP(9),
591         LRI(9, 0),
592         REG16(0x3a8),
593         REG16(0x28c),
594         REG16(0x288),
595         REG16(0x284),
596         REG16(0x280),
597         REG16(0x27c),
598         REG16(0x278),
599         REG16(0x274),
600         REG16(0x270),
601
602         NOP(13),
603         LRI(2, 0),
604         REG16(0x200),
605         REG(0x028),
606
607         END(80)
608 };
609
610 static const u8 gen9_xcs_offsets[] = {
611         NOP(1),
612         LRI(14, POSTED),
613         REG16(0x244),
614         REG(0x034),
615         REG(0x030),
616         REG(0x038),
617         REG(0x03c),
618         REG(0x168),
619         REG(0x140),
620         REG(0x110),
621         REG(0x11c),
622         REG(0x114),
623         REG(0x118),
624         REG(0x1c0),
625         REG(0x1c4),
626         REG(0x1c8),
627
628         NOP(3),
629         LRI(9, POSTED),
630         REG16(0x3a8),
631         REG16(0x28c),
632         REG16(0x288),
633         REG16(0x284),
634         REG16(0x280),
635         REG16(0x27c),
636         REG16(0x278),
637         REG16(0x274),
638         REG16(0x270),
639
640         NOP(13),
641         LRI(1, POSTED),
642         REG16(0x200),
643
644         NOP(13),
645         LRI(44, POSTED),
646         REG(0x028),
647         REG(0x09c),
648         REG(0x0c0),
649         REG(0x178),
650         REG(0x17c),
651         REG16(0x358),
652         REG(0x170),
653         REG(0x150),
654         REG(0x154),
655         REG(0x158),
656         REG16(0x41c),
657         REG16(0x600),
658         REG16(0x604),
659         REG16(0x608),
660         REG16(0x60c),
661         REG16(0x610),
662         REG16(0x614),
663         REG16(0x618),
664         REG16(0x61c),
665         REG16(0x620),
666         REG16(0x624),
667         REG16(0x628),
668         REG16(0x62c),
669         REG16(0x630),
670         REG16(0x634),
671         REG16(0x638),
672         REG16(0x63c),
673         REG16(0x640),
674         REG16(0x644),
675         REG16(0x648),
676         REG16(0x64c),
677         REG16(0x650),
678         REG16(0x654),
679         REG16(0x658),
680         REG16(0x65c),
681         REG16(0x660),
682         REG16(0x664),
683         REG16(0x668),
684         REG16(0x66c),
685         REG16(0x670),
686         REG16(0x674),
687         REG16(0x678),
688         REG16(0x67c),
689         REG(0x068),
690
691         END(176)
692 };
693
694 static const u8 gen12_xcs_offsets[] = {
695         NOP(1),
696         LRI(13, POSTED),
697         REG16(0x244),
698         REG(0x034),
699         REG(0x030),
700         REG(0x038),
701         REG(0x03c),
702         REG(0x168),
703         REG(0x140),
704         REG(0x110),
705         REG(0x1c0),
706         REG(0x1c4),
707         REG(0x1c8),
708         REG(0x180),
709         REG16(0x2b4),
710
711         NOP(5),
712         LRI(9, POSTED),
713         REG16(0x3a8),
714         REG16(0x28c),
715         REG16(0x288),
716         REG16(0x284),
717         REG16(0x280),
718         REG16(0x27c),
719         REG16(0x278),
720         REG16(0x274),
721         REG16(0x270),
722
723         END(80)
724 };
725
726 static const u8 gen8_rcs_offsets[] = {
727         NOP(1),
728         LRI(14, POSTED),
729         REG16(0x244),
730         REG(0x034),
731         REG(0x030),
732         REG(0x038),
733         REG(0x03c),
734         REG(0x168),
735         REG(0x140),
736         REG(0x110),
737         REG(0x11c),
738         REG(0x114),
739         REG(0x118),
740         REG(0x1c0),
741         REG(0x1c4),
742         REG(0x1c8),
743
744         NOP(3),
745         LRI(9, POSTED),
746         REG16(0x3a8),
747         REG16(0x28c),
748         REG16(0x288),
749         REG16(0x284),
750         REG16(0x280),
751         REG16(0x27c),
752         REG16(0x278),
753         REG16(0x274),
754         REG16(0x270),
755
756         NOP(13),
757         LRI(1, 0),
758         REG(0x0c8),
759
760         END(80)
761 };
762
763 static const u8 gen9_rcs_offsets[] = {
764         NOP(1),
765         LRI(14, POSTED),
766         REG16(0x244),
767         REG(0x34),
768         REG(0x30),
769         REG(0x38),
770         REG(0x3c),
771         REG(0x168),
772         REG(0x140),
773         REG(0x110),
774         REG(0x11c),
775         REG(0x114),
776         REG(0x118),
777         REG(0x1c0),
778         REG(0x1c4),
779         REG(0x1c8),
780
781         NOP(3),
782         LRI(9, POSTED),
783         REG16(0x3a8),
784         REG16(0x28c),
785         REG16(0x288),
786         REG16(0x284),
787         REG16(0x280),
788         REG16(0x27c),
789         REG16(0x278),
790         REG16(0x274),
791         REG16(0x270),
792
793         NOP(13),
794         LRI(1, 0),
795         REG(0xc8),
796
797         NOP(13),
798         LRI(44, POSTED),
799         REG(0x28),
800         REG(0x9c),
801         REG(0xc0),
802         REG(0x178),
803         REG(0x17c),
804         REG16(0x358),
805         REG(0x170),
806         REG(0x150),
807         REG(0x154),
808         REG(0x158),
809         REG16(0x41c),
810         REG16(0x600),
811         REG16(0x604),
812         REG16(0x608),
813         REG16(0x60c),
814         REG16(0x610),
815         REG16(0x614),
816         REG16(0x618),
817         REG16(0x61c),
818         REG16(0x620),
819         REG16(0x624),
820         REG16(0x628),
821         REG16(0x62c),
822         REG16(0x630),
823         REG16(0x634),
824         REG16(0x638),
825         REG16(0x63c),
826         REG16(0x640),
827         REG16(0x644),
828         REG16(0x648),
829         REG16(0x64c),
830         REG16(0x650),
831         REG16(0x654),
832         REG16(0x658),
833         REG16(0x65c),
834         REG16(0x660),
835         REG16(0x664),
836         REG16(0x668),
837         REG16(0x66c),
838         REG16(0x670),
839         REG16(0x674),
840         REG16(0x678),
841         REG16(0x67c),
842         REG(0x68),
843
844         END(176)
845 };
846
847 static const u8 gen11_rcs_offsets[] = {
848         NOP(1),
849         LRI(15, POSTED),
850         REG16(0x244),
851         REG(0x034),
852         REG(0x030),
853         REG(0x038),
854         REG(0x03c),
855         REG(0x168),
856         REG(0x140),
857         REG(0x110),
858         REG(0x11c),
859         REG(0x114),
860         REG(0x118),
861         REG(0x1c0),
862         REG(0x1c4),
863         REG(0x1c8),
864         REG(0x180),
865
866         NOP(1),
867         LRI(9, POSTED),
868         REG16(0x3a8),
869         REG16(0x28c),
870         REG16(0x288),
871         REG16(0x284),
872         REG16(0x280),
873         REG16(0x27c),
874         REG16(0x278),
875         REG16(0x274),
876         REG16(0x270),
877
878         LRI(1, POSTED),
879         REG(0x1b0),
880
881         NOP(10),
882         LRI(1, 0),
883         REG(0x0c8),
884
885         END(80)
886 };
887
888 static const u8 gen12_rcs_offsets[] = {
889         NOP(1),
890         LRI(13, POSTED),
891         REG16(0x244),
892         REG(0x034),
893         REG(0x030),
894         REG(0x038),
895         REG(0x03c),
896         REG(0x168),
897         REG(0x140),
898         REG(0x110),
899         REG(0x1c0),
900         REG(0x1c4),
901         REG(0x1c8),
902         REG(0x180),
903         REG16(0x2b4),
904
905         NOP(5),
906         LRI(9, POSTED),
907         REG16(0x3a8),
908         REG16(0x28c),
909         REG16(0x288),
910         REG16(0x284),
911         REG16(0x280),
912         REG16(0x27c),
913         REG16(0x278),
914         REG16(0x274),
915         REG16(0x270),
916
917         LRI(3, POSTED),
918         REG(0x1b0),
919         REG16(0x5a8),
920         REG16(0x5ac),
921
922         NOP(6),
923         LRI(1, 0),
924         REG(0x0c8),
925
926         END(80)
927 };
928
929 #undef END
930 #undef REG16
931 #undef REG
932 #undef LRI
933 #undef NOP
934
935 static const u8 *reg_offsets(const struct intel_engine_cs *engine)
936 {
937         /*
938          * The gen12+ lists only have the registers we program in the basic
939          * default state. We rely on the context image using relative
940          * addressing to automatically fix up the register state between the
941          * physical engines of a virtual engine.
942          */
943         GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 &&
944                    !intel_engine_has_relative_mmio(engine));
945
946         if (engine->class == RENDER_CLASS) {
947                 if (INTEL_GEN(engine->i915) >= 12)
948                         return gen12_rcs_offsets;
949                 else if (INTEL_GEN(engine->i915) >= 11)
950                         return gen11_rcs_offsets;
951                 else if (INTEL_GEN(engine->i915) >= 9)
952                         return gen9_rcs_offsets;
953                 else
954                         return gen8_rcs_offsets;
955         } else {
956                 if (INTEL_GEN(engine->i915) >= 12)
957                         return gen12_xcs_offsets;
958                 else if (INTEL_GEN(engine->i915) >= 9)
959                         return gen9_xcs_offsets;
960                 else
961                         return gen8_xcs_offsets;
962         }
963 }
964
965 static struct i915_request *
966 __unwind_incomplete_requests(struct intel_engine_cs *engine)
967 {
968         struct i915_request *rq, *rn, *active = NULL;
969         struct list_head *uninitialized_var(pl);
970         int prio = I915_PRIORITY_INVALID;
971
972         lockdep_assert_held(&engine->active.lock);
973
974         list_for_each_entry_safe_reverse(rq, rn,
975                                          &engine->active.requests,
976                                          sched.link) {
977                 if (i915_request_completed(rq))
978                         continue; /* XXX */
979
980                 __i915_request_unsubmit(rq);
981
982                 /*
983                  * Push the request back into the queue for later resubmission.
984                  * If this request is not native to this physical engine (i.e.
985                  * it came from a virtual source), push it back onto the virtual
986                  * engine so that it can be moved across onto another physical
987                  * engine as load dictates.
988                  */
989                 if (likely(rq->execution_mask == engine->mask)) {
990                         GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
991                         if (rq_prio(rq) != prio) {
992                                 prio = rq_prio(rq);
993                                 pl = i915_sched_lookup_priolist(engine, prio);
994                         }
995                         GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
996
997                         list_move(&rq->sched.link, pl);
998                         set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
999
1000                         active = rq;
1001                 } else {
1002                         struct intel_engine_cs *owner = rq->context->engine;
1003
1004                         /*
1005                          * Decouple the virtual breadcrumb before moving it
1006                          * back to the virtual engine -- we don't want the
1007                          * request to complete in the background and try
1008                          * and cancel the breadcrumb on the virtual engine
1009                          * (instead of the old engine where it is linked)!
1010                          */
1011                         if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
1012                                      &rq->fence.flags)) {
1013                                 spin_lock_nested(&rq->lock,
1014                                                  SINGLE_DEPTH_NESTING);
1015                                 i915_request_cancel_breadcrumb(rq);
1016                                 spin_unlock(&rq->lock);
1017                         }
1018                         WRITE_ONCE(rq->engine, owner);
1019                         owner->submit_request(rq);
1020                         active = NULL;
1021                 }
1022         }
1023
1024         return active;
1025 }
1026
1027 struct i915_request *
1028 execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
1029 {
1030         struct intel_engine_cs *engine =
1031                 container_of(execlists, typeof(*engine), execlists);
1032
1033         return __unwind_incomplete_requests(engine);
1034 }
1035
1036 static inline void
1037 execlists_context_status_change(struct i915_request *rq, unsigned long status)
1038 {
1039         /*
1040          * Only used when GVT-g is enabled now. When GVT-g is disabled,
1041          * the compiler should eliminate this function as dead code.
1042          */
1043         if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
1044                 return;
1045
1046         atomic_notifier_call_chain(&rq->engine->context_status_notifier,
1047                                    status, rq);
1048 }
1049
1050 static void intel_engine_context_in(struct intel_engine_cs *engine)
1051 {
1052         unsigned long flags;
1053
1054         if (READ_ONCE(engine->stats.enabled) == 0)
1055                 return;
1056
1057         write_seqlock_irqsave(&engine->stats.lock, flags);
1058
1059         if (engine->stats.enabled > 0) {
1060                 if (engine->stats.active++ == 0)
1061                         engine->stats.start = ktime_get();
1062                 GEM_BUG_ON(engine->stats.active == 0);
1063         }
1064
1065         write_sequnlock_irqrestore(&engine->stats.lock, flags);
1066 }
1067
1068 static void intel_engine_context_out(struct intel_engine_cs *engine)
1069 {
1070         unsigned long flags;
1071
1072         if (READ_ONCE(engine->stats.enabled) == 0)
1073                 return;
1074
1075         write_seqlock_irqsave(&engine->stats.lock, flags);
1076
1077         if (engine->stats.enabled > 0) {
1078                 ktime_t last;
1079
1080                 if (engine->stats.active && --engine->stats.active == 0) {
1081                         /*
1082                          * Decrement the active context count and, in case the GPU
1083                          * is now idle, add the elapsed time to the running total.
1084                          */
1085                         last = ktime_sub(ktime_get(), engine->stats.start);
1086
1087                         engine->stats.total = ktime_add(engine->stats.total,
1088                                                         last);
1089                 } else if (engine->stats.active == 0) {
1090                         /*
1091                          * After turning on engine stats, context out might be
1092                          * the first event in which case we account from the
1093                          * time stats gathering was turned on.
1094                          */
1095                         last = ktime_sub(ktime_get(), engine->stats.enabled_at);
1096
1097                         engine->stats.total = ktime_add(engine->stats.total,
1098                                                         last);
1099                 }
1100         }
1101
1102         write_sequnlock_irqrestore(&engine->stats.lock, flags);
1103 }
1104
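/*
 * Illustrative sketch (editorial addition): how the stats maintained by the
 * context_in/out hooks above translate into a busy-time sample. A real
 * consumer would also read under engine->stats.lock (the seqlock used above).
 */
#if 0
static ktime_t example_engine_busy_time(const struct intel_engine_cs *engine)
{
	ktime_t total = engine->stats.total;

	/* If a context is still active, fold in the in-progress slice. */
	if (engine->stats.active)
		total = ktime_add(total,
				  ktime_sub(ktime_get(), engine->stats.start));

	return total;
}
#endif
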
1105 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
1106 {
1107         if (INTEL_GEN(engine->i915) >= 12)
1108                 return 0x60;
1109         else if (INTEL_GEN(engine->i915) >= 9)
1110                 return 0x54;
1111         else if (engine->class == RENDER_CLASS)
1112                 return 0x58;
1113         else
1114                 return -1;
1115 }
1116
1117 static void
1118 execlists_check_context(const struct intel_context *ce,
1119                         const struct intel_engine_cs *engine)
1120 {
1121         const struct intel_ring *ring = ce->ring;
1122         u32 *regs = ce->lrc_reg_state;
1123         bool valid = true;
1124         int x;
1125
1126         if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1127                 pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1128                        engine->name,
1129                        regs[CTX_RING_START],
1130                        i915_ggtt_offset(ring->vma));
1131                 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1132                 valid = false;
1133         }
1134
1135         if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1136             (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1137                 pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1138                        engine->name,
1139                        regs[CTX_RING_CTL],
1140                        (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1141                 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1142                 valid = false;
1143         }
1144
1145         x = lrc_ring_mi_mode(engine);
1146         if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1147                 pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1148                        engine->name, regs[x + 1]);
1149                 regs[x + 1] &= ~STOP_RING;
1150                 regs[x + 1] |= STOP_RING << 16;
1151                 valid = false;
1152         }
1153
1154         WARN_ONCE(!valid, "Invalid lrc state found before submission\n");
1155 }
1156
1157 static void restore_default_state(struct intel_context *ce,
1158                                   struct intel_engine_cs *engine)
1159 {
1160         u32 *regs = ce->lrc_reg_state;
1161
1162         if (engine->pinned_default_state)
1163                 memcpy(regs, /* skip restoring the vanilla PPHWSP */
1164                        engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
1165                        engine->context_size - PAGE_SIZE);
1166
1167         execlists_init_reg_state(regs, ce, engine, ce->ring, false);
1168         ce->runtime.last = intel_context_get_runtime(ce);
1169 }
1170
1171 static void reset_active(struct i915_request *rq,
1172                          struct intel_engine_cs *engine)
1173 {
1174         struct intel_context * const ce = rq->context;
1175         u32 head;
1176
1177         /*
1178          * The executing context has been cancelled. We want to prevent
1179          * further execution along this context and propagate the error on
1180          * to anything depending on its results.
1181          *
1182          * In __i915_request_submit(), we apply the -EIO and remove the
1183          * requests' payloads for any banned requests. But first, we must
1184          * rewind the context back to the start of the incomplete request so
1185          * that we do not jump back into the middle of the batch.
1186          *
1187          * We preserve the breadcrumbs and semaphores of the incomplete
1188          * requests so that inter-timeline dependencies (i.e other timelines)
1189          * remain correctly ordered. And we defer to __i915_request_submit()
1190          * so that all asynchronous waits are correctly handled.
1191          */
1192         ENGINE_TRACE(engine, "{ rq=%llx:%lld }\n",
1193                      rq->fence.context, rq->fence.seqno);
1194
1195         /* On resubmission of the active request, payload will be scrubbed */
1196         if (i915_request_completed(rq))
1197                 head = rq->tail;
1198         else
1199                 head = active_request(ce->timeline, rq)->head;
1200         head = intel_ring_wrap(ce->ring, head);
1201
1202         /* Scrub the context image to prevent replaying the previous batch */
1203         restore_default_state(ce, engine);
1204         __execlists_update_reg_state(ce, engine, head);
1205
1206         /* We've switched away, so this should be a no-op, but intent matters */
1207         ce->lrc_desc |= CTX_DESC_FORCE_RESTORE;
1208 }
1209
1210 static void st_update_runtime_underflow(struct intel_context *ce, s32 dt)
1211 {
1212 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1213         ce->runtime.num_underflow += dt < 0;
1214         ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt);
1215 #endif
1216 }
1217
1218 static void intel_context_update_runtime(struct intel_context *ce)
1219 {
1220         u32 old;
1221         s32 dt;
1222
1223         if (intel_context_is_barrier(ce))
1224                 return;
1225
1226         old = ce->runtime.last;
1227         ce->runtime.last = intel_context_get_runtime(ce);
1228         dt = ce->runtime.last - old;
1229
1230         if (unlikely(dt <= 0)) {
1231                 CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
1232                          old, ce->runtime.last, dt);
1233                 st_update_runtime_underflow(ce, dt);
1234                 return;
1235         }
1236
1237         ewma_runtime_add(&ce->runtime.avg, dt);
1238         ce->runtime.total += dt;
1239 }
1240
1241 static inline struct intel_engine_cs *
1242 __execlists_schedule_in(struct i915_request *rq)
1243 {
1244         struct intel_engine_cs * const engine = rq->engine;
1245         struct intel_context * const ce = rq->context;
1246
1247         intel_context_get(ce);
1248
1249         if (unlikely(intel_context_is_banned(ce)))
1250                 reset_active(rq, engine);
1251
1252         if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1253                 execlists_check_context(ce, engine);
1254
1255         ce->lrc_desc &= ~GENMASK_ULL(47, 37);
1256         if (ce->tag) {
1257                 /* Use a fixed tag for OA and friends */
1258                 ce->lrc_desc |= (u64)ce->tag << 32;
1259         } else {
1260                 /* We don't need a strict matching tag, just different values */
1261                 ce->lrc_desc |=
1262                         (u64)(++engine->context_tag % NUM_CONTEXT_TAG) <<
1263                         GEN11_SW_CTX_ID_SHIFT;
1264                 BUILD_BUG_ON(NUM_CONTEXT_TAG > GEN12_MAX_CONTEXT_HW_ID);
1265         }
1266
1267         __intel_gt_pm_get(engine->gt);
1268         execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
1269         intel_engine_context_in(engine);
1270
1271         return engine;
1272 }
1273
1274 static inline struct i915_request *
1275 execlists_schedule_in(struct i915_request *rq, int idx)
1276 {
1277         struct intel_context * const ce = rq->context;
1278         struct intel_engine_cs *old;
1279
1280         GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine));
1281         trace_i915_request_in(rq, idx);
1282
1283         old = READ_ONCE(ce->inflight);
1284         do {
1285                 if (!old) {
1286                         WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq));
1287                         break;
1288                 }
1289         } while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old)));
1290
1291         GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
1292         return i915_request_get(rq);
1293 }
1294
1295 static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
1296 {
1297         struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
1298         struct i915_request *next = READ_ONCE(ve->request);
1299
1300         if (next && next->execution_mask & ~rq->execution_mask)
1301                 tasklet_schedule(&ve->base.execlists.tasklet);
1302 }
1303
1304 static inline void
1305 __execlists_schedule_out(struct i915_request *rq,
1306                          struct intel_engine_cs * const engine)
1307 {
1308         struct intel_context * const ce = rq->context;
1309
1310         /*
1311          * NB process_csb() is not under the engine->active.lock and hence
1312          * schedule_out can race with schedule_in meaning that we should
1313          * refrain from doing non-trivial work here.
1314          */
1315
1316         /*
1317          * If we have just completed this context, the engine may now be
1318          * idle and we want to re-enter powersaving.
1319          */
1320         if (list_is_last_rcu(&rq->link, &ce->timeline->requests) &&
1321             i915_request_completed(rq))
1322                 intel_engine_add_retire(engine, ce->timeline);
1323
1324         intel_context_update_runtime(ce);
1325         intel_engine_context_out(engine);
1326         execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
1327         intel_gt_pm_put_async(engine->gt);
1328
1329         /*
1330          * If this is part of a virtual engine, its next request may
1331          * have been blocked waiting for access to the active context.
1332          * We have to kick all the siblings again in case we need to
1333          * switch (e.g. the next request is not runnable on this
1334          * engine). Hopefully, we will already have submitted the next
1335          * request before the tasklet runs and do not need to rebuild
1336          * each virtual tree and kick everyone again.
1337          */
1338         if (ce->engine != engine)
1339                 kick_siblings(rq, ce);
1340
1341         intel_context_put(ce);
1342 }
1343
1344 static inline void
1345 execlists_schedule_out(struct i915_request *rq)
1346 {
1347         struct intel_context * const ce = rq->context;
1348         struct intel_engine_cs *cur, *old;
1349
1350         trace_i915_request_out(rq);
1351
1352         old = READ_ONCE(ce->inflight);
1353         do
1354                 cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL;
1355         while (!try_cmpxchg(&ce->inflight, &old, cur));
1356         if (!cur)
1357                 __execlists_schedule_out(rq, old);
1358
1359         i915_request_put(rq);
1360 }
1361
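/*
 * Illustrative sketch (editorial addition): ce->inflight, as manipulated by
 * execlists_schedule_in/out above, packs a small submission count into the
 * low (alignment) bits of the engine pointer; ptr_inc()/ptr_dec() adjust the
 * count and ptr_unmask_bits(ptr, 2) reads it back. A plain-C picture of the
 * idea, with hypothetical names:
 */
#if 0
#define EXAMPLE_COUNT_MASK 0x3ul	/* engine pointers are >= 4-byte aligned */

static unsigned long example_count(const void *packed)
{
	return (unsigned long)packed & EXAMPLE_COUNT_MASK;
}

static void *example_inc(void *packed)
{
	return (void *)((unsigned long)packed + 1);
}

static void *example_dec(void *packed)
{
	return (void *)((unsigned long)packed - 1);
}
#endif
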
1362 static u64 execlists_update_context(struct i915_request *rq)
1363 {
1364         struct intel_context *ce = rq->context;
1365         u64 desc = ce->lrc_desc;
1366         u32 tail, prev;
1367
1368         /*
1369          * WaIdleLiteRestore:bdw,skl
1370          *
1371          * We should never submit the context with the same RING_TAIL twice
1372          * just in case we submit an empty ring, which confuses the HW.
1373          *
1374          * We append a couple of NOOPs (gen8_emit_wa_tail) after the end of
1375          * the normal request to be able to always advance the RING_TAIL on
1376          * subsequent resubmissions (for lite restore). Should that fail us,
1377          * and we try and submit the same tail again, force the context
1378          * reload.
1379          *
1380          * If we need to return to a preempted context, we need to skip the
1381          * lite-restore and force it to reload the RING_TAIL. Otherwise, the
1382          * HW has a tendency to ignore us rewinding the TAIL to the end of
1383          * an earlier request.
1384          */
1385         tail = intel_ring_set_tail(rq->ring, rq->tail);
1386         prev = ce->lrc_reg_state[CTX_RING_TAIL];
1387         if (unlikely(intel_ring_direction(rq->ring, tail, prev) <= 0))
1388                 desc |= CTX_DESC_FORCE_RESTORE;
1389         ce->lrc_reg_state[CTX_RING_TAIL] = tail;
1390         rq->tail = rq->wa_tail;
1391
1392         /*
1393          * Make sure the context image is complete before we submit it to HW.
1394          *
1395          * Ostensibly, writes (including the WCB) should be flushed prior to
1396          * an uncached write such as our mmio register access; however, the empirical
1397          * evidence (esp. on Braswell) suggests that the WC write into memory
1398          * may not be visible to the HW prior to the completion of the UC
1399          * register write and that we may begin execution from the context
1400          * before its image is complete leading to invalid PD chasing.
1401          */
1402         wmb();
1403
1404         ce->lrc_desc &= ~CTX_DESC_FORCE_RESTORE;
1405         return desc;
1406 }
1407
1408 static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
1409 {
1410         if (execlists->ctrl_reg) {
1411                 writel(lower_32_bits(desc), execlists->submit_reg + port * 2);
1412                 writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1);
1413         } else {
1414                 writel(upper_32_bits(desc), execlists->submit_reg);
1415                 writel(lower_32_bits(desc), execlists->submit_reg);
1416         }
1417 }
1418
1419 static __maybe_unused char *
1420 dump_port(char *buf, int buflen, const char *prefix, struct i915_request *rq)
1421 {
1422         if (!rq)
1423                 return "";
1424
1425         snprintf(buf, buflen, "%s%llx:%lld%s prio %d",
1426                  prefix,
1427                  rq->fence.context, rq->fence.seqno,
1428                  i915_request_completed(rq) ? "!" :
1429                  i915_request_started(rq) ? "*" :
1430                  "",
1431                  rq_prio(rq));
1432
1433         return buf;
1434 }
1435
1436 static __maybe_unused void
1437 trace_ports(const struct intel_engine_execlists *execlists,
1438             const char *msg,
1439             struct i915_request * const *ports)
1440 {
1441         const struct intel_engine_cs *engine =
1442                 container_of(execlists, typeof(*engine), execlists);
1443         char __maybe_unused p0[40], p1[40];
1444
1445         if (!ports[0])
1446                 return;
1447
1448         ENGINE_TRACE(engine, "%s { %s%s }\n", msg,
1449                      dump_port(p0, sizeof(p0), "", ports[0]),
1450                      dump_port(p1, sizeof(p1), ", ", ports[1]));
1451 }
1452
1453 static inline bool
1454 reset_in_progress(const struct intel_engine_execlists *execlists)
1455 {
1456         return unlikely(!__tasklet_is_enabled(&execlists->tasklet));
1457 }
1458
1459 static __maybe_unused bool
1460 assert_pending_valid(const struct intel_engine_execlists *execlists,
1461                      const char *msg)
1462 {
1463         struct i915_request * const *port, *rq;
1464         struct intel_context *ce = NULL;
1465         bool sentinel = false;
1466
1467         trace_ports(execlists, msg, execlists->pending);
1468
1469         /* We may be messing around with the lists during reset, lalala */
1470         if (reset_in_progress(execlists))
1471                 return true;
1472
1473         if (!execlists->pending[0]) {
1474                 GEM_TRACE_ERR("Nothing pending for promotion!\n");
1475                 return false;
1476         }
1477
1478         if (execlists->pending[execlists_num_ports(execlists)]) {
1479                 GEM_TRACE_ERR("Excess pending[%d] for promotion!\n",
1480                               execlists_num_ports(execlists));
1481                 return false;
1482         }
1483
1484         for (port = execlists->pending; (rq = *port); port++) {
1485                 unsigned long flags;
1486                 bool ok = true;
1487
1488                 GEM_BUG_ON(!kref_read(&rq->fence.refcount));
1489                 GEM_BUG_ON(!i915_request_is_active(rq));
1490
1491                 if (ce == rq->context) {
1492                         GEM_TRACE_ERR("Dup context:%llx in pending[%zd]\n",
1493                                       ce->timeline->fence_context,
1494                                       port - execlists->pending);
1495                         return false;
1496                 }
1497                 ce = rq->context;
1498
1499                 /*
1500                  * Sentinels are supposed to be lonely so they flush the
1501                  * current execution off the HW. Check that they are the
1502                  * only request in the pending submission.
1503                  */
1504                 if (sentinel) {
1505                         GEM_TRACE_ERR("context:%llx after sentinel in pending[%zd]\n",
1506                                       ce->timeline->fence_context,
1507                                       port - execlists->pending);
1508                         return false;
1509                 }
1510
1511                 sentinel = i915_request_has_sentinel(rq);
1512                 if (sentinel && port != execlists->pending) {
1513                         GEM_TRACE_ERR("sentinel context:%llx not in prime position[%zd]\n",
1514                                       ce->timeline->fence_context,
1515                                       port - execlists->pending);
1516                         return false;
1517                 }
1518
1519                 /* Hold tightly onto the lock to prevent concurrent retires! */
1520                 if (!spin_trylock_irqsave(&rq->lock, flags))
1521                         continue;
1522
1523                 if (i915_request_completed(rq))
1524                         goto unlock;
1525
1526                 if (i915_active_is_idle(&ce->active) &&
1527                     !intel_context_is_barrier(ce)) {
1528                         GEM_TRACE_ERR("Inactive context:%llx in pending[%zd]\n",
1529                                       ce->timeline->fence_context,
1530                                       port - execlists->pending);
1531                         ok = false;
1532                         goto unlock;
1533                 }
1534
1535                 if (!i915_vma_is_pinned(ce->state)) {
1536                         GEM_TRACE_ERR("Unpinned context:%llx in pending[%zd]\n",
1537                                       ce->timeline->fence_context,
1538                                       port - execlists->pending);
1539                         ok = false;
1540                         goto unlock;
1541                 }
1542
1543                 if (!i915_vma_is_pinned(ce->ring->vma)) {
1544                         GEM_TRACE_ERR("Unpinned ring:%llx in pending[%zd]\n",
1545                                       ce->timeline->fence_context,
1546                                       port - execlists->pending);
1547                         ok = false;
1548                         goto unlock;
1549                 }
1550
1551 unlock:
1552                 spin_unlock_irqrestore(&rq->lock, flags);
1553                 if (!ok)
1554                         return false;
1555         }
1556
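        /*
         * Note: the last context pointer doubles as the boolean result --
         * non-NULL (i.e. true) iff at least one pending request was found
         * and passed the checks above.
         */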
1557         return ce;
1558 }
1559
1560 static void execlists_submit_ports(struct intel_engine_cs *engine)
1561 {
1562         struct intel_engine_execlists *execlists = &engine->execlists;
1563         unsigned int n;
1564
1565         GEM_BUG_ON(!assert_pending_valid(execlists, "submit"));
1566
1567         /*
1568          * We can skip acquiring intel_runtime_pm_get() here as it was taken
1569          * on our behalf by the request (see i915_gem_mark_busy()) and it will
1570          * not be relinquished until the device is idle (see
1571          * i915_gem_idle_work_handler()). As a precaution, we make sure
1572          * that all ELSP are drained i.e. we have processed the CSB,
1573          * before allowing ourselves to idle and calling intel_runtime_pm_put().
1574          */
1575         GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
1576
1577         /*
1578          * ELSQ note: the submit queue is not cleared after being submitted
1579          * to the HW so we need to make sure we always clean it up. This is
1580          * currently ensured by the fact that we always write the same number
1581          * of elsq entries, keep this in mind before changing the loop below.
1582          */
1583         for (n = execlists_num_ports(execlists); n--; ) {
1584                 struct i915_request *rq = execlists->pending[n];
1585
1586                 write_desc(execlists,
1587                            rq ? execlists_update_context(rq) : 0,
1588                            n);
1589         }
1590
1591         /* we need to manually load the submit queue */
1592         if (execlists->ctrl_reg)
1593                 writel(EL_CTRL_LOAD, execlists->ctrl_reg);
1594 }
1595
1596 static bool ctx_single_port_submission(const struct intel_context *ce)
1597 {
1598         return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
1599                 intel_context_force_single_submission(ce));
1600 }
1601
1602 static bool can_merge_ctx(const struct intel_context *prev,
1603                           const struct intel_context *next)
1604 {
1605         if (prev != next)
1606                 return false;
1607
1608         if (ctx_single_port_submission(prev))
1609                 return false;
1610
1611         return true;
1612 }
1613
1614 static unsigned long i915_request_flags(const struct i915_request *rq)
1615 {
1616         return READ_ONCE(rq->fence.flags);
1617 }
1618
1619 static bool can_merge_rq(const struct i915_request *prev,
1620                          const struct i915_request *next)
1621 {
1622         GEM_BUG_ON(prev == next);
1623         GEM_BUG_ON(!assert_priority_queue(prev, next));
1624
1625         /*
1626          * We do not submit known completed requests. Therefore if the next
1627          * request is already completed, we can pretend to merge it in
1628          * with the previous context (and we will skip updating the ELSP
1629          * and tracking). Thus hopefully keeping the ELSP full with active
1630          * contexts, despite the best efforts of preempt-to-busy to confuse
1631          * us.
1632          */
1633         if (i915_request_completed(next))
1634                 return true;
1635
1636         if (unlikely((i915_request_flags(prev) ^ i915_request_flags(next)) &
1637                      (BIT(I915_FENCE_FLAG_NOPREEMPT) |
1638                       BIT(I915_FENCE_FLAG_SENTINEL))))
1639                 return false;
1640
1641         if (!can_merge_ctx(prev->context, next->context))
1642                 return false;
1643
1644         GEM_BUG_ON(i915_seqno_passed(prev->fence.seqno, next->fence.seqno));
1645         return true;
1646 }
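/*
 * In short, two requests may share a single ELSP port (one RING_TAIL update)
 * only if they use the same context, agree on the NOPREEMPT/SENTINEL flags,
 * and that context is not forced into single-port submission by GVT; an
 * already-completed 'next' is treated as trivially mergeable since it will
 * never be told to the hardware.
 */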
1647
1648 static void virtual_update_register_offsets(u32 *regs,
1649                                             struct intel_engine_cs *engine)
1650 {
1651         set_offsets(regs, reg_offsets(engine), engine, false);
1652 }
1653
1654 static bool virtual_matches(const struct virtual_engine *ve,
1655                             const struct i915_request *rq,
1656                             const struct intel_engine_cs *engine)
1657 {
1658         const struct intel_engine_cs *inflight;
1659
1660         if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */
1661                 return false;
1662
1663         /*
1664          * We track when the HW has completed saving the context image
1665          * (i.e. when we have seen the final CS event switching out of
1666          * the context) and must not overwrite the context image before
1667          * then. This restricts us to only using the active engine
1668          * while the previous virtualized request is inflight (so
1669          * we reuse the register offsets). This is a very small
1670                  * hysteresis on the greedy selection algorithm.
1671          */
1672         inflight = intel_context_inflight(&ve->context);
1673         if (inflight && inflight != engine)
1674                 return false;
1675
1676         return true;
1677 }
1678
1679 static void virtual_xfer_breadcrumbs(struct virtual_engine *ve,
1680                                      struct i915_request *rq)
1681 {
1682         struct intel_engine_cs *old = ve->siblings[0];
1683
1684         /* All unattached (rq->engine == old) must already be completed */
1685
1686         spin_lock(&old->breadcrumbs.irq_lock);
1687         if (!list_empty(&ve->context.signal_link)) {
1688                 list_del_init(&ve->context.signal_link);
1689
1690                 /*
1691                  * We cannot acquire the new engine->breadcrumbs.irq_lock
1692                  * (as we are holding a breadcrumbs.irq_lock already),
1693                  * so attach this request to the signaler on submission.
1694                  * The queued irq_work will occur when we finally drop
1695                  * the engine->active.lock after dequeue.
1696                  */
1697                 set_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &rq->fence.flags);
1698
1699                 /* Also transfer the pending irq_work for the old breadcrumb. */
1700                 intel_engine_signal_breadcrumbs(rq->engine);
1701         }
1702         spin_unlock(&old->breadcrumbs.irq_lock);
1703 }
1704
1705 #define for_each_waiter(p__, rq__) \
1706         list_for_each_entry_lockless(p__, \
1707                                      &(rq__)->sched.waiters_list, \
1708                                      wait_link)
1709
1710 #define for_each_signaler(p__, rq__) \
1711         list_for_each_entry_rcu(p__, \
1712                                 &(rq__)->sched.signalers_list, \
1713                                 signal_link)
1714
1715 static void defer_request(struct i915_request *rq, struct list_head * const pl)
1716 {
1717         LIST_HEAD(list);
1718
1719         /*
1720          * We want to move the interrupted request to the back of
1721          * the round-robin list (i.e. its priority level), but
1722          * in doing so, we must then move all requests that were in
1723          * flight and were waiting for the interrupted request to
1724          * be run after it again.
1725          */
1726         do {
1727                 struct i915_dependency *p;
1728
1729                 GEM_BUG_ON(i915_request_is_active(rq));
1730                 list_move_tail(&rq->sched.link, pl);
1731
1732                 for_each_waiter(p, rq) {
1733                         struct i915_request *w =
1734                                 container_of(p->waiter, typeof(*w), sched);
1735
1736                         /* Leave semaphores spinning on the other engines */
1737                         if (w->engine != rq->engine)
1738                                 continue;
1739
1740                         /* No waiter should start before its signaler */
1741                         GEM_BUG_ON(i915_request_started(w) &&
1742                                    !i915_request_completed(rq));
1743
1744                         GEM_BUG_ON(i915_request_is_active(w));
1745                         if (!i915_request_is_ready(w))
1746                                 continue;
1747
1748                         if (rq_prio(w) < rq_prio(rq))
1749                                 continue;
1750
1751                         GEM_BUG_ON(rq_prio(w) > rq_prio(rq));
1752                         list_move_tail(&w->sched.link, &list);
1753                 }
1754
1755                 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
1756         } while (rq);
1757 }
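/*
 * Example: if the expired request A has on-engine waiters B and C at the
 * same priority, the loop above walks A, then B, then C, appending each to
 * the tail of the priority list, so their relative order A -> B -> C is
 * preserved behind whatever was already queued at that level.
 */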
1758
1759 static void defer_active(struct intel_engine_cs *engine)
1760 {
1761         struct i915_request *rq;
1762
1763         rq = __unwind_incomplete_requests(engine);
1764         if (!rq)
1765                 return;
1766
1767         defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq)));
1768 }
1769
1770 static bool
1771 need_timeslice(const struct intel_engine_cs *engine,
1772                const struct i915_request *rq)
1773 {
1774         int hint;
1775
1776         if (!intel_engine_has_timeslices(engine))
1777                 return false;
1778
1779         hint = engine->execlists.queue_priority_hint;
1780         if (!list_is_last(&rq->sched.link, &engine->active.requests))
1781                 hint = max(hint, rq_prio(list_next_entry(rq, sched.link)));
1782
1783         return hint >= effective_prio(rq);
1784 }
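/*
 * i.e. only arm the timeslice when somebody of equal or higher effective
 * priority is waiting to run, either in the priority queue (via
 * queue_priority_hint) or queued behind us on this engine's active list.
 */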
1785
1786 static bool
1787 timeslice_yield(const struct intel_engine_execlists *el,
1788                 const struct i915_request *rq)
1789 {
1790         /*
1791          * Once bitten, forever smitten!
1792          *
1793          * If the active context ever busy-waited on a semaphore,
1794          * it will be treated as a hog until the end of its timeslice (i.e.
1795          * until it is scheduled out and replaced by a new submission,
1796          * possibly even its own lite-restore). The HW only sends an interrupt
1797          * on the first miss, and we do not know if that semaphore has been
1798          * signaled, or even if it is now stuck on another semaphore. Play
1799          * safe, yield if it might be stuck -- it will be given a fresh
1800          * timeslice in the near future.
1801          */
1802         return upper_32_bits(rq->context->lrc_desc) == READ_ONCE(el->yield);
1803 }
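/*
 * el->yield is reset to -1 on each ELSP submission (see execlists_dequeue())
 * and, on hardware with the semaphore-wait interrupt, is presumably latched
 * with the CCID (upper 32 bits of the lrc descriptor) of the busy-waiting
 * context by the irq handler; matching it against the active request here is
 * what marks that context as a candidate to yield its slice.
 */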
1804
1805 static bool
1806 timeslice_expired(const struct intel_engine_execlists *el,
1807                   const struct i915_request *rq)
1808 {
1809         return timer_expired(&el->timer) || timeslice_yield(el, rq);
1810 }
1811
1812 static int
1813 switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq)
1814 {
1815         if (list_is_last(&rq->sched.link, &engine->active.requests))
1816                 return INT_MIN;
1817
1818         return rq_prio(list_next_entry(rq, sched.link));
1819 }
1820
1821 static inline unsigned long
1822 timeslice(const struct intel_engine_cs *engine)
1823 {
1824         return READ_ONCE(engine->props.timeslice_duration_ms);
1825 }
1826
1827 static unsigned long active_timeslice(const struct intel_engine_cs *engine)
1828 {
1829         const struct intel_engine_execlists *execlists = &engine->execlists;
1830         const struct i915_request *rq = *execlists->active;
1831
1832         if (!rq || i915_request_completed(rq))
1833                 return 0;
1834
1835         if (READ_ONCE(execlists->switch_priority_hint) < effective_prio(rq))
1836                 return 0;
1837
1838         return timeslice(engine);
1839 }
1840
1841 static void set_timeslice(struct intel_engine_cs *engine)
1842 {
1843         unsigned long duration;
1844
1845         if (!intel_engine_has_timeslices(engine))
1846                 return;
1847
1848         duration = active_timeslice(engine);
1849         ENGINE_TRACE(engine, "bump timeslicing, interval:%lu\n", duration);
1850
1851         set_timer_ms(&engine->execlists.timer, duration);
1852 }
1853
1854 static void start_timeslice(struct intel_engine_cs *engine)
1855 {
1856         struct intel_engine_execlists *execlists = &engine->execlists;
1857         const int prio = queue_prio(execlists);
1858         unsigned long duration;
1859
1860         if (!intel_engine_has_timeslices(engine))
1861                 return;
1862
1863         WRITE_ONCE(execlists->switch_priority_hint, prio);
1864         if (prio == INT_MIN)
1865                 return;
1866
1867         if (timer_pending(&execlists->timer))
1868                 return;
1869
1870         duration = timeslice(engine);
1871         ENGINE_TRACE(engine,
1872                      "start timeslicing, prio:%d, interval:%lu\n",
1873                      prio, duration);
1874
1875         set_timer_ms(&execlists->timer, duration);
1876 }
1877
1878 static void record_preemption(struct intel_engine_execlists *execlists)
1879 {
1880         (void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++);
1881 }
1882
1883 static unsigned long active_preempt_timeout(struct intel_engine_cs *engine,
1884                                             const struct i915_request *rq)
1885 {
1886         if (!rq)
1887                 return 0;
1888
1889         /* Force a fast reset for terminated contexts (ignoring sysfs!) */
1890         if (unlikely(intel_context_is_banned(rq->context)))
1891                 return 1;
1892
1893         return READ_ONCE(engine->props.preempt_timeout_ms);
1894 }
1895
1896 static void set_preempt_timeout(struct intel_engine_cs *engine,
1897                                 const struct i915_request *rq)
1898 {
1899         if (!intel_engine_has_preempt_reset(engine))
1900                 return;
1901
1902         set_timer_ms(&engine->execlists.preempt,
1903                      active_preempt_timeout(engine, rq));
1904 }
1905
1906 static inline void clear_ports(struct i915_request **ports, int count)
1907 {
1908         memset_p((void **)ports, NULL, count);
1909 }
1910
1911 static void execlists_dequeue(struct intel_engine_cs *engine)
1912 {
1913         struct intel_engine_execlists * const execlists = &engine->execlists;
1914         struct i915_request **port = execlists->pending;
1915         struct i915_request ** const last_port = port + execlists->port_mask;
1916         struct i915_request * const *active;
1917         struct i915_request *last;
1918         struct rb_node *rb;
1919         bool submit = false;
1920
1921         /*
1922          * Hardware submission is through 2 ports. Conceptually each port
1923          * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
1924          * static for a context, and unique to each, so we only execute
1925          * requests belonging to a single context from each ring. RING_HEAD
1926          * is maintained by the CS in the context image, it marks the place
1927          * where it got up to last time, and through RING_TAIL we tell the CS
1928          * where we want to execute up to this time.
1929          *
1930          * In this list the requests are in order of execution. Consecutive
1931          * requests from the same context are adjacent in the ringbuffer. We
1932          * can combine these requests into a single RING_TAIL update:
1933          *
1934          *              RING_HEAD...req1...req2
1935          *                                    ^- RING_TAIL
1936          * since to execute req2 the CS must first execute req1.
1937          *
1938          * Our goal then is to point each port to the end of a consecutive
1939          * sequence of requests as being the most optimal (fewest wake ups
1940          * and context switches) submission.
1941          */
1942
1943         for (rb = rb_first_cached(&execlists->virtual); rb; ) {
1944                 struct virtual_engine *ve =
1945                         rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1946                 struct i915_request *rq = READ_ONCE(ve->request);
1947
1948                 if (!rq) { /* lazily cleanup after another engine handled rq */
1949                         rb_erase_cached(rb, &execlists->virtual);
1950                         RB_CLEAR_NODE(rb);
1951                         rb = rb_first_cached(&execlists->virtual);
1952                         continue;
1953                 }
1954
1955                 if (!virtual_matches(ve, rq, engine)) {
1956                         rb = rb_next(rb);
1957                         continue;
1958                 }
1959
1960                 break;
1961         }
1962
1963         /*
1964          * If the queue is higher priority than the last
1965          * request in the currently active context, submit afresh.
1966          * We will resubmit again afterwards in case we need to split
1967          * the active context to interject the preemption request,
1968          * i.e. we will retrigger preemption following the ack in case
1969          * of trouble.
1970          */
1971         active = READ_ONCE(execlists->active);
1972
1973         /*
1974          * In theory we can skip over completed contexts that have not
1975          * yet been processed by events (as those events are in flight):
1976          *
1977          * while ((last = *active) && i915_request_completed(last))
1978          *      active++;
1979          *
1980          * However, the GPU cannot handle this as it will ultimately
1981          * find itself trying to jump back into a context it has just
1982          * completed and barf.
1983          */
1984
1985         if ((last = *active)) {
1986                 if (need_preempt(engine, last, rb)) {
1987                         if (i915_request_completed(last)) {
1988                                 tasklet_hi_schedule(&execlists->tasklet);
1989                                 return;
1990                         }
1991
1992                         ENGINE_TRACE(engine,
1993                                      "preempting last=%llx:%lld, prio=%d, hint=%d\n",
1994                                      last->fence.context,
1995                                      last->fence.seqno,
1996                                      last->sched.attr.priority,
1997                                      execlists->queue_priority_hint);
1998                         record_preemption(execlists);
1999
2000                         /*
2001                          * Don't let the RING_HEAD advance past the breadcrumb
2002                          * as we unwind (and until we resubmit) so that we do
2003                          * not accidentally tell it to go backwards.
2004                          */
2005                         ring_set_paused(engine, 1);
2006
2007                         /*
2008                          * Note that we have not stopped the GPU at this point,
2009                          * so we are unwinding the incomplete requests as they
2010                          * remain inflight and so by the time we do complete
2011                          * the preemption, some of the unwound requests may
2012                          * complete!
2013                          */
2014                         __unwind_incomplete_requests(engine);
2015
2016                         last = NULL;
2017                 } else if (need_timeslice(engine, last) &&
2018                            timeslice_expired(execlists, last)) {
2019                         if (i915_request_completed(last)) {
2020                                 tasklet_hi_schedule(&execlists->tasklet);
2021                                 return;
2022                         }
2023
2024                         ENGINE_TRACE(engine,
2025                                      "expired last=%llx:%lld, prio=%d, hint=%d, yield?=%s\n",
2026                                      last->fence.context,
2027                                      last->fence.seqno,
2028                                      last->sched.attr.priority,
2029                                      execlists->queue_priority_hint,
2030                                      yesno(timeslice_yield(execlists, last)));
2031
2032                         ring_set_paused(engine, 1);
2033                         defer_active(engine);
2034
2035                         /*
2036                          * Unlike for preemption, if we rewind and continue
2037                          * executing the same context as previously active,
2038                          * the order of execution will remain the same and
2039                          * the tail will only advance. We do not need to
2040                          * force a full context restore, as a lite-restore
2041                          * is sufficient to resample the monotonic TAIL.
2042                          *
2043                          * If we switch to any other context, similarly we
2044                          * will not rewind TAIL of current context, and
2045                          * normal save/restore will preserve state and allow
2046                          * us to later continue executing the same request.
2047                          */
2048                         last = NULL;
2049                 } else {
2050                         /*
2051                          * Otherwise if we already have a request pending
2052                          * for execution after the current one, we can
2053                          * just wait until the next CS event before
2054                          * queuing more. In either case we will force a
2055                          * lite-restore preemption event, but if we wait
2056                          * we hopefully coalesce several updates into a single
2057                          * submission.
2058                          */
2059                         if (!list_is_last(&last->sched.link,
2060                                           &engine->active.requests)) {
2061                                 /*
2062                                  * Even if ELSP[1] is occupied and not worthy
2063                                  * of timeslices, our queue might be.
2064                                  */
2065                                 start_timeslice(engine);
2066                                 return;
2067                         }
2068                 }
2069         }
2070
2071         while (rb) { /* XXX virtual is always taking precedence */
2072                 struct virtual_engine *ve =
2073                         rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
2074                 struct i915_request *rq;
2075
2076                 spin_lock(&ve->base.active.lock);
2077
2078                 rq = ve->request;
2079                 if (unlikely(!rq)) { /* lost the race to a sibling */
2080                         spin_unlock(&ve->base.active.lock);
2081                         rb_erase_cached(rb, &execlists->virtual);
2082                         RB_CLEAR_NODE(rb);
2083                         rb = rb_first_cached(&execlists->virtual);
2084                         continue;
2085                 }
2086
2087                 GEM_BUG_ON(rq != ve->request);
2088                 GEM_BUG_ON(rq->engine != &ve->base);
2089                 GEM_BUG_ON(rq->context != &ve->context);
2090
2091                 if (rq_prio(rq) >= queue_prio(execlists)) {
2092                         if (!virtual_matches(ve, rq, engine)) {
2093                                 spin_unlock(&ve->base.active.lock);
2094                                 rb = rb_next(rb);
2095                                 continue;
2096                         }
2097
2098                         if (last && !can_merge_rq(last, rq)) {
2099                                 spin_unlock(&ve->base.active.lock);
2100                                 start_timeslice(engine);
2101                                 return; /* leave this for another sibling */
2102                         }
2103
2104                         ENGINE_TRACE(engine,
2105                                      "virtual rq=%llx:%lld%s, new engine? %s\n",
2106                                      rq->fence.context,
2107                                      rq->fence.seqno,
2108                                      i915_request_completed(rq) ? "!" :
2109                                      i915_request_started(rq) ? "*" :
2110                                      "",
2111                                      yesno(engine != ve->siblings[0]));
2112
2113                         WRITE_ONCE(ve->request, NULL);
2114                         WRITE_ONCE(ve->base.execlists.queue_priority_hint,
2115                                    INT_MIN);
2116                         rb_erase_cached(rb, &execlists->virtual);
2117                         RB_CLEAR_NODE(rb);
2118
2119                         GEM_BUG_ON(!(rq->execution_mask & engine->mask));
2120                         WRITE_ONCE(rq->engine, engine);
2121
2122                         if (engine != ve->siblings[0]) {
2123                                 u32 *regs = ve->context.lrc_reg_state;
2124                                 unsigned int n;
2125
2126                                 GEM_BUG_ON(READ_ONCE(ve->context.inflight));
2127
2128                                 if (!intel_engine_has_relative_mmio(engine))
2129                                         virtual_update_register_offsets(regs,
2130                                                                         engine);
2131
2132                                 if (!list_empty(&ve->context.signals))
2133                                         virtual_xfer_breadcrumbs(ve, rq);
2134
2135                                 /*
2136                                  * Move the bound engine to the top of the list
2137                                  * for future execution. We then kick this
2138                                  * tasklet first before checking others, so that
2139                                  * we preferentially reuse this set of bound
2140                                  * registers.
2141                                  */
2142                                 for (n = 1; n < ve->num_siblings; n++) {
2143                                         if (ve->siblings[n] == engine) {
2144                                                 swap(ve->siblings[n],
2145                                                      ve->siblings[0]);
2146                                                 break;
2147                                         }
2148                                 }
2149
2150                                 GEM_BUG_ON(ve->siblings[0] != engine);
2151                         }
2152
2153                         if (__i915_request_submit(rq)) {
2154                                 submit = true;
2155                                 last = rq;
2156                         }
2157                         i915_request_put(rq);
2158
2159                         /*
2160                          * Hmm, we have a bunch of virtual engine requests,
2161                          * but the first one was already completed (thanks
2162                          * preempt-to-busy!). Keep looking at the virtual engine queue
2163                          * until we have no more relevant requests (i.e.
2164                          * the normal submit queue has higher priority).
2165                          */
2166                         if (!submit) {
2167                                 spin_unlock(&ve->base.active.lock);
2168                                 rb = rb_first_cached(&execlists->virtual);
2169                                 continue;
2170                         }
2171                 }
2172
2173                 spin_unlock(&ve->base.active.lock);
2174                 break;
2175         }
2176
2177         while ((rb = rb_first_cached(&execlists->queue))) {
2178                 struct i915_priolist *p = to_priolist(rb);
2179                 struct i915_request *rq, *rn;
2180                 int i;
2181
2182                 priolist_for_each_request_consume(rq, rn, p, i) {
2183                         bool merge = true;
2184
2185                         /*
2186                          * Can we combine this request with the current port?
2187                          * It has to be the same context/ringbuffer and not
2188                          * have any exceptions (e.g. GVT saying never to
2189                          * combine contexts).
2190                          *
2191                          * If we can combine the requests, we can execute both
2192                          * by updating the RING_TAIL to point to the end of the
2193                          * second request, and so we never need to tell the
2194                          * hardware about the first.
2195                          */
2196                         if (last && !can_merge_rq(last, rq)) {
2197                                 /*
2198                                  * If we are on the second port and cannot
2199                                  * combine this request with the last, then we
2200                                  * are done.
2201                                  */
2202                                 if (port == last_port)
2203                                         goto done;
2204
2205                                 /*
2206                                  * We must not populate both ELSP[] with the
2207                                  * same LRCA, i.e. we must submit 2 different
2208                                  * contexts if we submit 2 ELSP.
2209                                  */
2210                                 if (last->context == rq->context)
2211                                         goto done;
2212
2213                                 if (i915_request_has_sentinel(last))
2214                                         goto done;
2215
2216                                 /*
2217                                  * If GVT overrides us we only ever submit
2218                                  * port[0], leaving port[1] empty. Note that we
2219                                  * also have to be careful that we don't queue
2220                                  * the same context (even though a different
2221                                  * request) to the second port.
2222                                  */
2223                                 if (ctx_single_port_submission(last->context) ||
2224                                     ctx_single_port_submission(rq->context))
2225                                         goto done;
2226
2227                                 merge = false;
2228                         }
2229
2230                         if (__i915_request_submit(rq)) {
2231                                 if (!merge) {
2232                                         *port = execlists_schedule_in(last, port - execlists->pending);
2233                                         port++;
2234                                         last = NULL;
2235                                 }
2236
2237                                 GEM_BUG_ON(last &&
2238                                            !can_merge_ctx(last->context,
2239                                                           rq->context));
2240                                 GEM_BUG_ON(last &&
2241                                            i915_seqno_passed(last->fence.seqno,
2242                                                              rq->fence.seqno));
2243
2244                                 submit = true;
2245                                 last = rq;
2246                         }
2247                 }
2248
2249                 rb_erase_cached(&p->node, &execlists->queue);
2250                 i915_priolist_free(p);
2251         }
2252
2253 done:
2254         /*
2255          * Here be a bit of magic! Or sleight-of-hand, whichever you prefer.
2256          *
2257          * We choose the priority hint such that if we add a request of greater
2258          * priority than this, we kick the submission tasklet to decide on
2259          * the right order of submitting the requests to hardware. We must
2260          * also be prepared to reorder requests as they are in-flight on the
2261          * HW. We derive the priority hint then as the first "hole" in
2262          * the HW submission ports and if there are no available slots,
2263          * the priority of the lowest executing request, i.e. last.
2264          *
2265          * When we do receive a higher priority request ready to run from the
2266          * user, see queue_request(), the priority hint is bumped to that
2267          * request triggering preemption on the next dequeue (or subsequent
2268          * interrupt for secondary ports).
2269          */
2270         execlists->queue_priority_hint = queue_prio(execlists);
2271
2272         if (submit) {
2273                 *port = execlists_schedule_in(last, port - execlists->pending);
2274                 execlists->switch_priority_hint =
2275                         switch_prio(engine, *execlists->pending);
2276
2277                 /*
2278                  * Skip if we ended up with exactly the same set of requests,
2279                  * e.g. trying to timeslice a pair of ordered contexts
2280                  */
2281                 if (!memcmp(active, execlists->pending,
2282                             (port - execlists->pending + 1) * sizeof(*port))) {
2283                         do
2284                                 execlists_schedule_out(fetch_and_zero(port));
2285                         while (port-- != execlists->pending);
2286
2287                         goto skip_submit;
2288                 }
2289                 clear_ports(port + 1, last_port - port);
2290
2291                 WRITE_ONCE(execlists->yield, -1);
2292                 execlists_submit_ports(engine);
2293                 set_preempt_timeout(engine, *active);
2294         } else {
2295 skip_submit:
2296                 ring_set_paused(engine, 0);
2297         }
2298 }
2299
2300 static void
2301 cancel_port_requests(struct intel_engine_execlists * const execlists)
2302 {
2303         struct i915_request * const *port;
2304
2305         for (port = execlists->pending; *port; port++)
2306                 execlists_schedule_out(*port);
2307         clear_ports(execlists->pending, ARRAY_SIZE(execlists->pending));
2308
2309         /* Mark the end of active before we overwrite *active */
2310         for (port = xchg(&execlists->active, execlists->pending); *port; port++)
2311                 execlists_schedule_out(*port);
2312         clear_ports(execlists->inflight, ARRAY_SIZE(execlists->inflight));
2313
2314         smp_wmb(); /* complete the seqlock for execlists_active() */
2315         WRITE_ONCE(execlists->active, execlists->inflight);
2316 }
2317
2318 static inline void
2319 invalidate_csb_entries(const u32 *first, const u32 *last)
2320 {
2321         clflush((void *)first);
2322         clflush((void *)last);
2323 }
2324
2325 /*
2326  * Starting with Gen12, the status has a new format:
2327  *
2328  *     bit  0:     switched to new queue
2329  *     bit  1:     reserved
2330  *     bit  2:     semaphore wait mode (poll or signal), only valid when
2331  *                 switch detail is set to "wait on semaphore"
2332  *     bits 3-5:   engine class
2333  *     bits 6-11:  engine instance
2334  *     bits 12-14: reserved
2335  *     bits 15-25: sw context id of the lrc the GT switched to
2336  *     bits 26-31: sw counter of the lrc the GT switched to
2337  *     bits 32-35: context switch detail
2338  *                  - 0: ctx complete
2339  *                  - 1: wait on sync flip
2340  *                  - 2: wait on vblank
2341  *                  - 3: wait on scanline
2342  *                  - 4: wait on semaphore
2343  *                  - 5: context preempted (not on SEMAPHORE_WAIT or
2344  *                       WAIT_FOR_EVENT)
2345  *     bit  36:    reserved
2346  *     bits 37-43: wait detail (for switch detail 1 to 4)
2347  *     bits 44-46: reserved
2348  *     bits 47-57: sw context id of the lrc the GT switched away from
2349  *     bits 58-63: sw counter of the lrc the GT switched away from
2350  */
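/*
 * gen12_csb_parse() below boils this format down to one question: did the
 * event promote execlists->pending[] into execlists->active[]? It did
 * whenever the "switched to new queue" bit is set, or whenever the outgoing
 * context is not reported as valid (nothing was running); any other event
 * is treated as completion of the currently active port.
 */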
2351 static inline bool
2352 gen12_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
2353 {
2354         u32 lower_dw = csb[0];
2355         u32 upper_dw = csb[1];
2356         bool ctx_to_valid = GEN12_CSB_CTX_VALID(lower_dw);
2357         bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_dw);
2358         bool new_queue = lower_dw & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE;
2359
2360         /*
2361          * The context switch detail is not guaranteed to be 5 when a preemption
2362          * occurs, so we can't just check for that. The check below works for
2363          * all the cases we care about, including preemptions of WAIT
2364          * instructions and lite-restore. Preempt-to-idle via the CTRL register
2365          * would require some extra handling, but we don't support that.
2366          */
2367         if (!ctx_away_valid || new_queue) {
2368                 GEM_BUG_ON(!ctx_to_valid);
2369                 return true;
2370         }
2371
2372         /*
2373          * switch detail = 5 is covered by the case above and we do not expect a
2374          * context switch on an unsuccessful wait instruction since we always
2375          * use polling mode.
2376          */
2377         GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_dw));
2378         return false;
2379 }
2380
2381 static inline bool
2382 gen8_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
2383 {
2384         return *csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED);
2385 }
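/*
 * Pre-Gen12 the CSB instead carries discrete status flags: an idle->active
 * or preempted event means the pending[] ports were promoted, while any
 * other event is treated as completion of the active port, mirroring the
 * Gen12 logic above.
 */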
2386
2387 static void process_csb(struct intel_engine_cs *engine)
2388 {
2389         struct intel_engine_execlists * const execlists = &engine->execlists;
2390         const u32 * const buf = execlists->csb_status;
2391         const u8 num_entries = execlists->csb_size;
2392         u8 head, tail;
2393
2394         /*
2395          * As we modify our execlists state tracking we require exclusive
2396          * access. Either we are inside the tasklet, or the tasklet is disabled
2397          * and we assume that is only inside the reset paths and so serialised.
2398          */
2399         GEM_BUG_ON(!tasklet_is_locked(&execlists->tasklet) &&
2400                    !reset_in_progress(execlists));
2401         GEM_BUG_ON(!intel_engine_in_execlists_submission_mode(engine));
2402
2403         /*
2404          * Note that csb_write, csb_status may be either in HWSP or mmio.
2405          * When reading from the csb_write mmio register, we have to be
2406          * careful to only use the GEN8_CSB_WRITE_PTR portion, which is
2407          * the low 4 bits. As it happens we know the next 4 bits are always
2408          * zero and so we can simply mask off the low u8 of the register
2409          * and treat it identically to reading from the HWSP (without having
2410          * to use explicit shifting and masking, and probably bifurcating
2411          * the code to handle the legacy mmio read).
2412          */
2413         head = execlists->csb_head;
2414         tail = READ_ONCE(*execlists->csb_write);
2415         if (unlikely(head == tail))
2416                 return;
2417
2418         /*
2419          * Hopefully paired with a wmb() in HW!
2420          *
2421          * We must complete the read of the write pointer before any reads
2422          * from the CSB, so that we do not see stale values. Without an rmb
2423          * (lfence) the HW may speculatively perform the CSB[] reads *before*
2424          * we perform the READ_ONCE(*csb_write).
2425          */
2426         rmb();
2427
2428         ENGINE_TRACE(engine, "cs-irq head=%d, tail=%d\n", head, tail);
2429         do {
2430                 bool promote;
2431
2432                 if (++head == num_entries)
2433                         head = 0;
2434
2435                 /*
2436                  * We are flying near dragons again.
2437                  *
2438                  * We hold a reference to the request in execlist_port[]
2439                  * but no more than that. We are operating in softirq
2440                  * context and so cannot hold any mutex or sleep. That
2441                  * prevents us stopping the requests we are processing
2442                  * in port[] from being retired simultaneously (the
2443                  * breadcrumb will be complete before we see the
2444                  * context-switch). As we only hold the reference to the
2445                  * request, any pointer chasing underneath the request
2446                  * is subject to a potential use-after-free. Thus we
2447                  * store all of the bookkeeping within port[] as
2448                  * required, and avoid using unguarded pointers beneath
2449                  * request itself. The same applies to the atomic
2450                  * status notifier.
2451                  */
2452
2453                 ENGINE_TRACE(engine, "csb[%d]: status=0x%08x:0x%08x\n",
2454                              head, buf[2 * head + 0], buf[2 * head + 1]);
2455
2456                 if (INTEL_GEN(engine->i915) >= 12)
2457                         promote = gen12_csb_parse(execlists, buf + 2 * head);
2458                 else
2459                         promote = gen8_csb_parse(execlists, buf + 2 * head);
2460                 if (promote) {
2461                         struct i915_request * const *old = execlists->active;
2462
2463                         ring_set_paused(engine, 0);
2464
2465                         /* Point active to the new ELSP; prevent overwriting */
2466                         WRITE_ONCE(execlists->active, execlists->pending);
2467                         smp_wmb(); /* notify execlists_active() */
2468
2469                         /* cancel old inflight, prepare for switch */
2470                         trace_ports(execlists, "preempted", old);
2471                         while (*old)
2472                                 execlists_schedule_out(*old++);
2473
2474                         /* switch pending to inflight */
2475                         GEM_BUG_ON(!assert_pending_valid(execlists, "promote"));
2476                         memcpy(execlists->inflight,
2477                                execlists->pending,
2478                                execlists_num_ports(execlists) *
2479                                sizeof(*execlists->pending));
2480                         smp_wmb(); /* complete the seqlock */
2481                         WRITE_ONCE(execlists->active, execlists->inflight);
2482
2483                         WRITE_ONCE(execlists->pending[0], NULL);
2484                 } else {
2485                         GEM_BUG_ON(!*execlists->active);
2486
2487                         /* port0 completed, advanced to port1 */
2488                         trace_ports(execlists, "completed", execlists->active);
2489
2490                         /*
2491                          * We rely on the hardware being strongly
2492                          * ordered, that the breadcrumb write is
2493                          * coherent (visible from the CPU) before the
2494                          * user interrupt is processed. One might assume
2495                          * that the breadcrumb write, being issued before both
2496                          * the user interrupt and the CS event for the context
2497                          * switch, would therefore be visible before the CS
2498                          * event itself...
2499                          */
2500                         if (GEM_SHOW_DEBUG() &&
2501                             !i915_request_completed(*execlists->active)) {
2502                                 struct i915_request *rq = *execlists->active;
2503                                 const u32 *regs __maybe_unused =
2504                                         rq->context->lrc_reg_state;
2505
2506                                 ENGINE_TRACE(engine,
2507                                              "context completed before request!\n");
2508                                 ENGINE_TRACE(engine,
2509                                              "ring:{start:0x%08x, head:%04x, tail:%04x, ctl:%08x, mode:%08x}\n",
2510                                              ENGINE_READ(engine, RING_START),
2511                                              ENGINE_READ(engine, RING_HEAD) & HEAD_ADDR,
2512                                              ENGINE_READ(engine, RING_TAIL) & TAIL_ADDR,
2513                                              ENGINE_READ(engine, RING_CTL),
2514                                              ENGINE_READ(engine, RING_MI_MODE));
2515                                 ENGINE_TRACE(engine,
2516                                              "rq:{start:%08x, head:%04x, tail:%04x, seqno:%llx:%d, hwsp:%d}, ",
2517                                              i915_ggtt_offset(rq->ring->vma),
2518                                              rq->head, rq->tail,
2519                                              rq->fence.context,
2520                                              lower_32_bits(rq->fence.seqno),
2521                                              hwsp_seqno(rq));
2522                                 ENGINE_TRACE(engine,
2523                                              "ctx:{start:%08x, head:%04x, tail:%04x}, ",
2524                                              regs[CTX_RING_START],
2525                                              regs[CTX_RING_HEAD],
2526                                              regs[CTX_RING_TAIL]);
2527                         }
2528
2529                         execlists_schedule_out(*execlists->active++);
2530
2531                         GEM_BUG_ON(execlists->active - execlists->inflight >
2532                                    execlists_num_ports(execlists));
2533                 }
2534         } while (head != tail);
2535
2536         execlists->csb_head = head;
2537         set_timeslice(engine);
2538
2539         /*
2540          * Gen11 has proven to fail wrt global observation point between
2541          * entry and tail update, failing on the ordering and thus
2542          * we see an old entry in the context status buffer.
2543          *
2544          * Forcibly evict the entries before the next GPU CSB update,
2545          * to increase the odds that we get fresh entries with non-working
2546          * hardware. The cost of doing so mostly comes out in the wash, as
2547          * the hardware, working or not, will need to do the invalidation
2548          * beforehand.
2549          */
2550         invalidate_csb_entries(&buf[0], &buf[num_entries - 1]);
2551 }
2552
2553 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
2554 {
2555         lockdep_assert_held(&engine->active.lock);
2556         if (!READ_ONCE(engine->execlists.pending[0])) {
2557                 rcu_read_lock(); /* protect peeking at execlists->active */
2558                 execlists_dequeue(engine);
2559                 rcu_read_unlock();
2560         }
2561 }
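/*
 * Note that dequeue is skipped while execlists->pending[0] is non-NULL,
 * i.e. while a previous ELSP write is still waiting for its promotion
 * event in process_csb(); only one submission is kept in flight at a time.
 */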
2562
2563 static void __execlists_hold(struct i915_request *rq)
2564 {
2565         LIST_HEAD(list);
2566
2567         do {
2568                 struct i915_dependency *p;
2569
2570                 if (i915_request_is_active(rq))
2571                         __i915_request_unsubmit(rq);
2572
2573                 clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2574                 list_move_tail(&rq->sched.link, &rq->engine->active.hold);
2575                 i915_request_set_hold(rq);
2576                 RQ_TRACE(rq, "on hold\n");
2577
2578                 for_each_waiter(p, rq) {
2579                         struct i915_request *w =
2580                                 container_of(p->waiter, typeof(*w), sched);
2581
2582                         /* Leave semaphores spinning on the other engines */
2583                         if (w->engine != rq->engine)
2584                                 continue;
2585
2586                         if (!i915_request_is_ready(w))
2587                                 continue;
2588
2589                         if (i915_request_completed(w))
2590                                 continue;
2591
2592                         if (i915_request_on_hold(w))
2593                                 continue;
2594
2595                         list_move_tail(&w->sched.link, &list);
2596                 }
2597
2598                 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2599         } while (rq);
2600 }
2601
2602 static bool execlists_hold(struct intel_engine_cs *engine,
2603                            struct i915_request *rq)
2604 {
2605         spin_lock_irq(&engine->active.lock);
2606
2607         if (i915_request_completed(rq)) { /* too late! */
2608                 rq = NULL;
2609                 goto unlock;
2610         }
2611
2612         if (rq->engine != engine) { /* preempted virtual engine */
2613                 struct virtual_engine *ve = to_virtual_engine(rq->engine);
2614
2615                 /*
2616                  * intel_context_inflight() is only protected by virtue
2617                  * of process_csb() being called only by the tasklet (or
2618                  * directly from inside reset while the tasklet is suspended).
2619                  * Assert that neither of those are allowed to run while we
2620                  * poke at the request queues.
2621                  */
2622                 GEM_BUG_ON(!reset_in_progress(&engine->execlists));
2623
2624                 /*
2625                  * An unsubmitted request along a virtual engine will
2626                  * remain on the active (this) engine until we are able
2627                  * to process the context switch away (and so mark the
2628                  * context as no longer in flight). That cannot have happened
2629                  * yet, otherwise we would not be hanging!
2630                  */
2631                 spin_lock(&ve->base.active.lock);
2632                 GEM_BUG_ON(intel_context_inflight(rq->context) != engine);
2633                 GEM_BUG_ON(ve->request != rq);
2634                 ve->request = NULL;
2635                 spin_unlock(&ve->base.active.lock);
2636                 i915_request_put(rq);
2637
2638                 rq->engine = engine;
2639         }
2640
2641         /*
2642          * Transfer this request onto the hold queue to prevent it
2643          * being resubmitted to HW (and potentially completed) before we have
2644          * released it. Since we may have already submitted following
2645          * requests, we need to remove those as well.
2646          */
2647         GEM_BUG_ON(i915_request_on_hold(rq));
2648         GEM_BUG_ON(rq->engine != engine);
2649         __execlists_hold(rq);
2650         GEM_BUG_ON(list_empty(&engine->active.hold));
2651
2652 unlock:
2653         spin_unlock_irq(&engine->active.lock);
2654         return rq;
2655 }
2656
2657 static bool hold_request(const struct i915_request *rq)
2658 {
2659         struct i915_dependency *p;
2660         bool result = false;
2661
2662         /*
2663          * If one of our ancestors is on hold, we must also be on hold,
2664          * otherwise we will bypass it and execute before it.
2665          */
2666         rcu_read_lock();
2667         for_each_signaler(p, rq) {
2668                 const struct i915_request *s =
2669                         container_of(p->signaler, typeof(*s), sched);
2670
2671                 if (s->engine != rq->engine)
2672                         continue;
2673
2674                 result = i915_request_on_hold(s);
2675                 if (result)
2676                         break;
2677         }
2678         rcu_read_unlock();
2679
2680         return result;
2681 }
2682
2683 static void __execlists_unhold(struct i915_request *rq)
2684 {
2685         LIST_HEAD(list);
2686
2687         do {
2688                 struct i915_dependency *p;
2689
2690                 RQ_TRACE(rq, "hold release\n");
2691
2692                 GEM_BUG_ON(!i915_request_on_hold(rq));
2693                 GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit));
2694
2695                 i915_request_clear_hold(rq);
2696                 list_move_tail(&rq->sched.link,
2697                                i915_sched_lookup_priolist(rq->engine,
2698                                                           rq_prio(rq)));
2699                 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2700
2701                 /* Also release any children on this engine that are ready */
2702                 for_each_waiter(p, rq) {
2703                         struct i915_request *w =
2704                                 container_of(p->waiter, typeof(*w), sched);
2705
2706                         /* Propagate any change in error status */
2707                         if (rq->fence.error)
2708                                 i915_request_set_error_once(w, rq->fence.error);
2709
2710                         if (w->engine != rq->engine)
2711                                 continue;
2712
2713                         if (!i915_request_on_hold(w))
2714                                 continue;
2715
2716                         /* Check that no other parents are also on hold */
2717                         if (hold_request(w))
2718                                 continue;
2719
2720                         list_move_tail(&w->sched.link, &list);
2721                 }
2722
2723                 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2724         } while (rq);
2725 }
2726
2727 static void execlists_unhold(struct intel_engine_cs *engine,
2728                              struct i915_request *rq)
2729 {
2730         spin_lock_irq(&engine->active.lock);
2731
2732         /*
2733          * Move this request back to the priority queue, and all of its
2734          * children and grandchildren that were suspended along with it.
2735          */
2736         __execlists_unhold(rq);
2737
2738         if (rq_prio(rq) > engine->execlists.queue_priority_hint) {
2739                 engine->execlists.queue_priority_hint = rq_prio(rq);
2740                 tasklet_hi_schedule(&engine->execlists.tasklet);
2741         }
2742
2743         spin_unlock_irq(&engine->active.lock);
2744 }
2745
2746 struct execlists_capture {
2747         struct work_struct work;
2748         struct i915_request *rq;
2749         struct i915_gpu_coredump *error;
2750 };
2751
2752 static void execlists_capture_work(struct work_struct *work)
2753 {
2754         struct execlists_capture *cap = container_of(work, typeof(*cap), work);
2755         const gfp_t gfp = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN;
2756         struct intel_engine_cs *engine = cap->rq->engine;
2757         struct intel_gt_coredump *gt = cap->error->gt;
2758         struct intel_engine_capture_vma *vma;
2759
2760         /* Compress all the objects attached to the request, slow! */
2761         vma = intel_engine_coredump_add_request(gt->engine, cap->rq, gfp);
2762         if (vma) {
2763                 struct i915_vma_compress *compress =
2764                         i915_vma_capture_prepare(gt);
2765
2766                 intel_engine_coredump_add_vma(gt->engine, vma, compress);
2767                 i915_vma_capture_finish(gt, compress);
2768         }
2769
2770         gt->simulated = gt->engine->simulated;
2771         cap->error->simulated = gt->simulated;
2772
2773         /* Publish the error state, and announce it to the world */
2774         i915_error_state_store(cap->error);
2775         i915_gpu_coredump_put(cap->error);
2776
2777         /* Return this request and all that depend upon it for signaling */
2778         execlists_unhold(engine, cap->rq);
2779         i915_request_put(cap->rq);
2780
2781         kfree(cap);
2782 }
2783
2784 static struct execlists_capture *capture_regs(struct intel_engine_cs *engine)
2785 {
2786         const gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
2787         struct execlists_capture *cap;
2788
2789         cap = kmalloc(sizeof(*cap), gfp);
2790         if (!cap)
2791                 return NULL;
2792
2793         cap->error = i915_gpu_coredump_alloc(engine->i915, gfp);
2794         if (!cap->error)
2795                 goto err_cap;
2796
2797         cap->error->gt = intel_gt_coredump_alloc(engine->gt, gfp);
2798         if (!cap->error->gt)
2799                 goto err_gpu;
2800
2801         cap->error->gt->engine = intel_engine_coredump_alloc(engine, gfp);
2802         if (!cap->error->gt->engine)
2803                 goto err_gt;
2804
2805         return cap;
2806
2807 err_gt:
2808         kfree(cap->error->gt);
2809 err_gpu:
2810         kfree(cap->error);
2811 err_cap:
2812         kfree(cap);
2813         return NULL;
2814 }
2815
2816 static struct i915_request *
2817 active_context(struct intel_engine_cs *engine, u32 ccid)
2818 {
2819         const struct intel_engine_execlists * const el = &engine->execlists;
2820         struct i915_request * const *port, *rq;
2821
2822         /*
2823          * Use the most recent result from process_csb(), but just in case
2824          * we trigger an error (via interrupt) before the first CS event has
2825          * been written, peek at the next submission.
2826          */
2827
2828         for (port = el->active; (rq = *port); port++) {
2829                 if (upper_32_bits(rq->context->lrc_desc) == ccid) {
2830                         ENGINE_TRACE(engine,
2831                                      "ccid found at active:%zd\n",
2832                                      port - el->active);
2833                         return rq;
2834                 }
2835         }
2836
2837         for (port = el->pending; (rq = *port); port++) {
2838                 if (upper_32_bits(rq->context->lrc_desc) == ccid) {
2839                         ENGINE_TRACE(engine,
2840                                      "ccid found at pending:%zd\n",
2841                                      port - el->pending);
2842                         return rq;
2843                 }
2844         }
2845
2846         ENGINE_TRACE(engine, "ccid:%x not found\n", ccid);
2847         return NULL;
2848 }
2849
2850 static u32 active_ccid(struct intel_engine_cs *engine)
2851 {
2852         return ENGINE_READ_FW(engine, RING_EXECLIST_STATUS_HI);
2853 }
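/*
 * active_ccid() samples the upper dword of the execlists status register,
 * which holds the context ID of whatever the CS is currently running;
 * active_context() matches that value against the upper 32 bits of each
 * request's lrc_desc, where the same ID is encoded.
 */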
2854
2855 static bool execlists_capture(struct intel_engine_cs *engine)
2856 {
2857         struct execlists_capture *cap;
2858
2859         if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR))
2860                 return true;
2861
2862         /*
2863          * We need to _quickly_ capture the engine state before we reset.
2864          * We are inside an atomic section (softirq) here and we are delaying
2865          * the forced preemption event.
2866          */
2867         cap = capture_regs(engine);
2868         if (!cap)
2869                 return true;
2870
2871         spin_lock_irq(&engine->active.lock);
2872         cap->rq = active_context(engine, active_ccid(engine));
2873         if (cap->rq) {
2874                 cap->rq = active_request(cap->rq->context->timeline, cap->rq);
2875                 cap->rq = i915_request_get_rcu(cap->rq);
2876         }
2877         spin_unlock_irq(&engine->active.lock);
2878         if (!cap->rq)
2879                 goto err_free;
2880
2881         /*
2882          * Remove the request from the execlists queue, and take ownership
2883          * of the request. We pass it to our worker who will _slowly_ compress
2884          * all the pages the _user_ requested for debugging their batch, after
2885          * which we return it to the queue for signaling.
2886          *
2887          * By removing them from the execlists queue, we also remove the
2888          * requests from being processed by __unwind_incomplete_requests()
2889          * during the intel_engine_reset(), and so they will *not* be replayed
2890          * afterwards.
2891          *
2892          * Note that because we have not yet reset the engine at this point,
2893          * it is possible that the request we have identified as guilty did
2894          * in fact complete, in which case we will hit an arbitration
2895          * point allowing the outstanding preemption to succeed. The likelihood
2896          * of that is very low (as capturing of the engine registers should be
2897          * fast enough to run inside an irq-off atomic section!), so we will
2898          * simply hold that request accountable for being non-preemptible
2899          * long enough to force the reset.
2900          */
2901         if (!execlists_hold(engine, cap->rq))
2902                 goto err_rq;
2903
2904         INIT_WORK(&cap->work, execlists_capture_work);
2905         schedule_work(&cap->work);
2906         return true;
2907
2908 err_rq:
2909         i915_request_put(cap->rq);
2910 err_free:
2911         i915_gpu_coredump_put(cap->error);
2912         kfree(cap);
2913         return false;
2914 }
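/*
 * Putting the pieces above together, the forced-preemption capture flow is:
 * execlists_reset() freezes the ring, execlists_capture() quickly snapshots
 * the engine registers in atomic context and moves the guilty request onto
 * engine->active.hold, the engine is then reset and restarted without
 * replaying that request, and execlists_capture_work() later compresses the
 * attached objects, publishes the error state and finally unholds the
 * request so that it (and its dependents) can be signalled.
 */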
2915
2916 static void execlists_reset(struct intel_engine_cs *engine, const char *msg)
2917 {
2918         const unsigned int bit = I915_RESET_ENGINE + engine->id;
2919         unsigned long *lock = &engine->gt->reset.flags;
2920
2921         if (!intel_has_reset_engine(engine->gt))
2922                 return;
2923
2924         if (test_and_set_bit(bit, lock))
2925                 return;
2926
2927         ENGINE_TRACE(engine, "reset for %s\n", msg);
2928
2929         /* Mark this tasklet as disabled to avoid waiting for it to complete */
2930         tasklet_disable_nosync(&engine->execlists.tasklet);
2931
2932         ring_set_paused(engine, 1); /* Freeze the current request in place */
2933         if (execlists_capture(engine))
2934                 intel_engine_reset(engine, msg);
2935         else
2936                 ring_set_paused(engine, 0);
2937
2938         tasklet_enable(&engine->execlists.tasklet);
2939         clear_and_wake_up_bit(bit, lock);
2940 }
2941
2942 static bool preempt_timeout(const struct intel_engine_cs *const engine)
2943 {
2944         const struct timer_list *t = &engine->execlists.preempt;
2945
2946         if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT)
2947                 return false;
2948
2949         if (!timer_expired(t))
2950                 return false;
2951
2952         return READ_ONCE(engine->execlists.pending[0]);
2953 }
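/*
 * i.e. a preempt timeout is only declared when the preemption timer has
 * expired while an ELSP submission (pending[0]) is still outstanding,
 * meaning the HW has not acknowledged the preemption within the
 * configured window.
 */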
2954
2955 /*
2956  * Check the unread Context Status Buffers and manage the submission of new
2957  * contexts to the ELSP accordingly.
2958  */
2959 static void execlists_submission_tasklet(unsigned long data)
2960 {
2961         struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
2962         bool timeout = preempt_timeout(engine);
2963
2964         process_csb(engine);
2965
2966         if (unlikely(READ_ONCE(engine->execlists.error_interrupt))) {
2967                 engine->execlists.error_interrupt = 0;
2968                 if (ENGINE_READ(engine, RING_ESR)) /* confirm the error */
2969                         execlists_reset(engine, "CS error");
2970         }
2971
2972         if (!READ_ONCE(engine->execlists.pending[0]) || timeout) {
2973                 unsigned long flags;
2974
2975                 spin_lock_irqsave(&engine->active.lock, flags);
2976                 __execlists_submission_tasklet(engine);
2977                 spin_unlock_irqrestore(&engine->active.lock, flags);
2978
2979                 /* Recheck after serialising with direct-submission */
2980                 if (unlikely(timeout && preempt_timeout(engine)))
2981                         execlists_reset(engine, "preemption time out");
2982         }
2983 }
2984
2985 static void __execlists_kick(struct intel_engine_execlists *execlists)
2986 {
2987         /* Kick the tasklet for some interrupt coalescing and reset handling */
2988         tasklet_hi_schedule(&execlists->tasklet);
2989 }
2990
2991 #define execlists_kick(t, member) \
2992         __execlists_kick(container_of(t, struct intel_engine_execlists, member))
2993
2994 static void execlists_timeslice(struct timer_list *timer)
2995 {
2996         execlists_kick(timer, timer);
2997 }
2998
2999 static void execlists_preempt(struct timer_list *timer)
3000 {
3001         execlists_kick(timer, preempt);
3002 }
3003
3004 static void queue_request(struct intel_engine_cs *engine,
3005                           struct i915_request *rq)
3006 {
3007         GEM_BUG_ON(!list_empty(&rq->sched.link));
3008         list_add_tail(&rq->sched.link,
3009                       i915_sched_lookup_priolist(engine, rq_prio(rq)));
3010         set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
3011 }
3012
3013 static void __submit_queue_imm(struct intel_engine_cs *engine)
3014 {
3015         struct intel_engine_execlists * const execlists = &engine->execlists;
3016
3017         if (reset_in_progress(execlists))
3018                 return; /* defer until we restart the engine following reset */
3019
3020         /* Hopefully we clear execlists->pending[] to let us through */
3021         if (READ_ONCE(execlists->pending[0]) &&
3022             tasklet_trylock(&execlists->tasklet)) {
3023                 process_csb(engine);
3024                 tasklet_unlock(&execlists->tasklet);
3025         }
3026
3027         __execlists_submission_tasklet(engine);
3028 }
3029
3030 static void submit_queue(struct intel_engine_cs *engine,
3031                          const struct i915_request *rq)
3032 {
3033         struct intel_engine_execlists *execlists = &engine->execlists;
3034
3035         if (rq_prio(rq) <= execlists->queue_priority_hint)
3036                 return;
3037
3038         execlists->queue_priority_hint = rq_prio(rq);
3039         __submit_queue_imm(engine);
3040 }
3041
3042 static bool ancestor_on_hold(const struct intel_engine_cs *engine,
3043                              const struct i915_request *rq)
3044 {
3045         GEM_BUG_ON(i915_request_on_hold(rq));
3046         return !list_empty(&engine->active.hold) && hold_request(rq);
3047 }
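/*
 * Fast path: only walk the request's signalers (hold_request()) when
 * something is actually parked on engine->active.hold.
 */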
3048
3049 static void execlists_submit_request(struct i915_request *request)
3050 {
3051         struct intel_engine_cs *engine = request->engine;
3052         unsigned long flags;
3053
3054         /* Will be called from irq-context when using foreign fences. */
3055         spin_lock_irqsave(&engine->active.lock, flags);
3056
3057         if (unlikely(ancestor_on_hold(engine, request))) {
3058                 RQ_TRACE(request, "ancestor on hold\n");
3059                 list_add_tail(&request->sched.link, &engine->active.hold);
3060                 i915_request_set_hold(request);
3061         } else {
3062                 queue_request(engine, request);
3063
3064                 GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
3065                 GEM_BUG_ON(list_empty(&request->sched.link));
3066
3067                 submit_queue(engine, request);
3068         }
3069
3070         spin_unlock_irqrestore(&engine->active.lock, flags);
3071 }
3072
3073 static void __execlists_context_fini(struct intel_context *ce)
3074 {
3075         intel_ring_put(ce->ring);
3076         i915_vma_put(ce->state);
3077 }
3078
3079 static void execlists_context_destroy(struct kref *kref)
3080 {
3081         struct intel_context *ce = container_of(kref, typeof(*ce), ref);
3082
3083         GEM_BUG_ON(!i915_active_is_idle(&ce->active));
3084         GEM_BUG_ON(intel_context_is_pinned(ce));
3085
3086         if (ce->state)
3087                 __execlists_context_fini(ce);
3088
3089         intel_context_fini(ce);
3090         intel_context_free(ce);
3091 }
3092
3093 static void
3094 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
3095 {
3096         if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
3097                 return;
3098
3099         vaddr += engine->context_size;
3100
3101         memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
3102 }
3103
3104 static void
3105 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
3106 {
3107         if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
3108                 return;
3109
3110         vaddr += engine->context_size;
3111
3112         if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
3113                 drm_err_once(&engine->i915->drm,
3114                              "%s context redzone overwritten!\n",
3115                              engine->name);
3116 }
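/*
 * Redzone checking (CONFIG_DRM_I915_DEBUG_GEM only): set_redzone() fills the
 * GTT page immediately following engine->context_size with CONTEXT_REDZONE,
 * and check_redzone(), called from execlists_context_unpin() below, warns if
 * any of that poison has been overwritten, i.e. if something wrote past the
 * end of the context image.
 */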
3117
3118 static void execlists_context_unpin(struct intel_context *ce)
3119 {
3120         check_redzone((void *)ce->lrc_reg_state - LRC_STATE_PN * PAGE_SIZE,
3121                       ce->engine);
3122
3123         i915_gem_object_unpin_map(ce->state->obj);
3124 }
3125
3126 static void
3127 __execlists_update_reg_state(const struct intel_context *ce,
3128                              const struct intel_engine_cs *engine,
3129                              u32 head)
3130 {
3131         struct intel_ring *ring = ce->ring;
3132         u32 *regs = ce->lrc_reg_state;
3133
3134         GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
3135         GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
3136
3137         regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
3138         regs[CTX_RING_HEAD] = head;
3139         regs[CTX_RING_TAIL] = ring->tail;
3140         regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
3141
3142         /* RPCS */
3143         if (engine->class == RENDER_CLASS) {
3144                 regs[CTX_R_PWR_CLK_STATE] =
3145                         intel_sseu_make_rpcs(engine->i915, &ce->sseu);
3146
3147                 i915_oa_init_reg_state(ce, engine);
3148         }
3149 }
3150
3151 static int
3152 __execlists_context_pin(struct intel_context *ce,
3153                         struct intel_engine_cs *engine)
3154 {
3155         void *vaddr;
3156
3157         GEM_BUG_ON(!ce->state);
3158         GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
3159
3160         vaddr = i915_gem_object_pin_map(ce->state->obj,
3161                                         i915_coherent_map_type(engine->i915) |
3162                                         I915_MAP_OVERRIDE);
3163         if (IS_ERR(vaddr))
3164                 return PTR_ERR(vaddr);
3165
3166         ce->lrc_desc = lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE;
3167         ce->lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE;
3168         __execlists_update_reg_state(ce, engine, ce->ring->tail);
3169
3170         return 0;
3171 }
3172
3173 static int execlists_context_pin(struct intel_context *ce)
3174 {
3175         return __execlists_context_pin(ce, ce->engine);
3176 }
3177
3178 static int execlists_context_alloc(struct intel_context *ce)
3179 {
3180         return __execlists_context_alloc(ce, ce->engine);
3181 }
3182
3183 static void execlists_context_reset(struct intel_context *ce)
3184 {
3185         CE_TRACE(ce, "reset\n");
3186         GEM_BUG_ON(!intel_context_is_pinned(ce));
3187
3188         intel_ring_reset(ce->ring, ce->ring->emit);
3189
3190         /* Scrub away the garbage */
3191         execlists_init_reg_state(ce->lrc_reg_state,
3192                                  ce, ce->engine, ce->ring, true);
3193         __execlists_update_reg_state(ce, ce->engine, ce->ring->tail);
3194
3195         ce->lrc_desc |= CTX_DESC_FORCE_RESTORE;
3196 }
3197
3198 static const struct intel_context_ops execlists_context_ops = {
3199         .alloc = execlists_context_alloc,
3200
3201         .pin = execlists_context_pin,
3202         .unpin = execlists_context_unpin,
3203
3204         .enter = intel_context_enter_engine,
3205         .exit = intel_context_exit_engine,
3206
3207         .reset = execlists_context_reset,
3208         .destroy = execlists_context_destroy,
3209 };
3210
3211 static int gen8_emit_init_breadcrumb(struct i915_request *rq)
3212 {
3213         u32 *cs;
3214
3215         if (!i915_request_timeline(rq)->has_initial_breadcrumb)
3216                 return 0;
3217
3218         cs = intel_ring_begin(rq, 6);
3219         if (IS_ERR(cs))
3220                 return PTR_ERR(cs);
3221
3222         /*
3223          * Check if we have been preempted before we even get started.
3224          *
3225          * After this point i915_request_started() reports true, even if
3226          * we get preempted and so are no longer running.
3227          */
3228         *cs++ = MI_ARB_CHECK;
3229         *cs++ = MI_NOOP;
3230
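        /*
         * Store seqno-1 into the timeline's HWSP (via its GGTT offset): once
         * the CS has executed this write, the HWSP reports seqno-1 and
         * i915_request_started() returns true for this request.
         */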
3231         *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
3232         *cs++ = i915_request_timeline(rq)->hwsp_offset;
3233         *cs++ = 0;
3234         *cs++ = rq->fence.seqno - 1;
3235
3236         intel_ring_advance(rq, cs);
3237
3238         /* Record the updated position of the request's payload */
3239         rq->infix = intel_ring_offset(rq, cs);
3240
3241         return 0;
3242 }
3243
3244 static int execlists_request_alloc(struct i915_request *request)
3245 {
3246         int ret;
3247
3248         GEM_BUG_ON(!intel_context_is_pinned(request->context));
3249
3250         /*
3251          * Flush enough space to reduce the likelihood of waiting after
3252          * we start building the request - in which case we will just
3253          * have to repeat work.
3254          */
3255         request->reserved_space += EXECLISTS_REQUEST_SIZE;
3256
3257         /*
3258          * Note that after this point, we have committed to using
3259          * this request as it is being used to both track the
3260          * state of engine initialisation and liveness of the
3261          * golden renderstate above. Think twice before you try
3262          * to cancel/unwind this request now.
3263          */
3264
3265         /* Unconditionally invalidate GPU caches and TLBs. */
3266         ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
3267         if (ret)
3268                 return ret;
3269
3270         request->reserved_space -= EXECLISTS_REQUEST_SIZE;
3271         return 0;
3272 }
3273
3274 /*
3275  * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
3276  * PIPE_CONTROL instruction. This is required for the flush to happen correctly
3277  * but there is a slight complication as this is applied in WA batch where the
3278  * values are only initialized once so we cannot take register value at the
3279  * beginning and reuse it further; hence we save its value to memory, upload a
3280  * constant value with bit21 set and then we restore it back with the saved value.
3281  * To simplify the WA, a constant value is formed by using the default value
3282  * of this register. This shouldn't be a problem because we are only modifying
3283  * it for a short period and this batch is non-preemptible. We can of course
3284  * use additional instructions that read the actual value of the register
3285  * at that time and set our bit of interest but it makes the WA complicated.
3286  *
3287  * This WA is also required for Gen9 so extracting as a function avoids
3288  * code duplication.
3289  */
3290 static u32 *
3291 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
3292 {
3293         /* NB no one else is allowed to scribble over scratch + 256! */
3294         *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3295         *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3296         *batch++ = intel_gt_scratch_offset(engine->gt,
3297                                            INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3298         *batch++ = 0;
3299
3300         *batch++ = MI_LOAD_REGISTER_IMM(1);
3301         *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3302         *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
3303
3304         batch = gen8_emit_pipe_control(batch,
3305                                        PIPE_CONTROL_CS_STALL |
3306                                        PIPE_CONTROL_DC_FLUSH_ENABLE,
3307                                        0);
3308
3309         *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3310         *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3311         *batch++ = intel_gt_scratch_offset(engine->gt,
3312                                            INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3313         *batch++ = 0;
3314
3315         return batch;
3316 }
3317
3318 /*
3319  * Typically we only have one indirect_ctx and per_ctx batch buffer which are
3320  * initialized at the beginning and shared across all contexts, but this field
3321  * helps us to have multiple batches at different offsets and select them based
3322  * on a criterion. At the moment this batch always starts at the beginning of the page
3323  * and at this point we don't have multiple wa_ctx batch buffers.
3324  *
3325  * The number of WAs applied is not known at the beginning; we use this field
3326  * to return the number of DWORDs written.
3327  *
3328  * Note that this batch does not contain MI_BATCH_BUFFER_END,
3329  * so it adds NOOPs as padding to make it cacheline aligned.
3330  * MI_BATCH_BUFFER_END will be added to the per-ctx batch and the two together
3331  * make a complete batch buffer.
3332  */
3333 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3334 {
3335         /* WaDisableCtxRestoreArbitration:bdw,chv */
3336         *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3337
3338         /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
3339         if (IS_BROADWELL(engine->i915))
3340                 batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3341
3342         /* WaClearSlmSpaceAtContextSwitch:bdw,chv */
3343         /* Actual scratch location is at 128 bytes offset */
3344         batch = gen8_emit_pipe_control(batch,
3345                                        PIPE_CONTROL_FLUSH_L3 |
3346                                        PIPE_CONTROL_STORE_DATA_INDEX |
3347                                        PIPE_CONTROL_CS_STALL |
3348                                        PIPE_CONTROL_QW_WRITE,
3349                                        LRC_PPHWSP_SCRATCH_ADDR);
3350
3351         *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3352
3353         /* Pad to end of cacheline */
3354         while ((unsigned long)batch % CACHELINE_BYTES)
3355                 *batch++ = MI_NOOP;
3356
3357         /*
3358          * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
3359          * execution depends on the length specified in terms of cache lines
3360          * in the register CTX_RCS_INDIRECT_CTX
3361          */
3362
3363         return batch;
3364 }
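/*
 * Resulting layout of the gen8 indirect-context batch built above:
 * MI_ARB_ON_OFF (disable), the BDW-only L3SQCREG4 save/set/flush/restore
 * sequence, a PIPE_CONTROL clearing SLM via the PPHWSP scratch slot,
 * MI_ARB_ON_OFF (enable), then MI_NOOP padding up to the next cacheline.
 * The terminating MI_BATCH_BUFFER_END lives in the per-ctx batch instead,
 * as noted in the comment preceding this function.
 */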
3365
3366 struct lri {
3367         i915_reg_t reg;
3368         u32 value;
3369 };
3370
3371 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
3372 {
3373         GEM_BUG_ON(!count || count > 63);
3374
3375         *batch++ = MI_LOAD_REGISTER_IMM(count);
3376         do {
3377                 *batch++ = i915_mmio_reg_offset(lri->reg);
3378                 *batch++ = lri->value;
3379         } while (lri++, --count);
3380         *batch++ = MI_NOOP;
3381
3382         return batch;
3383 }
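/*
 * Illustrative expansion (not part of the driver): emit_lri(batch, lri, 2)
 * emits
 *
 *   MI_LOAD_REGISTER_IMM(2)
 *   <offset of lri[0].reg> <lri[0].value>
 *   <offset of lri[1].reg> <lri[1].value>
 *   MI_NOOP
 *
 * i.e. 2 * count + 2 dwords in total.
 */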
3384
3385 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3386 {
3387         static const struct lri lri[] = {
3388                 /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
3389                 {
3390                         COMMON_SLICE_CHICKEN2,
3391                         __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
3392                                        0),
3393                 },
3394
3395                 /* BSpec: 11391 */
3396                 {
3397                         FF_SLICE_CHICKEN,
3398                         __MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
3399                                        FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
3400                 },
3401
3402                 /* BSpec: 11299 */
3403                 {
3404                         _3D_CHICKEN3,
3405                         __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
3406                                        _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
3407                 }
3408         };
3409
3410         *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3411
3412         /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
3413         batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3414
3415         /* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
3416         batch = gen8_emit_pipe_control(batch,
3417                                        PIPE_CONTROL_FLUSH_L3 |
3418                                        PIPE_CONTROL_STORE_DATA_INDEX |
3419                                        PIPE_CONTROL_CS_STALL |
3420                                        PIPE_CONTROL_QW_WRITE,
3421                                        LRC_PPHWSP_SCRATCH_ADDR);
3422
3423         batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
3424
3425         /* WaMediaPoolStateCmdInWABB:bxt,glk */
3426         if (HAS_POOLED_EU(engine->i915)) {
3427                 /*
3428                  * EU pool configuration is set up along with the golden context
3429                  * during context initialization. This value depends on the
3430                  * device type (2x6 or 3x6) and needs to be updated based
3431                  * on which subslice is disabled, especially for 2x6
3432                  * devices; however, it is safe to load the default
3433                  * configuration of a 3x6 device instead of masking off the
3434                  * corresponding bits, because the HW ignores the bits of a disabled
3435                  * subslice and drops down to the appropriate config. Please
3436                  * see render_state_setup() in i915_gem_render_state.c for the
3437                  * possible configurations; to avoid duplication they are
3438                  * not shown here again.
3439                  */
3440                 *batch++ = GEN9_MEDIA_POOL_STATE;
3441                 *batch++ = GEN9_MEDIA_POOL_ENABLE;
3442                 *batch++ = 0x00777000;
3443                 *batch++ = 0;
3444                 *batch++ = 0;
3445                 *batch++ = 0;
3446         }
3447
3448         *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3449
3450         /* Pad to end of cacheline */
3451         while ((unsigned long)batch % CACHELINE_BYTES)
3452                 *batch++ = MI_NOOP;
3453
3454         return batch;
3455 }
3456
3457 static u32 *
3458 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3459 {
3460         int i;
3461
3462         /*
3463          * WaPipeControlBefore3DStateSamplePattern: cnl
3464          *
3465          * Ensure the engine is idle prior to programming a
3466          * 3DSTATE_SAMPLE_PATTERN during a context restore.
3467          */
3468         batch = gen8_emit_pipe_control(batch,
3469                                        PIPE_CONTROL_CS_STALL,
3470                                        0);
3471         /*
3472          * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
3473          * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
3474          * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
3475          * confusing. Since gen8_emit_pipe_control() already advances the
3476          * batch by 6 dwords, we advance the other 10 here, completing a
3477          * cacheline. It's not clear if the workaround requires this padding
3478          * before other commands, or if it's just the regular padding we would
3479          * already have for the workaround bb, so leave it here for now.
3480          */
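        /* 6 dwords (PIPE_CONTROL) + 10 MI_NOOPs = 16 dwords = 64 bytes. */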
3481         for (i = 0; i < 10; i++)
3482                 *batch++ = MI_NOOP;
3483
3484         /* Pad to end of cacheline */
3485         while ((unsigned long)batch % CACHELINE_BYTES)
3486                 *batch++ = MI_NOOP;
3487
3488         return batch;
3489 }
3490
3491 #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
3492
3493 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
3494 {
3495         struct drm_i915_gem_object *obj;
3496         struct i915_vma *vma;
3497         int err;
3498
3499         obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE);
3500         if (IS_ERR(obj))
3501                 return PTR_ERR(obj);
3502
3503         vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
3504         if (IS_ERR(vma)) {
3505                 err = PTR_ERR(vma);
3506                 goto err;
3507         }
3508
3509         err = i915_ggtt_pin(vma, 0, PIN_HIGH);
3510         if (err)
3511                 goto err;
3512
3513         engine->wa_ctx.vma = vma;
3514         return 0;
3515
3516 err:
3517         i915_gem_object_put(obj);
3518         return err;
3519 }
3520
3521 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine)
3522 {
3523         i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
3524 }
3525
3526 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
3527
3528 static int intel_init_workaround_bb(struct intel_engine_cs *engine)
3529 {
3530         struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
3531         struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx,
3532                                             &wa_ctx->per_ctx };
3533         wa_bb_func_t wa_bb_fn[2];
3534         struct page *page;
3535         void *batch, *batch_ptr;
3536         unsigned int i;
3537         int ret;
3538
3539         if (engine->class != RENDER_CLASS)
3540                 return 0;
3541
3542         switch (INTEL_GEN(engine->i915)) {
3543         case 12:
3544         case 11:
3545                 return 0;
3546         case 10:
3547                 wa_bb_fn[0] = gen10_init_indirectctx_bb;
3548                 wa_bb_fn[1] = NULL;
3549                 break;
3550         case 9:
3551                 wa_bb_fn[0] = gen9_init_indirectctx_bb;
3552                 wa_bb_fn[1] = NULL;
3553                 break;
3554         case 8:
3555                 wa_bb_fn[0] = gen8_init_indirectctx_bb;
3556                 wa_bb_fn[1] = NULL;
3557                 break;
3558         default:
3559                 MISSING_CASE(INTEL_GEN(engine->i915));
3560                 return 0;
3561         }
3562
3563         ret = lrc_setup_wa_ctx(engine);
3564         if (ret) {
3565                 drm_dbg(&engine->i915->drm,
3566                         "Failed to setup context WA page: %d\n", ret);
3567                 return ret;
3568         }
3569
3570         page = i915_gem_object_get_dirty_page(wa_ctx->vma->obj, 0);
3571         batch = batch_ptr = kmap_atomic(page);
3572
3573         /*
3574          * Emit the two workaround batch buffers, recording the offset from the
3575          * start of the workaround batch buffer object for each and their
3576          * respective sizes.
3577          */
3578         for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
3579                 wa_bb[i]->offset = batch_ptr - batch;
3580                 if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
3581                                                   CACHELINE_BYTES))) {
3582                         ret = -EINVAL;
3583                         break;
3584                 }
3585                 if (wa_bb_fn[i])
3586                         batch_ptr = wa_bb_fn[i](engine, batch_ptr);
3587                 wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
3588         }
3589
3590         BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
3591
3592         kunmap_atomic(batch);
3593         if (ret)
3594                 lrc_destroy_wa_ctx(engine);
3595
3596         return ret;
3597 }
3598
3599 static void reset_csb_pointers(struct intel_engine_cs *engine)
3600 {
3601         struct intel_engine_execlists * const execlists = &engine->execlists;
3602         const unsigned int reset_value = execlists->csb_size - 1;
3603
3604         ring_set_paused(engine, 0);
3605
3606         /*
3607          * After a reset, the HW starts writing into CSB entry [0]. We
3608          * therefore have to set our HEAD pointer back one entry so that
3609          * the *first* entry we check is entry 0. To complicate this further,
3610          * as we don't wait for the first interrupt after reset, we have to
3611          * fake the HW write to point back to the last entry so that our
3612          * inline comparison of our cached head position against the last HW
3613          * write works even before the first interrupt.
3614          */
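        /*
         * For example, with csb_size == 12 the reset_value is 11: both our
         * cached head and the faked HW write pointer start at entry 11, so
         * the first entry process_csb() consumes after the reset is entry 0.
         */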
3615         execlists->csb_head = reset_value;
3616         WRITE_ONCE(*execlists->csb_write, reset_value);
3617         wmb(); /* Make sure this is visible to HW (paranoia?) */
3618
3619         /*
3620          * Sometimes Icelake forgets to reset its pointers on a GPU reset.
3621          * Bludgeon them with a mmio update to be sure.
3622          */
3623         ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
3624                      reset_value << 8 | reset_value);
3625         ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
3626
3627         invalidate_csb_entries(&execlists->csb_status[0],
3628                                &execlists->csb_status[reset_value]);
3629 }
3630
3631 static void execlists_sanitize(struct intel_engine_cs *engine)
3632 {
3633         /*
3634          * Poison residual state on resume, in case the suspend didn't!
3635          *
3636          * We have to assume that across suspend/resume (or other loss
3637          * of control) that the contents of our pinned buffers has been
3638          * lost, replaced by garbage. Since this doesn't always happen,
3639          * let's poison such state so that we more quickly spot when
3640          * we falsely assume it has been preserved.
3641          */
3642         if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
3643                 memset(engine->status_page.addr, POISON_INUSE, PAGE_SIZE);
3644
3645         reset_csb_pointers(engine);
3646
3647         /*
3648          * The kernel_context HWSP is stored in the status_page. As above,
3649          * that may be lost on resume/initialisation, and so we need to
3650          * reset the value in the HWSP.
3651          */
3652         intel_timeline_reset_seqno(engine->kernel_context->timeline);
3653 }
3654
3655 static void enable_error_interrupt(struct intel_engine_cs *engine)
3656 {
3657         u32 status;
3658
3659         engine->execlists.error_interrupt = 0;
3660         ENGINE_WRITE(engine, RING_EMR, ~0u);
3661         ENGINE_WRITE(engine, RING_EIR, ~0u); /* clear all existing errors */
3662
3663         status = ENGINE_READ(engine, RING_ESR);
3664         if (unlikely(status)) {
3665                 drm_err(&engine->i915->drm,
3666                         "engine '%s' resumed still in error: %08x\n",
3667                         engine->name, status);
3668                 __intel_gt_reset(engine->gt, engine->mask);
3669         }
3670
3671         /*
3672          * On current gen8+, we have 2 signals to play with
3673          *
3674          * - I915_ERROR_INSTRUCTION (bit 0)
3675          *
3676          *    Generate an error if the command parser encounters an invalid
3677          *    instruction
3678          *
3679          *    This is a fatal error.
3680          *
3681          * - CP_PRIV (bit 2)
3682          *
3683          *    Generate an error on privilege violation (where the CP replaces
3684          *    the instruction with a no-op). This also fires for writes into
3685          *    read-only scratch pages.
3686          *
3687          *    This is a non-fatal error, parsing continues.
3688          *
3689          * * there are a few others defined for odd HW that we do not use
3690          *
3691          * Since CP_PRIV fires for cases where we have chosen to ignore the
3692          * error (as the HW is validating and suppressing the mistakes), we
3693          * only unmask the instruction error bit.
3694          */
3695         ENGINE_WRITE(engine, RING_EMR, ~I915_ERROR_INSTRUCTION);
3696 }
3697
3698 static void enable_execlists(struct intel_engine_cs *engine)
3699 {
3700         u32 mode;
3701
3702         assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL);
3703
3704         intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */
3705
3706         if (INTEL_GEN(engine->i915) >= 11)
3707                 mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE);
3708         else
3709                 mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE);
3710         ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode);
3711
3712         ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
3713
3714         ENGINE_WRITE_FW(engine,
3715                         RING_HWS_PGA,
3716                         i915_ggtt_offset(engine->status_page.vma));
3717         ENGINE_POSTING_READ(engine, RING_HWS_PGA);
3718
3719         enable_error_interrupt(engine);
3720
3721         engine->context_tag = 0;
3722 }
3723
3724 static bool unexpected_starting_state(struct intel_engine_cs *engine)
3725 {
3726         bool unexpected = false;
3727
3728         if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) {
3729                 drm_dbg(&engine->i915->drm,
3730                         "STOP_RING still set in RING_MI_MODE\n");
3731                 unexpected = true;
3732         }
3733
3734         return unexpected;
3735 }
3736
3737 static int execlists_resume(struct intel_engine_cs *engine)
3738 {
3739         intel_mocs_init_engine(engine);
3740
3741         intel_engine_reset_breadcrumbs(engine);
3742
3743         if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) {
3744                 struct drm_printer p = drm_debug_printer(__func__);
3745
3746                 intel_engine_dump(engine, &p, NULL);
3747         }
3748
3749         enable_execlists(engine);
3750
3751         return 0;
3752 }
3753
3754 static void execlists_reset_prepare(struct intel_engine_cs *engine)
3755 {
3756         struct intel_engine_execlists * const execlists = &engine->execlists;
3757         unsigned long flags;
3758
3759         ENGINE_TRACE(engine, "depth<-%d\n",
3760                      atomic_read(&execlists->tasklet.count));
3761
3762         /*
3763          * Prevent request submission to the hardware until we have
3764          * completed the reset in i915_gem_reset_finish(). If a request
3765          * is completed by one engine, it may then queue a request
3766          * to a second via its execlists->tasklet *just* as we are
3767          * calling engine->resume() and also writing the ELSP.
3768          * Turning off the execlists->tasklet until the reset is over
3769          * prevents the race.
3770          */
3771         __tasklet_disable_sync_once(&execlists->tasklet);
3772         GEM_BUG_ON(!reset_in_progress(execlists));
3773
3774         /* And flush any current direct submission. */
3775         spin_lock_irqsave(&engine->active.lock, flags);
3776         spin_unlock_irqrestore(&engine->active.lock, flags);
3777
3778         /*
3779          * We stop engines, otherwise we might get failed reset and a
3780          * dead gpu (on elk). Also a modern gpu such as kbl can suffer
3781          * from a system hang if a batchbuffer is progressing when
3782          * the reset is issued, regardless of READY_TO_RESET ack.
3783          * Thus assume it is best to stop engines on all gens
3784          * where we have a gpu reset.
3785          *
3786          * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
3787          *
3788          * FIXME: Wa for more modern gens needs to be validated
3789          */
3790         ring_set_paused(engine, 1);
3791         intel_engine_stop_cs(engine);
3792 }
3793
3794 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
3795 {
3796         int x;
3797
3798         x = lrc_ring_mi_mode(engine);
3799         if (x != -1) {
3800                 regs[x + 1] &= ~STOP_RING;
3801                 regs[x + 1] |= STOP_RING << 16;
3802         }
3803 }
3804
3805 static void __execlists_reset_reg_state(const struct intel_context *ce,
3806                                         const struct intel_engine_cs *engine)
3807 {
3808         u32 *regs = ce->lrc_reg_state;
3809
3810         __reset_stop_ring(regs, engine);
3811 }
3812
3813 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
3814 {
3815         struct intel_engine_execlists * const execlists = &engine->execlists;
3816         struct intel_context *ce;
3817         struct i915_request *rq;
3818         u32 head;
3819
3820         mb(); /* paranoia: read the CSB pointers from after the reset */
3821         clflush(execlists->csb_write);
3822         mb();
3823
3824         process_csb(engine); /* drain preemption events */
3825
3826         /* Following the reset, we need to reload the CSB read/write pointers */
3827         reset_csb_pointers(engine);
3828
3829         /*
3830          * Save the currently executing context, even if we completed
3831          * its request, it was still running at the time of the
3832          * reset and will have been clobbered.
3833          */
3834         rq = execlists_active(execlists);
3835         if (!rq)
3836                 goto unwind;
3837
3838         ce = rq->context;
3839         GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
3840
3841         if (i915_request_completed(rq)) {
3842                 /* Idle context; tidy up the ring so we can restart afresh */
3843                 head = intel_ring_wrap(ce->ring, rq->tail);
3844                 goto out_replay;
3845         }
3846
3847         /* We still have requests in-flight; the engine should be active */
3848         GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
3849
3850         /* Context has requests still in-flight; it should not be idle! */
3851         GEM_BUG_ON(i915_active_is_idle(&ce->active));
3852
3853         rq = active_request(ce->timeline, rq);
3854         head = intel_ring_wrap(ce->ring, rq->head);
3855         GEM_BUG_ON(head == ce->ring->tail);
3856
3857         /*
3858          * If this request hasn't started yet, e.g. it is waiting on a
3859          * semaphore, we need to avoid skipping the request or else we
3860          * break the signaling chain. However, if the context is corrupt
3861          * the request will not restart and we will be stuck with a wedged
3862          * device. It is quite often the case that if we issue a reset
3863          * while the GPU is loading the context image, that the context
3864          * image becomes corrupt.
3865          *
3866          * Otherwise, if we have not started yet, the request should replay
3867          * perfectly and we do not need to flag the result as being erroneous.
3868          */
3869         if (!i915_request_started(rq))
3870                 goto out_replay;
3871
3872         /*
3873          * If the request was innocent, we leave the request in the ELSP
3874          * and will try to replay it on restarting. The context image may
3875          * have been corrupted by the reset, in which case we may have
3876          * to service a new GPU hang, but more likely we can continue on
3877          * without impact.
3878          *
3879          * If the request was guilty, we presume the context is corrupt
3880          * and have to at least restore the RING register in the context
3881          * image back to the expected values to skip over the guilty request.
3882          */
3883         __i915_request_reset(rq, stalled);
3884         if (!stalled)
3885                 goto out_replay;
3886
3887         /*
3888          * We want a simple context + ring to execute the breadcrumb update.
3889          * We cannot rely on the context being intact across the GPU hang,
3890          * so clear it and rebuild just what we need for the breadcrumb.
3891          * All pending requests for this context will be zapped, and any
3892          * future request will be after userspace has had the opportunity
3893          * to recreate its own state.
3894          */
3895         GEM_BUG_ON(!intel_context_is_pinned(ce));
3896         restore_default_state(ce, engine);
3897
3898 out_replay:
3899         ENGINE_TRACE(engine, "replay {head:%04x, tail:%04x}\n",
3900                      head, ce->ring->tail);
3901         __execlists_reset_reg_state(ce, engine);
3902         __execlists_update_reg_state(ce, engine, head);
3903         ce->lrc_desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */
3904
3905 unwind:
3906         /* Push back any incomplete requests for replay after the reset. */
3907         cancel_port_requests(execlists);
3908         __unwind_incomplete_requests(engine);
3909 }
3910
3911 static void execlists_reset_rewind(struct intel_engine_cs *engine, bool stalled)
3912 {
3913         unsigned long flags;
3914
3915         ENGINE_TRACE(engine, "\n");
3916
3917         spin_lock_irqsave(&engine->active.lock, flags);
3918
3919         __execlists_reset(engine, stalled);
3920
3921         spin_unlock_irqrestore(&engine->active.lock, flags);
3922 }
3923
3924 static void nop_submission_tasklet(unsigned long data)
3925 {
3926         struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
3927
3928         /* The driver is wedged; don't process any more events. */
3929         WRITE_ONCE(engine->execlists.queue_priority_hint, INT_MIN);
3930 }
3931
3932 static void execlists_reset_cancel(struct intel_engine_cs *engine)
3933 {
3934         struct intel_engine_execlists * const execlists = &engine->execlists;
3935         struct i915_request *rq, *rn;
3936         struct rb_node *rb;
3937         unsigned long flags;
3938
3939         ENGINE_TRACE(engine, "\n");
3940
3941         /*
3942          * Before we call engine->cancel_requests(), we should have exclusive
3943          * access to the submission state. This is arranged for us by the
3944          * caller disabling the interrupt generation, the tasklet and other
3945          * threads that may then access the same state, giving us a free hand
3946          * to reset state. However, we still need to let lockdep be aware that
3947          * we know this state may be accessed in hardirq context, so we
3948          * disable the irq around this manipulation and we want to keep
3949          * the spinlock focused on its duties and not accidentally conflate
3950          * coverage to the submission's irq state. (Similarly, although we
3951          * shouldn't need to disable irq around the manipulation of the
3952          * submission's irq state, we also wish to remind ourselves that
3953          * it is irq state.)
3954          */
3955         spin_lock_irqsave(&engine->active.lock, flags);
3956
3957         __execlists_reset(engine, true);
3958
3959         /* Mark all executing requests as skipped. */
3960         list_for_each_entry(rq, &engine->active.requests, sched.link)
3961                 mark_eio(rq);
3962
3963         /* Flush the queued requests to the timeline list (for retiring). */
3964         while ((rb = rb_first_cached(&execlists->queue))) {
3965                 struct i915_priolist *p = to_priolist(rb);
3966                 int i;
3967
3968                 priolist_for_each_request_consume(rq, rn, p, i) {
3969                         mark_eio(rq);
3970                         __i915_request_submit(rq);
3971                 }
3972
3973                 rb_erase_cached(&p->node, &execlists->queue);
3974                 i915_priolist_free(p);
3975         }
3976
3977         /* On-hold requests will be flushed to timeline upon their release */
3978         list_for_each_entry(rq, &engine->active.hold, sched.link)
3979                 mark_eio(rq);
3980
3981         /* Cancel all attached virtual engines */
3982         while ((rb = rb_first_cached(&execlists->virtual))) {
3983                 struct virtual_engine *ve =
3984                         rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
3985
3986                 rb_erase_cached(rb, &execlists->virtual);
3987                 RB_CLEAR_NODE(rb);
3988
3989                 spin_lock(&ve->base.active.lock);
3990                 rq = fetch_and_zero(&ve->request);
3991                 if (rq) {
3992                         mark_eio(rq);
3993
3994                         rq->engine = engine;
3995                         __i915_request_submit(rq);
3996                         i915_request_put(rq);
3997
3998                         ve->base.execlists.queue_priority_hint = INT_MIN;
3999                 }
4000                 spin_unlock(&ve->base.active.lock);
4001         }
4002
4003         /* Remaining _unready_ requests will be nop'ed when submitted */
4004
4005         execlists->queue_priority_hint = INT_MIN;
4006         execlists->queue = RB_ROOT_CACHED;
4007
4008         GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet));
4009         execlists->tasklet.func = nop_submission_tasklet;
4010
4011         spin_unlock_irqrestore(&engine->active.lock, flags);
4012 }
4013
4014 static void execlists_reset_finish(struct intel_engine_cs *engine)
4015 {
4016         struct intel_engine_execlists * const execlists = &engine->execlists;
4017
4018         /*
4019          * After a GPU reset, we may have requests to replay. Do so now while
4020          * we still have the forcewake to be sure that the GPU is not allowed
4021          * to sleep before we restart and reload a context.
4022          */
4023         GEM_BUG_ON(!reset_in_progress(execlists));
4024         if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
4025                 execlists->tasklet.func(execlists->tasklet.data);
4026
4027         if (__tasklet_enable(&execlists->tasklet))
4028                 /* And kick in case we missed a new request submission. */
4029                 tasklet_hi_schedule(&execlists->tasklet);
4030         ENGINE_TRACE(engine, "depth->%d\n",
4031                      atomic_read(&execlists->tasklet.count));
4032 }
4033
4034 static int gen8_emit_bb_start_noarb(struct i915_request *rq,
4035                                     u64 offset, u32 len,
4036                                     const unsigned int flags)
4037 {
4038         u32 *cs;
4039
4040         cs = intel_ring_begin(rq, 4);
4041         if (IS_ERR(cs))
4042                 return PTR_ERR(cs);
4043
4044         /*
4045          * WaDisableCtxRestoreArbitration:bdw,chv
4046          *
4047          * We don't need to perform MI_ARB_ENABLE as often as we do (in
4048          * particular on all the gens that do not need the w/a at all!); if we
4049          * took care to make sure that on every switch into this context
4050          * (both ordinary and for preemption) arbitration was enabled,
4051          * we would be fine.  However, for gen8 there is another w/a that
4052          * requires us to not preempt inside GPGPU execution, so we keep
4053          * arbitration disabled for gen8 batches. Arbitration will be
4054          * re-enabled before we close the request
4055          * (engine->emit_fini_breadcrumb).
4056          */
4057         *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
4058
4059         /* FIXME(BDW+): Address space and security selectors. */
4060         *cs++ = MI_BATCH_BUFFER_START_GEN8 |
4061                 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
4062         *cs++ = lower_32_bits(offset);
4063         *cs++ = upper_32_bits(offset);
4064
4065         intel_ring_advance(rq, cs);
4066
4067         return 0;
4068 }
4069
4070 static int gen8_emit_bb_start(struct i915_request *rq,
4071                               u64 offset, u32 len,
4072                               const unsigned int flags)
4073 {
4074         u32 *cs;
4075
4076         cs = intel_ring_begin(rq, 6);
4077         if (IS_ERR(cs))
4078                 return PTR_ERR(cs);
4079
4080         *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4081
4082         *cs++ = MI_BATCH_BUFFER_START_GEN8 |
4083                 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
4084         *cs++ = lower_32_bits(offset);
4085         *cs++ = upper_32_bits(offset);
4086
4087         *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
4088         *cs++ = MI_NOOP;
4089
4090         intel_ring_advance(rq, cs);
4091
4092         return 0;
4093 }
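/*
 * Contrast with gen8_emit_bb_start_noarb() above: here arbitration is
 * enabled only for the duration of the batch and disabled again afterwards,
 * whereas the noarb variant keeps arbitration off across the whole batch to
 * honour the gen8 GPGPU restriction described in its comment.
 */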
4094
4095 static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine)
4096 {
4097         ENGINE_WRITE(engine, RING_IMR,
4098                      ~(engine->irq_enable_mask | engine->irq_keep_mask));
4099         ENGINE_POSTING_READ(engine, RING_IMR);
4100 }
4101
4102 static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
4103 {
4104         ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
4105 }
4106
4107 static int gen8_emit_flush(struct i915_request *request, u32 mode)
4108 {
4109         u32 cmd, *cs;
4110
4111         cs = intel_ring_begin(request, 4);
4112         if (IS_ERR(cs))
4113                 return PTR_ERR(cs);
4114
4115         cmd = MI_FLUSH_DW + 1;
4116
4117         /* We always require a command barrier so that subsequent
4118          * commands, such as breadcrumb interrupts, are strictly ordered
4119          * wrt the contents of the write cache being flushed to memory
4120          * (and thus being coherent from the CPU).
4121          */
4122         cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
4123
4124         if (mode & EMIT_INVALIDATE) {
4125                 cmd |= MI_INVALIDATE_TLB;
4126                 if (request->engine->class == VIDEO_DECODE_CLASS)
4127                         cmd |= MI_INVALIDATE_BSD;
4128         }
4129
4130         *cs++ = cmd;
4131         *cs++ = LRC_PPHWSP_SCRATCH_ADDR;
4132         *cs++ = 0; /* upper addr */
4133         *cs++ = 0; /* value */
4134         intel_ring_advance(request, cs);
4135
4136         return 0;
4137 }
4138
4139 static int gen8_emit_flush_render(struct i915_request *request,
4140                                   u32 mode)
4141 {
4142         bool vf_flush_wa = false, dc_flush_wa = false;
4143         u32 *cs, flags = 0;
4144         int len;
4145
4146         flags |= PIPE_CONTROL_CS_STALL;
4147
4148         if (mode & EMIT_FLUSH) {
4149                 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4150                 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4151                 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4152                 flags |= PIPE_CONTROL_FLUSH_ENABLE;
4153         }
4154
4155         if (mode & EMIT_INVALIDATE) {
4156                 flags |= PIPE_CONTROL_TLB_INVALIDATE;
4157                 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4158                 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4159                 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4160                 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4161                 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4162                 flags |= PIPE_CONTROL_QW_WRITE;
4163                 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4164
4165                 /*
4166                  * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
4167                  * pipe control.
4168                  */
4169                 if (IS_GEN(request->i915, 9))
4170                         vf_flush_wa = true;
4171
4172                 /* WaForGAMHang:kbl */
4173                 if (IS_KBL_REVID(request->i915, 0, KBL_REVID_B0))
4174                         dc_flush_wa = true;
4175         }
4176
4177         len = 6;
4178
4179         if (vf_flush_wa)
4180                 len += 6;
4181
4182         if (dc_flush_wa)
4183                 len += 12;
4184
4185         cs = intel_ring_begin(request, len);
4186         if (IS_ERR(cs))
4187                 return PTR_ERR(cs);
4188
4189         if (vf_flush_wa)
4190                 cs = gen8_emit_pipe_control(cs, 0, 0);
4191
4192         if (dc_flush_wa)
4193                 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
4194                                             0);
4195
4196         cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4197
4198         if (dc_flush_wa)
4199                 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);
4200
4201         intel_ring_advance(request, cs);
4202
4203         return 0;
4204 }
4205
4206 static int gen11_emit_flush_render(struct i915_request *request,
4207                                    u32 mode)
4208 {
4209         if (mode & EMIT_FLUSH) {
4210                 u32 *cs;
4211                 u32 flags = 0;
4212
4213                 flags |= PIPE_CONTROL_CS_STALL;
4214
4215                 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
4216                 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4217                 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4218                 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4219                 flags |= PIPE_CONTROL_FLUSH_ENABLE;
4220                 flags |= PIPE_CONTROL_QW_WRITE;
4221                 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4222
4223                 cs = intel_ring_begin(request, 6);
4224                 if (IS_ERR(cs))
4225                         return PTR_ERR(cs);
4226
4227                 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4228                 intel_ring_advance(request, cs);
4229         }
4230
4231         if (mode & EMIT_INVALIDATE) {
4232                 u32 *cs;
4233                 u32 flags = 0;
4234
4235                 flags |= PIPE_CONTROL_CS_STALL;
4236
4237                 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
4238                 flags |= PIPE_CONTROL_TLB_INVALIDATE;
4239                 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4240                 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4241                 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4242                 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4243                 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4244                 flags |= PIPE_CONTROL_QW_WRITE;
4245                 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4246
4247                 cs = intel_ring_begin(request, 6);
4248                 if (IS_ERR(cs))
4249                         return PTR_ERR(cs);
4250
4251                 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4252                 intel_ring_advance(request, cs);
4253         }
4254
4255         return 0;
4256 }
4257
4258 static u32 preparser_disable(bool state)
4259 {
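        /*
         * Gen12 MI_ARB_CHECK doubles as the pre-parser control: bit 0 holds
         * the requested disable state and bit 8 is the masked-bit write
         * enable for it.
         */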
4260         return MI_ARB_CHECK | 1 << 8 | state;
4261 }
4262
4263 static int gen12_emit_flush_render(struct i915_request *request,
4264                                    u32 mode)
4265 {
4266         if (mode & EMIT_FLUSH) {
4267                 u32 flags = 0;
4268                 u32 *cs;
4269
4270                 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
4271                 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4272                 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4273                 /* Wa_1409600907:tgl */
4274                 flags |= PIPE_CONTROL_DEPTH_STALL;
4275                 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4276                 flags |= PIPE_CONTROL_FLUSH_ENABLE;
4277                 flags |= PIPE_CONTROL_HDC_PIPELINE_FLUSH;
4278
4279                 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4280                 flags |= PIPE_CONTROL_QW_WRITE;
4281
4282                 flags |= PIPE_CONTROL_CS_STALL;
4283
4284                 cs = intel_ring_begin(request, 6);
4285                 if (IS_ERR(cs))
4286                         return PTR_ERR(cs);
4287
4288                 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4289                 intel_ring_advance(request, cs);
4290         }
4291
4292         if (mode & EMIT_INVALIDATE) {
4293                 u32 flags = 0;
4294                 u32 *cs;
4295
4296                 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
4297                 flags |= PIPE_CONTROL_TLB_INVALIDATE;
4298                 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4299                 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4300                 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4301                 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4302                 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4303                 flags |= PIPE_CONTROL_L3_RO_CACHE_INVALIDATE;
4304
4305                 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4306                 flags |= PIPE_CONTROL_QW_WRITE;
4307
4308                 flags |= PIPE_CONTROL_CS_STALL;
4309
4310                 cs = intel_ring_begin(request, 8);
4311                 if (IS_ERR(cs))
4312                         return PTR_ERR(cs);
4313
4314                 /*
4315                  * Prevent the pre-parser from skipping past the TLB
4316                  * invalidate and loading a stale page for the batch
4317                  * buffer / request payload.
4318                  */
4319                 *cs++ = preparser_disable(true);
4320
4321                 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4322
4323                 *cs++ = preparser_disable(false);
4324                 intel_ring_advance(request, cs);
4325         }
4326
4327         return 0;
4328 }
4329
4330 /*
4331  * Reserve space for 2 NOOPs at the end of each request to be
4332  * used as a workaround for not being allowed to do lite
4333  * restore with HEAD==TAIL (WaIdleLiteRestore).
4334  */
4335 static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs)
4336 {
4337         /* Ensure there's always at least one preemption point per-request. */
4338         *cs++ = MI_ARB_CHECK;
4339         *cs++ = MI_NOOP;
4340         request->wa_tail = intel_ring_offset(request, cs);
4341
4342         return cs;
4343 }
4344
4345 static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs)
4346 {
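        /*
         * Poll the preempt dword in the HWSP until it reads zero; the
         * driver stalls or releases the ring by flipping that value
         * (see ring_set_paused()).
         */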
4347         *cs++ = MI_SEMAPHORE_WAIT |
4348                 MI_SEMAPHORE_GLOBAL_GTT |
4349                 MI_SEMAPHORE_POLL |
4350                 MI_SEMAPHORE_SAD_EQ_SDD;
4351         *cs++ = 0;
4352         *cs++ = intel_hws_preempt_address(request->engine);
4353         *cs++ = 0;
4354
4355         return cs;
4356 }
4357
4358 static __always_inline u32 *
4359 gen8_emit_fini_breadcrumb_footer(struct i915_request *request,
4360                                  u32 *cs)
4361 {
4362         *cs++ = MI_USER_INTERRUPT;
4363
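        /*
         * Re-enable arbitration (disabled at the tail of emit_bb_start) so
         * the CS may switch contexts once the breadcrumb has been written.
         */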
4364         *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4365         if (intel_engine_has_semaphores(request->engine))
4366                 cs = emit_preempt_busywait(request, cs);
4367
4368         request->tail = intel_ring_offset(request, cs);
4369         assert_ring_tail_valid(request->ring, request->tail);
4370
4371         return gen8_emit_wa_tail(request, cs);
4372 }
4373
4374 static u32 *gen8_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
4375 {
4376         cs = gen8_emit_ggtt_write(cs,
4377                                   request->fence.seqno,
4378                                   i915_request_active_timeline(request)->hwsp_offset,
4379                                   0);
4380
4381         return gen8_emit_fini_breadcrumb_footer(request, cs);
4382 }
4383
4384 static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4385 {
4386         cs = gen8_emit_pipe_control(cs,
4387                                     PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4388                                     PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4389                                     PIPE_CONTROL_DC_FLUSH_ENABLE,
4390                                     0);
4391
4392         /* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */
4393         cs = gen8_emit_ggtt_write_rcs(cs,
4394                                       request->fence.seqno,
4395                                       i915_request_active_timeline(request)->hwsp_offset,
4396                                       PIPE_CONTROL_FLUSH_ENABLE |
4397                                       PIPE_CONTROL_CS_STALL);
4398
4399         return gen8_emit_fini_breadcrumb_footer(request, cs);
4400 }
4401
4402 static u32 *
4403 gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4404 {
4405         cs = gen8_emit_ggtt_write_rcs(cs,
4406                                       request->fence.seqno,
4407                                       i915_request_active_timeline(request)->hwsp_offset,
4408                                       PIPE_CONTROL_CS_STALL |
4409                                       PIPE_CONTROL_TILE_CACHE_FLUSH |
4410                                       PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4411                                       PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4412                                       PIPE_CONTROL_DC_FLUSH_ENABLE |
4413                                       PIPE_CONTROL_FLUSH_ENABLE);
4414
4415         return gen8_emit_fini_breadcrumb_footer(request, cs);
4416 }
4417
4418 /*
4419  * Note that the CS instruction pre-parser will not stall on the breadcrumb
4420  * flush and will continue pre-fetching the instructions after it before the
4421  * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at
4422  * BB_START/END instructions, so, even though we might pre-fetch the preamble
4423  * of the next request before the memory has been flushed, we're guaranteed that
4424  * we won't access the batch itself too early.
4425  * However, on gen12+ the parser can pre-fetch across the BB_START/END commands,
4426  * so, if the current request is modifying an instruction in the next request on
4427  * the same intel_context, we might pre-fetch and then execute the pre-update
4428  * instruction. To avoid this, the users of self-modifying code should either
4429  * disable the parser around the code emitting the memory writes, via a new flag
4430  * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For
4431  * the in-kernel use-cases we've opted to use a separate context, see
4432  * reloc_gpu() as an example.
4433  * All the above applies only to the instructions themselves. Non-inline data
4434  * used by the instructions is not pre-fetched.
4435  */
4436
4437 static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs)
4438 {
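        /*
         * Gen12 uses the longer MI_SEMAPHORE_WAIT_TOKEN encoding, hence the
         * extra zero dword; the trailing MI_NOOP keeps the emitted length
         * even.
         */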
4439         *cs++ = MI_SEMAPHORE_WAIT_TOKEN |
4440                 MI_SEMAPHORE_GLOBAL_GTT |
4441                 MI_SEMAPHORE_POLL |
4442                 MI_SEMAPHORE_SAD_EQ_SDD;
4443         *cs++ = 0;
4444         *cs++ = intel_hws_preempt_address(request->engine);
4445         *cs++ = 0;
4446         *cs++ = 0;
4447         *cs++ = MI_NOOP;
4448
4449         return cs;
4450 }
4451
4452 static __always_inline u32 *
4453 gen12_emit_fini_breadcrumb_footer(struct i915_request *request, u32 *cs)
4454 {
4455         *cs++ = MI_USER_INTERRUPT;
4456
4457         *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4458         if (intel_engine_has_semaphores(request->engine))
4459                 cs = gen12_emit_preempt_busywait(request, cs);
4460
4461         request->tail = intel_ring_offset(request, cs);
4462         assert_ring_tail_valid(request->ring, request->tail);
4463
4464         return gen8_emit_wa_tail(request, cs);
4465 }
4466
4467 static u32 *gen12_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
4468 {
4469         cs = gen8_emit_ggtt_write(cs,
4470                                   request->fence.seqno,
4471                                   i915_request_active_timeline(request)->hwsp_offset,
4472                                   0);
4473
4474         return gen12_emit_fini_breadcrumb_footer(request, cs);
4475 }
4476
4477 static u32 *
4478 gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4479 {
4480         cs = gen8_emit_ggtt_write_rcs(cs,
4481                                       request->fence.seqno,
4482                                       i915_request_active_timeline(request)->hwsp_offset,
4483                                       PIPE_CONTROL_CS_STALL |
4484                                       PIPE_CONTROL_TILE_CACHE_FLUSH |
4485                                       PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4486                                       PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4487                                       /* Wa_1409600907:tgl */
4488                                       PIPE_CONTROL_DEPTH_STALL |
4489                                       PIPE_CONTROL_DC_FLUSH_ENABLE |
4490                                       PIPE_CONTROL_FLUSH_ENABLE |
4491                                       PIPE_CONTROL_HDC_PIPELINE_FLUSH);
4492
4493         return gen12_emit_fini_breadcrumb_footer(request, cs);
4494 }
4495
4496 static void execlists_park(struct intel_engine_cs *engine)
4497 {
4498         cancel_timer(&engine->execlists.timer);
4499         cancel_timer(&engine->execlists.preempt);
4500 }
4501
4502 void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
4503 {
4504         engine->submit_request = execlists_submit_request;
4505         engine->schedule = i915_schedule;
4506         engine->execlists.tasklet.func = execlists_submission_tasklet;
4507
4508         engine->reset.prepare = execlists_reset_prepare;
4509         engine->reset.rewind = execlists_reset_rewind;
4510         engine->reset.cancel = execlists_reset_cancel;
4511         engine->reset.finish = execlists_reset_finish;
4512
4513         engine->park = execlists_park;
4514         engine->unpark = NULL;
4515
4516         engine->flags |= I915_ENGINE_SUPPORTS_STATS;
4517         if (!intel_vgpu_active(engine->i915)) {
4518                 engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
4519                 if (HAS_LOGICAL_RING_PREEMPTION(engine->i915))
4520                         engine->flags |= I915_ENGINE_HAS_PREEMPTION;
4521         }
4522
4523         if (INTEL_GEN(engine->i915) >= 12)
4524                 engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO;
4525
4526         if (intel_engine_has_preemption(engine))
4527                 engine->emit_bb_start = gen8_emit_bb_start;
4528         else
4529                 engine->emit_bb_start = gen8_emit_bb_start_noarb;
4530 }
4531
4532 static void execlists_shutdown(struct intel_engine_cs *engine)
4533 {
4534         /* Synchronise with residual timers and any softirq they raise */
4535         del_timer_sync(&engine->execlists.timer);
4536         del_timer_sync(&engine->execlists.preempt);
4537         tasklet_kill(&engine->execlists.tasklet);
4538 }
4539
4540 static void execlists_release(struct intel_engine_cs *engine)
4541 {
4542         engine->sanitize = NULL; /* no longer in control, nothing to sanitize */
4543
4544         execlists_shutdown(engine);
4545
4546         intel_engine_cleanup_common(engine);
4547         lrc_destroy_wa_ctx(engine);
4548 }
4549
4550 static void
4551 logical_ring_default_vfuncs(struct intel_engine_cs *engine)
4552 {
4553         /* Default vfuncs which can be overridden by each engine. */
4554
4555         engine->resume = execlists_resume;
4556
4557         engine->cops = &execlists_context_ops;
4558         engine->request_alloc = execlists_request_alloc;
4559
4560         engine->emit_flush = gen8_emit_flush;
4561         engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb;
4562         engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb;
4563         if (INTEL_GEN(engine->i915) >= 12)
4564                 engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb;
4565
4566         engine->set_default_submission = intel_execlists_set_default_submission;
4567
4568         if (INTEL_GEN(engine->i915) < 11) {
4569                 engine->irq_enable = gen8_logical_ring_enable_irq;
4570                 engine->irq_disable = gen8_logical_ring_disable_irq;
4571         } else {
4572                 /*
4573                  * TODO: On Gen11 interrupt masks need to be clear
4574                  * to allow C6 entry. Keep interrupts enabled
4575                  * and take the hit of generating extra interrupts
4576                  * until a more refined solution exists.
4577                  */
4578         }
4579 }
4580
4581 static inline void
4582 logical_ring_default_irqs(struct intel_engine_cs *engine)
4583 {
4584         unsigned int shift = 0;
4585
4586         if (INTEL_GEN(engine->i915) < 11) {
4587                 const u8 irq_shifts[] = {
4588                         [RCS0]  = GEN8_RCS_IRQ_SHIFT,
4589                         [BCS0]  = GEN8_BCS_IRQ_SHIFT,
4590                         [VCS0]  = GEN8_VCS0_IRQ_SHIFT,
4591                         [VCS1]  = GEN8_VCS1_IRQ_SHIFT,
4592                         [VECS0] = GEN8_VECS_IRQ_SHIFT,
4593                 };
4594
4595                 shift = irq_shifts[engine->id];
4596         }
4597
4598         engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
4599         engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
4600         engine->irq_keep_mask |= GT_CS_MASTER_ERROR_INTERRUPT << shift;
4601         engine->irq_keep_mask |= GT_WAIT_SEMAPHORE_INTERRUPT << shift;
4602 }
4603
4604 static void rcs_submission_override(struct intel_engine_cs *engine)
4605 {
4606         switch (INTEL_GEN(engine->i915)) {
4607         case 12:
4608                 engine->emit_flush = gen12_emit_flush_render;
4609                 engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs;
4610                 break;
4611         case 11:
4612                 engine->emit_flush = gen11_emit_flush_render;
4613                 engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs;
4614                 break;
4615         default:
4616                 engine->emit_flush = gen8_emit_flush_render;
4617                 engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs;
4618                 break;
4619         }
4620 }
4621
4622 int intel_execlists_submission_setup(struct intel_engine_cs *engine)
4623 {
4624         struct intel_engine_execlists * const execlists = &engine->execlists;
4625         struct drm_i915_private *i915 = engine->i915;
4626         struct intel_uncore *uncore = engine->uncore;
4627         u32 base = engine->mmio_base;
4628
4629         tasklet_init(&engine->execlists.tasklet,
4630                      execlists_submission_tasklet, (unsigned long)engine);
4631         timer_setup(&engine->execlists.timer, execlists_timeslice, 0);
4632         timer_setup(&engine->execlists.preempt, execlists_preempt, 0);
4633
4634         logical_ring_default_vfuncs(engine);
4635         logical_ring_default_irqs(engine);
4636
4637         if (engine->class == RENDER_CLASS)
4638                 rcs_submission_override(engine);
4639
4640         if (intel_init_workaround_bb(engine))
4641                 /*
4642                  * We continue even if we fail to initialize the WA batch
4643                  * because we only expect rare glitches, none of them
4644                  * critical enough to prevent us from using the GPU
4645                  */
4646                 drm_err(&i915->drm, "WA batch buffer initialization failed\n");
4647
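        /*
         * Engines with an ExecList Submission Queue are driven through the
         * SQ contents + control register pair; older engines are submitted
         * to by writing ELSP directly.
         */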
4648         if (HAS_LOGICAL_RING_ELSQ(i915)) {
4649                 execlists->submit_reg = uncore->regs +
4650                         i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base));
4651                 execlists->ctrl_reg = uncore->regs +
4652                         i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base));
4653         } else {
4654                 execlists->submit_reg = uncore->regs +
4655                         i915_mmio_reg_offset(RING_ELSP(base));
4656         }
4657
4658         execlists->csb_status =
4659                 &engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX];
4660
4661         execlists->csb_write =
4662                 &engine->status_page.addr[intel_hws_csb_write_index(i915)];
4663
4664         if (INTEL_GEN(i915) < 11)
4665                 execlists->csb_size = GEN8_CSB_ENTRIES;
4666         else
4667                 execlists->csb_size = GEN11_CSB_ENTRIES;
4668
4669         /* Finally, take ownership and responsibility for cleanup! */
4670         engine->sanitize = execlists_sanitize;
4671         engine->release = execlists_release;
4672
4673         return 0;
4674 }
4675
4676 static u32 intel_lr_indirect_ctx_offset(const struct intel_engine_cs *engine)
4677 {
4678         u32 indirect_ctx_offset;
4679
4680         switch (INTEL_GEN(engine->i915)) {
4681         default:
4682                 MISSING_CASE(INTEL_GEN(engine->i915));
4683                 /* fall through */
4684         case 12:
4685                 indirect_ctx_offset =
4686                         GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4687                 break;
4688         case 11:
4689                 indirect_ctx_offset =
4690                         GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4691                 break;
4692         case 10:
4693                 indirect_ctx_offset =
4694                         GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4695                 break;
4696         case 9:
4697                 indirect_ctx_offset =
4698                         GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4699                 break;
4700         case 8:
4701                 indirect_ctx_offset =
4702                         GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4703                 break;
4704         }
4705
4706         return indirect_ctx_offset;
4707 }
4708
4710 static void init_common_reg_state(u32 * const regs,
4711                                   const struct intel_engine_cs *engine,
4712                                   const struct intel_ring *ring,
4713                                   bool inhibit)
4714 {
4715         u32 ctl;
4716
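        /*
         * The restore-inhibit bit tells the HW not to load the context
         * image on the first restore; it is only set when the image was not
         * populated from the default state and so is uninitialised.
         */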
4717         ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
4718         ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
4719         if (inhibit)
4720                 ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
4721         if (INTEL_GEN(engine->i915) < 11)
4722                 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
4723                                            CTX_CTRL_RS_CTX_ENABLE);
4724         regs[CTX_CONTEXT_CONTROL] = ctl;
4725
4726         regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
4727         regs[CTX_TIMESTAMP] = 0;
4728 }
4729
4730 static void init_wa_bb_reg_state(u32 * const regs,
4731                                  const struct intel_engine_cs *engine,
4732                                  u32 pos_bb_per_ctx)
4733 {
4734         const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
4735
4736         if (wa_ctx->per_ctx.size) {
4737                 const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
4738
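                /* Bit 0 marks the per-context batch buffer pointer as valid. */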
4739                 regs[pos_bb_per_ctx] =
4740                         (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
4741         }
4742
4743         if (wa_ctx->indirect_ctx.size) {
4744                 const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
4745
4746                 regs[pos_bb_per_ctx + 2] =
4747                         (ggtt_offset + wa_ctx->indirect_ctx.offset) |
4748                         (wa_ctx->indirect_ctx.size / CACHELINE_BYTES);
4749
4750                 regs[pos_bb_per_ctx + 4] =
4751                         intel_lr_indirect_ctx_offset(engine) << 6;
4752         }
4753 }
4754
4755 static void init_ppgtt_reg_state(u32 *regs, const struct i915_ppgtt *ppgtt)
4756 {
4757         if (i915_vm_is_4lvl(&ppgtt->vm)) {
4758                 /*
4759                  * 64b PPGTT (48bit canonical): PDP0_DESCRIPTOR contains the
4760                  * base address of PML4; the other PDP descriptors are ignored.
4761                  */
4762                 ASSIGN_CTX_PML4(ppgtt, regs);
4763         } else {
4764                 ASSIGN_CTX_PDP(ppgtt, regs, 3);
4765                 ASSIGN_CTX_PDP(ppgtt, regs, 2);
4766                 ASSIGN_CTX_PDP(ppgtt, regs, 1);
4767                 ASSIGN_CTX_PDP(ppgtt, regs, 0);
4768         }
4769 }
4770
4771 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
4772 {
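        /*
         * A GGTT address space carries the aliasing PPGTT used by
         * non-full-ppgtt contexts; anything else is already a PPGTT.
         */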
4773         if (i915_is_ggtt(vm))
4774                 return i915_vm_to_ggtt(vm)->alias;
4775         else
4776                 return i915_vm_to_ppgtt(vm);
4777 }
4778
4779 static void execlists_init_reg_state(u32 *regs,
4780                                      const struct intel_context *ce,
4781                                      const struct intel_engine_cs *engine,
4782                                      const struct intel_ring *ring,
4783                                      bool inhibit)
4784 {
4785         /*
4786          * A context is actually a big batch buffer with several
4787          * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
4788          * values we are setting here are only for the first context restore:
4789          * on a subsequent save, the GPU will recreate this batchbuffer with new
4790          * values (including all the missing MI_LOAD_REGISTER_IMM commands that
4791          * we are not initializing here).
4792          *
4793          * Must keep consistent with virtual_update_register_offsets().
4794          */
4795         set_offsets(regs, reg_offsets(engine), engine, inhibit);
4796
4797         init_common_reg_state(regs, engine, ring, inhibit);
4798         init_ppgtt_reg_state(regs, vm_alias(ce->vm));
4799
4800         init_wa_bb_reg_state(regs, engine,
4801                              INTEL_GEN(engine->i915) >= 12 ?
4802                              GEN12_CTX_BB_PER_CTX_PTR :
4803                              CTX_BB_PER_CTX_PTR);
4804
4805         __reset_stop_ring(regs, engine);
4806 }
4807
4808 static int
4809 populate_lr_context(struct intel_context *ce,
4810                     struct drm_i915_gem_object *ctx_obj,
4811                     struct intel_engine_cs *engine,
4812                     struct intel_ring *ring)
4813 {
4814         bool inhibit = true;
4815         void *vaddr;
4816         int ret;
4817
4818         vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
4819         if (IS_ERR(vaddr)) {
4820                 ret = PTR_ERR(vaddr);
4821                 drm_dbg(&engine->i915->drm,
4822                         "Could not map object pages! (%d)\n", ret);
4823                 return ret;
4824         }
4825
4826         set_redzone(vaddr, engine);
4827
4828         if (engine->default_state) {
4829                 void *defaults;
4830
4831                 defaults = i915_gem_object_pin_map(engine->default_state,
4832                                                    I915_MAP_WB);
4833                 if (IS_ERR(defaults)) {
4834                         ret = PTR_ERR(defaults);
4835                         goto err_unpin_ctx;
4836                 }
4837
4838                 memcpy(vaddr, defaults, engine->context_size);
4839                 i915_gem_object_unpin_map(engine->default_state);
4840                 __set_bit(CONTEXT_VALID_BIT, &ce->flags);
4841                 inhibit = false;
4842         }
4843
4844         /* Clear the ppHWSP (inc. per-context counters) */
4845         memset(vaddr, 0, PAGE_SIZE);
4846
4847         /*
4848          * The second page of the context object contains some registers which
4849          * must be set up prior to the first execution.
4850          */
4851         execlists_init_reg_state(vaddr + LRC_STATE_PN * PAGE_SIZE,
4852                                  ce, engine, ring, inhibit);
4853
4854         ret = 0;
4855 err_unpin_ctx:
4856         __i915_gem_object_flush_map(ctx_obj, 0, engine->context_size);
4857         i915_gem_object_unpin_map(ctx_obj);
4858         return ret;
4859 }
4860
4861 static int __execlists_context_alloc(struct intel_context *ce,
4862                                      struct intel_engine_cs *engine)
4863 {
4864         struct drm_i915_gem_object *ctx_obj;
4865         struct intel_ring *ring;
4866         struct i915_vma *vma;
4867         u32 context_size;
4868         int ret;
4869
4870         GEM_BUG_ON(ce->state);
4871         context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
4872
4873         if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
4874                 context_size += I915_GTT_PAGE_SIZE; /* for redzone */
4875
4876         ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size);
4877         if (IS_ERR(ctx_obj))
4878                 return PTR_ERR(ctx_obj);
4879
4880         vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL);
4881         if (IS_ERR(vma)) {
4882                 ret = PTR_ERR(vma);
4883                 goto error_deref_obj;
4884         }
4885
4886         if (!ce->timeline) {
4887                 struct intel_timeline *tl;
4888                 struct i915_vma *hwsp;
4889
4890                 /*
4891                  * Use the static global HWSP for the kernel context, and
4892                  * a dynamically allocated cacheline for everyone else.
4893                  */
4894                 hwsp = NULL;
4895                 if (unlikely(intel_context_is_barrier(ce)))
4896                         hwsp = engine->status_page.vma;
4897
4898                 tl = intel_timeline_create(engine->gt, hwsp);
4899                 if (IS_ERR(tl)) {
4900                         ret = PTR_ERR(tl);
4901                         goto error_deref_obj;
4902                 }
4903
4904                 ce->timeline = tl;
4905         }
4906
4907         ring = intel_engine_create_ring(engine, (unsigned long)ce->ring);
4908         if (IS_ERR(ring)) {
4909                 ret = PTR_ERR(ring);
4910                 goto error_deref_obj;
4911         }
4912
4913         ret = populate_lr_context(ce, ctx_obj, engine, ring);
4914         if (ret) {
4915                 drm_dbg(&engine->i915->drm,
4916                         "Failed to populate LRC: %d\n", ret);
4917                 goto error_ring_free;
4918         }
4919
4920         ce->ring = ring;
4921         ce->state = vma;
4922
4923         return 0;
4924
4925 error_ring_free:
4926         intel_ring_put(ring);
4927 error_deref_obj:
4928         i915_gem_object_put(ctx_obj);
4929         return ret;
4930 }
4931
4932 static struct list_head *virtual_queue(struct virtual_engine *ve)
4933 {
4934         return &ve->base.execlists.default_priolist.requests[0];
4935 }
4936
4937 static void virtual_context_destroy(struct kref *kref)
4938 {
4939         struct virtual_engine *ve =
4940                 container_of(kref, typeof(*ve), context.ref);
4941         unsigned int n;
4942
4943         GEM_BUG_ON(!list_empty(virtual_queue(ve)));
4944         GEM_BUG_ON(ve->request);
4945         GEM_BUG_ON(ve->context.inflight);
4946
4947         for (n = 0; n < ve->num_siblings; n++) {
4948                 struct intel_engine_cs *sibling = ve->siblings[n];
4949                 struct rb_node *node = &ve->nodes[sibling->id].rb;
4950                 unsigned long flags;
4951
4952                 if (RB_EMPTY_NODE(node))
4953                         continue;
4954
4955                 spin_lock_irqsave(&sibling->active.lock, flags);
4956
4957                 /* Detachment is lazily performed in the execlists tasklet */
4958                 if (!RB_EMPTY_NODE(node))
4959                         rb_erase_cached(node, &sibling->execlists.virtual);
4960
4961                 spin_unlock_irqrestore(&sibling->active.lock, flags);
4962         }
4963         GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet));
4964
4965         if (ve->context.state)
4966                 __execlists_context_fini(&ve->context);
4967         intel_context_fini(&ve->context);
4968
4969         intel_engine_free_request_pool(&ve->base);
4970
4971         kfree(ve->bonds);
4972         kfree(ve);
4973 }
4974
4975 static void virtual_engine_initial_hint(struct virtual_engine *ve)
4976 {
4977         int swp;
4978
4979         /*
4980          * Pick a random sibling on starting to help spread the load around.
4981          *
4982          * New contexts are typically created with exactly the same order
4983          * of siblings, and often started in batches. Due to the way we iterate
4984          * the array of siblings when submitting requests, sibling[0] is
4985          * prioritised for dequeuing. If we make sure that sibling[0] is fairly
4986          * randomised across the system, we also help spread the load by the
4987          * first engine we inspect being different each time.
4988          *
4989          * NB This does not force us to execute on this engine, it will just
4990          * typically be the first we inspect for submission.
4991          */
4992         swp = prandom_u32_max(ve->num_siblings);
4993         if (!swp)
4994                 return;
4995
4996         swap(ve->siblings[swp], ve->siblings[0]);
4997         if (!intel_engine_has_relative_mmio(ve->siblings[0]))
4998                 virtual_update_register_offsets(ve->context.lrc_reg_state,
4999                                                 ve->siblings[0]);
5000 }
5001
5002 static int virtual_context_alloc(struct intel_context *ce)
5003 {
5004         struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5005
5006         return __execlists_context_alloc(ce, ve->siblings[0]);
5007 }
5008
5009 static int virtual_context_pin(struct intel_context *ce)
5010 {
5011         struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5012         int err;
5013
5014         /* Note: we must use a real engine class for setting up reg state */
5015         err = __execlists_context_pin(ce, ve->siblings[0]);
5016         if (err)
5017                 return err;
5018
5019         virtual_engine_initial_hint(ve);
5020         return 0;
5021 }
5022
5023 static void virtual_context_enter(struct intel_context *ce)
5024 {
5025         struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5026         unsigned int n;
5027
5028         for (n = 0; n < ve->num_siblings; n++)
5029                 intel_engine_pm_get(ve->siblings[n]);
5030
5031         intel_timeline_enter(ce->timeline);
5032 }
5033
5034 static void virtual_context_exit(struct intel_context *ce)
5035 {
5036         struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5037         unsigned int n;
5038
5039         intel_timeline_exit(ce->timeline);
5040
5041         for (n = 0; n < ve->num_siblings; n++)
5042                 intel_engine_pm_put(ve->siblings[n]);
5043 }
5044
5045 static const struct intel_context_ops virtual_context_ops = {
5046         .alloc = virtual_context_alloc,
5047
5048         .pin = virtual_context_pin,
5049         .unpin = execlists_context_unpin,
5050
5051         .enter = virtual_context_enter,
5052         .exit = virtual_context_exit,
5053
5054         .destroy = virtual_context_destroy,
5055 };
5056
5057 static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve)
5058 {
5059         struct i915_request *rq;
5060         intel_engine_mask_t mask;
5061
5062         rq = READ_ONCE(ve->request);
5063         if (!rq)
5064                 return 0;
5065
5066         /* The rq is ready for submission; rq->execution_mask is now stable. */
5067         mask = rq->execution_mask;
5068         if (unlikely(!mask)) {
5069                 /* Invalid selection, submit to a random engine in error */
5070                 i915_request_set_error_once(rq, -ENODEV);
5071                 mask = ve->siblings[0]->mask;
5072         }
5073
5074         ENGINE_TRACE(&ve->base, "rq=%llx:%lld, mask=%x, prio=%d\n",
5075                      rq->fence.context, rq->fence.seqno,
5076                      mask, ve->base.execlists.queue_priority_hint);
5077
5078         return mask;
5079 }
5080
5081 static void virtual_submission_tasklet(unsigned long data)
5082 {
5083         struct virtual_engine * const ve = (struct virtual_engine *)data;
5084         const int prio = READ_ONCE(ve->base.execlists.queue_priority_hint);
5085         intel_engine_mask_t mask;
5086         unsigned int n;
5087
5088         rcu_read_lock();
5089         mask = virtual_submission_mask(ve);
5090         rcu_read_unlock();
5091         if (unlikely(!mask))
5092                 return;
5093
5094         local_irq_disable();
5095         for (n = 0; n < ve->num_siblings; n++) {
5096                 struct intel_engine_cs *sibling = READ_ONCE(ve->siblings[n]);
5097                 struct ve_node * const node = &ve->nodes[sibling->id];
5098                 struct rb_node **parent, *rb;
5099                 bool first;
5100
5101                 if (!READ_ONCE(ve->request))
5102                         break; /* already handled by a sibling's tasklet */
5103
5104                 if (unlikely(!(mask & sibling->mask))) {
5105                         if (!RB_EMPTY_NODE(&node->rb)) {
5106                                 spin_lock(&sibling->active.lock);
5107                                 rb_erase_cached(&node->rb,
5108                                                 &sibling->execlists.virtual);
5109                                 RB_CLEAR_NODE(&node->rb);
5110                                 spin_unlock(&sibling->active.lock);
5111                         }
5112                         continue;
5113                 }
5114
5115                 spin_lock(&sibling->active.lock);
5116
5117                 if (!RB_EMPTY_NODE(&node->rb)) {
5118                         /*
5119                          * Cheat and avoid rebalancing the tree if we can
5120                          * reuse this node in situ.
5121                          */
5122                         first = rb_first_cached(&sibling->execlists.virtual) ==
5123                                 &node->rb;
5124                         if (prio == node->prio || (prio > node->prio && first))
5125                                 goto submit_engine;
5126
5127                         rb_erase_cached(&node->rb, &sibling->execlists.virtual);
5128                 }
5129
5130                 rb = NULL;
5131                 first = true;
5132                 parent = &sibling->execlists.virtual.rb_root.rb_node;
5133                 while (*parent) {
5134                         struct ve_node *other;
5135
5136                         rb = *parent;
5137                         other = rb_entry(rb, typeof(*other), rb);
5138                         if (prio > other->prio) {
5139                                 parent = &rb->rb_left;
5140                         } else {
5141                                 parent = &rb->rb_right;
5142                                 first = false;
5143                         }
5144                 }
5145
5146                 rb_link_node(&node->rb, rb, parent);
5147                 rb_insert_color_cached(&node->rb,
5148                                        &sibling->execlists.virtual,
5149                                        first);
5150
5151 submit_engine:
5152                 GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
5153                 node->prio = prio;
5154                 if (first && prio > sibling->execlists.queue_priority_hint)
5155                         tasklet_hi_schedule(&sibling->execlists.tasklet);
5156
5157                 spin_unlock(&sibling->active.lock);
5158         }
5159         local_irq_enable();
5160 }
5161
5162 static void virtual_submit_request(struct i915_request *rq)
5163 {
5164         struct virtual_engine *ve = to_virtual_engine(rq->engine);
5165         struct i915_request *old;
5166         unsigned long flags;
5167
5168         ENGINE_TRACE(&ve->base, "rq=%llx:%lld\n",
5169                      rq->fence.context,
5170                      rq->fence.seqno);
5171
5172         GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);
5173
5174         spin_lock_irqsave(&ve->base.active.lock, flags);
5175
5176         old = ve->request;
5177         if (old) { /* background completion event from preempt-to-busy */
5178                 GEM_BUG_ON(!i915_request_completed(old));
5179                 __i915_request_submit(old);
5180                 i915_request_put(old);
5181         }
5182
5183         if (i915_request_completed(rq)) {
5184                 __i915_request_submit(rq);
5185
5186                 ve->base.execlists.queue_priority_hint = INT_MIN;
5187                 ve->request = NULL;
5188         } else {
5189                 ve->base.execlists.queue_priority_hint = rq_prio(rq);
5190                 ve->request = i915_request_get(rq);
5191
5192                 GEM_BUG_ON(!list_empty(virtual_queue(ve)));
5193                 list_move_tail(&rq->sched.link, virtual_queue(ve));
5194
5195                 tasklet_schedule(&ve->base.execlists.tasklet);
5196         }
5197
5198         spin_unlock_irqrestore(&ve->base.active.lock, flags);
5199 }
5200
5201 static struct ve_bond *
5202 virtual_find_bond(struct virtual_engine *ve,
5203                   const struct intel_engine_cs *master)
5204 {
5205         int i;
5206
5207         for (i = 0; i < ve->num_bonds; i++) {
5208                 if (ve->bonds[i].master == master)
5209                         return &ve->bonds[i];
5210         }
5211
5212         return NULL;
5213 }
5214
5215 static void
5216 virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
5217 {
5218         struct virtual_engine *ve = to_virtual_engine(rq->engine);
5219         intel_engine_mask_t allowed, exec;
5220         struct ve_bond *bond;
5221
5222         allowed = ~to_request(signal)->engine->mask;
5223
5224         bond = virtual_find_bond(ve, to_request(signal)->engine);
5225         if (bond)
5226                 allowed &= bond->sibling_mask;
5227
5228         /* Restrict the bonded request to run on only the available engines */
5229         exec = READ_ONCE(rq->execution_mask);
5230         while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed))
5231                 ;
5232
5233         /* Prevent the master from being re-run on the bonded engines */
5234         to_request(signal)->execution_mask &= ~allowed;
5235 }
5236
5237 struct intel_context *
5238 intel_execlists_create_virtual(struct intel_engine_cs **siblings,
5239                                unsigned int count)
5240 {
5241         struct virtual_engine *ve;
5242         unsigned int n;
5243         int err;
5244
5245         if (count == 0)
5246                 return ERR_PTR(-EINVAL);
5247
5248         if (count == 1)
5249                 return intel_context_create(siblings[0]);
5250
5251         ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
5252         if (!ve)
5253                 return ERR_PTR(-ENOMEM);
5254
5255         ve->base.i915 = siblings[0]->i915;
5256         ve->base.gt = siblings[0]->gt;
5257         ve->base.uncore = siblings[0]->uncore;
5258         ve->base.id = -1;
5259
5260         ve->base.class = OTHER_CLASS;
5261         ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
5262         ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
5263         ve->base.uabi_instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
5264
5265         /*
5266          * The decision on whether to submit a request using semaphores
5267          * depends on the saturated state of the engine. We only compute
5268          * this during HW submission of the request, and we need this
5269          * state to be globally applied to all requests being submitted
5270          * to this engine. Virtual engines encompass more than one physical
5271          * engine and so we cannot accurately tell in advance if one of those
5272          * engines is already saturated and so cannot afford to use a semaphore
5273          * and be pessimized in priority for doing so -- if we are the only
5274          * context using semaphores after all other clients have stopped, we
5275          * will be starved on the saturated system. Such a global switch for
5276          * semaphores is less than ideal, but alas is the current compromise.
5277          */
5278         ve->base.saturated = ALL_ENGINES;
5279
5280         snprintf(ve->base.name, sizeof(ve->base.name), "virtual");
5281
5282         intel_engine_init_active(&ve->base, ENGINE_VIRTUAL);
5283         intel_engine_init_breadcrumbs(&ve->base);
5284         intel_engine_init_execlists(&ve->base);
5285
5286         ve->base.cops = &virtual_context_ops;
5287         ve->base.request_alloc = execlists_request_alloc;
5288
5289         ve->base.schedule = i915_schedule;
5290         ve->base.submit_request = virtual_submit_request;
5291         ve->base.bond_execute = virtual_bond_execute;
5292
5293         INIT_LIST_HEAD(virtual_queue(ve));
5294         ve->base.execlists.queue_priority_hint = INT_MIN;
5295         tasklet_init(&ve->base.execlists.tasklet,
5296                      virtual_submission_tasklet,
5297                      (unsigned long)ve);
5298
5299         intel_context_init(&ve->context, &ve->base);
5300
5301         for (n = 0; n < count; n++) {
5302                 struct intel_engine_cs *sibling = siblings[n];
5303
5304                 GEM_BUG_ON(!is_power_of_2(sibling->mask));
5305                 if (sibling->mask & ve->base.mask) {
5306                         DRM_DEBUG("duplicate %s entry in load balancer\n",
5307                                   sibling->name);
5308                         err = -EINVAL;
5309                         goto err_put;
5310                 }
5311
5312                 /*
5313                  * The virtual engine implementation is tightly coupled to
5314          * the execlists backend -- we push requests directly
5315                  * into a tree inside each physical engine. We could support
5316                  * layering if we handle cloning of the requests and
5317                  * submitting a copy into each backend.
5318                  */
5319                 if (sibling->execlists.tasklet.func !=
5320                     execlists_submission_tasklet) {
5321                         err = -ENODEV;
5322                         goto err_put;
5323                 }
5324
5325                 GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
5326                 RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);
5327
5328                 ve->siblings[ve->num_siblings++] = sibling;
5329                 ve->base.mask |= sibling->mask;
5330
5331                 /*
5332          * All physical engines must have compatible emission
5333                  * functions (as we build the instructions during request
5334                  * construction and do not alter them before submission
5335                  * on the physical engine). We use the engine class as a guide
5336                  * here, although that could be refined.
5337                  */
5338                 if (ve->base.class != OTHER_CLASS) {
5339                         if (ve->base.class != sibling->class) {
5340                                 DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n",
5341                                           sibling->class, ve->base.class);
5342                                 err = -EINVAL;
5343                                 goto err_put;
5344                         }
5345                         continue;
5346                 }
5347
5348                 ve->base.class = sibling->class;
5349                 ve->base.uabi_class = sibling->uabi_class;
5350                 snprintf(ve->base.name, sizeof(ve->base.name),
5351                          "v%dx%d", ve->base.class, count);
5352                 ve->base.context_size = sibling->context_size;
5353
5354                 ve->base.emit_bb_start = sibling->emit_bb_start;
5355                 ve->base.emit_flush = sibling->emit_flush;
5356                 ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
5357                 ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
5358                 ve->base.emit_fini_breadcrumb_dw =
5359                         sibling->emit_fini_breadcrumb_dw;
5360
5361                 ve->base.flags = sibling->flags;
5362         }
5363
5364         ve->base.flags |= I915_ENGINE_IS_VIRTUAL;
5365
5366         return &ve->context;
5367
5368 err_put:
5369         intel_context_put(&ve->context);
5370         return ERR_PTR(err);
5371 }
5372
5373 struct intel_context *
5374 intel_execlists_clone_virtual(struct intel_engine_cs *src)
5375 {
5376         struct virtual_engine *se = to_virtual_engine(src);
5377         struct intel_context *dst;
5378
5379         dst = intel_execlists_create_virtual(se->siblings,
5380                                              se->num_siblings);
5381         if (IS_ERR(dst))
5382                 return dst;
5383
5384         if (se->num_bonds) {
5385                 struct virtual_engine *de = to_virtual_engine(dst->engine);
5386
5387                 de->bonds = kmemdup(se->bonds,
5388                                     sizeof(*se->bonds) * se->num_bonds,
5389                                     GFP_KERNEL);
5390                 if (!de->bonds) {
5391                         intel_context_put(dst);
5392                         return ERR_PTR(-ENOMEM);
5393                 }
5394
5395                 de->num_bonds = se->num_bonds;
5396         }
5397
5398         return dst;
5399 }
5400
5401 int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
5402                                      const struct intel_engine_cs *master,
5403                                      const struct intel_engine_cs *sibling)
5404 {
5405         struct virtual_engine *ve = to_virtual_engine(engine);
5406         struct ve_bond *bond;
5407         int n;
5408
5409         /* Sanity check the sibling is part of the virtual engine */
5410         for (n = 0; n < ve->num_siblings; n++)
5411                 if (sibling == ve->siblings[n])
5412                         break;
5413         if (n == ve->num_siblings)
5414                 return -EINVAL;
5415
5416         bond = virtual_find_bond(ve, master);
5417         if (bond) {
5418                 bond->sibling_mask |= sibling->mask;
5419                 return 0;
5420         }
5421
5422         bond = krealloc(ve->bonds,
5423                         sizeof(*bond) * (ve->num_bonds + 1),
5424                         GFP_KERNEL);
5425         if (!bond)
5426                 return -ENOMEM;
5427
5428         bond[ve->num_bonds].master = master;
5429         bond[ve->num_bonds].sibling_mask = sibling->mask;
5430
5431         ve->bonds = bond;
5432         ve->num_bonds++;
5433
5434         return 0;
5435 }
5436
5437 struct intel_engine_cs *
5438 intel_virtual_engine_get_sibling(struct intel_engine_cs *engine,
5439                                  unsigned int sibling)
5440 {
5441         struct virtual_engine *ve = to_virtual_engine(engine);
5442
5443         if (sibling >= ve->num_siblings)
5444                 return NULL;
5445
5446         return ve->siblings[sibling];
5447 }
5448
5449 void intel_execlists_show_requests(struct intel_engine_cs *engine,
5450                                    struct drm_printer *m,
5451                                    void (*show_request)(struct drm_printer *m,
5452                                                         struct i915_request *rq,
5453                                                         const char *prefix),
5454                                    unsigned int max)
5455 {
5456         const struct intel_engine_execlists *execlists = &engine->execlists;
5457         struct i915_request *rq, *last;
5458         unsigned long flags;
5459         unsigned int count;
5460         struct rb_node *rb;
5461
5462         spin_lock_irqsave(&engine->active.lock, flags);
5463
5464         last = NULL;
5465         count = 0;
5466         list_for_each_entry(rq, &engine->active.requests, sched.link) {
5467                 if (count++ < max - 1)
5468                         show_request(m, rq, "\t\tE ");
5469                 else
5470                         last = rq;
5471         }
5472         if (last) {
5473                 if (count > max) {
5474                         drm_printf(m,
5475                                    "\t\t...skipping %d executing requests...\n",
5476                                    count - max);
5477                 }
5478                 show_request(m, last, "\t\tE ");
5479         }
5480
5481         if (execlists->switch_priority_hint != INT_MIN)
5482                 drm_printf(m, "\t\tSwitch priority hint: %d\n",
5483                            READ_ONCE(execlists->switch_priority_hint));
5484         if (execlists->queue_priority_hint != INT_MIN)
5485                 drm_printf(m, "\t\tQueue priority hint: %d\n",
5486                            READ_ONCE(execlists->queue_priority_hint));
5487
5488         last = NULL;
5489         count = 0;
5490         for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
5491                 struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
5492                 int i;
5493
5494                 priolist_for_each_request(rq, p, i) {
5495                         if (count++ < max - 1)
5496                                 show_request(m, rq, "\t\tQ ");
5497                         else
5498                                 last = rq;
5499                 }
5500         }
5501         if (last) {
5502                 if (count > max) {
5503                         drm_printf(m,
5504                                    "\t\t...skipping %d queued requests...\n",
5505                                    count - max);
5506                 }
5507                 show_request(m, last, "\t\tQ ");
5508         }
5509
5510         last = NULL;
5511         count = 0;
5512         for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
5513                 struct virtual_engine *ve =
5514                         rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
5515                 struct i915_request *rq = READ_ONCE(ve->request);
5516
5517                 if (rq) {
5518                         if (count++ < max - 1)
5519                                 show_request(m, rq, "\t\tV ");
5520                         else
5521                                 last = rq;
5522                 }
5523         }
5524         if (last) {
5525                 if (count > max) {
5526                         drm_printf(m,
5527                                    "\t\t...skipping %d virtual requests...\n",
5528                                    count - max);
5529                 }
5530                 show_request(m, last, "\t\tV ");
5531         }
5532
5533         spin_unlock_irqrestore(&engine->active.lock, flags);
5534 }
5535
5536 void intel_lr_context_reset(struct intel_engine_cs *engine,
5537                             struct intel_context *ce,
5538                             u32 head,
5539                             bool scrub)
5540 {
5541         GEM_BUG_ON(!intel_context_is_pinned(ce));
5542
5543         /*
5544          * We want a simple context + ring to execute the breadcrumb update.
5545          * We cannot rely on the context being intact across the GPU hang,
5546          * so clear it and rebuild just what we need for the breadcrumb.
5547          * All pending requests for this context will be zapped, and any
5548          * future request will be after userspace has had the opportunity
5549          * to recreate its own state.
5550          */
5551         if (scrub)
5552                 restore_default_state(ce, engine);
5553
5554         /* Rerun the request; its payload has been neutered (if guilty). */
5555         __execlists_update_reg_state(ce, engine, head);
5556 }
5557
5558 bool
5559 intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine)
5560 {
5561         return engine->set_default_submission ==
5562                intel_execlists_set_default_submission;
5563 }
5564
5565 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
5566 #include "selftest_lrc.c"
5567 #endif