drivers/gpu/drm/i915/intel_lrc.c

   1 /*
   2  * Copyright © 2014 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  *
  23  * Authors:
  24  *    Ben Widawsky <ben@bwidawsk.net>
  25  *    Michel Thierry <michel.thierry@intel.com>
  26  *    Thomas Daniel <thomas.daniel@intel.com>
  27  *    Oscar Mateo <oscar.mateo@intel.com>
  28  *
  29  */
  30
  31 /**
  32  * DOC: Logical Rings, Logical Ring Contexts and Execlists
  33  *
  34  * Motivation:
  35  * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
  36  * These expanded contexts enable a number of new abilities, especially
  37  * "Execlists" (also implemented in this file).
  38  *
  39  * One of the main differences with the legacy HW contexts is that logical
  40  * ring contexts incorporate many more things to the context's state, like
  41  * PDPs or ringbuffer control registers:
  42  *
  43  * The reason why PDPs are included in the context is straightforward: as
  44  * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
  45  * contained there means you don't need to do a ppgtt->switch_mm yourself,
  46  * instead, the GPU will do it for you on the context switch.
  47  *
  48  * But, what about the ringbuffer control registers (head, tail, etc..)?
  49  * shouldn't we just need a set of those per engine command streamer? This is
  50  * where the name "Logical Rings" starts to make sense: by virtualizing the
  51  * rings, the engine cs shifts to a new "ring buffer" with every context
  52  * switch. When you want to submit a workload to the GPU you: A) choose your
  53  * context, B) find its appropriate virtualized ring, C) write commands to it
  54  * and then, finally, D) tell the GPU to switch to that context.
  55  *
  56  * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
  57  * to a context is via a context execution list, ergo "Execlists".
  58  *
  59  * LRC implementation:
  60  * Regarding the creation of contexts, we have:
  61  *
  62  * - One global default context.
  63  * - One local default context for each opened fd.
  64  * - One local extra context for each context create ioctl call.
  65  *
  66  * Now that ringbuffers belong per-context (and not per-engine, like before)
  67  * and that contexts are uniquely tied to a given engine (and not reusable,
  68  * like before) we need:
  69  *
  70  * - One ringbuffer per-engine inside each context.
  71  * - One backing object per-engine inside each context.
  72  *
  73  * The global default context starts its life with these new objects fully
  74  * allocated and populated. The local default context for each opened fd is
  75  * more complex, because we don't know at creation time which engine is going
  76  * to use them. To handle this, we have implemented a deferred creation of LR
  77  * contexts:
  78  *
  79  * The local context starts its life as a hollow or blank holder, that only
  80  * gets populated for a given engine once we receive an execbuffer. If later
  81  * on we receive another execbuffer ioctl for the same context but a different
  82  * engine, we allocate/populate a new ringbuffer and context backing object and
  83  * so on.
  84  *
  85  * Finally, regarding local contexts created using the ioctl call: as they are
  86  * only allowed with the render ring, we can allocate & populate them right
  87  * away (no need to defer anything, at least for now).
  88  *
  89  * Execlists implementation:
  90  * Execlists are the new method by which, on gen8+ hardware, workloads are
  91  * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
  92  * This method works as follows:
  93  *
  94  * When a request is committed, its commands (the BB start and any leading or
  95  * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
  96  * for the appropriate context. The tail pointer in the hardware context is not
  97  * updated at this time, but instead, kept by the driver in the ringbuffer
  98  * structure. A structure representing this request is added to a request queue
  99  * for the appropriate engine: this structure contains a copy of the context's
 100  * tail after the request was written to the ring buffer and a pointer to the
 101  * context itself.
 102  *
 103  * If the engine's request queue was empty before the request was added, the
 104  * queue is processed immediately. Otherwise the queue will be processed during
 105  * a context switch interrupt. In any case, elements on the queue will get sent
 106  * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
 107  * globally unique 20-bit submission ID.
 108  *
 109  * When execution of a request completes, the GPU updates the context status
 110  * buffer with a context complete event and generates a context switch interrupt.
 111  * During the interrupt handling, the driver examines the events in the buffer:
 112  * for each context complete event, if the announced ID matches that on the head
 113  * of the request queue, then that request is retired and removed from the queue.
 114  *
 115  * After processing, if any requests were retired and the queue is not empty
 116  * then a new execution list can be submitted. The two requests at the front of
 117  * the queue are next to be submitted but since a context may not occur twice in
 118  * an execution list, if subsequent requests have the same ID as the first then
 119  * the two requests must be combined. This is done simply by discarding requests
 120  * at the head of the queue until either only one request is left (in which case
 121  * we use a NULL second context) or the first two requests have unique IDs.
 122  *
 123  * By always executing the first two requests in the queue the driver ensures
 124  * that the GPU is kept as busy as possible. In the case where a single context
 125  * completes but a second context is still executing, the request for this second
 126  * context will be at the head of the queue when we remove the first one. This
 127  * request will then be resubmitted along with a new request for a different context,
 128  * which will cause the hardware to continue executing the second request and queue
 129  * the new request (the GPU detects the condition of a context getting preempted
 130  * with the same context and optimizes the context switch flow by not doing
 131  * preemption, but just sampling the new tail pointer).
 132  *
 133  */
 134 #include <linux/interrupt.h>
 135
 136 #include <drm/drmP.h>
 137 #include <drm/i915_drm.h>
 138 #include "i915_drv.h"
 139 #include "i915_gem_render_state.h"
 140 #include "i915_vgpu.h"
 141 #include "intel_lrc_reg.h"
 142 #include "intel_mocs.h"
 143 #include "intel_workarounds.h"
 144
     /*
      * RING_EXECLIST_*: bit definitions named after execlist status fields.
      * NOTE(review): none of them is used in the part of this file reviewed
      * here; confirm against the register specification before relying on
      * them.
      */
 145 #define RING_EXECLIST_QFULL             (1 << 0x2)
 146 #define RING_EXECLIST1_VALID            (1 << 0x3)
 147 #define RING_EXECLIST0_VALID            (1 << 0x4)
 148 #define RING_EXECLIST_ACTIVE_STATUS     (3 << 0xE)
 149 #define RING_EXECLIST1_ACTIVE           (1 << 0x11)
 150 #define RING_EXECLIST0_ACTIVE           (1 << 0x12)
 151
     /*
      * GEN8_CTX_STATUS_*: individual status bits; presumably the event flags
      * the GPU writes into the context status buffer mentioned in the DOC
      * section at the top of this file - TODO confirm.
      */
 152 #define GEN8_CTX_STATUS_IDLE_ACTIVE     (1 << 0)
 153 #define GEN8_CTX_STATUS_PREEMPTED       (1 << 1)
 154 #define GEN8_CTX_STATUS_ELEMENT_SWITCH  (1 << 2)
 155 #define GEN8_CTX_STATUS_ACTIVE_IDLE     (1 << 3)
 156 #define GEN8_CTX_STATUS_COMPLETE        (1 << 4)
 157 #define GEN8_CTX_STATUS_LITE_RESTORE    (1 << 15)
 158
     /* Matches a status that has the COMPLETE or the PREEMPTED bit set. */
 159 #define GEN8_CTX_STATUS_COMPLETED_MASK \
 160          (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)
 161
 162 /* Typical size of the average request (2 pipecontrols and a MI_BB) */
 163 #define EXECLISTS_REQUEST_SIZE 64 /* bytes */
     /*
      * Length of the workaround tail, in dwords and in bytes: unwind_wa_tail()
      * sets rq->tail to rq->wa_tail minus WA_TAIL_BYTES (wrapped to the ring).
      */
 164 #define WA_TAIL_DWORDS 2
 165 #define WA_TAIL_BYTES (sizeof(u32) * WA_TAIL_DWORDS)
 166
     /*
      * Forward declarations of two static helpers; neither is defined before
      * this point, so their definitions must follow later in this file.
      */
 167 static int execlists_context_deferred_alloc(struct i915_gem_context *ctx,
 168                                             struct intel_engine_cs *engine,
 169                                             struct intel_context *ce);
 170 static void execlists_init_reg_state(u32 *reg_state,
 171                                      struct i915_gem_context *ctx,
 172                                      struct intel_engine_cs *engine,
 173                                      struct intel_ring *ring);
 174
     /* Recover the i915_priolist that embeds the rb-tree node @rb. */
 175 static inline struct i915_priolist *to_priolist(struct rb_node *rb)
 176 {
             struct i915_priolist *p = rb_entry(rb, struct i915_priolist, node);

             return p;
 178 }
 179
     /* Priority stored in the scheduling attributes of request @rq. */
 180 static inline int rq_prio(const struct i915_request *rq)
 181 {
             const int prio = rq->sched.attr.priority;

             return prio;
 183 }
 184
     /*
      * Decide whether a pending priority of @prio justifies preempting the
      * request @last on @engine. All three conditions must hold; they are
      * evaluated in order and the first failure ends the check.
      */
 185 static inline bool need_preempt(const struct intel_engine_cs *engine,
 186                                 const struct i915_request *last,
 187                                 int prio)
 188 {
             /* The engine must support preemption at all. */
             if (!intel_engine_has_preemption(engine))
                     return false;

             /* The pending priority must beat the priority of @last. */
             if (!__execlists_need_preempt(prio, rq_prio(last)))
                     return false;

             /* There is nothing to preempt once @last has completed. */
             return !i915_request_completed(last);
 192 }
 193
 194 /*
 195  * The context descriptor encodes various attributes of a context,
 196  * including its GTT address and some flags. Because it's fairly
 197  * expensive to calculate, we'll just do it once and cache the result,
 198  * which remains valid until the context is unpinned.
 199  *
 200  * This is what a descriptor looks like, from LSB to MSB::
 201  *
 202  *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
 203  *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
 204  *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
 205  *      bits 53-54:    mbz, reserved for use by hardware
 206  *      bits 55-63:    group ID, currently unused and set to 0
 207  *
 208  * Starting from Gen11, the upper dword of the descriptor has a new format:
 209  *
 210  *      bits 32-36:    reserved
 211  *      bits 37-47:    SW context ID
 212  *      bits 48-53:    engine instance
 213  *      bit 54:        mbz, reserved for use by hardware
 214  *      bits 55-60:    SW counter
 215  *      bits 61-63:    engine class
 216  *
 217  * engine info, SW context ID and SW counter need to form a unique number
 218  * (Context ID) per lrc.
      *
      * intel_lr_context_descriptor_update() assembles the value from
      * @ctx->desc_template, the GGTT offset of @ce->state advanced by
      * LRC_HEADER_PAGES pages, @ctx->hw_id and, on Gen11+, the instance and
      * class of @engine. The result is stored in @ce->lrc_desc; nothing is
      * returned.
 219  */
 220 static void
 221 intel_lr_context_descriptor_update(struct i915_gem_context *ctx,
 222                                    struct intel_engine_cs *engine,
 223                                    struct intel_context *ce)
 224 {
 225         u64 desc;
 226
             /* Build-time check: the largest HW id must fit the id field width. */
 227         BUILD_BUG_ON(MAX_CONTEXT_HW_ID > (BIT(GEN8_CTX_ID_WIDTH)));
 228         BUILD_BUG_ON(GEN11_MAX_CONTEXT_HW_ID > (BIT(GEN11_SW_CTX_ID_WIDTH)));
 229
 230         desc = ctx->desc_template;                              /* bits  0-11 */
             /* The template may only carry the low 12 flag bits. */
 231         GEM_BUG_ON(desc & GENMASK_ULL(63, 12));
 232
 233         desc |= i915_ggtt_offset(ce->state) + LRC_HEADER_PAGES * PAGE_SIZE;
 234                                                                 /* bits 12-31 */
             /* The address must not spill into the upper dword. */
 235         GEM_BUG_ON(desc & GENMASK_ULL(63, 32));
 236
 237         /*
 238          * The following 32bits are copied into the OA reports (dword 2).
 239          * Consider updating oa_get_render_ctx_id in i915_perf.c when changing
 240          * anything below.
 241          */
 242         if (INTEL_GEN(ctx->i915) >= 11) {
 243                 GEM_BUG_ON(ctx->hw_id >= BIT(GEN11_SW_CTX_ID_WIDTH));
 244                 desc |= (u64)ctx->hw_id << GEN11_SW_CTX_ID_SHIFT;
 245                                                                 /* bits 37-47 */
 246
 247                 desc |= (u64)engine->instance << GEN11_ENGINE_INSTANCE_SHIFT;
 248                                                                 /* bits 48-53 */
 249
 250                 /* TODO: decide what to do with SW counter (bits 55-60) */
 251
 252                 desc |= (u64)engine->class << GEN11_ENGINE_CLASS_SHIFT;
 253                                                                 /* bits 61-63 */
 254         } else {
 255                 GEM_BUG_ON(ctx->hw_id >= BIT(GEN8_CTX_ID_WIDTH));
 256                 desc |= (u64)ctx->hw_id << GEN8_CTX_ID_SHIFT;   /* bits 32-52 */
 257         }
 258
             /* Cache the finished descriptor on the per-engine context. */
 259         ce->lrc_desc = desc;
 260 }
 261
     /*
      * lookup_priolist - find or create the request list for one priority
      * @engine: engine whose execlists queue is searched
      * @prio: requested priority level
      *
      * Walks the rb-tree rooted at execlists->queue and returns the node whose
      * priority equals @prio. If there is none, a new node is linked in:
      * I915_PRIORITY_NORMAL uses the embedded execlists->default_priolist, any
      * other level is allocated with GFP_ATOMIC. When that allocation fails,
      * execlists->no_priolist is set, which forces this and every later lookup
      * to I915_PRIORITY_NORMAL.
      *
      * Return: the priolist for the (possibly demoted) priority; never NULL.
      */
 262 static struct i915_priolist *
 263 lookup_priolist(struct intel_engine_cs *engine, int prio)
 264 {
 265         struct intel_engine_execlists * const execlists = &engine->execlists;
 266         struct i915_priolist *p;
 267         struct rb_node **parent, *rb;
             bool first;
 269
 270         if (unlikely(execlists->no_priolist))
 271                 prio = I915_PRIORITY_NORMAL;
 272
 273 find_priolist:
 274         /* most positive priority is scheduled first, equal priorities fifo */
             /*
              * Track "leftmost" afresh on every walk. The walk is repeated with
              * a different priority after an allocation failure, and a stale
              * 'false' from the first pass would make rb_insert_color_cached()
              * skip updating the cached leftmost node for a node that really
              * is the new leftmost one.
              */
             first = true;
 275         rb = NULL;
 276         parent = &execlists->queue.rb_root.rb_node;
 277         while (*parent) {
 278                 rb = *parent;
 279                 p = to_priolist(rb);
 280                 if (prio > p->priority) {
 281                         parent = &rb->rb_left;
 282                 } else if (prio < p->priority) {
 283                         parent = &rb->rb_right;
                             /* Something sorts before us: not the leftmost. */
 284                         first = false;
 285                 } else {
 286                         return p;
 287                 }
 288         }
 289
 290         if (prio == I915_PRIORITY_NORMAL) {
 291                 p = &execlists->default_priolist;
 292         } else {
 293                 p = kmem_cache_alloc(engine->i915->priorities, GFP_ATOMIC);
 294                 /* Convert an allocation failure to a priority bump */
 295                 if (unlikely(!p)) {
 296                         prio = I915_PRIORITY_NORMAL; /* recurses just once */
 297
 298                         /* To maintain ordering with all rendering, after an
 299                          * allocation failure we have to disable all scheduling.
 300                          * Requests will then be executed in fifo, and schedule
 301                          * will ensure that dependencies are emitted in fifo.
 302                          * There will be still some reordering with existing
 303                          * requests, so if userspace lied about their
 304                          * dependencies that reordering may be visible.
 305                          */
 306                         execlists->no_priolist = true;
 307                         goto find_priolist;
 308                 }
 309         }
 310
 311         p->priority = prio;
 312         INIT_LIST_HEAD(&p->requests);
 313         rb_link_node(&p->node, rb, parent);
 314         rb_insert_color_cached(&p->node, &execlists->queue, first);
 315
 316         return p;
 317 }
 318
     /*
      * Rewind the tail of @rq: rq->tail becomes rq->wa_tail minus
      * WA_TAIL_BYTES, wrapped to the ring by intel_ring_wrap().
      */
 319 static void unwind_wa_tail(struct i915_request *rq)
 320 {
 321         rq->tail = intel_ring_wrap(rq->ring, rq->wa_tail - WA_TAIL_BYTES);
             /* Sanity-check the new tail against the ring. */
 322         assert_ring_tail_valid(rq->ring, rq->tail);
 323 }
 324
     /*
      * Move every not yet completed request on @engine's timeline back onto
      * the priority queue.
      *
      * The timeline is walked from its newest entry backwards and the walk
      * stops at the first request that has completed. Each request visited is
      * unsubmitted, has its tail rewound by unwind_wa_tail() and is added to
      * the head of the priolist matching its priority.
      *
      * The caller must hold engine->timeline.lock.
      */
 325 static void __unwind_incomplete_requests(struct intel_engine_cs *engine)
 326 {
 327         struct i915_request *rq, *rn;
             /*
              * 'p' is assigned on the first iteration: last_prio starts out as
              * I915_PRIORITY_INVALID, which no request may have (asserted below).
              */
 328         struct i915_priolist *uninitialized_var(p);
 329         int last_prio = I915_PRIORITY_INVALID;
 330
 331         lockdep_assert_held(&engine->timeline.lock);
 332
 333         list_for_each_entry_safe_reverse(rq, rn,
 334                                          &engine->timeline.requests,
 335                                          link) {
 336                 if (i915_request_completed(rq))
 337                         return;
 338
 339                 __i915_request_unsubmit(rq);
 340                 unwind_wa_tail(rq);
 341
 342                 GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
                     /* Only look the list up again when the priority changes. */
 343                 if (rq_prio(rq) != last_prio) {
 344                         last_prio = rq_prio(rq);
 345                         p = lookup_priolist(engine, last_prio);
 346                 }
 347
 348                 GEM_BUG_ON(p->priority != rq_prio(rq));
                     /*
                      * list_add() inserts at the head; combined with the reverse
                      * walk this keeps the requests in their timeline order.
                      */
 349                 list_add(&rq->sched.link, &p->requests);
 350         }
 351 }
 352
     /*
      * Locked entry point for __unwind_incomplete_requests(): resolves the
      * engine that embeds @execlists and runs the unwind while holding the
      * engine's timeline lock, taken with spin_lock_irqsave().
      */
 353 void
 354 execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
 355 {
             struct intel_engine_cs *engine;
             unsigned long irqflags;

             engine = container_of(execlists, struct intel_engine_cs, execlists);

             spin_lock_irqsave(&engine->timeline.lock, irqflags);
             __unwind_incomplete_requests(engine);
             spin_unlock_irqrestore(&engine->timeline.lock, irqflags);
 365 }
 366
     /*
      * Report a context status change for @rq to the engine's
      * context_status_notifier chain, passing @status as the event and @rq as
      * the payload.
      *
      * The chain is only used when GVT-g is built in; with
      * CONFIG_DRM_I915_GVT disabled the condition is constant false, so the
      * compiler should drop the call as dead code.
      */
 367 static inline void
 368 execlists_context_status_change(struct i915_request *rq, unsigned long status)
 369 {
             if (IS_ENABLED(CONFIG_DRM_I915_GVT))
                     atomic_notifier_call_chain(&rq->engine->context_status_notifier,
                                                status, rq);
 379 }
 380
     /*
      * Flag @execlists as EXECLISTS_ACTIVE_USER via
      * execlists_set_active_once(). @port is accepted but not used here.
      */
 381 inline void
 382 execlists_user_begin(struct intel_engine_execlists *execlists,
 383                      const struct execlist_port *port)
 384 {
 385         execlists_set_active_once(execlists, EXECLISTS_ACTIVE_USER);
 386 }
 387
     /* Clear the EXECLISTS_ACTIVE_USER flag on @execlists. */
 388 inline void
 389 execlists_user_end(struct intel_engine_execlists *execlists)
 390 {
 391         execlists_clear_active(execlists, EXECLISTS_ACTIVE_USER);
 392 }
 393
     /*
      * Bookkeeping for @rq being scheduled in: emit the
      * INTEL_CONTEXT_SCHEDULE_IN status change first, then call
      * intel_engine_context_in() for the request's engine.
      */
 394 static inline void
 395 execlists_context_schedule_in(struct i915_request *rq)
 396 {
 397         execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
 398         intel_engine_context_in(rq->engine);
 399 }
 400
     /*
      * Counterpart of execlists_context_schedule_in(), in mirrored order:
      * call intel_engine_context_out() for the request's engine, emit the
      * status change with the caller supplied @status, then record the
      * i915_request_out trace event.
      */
 401 static inline void
 402 execlists_context_schedule_out(struct i915_request *rq, unsigned long status)
 403 {
 404         intel_engine_context_out(rq->engine);
 405         execlists_context_status_change(rq, status);
 406         trace_i915_request_out(rq);
 407 }
 408
     /*
      * Apply ASSIGN_CTX_PDP() for PDP indices 3 down to 0 of @ppgtt to the
      * register state @reg_state.
      * NOTE(review): ASSIGN_CTX_PDP is defined outside this file section;
      * presumably it stores each page-directory address into the context
      * image - confirm against its definition.
      */
 409 static void
 410 execlists_update_context_pdps(struct i915_hw_ppgtt *ppgtt, u32 *reg_state)
 411 {
 412         ASSIGN_CTX_PDP(ppgtt, reg_state, 3);
 413         ASSIGN_CTX_PDP(ppgtt, reg_state, 2);
 414         ASSIGN_CTX_PDP(ppgtt, reg_state, 1);
 415         ASSIGN_CTX_PDP(ppgtt, reg_state, 0);
 416 }
 417
     /*
      * Refresh the register state of the context backing @rq before it is
      * submitted.
      *
      * Stores the value returned by intel_ring_set_tail() for rq->tail in the
      * CTX_RING_TAIL + 1 slot of the context's lrc_reg_state and, for a ppgtt
      * that is not 48-bit, rewrites the PDP entries.
      *
      * Return: the cached descriptor of the context (ce->lrc_desc).
      */
 418 static u64 execlists_update_context(struct i915_request *rq)
 419 {
 420         struct intel_context *ce = rq->hw_context;
             /* Use the context's own ppgtt, else fall back to the aliasing one. */
 421         struct i915_hw_ppgtt *ppgtt =
 422                 rq->gem_context->ppgtt ?: rq->i915->mm.aliasing_ppgtt;
 423         u32 *reg_state = ce->lrc_reg_state;
 424
 425         reg_state[CTX_RING_TAIL+1] = intel_ring_set_tail(rq->ring, rq->tail);
 426
 427         /* True 32b PPGTT with dynamic page allocation: update PDP
 428          * registers and point the unallocated PDPs to scratch page.
 429          * PML4 is allocated during ppgtt init, so this is not needed
 430          * in 48-bit mode.
 431          */
 432         if (ppgtt && !i915_vm_is_48bit(&ppgtt->vm))
 433                 execlists_update_context_pdps(ppgtt, reg_state);
 434
 435         return ce->lrc_desc;
 436 }
 437
     /*
      * Write the 64-bit descriptor @desc for submission slot @port.
      *
      * With a control register (execlists->ctrl_reg set) each port has its own
      * pair of dwords at submit_reg + port * 2: the lower dword is written
      * first, then the upper one. Without a control register both halves go to
      * the single submit_reg, upper dword first, and @port is not used.
      */
 438 static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
 439 {
 440         if (execlists->ctrl_reg) {
 441                 writel(lower_32_bits(desc), execlists->submit_reg + port * 2);
 442                 writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1);
 443         } else {
 444                 writel(upper_32_bits(desc), execlists->submit_reg);
 445                 writel(lower_32_bits(desc), execlists->submit_reg);
 446         }
 447 }
 448
     /*
      * Write the current contents of all execlist ports of @engine to the
      * hardware.
      *
      * Ports are visited from the highest index down to 0. For an occupied
      * port the submission count is incremented, the context image is
      * refreshed and the resulting descriptor is written; an empty port is
      * written as 0. Afterwards the submit queue is loaded if a control
      * register exists, and EXECLISTS_ACTIVE_HWACK is cleared.
      */
 449 static void execlists_submit_ports(struct intel_engine_cs *engine)
 450 {
 451         struct intel_engine_execlists *execlists = &engine->execlists;
 452         struct execlist_port *port = execlists->port;
 453         unsigned int n;
 454
 455         /*
 456          * We can skip acquiring intel_runtime_pm_get() here as it was taken
 457          * on our behalf by the request (see i915_gem_mark_busy()) and it will
 458          * not be relinquished until the device is idle (see
 459          * i915_gem_idle_work_handler()). As a precaution, we make sure
 460          * that all ELSP are drained i.e. we have processed the CSB,
 461          * before allowing ourselves to idle and calling intel_runtime_pm_put().
 462          */
 463         GEM_BUG_ON(!engine->i915->gt.awake);
 464
 465         /*
 466          * ELSQ note: the submit queue is not cleared after being submitted
 467          * to the HW so we need to make sure we always clean it up. This is
 468          * currently ensured by the fact that we always write the same number
 469          * of elsq entries, keep this in mind before changing the loop below.
 470          */
 471         for (n = execlists_num_ports(execlists); n--; ) {
 472                 struct i915_request *rq;
 473                 unsigned int count;
 474                 u64 desc;
 475
 476                 rq = port_unpack(&port[n], &count);
 477                 if (rq) {
                             /*
                              * Only port 0 may already have been submitted, and
                              * then at most once; every other port must be new.
                              */
 478                         GEM_BUG_ON(count > !n);
                             /* First submission of this request: schedule it in. */
 479                         if (!count++)
 480                                 execlists_context_schedule_in(rq);
 481                         port_set(&port[n], port_pack(rq, count));
 482                         desc = execlists_update_context(rq);
 483                         GEM_DEBUG_EXEC(port[n].context_id = upper_32_bits(desc));
 484
 485                         GEM_TRACE("%s in[%d]:  ctx=%d.%d, global=%d (fence %llx:%d) (current %d), prio=%d\n",
 486                                   engine->name, n,
 487                                   port[n].context_id, count,
 488                                   rq->global_seqno,
 489                                   rq->fence.context, rq->fence.seqno,
 490                                   intel_engine_get_seqno(engine),
 491                                   rq_prio(rq));
 492                 } else {
                             /* Port 0 must never be empty when submitting. */
 493                         GEM_BUG_ON(!n);
 494                         desc = 0;
 495                 }
 496
 497                 write_desc(execlists, desc, n);
 498         }
 499
 500         /* we need to manually load the submit queue */
 501         if (execlists->ctrl_reg)
 502                 writel(EL_CTRL_LOAD, execlists->ctrl_reg);
 503
 504         execlists_clear_active(execlists, EXECLISTS_ACTIVE_HWACK);
 505 }
 506
     /*
      * Tell whether the context @ce must be submitted on its own. This can
      * only be true when GVT-g support is built in and the owning GEM context
      * is marked for forced single submission.
      */
 507 static bool ctx_single_port_submission(const struct intel_context *ce)
 508 {
             if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
                     return false;

             return i915_gem_context_force_single_submission(ce->gem_context);
 511 }
 512
     /*
      * Two requests can share a port only if they belong to the very same
      * context (@prev == @next) and that context is not restricted to single
      * port submission.
      */
 513 static bool can_merge_ctx(const struct intel_context *prev,
 514                           const struct intel_context *next)
 515 {
             return prev == next && !ctx_single_port_submission(prev);
 523 }
 524
     /*
      * Make @rq the request tracked by @port.
      *
      * @rq must differ from the request already in the port. If the port is
      * set, i915_request_put() is called on its current request; the port is
      * then repacked with the result of i915_request_get(@rq) and its existing
      * count.
      */
 525 static void port_assign(struct execlist_port *port, struct i915_request *rq)
 526 {
 527         GEM_BUG_ON(rq == port_request(port));
 528
 529         if (port_isset(port))
 530                 i915_request_put(port_request(port));
 531
             /* port_count() is read before port_set() stores the new value. */
 532         port_set(port, port_pack(i915_request_get(rq), port_count(port)));
 533 }
 534
     /*
      * Submit the driver's preempt context on @engine.
      *
      * Every port except port 0 is written as 0 and port 0 receives the
      * descriptor of the engine's instance of i915->preempt_context. The
      * submit queue is then loaded if a control register exists,
      * EXECLISTS_ACTIVE_HWACK is cleared and EXECLISTS_ACTIVE_PREEMPT is set.
      */
 535 static void inject_preempt_context(struct intel_engine_cs *engine)
 536 {
 537         struct intel_engine_execlists *execlists = &engine->execlists;
 538         struct intel_context *ce =
 539                 to_intel_context(engine->i915->preempt_context, engine);
 540         unsigned int n;
 541
             /* The expected completion status is the descriptor's upper dword. */
 542         GEM_BUG_ON(execlists->preempt_complete_status !=
 543                    upper_32_bits(ce->lrc_desc));
             /* The preempt context must inhibit both context restore and save. */
 544         GEM_BUG_ON((ce->lrc_reg_state[CTX_CONTEXT_CONTROL + 1] &
 545                     _MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT |
 546                                        CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT)) !=
 547                    _MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT |
 548                                       CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT));
 549
 550         /*
 551          * Switch to our empty preempt context so
 552          * the state of the GPU is known (idle).
 553          */
 554         GEM_TRACE("%s\n", engine->name);
             /* Pre-decrement: clears ports num_ports-1 .. 1 and exits with n == 0. */
 555         for (n = execlists_num_ports(execlists); --n; )
 556                 write_desc(execlists, 0, n);
 557
 558         write_desc(execlists, ce->lrc_desc, n);
 559
 560         /* we need to manually load the submit queue */
 561         if (execlists->ctrl_reg)
 562                 writel(EL_CTRL_LOAD, execlists->ctrl_reg);
 563
 564         execlists_clear_active(execlists, EXECLISTS_ACTIVE_HWACK);
 565         execlists_set_active(execlists, EXECLISTS_ACTIVE_PREEMPT);
 566 }
 567
     /*
      * Finish a preemption previously started on @execlists, which must have
      * EXECLISTS_ACTIVE_PREEMPT set. Unless inject_preempt_hang() asks to stop
      * here, the port requests are cancelled and all incomplete requests of
      * the owning engine are moved back to the priority queue.
      */
 568 static void complete_preempt_context(struct intel_engine_execlists *execlists)
 569 {
             struct intel_engine_cs *engine =
                     container_of(execlists, struct intel_engine_cs, execlists);

 570         GEM_BUG_ON(!execlists_is_active(execlists, EXECLISTS_ACTIVE_PREEMPT));
 571
 572         if (inject_preempt_hang(execlists))
 573                 return;
 574
 575         execlists_cancel_port_requests(execlists);
             __unwind_incomplete_requests(engine);
 579 }
 580
 581 static void execlists_dequeue(struct intel_engine_cs *engine)
 582 {
 583         struct intel_engine_execlists * const execlists = &engine->execlists;
 584         struct execlist_port *port = execlists->port;
 585         const struct execlist_port * const last_port =
 586                 &execlists->port[execlists->port_mask];
 587         struct i915_request *last = port_request(port);
 588         struct rb_node *rb;
 589         bool submit = false;
 590
 591         /*
 592          * Hardware submission is through 2 ports. Conceptually each port
 593          * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
 594          * static for a context, and unique to each, so we only execute
 595          * requests belonging to a single context from each ring. RING_HEAD
 596          * is maintained by the CS in the context image, it marks the place
 597          * where it got up to last time, and through RING_TAIL we tell the CS
 598          * where we want to execute up to this time.
 599          *
 600          * In this list the requests are in order of execution. Consecutive
 601          * requests from the same context are adjacent in the ringbuffer. We
 602          * can combine these requests into a single RING_TAIL update:
 603          *
 604          *              RING_HEAD...req1...req2
 605          *                                    ^- RING_TAIL
 606          * since to execute req2 the CS must first execute req1.
 607          *
 608          * Our goal then is to point each port to the end of a consecutive
 609          * sequence of requests as being the most optimal (fewest wake ups
 610          * and context switches) submission.
 611          */
 612
 613         if (last) {
 614                 /*
 615                  * Don't resubmit or switch until all outstanding
 616                  * preemptions (lite-restore) are seen. Then we
 617                  * know the next preemption status we see corresponds
 618                  * to this ELSP update.
 619                  */
 620                 GEM_BUG_ON(!execlists_is_active(execlists,
 621                                                 EXECLISTS_ACTIVE_USER));
 622                 GEM_BUG_ON(!port_count(&port[0]));
 623
 624                 /*
 625                  * If we write to ELSP a second time before the HW has had
 626                  * a chance to respond to the previous write, we can confuse
 627                  * the HW and hit "undefined behaviour". After writing to ELSP,
 628                  * we must then wait until we see a context-switch event from
 629                  * the HW to indicate that it has had a chance to respond.
 630                  */
 631                 if (!execlists_is_active(execlists, EXECLISTS_ACTIVE_HWACK))
 632                         return;
 633
 634                 if (need_preempt(engine, last, execlists->queue_priority)) {
 635                         inject_preempt_context(engine);
 636                         return;
 637                 }
 638
 639                 /*
 640                  * In theory, we could coalesce more requests onto
 641                  * the second port (the first port is active, with
 642                  * no preemptions pending). However, that means we
 643                  * then have to deal with the possible lite-restore
 644                  * of the second port (as we submit the ELSP, there
 645                  * may be a context-switch) but also we may complete
 646                  * the resubmission before the context-switch. Ergo,
 647                  * coalescing onto the second port will cause a
 648                  * preemption event, but we cannot predict whether
 649                  * that will affect port[0] or port[1].
 650                  *
 651                  * If the second port is already active, we can wait
 652                  * until the next context-switch before contemplating
 653                  * new requests. The GPU will be busy and we should be
 654                  * able to resubmit the new ELSP before it idles,
 655                  * avoiding pipeline bubbles (momentary pauses where
 656                  * the driver is unable to keep up the supply of new
 657                  * work). However, we have to double check that the
 658                  * priorities of the ports haven't been switch.
 659                  */
 660                 if (port_count(&port[1]))
 661                         return;
 662
 663                 /*
 664                  * WaIdleLiteRestore:bdw,skl
 665                  * Apply the wa NOOPs to prevent
 666                  * ring:HEAD == rq:TAIL as we resubmit the
 667                  * request. See gen8_emit_breadcrumb() for
 668                  * where we prepare the padding after the
 669                  * end of the request.
 670                  */
 671                 last->tail = last->wa_tail;
 672         }
 673
 674         while ((rb = rb_first_cached(&execlists->queue))) {
 675                 struct i915_priolist *p = to_priolist(rb);
 676                 struct i915_request *rq, *rn;
 677
 678                 list_for_each_entry_safe(rq, rn, &p->requests, sched.link) {
 679                         /*
 680                          * Can we combine this request with the current port?
 681                          * It has to be the same context/ringbuffer and not
 682                          * have any exceptions (e.g. GVT saying never to
 683                          * combine contexts).
 684                          *
 685                          * If we can combine the requests, we can execute both
 686                          * by updating the RING_TAIL to point to the end of the
 687                          * second request, and so we never need to tell the
 688                          * hardware about the first.
 689                          */
 690                         if (last &&
 691                             !can_merge_ctx(rq->hw_context, last->hw_context)) {
 692                                 /*
 693                                  * If we are on the second port and cannot
 694                                  * combine this request with the last, then we
 695                                  * are done.
 696                                  */
 697                                 if (port == last_port) {
 698                                         __list_del_many(&p->requests,
 699                                                         &rq->sched.link);
 700                                         goto done;
 701                                 }
 702
 703                                 /*
 704                                  * If GVT overrides us we only ever submit
 705                                  * port[0], leaving port[1] empty. Note that we
 706                                  * also have to be careful that we don't queue
 707                                  * the same context (even though a different
 708                                  * request) to the second port.
 709                                  */
 710                                 if (ctx_single_port_submission(last->hw_context) ||
 711                                     ctx_single_port_submission(rq->hw_context)) {
 712                                         __list_del_many(&p->requests,
 713                                                         &rq->sched.link);
 714                                         goto done;
 715                                 }
 716
 717                                 GEM_BUG_ON(last->hw_context == rq->hw_context);
 718
 719                                 if (submit)
 720                                         port_assign(port, last);
 721                                 port++;
 722
 723                                 GEM_BUG_ON(port_isset(port));
 724                         }
 725
 726                         INIT_LIST_HEAD(&rq->sched.link);
 727                         __i915_request_submit(rq);
 728                         trace_i915_request_in(rq, port_index(port, execlists));
 729                         last = rq;
 730                         submit = true;
 731                 }
 732
 733                 rb_erase_cached(&p->node, &execlists->queue);
 734                 INIT_LIST_HEAD(&p->requests);
 735                 if (p->priority != I915_PRIORITY_NORMAL)
 736                         kmem_cache_free(engine->i915->priorities, p);
 737         }
 738
 739 done:
 740         /*
 741          * Here be a bit of magic! Or sleight-of-hand, whichever you prefer.
 742          *
 743          * We choose queue_priority such that if we add a request of greater
 744          * priority than this, we kick the submission tasklet to decide on
 745          * the right order of submitting the requests to hardware. We must
 746          * also be prepared to reorder requests as they are in-flight on the
 747          * HW. We derive the queue_priority then as the first "hole" in
 748          * the HW submission ports and if there are no available slots,
 749          * the priority of the lowest executing request, i.e. last.
 750          *
 751          * When we do receive a higher priority request ready to run from the
 752          * user, see queue_request(), the queue_priority is bumped to that
 753          * request triggering preemption on the next dequeue (or subsequent
 754          * interrupt for secondary ports).
 755          */
 756         execlists->queue_priority =
 757                 port != execlists->port ? rq_prio(last) : INT_MIN;
 758
 759         if (submit) {
 760                 port_assign(port, last);
 761                 execlists_submit_ports(engine);
 762         }
 763
 764         /* We must always keep the beast fed if we have work piled up */
 765         GEM_BUG_ON(rb_first_cached(&execlists->queue) &&
 766                    !port_isset(execlists->port));
 767
 768         /* Re-evaluate the executing context setup after each preemptive kick */
 769         if (last)
 770                 execlists_user_begin(execlists, execlists->port);
 771
 772         /* If the engine is now idle, so should be the flag; and vice versa. */
 773         GEM_BUG_ON(execlists_is_active(&engine->execlists,
 774                                        EXECLISTS_ACTIVE_USER) ==
 775                    !port_isset(engine->execlists.port));
 776 }
 777
 778 void
 779 execlists_cancel_port_requests(struct intel_engine_execlists * const execlists)
 780 {
 781         struct execlist_port *port = execlists->port;
 782         unsigned int num_ports = execlists_num_ports(execlists);
 783
 784         while (num_ports-- && port_isset(port)) {
 785                 struct i915_request *rq = port_request(port);
 786
 787                 GEM_TRACE("%s:port%u global=%d (fence %llx:%d), (current %d)\n",
 788                           rq->engine->name,
 789                           (unsigned int)(port - execlists->port),
 790                           rq->global_seqno,
 791                           rq->fence.context, rq->fence.seqno,
 792                           intel_engine_get_seqno(rq->engine));
 793
 794                 GEM_BUG_ON(!execlists->active);
 795                 execlists_context_schedule_out(rq,
 796                                                i915_request_completed(rq) ?
 797                                                INTEL_CONTEXT_SCHEDULE_OUT :
 798                                                INTEL_CONTEXT_SCHEDULE_PREEMPTED);
 799
 800                 i915_request_put(rq);
 801
 802                 memset(port, 0, sizeof(*port));
 803                 port++;
 804         }
 805
 806         execlists_clear_all_active(execlists);
 807 }
 808
 809 static void reset_csb_pointers(struct intel_engine_execlists *execlists)
 810 {
 811         /*
 812          * After a reset, the HW starts writing into CSB entry [0]. We
 813          * therefore have to set our HEAD pointer back one entry so that
 814          * the *first* entry we check is entry 0. To complicate this further,
 815          * as we don't wait for the first interrupt after reset, we have to
 816          * fake the HW write to point back to the last entry so that our
 817          * inline comparison of our cached head position against the last HW
 818          * write works even before the first interrupt.
 819          */
 820         execlists->csb_head = execlists->csb_write_reset;
 821         WRITE_ONCE(*execlists->csb_write, execlists->csb_write_reset);
 822 }
 823
 824 static void nop_submission_tasklet(unsigned long data)
 825 {
 826         /* The driver is wedged; don't process any more events. */
 827 }
 828
 829 static void execlists_cancel_requests(struct intel_engine_cs *engine)
 830 {
 831         struct intel_engine_execlists * const execlists = &engine->execlists;
 832         struct i915_request *rq, *rn;
 833         struct rb_node *rb;
 834         unsigned long flags;
 835
 836         GEM_TRACE("%s current %d\n",
 837                   engine->name, intel_engine_get_seqno(engine));
 838
 839         /*
 840          * Before we call engine->cancel_requests(), we should have exclusive
 841          * access to the submission state. This is arranged for us by the
 842          * caller disabling the interrupt generation, the tasklet and other
 843          * threads that may then access the same state, giving us a free hand
 844          * to reset state. However, we still need to let lockdep be aware that
 845          * we know this state may be accessed in hardirq context, so we
 846          * disable the irq around this manipulation and we want to keep
 847          * the spinlock focused on its duties and not accidentally conflate
 848          * coverage to the submission's irq state. (Similarly, although we
 849          * shouldn't need to disable irq around the manipulation of the
 850          * submission's irq state, we also wish to remind ourselves that
 851          * it is irq state.)
 852          */
 853         spin_lock_irqsave(&engine->timeline.lock, flags);
 854
 855         /* Cancel the requests on the HW and clear the ELSP tracker. */
 856         execlists_cancel_port_requests(execlists);
 857         execlists_user_end(execlists);
 858
 859         /* Mark all executing requests as skipped. */
 860         list_for_each_entry(rq, &engine->timeline.requests, link) {
 861                 GEM_BUG_ON(!rq->global_seqno);
 862                 if (!i915_request_completed(rq))
 863                         dma_fence_set_error(&rq->fence, -EIO);
 864         }
 865
 866         /* Flush the queued requests to the timeline list (for retiring). */
 867         while ((rb = rb_first_cached(&execlists->queue))) {
 868                 struct i915_priolist *p = to_priolist(rb);
 869
 870                 list_for_each_entry_safe(rq, rn, &p->requests, sched.link) {
 871                         INIT_LIST_HEAD(&rq->sched.link);
 872
 873                         dma_fence_set_error(&rq->fence, -EIO);
 874                         __i915_request_submit(rq);
 875                 }
 876
 877                 rb_erase_cached(&p->node, &execlists->queue);
 878                 INIT_LIST_HEAD(&p->requests);
 879                 if (p->priority != I915_PRIORITY_NORMAL)
 880                         kmem_cache_free(engine->i915->priorities, p);
 881         }
 882
 883         /* Remaining _unready_ requests will be nop'ed when submitted */
 884
 885         execlists->queue_priority = INT_MIN;
 886         execlists->queue = RB_ROOT_CACHED;
 887         GEM_BUG_ON(port_isset(execlists->port));
 888
 889         GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet));
 890         execlists->tasklet.func = nop_submission_tasklet;
 891
 892         spin_unlock_irqrestore(&engine->timeline.lock, flags);
 893 }
 894
 895 static inline bool
 896 reset_in_progress(const struct intel_engine_execlists *execlists)
 897 {
 898         return unlikely(!__tasklet_is_enabled(&execlists->tasklet));
 899 }
 900
 901 static void process_csb(struct intel_engine_cs *engine)
 902 {
 903         struct intel_engine_execlists * const execlists = &engine->execlists;
 904         struct execlist_port *port = execlists->port;
 905         const u32 * const buf = execlists->csb_status;
 906         u8 head, tail;
 907
 908         /*
 909          * Note that csb_write, csb_status may be either in HWSP or mmio.
 910          * When reading from the csb_write mmio register, we have to be
 911          * careful to only use the GEN8_CSB_WRITE_PTR portion, which is
 912          * the low 4bits. As it happens we know the next 4bits are always
 913          * zero and so we can simply masked off the low u8 of the register
 914          * and treat it identically to reading from the HWSP (without having
 915          * to use explicit shifting and masking, and probably bifurcating
 916          * the code to handle the legacy mmio read).
 917          */
 918         head = execlists->csb_head;
 919         tail = READ_ONCE(*execlists->csb_write);
 920         GEM_TRACE("%s cs-irq head=%d, tail=%d\n", engine->name, head, tail);
 921         if (unlikely(head == tail))
 922                 return;
 923
 924         /*
 925          * Hopefully paired with a wmb() in HW!
 926          *
 927          * We must complete the read of the write pointer before any reads
 928          * from the CSB, so that we do not see stale values. Without an rmb
 929          * (lfence) the HW may speculatively perform the CSB[] reads *before*
 930          * we perform the READ_ONCE(*csb_write).
 931          */
 932         rmb();
 933
 934         do {
 935                 struct i915_request *rq;
 936                 unsigned int status;
 937                 unsigned int count;
 938
 939                 if (++head == GEN8_CSB_ENTRIES)
 940                         head = 0;
 941
 942                 /*
 943                  * We are flying near dragons again.
 944                  *
 945                  * We hold a reference to the request in execlist_port[]
 946                  * but no more than that. We are operating in softirq
 947                  * context and so cannot hold any mutex or sleep. That
 948                  * prevents us stopping the requests we are processing
 949                  * in port[] from being retired simultaneously (the
 950                  * breadcrumb will be complete before we see the
 951                  * context-switch). As we only hold the reference to the
 952                  * request, any pointer chasing underneath the request
 953                  * is subject to a potential use-after-free. Thus we
 954                  * store all of the bookkeeping within port[] as
 955                  * required, and avoid using unguarded pointers beneath
 956                  * request itself. The same applies to the atomic
 957                  * status notifier.
 958                  */
 959
 960                 GEM_TRACE("%s csb[%d]: status=0x%08x:0x%08x, active=0x%x\n",
 961                           engine->name, head,
 962                           buf[2 * head + 0], buf[2 * head + 1],
 963                           execlists->active);
 964
 965                 status = buf[2 * head];
 966                 if (status & (GEN8_CTX_STATUS_IDLE_ACTIVE |
 967                               GEN8_CTX_STATUS_PREEMPTED))
 968                         execlists_set_active(execlists,
 969                                              EXECLISTS_ACTIVE_HWACK);
 970                 if (status & GEN8_CTX_STATUS_ACTIVE_IDLE)
 971                         execlists_clear_active(execlists,
 972                                                EXECLISTS_ACTIVE_HWACK);
 973
 974                 if (!(status & GEN8_CTX_STATUS_COMPLETED_MASK))
 975                         continue;
 976
 977                 /* We should never get a COMPLETED | IDLE_ACTIVE! */
 978                 GEM_BUG_ON(status & GEN8_CTX_STATUS_IDLE_ACTIVE);
 979
 980                 if (status & GEN8_CTX_STATUS_COMPLETE &&
 981                     buf[2*head + 1] == execlists->preempt_complete_status) {
 982                         GEM_TRACE("%s preempt-idle\n", engine->name);
 983                         complete_preempt_context(execlists);
 984                         continue;
 985                 }
 986
 987                 if (status & GEN8_CTX_STATUS_PREEMPTED &&
 988                     execlists_is_active(execlists,
 989                                         EXECLISTS_ACTIVE_PREEMPT))
 990                         continue;
 991
 992                 GEM_BUG_ON(!execlists_is_active(execlists,
 993                                                 EXECLISTS_ACTIVE_USER));
 994
 995                 rq = port_unpack(port, &count);
 996                 GEM_TRACE("%s out[0]: ctx=%d.%d, global=%d (fence %llx:%d) (current %d), prio=%d\n",
 997                           engine->name,
 998                           port->context_id, count,
 999                           rq ? rq->global_seqno : 0,
1000                           rq ? rq->fence.context : 0,
1001                           rq ? rq->fence.seqno : 0,
1002                           intel_engine_get_seqno(engine),
1003                           rq ? rq_prio(rq) : 0);
1004
1005                 /* Check the context/desc id for this event matches */
1006                 GEM_DEBUG_BUG_ON(buf[2 * head + 1] != port->context_id);
1007
1008                 GEM_BUG_ON(count == 0);
1009                 if (--count == 0) {
1010                         /*
1011                          * On the final event corresponding to the
1012                          * submission of this context, we expect either
1013                          * an element-switch event or a completion
1014                          * event (and on completion, the active-idle
1015                          * marker). No more preemptions, lite-restore
1016                          * or otherwise.
1017                          */
1018                         GEM_BUG_ON(status & GEN8_CTX_STATUS_PREEMPTED);
1019                         GEM_BUG_ON(port_isset(&port[1]) &&
1020                                    !(status & GEN8_CTX_STATUS_ELEMENT_SWITCH));
1021                         GEM_BUG_ON(!port_isset(&port[1]) &&
1022                                    !(status & GEN8_CTX_STATUS_ACTIVE_IDLE));
1023
1024                         /*
1025                          * We rely on the hardware being strongly
1026                          * ordered, that the breadcrumb write is
1027                          * coherent (visible from the CPU) before the
1028                          * user interrupt and CSB is processed.
1029                          */
1030                         GEM_BUG_ON(!i915_request_completed(rq));
1031
1032                         execlists_context_schedule_out(rq,
1033                                                        INTEL_CONTEXT_SCHEDULE_OUT);
1034                         i915_request_put(rq);
1035
1036                         GEM_TRACE("%s completed ctx=%d\n",
1037                                   engine->name, port->context_id);
1038
1039                         port = execlists_port_complete(execlists, port);
1040                         if (port_isset(port))
1041                                 execlists_user_begin(execlists, port);
1042                         else
1043                                 execlists_user_end(execlists);
1044                 } else {
1045                         port_set(port, port_pack(rq, count));
1046                 }
1047         } while (head != tail);
1048
1049         execlists->csb_head = head;
1050 }
1051
1052 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
1053 {
1054         lockdep_assert_held(&engine->timeline.lock);
1055
1056         process_csb(engine);
1057         if (!execlists_is_active(&engine->execlists, EXECLISTS_ACTIVE_PREEMPT))
1058                 execlists_dequeue(engine);
1059 }
1060
1061 /*
1062  * Check the unread Context Status Buffers and manage the submission of new
1063  * contexts to the ELSP accordingly.
1064  */
1065 static void execlists_submission_tasklet(unsigned long data)
1066 {
1067         struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
1068         unsigned long flags;
1069
1070         GEM_TRACE("%s awake?=%d, active=%x\n",
1071                   engine->name,
1072                   engine->i915->gt.awake,
1073                   engine->execlists.active);
1074
1075         spin_lock_irqsave(&engine->timeline.lock, flags);
1076         __execlists_submission_tasklet(engine);
1077         spin_unlock_irqrestore(&engine->timeline.lock, flags);
1078 }
1079
1080 static void queue_request(struct intel_engine_cs *engine,
1081                           struct i915_sched_node *node,
1082                           int prio)
1083 {
1084         list_add_tail(&node->link,
1085                       &lookup_priolist(engine, prio)->requests);
1086 }
1087
1088 static void __update_queue(struct intel_engine_cs *engine, int prio)
1089 {
1090         engine->execlists.queue_priority = prio;
1091 }
1092
1093 static void __submit_queue_imm(struct intel_engine_cs *engine)
1094 {
1095         struct intel_engine_execlists * const execlists = &engine->execlists;
1096
1097         if (reset_in_progress(execlists))
1098                 return; /* defer until we restart the engine following reset */
1099
1100         if (execlists->tasklet.func == execlists_submission_tasklet)
1101                 __execlists_submission_tasklet(engine);
1102         else
1103                 tasklet_hi_schedule(&execlists->tasklet);
1104 }
1105
1106 static void submit_queue(struct intel_engine_cs *engine, int prio)
1107 {
1108         if (prio > engine->execlists.queue_priority) {
1109                 __update_queue(engine, prio);
1110                 __submit_queue_imm(engine);
1111         }
1112 }
1113
1114 static void execlists_submit_request(struct i915_request *request)
1115 {
1116         struct intel_engine_cs *engine = request->engine;
1117         unsigned long flags;
1118
1119         /* Will be called from irq-context when using foreign fences. */
1120         spin_lock_irqsave(&engine->timeline.lock, flags);
1121
1122         queue_request(engine, &request->sched, rq_prio(request));
1123
1124         GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
1125         GEM_BUG_ON(list_empty(&request->sched.link));
1126
1127         submit_queue(engine, rq_prio(request));
1128
1129         spin_unlock_irqrestore(&engine->timeline.lock, flags);
1130 }
1131
1132 static struct i915_request *sched_to_request(struct i915_sched_node *node)
1133 {
1134         return container_of(node, struct i915_request, sched);
1135 }
1136
1137 static struct intel_engine_cs *
1138 sched_lock_engine(struct i915_sched_node *node, struct intel_engine_cs *locked)
1139 {
1140         struct intel_engine_cs *engine = sched_to_request(node)->engine;
1141
1142         GEM_BUG_ON(!locked);
1143
1144         if (engine != locked) {
1145                 spin_unlock(&locked->timeline.lock);
1146                 spin_lock(&engine->timeline.lock);
1147         }
1148
1149         return engine;
1150 }
1151
1152 static void execlists_schedule(struct i915_request *request,
1153                                const struct i915_sched_attr *attr)
1154 {
1155         struct i915_priolist *uninitialized_var(pl);
1156         struct intel_engine_cs *engine, *last;
1157         struct i915_dependency *dep, *p;
1158         struct i915_dependency stack;
1159         const int prio = attr->priority;
1160         LIST_HEAD(dfs);
1161
1162         GEM_BUG_ON(prio == I915_PRIORITY_INVALID);
1163
1164         if (i915_request_completed(request))
1165                 return;
1166
1167         if (prio <= READ_ONCE(request->sched.attr.priority))
1168                 return;
1169
1170         /* Need BKL in order to use the temporary link inside i915_dependency */
1171         lockdep_assert_held(&request->i915->drm.struct_mutex);
1172
1173         stack.signaler = &request->sched;
1174         list_add(&stack.dfs_link, &dfs);
1175
1176         /*
1177          * Recursively bump all dependent priorities to match the new request.
1178          *
1179          * A naive approach would be to use recursion:
1180          * static void update_priorities(struct i915_sched_node *node, prio) {
1181          *      list_for_each_entry(dep, &node->signalers_list, signal_link)
1182          *              update_priorities(dep->signal, prio)
1183          *      queue_request(node);
1184          * }
1185          * but that may have unlimited recursion depth and so runs a very
1186          * real risk of overunning the kernel stack. Instead, we build
1187          * a flat list of all dependencies starting with the current request.
1188          * As we walk the list of dependencies, we add all of its dependencies
1189          * to the end of the list (this may include an already visited
1190          * request) and continue to walk onwards onto the new dependencies. The
1191          * end result is a topological list of requests in reverse order, the
1192          * last element in the list is the request we must execute first.
1193          */
1194         list_for_each_entry(dep, &dfs, dfs_link) {
1195                 struct i915_sched_node *node = dep->signaler;
1196
1197                 /*
1198                  * Within an engine, there can be no cycle, but we may
1199                  * refer to the same dependency chain multiple times
1200                  * (redundant dependencies are not eliminated) and across
1201                  * engines.
1202                  */
1203                 list_for_each_entry(p, &node->signalers_list, signal_link) {
1204                         GEM_BUG_ON(p == dep); /* no cycles! */
1205
1206                         if (i915_sched_node_signaled(p->signaler))
1207                                 continue;
1208
1209                         GEM_BUG_ON(p->signaler->attr.priority < node->attr.priority);
1210                         if (prio > READ_ONCE(p->signaler->attr.priority))
1211                                 list_move_tail(&p->dfs_link, &dfs);
1212                 }
1213         }
1214
1215         /*
1216          * If we didn't need to bump any existing priorities, and we haven't
1217          * yet submitted this request (i.e. there is no potential race with
1218          * execlists_submit_request()), we can set our own priority and skip
1219          * acquiring the engine locks.
1220          */
1221         if (request->sched.attr.priority == I915_PRIORITY_INVALID) {
1222                 GEM_BUG_ON(!list_empty(&request->sched.link));
1223                 request->sched.attr = *attr;
1224                 if (stack.dfs_link.next == stack.dfs_link.prev)
1225                         return;
1226                 __list_del_entry(&stack.dfs_link);
1227         }
1228
1229         last = NULL;
1230         engine = request->engine;
1231         spin_lock_irq(&engine->timeline.lock);
1232
1233         /* Fifo and depth-first replacement ensure our deps execute before us */
1234         list_for_each_entry_safe_reverse(dep, p, &dfs, dfs_link) {
1235                 struct i915_sched_node *node = dep->signaler;
1236
1237                 INIT_LIST_HEAD(&dep->dfs_link);
1238
1239                 engine = sched_lock_engine(node, engine);
1240
1241                 if (prio <= node->attr.priority)
1242                         continue;
1243
1244                 node->attr.priority = prio;
1245                 if (!list_empty(&node->link)) {
1246                         if (last != engine) {
1247                                 pl = lookup_priolist(engine, prio);
1248                                 last = engine;
1249                         }
1250                         GEM_BUG_ON(pl->priority != prio);
1251                         list_move_tail(&node->link, &pl->requests);
1252                 }
1253
1254                 if (prio > engine->execlists.queue_priority &&
1255                     i915_sw_fence_done(&sched_to_request(node)->submit)) {
1256                         /* defer submission until after all of our updates */
1257                         __update_queue(engine, prio);
1258                         tasklet_hi_schedule(&engine->execlists.tasklet);
1259                 }
1260         }
1261
1262         spin_unlock_irq(&engine->timeline.lock);
1263 }
1264
1265 static void execlists_context_destroy(struct intel_context *ce)
1266 {
1267         GEM_BUG_ON(ce->pin_count);
1268
1269         if (!ce->state)
1270                 return;
1271
1272         intel_ring_free(ce->ring);
1273
1274         GEM_BUG_ON(i915_gem_object_is_active(ce->state->obj));
1275         i915_gem_object_put(ce->state->obj);
1276 }
1277
1278 static void execlists_context_unpin(struct intel_context *ce)
1279 {
1280         intel_ring_unpin(ce->ring);
1281
1282         ce->state->obj->pin_global--;
1283         i915_gem_object_unpin_map(ce->state->obj);
1284         i915_vma_unpin(ce->state);
1285
1286         i915_gem_context_put(ce->gem_context);
1287 }
1288
1289 static int __context_pin(struct i915_gem_context *ctx, struct i915_vma *vma)
1290 {
1291         unsigned int flags;
1292         int err;
1293
1294         /*
1295          * Clear this page out of any CPU caches for coherent swap-in/out.
1296          * We only want to do this on the first bind so that we do not stall
1297          * on an active context (which by nature is already on the GPU).
1298          */
1299         if (!(vma->flags & I915_VMA_GLOBAL_BIND)) {
1300                 err = i915_gem_object_set_to_gtt_domain(vma->obj, true);
1301                 if (err)
1302                         return err;
1303         }
1304
1305         flags = PIN_GLOBAL | PIN_HIGH;
1306         if (ctx->ggtt_offset_bias)
1307                 flags |= PIN_OFFSET_BIAS | ctx->ggtt_offset_bias;
1308
1309         return i915_vma_pin(vma, 0, 0, flags);
1310 }
1311
1312 static struct intel_context *
1313 __execlists_context_pin(struct intel_engine_cs *engine,
1314                         struct i915_gem_context *ctx,
1315                         struct intel_context *ce)
1316 {
1317         void *vaddr;
1318         int ret;
1319
1320         ret = execlists_context_deferred_alloc(ctx, engine, ce);
1321         if (ret)
1322                 goto err;
1323         GEM_BUG_ON(!ce->state);
1324
1325         ret = __context_pin(ctx, ce->state);
1326         if (ret)
1327                 goto err;
1328
1329         vaddr = i915_gem_object_pin_map(ce->state->obj, I915_MAP_WB);
1330         if (IS_ERR(vaddr)) {
1331                 ret = PTR_ERR(vaddr);
1332                 goto unpin_vma;
1333         }
1334
1335         ret = intel_ring_pin(ce->ring, ctx->i915, ctx->ggtt_offset_bias);
1336         if (ret)
1337                 goto unpin_map;
1338
1339         intel_lr_context_descriptor_update(ctx, engine, ce);
1340
1341         ce->lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE;
1342         ce->lrc_reg_state[CTX_RING_BUFFER_START+1] =
1343                 i915_ggtt_offset(ce->ring->vma);
1344         GEM_BUG_ON(!intel_ring_offset_valid(ce->ring, ce->ring->head));
1345         ce->lrc_reg_state[CTX_RING_HEAD+1] = ce->ring->head;
1346
1347         ce->state->obj->pin_global++;
1348         i915_gem_context_get(ctx);
1349         return ce;
1350
1351 unpin_map:
1352         i915_gem_object_unpin_map(ce->state->obj);
1353 unpin_vma:
1354         __i915_vma_unpin(ce->state);
1355 err:
1356         ce->pin_count = 0;
1357         return ERR_PTR(ret);
1358 }
1359
1360 static const struct intel_context_ops execlists_context_ops = {
1361         .unpin = execlists_context_unpin,
1362         .destroy = execlists_context_destroy,
1363 };
1364
1365 static struct intel_context *
1366 execlists_context_pin(struct intel_engine_cs *engine,
1367                       struct i915_gem_context *ctx)
1368 {
1369         struct intel_context *ce = to_intel_context(ctx, engine);
1370
1371         lockdep_assert_held(&ctx->i915->drm.struct_mutex);
1372
1373         if (likely(ce->pin_count++))
1374                 return ce;
1375         GEM_BUG_ON(!ce->pin_count); /* no overflow please! */
1376
1377         ce->ops = &execlists_context_ops;
1378
1379         return __execlists_context_pin(engine, ctx, ce);
1380 }
1381
1382 static int execlists_request_alloc(struct i915_request *request)
1383 {
1384         int ret;
1385
1386         GEM_BUG_ON(!request->hw_context->pin_count);
1387
1388         /* Flush enough space to reduce the likelihood of waiting after
1389          * we start building the request - in which case we will just
1390          * have to repeat work.
1391          */
1392         request->reserved_space += EXECLISTS_REQUEST_SIZE;
1393
1394         ret = intel_ring_wait_for_space(request->ring, request->reserved_space);
1395         if (ret)
1396                 return ret;
1397
1398         /* Note that after this point, we have committed to using
1399          * this request as it is being used to both track the
1400          * state of engine initialisation and liveness of the
1401          * golden renderstate above. Think twice before you try
1402          * to cancel/unwind this request now.
1403          */
1404
1405         request->reserved_space -= EXECLISTS_REQUEST_SIZE;
1406         return 0;
1407 }
1408
1409 /*
1410  * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
1411  * PIPE_CONTROL instruction. This is required for the flush to happen correctly
1412  * but there is a slight complication as this is applied in WA batch where the
1413  * values are only initialized once so we cannot take register value at the
1414  * beginning and reuse it further; hence we save its value to memory, upload a
1415  * constant value with bit21 set and then we restore it back with the saved value.
1416  * To simplify the WA, a constant value is formed by using the default value
1417  * of this register. This shouldn't be a problem because we are only modifying
1418  * it for a short period and this batch in non-premptible. We can ofcourse
1419  * use additional instructions that read the actual value of the register
1420  * at that time and set our bit of interest but it makes the WA complicated.
1421  *
1422  * This WA is also required for Gen9 so extracting as a function avoids
1423  * code duplication.
1424  */
1425 static u32 *
1426 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
1427 {
1428         *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1429         *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1430         *batch++ = i915_ggtt_offset(engine->scratch) + 256;
1431         *batch++ = 0;
1432
1433         *batch++ = MI_LOAD_REGISTER_IMM(1);
1434         *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1435         *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
1436
1437         batch = gen8_emit_pipe_control(batch,
1438                                        PIPE_CONTROL_CS_STALL |
1439                                        PIPE_CONTROL_DC_FLUSH_ENABLE,
1440                                        0);
1441
1442         *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1443         *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1444         *batch++ = i915_ggtt_offset(engine->scratch) + 256;
1445         *batch++ = 0;
1446
1447         return batch;
1448 }
1449
1450 /*
1451  * Typically we only have one indirect_ctx and per_ctx batch buffer which are
1452  * initialized at the beginning and shared across all contexts but this field
1453  * helps us to have multiple batches at different offsets and select them based
1454  * on a criteria. At the moment this batch always start at the beginning of the page
1455  * and at this point we don't have multiple wa_ctx batch buffers.
1456  *
1457  * The number of WA applied are not known at the beginning; we use this field
1458  * to return the no of DWORDS written.
1459  *
1460  * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
1461  * so it adds NOOPs as padding to make it cacheline aligned.
1462  * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together
1463  * makes a complete batch buffer.
1464  */
1465 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1466 {
1467         /* WaDisableCtxRestoreArbitration:bdw,chv */
1468         *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1469
1470         /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
1471         if (IS_BROADWELL(engine->i915))
1472                 batch = gen8_emit_flush_coherentl3_wa(engine, batch);
1473
1474         /* WaClearSlmSpaceAtContextSwitch:bdw,chv */
1475         /* Actual scratch location is at 128 bytes offset */
1476         batch = gen8_emit_pipe_control(batch,
1477                                        PIPE_CONTROL_FLUSH_L3 |
1478                                        PIPE_CONTROL_GLOBAL_GTT_IVB |
1479                                        PIPE_CONTROL_CS_STALL |
1480                                        PIPE_CONTROL_QW_WRITE,
1481                                        i915_ggtt_offset(engine->scratch) +
1482                                        2 * CACHELINE_BYTES);
1483
1484         *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1485
1486         /* Pad to end of cacheline */
1487         while ((unsigned long)batch % CACHELINE_BYTES)
1488                 *batch++ = MI_NOOP;
1489
1490         /*
1491          * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
1492          * execution depends on the length specified in terms of cache lines
1493          * in the register CTX_RCS_INDIRECT_CTX
1494          */
1495
1496         return batch;
1497 }
1498
/* One register/value pair, consumed by emit_lri(). */
struct lri {
        /* Register to load; emit_lri() emits its i915_mmio_reg_offset(). */
        i915_reg_t reg;
        /* Dword emitted right after the register offset. */
        u32 value;
};
1503
1504 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
1505 {
1506         GEM_BUG_ON(!count || count > 63);
1507
1508         *batch++ = MI_LOAD_REGISTER_IMM(count);
1509         do {
1510                 *batch++ = i915_mmio_reg_offset(lri->reg);
1511                 *batch++ = lri->value;
1512         } while (lri++, --count);
1513         *batch++ = MI_NOOP;
1514
1515         return batch;
1516 }
1517
1518 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1519 {
1520         static const struct lri lri[] = {
1521                 /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
1522                 {
1523                         COMMON_SLICE_CHICKEN2,
1524                         __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
1525                                        0),
1526                 },
1527
1528                 /* BSpec: 11391 */
1529                 {
1530                         FF_SLICE_CHICKEN,
1531                         __MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
1532                                        FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
1533                 },
1534
1535                 /* BSpec: 11299 */
1536                 {
1537                         _3D_CHICKEN3,
1538                         __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
1539                                        _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
1540                 }
1541         };
1542
1543         *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1544
1545         /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
1546         batch = gen8_emit_flush_coherentl3_wa(engine, batch);
1547
1548         batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
1549
1550         /* WaClearSlmSpaceAtContextSwitch:kbl */
1551         /* Actual scratch location is at 128 bytes offset */
1552         if (IS_KBL_REVID(engine->i915, 0, KBL_REVID_A0)) {
1553                 batch = gen8_emit_pipe_control(batch,
1554                                                PIPE_CONTROL_FLUSH_L3 |
1555                                                PIPE_CONTROL_GLOBAL_GTT_IVB |
1556                                                PIPE_CONTROL_CS_STALL |
1557                                                PIPE_CONTROL_QW_WRITE,
1558                                                i915_ggtt_offset(engine->scratch)
1559                                                + 2 * CACHELINE_BYTES);
1560         }
1561
1562         /* WaMediaPoolStateCmdInWABB:bxt,glk */
1563         if (HAS_POOLED_EU(engine->i915)) {
1564                 /*
1565                  * EU pool configuration is setup along with golden context
1566                  * during context initialization. This value depends on
1567                  * device type (2x6 or 3x6) and needs to be updated based
1568                  * on which subslice is disabled especially for 2x6
1569                  * devices, however it is safe to load default
1570                  * configuration of 3x6 device instead of masking off
1571                  * corresponding bits because HW ignores bits of a disabled
1572                  * subslice and drops down to appropriate config. Please
1573                  * see render_state_setup() in i915_gem_render_state.c for
1574                  * possible configurations, to avoid duplication they are
1575                  * not shown here again.
1576                  */
1577                 *batch++ = GEN9_MEDIA_POOL_STATE;
1578                 *batch++ = GEN9_MEDIA_POOL_ENABLE;
1579                 *batch++ = 0x00777000;
1580                 *batch++ = 0;
1581                 *batch++ = 0;
1582                 *batch++ = 0;
1583         }
1584
1585         *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1586
1587         /* Pad to end of cacheline */
1588         while ((unsigned long)batch % CACHELINE_BYTES)
1589                 *batch++ = MI_NOOP;
1590
1591         return batch;
1592 }
1593
1594 static u32 *
1595 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1596 {
1597         int i;
1598
1599         /*
1600          * WaPipeControlBefore3DStateSamplePattern: cnl
1601          *
1602          * Ensure the engine is idle prior to programming a
1603          * 3DSTATE_SAMPLE_PATTERN during a context restore.
1604          */
1605         batch = gen8_emit_pipe_control(batch,
1606                                        PIPE_CONTROL_CS_STALL,
1607                                        0);
1608         /*
1609          * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
1610          * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
1611          * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
1612          * confusing. Since gen8_emit_pipe_control() already advances the
1613          * batch by 6 dwords, we advance the other 10 here, completing a
1614          * cacheline. It's not clear if the workaround requires this padding
1615          * before other commands, or if it's just the regular padding we would
1616          * already have for the workaround bb, so leave it here for now.
1617          */
1618         for (i = 0; i < 10; i++)
1619                 *batch++ = MI_NOOP;
1620
1621         /* Pad to end of cacheline */
1622         while ((unsigned long)batch % CACHELINE_BYTES)
1623                 *batch++ = MI_NOOP;
1624
1625         return batch;
1626 }
1627
1628 #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
1629
1630 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
1631 {
1632         struct drm_i915_gem_object *obj;
1633         struct i915_vma *vma;
1634         int err;
1635
1636         obj = i915_gem_object_create(engine->i915, CTX_WA_BB_OBJ_SIZE);
1637         if (IS_ERR(obj))
1638                 return PTR_ERR(obj);
1639
1640         vma = i915_vma_instance(obj, &engine->i915->ggtt.vm, NULL);
1641         if (IS_ERR(vma)) {
1642                 err = PTR_ERR(vma);
1643                 goto err;
1644         }
1645
1646         err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL | PIN_HIGH);
1647         if (err)
1648                 goto err;
1649
1650         engine->wa_ctx.vma = vma;
1651         return 0;
1652
1653 err:
1654         i915_gem_object_put(obj);
1655         return err;
1656 }
1657
1658 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine)
1659 {
1660         i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
1661 }
1662
1663 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
1664
1665 static int intel_init_workaround_bb(struct intel_engine_cs *engine)
1666 {
1667         struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
1668         struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx,
1669                                             &wa_ctx->per_ctx };
1670         wa_bb_func_t wa_bb_fn[2];
1671         struct page *page;
1672         void *batch, *batch_ptr;
1673         unsigned int i;
1674         int ret;
1675
1676         if (GEM_WARN_ON(engine->id != RCS))
1677                 return -EINVAL;
1678
1679         switch (INTEL_GEN(engine->i915)) {
1680         case 11:
1681                 return 0;
1682         case 10:
1683                 wa_bb_fn[0] = gen10_init_indirectctx_bb;
1684                 wa_bb_fn[1] = NULL;
1685                 break;
1686         case 9:
1687                 wa_bb_fn[0] = gen9_init_indirectctx_bb;
1688                 wa_bb_fn[1] = NULL;
1689                 break;
1690         case 8:
1691                 wa_bb_fn[0] = gen8_init_indirectctx_bb;
1692                 wa_bb_fn[1] = NULL;
1693                 break;
1694         default:
1695                 MISSING_CASE(INTEL_GEN(engine->i915));
1696                 return 0;
1697         }
1698
1699         ret = lrc_setup_wa_ctx(engine);
1700         if (ret) {
1701                 DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret);
1702                 return ret;
1703         }
1704
1705         page = i915_gem_object_get_dirty_page(wa_ctx->vma->obj, 0);
1706         batch = batch_ptr = kmap_atomic(page);
1707
1708         /*
1709          * Emit the two workaround batch buffers, recording the offset from the
1710          * start of the workaround batch buffer object for each and their
1711          * respective sizes.
1712          */
1713         for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
1714                 wa_bb[i]->offset = batch_ptr - batch;
1715                 if (GEM_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
1716                                             CACHELINE_BYTES))) {
1717                         ret = -EINVAL;
1718                         break;
1719                 }
1720                 if (wa_bb_fn[i])
1721                         batch_ptr = wa_bb_fn[i](engine, batch_ptr);
1722                 wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
1723         }
1724
1725         BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
1726
1727         kunmap_atomic(batch);
1728         if (ret)
1729                 lrc_destroy_wa_ctx(engine);
1730
1731         return ret;
1732 }
1733
1734 static void enable_execlists(struct intel_engine_cs *engine)
1735 {
1736         struct drm_i915_private *dev_priv = engine->i915;
1737
1738         I915_WRITE(RING_HWSTAM(engine->mmio_base), 0xffffffff);
1739
1740         /*
1741          * Make sure we're not enabling the new 12-deep CSB
1742          * FIFO as that requires a slightly updated handling
1743          * in the ctx switch irq. Since we're currently only
1744          * using only 2 elements of the enhanced execlists the
1745          * deeper FIFO it's not needed and it's not worth adding
1746          * more statements to the irq handler to support it.
1747          */
1748         if (INTEL_GEN(dev_priv) >= 11)
1749                 I915_WRITE(RING_MODE_GEN7(engine),
1750                            _MASKED_BIT_DISABLE(GEN11_GFX_DISABLE_LEGACY_MODE));
1751         else
1752                 I915_WRITE(RING_MODE_GEN7(engine),
1753                            _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE));
1754
1755         I915_WRITE(RING_MI_MODE(engine->mmio_base),
1756                    _MASKED_BIT_DISABLE(STOP_RING));
1757
1758         I915_WRITE(RING_HWS_PGA(engine->mmio_base),
1759                    engine->status_page.ggtt_offset);
1760         POSTING_READ(RING_HWS_PGA(engine->mmio_base));
1761 }
1762
1763 static bool unexpected_starting_state(struct intel_engine_cs *engine)
1764 {
1765         struct drm_i915_private *dev_priv = engine->i915;
1766         bool unexpected = false;
1767
1768         if (I915_READ(RING_MI_MODE(engine->mmio_base)) & STOP_RING) {
1769                 DRM_DEBUG_DRIVER("STOP_RING still set in RING_MI_MODE\n");
1770                 unexpected = true;
1771         }
1772
1773         return unexpected;
1774 }
1775
1776 static int gen8_init_common_ring(struct intel_engine_cs *engine)
1777 {
1778         int ret;
1779
1780         ret = intel_mocs_init_engine(engine);
1781         if (ret)
1782                 return ret;
1783
1784         intel_engine_reset_breadcrumbs(engine);
1785
1786         if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) {
1787                 struct drm_printer p = drm_debug_printer(__func__);
1788
1789                 intel_engine_dump(engine, &p, NULL);
1790         }
1791
1792         enable_execlists(engine);
1793
1794         return 0;
1795 }
1796
1797 static int gen8_init_render_ring(struct intel_engine_cs *engine)
1798 {
1799         struct drm_i915_private *dev_priv = engine->i915;
1800         int ret;
1801
1802         ret = gen8_init_common_ring(engine);
1803         if (ret)
1804                 return ret;
1805
1806         intel_whitelist_workarounds_apply(engine);
1807
1808         /* We need to disable the AsyncFlip performance optimisations in order
1809          * to use MI_WAIT_FOR_EVENT within the CS. It should already be
1810          * programmed to '1' on all products.
1811          *
1812          * WaDisableAsyncFlipPerfMode:snb,ivb,hsw,vlv,bdw,chv
1813          */
1814         I915_WRITE(MI_MODE, _MASKED_BIT_ENABLE(ASYNC_FLIP_PERF_DISABLE));
1815
1816         I915_WRITE(INSTPM, _MASKED_BIT_ENABLE(INSTPM_FORCE_ORDERING));
1817
1818         return 0;
1819 }
1820
/*
 * Render-engine initialisation for Gen9: the common ring setup followed by
 * the whitelist workarounds.
 *
 * Returns 0 on success or the error from gen8_init_common_ring().
 */
static int gen9_init_render_ring(struct intel_engine_cs *engine)
{
        int err = gen8_init_common_ring(engine);

        if (!err)
                intel_whitelist_workarounds_apply(engine);

        return err;
}
1833
/*
 * Quiesce execlists submission ahead of an engine reset and work out which
 * request was executing.
 *
 * The submission tasklet is disabled here and is NOT re-enabled by this
 * function. Under engine->timeline.lock the pending context-switch events
 * are processed, and if port 0 holds a request the command streamer is
 * stopped before the timeline is walked backwards from that request.
 *
 * Returns the earliest request in the run of not-yet-completed requests
 * ending at the one in port 0, or NULL if port 0 is empty or the request in
 * port 0 has already completed.
 */
static struct i915_request *
execlists_reset_prepare(struct intel_engine_cs *engine)
{
        struct intel_engine_execlists * const execlists = &engine->execlists;
        struct i915_request *request, *active;
        unsigned long flags;

        GEM_TRACE("%s\n", engine->name);

        /*
         * Prevent request submission to the hardware until we have
         * completed the reset in i915_gem_reset_finish(). If a request
         * is completed by one engine, it may then queue a request
         * to a second via its execlists->tasklet *just* as we are
         * calling engine->init_hw() and also writing the ELSP.
         * Turning off the execlists->tasklet until the reset is over
         * prevents the race.
         */
        __tasklet_disable_sync_once(&execlists->tasklet);

        spin_lock_irqsave(&engine->timeline.lock, flags);

        /*
         * We want to flush the pending context switches, having disabled
         * the tasklet above, we can assume exclusive access to the execlists.
         * For this allows us to catch up with an inflight preemption event,
         * and avoid blaming an innocent request if the stall was due to the
         * preemption itself.
         */
        process_csb(engine);

        /*
         * The last active request can then be no later than the last request
         * now in ELSP[0]. So search backwards from there, so that if the GPU
         * has advanced beyond the last CSB update, it will be pardoned.
         */
        active = NULL;
        request = port_request(execlists->port);
        if (request) {
                /*
                 * Prevent the breadcrumb from advancing before we decide
                 * which request is currently active.
                 */
                intel_engine_stop_cs(engine);

                /*
                 * Walk towards older requests; stop at the first completed
                 * one, remembering the oldest incomplete request seen.
                 */
                list_for_each_entry_from_reverse(request,
                                                 &engine->timeline.requests,
                                                 link) {
                        if (__i915_request_completed(request,
                                                     request->global_seqno))
                                break;

                        active = request;
                }
        }

        spin_unlock_irqrestore(&engine->timeline.lock, flags);

        return active;
}
1894
/*
 * Bring the execlists bookkeeping back in line after an engine reset.
 *
 * @request: the request considered active at the time of the reset; may be
 *           NULL.
 *
 * Under engine->timeline.lock the port requests are cancelled, incomplete
 * requests are unwound for replay and the CSB pointers are reset. The
 * context image is then only rewritten when @request is non-NULL and its
 * fence error is -EIO (the "guilty" case described below); otherwise the
 * function returns early and leaves the image alone.
 */
static void execlists_reset(struct intel_engine_cs *engine,
                            struct i915_request *request)
{
        struct intel_engine_execlists * const execlists = &engine->execlists;
        unsigned long flags;
        u32 *regs;

        GEM_TRACE("%s request global=%x, current=%d\n",
                  engine->name, request ? request->global_seqno : 0,
                  intel_engine_get_seqno(engine));

        spin_lock_irqsave(&engine->timeline.lock, flags);

        /*
         * Catch up with any missed context-switch interrupts.
         *
         * Ideally we would just read the remaining CSB entries now that we
         * know the gpu is idle. However, the CSB registers are sometimes^W
         * often trashed across a GPU reset! Instead we have to rely on
         * guessing the missed context-switch events by looking at what
         * requests were completed.
         */
        execlists_cancel_port_requests(execlists);

        /* Push back any incomplete requests for replay after the reset. */
        __unwind_incomplete_requests(engine);

        /* Following the reset, we need to reload the CSB read/write pointers */
        reset_csb_pointers(&engine->execlists);

        spin_unlock_irqrestore(&engine->timeline.lock, flags);

        /*
         * If the request was innocent, we leave the request in the ELSP
         * and will try to replay it on restarting. The context image may
         * have been corrupted by the reset, in which case we may have
         * to service a new GPU hang, but more likely we can continue on
         * without impact.
         *
         * If the request was guilty, we presume the context is corrupt
         * and have to at least restore the RING register in the context
         * image back to the expected values to skip over the guilty request.
         */
        if (!request || request->fence.error != -EIO)
                return;

        /*
         * We want a simple context + ring to execute the breadcrumb update.
         * We cannot rely on the context being intact across the GPU hang,
         * so clear it and rebuild just what we need for the breadcrumb.
         * All pending requests for this context will be zapped, and any
         * future request will be after userspace has had the opportunity
         * to recreate its own state.
         */
        regs = request->hw_context->lrc_reg_state;
        /* Without a pinned default state, only the register init below runs. */
        if (engine->pinned_default_state) {
                memcpy(regs, /* skip restoring the vanilla PPHWSP */
                       engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
                       engine->context_size - PAGE_SIZE);
        }
        execlists_init_reg_state(regs,
                                 request->gem_context, engine, request->ring);

        /* Move the RING_HEAD onto the breadcrumb, past the hanging batch */
        regs[CTX_RING_BUFFER_START + 1] = i915_ggtt_offset(request->ring->vma);

        /* The software ring head and the context image must agree. */
        request->ring->head = intel_ring_wrap(request->ring, request->postfix);
        regs[CTX_RING_HEAD + 1] = request->ring->head;

        intel_ring_update_space(request->ring);

        /* Reset WaIdleLiteRestore:bdw,skl as well */
        unwind_wa_tail(request);
}
1969
1970 static void execlists_reset_finish(struct intel_engine_cs *engine)
1971 {
1972         struct intel_engine_execlists * const execlists = &engine->execlists;
1973
1974         /* After a GPU reset, we may have requests to replay */
1975         if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
1976                 tasklet_schedule(&execlists->tasklet);
1977
1978         /*
1979          * Flush the tasklet while we still have the forcewake to be sure
1980          * that it is not allowed to sleep before we restart and reload a
1981          * context.
1982          *
1983          * As before (with execlists_reset_prepare) we rely on the caller
1984          * serialising multiple attempts to reset so that we know that we
1985          * are the only one manipulating tasklet state.
1986          */
1987         __tasklet_enable_sync_once(&execlists->tasklet);
1988
1989         GEM_TRACE("%s\n", engine->name);
1990 }
1991
1992 static int intel_logical_ring_emit_pdps(struct i915_request *rq)
1993 {
1994         struct i915_hw_ppgtt *ppgtt = rq->gem_context->ppgtt;
1995         struct intel_engine_cs *engine = rq->engine;
1996         const int num_lri_cmds = GEN8_3LVL_PDPES * 2;
1997         u32 *cs;
1998         int i;
1999
2000         cs = intel_ring_begin(rq, num_lri_cmds * 2 + 2);
2001         if (IS_ERR(cs))
2002                 return PTR_ERR(cs);
2003
2004         *cs++ = MI_LOAD_REGISTER_IMM(num_lri_cmds);
2005         for (i = GEN8_3LVL_PDPES - 1; i >= 0; i--) {
2006                 const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i);
2007
2008                 *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(engine, i));
2009                 *cs++ = upper_32_bits(pd_daddr);
2010                 *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(engine, i));
2011                 *cs++ = lower_32_bits(pd_daddr);
2012         }
2013
2014         *cs++ = MI_NOOP;
2015         intel_ring_advance(rq, cs);
2016
2017         return 0;
2018 }
2019
2020 static int gen8_emit_bb_start(struct i915_request *rq,
2021                               u64 offset, u32 len,
2022                               const unsigned int flags)
2023 {
2024         u32 *cs;
2025         int ret;
2026
2027         /* Don't rely in hw updating PDPs, specially in lite-restore.
2028          * Ideally, we should set Force PD Restore in ctx descriptor,
2029          * but we can't. Force Restore would be a second option, but
2030          * it is unsafe in case of lite-restore (because the ctx is
2031          * not idle). PML4 is allocated during ppgtt init so this is
2032          * not needed in 48-bit.*/
2033         if (rq->gem_context->ppgtt &&
2034             (intel_engine_flag(rq->engine) & rq->gem_context->ppgtt->pd_dirty_rings) &&
2035             !i915_vm_is_48bit(&rq->gem_context->ppgtt->vm) &&
2036             !intel_vgpu_active(rq->i915)) {
2037                 ret = intel_logical_ring_emit_pdps(rq);
2038                 if (ret)
2039                         return ret;
2040
2041                 rq->gem_context->ppgtt->pd_dirty_rings &= ~intel_engine_flag(rq->engine);
2042         }
2043
2044         cs = intel_ring_begin(rq, 6);
2045         if (IS_ERR(cs))
2046                 return PTR_ERR(cs);
2047
2048         /*
2049          * WaDisableCtxRestoreArbitration:bdw,chv
2050          *
2051          * We don't need to perform MI_ARB_ENABLE as often as we do (in
2052          * particular all the gen that do not need the w/a at all!), if we
2053          * took care to make sure that on every switch into this context
2054          * (both ordinary and for preemption) that arbitrartion was enabled
2055          * we would be fine. However, there doesn't seem to be a downside to
2056          * being paranoid and making sure it is set before each batch and
2057          * every context-switch.
2058          *
2059          * Note that if we fail to enable arbitration before the request
2060          * is complete, then we do not see the context-switch interrupt and
2061          * the engine hangs (with RING_HEAD == RING_TAIL).
2062          *
2063          * That satisfies both the GPGPU w/a and our heavy-handed paranoia.
2064          */
2065         *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
2066
2067         /* FIXME(BDW): Address space and security selectors. */
2068         *cs++ = MI_BATCH_BUFFER_START_GEN8 |
2069                 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8)) |
2070                 (flags & I915_DISPATCH_RS ? MI_BATCH_RESOURCE_STREAMER : 0);
2071         *cs++ = lower_32_bits(offset);
2072         *cs++ = upper_32_bits(offset);
2073
2074         *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
2075         *cs++ = MI_NOOP;
2076         intel_ring_advance(rq, cs);
2077
2078         return 0;
2079 }
2080
/*
 * Write the complement of (irq_enable_mask | irq_keep_mask) to the engine's
 * interrupt mask via I915_WRITE_IMR(), then issue a posting read of
 * RING_IMR.
 */
static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine)
{
        /* Not referenced by name; presumably used by the I915_* macros. */
        struct drm_i915_private *dev_priv = engine->i915;
        I915_WRITE_IMR(engine,
                       ~(engine->irq_enable_mask | engine->irq_keep_mask));
        POSTING_READ_FW(RING_IMR(engine->mmio_base));
}
2088
/*
 * Write the complement of irq_keep_mask to the engine's interrupt mask via
 * I915_WRITE_IMR(). Unlike the enable path, no posting read follows.
 */
static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
{
        /* Not referenced by name; presumably used by the I915_* macros. */
        struct drm_i915_private *dev_priv = engine->i915;
        I915_WRITE_IMR(engine, ~engine->irq_keep_mask);
}
2094
2095 static int gen8_emit_flush(struct i915_request *request, u32 mode)
2096 {
2097         u32 cmd, *cs;
2098
2099         cs = intel_ring_begin(request, 4);
2100         if (IS_ERR(cs))
2101                 return PTR_ERR(cs);
2102
2103         cmd = MI_FLUSH_DW + 1;
2104
2105         /* We always require a command barrier so that subsequent
2106          * commands, such as breadcrumb interrupts, are strictly ordered
2107          * wrt the contents of the write cache being flushed to memory
2108          * (and thus being coherent from the CPU).
2109          */
2110         cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
2111
2112         if (mode & EMIT_INVALIDATE) {
2113                 cmd |= MI_INVALIDATE_TLB;
2114                 if (request->engine->id == VCS)
2115                         cmd |= MI_INVALIDATE_BSD;
2116         }
2117
2118         *cs++ = cmd;
2119         *cs++ = I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
2120         *cs++ = 0; /* upper addr */
2121         *cs++ = 0; /* value */
2122         intel_ring_advance(request, cs);
2123
2124         return 0;
2125 }
2126
2127 static int gen8_emit_flush_render(struct i915_request *request,
2128                                   u32 mode)
2129 {
2130         struct intel_engine_cs *engine = request->engine;
2131         u32 scratch_addr =
2132                 i915_ggtt_offset(engine->scratch) + 2 * CACHELINE_BYTES;
2133         bool vf_flush_wa = false, dc_flush_wa = false;
2134         u32 *cs, flags = 0;
2135         int len;
2136
2137         flags |= PIPE_CONTROL_CS_STALL;
2138
2139         if (mode & EMIT_FLUSH) {
2140                 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
2141                 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
2142                 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
2143                 flags |= PIPE_CONTROL_FLUSH_ENABLE;
2144         }
2145
2146         if (mode & EMIT_INVALIDATE) {
2147                 flags |= PIPE_CONTROL_TLB_INVALIDATE;
2148                 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
2149                 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
2150                 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
2151                 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
2152                 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
2153                 flags |= PIPE_CONTROL_QW_WRITE;
2154                 flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;
2155
2156                 /*
2157                  * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
2158                  * pipe control.
2159                  */
2160                 if (IS_GEN9(request->i915))
2161                         vf_flush_wa = true;
2162
2163                 /* WaForGAMHang:kbl */
2164                 if (IS_KBL_REVID(request->i915, 0, KBL_REVID_B0))
2165                         dc_flush_wa = true;
2166         }
2167
2168         len = 6;
2169
2170         if (vf_flush_wa)
2171                 len += 6;
2172
2173         if (dc_flush_wa)
2174                 len += 12;
2175
2176         cs = intel_ring_begin(request, len);
2177         if (IS_ERR(cs))
2178                 return PTR_ERR(cs);
2179
2180         if (vf_flush_wa)
2181                 cs = gen8_emit_pipe_control(cs, 0, 0);
2182
2183         if (dc_flush_wa)
2184                 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
2185                                             0);
2186
2187         cs = gen8_emit_pipe_control(cs, flags, scratch_addr);
2188
2189         if (dc_flush_wa)
2190                 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);
2191
2192         intel_ring_advance(request, cs);
2193
2194         return 0;
2195 }
2196
2197 /*
2198  * Reserve space for 2 NOOPs at the end of each request to be
2199  * used as a workaround for not being allowed to do lite
2200  * restore with HEAD==TAIL (WaIdleLiteRestore).
2201  */
2202 static void gen8_emit_wa_tail(struct i915_request *request, u32 *cs)
2203 {
2204         /* Ensure there's always at least one preemption point per-request. */
2205         *cs++ = MI_ARB_CHECK;
2206         *cs++ = MI_NOOP;
2207         request->wa_tail = intel_ring_offset(request, cs);
2208 }
2209
2210 static void gen8_emit_breadcrumb(struct i915_request *request, u32 *cs)
2211 {
2212         /* w/a: bit 5 needs to be zero for MI_FLUSH_DW address. */
2213         BUILD_BUG_ON(I915_GEM_HWS_INDEX_ADDR & (1 << 5));
2214
2215         cs = gen8_emit_ggtt_write(cs, request->global_seqno,
2216                                   intel_hws_seqno_address(request->engine));
2217         *cs++ = MI_USER_INTERRUPT;
2218         *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
2219         request->tail = intel_ring_offset(request, cs);
2220         assert_ring_tail_valid(request->ring, request->tail);
2221
2222         gen8_emit_wa_tail(request, cs);
2223 }
2224 static const int gen8_emit_breadcrumb_sz = 6 + WA_TAIL_DWORDS;
2225
2226 static void gen8_emit_breadcrumb_rcs(struct i915_request *request, u32 *cs)
2227 {
2228         /* We're using qword write, seqno should be aligned to 8 bytes. */
2229         BUILD_BUG_ON(I915_GEM_HWS_INDEX & 1);
2230
2231         cs = gen8_emit_ggtt_write_rcs(cs, request->global_seqno,
2232                                       intel_hws_seqno_address(request->engine));
2233         *cs++ = MI_USER_INTERRUPT;
2234         *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
2235         request->tail = intel_ring_offset(request, cs);
2236         assert_ring_tail_valid(request->ring, request->tail);
2237
2238         gen8_emit_wa_tail(request, cs);
2239 }
2240 static const int gen8_emit_breadcrumb_rcs_sz = 8 + WA_TAIL_DWORDS;
2241
/*
 * Emit the initial render context setup into @rq: context workarounds,
 * MOCS programming and finally the render state. Returns 0 on success or
 * the error from the workaround / render-state emission.
 */
static int gen8_init_rcs_context(struct i915_request *rq)
{
	int err;

	err = intel_ctx_workarounds_emit(rq);
	if (err)
		return err;

	/*
	 * Failing to program the MOCS is non-fatal. The system will not
	 * run at peak performance, so report the error and carry on.
	 */
	if (intel_rcs_context_init_mocs(rq))
		DRM_ERROR("MOCS failed to program: expect performance issues.\n");

	return i915_gem_render_state_emit(rq);
}
2260
2261 /**
2262  * intel_logical_ring_cleanup() - deallocate the Engine Command Streamer
2263  * @engine: Engine Command Streamer.
2264  */
2265 void intel_logical_ring_cleanup(struct intel_engine_cs *engine)
2266 {
2267         struct drm_i915_private *dev_priv;
2268
2269         /*
2270          * Tasklet cannot be active at this point due intel_mark_active/idle
2271          * so this is just for documentation.
2272          */
2273         if (WARN_ON(test_bit(TASKLET_STATE_SCHED,
2274                              &engine->execlists.tasklet.state)))
2275                 tasklet_kill(&engine->execlists.tasklet);
2276
2277         dev_priv = engine->i915;
2278
2279         if (engine->buffer) {
2280                 WARN_ON((I915_READ_MODE(engine) & MODE_IDLE) == 0);
2281         }
2282
2283         if (engine->cleanup)
2284                 engine->cleanup(engine);
2285
2286         intel_engine_cleanup_common(engine);
2287
2288         lrc_destroy_wa_ctx(engine);
2289
2290         engine->i915 = NULL;
2291         dev_priv->engine[engine->id] = NULL;
2292         kfree(engine);
2293 }
2294
2295 void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
2296 {
2297         engine->submit_request = execlists_submit_request;
2298         engine->cancel_requests = execlists_cancel_requests;
2299         engine->schedule = execlists_schedule;
2300         engine->execlists.tasklet.func = execlists_submission_tasklet;
2301
2302         engine->reset.prepare = execlists_reset_prepare;
2303
2304         engine->park = NULL;
2305         engine->unpark = NULL;
2306
2307         engine->flags |= I915_ENGINE_SUPPORTS_STATS;
2308         if (engine->i915->preempt_context)
2309                 engine->flags |= I915_ENGINE_HAS_PREEMPTION;
2310
2311         engine->i915->caps.scheduler =
2312                 I915_SCHEDULER_CAP_ENABLED |
2313                 I915_SCHEDULER_CAP_PRIORITY;
2314         if (intel_engine_has_preemption(engine))
2315                 engine->i915->caps.scheduler |= I915_SCHEDULER_CAP_PREEMPTION;
2316 }
2317
2318 static void
2319 logical_ring_default_vfuncs(struct intel_engine_cs *engine)
2320 {
2321         /* Default vfuncs which can be overriden by each engine. */
2322         engine->init_hw = gen8_init_common_ring;
2323
2324         engine->reset.prepare = execlists_reset_prepare;
2325         engine->reset.reset = execlists_reset;
2326         engine->reset.finish = execlists_reset_finish;
2327
2328         engine->context_pin = execlists_context_pin;
2329         engine->request_alloc = execlists_request_alloc;
2330
2331         engine->emit_flush = gen8_emit_flush;
2332         engine->emit_breadcrumb = gen8_emit_breadcrumb;
2333         engine->emit_breadcrumb_sz = gen8_emit_breadcrumb_sz;
2334
2335         engine->set_default_submission = intel_execlists_set_default_submission;
2336
2337         if (INTEL_GEN(engine->i915) < 11) {
2338                 engine->irq_enable = gen8_logical_ring_enable_irq;
2339                 engine->irq_disable = gen8_logical_ring_disable_irq;
2340         } else {
2341                 /*
2342                  * TODO: On Gen11 interrupt masks need to be clear
2343                  * to allow C6 entry. Keep interrupts enabled at
2344                  * and take the hit of generating extra interrupts
2345                  * until a more refined solution exists.
2346                  */
2347         }
2348         engine->emit_bb_start = gen8_emit_bb_start;
2349 }
2350
2351 static inline void
2352 logical_ring_default_irqs(struct intel_engine_cs *engine)
2353 {
2354         unsigned int shift = 0;
2355
2356         if (INTEL_GEN(engine->i915) < 11) {
2357                 const u8 irq_shifts[] = {
2358                         [RCS]  = GEN8_RCS_IRQ_SHIFT,
2359                         [BCS]  = GEN8_BCS_IRQ_SHIFT,
2360                         [VCS]  = GEN8_VCS1_IRQ_SHIFT,
2361                         [VCS2] = GEN8_VCS2_IRQ_SHIFT,
2362                         [VECS] = GEN8_VECS_IRQ_SHIFT,
2363                 };
2364
2365                 shift = irq_shifts[engine->id];
2366         }
2367
2368         engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
2369         engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
2370 }
2371
2372 static void
2373 logical_ring_setup(struct intel_engine_cs *engine)
2374 {
2375         intel_engine_setup_common(engine);
2376
2377         /* Intentionally left blank. */
2378         engine->buffer = NULL;
2379
2380         tasklet_init(&engine->execlists.tasklet,
2381                      execlists_submission_tasklet, (unsigned long)engine);
2382
2383         logical_ring_default_vfuncs(engine);
2384         logical_ring_default_irqs(engine);
2385 }
2386
2387 static bool csb_force_mmio(struct drm_i915_private *i915)
2388 {
2389         /* Older GVT emulation depends upon intercepting CSB mmio */
2390         return intel_vgpu_active(i915) && !intel_vgpu_has_hwsp_emulation(i915);
2391 }
2392
2393 static int logical_ring_init(struct intel_engine_cs *engine)
2394 {
2395         struct drm_i915_private *i915 = engine->i915;
2396         struct intel_engine_execlists * const execlists = &engine->execlists;
2397         int ret;
2398
2399         ret = intel_engine_init_common(engine);
2400         if (ret)
2401                 goto error;
2402
2403         if (HAS_LOGICAL_RING_ELSQ(i915)) {
2404                 execlists->submit_reg = i915->regs +
2405                         i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(engine));
2406                 execlists->ctrl_reg = i915->regs +
2407                         i915_mmio_reg_offset(RING_EXECLIST_CONTROL(engine));
2408         } else {
2409                 execlists->submit_reg = i915->regs +
2410                         i915_mmio_reg_offset(RING_ELSP(engine));
2411         }
2412
2413         execlists->preempt_complete_status = ~0u;
2414         if (i915->preempt_context) {
2415                 struct intel_context *ce =
2416                         to_intel_context(i915->preempt_context, engine);
2417
2418                 execlists->preempt_complete_status =
2419                         upper_32_bits(ce->lrc_desc);
2420         }
2421
2422         execlists->csb_read =
2423                 i915->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine));
2424         if (csb_force_mmio(i915)) {
2425                 execlists->csb_status = (u32 __force *)
2426                         (i915->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_BUF_LO(engine, 0)));
2427
2428                 execlists->csb_write = (u32 __force *)execlists->csb_read;
2429                 execlists->csb_write_reset =
2430                         _MASKED_FIELD(GEN8_CSB_WRITE_PTR_MASK,
2431                                       GEN8_CSB_ENTRIES - 1);
2432         } else {
2433                 execlists->csb_status =
2434                         &engine->status_page.page_addr[I915_HWS_CSB_BUF0_INDEX];
2435
2436                 execlists->csb_write =
2437                         &engine->status_page.page_addr[intel_hws_csb_write_index(i915)];
2438                 execlists->csb_write_reset = GEN8_CSB_ENTRIES - 1;
2439         }
2440         reset_csb_pointers(execlists);
2441
2442         return 0;
2443
2444 error:
2445         intel_logical_ring_cleanup(engine);
2446         return ret;
2447 }
2448
2449 int logical_render_ring_init(struct intel_engine_cs *engine)
2450 {
2451         struct drm_i915_private *dev_priv = engine->i915;
2452         int ret;
2453
2454         logical_ring_setup(engine);
2455
2456         if (HAS_L3_DPF(dev_priv))
2457                 engine->irq_keep_mask |= GT_RENDER_L3_PARITY_ERROR_INTERRUPT;
2458
2459         /* Override some for render ring. */
2460         if (INTEL_GEN(dev_priv) >= 9)
2461                 engine->init_hw = gen9_init_render_ring;
2462         else
2463                 engine->init_hw = gen8_init_render_ring;
2464         engine->init_context = gen8_init_rcs_context;
2465         engine->emit_flush = gen8_emit_flush_render;
2466         engine->emit_breadcrumb = gen8_emit_breadcrumb_rcs;
2467         engine->emit_breadcrumb_sz = gen8_emit_breadcrumb_rcs_sz;
2468
2469         ret = intel_engine_create_scratch(engine, PAGE_SIZE);
2470         if (ret)
2471                 return ret;
2472
2473         ret = intel_init_workaround_bb(engine);
2474         if (ret) {
2475                 /*
2476                  * We continue even if we fail to initialize WA batch
2477                  * because we only expect rare glitches but nothing
2478                  * critical to prevent us from using GPU
2479                  */
2480                 DRM_ERROR("WA batch buffer initialization failed: %d\n",
2481                           ret);
2482         }
2483
2484         return logical_ring_init(engine);
2485 }
2486
/*
 * Initialise an engine as a logical ring using only the default setup,
 * with no engine-specific overrides. Returns the result of
 * logical_ring_init().
 */
int logical_xcs_ring_init(struct intel_engine_cs *engine)
{
	logical_ring_setup(engine);

	return logical_ring_init(engine);
}
2493
2494 static u32
2495 make_rpcs(struct drm_i915_private *dev_priv)
2496 {
2497         u32 rpcs = 0;
2498
2499         /*
2500          * No explicit RPCS request is needed to ensure full
2501          * slice/subslice/EU enablement prior to Gen9.
2502         */
2503         if (INTEL_GEN(dev_priv) < 9)
2504                 return 0;
2505
2506         /*
2507          * Starting in Gen9, render power gating can leave
2508          * slice/subslice/EU in a partially enabled state. We
2509          * must make an explicit request through RPCS for full
2510          * enablement.
2511         */
2512         if (INTEL_INFO(dev_priv)->sseu.has_slice_pg) {
2513                 rpcs |= GEN8_RPCS_S_CNT_ENABLE;
2514                 rpcs |= hweight8(INTEL_INFO(dev_priv)->sseu.slice_mask) <<
2515                         GEN8_RPCS_S_CNT_SHIFT;
2516                 rpcs |= GEN8_RPCS_ENABLE;
2517         }
2518
2519         if (INTEL_INFO(dev_priv)->sseu.has_subslice_pg) {
2520                 rpcs |= GEN8_RPCS_SS_CNT_ENABLE;
2521                 rpcs |= hweight8(INTEL_INFO(dev_priv)->sseu.subslice_mask[0]) <<
2522                         GEN8_RPCS_SS_CNT_SHIFT;
2523                 rpcs |= GEN8_RPCS_ENABLE;
2524         }
2525
2526         if (INTEL_INFO(dev_priv)->sseu.has_eu_pg) {
2527                 rpcs |= INTEL_INFO(dev_priv)->sseu.eu_per_subslice <<
2528                         GEN8_RPCS_EU_MIN_SHIFT;
2529                 rpcs |= INTEL_INFO(dev_priv)->sseu.eu_per_subslice <<
2530                         GEN8_RPCS_EU_MAX_SHIFT;
2531                 rpcs |= GEN8_RPCS_ENABLE;
2532         }
2533
2534         return rpcs;
2535 }
2536
2537 static u32 intel_lr_indirect_ctx_offset(struct intel_engine_cs *engine)
2538 {
2539         u32 indirect_ctx_offset;
2540
2541         switch (INTEL_GEN(engine->i915)) {
2542         default:
2543                 MISSING_CASE(INTEL_GEN(engine->i915));
2544                 /* fall through */
2545         case 11:
2546                 indirect_ctx_offset =
2547                         GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
2548                 break;
2549         case 10:
2550                 indirect_ctx_offset =
2551                         GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
2552                 break;
2553         case 9:
2554                 indirect_ctx_offset =
2555                         GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
2556                 break;
2557         case 8:
2558                 indirect_ctx_offset =
2559                         GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
2560                 break;
2561         }
2562
2563         return indirect_ctx_offset;
2564 }
2565
2566 static void execlists_init_reg_state(u32 *regs,
2567                                      struct i915_gem_context *ctx,
2568                                      struct intel_engine_cs *engine,
2569                                      struct intel_ring *ring)
2570 {
2571         struct drm_i915_private *dev_priv = engine->i915;
2572         struct i915_hw_ppgtt *ppgtt = ctx->ppgtt ?: dev_priv->mm.aliasing_ppgtt;
2573         u32 base = engine->mmio_base;
2574         bool rcs = engine->class == RENDER_CLASS;
2575
2576         /* A context is actually a big batch buffer with several
2577          * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
2578          * values we are setting here are only for the first context restore:
2579          * on a subsequent save, the GPU will recreate this batchbuffer with new
2580          * values (including all the missing MI_LOAD_REGISTER_IMM commands that
2581          * we are not initializing here).
2582          */
2583         regs[CTX_LRI_HEADER_0] = MI_LOAD_REGISTER_IMM(rcs ? 14 : 11) |
2584                                  MI_LRI_FORCE_POSTED;
2585
2586         CTX_REG(regs, CTX_CONTEXT_CONTROL, RING_CONTEXT_CONTROL(engine),
2587                 _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT |
2588                                     CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT) |
2589                 _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH |
2590                                    (HAS_RESOURCE_STREAMER(dev_priv) ?
2591                                    CTX_CTRL_RS_CTX_ENABLE : 0)));
2592         CTX_REG(regs, CTX_RING_HEAD, RING_HEAD(base), 0);
2593         CTX_REG(regs, CTX_RING_TAIL, RING_TAIL(base), 0);
2594         CTX_REG(regs, CTX_RING_BUFFER_START, RING_START(base), 0);
2595         CTX_REG(regs, CTX_RING_BUFFER_CONTROL, RING_CTL(base),
2596                 RING_CTL_SIZE(ring->size) | RING_VALID);
2597         CTX_REG(regs, CTX_BB_HEAD_U, RING_BBADDR_UDW(base), 0);
2598         CTX_REG(regs, CTX_BB_HEAD_L, RING_BBADDR(base), 0);
2599         CTX_REG(regs, CTX_BB_STATE, RING_BBSTATE(base), RING_BB_PPGTT);
2600         CTX_REG(regs, CTX_SECOND_BB_HEAD_U, RING_SBBADDR_UDW(base), 0);
2601         CTX_REG(regs, CTX_SECOND_BB_HEAD_L, RING_SBBADDR(base), 0);
2602         CTX_REG(regs, CTX_SECOND_BB_STATE, RING_SBBSTATE(base), 0);
2603         if (rcs) {
2604                 struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
2605
2606                 CTX_REG(regs, CTX_RCS_INDIRECT_CTX, RING_INDIRECT_CTX(base), 0);
2607                 CTX_REG(regs, CTX_RCS_INDIRECT_CTX_OFFSET,
2608                         RING_INDIRECT_CTX_OFFSET(base), 0);
2609                 if (wa_ctx->indirect_ctx.size) {
2610                         u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
2611
2612                         regs[CTX_RCS_INDIRECT_CTX + 1] =
2613                                 (ggtt_offset + wa_ctx->indirect_ctx.offset) |
2614                                 (wa_ctx->indirect_ctx.size / CACHELINE_BYTES);
2615
2616                         regs[CTX_RCS_INDIRECT_CTX_OFFSET + 1] =
2617                                 intel_lr_indirect_ctx_offset(engine) << 6;
2618                 }
2619
2620                 CTX_REG(regs, CTX_BB_PER_CTX_PTR, RING_BB_PER_CTX_PTR(base), 0);
2621                 if (wa_ctx->per_ctx.size) {
2622                         u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
2623
2624                         regs[CTX_BB_PER_CTX_PTR + 1] =
2625                                 (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
2626                 }
2627         }
2628
2629         regs[CTX_LRI_HEADER_1] = MI_LOAD_REGISTER_IMM(9) | MI_LRI_FORCE_POSTED;
2630
2631         CTX_REG(regs, CTX_CTX_TIMESTAMP, RING_CTX_TIMESTAMP(base), 0);
2632         /* PDP values well be assigned later if needed */
2633         CTX_REG(regs, CTX_PDP3_UDW, GEN8_RING_PDP_UDW(engine, 3), 0);
2634         CTX_REG(regs, CTX_PDP3_LDW, GEN8_RING_PDP_LDW(engine, 3), 0);
2635         CTX_REG(regs, CTX_PDP2_UDW, GEN8_RING_PDP_UDW(engine, 2), 0);
2636         CTX_REG(regs, CTX_PDP2_LDW, GEN8_RING_PDP_LDW(engine, 2), 0);
2637         CTX_REG(regs, CTX_PDP1_UDW, GEN8_RING_PDP_UDW(engine, 1), 0);
2638         CTX_REG(regs, CTX_PDP1_LDW, GEN8_RING_PDP_LDW(engine, 1), 0);
2639         CTX_REG(regs, CTX_PDP0_UDW, GEN8_RING_PDP_UDW(engine, 0), 0);
2640         CTX_REG(regs, CTX_PDP0_LDW, GEN8_RING_PDP_LDW(engine, 0), 0);
2641
2642         if (ppgtt && i915_vm_is_48bit(&ppgtt->vm)) {
2643                 /* 64b PPGTT (48bit canonical)
2644                  * PDP0_DESCRIPTOR contains the base address to PML4 and
2645                  * other PDP Descriptors are ignored.
2646                  */
2647                 ASSIGN_CTX_PML4(ppgtt, regs);
2648         }
2649
2650         if (rcs) {
2651                 regs[CTX_LRI_HEADER_2] = MI_LOAD_REGISTER_IMM(1);
2652                 CTX_REG(regs, CTX_R_PWR_CLK_STATE, GEN8_R_PWR_CLK_STATE,
2653                         make_rpcs(dev_priv));
2654
2655                 i915_oa_init_reg_state(engine, ctx, regs);
2656         }
2657 }
2658
2659 static int
2660 populate_lr_context(struct i915_gem_context *ctx,
2661                     struct drm_i915_gem_object *ctx_obj,
2662                     struct intel_engine_cs *engine,
2663                     struct intel_ring *ring)
2664 {
2665         void *vaddr;
2666         u32 *regs;
2667         int ret;
2668
2669         ret = i915_gem_object_set_to_cpu_domain(ctx_obj, true);
2670         if (ret) {
2671                 DRM_DEBUG_DRIVER("Could not set to CPU domain\n");
2672                 return ret;
2673         }
2674
2675         vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
2676         if (IS_ERR(vaddr)) {
2677                 ret = PTR_ERR(vaddr);
2678                 DRM_DEBUG_DRIVER("Could not map object pages! (%d)\n", ret);
2679                 return ret;
2680         }
2681         ctx_obj->mm.dirty = true;
2682
2683         if (engine->default_state) {
2684                 /*
2685                  * We only want to copy over the template context state;
2686                  * skipping over the headers reserved for GuC communication,
2687                  * leaving those as zero.
2688                  */
2689                 const unsigned long start = LRC_HEADER_PAGES * PAGE_SIZE;
2690                 void *defaults;
2691
2692                 defaults = i915_gem_object_pin_map(engine->default_state,
2693                                                    I915_MAP_WB);
2694                 if (IS_ERR(defaults)) {
2695                         ret = PTR_ERR(defaults);
2696                         goto err_unpin_ctx;
2697                 }
2698
2699                 memcpy(vaddr + start, defaults + start, engine->context_size);
2700                 i915_gem_object_unpin_map(engine->default_state);
2701         }
2702
2703         /* The second page of the context object contains some fields which must
2704          * be set up prior to the first execution. */
2705         regs = vaddr + LRC_STATE_PN * PAGE_SIZE;
2706         execlists_init_reg_state(regs, ctx, engine, ring);
2707         if (!engine->default_state)
2708                 regs[CTX_CONTEXT_CONTROL + 1] |=
2709                         _MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
2710         if (ctx == ctx->i915->preempt_context && INTEL_GEN(engine->i915) < 11)
2711                 regs[CTX_CONTEXT_CONTROL + 1] |=
2712                         _MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT |
2713                                            CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT);
2714
2715 err_unpin_ctx:
2716         i915_gem_object_unpin_map(ctx_obj);
2717         return ret;
2718 }
2719
2720 static int execlists_context_deferred_alloc(struct i915_gem_context *ctx,
2721                                             struct intel_engine_cs *engine,
2722                                             struct intel_context *ce)
2723 {
2724         struct drm_i915_gem_object *ctx_obj;
2725         struct i915_vma *vma;
2726         uint32_t context_size;
2727         struct intel_ring *ring;
2728         struct i915_timeline *timeline;
2729         int ret;
2730
2731         if (ce->state)
2732                 return 0;
2733
2734         context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
2735
2736         /*
2737          * Before the actual start of the context image, we insert a few pages
2738          * for our own use and for sharing with the GuC.
2739          */
2740         context_size += LRC_HEADER_PAGES * PAGE_SIZE;
2741
2742         ctx_obj = i915_gem_object_create(ctx->i915, context_size);
2743         if (IS_ERR(ctx_obj))
2744                 return PTR_ERR(ctx_obj);
2745
2746         vma = i915_vma_instance(ctx_obj, &ctx->i915->ggtt.vm, NULL);
2747         if (IS_ERR(vma)) {
2748                 ret = PTR_ERR(vma);
2749                 goto error_deref_obj;
2750         }
2751
2752         timeline = i915_timeline_create(ctx->i915, ctx->name);
2753         if (IS_ERR(timeline)) {
2754                 ret = PTR_ERR(timeline);
2755                 goto error_deref_obj;
2756         }
2757
2758         ring = intel_engine_create_ring(engine, timeline, ctx->ring_size);
2759         i915_timeline_put(timeline);
2760         if (IS_ERR(ring)) {
2761                 ret = PTR_ERR(ring);
2762                 goto error_deref_obj;
2763         }
2764
2765         ret = populate_lr_context(ctx, ctx_obj, engine, ring);
2766         if (ret) {
2767                 DRM_DEBUG_DRIVER("Failed to populate LRC: %d\n", ret);
2768                 goto error_ring_free;
2769         }
2770
2771         ce->ring = ring;
2772         ce->state = vma;
2773
2774         return 0;
2775
2776 error_ring_free:
2777         intel_ring_free(ring);
2778 error_deref_obj:
2779         i915_gem_object_put(ctx_obj);
2780         return ret;
2781 }
2782
2783 void intel_lr_context_resume(struct drm_i915_private *dev_priv)
2784 {
2785         struct intel_engine_cs *engine;
2786         struct i915_gem_context *ctx;
2787         enum intel_engine_id id;
2788
2789         /* Because we emit WA_TAIL_DWORDS there may be a disparity
2790          * between our bookkeeping in ce->ring->head and ce->ring->tail and
2791          * that stored in context. As we only write new commands from
2792          * ce->ring->tail onwards, everything before that is junk. If the GPU
2793          * starts reading from its RING_HEAD from the context, it may try to
2794          * execute that junk and die.
2795          *
2796          * So to avoid that we reset the context images upon resume. For
2797          * simplicity, we just zero everything out.
2798          */
2799         list_for_each_entry(ctx, &dev_priv->contexts.list, link) {
2800                 for_each_engine(engine, dev_priv, id) {
2801                         struct intel_context *ce =
2802                                 to_intel_context(ctx, engine);
2803                         u32 *reg;
2804
2805                         if (!ce->state)
2806                                 continue;
2807
2808                         reg = i915_gem_object_pin_map(ce->state->obj,
2809                                                       I915_MAP_WB);
2810                         if (WARN_ON(IS_ERR(reg)))
2811                                 continue;
2812
2813                         reg += LRC_STATE_PN * PAGE_SIZE / sizeof(*reg);
2814                         reg[CTX_RING_HEAD+1] = 0;
2815                         reg[CTX_RING_TAIL+1] = 0;
2816
2817                         ce->state->obj->mm.dirty = true;
2818                         i915_gem_object_unpin_map(ce->state->obj);
2819
2820                         intel_ring_reset(ce->ring, 0);
2821                 }
2822         }
2823 }
2824
2825 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
2826 #include "selftests/intel_lrc.c"
2827 #endif