// SPDX-License-Identifier: MIT
/*
 * Copyright © 2014 Intel Corporation
 */

#include "gen8_engine_cs.h"
#include "i915_drv.h"
#include "intel_lrc.h"
#include "intel_gpu_commands.h"
#include "intel_ring.h"

int gen8_emit_flush_rcs(struct i915_request *rq, u32 mode)
{
	bool vf_flush_wa = false, dc_flush_wa = false;
	u32 *cs, flags = 0;
	int len;

	flags |= PIPE_CONTROL_CS_STALL;

	if (mode & EMIT_FLUSH) {
		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
		flags |= PIPE_CONTROL_FLUSH_ENABLE;
	}

	if (mode & EMIT_INVALIDATE) {
		flags |= PIPE_CONTROL_TLB_INVALIDATE;
		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_QW_WRITE;
		flags |= PIPE_CONTROL_STORE_DATA_INDEX;

		/*
		 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
		 * pipe control.
		 */
		if (GRAPHICS_VER(rq->engine->i915) == 9)
			vf_flush_wa = true;

		/* WaForGAMHang:kbl */
		if (IS_KBL_GT_STEP(rq->engine->i915, 0, STEP_B0))
			dc_flush_wa = true;
	}

	len = 6;

	if (vf_flush_wa)
		len += 6;

	if (dc_flush_wa)
		len += 12;

	cs = intel_ring_begin(rq, len);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	if (vf_flush_wa)
		cs = gen8_emit_pipe_control(cs, 0, 0);

	if (dc_flush_wa)
		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
					    0);

	cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);

	if (dc_flush_wa)
		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);

	intel_ring_advance(rq, cs);

	return 0;
}

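/*
 * PIPE_CONTROL only exists on the render command streamer; the other engines
 * flush and invalidate through MI_FLUSH_DW, with the post-sync write likewise
 * directed at the per-context PPHWSP scratch slot.
 */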
int gen8_emit_flush_xcs(struct i915_request *rq, u32 mode)
{
	u32 cmd, *cs;

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	cmd = MI_FLUSH_DW + 1;

	/*
	 * We always require a command barrier so that subsequent
	 * commands, such as breadcrumb interrupts, are strictly ordered
	 * wrt the contents of the write cache being flushed to memory
	 * (and thus being coherent from the CPU).
	 */
	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;

	if (mode & EMIT_INVALIDATE) {
		cmd |= MI_INVALIDATE_TLB;
		if (rq->engine->class == VIDEO_DECODE_CLASS)
			cmd |= MI_INVALIDATE_BSD;
	}

	*cs++ = cmd;
	*cs++ = LRC_PPHWSP_SCRATCH_ADDR;
	*cs++ = 0; /* upper addr */
	*cs++ = 0; /* value */
	intel_ring_advance(rq, cs);

	return 0;
}

int gen11_emit_flush_rcs(struct i915_request *rq, u32 mode)
{
	if (mode & EMIT_FLUSH) {
		u32 *cs;
		u32 flags = 0;

		flags |= PIPE_CONTROL_CS_STALL;

		flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
		flags |= PIPE_CONTROL_FLUSH_ENABLE;
		flags |= PIPE_CONTROL_QW_WRITE;
		flags |= PIPE_CONTROL_STORE_DATA_INDEX;

		cs = intel_ring_begin(rq, 6);
		if (IS_ERR(cs))
			return PTR_ERR(cs);

		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
		intel_ring_advance(rq, cs);
	}

	if (mode & EMIT_INVALIDATE) {
		u32 *cs;
		u32 flags = 0;

		flags |= PIPE_CONTROL_CS_STALL;

		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_TLB_INVALIDATE;
		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_QW_WRITE;
		flags |= PIPE_CONTROL_STORE_DATA_INDEX;

		cs = intel_ring_begin(rq, 6);
		if (IS_ERR(cs))
			return PTR_ERR(cs);

		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
		intel_ring_advance(rq, cs);
	}

	return 0;
}

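/*
 * preparser_disable(state) returns the MI_ARB_CHECK variant that toggles the
 * gen12 command-streamer pre-parser: with state == true the CS stops
 * pre-fetching past this instruction until a matching preparser_disable(false)
 * is executed.
 */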
static u32 preparser_disable(bool state)
{
	return MI_ARB_CHECK | 1 << 8 | state;
}

static i915_reg_t aux_inv_reg(const struct intel_engine_cs *engine)
{
	static const i915_reg_t vd[] = {
		GEN12_VD0_AUX_NV,
		GEN12_VD1_AUX_NV,
		GEN12_VD2_AUX_NV,
		GEN12_VD3_AUX_NV,
	};

	static const i915_reg_t ve[] = {
		GEN12_VE0_AUX_NV,
		GEN12_VE1_AUX_NV,
	};

	if (engine->class == VIDEO_DECODE_CLASS)
		return vd[engine->instance];

	if (engine->class == VIDEO_ENHANCEMENT_CLASS)
		return ve[engine->instance];

	GEM_BUG_ON("unknown aux_inv reg\n");
	return INVALID_MMIO_REG;
}

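/*
 * Writing AUX_INV into the engine's *_AUX_NV register with an LRI forces the
 * hardware to invalidate its cached AUX (auxiliary/compression metadata)
 * table entries; the hsdes: 1809175790 workaround referenced below wants this
 * alongside every TLB invalidation on gen12.
 */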
static u32 *gen12_emit_aux_table_inv(const i915_reg_t inv_reg, u32 *cs)
{
	*cs++ = MI_LOAD_REGISTER_IMM(1);
	*cs++ = i915_mmio_reg_offset(inv_reg);
	*cs++ = AUX_INV;
	*cs++ = MI_NOOP;

	return cs;
}

int gen12_emit_flush_rcs(struct i915_request *rq, u32 mode)
{
	if (mode & EMIT_FLUSH) {
		u32 flags = 0;
		u32 *cs;

		flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
		flags |= PIPE_CONTROL_FLUSH_L3;
		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
		/* Wa_1409600907:tgl */
		flags |= PIPE_CONTROL_DEPTH_STALL;
		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
		flags |= PIPE_CONTROL_FLUSH_ENABLE;

		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
		flags |= PIPE_CONTROL_QW_WRITE;

		flags |= PIPE_CONTROL_CS_STALL;

		cs = intel_ring_begin(rq, 6);
		if (IS_ERR(cs))
			return PTR_ERR(cs);

		cs = gen12_emit_pipe_control(cs,
					     PIPE_CONTROL0_HDC_PIPELINE_FLUSH,
					     flags, LRC_PPHWSP_SCRATCH_ADDR);
		intel_ring_advance(rq, cs);
	}

	if (mode & EMIT_INVALIDATE) {
		u32 flags = 0;
		u32 *cs;

		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_TLB_INVALIDATE;
		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;

		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
		flags |= PIPE_CONTROL_QW_WRITE;

		flags |= PIPE_CONTROL_CS_STALL;

		cs = intel_ring_begin(rq, 8 + 4);
		if (IS_ERR(cs))
			return PTR_ERR(cs);

		/*
		 * Prevent the pre-parser from skipping past the TLB
		 * invalidate and loading a stale page for the batch
		 * buffer / request payload.
		 */
		*cs++ = preparser_disable(true);

		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);

		/* hsdes: 1809175790 */
		cs = gen12_emit_aux_table_inv(GEN12_GFX_CCS_AUX_NV, cs);

		*cs++ = preparser_disable(false);
		intel_ring_advance(rq, cs);
	}

	return 0;
}

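/*
 * Flush for the gen12 non-render engines: the MI_FLUSH_DW sequence from
 * gen8_emit_flush_xcs(), bracketed by a pre-parser disable when invalidating,
 * plus an AUX table invalidation for each video engine covered by the
 * request's engine mask (hsdes: 1809175790).
 */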
int gen12_emit_flush_xcs(struct i915_request *rq, u32 mode)
{
	intel_engine_mask_t aux_inv = 0;
	u32 cmd, *cs;

	cmd = 4;
	if (mode & EMIT_INVALIDATE)
		cmd += 2;
	if (mode & EMIT_INVALIDATE)
		aux_inv = rq->engine->mask & ~BIT(BCS0);
	if (aux_inv)
		cmd += 2 * hweight8(aux_inv) + 2;

	cs = intel_ring_begin(rq, cmd);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	if (mode & EMIT_INVALIDATE)
		*cs++ = preparser_disable(true);

	cmd = MI_FLUSH_DW + 1;

	/*
	 * We always require a command barrier so that subsequent
	 * commands, such as breadcrumb interrupts, are strictly ordered
	 * wrt the contents of the write cache being flushed to memory
	 * (and thus being coherent from the CPU).
	 */
	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;

	if (mode & EMIT_INVALIDATE) {
		cmd |= MI_INVALIDATE_TLB;
		if (rq->engine->class == VIDEO_DECODE_CLASS)
			cmd |= MI_INVALIDATE_BSD;
	}

	*cs++ = cmd;
	*cs++ = LRC_PPHWSP_SCRATCH_ADDR;
	*cs++ = 0; /* upper addr */
	*cs++ = 0; /* value */

	if (aux_inv) { /* hsdes: 1809175790 */
		struct intel_engine_cs *engine;
		unsigned int tmp;

		*cs++ = MI_LOAD_REGISTER_IMM(hweight8(aux_inv));
		for_each_engine_masked(engine, rq->engine->gt,
				       aux_inv, tmp) {
			*cs++ = i915_mmio_reg_offset(aux_inv_reg(engine));
			*cs++ = AUX_INV;
		}
		*cs++ = MI_NOOP;
	}

	if (mode & EMIT_INVALIDATE)
		*cs++ = preparser_disable(false);

	intel_ring_advance(rq, cs);

	return 0;
}

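/*
 * The preempt busy-wait semaphores emitted below spin on the
 * I915_GEM_HWS_PREEMPT dword in the engine's status page; preempt_address()
 * returns its GGTT address.
 */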
static u32 preempt_address(struct intel_engine_cs *engine)
{
	return (i915_ggtt_offset(engine->status_page.vma) +
		I915_GEM_HWS_PREEMPT_ADDR);
}

static u32 hwsp_offset(const struct i915_request *rq)
{
	const struct intel_timeline *tl;

	/* Before the request is executed, the timeline is fixed */
	tl = rcu_dereference_protected(rq->timeline,
				       !i915_request_signaled(rq));

	/* See the comment in i915_request_active_seqno(). */
	return page_mask_bits(tl->hwsp_offset) + offset_in_page(rq->hwsp_seqno);
}

int gen8_emit_init_breadcrumb(struct i915_request *rq)
{
	u32 *cs;

	GEM_BUG_ON(i915_request_has_initial_breadcrumb(rq));
	if (!i915_request_timeline(rq)->has_initial_breadcrumb)
		return 0;

	cs = intel_ring_begin(rq, 6);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
	*cs++ = hwsp_offset(rq);
	*cs++ = 0;
	*cs++ = rq->fence.seqno - 1;

	/*
	 * Check if we have been preempted before we even get started.
	 *
	 * After this point i915_request_started() reports true, even if
	 * we get preempted and so are no longer running.
	 *
	 * i915_request_started() is used during preemption processing
	 * to decide if the request is currently inside the user payload
	 * or spinning on a kernel semaphore (or earlier). For no-preemption
	 * requests, we do allow preemption on the semaphore before the user
	 * payload, but do not allow preemption once the request is started.
	 *
	 * i915_request_started() is similarly used during GPU hangs to
	 * determine if the user's payload was guilty, and if so, the
	 * request is banned. Before the request is started, it is assumed
	 * to be unharmed and an innocent victim of another's hang.
	 */
	*cs++ = MI_NOOP;
	*cs++ = MI_ARB_CHECK;

	intel_ring_advance(rq, cs);

	/* Record the updated position of the request's payload */
	rq->infix = intel_ring_offset(rq, cs);

	__set_bit(I915_FENCE_FLAG_INITIAL_BREADCRUMB, &rq->fence.flags);

	return 0;
}

int gen8_emit_bb_start_noarb(struct i915_request *rq,
			     u64 offset, u32 len,
			     const unsigned int flags)
{
	u32 *cs;

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	/*
	 * WaDisableCtxRestoreArbitration:bdw,chv
	 *
	 * We don't need to perform MI_ARB_ENABLE as often as we do (in
	 * particular all the gen that do not need the w/a at all!), if we
	 * took care to make sure that on every switch into this context
	 * (both ordinary and for preemption) that arbitration was enabled
	 * we would be fine. However, for gen8 there is another w/a that
	 * requires us to not preempt inside GPGPU execution, so we keep
	 * arbitration disabled for gen8 batches. Arbitration will be
	 * re-enabled before we close the request
	 * (engine->emit_fini_breadcrumb).
	 */
	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;

	/* FIXME(BDW+): Address space and security selectors. */
	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
	*cs++ = lower_32_bits(offset);
	*cs++ = upper_32_bits(offset);

	intel_ring_advance(rq, cs);

	return 0;
}

int gen8_emit_bb_start(struct i915_request *rq,
		       u64 offset, u32 len,
		       const unsigned int flags)
{
	u32 *cs;

	if (unlikely(i915_request_has_nopreempt(rq)))
		return gen8_emit_bb_start_noarb(rq, offset, len, flags);

	cs = intel_ring_begin(rq, 6);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;

	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
	*cs++ = lower_32_bits(offset);
	*cs++ = upper_32_bits(offset);

	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
	*cs++ = MI_NOOP;

	intel_ring_advance(rq, cs);

	return 0;
}

static void assert_request_valid(struct i915_request *rq)
{
	struct intel_ring *ring __maybe_unused = rq->ring;

	/* Can we unwind this request without appearing to go forwards? */
	GEM_BUG_ON(intel_ring_direction(ring, rq->wa_tail, rq->head) <= 0);
}

/*
 * Reserve space for 2 NOOPs at the end of each request to be
 * used as a workaround for not being allowed to do lite
 * restore with HEAD==TAIL (WaIdleLiteRestore).
 */
static u32 *gen8_emit_wa_tail(struct i915_request *rq, u32 *cs)
{
	/* Ensure there's always at least one preemption point per-request. */
	*cs++ = MI_ARB_CHECK;
	*cs++ = MI_NOOP;
	rq->wa_tail = intel_ring_offset(rq, cs);

	/* Check that entire request is less than half the ring */
	assert_request_valid(rq);

	return cs;
}

static u32 *emit_preempt_busywait(struct i915_request *rq, u32 *cs)
{
	*cs++ = MI_ARB_CHECK; /* trigger IDLE->ACTIVE first */
	*cs++ = MI_SEMAPHORE_WAIT |
		MI_SEMAPHORE_GLOBAL_GTT |
		MI_SEMAPHORE_POLL |
		MI_SEMAPHORE_SAD_EQ_SDD;
	*cs++ = 0;
	*cs++ = preempt_address(rq->engine);
	*cs++ = 0;
	*cs++ = MI_NOOP;

	return cs;
}

static __always_inline u32*
gen8_emit_fini_breadcrumb_tail(struct i915_request *rq, u32 *cs)
{
	*cs++ = MI_USER_INTERRUPT;

	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
	if (intel_engine_has_semaphores(rq->engine))
		cs = emit_preempt_busywait(rq, cs);

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);

	return gen8_emit_wa_tail(rq, cs);
}

static u32 *emit_xcs_breadcrumb(struct i915_request *rq, u32 *cs)
{
	return gen8_emit_ggtt_write(cs, rq->fence.seqno, hwsp_offset(rq), 0);
}

u32 *gen8_emit_fini_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
{
	return gen8_emit_fini_breadcrumb_tail(rq, emit_xcs_breadcrumb(rq, cs));
}

u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
{
	cs = gen8_emit_pipe_control(cs,
				    PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
				    PIPE_CONTROL_DEPTH_CACHE_FLUSH |
				    PIPE_CONTROL_DC_FLUSH_ENABLE,
				    0);

	/* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */
	cs = gen8_emit_ggtt_write_rcs(cs,
				      rq->fence.seqno,
				      hwsp_offset(rq),
				      PIPE_CONTROL_FLUSH_ENABLE |
				      PIPE_CONTROL_CS_STALL);

	return gen8_emit_fini_breadcrumb_tail(rq, cs);
}

u32 *gen11_emit_fini_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
{
	cs = gen8_emit_ggtt_write_rcs(cs,
				      rq->fence.seqno,
				      hwsp_offset(rq),
				      PIPE_CONTROL_CS_STALL |
				      PIPE_CONTROL_TILE_CACHE_FLUSH |
				      PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
				      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
				      PIPE_CONTROL_DC_FLUSH_ENABLE |
				      PIPE_CONTROL_FLUSH_ENABLE);

	return gen8_emit_fini_breadcrumb_tail(rq, cs);
}

/*
 * Note that the CS instruction pre-parser will not stall on the breadcrumb
 * flush and will continue pre-fetching the instructions after it before the
 * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at
 * BB_START/END instructions, so, even though we might pre-fetch the pre-amble
 * of the next request before the memory has been flushed, we're guaranteed that
 * we won't access the batch itself too early.
 * However, on gen12+ the parser can pre-fetch across the BB_START/END commands,
 * so, if the current request is modifying an instruction in the next request on
 * the same intel_context, we might pre-fetch and then execute the pre-update
 * instruction. To avoid this, the users of self-modifying code should either
 * disable the parser around the code emitting the memory writes, via a new flag
 * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For
 * the in-kernel use-cases we've opted to use a separate context, see
 * reloc_gpu() as an example.
 * All the above applies only to the instructions themselves. Non-inline data
 * used by the instructions is not pre-fetched.
 */

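/*
 * Illustrative sketch of the first option above (not code from this file): a
 * self-modifying-code user on gen12 would bracket its memory writes with the
 * pre-parser toggle so the modified instructions cannot be fetched early, e.g.
 *
 *	*cs++ = preparser_disable(true);
 *	... emit the stores that rewrite the next request's instructions ...
 *	*cs++ = preparser_disable(false);
 *
 * gen12_emit_flush_rcs() and gen12_emit_flush_xcs() use the same bracketing
 * around their TLB invalidations.
 */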
static u32 *gen12_emit_preempt_busywait(struct i915_request *rq, u32 *cs)
{
	*cs++ = MI_ARB_CHECK; /* trigger IDLE->ACTIVE first */
	*cs++ = MI_SEMAPHORE_WAIT_TOKEN |
		MI_SEMAPHORE_GLOBAL_GTT |
		MI_SEMAPHORE_POLL |
		MI_SEMAPHORE_SAD_EQ_SDD;
	*cs++ = 0;
	*cs++ = preempt_address(rq->engine);
	*cs++ = 0;
	*cs++ = 0;

	return cs;
}

static __always_inline u32*
gen12_emit_fini_breadcrumb_tail(struct i915_request *rq, u32 *cs)
{
	*cs++ = MI_USER_INTERRUPT;

	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
	if (intel_engine_has_semaphores(rq->engine))
		cs = gen12_emit_preempt_busywait(rq, cs);

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);

	return gen8_emit_wa_tail(rq, cs);
}

u32 *gen12_emit_fini_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
{
	/* XXX Stalling flush before seqno write; post-sync not */
	cs = emit_xcs_breadcrumb(rq, __gen8_emit_flush_dw(cs, 0, 0, 0));
	return gen12_emit_fini_breadcrumb_tail(rq, cs);
}

u32 *gen12_emit_fini_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
{
	cs = gen12_emit_ggtt_write_rcs(cs,
				       rq->fence.seqno,
				       hwsp_offset(rq),
				       PIPE_CONTROL0_HDC_PIPELINE_FLUSH,
				       PIPE_CONTROL_CS_STALL |
				       PIPE_CONTROL_TILE_CACHE_FLUSH |
				       PIPE_CONTROL_FLUSH_L3 |
				       PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
				       PIPE_CONTROL_DEPTH_CACHE_FLUSH |
				       /* Wa_1409600907:tgl */
				       PIPE_CONTROL_DEPTH_STALL |
				       PIPE_CONTROL_DC_FLUSH_ENABLE |
				       PIPE_CONTROL_FLUSH_ENABLE);

	return gen12_emit_fini_breadcrumb_tail(rq, cs);
}