drivers/gpu/drm/i915/gt/gen8_engine_cs.c

   1 // SPDX-License-Identifier: MIT
   2 /*
   3  * Copyright © 2014 Intel Corporation
   4  */
   5
   6 #include "gen8_engine_cs.h"
   7 #include "i915_drv.h"
   8 #include "intel_lrc.h"
   9 #include "intel_gpu_commands.h"
  10 #include "intel_ring.h"
  11
  12 int gen8_emit_flush_rcs(struct i915_request *rq, u32 mode)
  13 {
  14         bool vf_flush_wa = false, dc_flush_wa = false;
  15         u32 *cs, flags = 0;
  16         int len;
  17
  18         flags |= PIPE_CONTROL_CS_STALL;
  19
  20         if (mode & EMIT_FLUSH) {
  21                 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
  22                 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
  23                 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
  24                 flags |= PIPE_CONTROL_FLUSH_ENABLE;
  25         }
  26
  27         if (mode & EMIT_INVALIDATE) {
  28                 flags |= PIPE_CONTROL_TLB_INVALIDATE;
  29                 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
  30                 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
  31                 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
  32                 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
  33                 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
  34                 flags |= PIPE_CONTROL_QW_WRITE;
  35                 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
  36
  37                 /*
  38                  * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
  39                  * pipe control.
  40                  */
  41                 if (IS_GEN(rq->engine->i915, 9))
  42                         vf_flush_wa = true;
  43
  44                 /* WaForGAMHang:kbl */
  45                 if (IS_KBL_GT_REVID(rq->engine->i915, 0, KBL_REVID_B0))
  46                         dc_flush_wa = true;
  47         }
  48
  49         len = 6;
  50
  51         if (vf_flush_wa)
  52                 len += 6;
  53
  54         if (dc_flush_wa)
  55                 len += 12;
  56
  57         cs = intel_ring_begin(rq, len);
  58         if (IS_ERR(cs))
  59                 return PTR_ERR(cs);
  60
  61         if (vf_flush_wa)
  62                 cs = gen8_emit_pipe_control(cs, 0, 0);
  63
  64         if (dc_flush_wa)
  65                 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
  66                                             0);
  67
  68         cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
  69
  70         if (dc_flush_wa)
  71                 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);
  72
  73         intel_ring_advance(rq, cs);
  74
  75         return 0;
  76 }
  77
  78 int gen8_emit_flush_xcs(struct i915_request *rq, u32 mode)
  79 {
  80         u32 cmd, *cs;
  81
  82         cs = intel_ring_begin(rq, 4);
  83         if (IS_ERR(cs))
  84                 return PTR_ERR(cs);
  85
  86         cmd = MI_FLUSH_DW + 1;
  87
  88         /*
  89          * We always require a command barrier so that subsequent
  90          * commands, such as breadcrumb interrupts, are strictly ordered
  91          * wrt the contents of the write cache being flushed to memory
  92          * (and thus being coherent from the CPU).
  93          */
  94         cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
  95
  96         if (mode & EMIT_INVALIDATE) {
  97                 cmd |= MI_INVALIDATE_TLB;
  98                 if (rq->engine->class == VIDEO_DECODE_CLASS)
  99                         cmd |= MI_INVALIDATE_BSD;
 100         }
 101
 102         *cs++ = cmd;
 103         *cs++ = LRC_PPHWSP_SCRATCH_ADDR;
 104         *cs++ = 0; /* upper addr */
 105         *cs++ = 0; /* value */
 106         intel_ring_advance(rq, cs);
 107
 108         return 0;
 109 }
 110
 111 int gen11_emit_flush_rcs(struct i915_request *rq, u32 mode)
 112 {
 113         if (mode & EMIT_FLUSH) {
 114                 u32 *cs;
 115                 u32 flags = 0;
 116
 117                 flags |= PIPE_CONTROL_CS_STALL;
 118
 119                 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
 120                 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
 121                 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
 122                 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
 123                 flags |= PIPE_CONTROL_FLUSH_ENABLE;
 124                 flags |= PIPE_CONTROL_QW_WRITE;
 125                 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
 126
 127                 cs = intel_ring_begin(rq, 6);
 128                 if (IS_ERR(cs))
 129                         return PTR_ERR(cs);
 130
 131                 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
 132                 intel_ring_advance(rq, cs);
 133         }
 134
 135         if (mode & EMIT_INVALIDATE) {
 136                 u32 *cs;
 137                 u32 flags = 0;
 138
 139                 flags |= PIPE_CONTROL_CS_STALL;
 140
 141                 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
 142                 flags |= PIPE_CONTROL_TLB_INVALIDATE;
 143                 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
 144                 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
 145                 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
 146                 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
 147                 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
 148                 flags |= PIPE_CONTROL_QW_WRITE;
 149                 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
 150
 151                 cs = intel_ring_begin(rq, 6);
 152                 if (IS_ERR(cs))
 153                         return PTR_ERR(cs);
 154
 155                 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
 156                 intel_ring_advance(rq, cs);
 157         }
 158
 159         return 0;
 160 }
 161
 162 static u32 preparser_disable(bool state)
 163 {
 164         return MI_ARB_CHECK | 1 << 8 | state;
 165 }
 166
 167 static i915_reg_t aux_inv_reg(const struct intel_engine_cs *engine)
 168 {
 169         static const i915_reg_t vd[] = {
 170                 GEN12_VD0_AUX_NV,
 171                 GEN12_VD1_AUX_NV,
 172                 GEN12_VD2_AUX_NV,
 173                 GEN12_VD3_AUX_NV,
 174         };
 175
 176         static const i915_reg_t ve[] = {
 177                 GEN12_VE0_AUX_NV,
 178                 GEN12_VE1_AUX_NV,
 179         };
 180
 181         if (engine->class == VIDEO_DECODE_CLASS)
 182                 return vd[engine->instance];
 183
 184         if (engine->class == VIDEO_ENHANCEMENT_CLASS)
 185                 return ve[engine->instance];
 186
 187         GEM_BUG_ON("unknown aux_inv reg\n");
 188         return INVALID_MMIO_REG;
 189 }
 190
 191 static u32 *gen12_emit_aux_table_inv(const i915_reg_t inv_reg, u32 *cs)
 192 {
 193         *cs++ = MI_LOAD_REGISTER_IMM(1);
 194         *cs++ = i915_mmio_reg_offset(inv_reg);
 195         *cs++ = AUX_INV;
 196         *cs++ = MI_NOOP;
 197
 198         return cs;
 199 }
 200
 201 int gen12_emit_flush_rcs(struct i915_request *rq, u32 mode)
 202 {
 203         if (mode & EMIT_FLUSH) {
 204                 u32 flags = 0;
 205                 u32 *cs;
 206
 207                 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
 208                 flags |= PIPE_CONTROL_FLUSH_L3;
 209                 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
 210                 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
 211                 /* Wa_1409600907:tgl */
 212                 flags |= PIPE_CONTROL_DEPTH_STALL;
 213                 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
 214                 flags |= PIPE_CONTROL_FLUSH_ENABLE;
 215
 216                 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
 217                 flags |= PIPE_CONTROL_QW_WRITE;
 218
 219                 flags |= PIPE_CONTROL_CS_STALL;
 220
 221                 cs = intel_ring_begin(rq, 6);
 222                 if (IS_ERR(cs))
 223                         return PTR_ERR(cs);
 224
 225                 cs = gen12_emit_pipe_control(cs,
 226                                              PIPE_CONTROL0_HDC_PIPELINE_FLUSH,
 227                                              flags, LRC_PPHWSP_SCRATCH_ADDR);
 228                 intel_ring_advance(rq, cs);
 229         }
 230
 231         if (mode & EMIT_INVALIDATE) {
 232                 u32 flags = 0;
 233                 u32 *cs;
 234
 235                 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
 236                 flags |= PIPE_CONTROL_TLB_INVALIDATE;
 237                 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
 238                 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
 239                 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
 240                 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
 241                 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
 242
 243                 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
 244                 flags |= PIPE_CONTROL_QW_WRITE;
 245
 246                 flags |= PIPE_CONTROL_CS_STALL;
 247
 248                 cs = intel_ring_begin(rq, 8 + 4);
 249                 if (IS_ERR(cs))
 250                         return PTR_ERR(cs);
 251
 252                 /*
 253                  * Prevent the pre-parser from skipping past the TLB
 254                  * invalidate and loading a stale page for the batch
 255                  * buffer / request payload.
 256                  */
 257                 *cs++ = preparser_disable(true);
 258
 259                 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
 260
 261                 /* hsdes: 1809175790 */
 262                 cs = gen12_emit_aux_table_inv(GEN12_GFX_CCS_AUX_NV, cs);
 263
 264                 *cs++ = preparser_disable(false);
 265                 intel_ring_advance(rq, cs);
 266         }
 267
 268         return 0;
 269 }
 270
 271 int gen12_emit_flush_xcs(struct i915_request *rq, u32 mode)
 272 {
 273         intel_engine_mask_t aux_inv = 0;
 274         u32 cmd, *cs;
 275
 276         cmd = 4;
 277         if (mode & EMIT_INVALIDATE)
 278                 cmd += 2;
 279         if (mode & EMIT_INVALIDATE)
 280                 aux_inv = rq->engine->mask & ~BIT(BCS0);
 281         if (aux_inv)
 282                 cmd += 2 * hweight8(aux_inv) + 2;
 283
 284         cs = intel_ring_begin(rq, cmd);
 285         if (IS_ERR(cs))
 286                 return PTR_ERR(cs);
 287
 288         if (mode & EMIT_INVALIDATE)
 289                 *cs++ = preparser_disable(true);
 290
 291         cmd = MI_FLUSH_DW + 1;
 292
 293         /*
 294          * We always require a command barrier so that subsequent
 295          * commands, such as breadcrumb interrupts, are strictly ordered
 296          * wrt the contents of the write cache being flushed to memory
 297          * (and thus being coherent from the CPU).
 298          */
 299         cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
 300
 301         if (mode & EMIT_INVALIDATE) {
 302                 cmd |= MI_INVALIDATE_TLB;
 303                 if (rq->engine->class == VIDEO_DECODE_CLASS)
 304                         cmd |= MI_INVALIDATE_BSD;
 305         }
 306
 307         *cs++ = cmd;
 308         *cs++ = LRC_PPHWSP_SCRATCH_ADDR;
 309         *cs++ = 0; /* upper addr */
 310         *cs++ = 0; /* value */
 311
 312         if (aux_inv) { /* hsdes: 1809175790 */
 313                 struct intel_engine_cs *engine;
 314                 unsigned int tmp;
 315
 316                 *cs++ = MI_LOAD_REGISTER_IMM(hweight8(aux_inv));
 317                 for_each_engine_masked(engine, rq->engine->gt,
 318                                        aux_inv, tmp) {
 319                         *cs++ = i915_mmio_reg_offset(aux_inv_reg(engine));
 320                         *cs++ = AUX_INV;
 321                 }
 322                 *cs++ = MI_NOOP;
 323         }
 324
 325         if (mode & EMIT_INVALIDATE)
 326                 *cs++ = preparser_disable(false);
 327
 328         intel_ring_advance(rq, cs);
 329
 330         return 0;
 331 }
 332
 333 static u32 preempt_address(struct intel_engine_cs *engine)
 334 {
 335         return (i915_ggtt_offset(engine->status_page.vma) +
 336                 I915_GEM_HWS_PREEMPT_ADDR);
 337 }
 338
 339 static u32 hwsp_offset(const struct i915_request *rq)
 340 {
 341         const struct intel_timeline_cacheline *cl;
 342
 343         /* Before the request is executed, the timeline/cachline is fixed */
 344
 345         cl = rcu_dereference_protected(rq->hwsp_cacheline, 1);
 346         if (cl)
 347                 return cl->ggtt_offset;
 348
 349         return rcu_dereference_protected(rq->timeline, 1)->hwsp_offset;
 350 }
 351
 352 int gen8_emit_init_breadcrumb(struct i915_request *rq)
 353 {
 354         u32 *cs;
 355
 356         GEM_BUG_ON(i915_request_has_initial_breadcrumb(rq));
 357         if (!i915_request_timeline(rq)->has_initial_breadcrumb)
 358                 return 0;
 359
 360         cs = intel_ring_begin(rq, 6);
 361         if (IS_ERR(cs))
 362                 return PTR_ERR(cs);
 363
 364         *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
 365         *cs++ = hwsp_offset(rq);
 366         *cs++ = 0;
 367         *cs++ = rq->fence.seqno - 1;
 368
 369         /*
 370          * Check if we have been preempted before we even get started.
 371          *
 372          * After this point i915_request_started() reports true, even if
 373          * we get preempted and so are no longer running.
 374          *
 375          * i915_request_started() is used during preemption processing
 376          * to decide if the request is currently inside the user payload
 377          * or spinning on a kernel semaphore (or earlier). For no-preemption
 378          * requests, we do allow preemption on the semaphore before the user
 379          * payload, but do not allow preemption once the request is started.
 380          *
 381          * i915_request_started() is similarly used during GPU hangs to
 382          * determine if the user's payload was guilty, and if so, the
 383          * request is banned. Before the request is started, it is assumed
 384          * to be unharmed and an innocent victim of another's hang.
 385          */
 386         *cs++ = MI_NOOP;
 387         *cs++ = MI_ARB_CHECK;
 388
 389         intel_ring_advance(rq, cs);
 390
 391         /* Record the updated position of the request's payload */
 392         rq->infix = intel_ring_offset(rq, cs);
 393
 394         __set_bit(I915_FENCE_FLAG_INITIAL_BREADCRUMB, &rq->fence.flags);
 395
 396         return 0;
 397 }
 398
 399 int gen8_emit_bb_start_noarb(struct i915_request *rq,
 400                              u64 offset, u32 len,
 401                              const unsigned int flags)
 402 {
 403         u32 *cs;
 404
 405         cs = intel_ring_begin(rq, 4);
 406         if (IS_ERR(cs))
 407                 return PTR_ERR(cs);
 408
 409         /*
 410          * WaDisableCtxRestoreArbitration:bdw,chv
 411          *
 412          * We don't need to perform MI_ARB_ENABLE as often as we do (in
 413          * particular all the gen that do not need the w/a at all!), if we
 414          * took care to make sure that on every switch into this context
 415          * (both ordinary and for preemption) that arbitrartion was enabled
 416          * we would be fine.  However, for gen8 there is another w/a that
 417          * requires us to not preempt inside GPGPU execution, so we keep
 418          * arbitration disabled for gen8 batches. Arbitration will be
 419          * re-enabled before we close the request
 420          * (engine->emit_fini_breadcrumb).
 421          */
 422         *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
 423
 424         /* FIXME(BDW+): Address space and security selectors. */
 425         *cs++ = MI_BATCH_BUFFER_START_GEN8 |
 426                 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
 427         *cs++ = lower_32_bits(offset);
 428         *cs++ = upper_32_bits(offset);
 429
 430         intel_ring_advance(rq, cs);
 431
 432         return 0;
 433 }
 434
 435 int gen8_emit_bb_start(struct i915_request *rq,
 436                        u64 offset, u32 len,
 437                        const unsigned int flags)
 438 {
 439         u32 *cs;
 440
 441         if (unlikely(i915_request_has_nopreempt(rq)))
 442                 return gen8_emit_bb_start_noarb(rq, offset, len, flags);
 443
 444         cs = intel_ring_begin(rq, 6);
 445         if (IS_ERR(cs))
 446                 return PTR_ERR(cs);
 447
 448         *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
 449
 450         *cs++ = MI_BATCH_BUFFER_START_GEN8 |
 451                 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
 452         *cs++ = lower_32_bits(offset);
 453         *cs++ = upper_32_bits(offset);
 454
 455         *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
 456         *cs++ = MI_NOOP;
 457
 458         intel_ring_advance(rq, cs);
 459
 460         return 0;
 461 }
 462
 463 static void assert_request_valid(struct i915_request *rq)
 464 {
 465         struct intel_ring *ring __maybe_unused = rq->ring;
 466
 467         /* Can we unwind this request without appearing to go forwards? */
 468         GEM_BUG_ON(intel_ring_direction(ring, rq->wa_tail, rq->head) <= 0);
 469 }
 470
 471 /*
 472  * Reserve space for 2 NOOPs at the end of each request to be
 473  * used as a workaround for not being allowed to do lite
 474  * restore with HEAD==TAIL (WaIdleLiteRestore).
 475  */
 476 static u32 *gen8_emit_wa_tail(struct i915_request *rq, u32 *cs)
 477 {
 478         /* Ensure there's always at least one preemption point per-request. */
 479         *cs++ = MI_ARB_CHECK;
 480         *cs++ = MI_NOOP;
 481         rq->wa_tail = intel_ring_offset(rq, cs);
 482
 483         /* Check that entire request is less than half the ring */
 484         assert_request_valid(rq);
 485
 486         return cs;
 487 }
 488
 489 static u32 *emit_preempt_busywait(struct i915_request *rq, u32 *cs)
 490 {
 491         *cs++ = MI_ARB_CHECK; /* trigger IDLE->ACTIVE first */
 492         *cs++ = MI_SEMAPHORE_WAIT |
 493                 MI_SEMAPHORE_GLOBAL_GTT |
 494                 MI_SEMAPHORE_POLL |
 495                 MI_SEMAPHORE_SAD_EQ_SDD;
 496         *cs++ = 0;
 497         *cs++ = preempt_address(rq->engine);
 498         *cs++ = 0;
 499         *cs++ = MI_NOOP;
 500
 501         return cs;
 502 }
 503
 504 static __always_inline u32*
 505 gen8_emit_fini_breadcrumb_tail(struct i915_request *rq, u32 *cs)
 506 {
 507         *cs++ = MI_USER_INTERRUPT;
 508
 509         *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
 510         if (intel_engine_has_semaphores(rq->engine))
 511                 cs = emit_preempt_busywait(rq, cs);
 512
 513         rq->tail = intel_ring_offset(rq, cs);
 514         assert_ring_tail_valid(rq->ring, rq->tail);
 515
 516         return gen8_emit_wa_tail(rq, cs);
 517 }
 518
 519 static u32 *emit_xcs_breadcrumb(struct i915_request *rq, u32 *cs)
 520 {
 521         return gen8_emit_ggtt_write(cs, rq->fence.seqno, hwsp_offset(rq), 0);
 522 }
 523
 524 u32 *gen8_emit_fini_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
 525 {
 526         return gen8_emit_fini_breadcrumb_tail(rq, emit_xcs_breadcrumb(rq, cs));
 527 }
 528
 529 u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
 530 {
 531         cs = gen8_emit_pipe_control(cs,
 532                                     PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
 533                                     PIPE_CONTROL_DEPTH_CACHE_FLUSH |
 534                                     PIPE_CONTROL_DC_FLUSH_ENABLE,
 535                                     0);
 536
 537         /* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */
 538         cs = gen8_emit_ggtt_write_rcs(cs,
 539                                       rq->fence.seqno,
 540                                       hwsp_offset(rq),
 541                                       PIPE_CONTROL_FLUSH_ENABLE |
 542                                       PIPE_CONTROL_CS_STALL);
 543
 544         return gen8_emit_fini_breadcrumb_tail(rq, cs);
 545 }
 546
 547 u32 *gen11_emit_fini_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
 548 {
 549         cs = gen8_emit_ggtt_write_rcs(cs,
 550                                       rq->fence.seqno,
 551                                       hwsp_offset(rq),
 552                                       PIPE_CONTROL_CS_STALL |
 553                                       PIPE_CONTROL_TILE_CACHE_FLUSH |
 554                                       PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
 555                                       PIPE_CONTROL_DEPTH_CACHE_FLUSH |
 556                                       PIPE_CONTROL_DC_FLUSH_ENABLE |
 557                                       PIPE_CONTROL_FLUSH_ENABLE);
 558
 559         return gen8_emit_fini_breadcrumb_tail(rq, cs);
 560 }
 561
 562 /*
 563  * Note that the CS instruction pre-parser will not stall on the breadcrumb
 564  * flush and will continue pre-fetching the instructions after it before the
 565  * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at
 566  * BB_START/END instructions, so, even though we might pre-fetch the pre-amble
 567  * of the next request before the memory has been flushed, we're guaranteed that
 568  * we won't access the batch itself too early.
 569  * However, on gen12+ the parser can pre-fetch across the BB_START/END commands,
 570  * so, if the current request is modifying an instruction in the next request on
 571  * the same intel_context, we might pre-fetch and then execute the pre-update
 572  * instruction. To avoid this, the users of self-modifying code should either
 573  * disable the parser around the code emitting the memory writes, via a new flag
 574  * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For
 575  * the in-kernel use-cases we've opted to use a separate context, see
 576  * reloc_gpu() as an example.
 577  * All the above applies only to the instructions themselves. Non-inline data
 578  * used by the instructions is not pre-fetched.
 579  */
 580
 581 static u32 *gen12_emit_preempt_busywait(struct i915_request *rq, u32 *cs)
 582 {
 583         *cs++ = MI_ARB_CHECK; /* trigger IDLE->ACTIVE first */
 584         *cs++ = MI_SEMAPHORE_WAIT_TOKEN |
 585                 MI_SEMAPHORE_GLOBAL_GTT |
 586                 MI_SEMAPHORE_POLL |
 587                 MI_SEMAPHORE_SAD_EQ_SDD;
 588         *cs++ = 0;
 589         *cs++ = preempt_address(rq->engine);
 590         *cs++ = 0;
 591         *cs++ = 0;
 592
 593         return cs;
 594 }
 595
 596 static __always_inline u32*
 597 gen12_emit_fini_breadcrumb_tail(struct i915_request *rq, u32 *cs)
 598 {
 599         *cs++ = MI_USER_INTERRUPT;
 600
 601         *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
 602         if (intel_engine_has_semaphores(rq->engine))
 603                 cs = gen12_emit_preempt_busywait(rq, cs);
 604
 605         rq->tail = intel_ring_offset(rq, cs);
 606         assert_ring_tail_valid(rq->ring, rq->tail);
 607
 608         return gen8_emit_wa_tail(rq, cs);
 609 }
 610
 611 u32 *gen12_emit_fini_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
 612 {
 613         /* XXX Stalling flush before seqno write; post-sync not */
 614         cs = emit_xcs_breadcrumb(rq, __gen8_emit_flush_dw(cs, 0, 0, 0));
 615         return gen12_emit_fini_breadcrumb_tail(rq, cs);
 616 }
 617
 618 u32 *gen12_emit_fini_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
 619 {
 620         cs = gen12_emit_ggtt_write_rcs(cs,
 621                                        rq->fence.seqno,
 622                                        hwsp_offset(rq),
 623                                        PIPE_CONTROL0_HDC_PIPELINE_FLUSH,
 624                                        PIPE_CONTROL_CS_STALL |
 625                                        PIPE_CONTROL_TILE_CACHE_FLUSH |
 626                                        PIPE_CONTROL_FLUSH_L3 |
 627                                        PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
 628                                        PIPE_CONTROL_DEPTH_CACHE_FLUSH |
 629                                        /* Wa_1409600907:tgl */
 630                                        PIPE_CONTROL_DEPTH_STALL |
 631                                        PIPE_CONTROL_DC_FLUSH_ENABLE |
 632                                        PIPE_CONTROL_FLUSH_ENABLE);
 633
 634         return gen12_emit_fini_breadcrumb_tail(rq, cs);
 635 }