drivers/gpu/drm/i915/gt/gen6_engine_cs.c

   1 // SPDX-License-Identifier: MIT
   2 /*
   3  * Copyright © 2020 Intel Corporation
   4  */
   5
   6 #include "gen6_engine_cs.h"
   7 #include "intel_engine.h"
   8 #include "intel_gpu_commands.h"
   9 #include "intel_gt.h"
  10 #include "intel_gt_irq.h"
  11 #include "intel_gt_pm_irq.h"
  12 #include "intel_ring.h"
  13
  14 #define HWS_SCRATCH_ADDR        (I915_GEM_HWS_SCRATCH * sizeof(u32))
  15
  16 /*
  17  * Emits a PIPE_CONTROL with a non-zero post-sync operation, for
  18  * implementing two workarounds on gen6.  From section 1.4.7.1
  19  * "PIPE_CONTROL" of the Sandy Bridge PRM volume 2 part 1:
  20  *
  21  * [DevSNB-C+{W/A}] Before any depth stall flush (including those
  22  * produced by non-pipelined state commands), software needs to first
  23  * send a PIPE_CONTROL with no bits set except Post-Sync Operation !=
  24  * 0.
  25  *
  26  * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable
  27  * =1, a PIPE_CONTROL with any non-zero post-sync-op is required.
  28  *
  29  * And the workaround for these two requires this workaround first:
  30  *
  31  * [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent
  32  * BEFORE the pipe-control with a post-sync op and no write-cache
  33  * flushes.
  34  *
  35  * And this last workaround is tricky because of the requirements on
  36  * that bit.  From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM
  37  * volume 2 part 1:
  38  *
  39  *     "1 of the following must also be set:
  40  *      - Render Target Cache Flush Enable ([12] of DW1)
  41  *      - Depth Cache Flush Enable ([0] of DW1)
  42  *      - Stall at Pixel Scoreboard ([1] of DW1)
  43  *      - Depth Stall ([13] of DW1)
  44  *      - Post-Sync Operation ([13] of DW1)
  45  *      - Notify Enable ([8] of DW1)"
  46  *
  47  * The cache flushes require the workaround flush that triggered this
  48  * one, so we can't use it.  Depth stall would trigger the same.
  49  * Post-sync nonzero is what triggered this second workaround, so we
  50  * can't use that one either.  Notify enable is IRQs, which aren't
  51  * really our business.  That leaves only stall at scoreboard.
  52  */
  53 static int
  54 gen6_emit_post_sync_nonzero_flush(struct i915_request *rq)
  55 {
  56         u32 scratch_addr =
  57                 intel_gt_scratch_offset(rq->engine->gt,
  58                                         INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
  59         u32 *cs;
  60
  61         cs = intel_ring_begin(rq, 6);
  62         if (IS_ERR(cs))
  63                 return PTR_ERR(cs);
  64
  65         *cs++ = GFX_OP_PIPE_CONTROL(5);
  66         *cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
  67         *cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
  68         *cs++ = 0; /* low dword */
  69         *cs++ = 0; /* high dword */
  70         *cs++ = MI_NOOP;
  71         intel_ring_advance(rq, cs);
  72
  73         cs = intel_ring_begin(rq, 6);
  74         if (IS_ERR(cs))
  75                 return PTR_ERR(cs);
  76
  77         *cs++ = GFX_OP_PIPE_CONTROL(5);
  78         *cs++ = PIPE_CONTROL_QW_WRITE;
  79         *cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
  80         *cs++ = 0;
  81         *cs++ = 0;
  82         *cs++ = MI_NOOP;
  83         intel_ring_advance(rq, cs);
  84
  85         return 0;
  86 }
  87
  88 int gen6_emit_flush_rcs(struct i915_request *rq, u32 mode)
  89 {
  90         u32 scratch_addr =
  91                 intel_gt_scratch_offset(rq->engine->gt,
  92                                         INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
  93         u32 *cs, flags = 0;
  94         int ret;
  95
  96         /* Force SNB workarounds for PIPE_CONTROL flushes */
  97         ret = gen6_emit_post_sync_nonzero_flush(rq);
  98         if (ret)
  99                 return ret;
 100
 101         /*
 102          * Just flush everything.  Experiments have shown that reducing the
 103          * number of bits based on the write domains has little performance
 104          * impact. And when rearranging requests, the order of flushes is
 105          * unknown.
 106          */
 107         if (mode & EMIT_FLUSH) {
 108                 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
 109                 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
 110                 /*
 111                  * Ensure that any following seqno writes only happen
 112                  * when the render cache is indeed flushed.
 113                  */
 114                 flags |= PIPE_CONTROL_CS_STALL;
 115         }
 116         if (mode & EMIT_INVALIDATE) {
 117                 flags |= PIPE_CONTROL_TLB_INVALIDATE;
 118                 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
 119                 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
 120                 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
 121                 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
 122                 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
 123                 /*
 124                  * TLB invalidate requires a post-sync write.
 125                  */
 126                 flags |= PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_CS_STALL;
 127         }
 128
 129         cs = intel_ring_begin(rq, 4);
 130         if (IS_ERR(cs))
 131                 return PTR_ERR(cs);
 132
 133         *cs++ = GFX_OP_PIPE_CONTROL(4);
 134         *cs++ = flags;
 135         *cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
 136         *cs++ = 0;
 137         intel_ring_advance(rq, cs);
 138
 139         return 0;
 140 }
 141
 142 u32 *gen6_emit_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
 143 {
 144         /* First we do the gen6_emit_post_sync_nonzero_flush w/a */
 145         *cs++ = GFX_OP_PIPE_CONTROL(4);
 146         *cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
 147         *cs++ = 0;
 148         *cs++ = 0;
 149
 150         *cs++ = GFX_OP_PIPE_CONTROL(4);
 151         *cs++ = PIPE_CONTROL_QW_WRITE;
 152         *cs++ = intel_gt_scratch_offset(rq->engine->gt,
 153                                         INTEL_GT_SCRATCH_FIELD_DEFAULT) |
 154                 PIPE_CONTROL_GLOBAL_GTT;
 155         *cs++ = 0;
 156
 157         /* Finally we can flush and with it emit the breadcrumb */
 158         *cs++ = GFX_OP_PIPE_CONTROL(4);
 159         *cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
 160                  PIPE_CONTROL_DEPTH_CACHE_FLUSH |
 161                  PIPE_CONTROL_DC_FLUSH_ENABLE |
 162                  PIPE_CONTROL_QW_WRITE |
 163                  PIPE_CONTROL_CS_STALL);
 164         *cs++ = i915_request_active_seqno(rq) |
 165                 PIPE_CONTROL_GLOBAL_GTT;
 166         *cs++ = rq->fence.seqno;
 167
 168         *cs++ = MI_USER_INTERRUPT;
 169         *cs++ = MI_NOOP;
 170
 171         rq->tail = intel_ring_offset(rq, cs);
 172         assert_ring_tail_valid(rq->ring, rq->tail);
 173
 174         return cs;
 175 }
 176
 177 static int mi_flush_dw(struct i915_request *rq, u32 flags)
 178 {
 179         u32 cmd, *cs;
 180
 181         cs = intel_ring_begin(rq, 4);
 182         if (IS_ERR(cs))
 183                 return PTR_ERR(cs);
 184
 185         cmd = MI_FLUSH_DW;
 186
 187         /*
 188          * We always require a command barrier so that subsequent
 189          * commands, such as breadcrumb interrupts, are strictly ordered
 190          * wrt the contents of the write cache being flushed to memory
 191          * (and thus being coherent from the CPU).
 192          */
 193         cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
 194
 195         /*
 196          * Bspec vol 1c.3 - blitter engine command streamer:
 197          * "If ENABLED, all TLBs will be invalidated once the flush
 198          * operation is complete. This bit is only valid when the
 199          * Post-Sync Operation field is a value of 1h or 3h."
 200          */
 201         cmd |= flags;
 202
 203         *cs++ = cmd;
 204         *cs++ = HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
 205         *cs++ = 0;
 206         *cs++ = MI_NOOP;
 207
 208         intel_ring_advance(rq, cs);
 209
 210         return 0;
 211 }
 212
 213 static int gen6_flush_dw(struct i915_request *rq, u32 mode, u32 invflags)
 214 {
 215         return mi_flush_dw(rq, mode & EMIT_INVALIDATE ? invflags : 0);
 216 }
 217
 218 int gen6_emit_flush_xcs(struct i915_request *rq, u32 mode)
 219 {
 220         return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB);
 221 }
 222
 223 int gen6_emit_flush_vcs(struct i915_request *rq, u32 mode)
 224 {
 225         return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB | MI_INVALIDATE_BSD);
 226 }
 227
 228 int gen6_emit_bb_start(struct i915_request *rq,
 229                        u64 offset, u32 len,
 230                        unsigned int dispatch_flags)
 231 {
 232         u32 security;
 233         u32 *cs;
 234
 235         security = MI_BATCH_NON_SECURE_I965;
 236         if (dispatch_flags & I915_DISPATCH_SECURE)
 237                 security = 0;
 238
 239         cs = intel_ring_begin(rq, 2);
 240         if (IS_ERR(cs))
 241                 return PTR_ERR(cs);
 242
 243         cs = __gen6_emit_bb_start(cs, offset, security);
 244         intel_ring_advance(rq, cs);
 245
 246         return 0;
 247 }
 248
 249 int
 250 hsw_emit_bb_start(struct i915_request *rq,
 251                   u64 offset, u32 len,
 252                   unsigned int dispatch_flags)
 253 {
 254         u32 security;
 255         u32 *cs;
 256
 257         security = MI_BATCH_PPGTT_HSW | MI_BATCH_NON_SECURE_HSW;
 258         if (dispatch_flags & I915_DISPATCH_SECURE)
 259                 security = 0;
 260
 261         cs = intel_ring_begin(rq, 2);
 262         if (IS_ERR(cs))
 263                 return PTR_ERR(cs);
 264
 265         cs = __gen6_emit_bb_start(cs, offset, security);
 266         intel_ring_advance(rq, cs);
 267
 268         return 0;
 269 }
 270
 271 static int gen7_stall_cs(struct i915_request *rq)
 272 {
 273         u32 *cs;
 274
 275         cs = intel_ring_begin(rq, 4);
 276         if (IS_ERR(cs))
 277                 return PTR_ERR(cs);
 278
 279         *cs++ = GFX_OP_PIPE_CONTROL(4);
 280         *cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
 281         *cs++ = 0;
 282         *cs++ = 0;
 283         intel_ring_advance(rq, cs);
 284
 285         return 0;
 286 }
 287
 288 int gen7_emit_flush_rcs(struct i915_request *rq, u32 mode)
 289 {
 290         u32 scratch_addr =
 291                 intel_gt_scratch_offset(rq->engine->gt,
 292                                         INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
 293         u32 *cs, flags = 0;
 294
 295         /*
 296          * Ensure that any following seqno writes only happen when the render
 297          * cache is indeed flushed.
 298          *
 299          * Workaround: 4th PIPE_CONTROL command (except the ones with only
 300          * read-cache invalidate bits set) must have the CS_STALL bit set. We
 301          * don't try to be clever and just set it unconditionally.
 302          */
 303         flags |= PIPE_CONTROL_CS_STALL;
 304
 305         /*
 306          * CS_STALL suggests at least a post-sync write.
 307          */
 308         flags |= PIPE_CONTROL_QW_WRITE;
 309         flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;
 310
 311         /*
 312          * Just flush everything.  Experiments have shown that reducing the
 313          * number of bits based on the write domains has little performance
 314          * impact.
 315          */
 316         if (mode & EMIT_FLUSH) {
 317                 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
 318                 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
 319                 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
 320                 flags |= PIPE_CONTROL_FLUSH_ENABLE;
 321         }
 322         if (mode & EMIT_INVALIDATE) {
 323                 flags |= PIPE_CONTROL_TLB_INVALIDATE;
 324                 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
 325                 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
 326                 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
 327                 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
 328                 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
 329                 flags |= PIPE_CONTROL_MEDIA_STATE_CLEAR;
 330
 331                 /*
 332                  * Workaround: we must issue a pipe_control with CS-stall bit
 333                  * set before a pipe_control command that has the state cache
 334                  * invalidate bit set.
 335                  */
 336                 gen7_stall_cs(rq);
 337         }
 338
 339         cs = intel_ring_begin(rq, 4);
 340         if (IS_ERR(cs))
 341                 return PTR_ERR(cs);
 342
 343         *cs++ = GFX_OP_PIPE_CONTROL(4);
 344         *cs++ = flags;
 345         *cs++ = scratch_addr;
 346         *cs++ = 0;
 347         intel_ring_advance(rq, cs);
 348
 349         return 0;
 350 }
 351
 352 u32 *gen7_emit_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
 353 {
 354         *cs++ = GFX_OP_PIPE_CONTROL(4);
 355         *cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
 356                  PIPE_CONTROL_DEPTH_CACHE_FLUSH |
 357                  PIPE_CONTROL_DC_FLUSH_ENABLE |
 358                  PIPE_CONTROL_FLUSH_ENABLE |
 359                  PIPE_CONTROL_QW_WRITE |
 360                  PIPE_CONTROL_GLOBAL_GTT_IVB |
 361                  PIPE_CONTROL_CS_STALL);
 362         *cs++ = i915_request_active_seqno(rq);
 363         *cs++ = rq->fence.seqno;
 364
 365         *cs++ = MI_USER_INTERRUPT;
 366         *cs++ = MI_NOOP;
 367
 368         rq->tail = intel_ring_offset(rq, cs);
 369         assert_ring_tail_valid(rq->ring, rq->tail);
 370
 371         return cs;
 372 }
 373
 374 u32 *gen6_emit_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
 375 {
 376         GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);
 377         GEM_BUG_ON(offset_in_page(rq->hwsp_seqno) != I915_GEM_HWS_SEQNO_ADDR);
 378
 379         *cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
 380         *cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
 381         *cs++ = rq->fence.seqno;
 382
 383         *cs++ = MI_USER_INTERRUPT;
 384
 385         rq->tail = intel_ring_offset(rq, cs);
 386         assert_ring_tail_valid(rq->ring, rq->tail);
 387
 388         return cs;
 389 }
 390
 391 #define GEN7_XCS_WA 32
 392 u32 *gen7_emit_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
 393 {
 394         int i;
 395
 396         GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);
 397         GEM_BUG_ON(offset_in_page(rq->hwsp_seqno) != I915_GEM_HWS_SEQNO_ADDR);
 398
 399         *cs++ = MI_FLUSH_DW | MI_INVALIDATE_TLB |
 400                 MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
 401         *cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
 402         *cs++ = rq->fence.seqno;
 403
 404         for (i = 0; i < GEN7_XCS_WA; i++) {
 405                 *cs++ = MI_STORE_DWORD_INDEX;
 406                 *cs++ = I915_GEM_HWS_SEQNO_ADDR;
 407                 *cs++ = rq->fence.seqno;
 408         }
 409
 410         *cs++ = MI_FLUSH_DW;
 411         *cs++ = 0;
 412         *cs++ = 0;
 413
 414         *cs++ = MI_USER_INTERRUPT;
 415         *cs++ = MI_NOOP;
 416
 417         rq->tail = intel_ring_offset(rq, cs);
 418         assert_ring_tail_valid(rq->ring, rq->tail);
 419
 420         return cs;
 421 }
 422 #undef GEN7_XCS_WA
 423
 424 void gen6_irq_enable(struct intel_engine_cs *engine)
 425 {
 426         ENGINE_WRITE(engine, RING_IMR,
 427                      ~(engine->irq_enable_mask | engine->irq_keep_mask));
 428
 429         /* Flush/delay to ensure the RING_IMR is active before the GT IMR */
 430         ENGINE_POSTING_READ(engine, RING_IMR);
 431
 432         gen5_gt_enable_irq(engine->gt, engine->irq_enable_mask);
 433 }
 434
 435 void gen6_irq_disable(struct intel_engine_cs *engine)
 436 {
 437         ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
 438         gen5_gt_disable_irq(engine->gt, engine->irq_enable_mask);
 439 }
 440
 441 void hsw_irq_enable_vecs(struct intel_engine_cs *engine)
 442 {
 443         ENGINE_WRITE(engine, RING_IMR, ~engine->irq_enable_mask);
 444
 445         /* Flush/delay to ensure the RING_IMR is active before the GT IMR */
 446         ENGINE_POSTING_READ(engine, RING_IMR);
 447
 448         gen6_gt_pm_unmask_irq(engine->gt, engine->irq_enable_mask);
 449 }
 450
 451 void hsw_irq_disable_vecs(struct intel_engine_cs *engine)
 452 {
 453         ENGINE_WRITE(engine, RING_IMR, ~0);
 454         gen6_gt_pm_mask_irq(engine->gt, engine->irq_enable_mask);
 455 }