Merge drm/drm-next into drm-intel-next-queued
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 1744792..4b28225 100644
@@ -344,6 +344,7 @@ static void __unwind_incomplete_requests(struct intel_engine_cs *engine)
                        last_prio = rq_prio(rq);
                        p = lookup_priolist(engine, last_prio);
                }
+               GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
 
                GEM_BUG_ON(p->priority != rq_prio(rq));
                list_add(&rq->sched.link, &p->requests);
@@ -429,7 +430,7 @@ static u64 execlists_update_context(struct i915_request *rq)
         * PML4 is allocated during ppgtt init, so this is not needed
         * in 48-bit mode.
         */
-       if (ppgtt && !i915_vm_is_48bit(&ppgtt->vm))
+       if (!i915_vm_is_48bit(&ppgtt->vm))
                execlists_update_context_pdps(ppgtt, reg_state);
 
        return ce->lrc_desc;
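
For context, the reload this call performs (a sketch assumed from elsewhere in this file, not shown in the hunk): with a 3-level ppgtt the four page-directory pointers live in the context image and may change between submissions, so they are rewritten on every submit, while a 4-level (48-bit) ppgtt programs a single PML4 address once at init.

        /*
         * Sketch of execlists_update_context_pdps() as assumed here;
         * ASSIGN_CTX_PDP() stores one page-directory address into the image.
         */
        static void update_pdps_sketch(struct i915_hw_ppgtt *ppgtt,
                                       u32 *reg_state)
        {
                /* 3-level ppgtt: any of the four PDPs may have moved */
                ASSIGN_CTX_PDP(ppgtt, reg_state, 3);
                ASSIGN_CTX_PDP(ppgtt, reg_state, 2);
                ASSIGN_CTX_PDP(ppgtt, reg_state, 1);
                ASSIGN_CTX_PDP(ppgtt, reg_state, 0);
                /* 4-level ppgtt: PML4 was written at init, nothing to do */
        }
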
@@ -541,11 +542,6 @@ static void inject_preempt_context(struct intel_engine_cs *engine)
 
        GEM_BUG_ON(execlists->preempt_complete_status !=
                   upper_32_bits(ce->lrc_desc));
-       GEM_BUG_ON((ce->lrc_reg_state[CTX_CONTEXT_CONTROL + 1] &
-                   _MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT |
-                                      CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT)) !=
-                  _MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT |
-                                     CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT));
 
        /*
         * Switch to our empty preempt context so
@@ -1277,6 +1273,8 @@ static void execlists_context_destroy(struct intel_context *ce)
 
 static void execlists_context_unpin(struct intel_context *ce)
 {
+       i915_gem_context_unpin_hw_id(ce->gem_context);
+
        intel_ring_unpin(ce->ring);
 
        ce->state->obj->pin_global--;
@@ -1297,16 +1295,15 @@ static int __context_pin(struct i915_gem_context *ctx, struct i915_vma *vma)
         * on an active context (which by nature is already on the GPU).
         */
        if (!(vma->flags & I915_VMA_GLOBAL_BIND)) {
-               err = i915_gem_object_set_to_gtt_domain(vma->obj, true);
+               err = i915_gem_object_set_to_wc_domain(vma->obj, true);
                if (err)
                        return err;
        }
 
        flags = PIN_GLOBAL | PIN_HIGH;
-       if (ctx->ggtt_offset_bias)
-               flags |= PIN_OFFSET_BIAS | ctx->ggtt_offset_bias;
+       flags |= PIN_OFFSET_BIAS | i915_ggtt_pin_bias(vma);
 
-       return i915_vma_pin(vma, 0, GEN8_LR_CONTEXT_ALIGN, flags);
+       return i915_vma_pin(vma, 0, 0, flags);
 }
 
 static struct intel_context *
@@ -1326,28 +1323,38 @@ __execlists_context_pin(struct intel_engine_cs *engine,
        if (ret)
                goto err;
 
-       vaddr = i915_gem_object_pin_map(ce->state->obj, I915_MAP_WB);
+       vaddr = i915_gem_object_pin_map(ce->state->obj,
+                                       i915_coherent_map_type(ctx->i915) |
+                                       I915_MAP_OVERRIDE);
        if (IS_ERR(vaddr)) {
                ret = PTR_ERR(vaddr);
                goto unpin_vma;
        }
 
-       ret = intel_ring_pin(ce->ring, ctx->i915, ctx->ggtt_offset_bias);
+       ret = intel_ring_pin(ce->ring);
        if (ret)
                goto unpin_map;
 
+       ret = i915_gem_context_pin_hw_id(ctx);
+       if (ret)
+               goto unpin_ring;
+
        intel_lr_context_descriptor_update(ctx, engine, ce);
 
+       GEM_BUG_ON(!intel_ring_offset_valid(ce->ring, ce->ring->head));
+
        ce->lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE;
        ce->lrc_reg_state[CTX_RING_BUFFER_START+1] =
                i915_ggtt_offset(ce->ring->vma);
-       GEM_BUG_ON(!intel_ring_offset_valid(ce->ring, ce->ring->head));
-       ce->lrc_reg_state[CTX_RING_HEAD+1] = ce->ring->head;
+       ce->lrc_reg_state[CTX_RING_HEAD + 1] = ce->ring->head;
+       ce->lrc_reg_state[CTX_RING_TAIL + 1] = ce->ring->tail;
 
        ce->state->obj->pin_global++;
        i915_gem_context_get(ctx);
        return ce;
 
+unpin_ring:
+       intel_ring_unpin(ce->ring);
 unpin_map:
        i915_gem_object_unpin_map(ce->state->obj);
 unpin_vma:
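
The new unpin_ring label keeps the error path symmetric with the pin order above: vma, then map, then ring, then hw_id, released in exact reverse on failure. A self-contained illustration of the ladder, with hypothetical stub names rather than i915 API:

        /*
         * Generic shape of the unwind ladder: each label undoes only
         * what had already succeeded before the failing step.
         */
        static int pin_vma(void)   { return 0; }
        static int pin_map(void)   { return 0; }
        static int pin_ring(void)  { return 0; }
        static int pin_hw_id(void) { return 0; }
        static void release_ring(void) { }
        static void release_map(void)  { }
        static void release_vma(void)  { }

        static int pin_all(void)
        {
                int ret;

                ret = pin_vma();
                if (ret)
                        return ret;

                ret = pin_map();
                if (ret)
                        goto unpin_vma;

                ret = pin_ring();
                if (ret)
                        goto unpin_map;

                ret = pin_hw_id();
                if (ret)
                        goto unpin_ring;

                return 0;

        unpin_ring:
                release_ring();
        unpin_map:
                release_map();
        unpin_vma:
                release_vma();
                return ret;
        }
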
@@ -1369,6 +1376,7 @@ execlists_context_pin(struct intel_engine_cs *engine,
        struct intel_context *ce = to_intel_context(ctx, engine);
 
        lockdep_assert_held(&ctx->i915->drm.struct_mutex);
+       GEM_BUG_ON(!(ctx->ppgtt ?: ctx->i915->mm.aliasing_ppgtt));
 
        if (likely(ce->pin_count++))
                return ce;
@@ -1643,7 +1651,7 @@ static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
                goto err;
        }
 
-       err = i915_vma_pin(vma, 0, PAGE_SIZE, PIN_GLOBAL | PIN_HIGH);
+       err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL | PIN_HIGH);
        if (err)
                goto err;
 
@@ -1657,7 +1665,7 @@ err:
 
 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine)
 {
-       i915_vma_unpin_and_release(&engine->wa_ctx.vma);
+       i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
 }
 
 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
@@ -1775,11 +1783,7 @@ static bool unexpected_starting_state(struct intel_engine_cs *engine)
 
 static int gen8_init_common_ring(struct intel_engine_cs *engine)
 {
-       int ret;
-
-       ret = intel_mocs_init_engine(engine);
-       if (ret)
-               return ret;
+       intel_mocs_init_engine(engine);
 
        intel_engine_reset_breadcrumbs(engine);
 
@@ -1838,7 +1842,8 @@ execlists_reset_prepare(struct intel_engine_cs *engine)
        struct i915_request *request, *active;
        unsigned long flags;
 
-       GEM_TRACE("%s\n", engine->name);
+       GEM_TRACE("%s: depth<-%d\n", engine->name,
+                 atomic_read(&execlists->tasklet.count));
 
        /*
         * Prevent request submission to the hardware until we have
@@ -1971,22 +1976,17 @@ static void execlists_reset_finish(struct intel_engine_cs *engine)
 {
        struct intel_engine_execlists * const execlists = &engine->execlists;
 
-       /* After a GPU reset, we may have requests to replay */
-       if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
-               tasklet_schedule(&execlists->tasklet);
-
        /*
-        * Flush the tasklet while we still have the forcewake to be sure
-        * that it is not allowed to sleep before we restart and reload a
-        * context.
+        * After a GPU reset, we may have requests to replay. Do so now while
+        * we still have the forcewake to be sure that the GPU is not allowed
+        * to sleep before we restart and reload a context.
-        *
-        * As before (with execlists_reset_prepare) we rely on the caller
-        * serialising multiple attempts to reset so that we know that we
-        * are the only one manipulating tasklet state.
         */
-       __tasklet_enable_sync_once(&execlists->tasklet);
+       if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
+               execlists->tasklet.func(execlists->tasklet.data);
 
-       GEM_TRACE("%s\n", engine->name);
+       tasklet_enable(&execlists->tasklet);
+       GEM_TRACE("%s: depth->%d\n", engine->name,
+                 atomic_read(&execlists->tasklet.count));
 }
 
 static int intel_logical_ring_emit_pdps(struct i915_request *rq)
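
The change in execlists_reset_finish() replaces a deferred tasklet_schedule() with a direct call. While the tasklet is still disabled from execlists_reset_prepare(), a schedule would only fire after tasklet_enable(), possibly in ksoftirqd once forcewake has been dropped; invoking the function pointer replays the queue synchronously while the GPU is guaranteed awake. A minimal sketch, assuming the pre-5.9 DECLARE_TASKLET(name, func, data) API this kernel uses and a hypothetical replay_func:

        #include <linux/interrupt.h>

        static void replay_func(unsigned long data)
        {
                /* resubmit any requests left in the queue */
        }

        static DECLARE_TASKLET(replay, replay_func, 0);

        static void reset_finish_sketch(bool have_requests)
        {
                /*
                 * The tasklet count is still elevated from tasklet_disable()
                 * in reset_prepare, so tasklet_schedule() would be deferred.
                 * Calling the function pointer runs the replay synchronously,
                 * while the caller still holds forcewake.
                 */
                if (have_requests)
                        replay.func(replay.data);

                tasklet_enable(&replay);
        }
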
@@ -2066,8 +2067,7 @@ static int gen8_emit_bb_start(struct i915_request *rq,
 
        /* FIXME(BDW): Address space and security selectors. */
        *cs++ = MI_BATCH_BUFFER_START_GEN8 |
-               (flags & I915_DISPATCH_SECURE ? 0 : BIT(8)) |
-               (flags & I915_DISPATCH_RS ? MI_BATCH_RESOURCE_STREAMER : 0);
+               (flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
        *cs++ = lower_32_bits(offset);
        *cs++ = upper_32_bits(offset);
 
@@ -2398,7 +2398,7 @@ static int logical_ring_init(struct intel_engine_cs *engine)
 
        ret = intel_engine_init_common(engine);
        if (ret)
-               goto error;
+               return ret;
 
        if (HAS_LOGICAL_RING_ELSQ(i915)) {
                execlists->submit_reg = i915->regs +
@@ -2440,10 +2440,6 @@ static int logical_ring_init(struct intel_engine_cs *engine)
        reset_csb_pointers(execlists);
 
        return 0;
-
-error:
-       intel_logical_ring_cleanup(engine);
-       return ret;
 }
 
 int logical_render_ring_init(struct intel_engine_cs *engine)
@@ -2466,10 +2462,14 @@ int logical_render_ring_init(struct intel_engine_cs *engine)
        engine->emit_breadcrumb = gen8_emit_breadcrumb_rcs;
        engine->emit_breadcrumb_sz = gen8_emit_breadcrumb_rcs_sz;
 
-       ret = intel_engine_create_scratch(engine, PAGE_SIZE);
+       ret = logical_ring_init(engine);
        if (ret)
                return ret;
 
+       ret = intel_engine_create_scratch(engine, PAGE_SIZE);
+       if (ret)
+               goto err_cleanup_common;
+
        ret = intel_init_workaround_bb(engine);
        if (ret) {
                /*
@@ -2481,7 +2481,11 @@ int logical_render_ring_init(struct intel_engine_cs *engine)
                          ret);
        }
 
-       return logical_ring_init(engine);
+       return 0;
+
+err_cleanup_common:
+       intel_engine_cleanup_common(engine);
+       return ret;
 }
 
 int logical_xcs_ring_init(struct intel_engine_cs *engine)
@@ -2494,6 +2498,9 @@ int logical_xcs_ring_init(struct intel_engine_cs *engine)
 static u32
 make_rpcs(struct drm_i915_private *dev_priv)
 {
+       bool subslice_pg = INTEL_INFO(dev_priv)->sseu.has_subslice_pg;
+       u8 slices = hweight8(INTEL_INFO(dev_priv)->sseu.slice_mask);
+       u8 subslices = hweight8(INTEL_INFO(dev_priv)->sseu.subslice_mask[0]);
        u32 rpcs = 0;
 
        /*
@@ -2503,6 +2510,38 @@ make_rpcs(struct drm_i915_private *dev_priv)
        if (INTEL_GEN(dev_priv) < 9)
                return 0;
 
+       /*
+        * Since the SScount bitfield in GEN8_R_PWR_CLK_STATE is only three bits
+        * wide and Icelake has up to eight subslices, special programming is
+        * needed in order to correctly enable all subslices.
+        *
+        * According to documentation software must consider the configuration
+        * as 2x4x8 and hardware will translate this to 1x8x8.
+        *
+        * Furthermore, even though SScount is three bits, the maximum documented
+        * value for it is four. From this, some rules/restrictions follow:
+        *
+        * 1.
+        * If enabled subslice count is greater than four, two whole slices must
+        * be enabled instead.
+        *
+        * 2.
+        * When more than one slice is enabled, hardware ignores the subslice
+        * count altogether.
+        *
+        * From these restrictions it follows that it is not possible to enable
+        * a count of subslices anywhere between the SScount maximum of four
+        * and the maximum number available on a particular SKU. Either all
+        * subslices are enabled, or a count between one and four on the
+        * first slice.
+        */
+       if (IS_GEN11(dev_priv) && slices == 1 && subslices >= 4) {
+               GEM_BUG_ON(subslices & 1);
+
+               subslice_pg = false;
+               slices *= 2;
+       }
+
        /*
         * Starting in Gen9, render power gating can leave
         * slice/subslice/EU in a partially enabled state. We
@@ -2510,24 +2549,50 @@ make_rpcs(struct drm_i915_private *dev_priv)
         * enablement.
        */
        if (INTEL_INFO(dev_priv)->sseu.has_slice_pg) {
-               rpcs |= GEN8_RPCS_S_CNT_ENABLE;
-               rpcs |= hweight8(INTEL_INFO(dev_priv)->sseu.slice_mask) <<
-                       GEN8_RPCS_S_CNT_SHIFT;
-               rpcs |= GEN8_RPCS_ENABLE;
+               u32 mask, val = slices;
+
+               if (INTEL_GEN(dev_priv) >= 11) {
+                       mask = GEN11_RPCS_S_CNT_MASK;
+                       val <<= GEN11_RPCS_S_CNT_SHIFT;
+               } else {
+                       mask = GEN8_RPCS_S_CNT_MASK;
+                       val <<= GEN8_RPCS_S_CNT_SHIFT;
+               }
+
+               GEM_BUG_ON(val & ~mask);
+               val &= mask;
+
+               rpcs |= GEN8_RPCS_ENABLE | GEN8_RPCS_S_CNT_ENABLE | val;
        }
 
-       if (INTEL_INFO(dev_priv)->sseu.has_subslice_pg) {
-               rpcs |= GEN8_RPCS_SS_CNT_ENABLE;
-               rpcs |= hweight8(INTEL_INFO(dev_priv)->sseu.subslice_mask[0]) <<
-                       GEN8_RPCS_SS_CNT_SHIFT;
-               rpcs |= GEN8_RPCS_ENABLE;
+       if (subslice_pg) {
+               u32 val = subslices;
+
+               val <<= GEN8_RPCS_SS_CNT_SHIFT;
+
+               GEM_BUG_ON(val & ~GEN8_RPCS_SS_CNT_MASK);
+               val &= GEN8_RPCS_SS_CNT_MASK;
+
+               rpcs |= GEN8_RPCS_ENABLE | GEN8_RPCS_SS_CNT_ENABLE | val;
        }
 
        if (INTEL_INFO(dev_priv)->sseu.has_eu_pg) {
-               rpcs |= INTEL_INFO(dev_priv)->sseu.eu_per_subslice <<
-                       GEN8_RPCS_EU_MIN_SHIFT;
-               rpcs |= INTEL_INFO(dev_priv)->sseu.eu_per_subslice <<
-                       GEN8_RPCS_EU_MAX_SHIFT;
+               u32 val;
+
+               val = INTEL_INFO(dev_priv)->sseu.eu_per_subslice <<
+                     GEN8_RPCS_EU_MIN_SHIFT;
+               GEM_BUG_ON(val & ~GEN8_RPCS_EU_MIN_MASK);
+               val &= GEN8_RPCS_EU_MIN_MASK;
+
+               rpcs |= val;
+
+               val = INTEL_INFO(dev_priv)->sseu.eu_per_subslice <<
+                     GEN8_RPCS_EU_MAX_SHIFT;
+               GEM_BUG_ON(val & ~GEN8_RPCS_EU_MAX_MASK);
+               val &= GEN8_RPCS_EU_MAX_MASK;
+
+               rpcs |= val;
+
                rpcs |= GEN8_RPCS_ENABLE;
        }
 
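
A worked example of the Icelake rule above as standalone C (hweight8() is reimplemented so the sketch compiles outside the kernel; the register masks and shifts are left out since only the counting logic is illustrated): a 1x8 SKU, slice_mask 0x1 and subslice_mask[0] 0xff, exceeds the SScount maximum of four, so subslice power gating is dropped and the slice count doubled, giving the 2x4 configuration that the hardware translates back to 1x8.

        #include <stdio.h>

        /* Population count, standing in for the kernel's hweight8(). */
        static unsigned int hweight8(unsigned char v)
        {
                unsigned int n = 0;

                for (; v; v >>= 1)
                        n += v & 1;
                return n;
        }

        int main(void)
        {
                unsigned char slice_mask = 0x1;     /* 1x8 Icelake SKU */
                unsigned char subslice_mask = 0xff;
                unsigned int slices = hweight8(slice_mask);
                unsigned int subslices = hweight8(subslice_mask);
                int subslice_pg = 1;

                /*
                 * SScount is three bits wide and documented up to four: for
                 * more than four subslices on one slice, pretend two slices
                 * are enabled and let the hardware translate 2x4 to 1x8.
                 */
                if (slices == 1 && subslices >= 4) {
                        subslice_pg = 0;
                        slices *= 2;
                }

                printf("RPCS programs %ux%u, subslice PG %s\n",
                       slices, subslices / slices,
                       subslice_pg ? "on" : "off");
                return 0;
        }
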
@@ -2584,11 +2649,13 @@ static void execlists_init_reg_state(u32 *regs,
                                 MI_LRI_FORCE_POSTED;
 
        CTX_REG(regs, CTX_CONTEXT_CONTROL, RING_CONTEXT_CONTROL(engine),
-               _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT |
-                                   CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT) |
-               _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH |
-                                  (HAS_RESOURCE_STREAMER(dev_priv) ?
-                                  CTX_CTRL_RS_CTX_ENABLE : 0)));
+               _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT) |
+               _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH));
+       if (INTEL_GEN(dev_priv) < 11) {
+               regs[CTX_CONTEXT_CONTROL + 1] |=
+                       _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
+                                           CTX_CTRL_RS_CTX_ENABLE);
+       }
        CTX_REG(regs, CTX_RING_HEAD, RING_HEAD(base), 0);
        CTX_REG(regs, CTX_RING_TAIL, RING_TAIL(base), 0);
        CTX_REG(regs, CTX_RING_BUFFER_START, RING_START(base), 0);
@@ -2639,7 +2706,7 @@ static void execlists_init_reg_state(u32 *regs,
        CTX_REG(regs, CTX_PDP0_UDW, GEN8_RING_PDP_UDW(engine, 0), 0);
        CTX_REG(regs, CTX_PDP0_LDW, GEN8_RING_PDP_LDW(engine, 0), 0);
 
-       if (ppgtt && i915_vm_is_48bit(&ppgtt->vm)) {
+       if (i915_vm_is_48bit(&ppgtt->vm)) {
                /* 64b PPGTT (48bit canonical)
                 * PDP0_DESCRIPTOR contains the base address to PML4 and
                 * other PDP Descriptors are ignored.
@@ -2654,6 +2721,10 @@ static void execlists_init_reg_state(u32 *regs,
 
                i915_oa_init_reg_state(engine, ctx, regs);
        }
+
+       regs[CTX_END] = MI_BATCH_BUFFER_END;
+       if (INTEL_GEN(dev_priv) >= 10)
+               regs[CTX_END] |= BIT(0);
 }
 
 static int
@@ -2780,13 +2851,14 @@ error_deref_obj:
        return ret;
 }
 
-void intel_lr_context_resume(struct drm_i915_private *dev_priv)
+void intel_lr_context_resume(struct drm_i915_private *i915)
 {
        struct intel_engine_cs *engine;
        struct i915_gem_context *ctx;
        enum intel_engine_id id;
 
-       /* Because we emit WA_TAIL_DWORDS there may be a disparity
+       /*
+        * Because we emit WA_TAIL_DWORDS there may be a disparity
         * between our bookkeeping in ce->ring->head and ce->ring->tail and
         * that stored in context. As we only write new commands from
         * ce->ring->tail onwards, everything before that is junk. If the GPU
@@ -2796,28 +2868,22 @@ void intel_lr_context_resume(struct drm_i915_private *dev_priv)
         * So to avoid that we reset the context images upon resume. For
         * simplicity, we just zero everything out.
         */
-       list_for_each_entry(ctx, &dev_priv->contexts.list, link) {
-               for_each_engine(engine, dev_priv, id) {
+       list_for_each_entry(ctx, &i915->contexts.list, link) {
+               for_each_engine(engine, i915, id) {
                        struct intel_context *ce =
                                to_intel_context(ctx, engine);
-                       u32 *reg;
 
                        if (!ce->state)
                                continue;
 
-                       reg = i915_gem_object_pin_map(ce->state->obj,
-                                                     I915_MAP_WB);
-                       if (WARN_ON(IS_ERR(reg)))
-                               continue;
-
-                       reg += LRC_STATE_PN * PAGE_SIZE / sizeof(*reg);
-                       reg[CTX_RING_HEAD+1] = 0;
-                       reg[CTX_RING_TAIL+1] = 0;
+                       intel_ring_reset(ce->ring, 0);
 
-                       ce->state->obj->mm.dirty = true;
-                       i915_gem_object_unpin_map(ce->state->obj);
+                       if (ce->pin_count) { /* otherwise done in context_pin */
+                               u32 *regs = ce->lrc_reg_state;
 
-                       intel_ring_reset(ce->ring, 0);
+                               regs[CTX_RING_HEAD + 1] = ce->ring->head;
+                               regs[CTX_RING_TAIL + 1] = ce->ring->tail;
+                       }
                }
        }
 }
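
The resume path above depends on one invariant worth spelling out: after intel_ring_reset(ce->ring, 0) both head and tail are zero, and a context that is still pinned will not pass through execlists_context_pin() again, so its image has to be patched by hand. A sketch of that fix-up (field and register names as in the diff; the helper itself is hypothetical):

        /*
         * Hypothetical helper: make a pinned context image agree with the
         * ring bookkeeping after the ring was reset to empty on resume.
         */
        static void sync_pinned_context_sketch(struct intel_context *ce)
        {
                u32 *regs = ce->lrc_reg_state;  /* mapping kept while pinned */

                regs[CTX_RING_HEAD + 1] = ce->ring->head;  /* 0 after reset */
                regs[CTX_RING_TAIL + 1] = ce->ring->tail;  /* 0 after reset */
        }
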