Merge tag 'drm-next-2020-06-02' of git://anongit.freedesktop.org/drm/drm
[linux-2.6-microblaze.git] drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
index 8a4e9c1..3ce1856 100644
@@ -15,8 +15,8 @@
 
 #include "gem/i915_gem_ioctls.h"
 #include "gt/intel_context.h"
-#include "gt/intel_engine_pool.h"
 #include "gt/intel_gt.h"
+#include "gt/intel_gt_buffer_pool.h"
 #include "gt/intel_gt_pm.h"
 #include "gt/intel_ring.h"
 
@@ -40,6 +40,11 @@ struct eb_vma {
        u32 handle;
 };
 
+struct eb_vma_array {
+       struct kref kref;
+       struct eb_vma vma[];
+};
+
 enum {
        FORCE_CPU_RELOC = 1,
        FORCE_GTT_RELOC,
@@ -52,7 +57,6 @@ enum {
 #define __EXEC_OBJECT_NEEDS_MAP                BIT(29)
 #define __EXEC_OBJECT_NEEDS_BIAS       BIT(28)
 #define __EXEC_OBJECT_INTERNAL_FLAGS   (~0u << 28) /* all of the above */
-#define __EXEC_OBJECT_RESERVED (__EXEC_OBJECT_HAS_PIN | __EXEC_OBJECT_HAS_FENCE)
 
 #define __EXEC_HAS_RELOC       BIT(31)
 #define __EXEC_INTERNAL_FLAGS  (~0u << 31)
@@ -264,7 +268,9 @@ struct i915_execbuffer {
                bool has_fence : 1;
                bool needs_unfenced : 1;
 
+               struct i915_vma *target;
                struct i915_request *rq;
+               struct i915_vma *rq_vma;
                u32 *rq_cmd;
                unsigned int rq_size;
        } reloc_cache;
@@ -283,6 +289,7 @@ struct i915_execbuffer {
         */
        int lut_size;
        struct hlist_head *buckets; /** ht for relocation handles */
+       struct eb_vma_array *array;
 };
 
 static inline bool eb_use_cmdparser(const struct i915_execbuffer *eb)
@@ -292,8 +299,62 @@ static inline bool eb_use_cmdparser(const struct i915_execbuffer *eb)
                 eb->args->batch_len);
 }
 
+static struct eb_vma_array *eb_vma_array_create(unsigned int count)
+{
+       struct eb_vma_array *arr;
+
+       arr = kvmalloc(struct_size(arr, vma, count), GFP_KERNEL | __GFP_NOWARN);
+       if (!arr)
+               return NULL;
+
+       kref_init(&arr->kref);
+       arr->vma[0].vma = NULL;
+
+       return arr;
+}
+
+static inline void eb_unreserve_vma(struct eb_vma *ev)
+{
+       struct i915_vma *vma = ev->vma;
+
+       if (unlikely(ev->flags & __EXEC_OBJECT_HAS_FENCE))
+               __i915_vma_unpin_fence(vma);
+
+       if (ev->flags & __EXEC_OBJECT_HAS_PIN)
+               __i915_vma_unpin(vma);
+
+       ev->flags &= ~(__EXEC_OBJECT_HAS_PIN |
+                      __EXEC_OBJECT_HAS_FENCE);
+}
+
+static void eb_vma_array_destroy(struct kref *kref)
+{
+       struct eb_vma_array *arr = container_of(kref, typeof(*arr), kref);
+       struct eb_vma *ev = arr->vma;
+
+       while (ev->vma) {
+               eb_unreserve_vma(ev);
+               i915_vma_put(ev->vma);
+               ev++;
+       }
+
+       kvfree(arr);
+}
+
+static void eb_vma_array_put(struct eb_vma_array *arr)
+{
+       kref_put(&arr->kref, eb_vma_array_destroy);
+}
+
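The three helpers above replace the old __eb_unreserve_vma()/eb_release_vmas() pair: execbuffer vma state now lives in one kref-counted allocation whose entries are terminated by a NULL vma (eb_vma_array_destroy() walks arr->vma until ev->vma is NULL, which is why eb_create() below allocates buffer_count + 2 slots, one for the command parser and one for the sentinel). A minimal sketch of the same kref + flexible-array pattern, using hypothetical item/item_array names rather than anything from this patch:

#include <linux/kref.h>
#include <linux/mm.h>           /* kvmalloc(), kvfree() */
#include <linux/overflow.h>     /* struct_size() */

struct item {
        void *payload;                  /* NULL payload terminates the array */
};

struct item_array {
        struct kref kref;
        struct item items[];
};

static struct item_array *item_array_create(unsigned int count)
{
        struct item_array *arr;

        /* header + count entries + one sentinel slot, in a single allocation */
        arr = kvmalloc(struct_size(arr, items, count + 1), GFP_KERNEL);
        if (!arr)
                return NULL;

        kref_init(&arr->kref);
        arr->items[0].payload = NULL;
        return arr;
}

static void item_array_destroy(struct kref *kref)
{
        struct item_array *arr = container_of(kref, struct item_array, kref);
        struct item *it;

        for (it = arr->items; it->payload; it++)
                ;       /* drop per-entry references here, as eb_vma_array_destroy() does */

        kvfree(arr);
}

static void item_array_put(struct item_array *arr)
{
        kref_put(&arr->kref, item_array_destroy);
}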
 static int eb_create(struct i915_execbuffer *eb)
 {
+       /* Allocate an extra slot for use by the command parser + sentinel */
+       eb->array = eb_vma_array_create(eb->buffer_count + 2);
+       if (!eb->array)
+               return -ENOMEM;
+
+       eb->vma = eb->array->vma;
+
        if (!(eb->args->flags & I915_EXEC_HANDLE_LUT)) {
                unsigned int size = 1 + ilog2(eb->buffer_count);
 
@@ -327,8 +388,10 @@ static int eb_create(struct i915_execbuffer *eb)
                                break;
                } while (--size);
 
-               if (unlikely(!size))
+               if (unlikely(!size)) {
+                       eb_vma_array_put(eb->array);
                        return -ENOMEM;
+               }
 
                eb->lut_size = size;
        } else {
@@ -368,6 +431,32 @@ eb_vma_misplaced(const struct drm_i915_gem_exec_object2 *entry,
        return false;
 }
 
+static u64 eb_pin_flags(const struct drm_i915_gem_exec_object2 *entry,
+                       unsigned int exec_flags)
+{
+       u64 pin_flags = 0;
+
+       if (exec_flags & EXEC_OBJECT_NEEDS_GTT)
+               pin_flags |= PIN_GLOBAL;
+
+       /*
+        * Wa32bitGeneralStateOffset & Wa32bitInstructionBaseOffset,
+        * limit address to the first 4GBs for unflagged objects.
+        */
+       if (!(exec_flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS))
+               pin_flags |= PIN_ZONE_4G;
+
+       if (exec_flags & __EXEC_OBJECT_NEEDS_MAP)
+               pin_flags |= PIN_MAPPABLE;
+
+       if (exec_flags & EXEC_OBJECT_PINNED)
+               pin_flags |= entry->offset | PIN_OFFSET_FIXED;
+       else if (exec_flags & __EXEC_OBJECT_NEEDS_BIAS)
+               pin_flags |= BATCH_OFFSET_BIAS | PIN_OFFSET_BIAS;
+
+       return pin_flags;
+}
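A quick usage illustration of eb_pin_flags() (not part of the patch; the offset is made up, the PIN_* and EXEC_OBJECT_* flags are the ones referenced above): an entry flagged EXEC_OBJECT_PINNED without 48B support resolves to a fixed placement inside the low 4GiB.

static u64 example_pin_flags(void)
{
        struct drm_i915_gem_exec_object2 entry = {
                .offset = 0x100000,
                .flags = EXEC_OBJECT_PINNED,
        };

        /* evaluates to PIN_ZONE_4G | 0x100000 | PIN_OFFSET_FIXED */
        return eb_pin_flags(&entry, entry.flags);
}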
+
 static inline bool
 eb_pin_vma(struct i915_execbuffer *eb,
           const struct drm_i915_gem_exec_object2 *entry,
@@ -385,8 +474,19 @@ eb_pin_vma(struct i915_execbuffer *eb,
        if (unlikely(ev->flags & EXEC_OBJECT_NEEDS_GTT))
                pin_flags |= PIN_GLOBAL;
 
-       if (unlikely(i915_vma_pin(vma, 0, 0, pin_flags)))
-               return false;
+       /* Attempt to reuse the current location if available */
+       if (unlikely(i915_vma_pin(vma, 0, 0, pin_flags))) {
+               if (entry->flags & EXEC_OBJECT_PINNED)
+                       return false;
+
+               /* Failing that pick any _free_ space if suitable */
+               if (unlikely(i915_vma_pin(vma,
+                                         entry->pad_to_size,
+                                         entry->alignment,
+                                         eb_pin_flags(entry, ev->flags) |
+                                         PIN_USER | PIN_NOEVICT)))
+                       return false;
+       }
 
        if (unlikely(ev->flags & EXEC_OBJECT_NEEDS_FENCE)) {
                if (unlikely(i915_vma_pin_fence(vma))) {
@@ -402,26 +502,6 @@ eb_pin_vma(struct i915_execbuffer *eb,
        return !eb_vma_misplaced(entry, vma, ev->flags);
 }
 
-static inline void __eb_unreserve_vma(struct i915_vma *vma, unsigned int flags)
-{
-       GEM_BUG_ON(!(flags & __EXEC_OBJECT_HAS_PIN));
-
-       if (unlikely(flags & __EXEC_OBJECT_HAS_FENCE))
-               __i915_vma_unpin_fence(vma);
-
-       __i915_vma_unpin(vma);
-}
-
-static inline void
-eb_unreserve_vma(struct eb_vma *ev)
-{
-       if (!(ev->flags & __EXEC_OBJECT_HAS_PIN))
-               return;
-
-       __eb_unreserve_vma(ev->vma, ev->flags);
-       ev->flags &= ~__EXEC_OBJECT_RESERVED;
-}
-
 static int
 eb_validate_vma(struct i915_execbuffer *eb,
                struct drm_i915_gem_exec_object2 *entry,
@@ -481,7 +561,7 @@ eb_add_vma(struct i915_execbuffer *eb,
 
        GEM_BUG_ON(i915_vma_is_closed(vma));
 
-       ev->vma = i915_vma_get(vma);
+       ev->vma = vma;
        ev->exec = entry;
        ev->flags = entry->flags;
 
@@ -547,28 +627,9 @@ static int eb_reserve_vma(const struct i915_execbuffer *eb,
                          u64 pin_flags)
 {
        struct drm_i915_gem_exec_object2 *entry = ev->exec;
-       unsigned int exec_flags = ev->flags;
        struct i915_vma *vma = ev->vma;
        int err;
 
-       if (exec_flags & EXEC_OBJECT_NEEDS_GTT)
-               pin_flags |= PIN_GLOBAL;
-
-       /*
-        * Wa32bitGeneralStateOffset & Wa32bitInstructionBaseOffset,
-        * limit address to the first 4GBs for unflagged objects.
-        */
-       if (!(exec_flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS))
-               pin_flags |= PIN_ZONE_4G;
-
-       if (exec_flags & __EXEC_OBJECT_NEEDS_MAP)
-               pin_flags |= PIN_MAPPABLE;
-
-       if (exec_flags & EXEC_OBJECT_PINNED)
-               pin_flags |= entry->offset | PIN_OFFSET_FIXED;
-       else if (exec_flags & __EXEC_OBJECT_NEEDS_BIAS)
-               pin_flags |= BATCH_OFFSET_BIAS | PIN_OFFSET_BIAS;
-
        if (drm_mm_node_allocated(&vma->node) &&
            eb_vma_misplaced(entry, vma, ev->flags)) {
                err = i915_vma_unbind(vma);
@@ -578,7 +639,7 @@ static int eb_reserve_vma(const struct i915_execbuffer *eb,
 
        err = i915_vma_pin(vma,
                           entry->pad_to_size, entry->alignment,
-                          pin_flags);
+                          eb_pin_flags(entry, ev->flags) | pin_flags);
        if (err)
                return err;
 
@@ -587,7 +648,7 @@ static int eb_reserve_vma(const struct i915_execbuffer *eb,
                eb->args->flags |= __EXEC_HAS_RELOC;
        }
 
-       if (unlikely(exec_flags & EXEC_OBJECT_NEEDS_FENCE)) {
+       if (unlikely(ev->flags & EXEC_OBJECT_NEEDS_FENCE)) {
                err = i915_vma_pin_fence(vma);
                if (unlikely(err)) {
                        i915_vma_unpin(vma);
@@ -595,10 +656,10 @@ static int eb_reserve_vma(const struct i915_execbuffer *eb,
                }
 
                if (vma->fence)
-                       exec_flags |= __EXEC_OBJECT_HAS_FENCE;
+                       ev->flags |= __EXEC_OBJECT_HAS_FENCE;
        }
 
-       ev->flags = exec_flags | __EXEC_OBJECT_HAS_PIN;
+       ev->flags |= __EXEC_OBJECT_HAS_PIN;
        GEM_BUG_ON(eb_vma_misplaced(entry, vma, ev->flags));
 
        return 0;
@@ -728,77 +789,117 @@ static int eb_select_context(struct i915_execbuffer *eb)
        return 0;
 }
 
-static int eb_lookup_vmas(struct i915_execbuffer *eb)
+static int __eb_add_lut(struct i915_execbuffer *eb,
+                       u32 handle, struct i915_vma *vma)
 {
-       struct radix_tree_root *handles_vma = &eb->gem_context->handles_vma;
-       struct drm_i915_gem_object *obj;
-       unsigned int i, batch;
+       struct i915_gem_context *ctx = eb->gem_context;
+       struct i915_lut_handle *lut;
        int err;
 
-       if (unlikely(i915_gem_context_is_closed(eb->gem_context)))
-               return -ENOENT;
+       lut = i915_lut_handle_alloc();
+       if (unlikely(!lut))
+               return -ENOMEM;
 
-       INIT_LIST_HEAD(&eb->relocs);
-       INIT_LIST_HEAD(&eb->unbound);
+       i915_vma_get(vma);
+       if (!atomic_fetch_inc(&vma->open_count))
+               i915_vma_reopen(vma);
+       lut->handle = handle;
+       lut->ctx = ctx;
+
+       /* Check that the context hasn't been closed in the meantime */
+       err = -EINTR;
+       if (!mutex_lock_interruptible(&ctx->mutex)) {
+               err = -ENOENT;
+               if (likely(!i915_gem_context_is_closed(ctx)))
+                       err = radix_tree_insert(&ctx->handles_vma, handle, vma);
+               if (err == 0) { /* And nor has this handle */
+                       struct drm_i915_gem_object *obj = vma->obj;
+
+                       i915_gem_object_lock(obj);
+                       if (idr_find(&eb->file->object_idr, handle) == obj) {
+                               list_add(&lut->obj_link, &obj->lut_list);
+                       } else {
+                               radix_tree_delete(&ctx->handles_vma, handle);
+                               err = -ENOENT;
+                       }
+                       i915_gem_object_unlock(obj);
+               }
+               mutex_unlock(&ctx->mutex);
+       }
+       if (unlikely(err))
+               goto err;
 
-       batch = eb_batch_index(eb);
+       return 0;
 
-       for (i = 0; i < eb->buffer_count; i++) {
-               u32 handle = eb->exec[i].handle;
-               struct i915_lut_handle *lut;
+err:
+       i915_vma_close(vma);
+       i915_vma_put(vma);
+       i915_lut_handle_free(lut);
+       return err;
+}
+
+static struct i915_vma *eb_lookup_vma(struct i915_execbuffer *eb, u32 handle)
+{
+       do {
+               struct drm_i915_gem_object *obj;
                struct i915_vma *vma;
+               int err;
 
-               vma = radix_tree_lookup(handles_vma, handle);
+               rcu_read_lock();
+               vma = radix_tree_lookup(&eb->gem_context->handles_vma, handle);
                if (likely(vma))
-                       goto add_vma;
+                       vma = i915_vma_tryget(vma);
+               rcu_read_unlock();
+               if (likely(vma))
+                       return vma;
 
                obj = i915_gem_object_lookup(eb->file, handle);
-               if (unlikely(!obj)) {
-                       err = -ENOENT;
-                       goto err_vma;
-               }
+               if (unlikely(!obj))
+                       return ERR_PTR(-ENOENT);
 
                vma = i915_vma_instance(obj, eb->context->vm, NULL);
                if (IS_ERR(vma)) {
-                       err = PTR_ERR(vma);
-                       goto err_obj;
+                       i915_gem_object_put(obj);
+                       return vma;
                }
 
-               lut = i915_lut_handle_alloc();
-               if (unlikely(!lut)) {
-                       err = -ENOMEM;
-                       goto err_obj;
-               }
+               err = __eb_add_lut(eb, handle, vma);
+               if (likely(!err))
+                       return vma;
 
-               err = radix_tree_insert(handles_vma, handle, vma);
-               if (unlikely(err)) {
-                       i915_lut_handle_free(lut);
-                       goto err_obj;
-               }
+               i915_gem_object_put(obj);
+               if (err != -EEXIST)
+                       return ERR_PTR(err);
+       } while (1);
+}
 
-               /* transfer ref to lut */
-               if (!atomic_fetch_inc(&vma->open_count))
-                       i915_vma_reopen(vma);
-               lut->handle = handle;
-               lut->ctx = eb->gem_context;
+static int eb_lookup_vmas(struct i915_execbuffer *eb)
+{
+       unsigned int batch = eb_batch_index(eb);
+       unsigned int i;
+       int err = 0;
 
-               i915_gem_object_lock(obj);
-               list_add(&lut->obj_link, &obj->lut_list);
-               i915_gem_object_unlock(obj);
+       INIT_LIST_HEAD(&eb->relocs);
+       INIT_LIST_HEAD(&eb->unbound);
+
+       for (i = 0; i < eb->buffer_count; i++) {
+               struct i915_vma *vma;
+
+               vma = eb_lookup_vma(eb, eb->exec[i].handle);
+               if (IS_ERR(vma)) {
+                       err = PTR_ERR(vma);
+                       break;
+               }
 
-add_vma:
                err = eb_validate_vma(eb, &eb->exec[i], vma);
-               if (unlikely(err))
-                       goto err_vma;
+               if (unlikely(err)) {
+                       i915_vma_put(vma);
+                       break;
+               }
 
                eb_add_vma(eb, i, batch, vma);
        }
 
-       return 0;
-
-err_obj:
-       i915_gem_object_put(obj);
-err_vma:
        eb->vma[i].vma = NULL;
        return err;
 }
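The rewritten lookup no longer holds ctx->mutex around the whole loop: the fast path is an RCU radix-tree lookup plus i915_vma_tryget(), and only a miss drops into __eb_add_lut(), which inserts under ctx->mutex, re-checks that neither the context nor the GEM handle was closed in the meantime, and retries on -EEXIST if another thread won the race. A generic sketch of that lookup-or-insert pattern, with hypothetical my_* names (the mutex stands in for ctx->mutex):

#include <linux/err.h>
#include <linux/kref.h>
#include <linux/mutex.h>
#include <linux/radix-tree.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct my_entry {
        struct kref ref;
        struct rcu_head rcu;
        u32 handle;
};

struct my_cache {
        struct radix_tree_root tree;    /* INIT_RADIX_TREE(&tree, GFP_KERNEL) */
        struct mutex lock;              /* serialises insertion, like ctx->mutex */
};

static void my_entry_free(struct kref *ref)
{
        struct my_entry *e = container_of(ref, struct my_entry, ref);

        kfree_rcu(e, rcu);      /* lookups are under RCU, so defer the free */
}

static struct my_entry *my_cache_lookup(struct my_cache *cache, u32 handle)
{
        do {
                struct my_entry *e;
                int err;

                rcu_read_lock();
                e = radix_tree_lookup(&cache->tree, handle);
                if (e && !kref_get_unless_zero(&e->ref))
                        e = NULL;       /* raced with the final put: treat as a miss */
                rcu_read_unlock();
                if (e)
                        return e;

                e = kzalloc(sizeof(*e), GFP_KERNEL);
                if (!e)
                        return ERR_PTR(-ENOMEM);

                kref_init(&e->ref);
                e->handle = handle;

                mutex_lock(&cache->lock);
                err = radix_tree_insert(&cache->tree, handle, e);
                mutex_unlock(&cache->lock);
                if (!err)
                        return e;

                kref_put(&e->ref, my_entry_free);
                if (err != -EEXIST)
                        return ERR_PTR(err);
                /* lost the insertion race: loop and take the winner's entry */
        } while (1);
}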
@@ -823,31 +924,13 @@ eb_get_vma(const struct i915_execbuffer *eb, unsigned long handle)
        }
 }
 
-static void eb_release_vmas(const struct i915_execbuffer *eb)
-{
-       const unsigned int count = eb->buffer_count;
-       unsigned int i;
-
-       for (i = 0; i < count; i++) {
-               struct eb_vma *ev = &eb->vma[i];
-               struct i915_vma *vma = ev->vma;
-
-               if (!vma)
-                       break;
-
-               eb->vma[i].vma = NULL;
-
-               if (ev->flags & __EXEC_OBJECT_HAS_PIN)
-                       __eb_unreserve_vma(vma, ev->flags);
-
-               i915_vma_put(vma);
-       }
-}
-
 static void eb_destroy(const struct i915_execbuffer *eb)
 {
        GEM_BUG_ON(eb->reloc_cache.rq);
 
+       if (eb->array)
+               eb_vma_array_put(eb->array);
+
        if (eb->lut_size > 0)
                kfree(eb->buckets);
 }
@@ -872,7 +955,7 @@ static void reloc_cache_init(struct reloc_cache *cache,
        cache->needs_unfenced = INTEL_INFO(i915)->unfenced_needs_alignment;
        cache->node.flags = 0;
        cache->rq = NULL;
-       cache->rq_size = 0;
+       cache->target = NULL;
 }
 
 static inline void *unmask_page(unsigned long p)
@@ -894,29 +977,122 @@ static inline struct i915_ggtt *cache_to_ggtt(struct reloc_cache *cache)
        return &i915->ggtt;
 }
 
-static void reloc_gpu_flush(struct reloc_cache *cache)
+#define RELOC_TAIL 4
+
+static int reloc_gpu_chain(struct reloc_cache *cache)
+{
+       struct intel_gt_buffer_pool_node *pool;
+       struct i915_request *rq = cache->rq;
+       struct i915_vma *batch;
+       u32 *cmd;
+       int err;
+
+       pool = intel_gt_get_buffer_pool(rq->engine->gt, PAGE_SIZE);
+       if (IS_ERR(pool))
+               return PTR_ERR(pool);
+
+       batch = i915_vma_instance(pool->obj, rq->context->vm, NULL);
+       if (IS_ERR(batch)) {
+               err = PTR_ERR(batch);
+               goto out_pool;
+       }
+
+       err = i915_vma_pin(batch, 0, 0, PIN_USER | PIN_NONBLOCK);
+       if (err)
+               goto out_pool;
+
+       GEM_BUG_ON(cache->rq_size + RELOC_TAIL > PAGE_SIZE  / sizeof(u32));
+       cmd = cache->rq_cmd + cache->rq_size;
+       *cmd++ = MI_ARB_CHECK;
+       if (cache->gen >= 8)
+               *cmd++ = MI_BATCH_BUFFER_START_GEN8;
+       else if (cache->gen >= 6)
+               *cmd++ = MI_BATCH_BUFFER_START;
+       else
+               *cmd++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
+       *cmd++ = lower_32_bits(batch->node.start);
+       *cmd++ = upper_32_bits(batch->node.start); /* Always 0 for gen<8 */
+       i915_gem_object_flush_map(cache->rq_vma->obj);
+       i915_gem_object_unpin_map(cache->rq_vma->obj);
+       cache->rq_vma = NULL;
+
+       err = intel_gt_buffer_pool_mark_active(pool, rq);
+       if (err == 0) {
+               i915_vma_lock(batch);
+               err = i915_request_await_object(rq, batch->obj, false);
+               if (err == 0)
+                       err = i915_vma_move_to_active(batch, rq, 0);
+               i915_vma_unlock(batch);
+       }
+       i915_vma_unpin(batch);
+       if (err)
+               goto out_pool;
+
+       cmd = i915_gem_object_pin_map(batch->obj,
+                                     cache->has_llc ?
+                                     I915_MAP_FORCE_WB :
+                                     I915_MAP_FORCE_WC);
+       if (IS_ERR(cmd)) {
+               err = PTR_ERR(cmd);
+               goto out_pool;
+       }
+
+       /* Return with batch mapping (cmd) still pinned */
+       cache->rq_cmd = cmd;
+       cache->rq_size = 0;
+       cache->rq_vma = batch;
+
+out_pool:
+       intel_gt_buffer_pool_put(pool);
+       return err;
+}
+
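Relocations are now emitted into page-sized buffer-pool batches that are chained together: when a batch fills up, reloc_gpu_chain() writes MI_ARB_CHECK plus an MI_BATCH_BUFFER_START into the reserved tail and continues in a fresh buffer, and reloc_gpu_flush() later terminates the final batch with MI_BATCH_BUFFER_END. The space check performed by reloc_gpu() below reduces to the helper sketched here (a restatement of the arithmetic, not code from the patch):

/*
 * RELOC_TAIL dwords are always kept free at the end of the page so that
 * either the chain jump (MI_ARB_CHECK + MI_BATCH_BUFFER_START + up to two
 * address dwords) or the final MI_BATCH_BUFFER_END is guaranteed to fit.
 */
static bool reloc_batch_has_room(const struct reloc_cache *cache,
                                 unsigned int len)
{
        return cache->rq_size + len <= PAGE_SIZE / sizeof(u32) - RELOC_TAIL;
}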
+static unsigned int reloc_bb_flags(const struct reloc_cache *cache)
 {
-       struct drm_i915_gem_object *obj = cache->rq->batch->obj;
+       return cache->gen > 5 ? 0 : I915_DISPATCH_SECURE;
+}
+
+static int reloc_gpu_flush(struct reloc_cache *cache)
+{
+       struct i915_request *rq;
+       int err;
 
-       GEM_BUG_ON(cache->rq_size >= obj->base.size / sizeof(u32));
-       cache->rq_cmd[cache->rq_size] = MI_BATCH_BUFFER_END;
+       rq = fetch_and_zero(&cache->rq);
+       if (!rq)
+               return 0;
 
-       __i915_gem_object_flush_map(obj, 0, sizeof(u32) * (cache->rq_size + 1));
-       i915_gem_object_unpin_map(obj);
+       if (cache->rq_vma) {
+               struct drm_i915_gem_object *obj = cache->rq_vma->obj;
 
-       intel_gt_chipset_flush(cache->rq->engine->gt);
+               GEM_BUG_ON(cache->rq_size >= obj->base.size / sizeof(u32));
+               cache->rq_cmd[cache->rq_size++] = MI_BATCH_BUFFER_END;
 
-       i915_request_add(cache->rq);
-       cache->rq = NULL;
+               __i915_gem_object_flush_map(obj,
+                                           0, sizeof(u32) * cache->rq_size);
+               i915_gem_object_unpin_map(obj);
+       }
+
+       err = 0;
+       if (rq->engine->emit_init_breadcrumb)
+               err = rq->engine->emit_init_breadcrumb(rq);
+       if (!err)
+               err = rq->engine->emit_bb_start(rq,
+                                               rq->batch->node.start,
+                                               PAGE_SIZE,
+                                               reloc_bb_flags(cache));
+       if (err)
+               i915_request_set_error_once(rq, err);
+
+       intel_gt_chipset_flush(rq->engine->gt);
+       i915_request_add(rq);
+
+       return err;
 }
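Note the error handling above: even when emit_init_breadcrumb() or emit_bb_start() fails, the request is still added so that anything already queued against it retires in order, just with its error recorded. A minimal sketch of that pattern, with a hypothetical helper name:

static void submit_or_poison(struct i915_request *rq, int err)
{
        if (err)
                i915_request_set_error_once(rq, err);

        /* always submit, so work already queued against rq is flushed in order */
        i915_request_add(rq);
}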
 
 static void reloc_cache_reset(struct reloc_cache *cache)
 {
        void *vaddr;
 
-       if (cache->rq)
-               reloc_gpu_flush(cache);
-
        if (!cache->vaddr)
                return;
 
@@ -1109,17 +1285,17 @@ static int reloc_move_to_gpu(struct i915_request *rq, struct i915_vma *vma)
 }
 
 static int __reloc_gpu_alloc(struct i915_execbuffer *eb,
-                            struct i915_vma *vma,
+                            struct intel_engine_cs *engine,
                             unsigned int len)
 {
        struct reloc_cache *cache = &eb->reloc_cache;
-       struct intel_engine_pool_node *pool;
+       struct intel_gt_buffer_pool_node *pool;
        struct i915_request *rq;
        struct i915_vma *batch;
        u32 *cmd;
        int err;
 
-       pool = intel_engine_get_pool(eb->engine, PAGE_SIZE);
+       pool = intel_gt_get_buffer_pool(engine->gt, PAGE_SIZE);
        if (IS_ERR(pool))
                return PTR_ERR(pool);
 
@@ -1132,7 +1308,7 @@ static int __reloc_gpu_alloc(struct i915_execbuffer *eb,
                goto out_pool;
        }
 
-       batch = i915_vma_instance(pool->obj, vma->vm, NULL);
+       batch = i915_vma_instance(pool->obj, eb->context->vm, NULL);
        if (IS_ERR(batch)) {
                err = PTR_ERR(batch);
                goto err_unmap;
@@ -1142,26 +1318,32 @@ static int __reloc_gpu_alloc(struct i915_execbuffer *eb,
        if (err)
                goto err_unmap;
 
-       rq = i915_request_create(eb->context);
+       if (engine == eb->context->engine) {
+               rq = i915_request_create(eb->context);
+       } else {
+               struct intel_context *ce;
+
+               ce = intel_context_create(engine);
+               if (IS_ERR(ce)) {
+                       err = PTR_ERR(ce);
+                       goto err_unpin;
+               }
+
+               i915_vm_put(ce->vm);
+               ce->vm = i915_vm_get(eb->context->vm);
+
+               rq = intel_context_create_request(ce);
+               intel_context_put(ce);
+       }
        if (IS_ERR(rq)) {
                err = PTR_ERR(rq);
                goto err_unpin;
        }
 
-       err = intel_engine_pool_mark_active(pool, rq);
+       err = intel_gt_buffer_pool_mark_active(pool, rq);
        if (err)
                goto err_request;
 
-       err = reloc_move_to_gpu(rq, vma);
-       if (err)
-               goto err_request;
-
-       err = eb->engine->emit_bb_start(rq,
-                                       batch->node.start, PAGE_SIZE,
-                                       cache->gen > 5 ? 0 : I915_DISPATCH_SECURE);
-       if (err)
-               goto skip_request;
-
        i915_vma_lock(batch);
        err = i915_request_await_object(rq, batch->obj, false);
        if (err == 0)
@@ -1176,6 +1358,7 @@ static int __reloc_gpu_alloc(struct i915_execbuffer *eb,
        cache->rq = rq;
        cache->rq_cmd = cmd;
        cache->rq_size = 0;
+       cache->rq_vma = batch;
 
        /* Return with batch mapping (cmd) still pinned */
        goto out_pool;
@@ -1189,124 +1372,206 @@ err_unpin:
 err_unmap:
        i915_gem_object_unpin_map(pool->obj);
 out_pool:
-       intel_engine_pool_put(pool);
+       intel_gt_buffer_pool_put(pool);
        return err;
 }
 
+static bool reloc_can_use_engine(const struct intel_engine_cs *engine)
+{
+       return engine->class != VIDEO_DECODE_CLASS || !IS_GEN(engine->i915, 6);
+}
+
 static u32 *reloc_gpu(struct i915_execbuffer *eb,
                      struct i915_vma *vma,
                      unsigned int len)
 {
        struct reloc_cache *cache = &eb->reloc_cache;
        u32 *cmd;
-
-       if (cache->rq_size > PAGE_SIZE/sizeof(u32) - (len + 1))
-               reloc_gpu_flush(cache);
+       int err;
 
        if (unlikely(!cache->rq)) {
-               int err;
+               struct intel_engine_cs *engine = eb->engine;
 
-               if (!intel_engine_can_store_dword(eb->engine))
-                       return ERR_PTR(-ENODEV);
+               if (!reloc_can_use_engine(engine)) {
+                       engine = engine->gt->engine_class[COPY_ENGINE_CLASS][0];
+                       if (!engine)
+                               return ERR_PTR(-ENODEV);
+               }
 
-               err = __reloc_gpu_alloc(eb, vma, len);
+               err = __reloc_gpu_alloc(eb, engine, len);
                if (unlikely(err))
                        return ERR_PTR(err);
        }
 
+       if (vma != cache->target) {
+               err = reloc_move_to_gpu(cache->rq, vma);
+               if (unlikely(err)) {
+                       i915_request_set_error_once(cache->rq, err);
+                       return ERR_PTR(err);
+               }
+
+               cache->target = vma;
+       }
+
+       if (unlikely(cache->rq_size + len >
+                    PAGE_SIZE / sizeof(u32) - RELOC_TAIL)) {
+               err = reloc_gpu_chain(cache);
+               if (unlikely(err)) {
+                       i915_request_set_error_once(cache->rq, err);
+                       return ERR_PTR(err);
+               }
+       }
+
+       GEM_BUG_ON(cache->rq_size + len >= PAGE_SIZE  / sizeof(u32));
        cmd = cache->rq_cmd + cache->rq_size;
        cache->rq_size += len;
 
        return cmd;
 }
 
-static u64
-relocate_entry(struct i915_vma *vma,
-              const struct drm_i915_gem_relocation_entry *reloc,
-              struct i915_execbuffer *eb,
-              const struct i915_vma *target)
+static inline bool use_reloc_gpu(struct i915_vma *vma)
 {
-       u64 offset = reloc->offset;
-       u64 target_offset = relocation_target(reloc, target);
-       bool wide = eb->reloc_cache.use_64bit_reloc;
-       void *vaddr;
+       if (DBG_FORCE_RELOC == FORCE_GPU_RELOC)
+               return true;
 
-       if (!eb->reloc_cache.vaddr &&
-           (DBG_FORCE_RELOC == FORCE_GPU_RELOC ||
-            !dma_resv_test_signaled_rcu(vma->resv, true))) {
-               const unsigned int gen = eb->reloc_cache.gen;
-               unsigned int len;
-               u32 *batch;
-               u64 addr;
-
-               if (wide)
-                       len = offset & 7 ? 8 : 5;
-               else if (gen >= 4)
-                       len = 4;
-               else
-                       len = 3;
+       if (DBG_FORCE_RELOC)
+               return false;
 
-               batch = reloc_gpu(eb, vma, len);
-               if (IS_ERR(batch))
-                       goto repeat;
+       return !dma_resv_test_signaled_rcu(vma->resv, true);
+}
 
-               addr = gen8_canonical_addr(vma->node.start + offset);
-               if (wide) {
-                       if (offset & 7) {
-                               *batch++ = MI_STORE_DWORD_IMM_GEN4;
-                               *batch++ = lower_32_bits(addr);
-                               *batch++ = upper_32_bits(addr);
-                               *batch++ = lower_32_bits(target_offset);
-
-                               addr = gen8_canonical_addr(addr + 4);
-
-                               *batch++ = MI_STORE_DWORD_IMM_GEN4;
-                               *batch++ = lower_32_bits(addr);
-                               *batch++ = upper_32_bits(addr);
-                               *batch++ = upper_32_bits(target_offset);
-                       } else {
-                               *batch++ = (MI_STORE_DWORD_IMM_GEN4 | (1 << 21)) + 1;
-                               *batch++ = lower_32_bits(addr);
-                               *batch++ = upper_32_bits(addr);
-                               *batch++ = lower_32_bits(target_offset);
-                               *batch++ = upper_32_bits(target_offset);
-                       }
-               } else if (gen >= 6) {
+static unsigned long vma_phys_addr(struct i915_vma *vma, u32 offset)
+{
+       struct page *page;
+       unsigned long addr;
+
+       GEM_BUG_ON(vma->pages != vma->obj->mm.pages);
+
+       page = i915_gem_object_get_page(vma->obj, offset >> PAGE_SHIFT);
+       addr = PFN_PHYS(page_to_pfn(page));
+       GEM_BUG_ON(overflows_type(addr, u32)); /* expected dma32 */
+
+       return addr + offset_in_page(offset);
+}
+
+static bool __reloc_entry_gpu(struct i915_execbuffer *eb,
+                             struct i915_vma *vma,
+                             u64 offset,
+                             u64 target_addr)
+{
+       const unsigned int gen = eb->reloc_cache.gen;
+       unsigned int len;
+       u32 *batch;
+       u64 addr;
+
+       if (gen >= 8)
+               len = offset & 7 ? 8 : 5;
+       else if (gen >= 4)
+               len = 4;
+       else
+               len = 3;
+
+       batch = reloc_gpu(eb, vma, len);
+       if (IS_ERR(batch))
+               return false;
+
+       addr = gen8_canonical_addr(vma->node.start + offset);
+       if (gen >= 8) {
+               if (offset & 7) {
                        *batch++ = MI_STORE_DWORD_IMM_GEN4;
-                       *batch++ = 0;
-                       *batch++ = addr;
-                       *batch++ = target_offset;
-               } else if (gen >= 4) {
-                       *batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
-                       *batch++ = 0;
-                       *batch++ = addr;
-                       *batch++ = target_offset;
+                       *batch++ = lower_32_bits(addr);
+                       *batch++ = upper_32_bits(addr);
+                       *batch++ = lower_32_bits(target_addr);
+
+                       addr = gen8_canonical_addr(addr + 4);
+
+                       *batch++ = MI_STORE_DWORD_IMM_GEN4;
+                       *batch++ = lower_32_bits(addr);
+                       *batch++ = upper_32_bits(addr);
+                       *batch++ = upper_32_bits(target_addr);
                } else {
-                       *batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
-                       *batch++ = addr;
-                       *batch++ = target_offset;
+                       *batch++ = (MI_STORE_DWORD_IMM_GEN4 | (1 << 21)) + 1;
+                       *batch++ = lower_32_bits(addr);
+                       *batch++ = upper_32_bits(addr);
+                       *batch++ = lower_32_bits(target_addr);
+                       *batch++ = upper_32_bits(target_addr);
                }
-
-               goto out;
+       } else if (gen >= 6) {
+               *batch++ = MI_STORE_DWORD_IMM_GEN4;
+               *batch++ = 0;
+               *batch++ = addr;
+               *batch++ = target_addr;
+       } else if (IS_I965G(eb->i915)) {
+               *batch++ = MI_STORE_DWORD_IMM_GEN4;
+               *batch++ = 0;
+               *batch++ = vma_phys_addr(vma, offset);
+               *batch++ = target_addr;
+       } else if (gen >= 4) {
+               *batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
+               *batch++ = 0;
+               *batch++ = addr;
+               *batch++ = target_addr;
+       } else if (gen >= 3 &&
+                  !(IS_I915G(eb->i915) || IS_I915GM(eb->i915))) {
+               *batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
+               *batch++ = addr;
+               *batch++ = target_addr;
+       } else {
+               *batch++ = MI_STORE_DWORD_IMM;
+               *batch++ = vma_phys_addr(vma, offset);
+               *batch++ = target_addr;
        }
 
+       return true;
+}
+
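For a concrete picture of what the aligned gen8+ path above emits (illustration only: the addresses are made up, and the MI_* macro is the same one used in the hunk, from the i915 command headers this file already includes), a single 64-bit relocation becomes one five-dword store covering both halves of the target address:

static const u32 example_gen8_reloc_stream[] = {
        (MI_STORE_DWORD_IMM_GEN4 | (1 << 21)) + 1,      /* opcode as emitted above */
        0x00100000,     /* lower_32_bits(vma->node.start + offset) */
        0x00000000,     /* upper_32_bits(vma->node.start + offset) */
        0x00200000,     /* lower_32_bits(target_addr) */
        0x00000000,     /* upper_32_bits(target_addr) */
};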
+static bool reloc_entry_gpu(struct i915_execbuffer *eb,
+                           struct i915_vma *vma,
+                           u64 offset,
+                           u64 target_addr)
+{
+       if (eb->reloc_cache.vaddr)
+               return false;
+
+       if (!use_reloc_gpu(vma))
+               return false;
+
+       return __reloc_entry_gpu(eb, vma, offset, target_addr);
+}
+
+static u64
+relocate_entry(struct i915_vma *vma,
+              const struct drm_i915_gem_relocation_entry *reloc,
+              struct i915_execbuffer *eb,
+              const struct i915_vma *target)
+{
+       u64 target_addr = relocation_target(reloc, target);
+       u64 offset = reloc->offset;
+
+       if (!reloc_entry_gpu(eb, vma, offset, target_addr)) {
+               bool wide = eb->reloc_cache.use_64bit_reloc;
+               void *vaddr;
+
 repeat:
-       vaddr = reloc_vaddr(vma->obj, &eb->reloc_cache, offset >> PAGE_SHIFT);
-       if (IS_ERR(vaddr))
-               return PTR_ERR(vaddr);
+               vaddr = reloc_vaddr(vma->obj,
+                                   &eb->reloc_cache,
+                                   offset >> PAGE_SHIFT);
+               if (IS_ERR(vaddr))
+                       return PTR_ERR(vaddr);
 
-       clflush_write32(vaddr + offset_in_page(offset),
-                       lower_32_bits(target_offset),
-                       eb->reloc_cache.vaddr);
+               GEM_BUG_ON(!IS_ALIGNED(offset, sizeof(u32)));
+               clflush_write32(vaddr + offset_in_page(offset),
+                               lower_32_bits(target_addr),
+                               eb->reloc_cache.vaddr);
 
-       if (wide) {
-               offset += sizeof(u32);
-               target_offset >>= 32;
-               wide = false;
-               goto repeat;
+               if (wide) {
+                       offset += sizeof(u32);
+                       target_addr >>= 32;
+                       wide = false;
+                       goto repeat;
+               }
        }
 
-out:
        return target->node.start | UPDATE;
 }
 
@@ -1411,12 +1676,11 @@ static int eb_relocate_vma(struct i915_execbuffer *eb, struct eb_vma *ev)
 {
 #define N_RELOC(x) ((x) / sizeof(struct drm_i915_gem_relocation_entry))
        struct drm_i915_gem_relocation_entry stack[N_RELOC(512)];
-       struct drm_i915_gem_relocation_entry __user *urelocs;
        const struct drm_i915_gem_exec_object2 *entry = ev->exec;
-       unsigned int remain;
+       struct drm_i915_gem_relocation_entry __user *urelocs =
+               u64_to_user_ptr(entry->relocs_ptr);
+       unsigned long remain = entry->relocation_count;
 
-       urelocs = u64_to_user_ptr(entry->relocs_ptr);
-       remain = entry->relocation_count;
        if (unlikely(remain > N_RELOC(ULONG_MAX)))
                return -EINVAL;
 
@@ -1425,13 +1689,13 @@ static int eb_relocate_vma(struct i915_execbuffer *eb, struct eb_vma *ev)
         * to read. However, if the array is not writable the user loses
         * the updated relocation values.
         */
-       if (unlikely(!access_ok(urelocs, remain*sizeof(*urelocs))))
+       if (unlikely(!access_ok(urelocs, remain * sizeof(*urelocs))))
                return -EFAULT;
 
        do {
                struct drm_i915_gem_relocation_entry *r = stack;
                unsigned int count =
-                       min_t(unsigned int, remain, ARRAY_SIZE(stack));
+                       min_t(unsigned long, remain, ARRAY_SIZE(stack));
                unsigned int copied;
 
                /*
@@ -1494,9 +1758,7 @@ static int eb_relocate(struct i915_execbuffer *eb)
 {
        int err;
 
-       mutex_lock(&eb->gem_context->mutex);
        err = eb_lookup_vmas(eb);
-       mutex_unlock(&eb->gem_context->mutex);
        if (err)
                return err;
 
@@ -1509,15 +1771,20 @@ static int eb_relocate(struct i915_execbuffer *eb)
        /* The objects are in their final locations, apply the relocations. */
        if (eb->args->flags & __EXEC_HAS_RELOC) {
                struct eb_vma *ev;
+               int flush;
 
                list_for_each_entry(ev, &eb->relocs, reloc_link) {
                        err = eb_relocate_vma(eb, ev);
                        if (err)
-                               return err;
+                               break;
                }
+
+               flush = reloc_gpu_flush(&eb->reloc_cache);
+               if (!err)
+                       err = flush;
        }
 
-       return 0;
+       return err;
 }
 
 static int eb_move_to_gpu(struct i915_execbuffer *eb)
@@ -1597,19 +1864,15 @@ static int eb_move_to_gpu(struct i915_execbuffer *eb)
                        err = i915_vma_move_to_active(vma, eb->request, flags);
 
                i915_vma_unlock(vma);
-
-               __eb_unreserve_vma(vma, flags);
-               i915_vma_put(vma);
-
-               ev->vma = NULL;
+               eb_unreserve_vma(ev);
        }
        ww_acquire_fini(&acquire);
 
+       eb_vma_array_put(fetch_and_zero(&eb->array));
+
        if (unlikely(err))
                goto err_skip;
 
-       eb->exec = NULL;
-
        /* Unconditionally flush any chipset caches (for streaming writes). */
        intel_gt_chipset_flush(eb->engine->gt);
        return 0;
@@ -1784,7 +2047,7 @@ static int eb_parse_pipeline(struct i915_execbuffer *eb,
        dma_resv_add_excl_fence(shadow->resv, &pw->base.dma);
        dma_resv_unlock(shadow->resv);
 
-       dma_fence_work_commit(&pw->base);
+       dma_fence_work_commit_imm(&pw->base);
        return 0;
 
 err_batch_unlock:
@@ -1804,7 +2067,7 @@ err_free:
 static int eb_parse(struct i915_execbuffer *eb)
 {
        struct drm_i915_private *i915 = eb->i915;
-       struct intel_engine_pool_node *pool;
+       struct intel_gt_buffer_pool_node *pool;
        struct i915_vma *shadow, *trampoline;
        unsigned int len;
        int err;
@@ -1827,7 +2090,7 @@ static int eb_parse(struct i915_execbuffer *eb)
                len += I915_CMD_PARSER_TRAMPOLINE_SIZE;
        }
 
-       pool = intel_engine_get_pool(eb->engine, len);
+       pool = intel_gt_get_buffer_pool(eb->engine->gt, len);
        if (IS_ERR(pool))
                return PTR_ERR(pool);
 
@@ -1861,6 +2124,7 @@ static int eb_parse(struct i915_execbuffer *eb)
        eb->vma[eb->buffer_count].vma = i915_vma_get(shadow);
        eb->vma[eb->buffer_count].flags = __EXEC_OBJECT_HAS_PIN;
        eb->batch = &eb->vma[eb->buffer_count++];
+       eb->vma[eb->buffer_count].vma = NULL;
 
        eb->trampoline = trampoline;
        eb->batch_start_offset = 0;
@@ -1874,7 +2138,7 @@ err_trampoline:
 err_shadow:
        i915_vma_unpin(shadow);
 err:
-       intel_engine_pool_put(pool);
+       intel_gt_buffer_pool_put(pool);
        return err;
 }
 
@@ -2318,39 +2582,13 @@ static void eb_request_add(struct i915_execbuffer *eb)
        /* Check that the context wasn't destroyed before submission */
        if (likely(!intel_context_is_closed(eb->context))) {
                attr = eb->gem_context->sched;
-
-               /*
-                * Boost actual workloads past semaphores!
-                *
-                * With semaphores we spin on one engine waiting for another,
-                * simply to reduce the latency of starting our work when
-                * the signaler completes. However, if there is any other
-                * work that we could be doing on this engine instead, that
-                * is better utilisation and will reduce the overall duration
-                * of the current work. To avoid PI boosting a semaphore
-                * far in the distance past over useful work, we keep a history
-                * of any semaphore use along our dependency chain.
-                */
-               if (!(rq->sched.flags & I915_SCHED_HAS_SEMAPHORE_CHAIN))
-                       attr.priority |= I915_PRIORITY_NOSEMAPHORE;
-
-               /*
-                * Boost priorities to new clients (new request flows).
-                *
-                * Allow interactive/synchronous clients to jump ahead of
-                * the bulk clients. (FQ_CODEL)
-                */
-               if (list_empty(&rq->sched.signalers_list))
-                       attr.priority |= I915_PRIORITY_WAIT;
        } else {
                /* Serialise with context_close via the add_to_timeline */
                i915_request_set_error_once(rq, -ENOENT);
                __i915_request_skip(rq);
        }
 
-       local_bh_disable();
        __i915_request_queue(rq, &attr);
-       local_bh_enable(); /* Kick the execlists tasklet if just scheduled */
 
        /* Try to clean up the client's timeline after submitting the request */
        if (prev)
@@ -2369,7 +2607,6 @@ i915_gem_do_execbuffer(struct drm_device *dev,
        struct drm_i915_private *i915 = to_i915(dev);
        struct i915_execbuffer eb;
        struct dma_fence *in_fence = NULL;
-       struct dma_fence *exec_fence = NULL;
        struct sync_file *out_fence = NULL;
        struct i915_vma *batch;
        int out_fence_fd = -1;
@@ -2386,8 +2623,6 @@ i915_gem_do_execbuffer(struct drm_device *dev,
                args->flags |= __EXEC_HAS_RELOC;
 
        eb.exec = exec;
-       eb.vma = (struct eb_vma *)(exec + args->buffer_count + 1);
-       eb.vma[0].vma = NULL;
 
        eb.invalid_flags = __EXEC_OBJECT_UNKNOWN_FLAGS;
        reloc_cache_init(&eb.reloc_cache, eb.i915);
@@ -2414,30 +2649,22 @@ i915_gem_do_execbuffer(struct drm_device *dev,
        if (args->flags & I915_EXEC_IS_PINNED)
                eb.batch_flags |= I915_DISPATCH_PINNED;
 
-       if (args->flags & I915_EXEC_FENCE_IN) {
+#define IN_FENCES (I915_EXEC_FENCE_IN | I915_EXEC_FENCE_SUBMIT)
+       if (args->flags & IN_FENCES) {
+               if ((args->flags & IN_FENCES) == IN_FENCES)
+                       return -EINVAL;
+
                in_fence = sync_file_get_fence(lower_32_bits(args->rsvd2));
                if (!in_fence)
                        return -EINVAL;
        }
-
-       if (args->flags & I915_EXEC_FENCE_SUBMIT) {
-               if (in_fence) {
-                       err = -EINVAL;
-                       goto err_in_fence;
-               }
-
-               exec_fence = sync_file_get_fence(lower_32_bits(args->rsvd2));
-               if (!exec_fence) {
-                       err = -EINVAL;
-                       goto err_in_fence;
-               }
-       }
+#undef IN_FENCES
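From userspace's side, I915_EXEC_FENCE_IN and I915_EXEC_FENCE_SUBMIT are now rejected in combination because both take their sync_file fd from the low 32 bits of rsvd2. A hedged usage sketch (hypothetical variable names; only uapi fields that appear in this function are used):

#include <string.h>
#include <stdint.h>
#include <drm/i915_drm.h>       /* via libdrm's include path */

static void fill_execbuf_with_submit_fence(struct drm_i915_gem_execbuffer2 *eb2,
                                           uint64_t objs_ptr, uint32_t nobjs,
                                           uint64_t engine_flags, int submit_fd)
{
        memset(eb2, 0, sizeof(*eb2));
        eb2->buffers_ptr = objs_ptr;
        eb2->buffer_count = nobjs;
        /* FENCE_SUBMIT and FENCE_IN may not be set together */
        eb2->flags = engine_flags | I915_EXEC_FENCE_SUBMIT;
        eb2->rsvd2 = (uint32_t)submit_fd;       /* low 32 bits: input fence fd */
}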
 
        if (args->flags & I915_EXEC_FENCE_OUT) {
                out_fence_fd = get_unused_fd_flags(O_CLOEXEC);
                if (out_fence_fd < 0) {
                        err = out_fence_fd;
-                       goto err_exec_fence;
+                       goto err_in_fence;
                }
        }
 
@@ -2528,14 +2755,13 @@ i915_gem_do_execbuffer(struct drm_device *dev,
        }
 
        if (in_fence) {
-               err = i915_request_await_dma_fence(eb.request, in_fence);
-               if (err < 0)
-                       goto err_request;
-       }
-
-       if (exec_fence) {
-               err = i915_request_await_execution(eb.request, exec_fence,
-                                                  eb.engine->bond_execute);
+               if (args->flags & I915_EXEC_FENCE_SUBMIT)
+                       err = i915_request_await_execution(eb.request,
+                                                          in_fence,
+                                                          eb.engine->bond_execute);
+               else
+                       err = i915_request_await_dma_fence(eb.request,
+                                                          in_fence);
                if (err < 0)
                        goto err_request;
        }
@@ -2563,7 +2789,7 @@ i915_gem_do_execbuffer(struct drm_device *dev,
         */
        eb.request->batch = batch;
        if (batch->private)
-               intel_engine_pool_mark_active(batch->private, eb.request);
+               intel_gt_buffer_pool_mark_active(batch->private, eb.request);
 
        trace_i915_request_queue(eb.request, eb.batch_flags);
        err = eb_submit(&eb, batch);
@@ -2592,10 +2818,8 @@ err_batch_unpin:
                i915_vma_unpin(batch);
 err_parse:
        if (batch->private)
-               intel_engine_pool_put(batch->private);
+               intel_gt_buffer_pool_put(batch->private);
 err_vma:
-       if (eb.exec)
-               eb_release_vmas(&eb);
        if (eb.trampoline)
                i915_vma_unpin(eb.trampoline);
        eb_unpin_engine(&eb);
@@ -2606,8 +2830,6 @@ err_destroy:
 err_out_fence:
        if (out_fence_fd != -1)
                put_unused_fd(out_fence_fd);
-err_exec_fence:
-       dma_fence_put(exec_fence);
 err_in_fence:
        dma_fence_put(in_fence);
        return err;
@@ -2615,7 +2837,7 @@ err_in_fence:
 
 static size_t eb_element_size(void)
 {
-       return sizeof(struct drm_i915_gem_exec_object2) + sizeof(struct eb_vma);
+       return sizeof(struct drm_i915_gem_exec_object2);
 }
 
 static bool check_buffer_count(size_t count)
@@ -2671,7 +2893,7 @@ i915_gem_execbuffer_ioctl(struct drm_device *dev, void *data,
        /* Copy in the exec list from userland */
        exec_list = kvmalloc_array(count, sizeof(*exec_list),
                                   __GFP_NOWARN | GFP_KERNEL);
-       exec2_list = kvmalloc_array(count + 1, eb_element_size(),
+       exec2_list = kvmalloc_array(count, eb_element_size(),
                                    __GFP_NOWARN | GFP_KERNEL);
        if (exec_list == NULL || exec2_list == NULL) {
                drm_dbg(&i915->drm,
@@ -2749,8 +2971,7 @@ i915_gem_execbuffer2_ioctl(struct drm_device *dev, void *data,
        if (err)
                return err;
 
-       /* Allocate an extra slot for use by the command parser */
-       exec2_list = kvmalloc_array(count + 1, eb_element_size(),
+       exec2_list = kvmalloc_array(count, eb_element_size(),
                                    __GFP_NOWARN | GFP_KERNEL);
        if (exec2_list == NULL) {
                drm_dbg(&i915->drm, "Failed to allocate exec list for %zd buffers\n",
@@ -2818,3 +3039,7 @@ end:;
        kvfree(exec2_list);
        return err;
 }
+
+#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
+#include "selftests/i915_gem_execbuffer.c"
+#endif