drm/xe: Ban a VM if rebind worker hits an error
authorMatthew Brost <matthew.brost@intel.com>
Thu, 22 Jun 2023 19:39:48 +0000 (12:39 -0700)
committerRodrigo Vivi <rodrigo.vivi@intel.com>
Thu, 21 Dec 2023 16:35:18 +0000 (11:35 -0500)
We cannot recover a VM if a rebind worker hits an error, so ban the VM
if this happens to ensure we do not attempt to place this VM on the
hardware again.

A follow up will inform the user if this happens.

v2: Return -ECANCELED in exec if the VM is closed or banned, check for
closed or banned within the VM lock.
v3: Fix lockdep splat by looking up the engine outside of vm->lock
v4: Fix error path when engine lookup fails
v5: Add debug message in rebind worker on error, update comments wrt
locking, add xe_vm_close helper

Reviewed-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
drivers/gpu/drm/xe/xe_engine.c
drivers/gpu/drm/xe/xe_exec.c
drivers/gpu/drm/xe/xe_trace.h
drivers/gpu/drm/xe/xe_vm.c
drivers/gpu/drm/xe/xe_vm.h
drivers/gpu/drm/xe/xe_vm_madvise.c
drivers/gpu/drm/xe/xe_vm_types.h

index f1b8b22..af75c9a 100644 (file)
@@ -597,10 +597,23 @@ int xe_engine_create_ioctl(struct drm_device *dev, void *data,
                if (XE_IOCTL_ERR(xe, !vm))
                        return -ENOENT;
 
+               err = down_read_interruptible(&vm->lock);
+               if (err) {
+                       xe_vm_put(vm);
+                       return err;
+               }
+
+               if (XE_IOCTL_ERR(xe, xe_vm_is_closed_or_banned(vm))) {
+                       up_read(&vm->lock);
+                       xe_vm_put(vm);
+                       return -ENOENT;
+               }
+
                e = xe_engine_create(xe, vm, logical_mask,
                                     args->width, hwe,
                                     xe_vm_no_dma_fences(vm) ? 0 :
                                     ENGINE_FLAG_PERSISTENT);
+               up_read(&vm->lock);
                xe_vm_put(vm);
                if (IS_ERR(e))
                        return PTR_ERR(e);
index c52edff..bdf00e5 100644 (file)
@@ -297,9 +297,9 @@ retry:
        if (err)
                goto err_unlock_list;
 
-       if (xe_vm_is_closed(engine->vm)) {
-               drm_warn(&xe->drm, "Trying to schedule after vm is closed\n");
-               err = -EIO;
+       if (xe_vm_is_closed_or_banned(engine->vm)) {
+               drm_warn(&xe->drm, "Trying to schedule after vm is closed or banned\n");
+               err = -ECANCELED;
                goto err_engine_end;
        }
 
index 8a5d35f..d589457 100644 (file)
@@ -478,6 +478,11 @@ DECLARE_EVENT_CLASS(xe_vm,
                              __entry->asid)
 );
 
+DEFINE_EVENT(xe_vm, xe_vm_kill,
+            TP_PROTO(struct xe_vm *vm),
+            TP_ARGS(vm)
+);
+
 DEFINE_EVENT(xe_vm, xe_vm_create,
             TP_PROTO(struct xe_vm *vm),
             TP_ARGS(vm)
index a9cf62f..47e3d37 100644 (file)
@@ -514,6 +514,24 @@ void xe_vm_unlock_dma_resv(struct xe_vm *vm,
 
 #define XE_VM_REBIND_RETRY_TIMEOUT_MS 1000
 
+static void xe_vm_kill(struct xe_vm *vm)
+{
+       struct ww_acquire_ctx ww;
+       struct xe_engine *e;
+
+       lockdep_assert_held(&vm->lock);
+
+       xe_vm_lock(vm, &ww, 0, false);
+       vm->flags |= XE_VM_FLAG_BANNED;
+       trace_xe_vm_kill(vm);
+
+       list_for_each_entry(e, &vm->preempt.engines, compute.link)
+               e->ops->kill(e);
+       xe_vm_unlock(vm, &ww);
+
+       /* TODO: Inform user the VM is banned */
+}
+
 static void preempt_rebind_work_func(struct work_struct *w)
 {
        struct xe_vm *vm = container_of(w, struct xe_vm, preempt.rebind_work);
@@ -533,13 +551,14 @@ static void preempt_rebind_work_func(struct work_struct *w)
        XE_BUG_ON(!xe_vm_in_compute_mode(vm));
        trace_xe_vm_rebind_worker_enter(vm);
 
-       if (xe_vm_is_closed(vm)) {
+       down_write(&vm->lock);
+
+       if (xe_vm_is_closed_or_banned(vm)) {
+               up_write(&vm->lock);
                trace_xe_vm_rebind_worker_exit(vm);
                return;
        }
 
-       down_write(&vm->lock);
-
 retry:
        if (vm->async_ops.error)
                goto out_unlock_outer;
@@ -666,11 +685,14 @@ out_unlock_outer:
                        goto retry;
                }
        }
+       if (err) {
+               drm_warn(&vm->xe->drm, "VM worker error: %d\n", err);
+               xe_vm_kill(vm);
+       }
        up_write(&vm->lock);
 
        free_preempt_fences(&preempt_fences);
 
-       XE_WARN_ON(err < 0);    /* TODO: Kill VM or put in error state */
        trace_xe_vm_rebind_worker_exit(vm);
 }
 
@@ -1140,11 +1162,12 @@ xe_vm_find_overlapping_vma(struct xe_vm *vm, const struct xe_vma *vma)
 {
        struct rb_node *node;
 
-       if (xe_vm_is_closed(vm))
+       lockdep_assert_held(&vm->lock);
+
+       if (xe_vm_is_closed_or_banned(vm))
                return NULL;
 
        XE_BUG_ON(vma->end >= vm->size);
-       lockdep_assert_held(&vm->lock);
 
        node = rb_find(vma, &vm->vmas, xe_vma_cmp_vma_cb);
 
@@ -1377,6 +1400,13 @@ mm_closed:
        wake_up_all(&vm->async_ops.error_capture.wq);
 }
 
+static void xe_vm_close(struct xe_vm *vm)
+{
+       down_write(&vm->lock);
+       vm->size = 0;
+       up_write(&vm->lock);
+}
+
 void xe_vm_close_and_put(struct xe_vm *vm)
 {
        struct rb_root contested = RB_ROOT;
@@ -1387,8 +1417,8 @@ void xe_vm_close_and_put(struct xe_vm *vm)
 
        XE_BUG_ON(vm->preempt.num_engines);
 
-       vm->size = 0;
-       smp_mb();
+       xe_vm_close(vm);
+
        flush_async_ops(vm);
        if (xe_vm_in_compute_mode(vm))
                flush_work(&vm->preempt.rebind_work);
@@ -3072,30 +3102,34 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
        if (err)
                return err;
 
-       vm = xe_vm_lookup(xef, args->vm_id);
-       if (XE_IOCTL_ERR(xe, !vm)) {
-               err = -EINVAL;
-               goto free_objs;
-       }
-
-       if (XE_IOCTL_ERR(xe, xe_vm_is_closed(vm))) {
-               drm_err(dev, "VM closed while we began looking up?\n");
-               err = -ENOENT;
-               goto put_vm;
-       }
-
        if (args->engine_id) {
                e = xe_engine_lookup(xef, args->engine_id);
                if (XE_IOCTL_ERR(xe, !e)) {
                        err = -ENOENT;
-                       goto put_vm;
+                       goto free_objs;
                }
+
                if (XE_IOCTL_ERR(xe, !(e->flags & ENGINE_FLAG_VM))) {
                        err = -EINVAL;
                        goto put_engine;
                }
        }
 
+       vm = xe_vm_lookup(xef, args->vm_id);
+       if (XE_IOCTL_ERR(xe, !vm)) {
+               err = -EINVAL;
+               goto put_engine;
+       }
+
+       err = down_write_killable(&vm->lock);
+       if (err)
+               goto put_vm;
+
+       if (XE_IOCTL_ERR(xe, xe_vm_is_closed_or_banned(vm))) {
+               err = -ENOENT;
+               goto release_vm_lock;
+       }
+
        if (VM_BIND_OP(bind_ops[0].op) == XE_VM_BIND_OP_RESTART) {
                if (XE_IOCTL_ERR(xe, !(vm->flags & XE_VM_FLAG_ASYNC_BIND_OPS)))
                        err = -EOPNOTSUPP;
@@ -3105,10 +3139,8 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
                        err = -EPROTO;
 
                if (!err) {
-                       down_write(&vm->lock);
                        trace_xe_vm_restart(vm);
                        vm_set_async_error(vm, 0);
-                       up_write(&vm->lock);
 
                        queue_work(system_unbound_wq, &vm->async_ops.work);
 
@@ -3117,13 +3149,13 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
                                xe_vm_queue_rebind_worker(vm);
                }
 
-               goto put_engine;
+               goto release_vm_lock;
        }
 
        if (XE_IOCTL_ERR(xe, !vm->async_ops.error &&
                         async != !!(vm->flags & XE_VM_FLAG_ASYNC_BIND_OPS))) {
                err = -EOPNOTSUPP;
-               goto put_engine;
+               goto release_vm_lock;
        }
 
        for (i = 0; i < args->num_binds; ++i) {
@@ -3133,7 +3165,7 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
                if (XE_IOCTL_ERR(xe, range > vm->size) ||
                    XE_IOCTL_ERR(xe, addr > vm->size - range)) {
                        err = -EINVAL;
-                       goto put_engine;
+                       goto release_vm_lock;
                }
 
                if (bind_ops[i].tile_mask) {
@@ -3142,7 +3174,7 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
                        if (XE_IOCTL_ERR(xe, bind_ops[i].tile_mask &
                                         ~valid_tiles)) {
                                err = -EINVAL;
-                               goto put_engine;
+                               goto release_vm_lock;
                        }
                }
        }
@@ -3150,13 +3182,13 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
        bos = kzalloc(sizeof(*bos) * args->num_binds, GFP_KERNEL);
        if (!bos) {
                err = -ENOMEM;
-               goto put_engine;
+               goto release_vm_lock;
        }
 
        vmas = kzalloc(sizeof(*vmas) * args->num_binds, GFP_KERNEL);
        if (!vmas) {
                err = -ENOMEM;
-               goto put_engine;
+               goto release_vm_lock;
        }
 
        for (i = 0; i < args->num_binds; ++i) {
@@ -3211,10 +3243,6 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
                        goto free_syncs;
        }
 
-       err = down_write_killable(&vm->lock);
-       if (err)
-               goto free_syncs;
-
        /* Do some error checking first to make the unwind easier */
        for (i = 0; i < args->num_binds; ++i) {
                u64 range = bind_ops[i].range;
@@ -3223,7 +3251,7 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 
                err = __vm_bind_ioctl_lookup_vma(vm, bos[i], addr, range, op);
                if (err)
-                       goto release_vm_lock;
+                       goto free_syncs;
        }
 
        for (i = 0; i < args->num_binds; ++i) {
@@ -3343,8 +3371,6 @@ destroy_vmas:
                        break;
                }
        }
-release_vm_lock:
-       up_write(&vm->lock);
 free_syncs:
        while (num_syncs--) {
                if (async && j &&
@@ -3357,11 +3383,13 @@ free_syncs:
 put_obj:
        for (i = j; i < args->num_binds; ++i)
                xe_bo_put(bos[i]);
+release_vm_lock:
+       up_write(&vm->lock);
+put_vm:
+       xe_vm_put(vm);
 put_engine:
        if (e)
                xe_engine_put(e);
-put_vm:
-       xe_vm_put(vm);
 free_objs:
        kfree(bos);
        kfree(vmas);
index 5edb777..02b409d 100644 (file)
@@ -45,10 +45,21 @@ void xe_vm_unlock(struct xe_vm *vm, struct ww_acquire_ctx *ww);
 
 static inline bool xe_vm_is_closed(struct xe_vm *vm)
 {
-       /* Only guaranteed not to change when vm->resv is held */
+       /* Only guaranteed not to change when vm->lock is held */
        return !vm->size;
 }
 
+static inline bool xe_vm_is_banned(struct xe_vm *vm)
+{
+       return vm->flags & XE_VM_FLAG_BANNED;
+}
+
+static inline bool xe_vm_is_closed_or_banned(struct xe_vm *vm)
+{
+       lockdep_assert_held(&vm->lock);
+       return xe_vm_is_closed(vm) || xe_vm_is_banned(vm);
+}
+
 struct xe_vma *
 xe_vm_find_overlapping_vma(struct xe_vm *vm, const struct xe_vma *vma);
 
index 6c19643..670c80c 100644 (file)
@@ -313,7 +313,7 @@ int xe_vm_madvise_ioctl(struct drm_device *dev, void *data,
        if (XE_IOCTL_ERR(xe, !vm))
                return -EINVAL;
 
-       if (XE_IOCTL_ERR(xe, xe_vm_is_closed(vm))) {
+       if (XE_IOCTL_ERR(xe, xe_vm_is_closed_or_banned(vm))) {
                err = -ENOENT;
                goto put_vm;
        }
index c148dd4..3c88521 100644 (file)
@@ -176,15 +176,19 @@ struct xe_vm {
        struct xe_bo *scratch_bo[XE_MAX_TILES_PER_DEVICE];
        struct xe_pt *scratch_pt[XE_MAX_TILES_PER_DEVICE][XE_VM_MAX_LEVEL];
 
-       /** @flags: flags for this VM, statically setup a creation time */
+       /**
+        * @flags: flags for this VM, statically setup a creation time aside
+        * from XE_VM_FLAG_BANNED which requires vm->lock to set / read safely
+        */
 #define XE_VM_FLAGS_64K                        BIT(0)
 #define XE_VM_FLAG_COMPUTE_MODE                BIT(1)
 #define XE_VM_FLAG_ASYNC_BIND_OPS      BIT(2)
 #define XE_VM_FLAG_MIGRATION           BIT(3)
 #define XE_VM_FLAG_SCRATCH_PAGE                BIT(4)
 #define XE_VM_FLAG_FAULT_MODE          BIT(5)
-#define XE_VM_FLAG_GT_ID(flags)                (((flags) >> 6) & 0x3)
-#define XE_VM_FLAG_SET_TILE_ID(tile)   ((tile)->id << 6)
+#define XE_VM_FLAG_BANNED              BIT(6)
+#define XE_VM_FLAG_GT_ID(flags)                (((flags) >> 7) & 0x3)
+#define XE_VM_FLAG_SET_TILE_ID(tile)   ((tile)->id << 7)
        unsigned long flags;
 
        /** @composite_fence_ctx: context composite fence */