drm/xe: Fix BUG_ON during bind with prefetch
drivers/gpu/drm/xe/xe_vm.c
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2021 Intel Corporation
4  */
5
6 #include "xe_vm.h"
7
8 #include <linux/dma-fence-array.h>
9
10 #include <drm/drm_print.h>
11 #include <drm/ttm/ttm_execbuf_util.h>
12 #include <drm/ttm/ttm_tt.h>
13 #include <drm/xe_drm.h>
14 #include <linux/delay.h>
15 #include <linux/kthread.h>
16 #include <linux/mm.h>
17 #include <linux/swap.h>
18
19 #include "xe_bo.h"
20 #include "xe_device.h"
21 #include "xe_engine.h"
22 #include "xe_gt.h"
23 #include "xe_gt_pagefault.h"
24 #include "xe_gt_tlb_invalidation.h"
25 #include "xe_migrate.h"
26 #include "xe_pm.h"
27 #include "xe_preempt_fence.h"
28 #include "xe_pt.h"
29 #include "xe_res_cursor.h"
30 #include "xe_sync.h"
31 #include "xe_trace.h"
32
33 #define TEST_VM_ASYNC_OPS_ERROR
34
35 /**
36  * xe_vma_userptr_check_repin() - Advisory check for repin needed
37  * @vma: The userptr vma
38  *
39  * Check if the userptr vma has been invalidated since last successful
40  * repin. The check is advisory only and the function can be called
41  * without the vm->userptr.notifier_lock held. There is no guarantee that the
42  * vma userptr will remain valid after a lockless check, so typically
43  * the call needs to be followed by a proper check under the notifier_lock.
44  *
45  * Return: 0 if userptr vma is valid, -EAGAIN otherwise; repin recommended.
46  */
47 int xe_vma_userptr_check_repin(struct xe_vma *vma)
48 {
49         return mmu_interval_check_retry(&vma->userptr.notifier,
50                                         vma->userptr.notifier_seq) ?
51                 -EAGAIN : 0;
52 }
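
/*
 * Usage sketch (illustrative, assuming a caller that holds vm->lock, as the
 * pin function below requires): pair the advisory check with a repin of the
 * pages and an authoritative re-check under the notifier lock, roughly as the
 * callers elsewhere in this file do:
 *
 *	if (xe_vma_userptr_check_repin(vma)) {
 *		err = xe_vma_userptr_pin_pages(vma);
 *		if (err)
 *			return err;
 *	}
 *	down_read(&vm->userptr.notifier_lock);
 *	err = __xe_vm_userptr_needs_repin(vm);
 *	up_read(&vm->userptr.notifier_lock);
 */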
53
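
/*
 * Look up the current pages backing a userptr VMA with get_user_pages_fast(),
 * build and DMA-map an sg table for them and record the notifier sequence
 * number, retrying if the range is invalidated in the meantime. Called with
 * vm->lock held.
 */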
54 int xe_vma_userptr_pin_pages(struct xe_vma *vma)
55 {
56         struct xe_vm *vm = xe_vma_vm(vma);
57         struct xe_device *xe = vm->xe;
58         const unsigned long num_pages = xe_vma_size(vma) >> PAGE_SHIFT;
59         struct page **pages;
60         bool in_kthread = !current->mm;
61         unsigned long notifier_seq;
62         int pinned, ret, i;
63         bool read_only = xe_vma_read_only(vma);
64
65         lockdep_assert_held(&vm->lock);
66         XE_BUG_ON(!xe_vma_is_userptr(vma));
67 retry:
68         if (vma->gpuva.flags & XE_VMA_DESTROYED)
69                 return 0;
70
71         notifier_seq = mmu_interval_read_begin(&vma->userptr.notifier);
72         if (notifier_seq == vma->userptr.notifier_seq)
73                 return 0;
74
75         pages = kvmalloc_array(num_pages, sizeof(*pages), GFP_KERNEL);
76         if (!pages)
77                 return -ENOMEM;
78
79         if (vma->userptr.sg) {
80                 dma_unmap_sgtable(xe->drm.dev,
81                                   vma->userptr.sg,
82                                   read_only ? DMA_TO_DEVICE :
83                                   DMA_BIDIRECTIONAL, 0);
84                 sg_free_table(vma->userptr.sg);
85                 vma->userptr.sg = NULL;
86         }
87
88         pinned = ret = 0;
89         if (in_kthread) {
90                 if (!mmget_not_zero(vma->userptr.notifier.mm)) {
91                         ret = -EFAULT;
92                         goto mm_closed;
93                 }
94                 kthread_use_mm(vma->userptr.notifier.mm);
95         }
96
97         while (pinned < num_pages) {
98                 ret = get_user_pages_fast(xe_vma_userptr(vma) +
99                                           pinned * PAGE_SIZE,
100                                           num_pages - pinned,
101                                           read_only ? 0 : FOLL_WRITE,
102                                           &pages[pinned]);
103                 if (ret < 0) {
104                         if (in_kthread)
105                                 ret = 0;
106                         break;
107                 }
108
109                 pinned += ret;
110                 ret = 0;
111         }
112
113         if (in_kthread) {
114                 kthread_unuse_mm(vma->userptr.notifier.mm);
115                 mmput(vma->userptr.notifier.mm);
116         }
117 mm_closed:
118         if (ret)
119                 goto out;
120
121         ret = sg_alloc_table_from_pages_segment(&vma->userptr.sgt, pages,
122                                                 pinned, 0,
123                                                 (u64)pinned << PAGE_SHIFT,
124                                                 xe_sg_segment_size(xe->drm.dev),
125                                                 GFP_KERNEL);
126         if (ret) {
127                 vma->userptr.sg = NULL;
128                 goto out;
129         }
130         vma->userptr.sg = &vma->userptr.sgt;
131
132         ret = dma_map_sgtable(xe->drm.dev, vma->userptr.sg,
133                               read_only ? DMA_TO_DEVICE :
134                               DMA_BIDIRECTIONAL,
135                               DMA_ATTR_SKIP_CPU_SYNC |
136                               DMA_ATTR_NO_KERNEL_MAPPING);
137         if (ret) {
138                 sg_free_table(vma->userptr.sg);
139                 vma->userptr.sg = NULL;
140                 goto out;
141         }
142
143         for (i = 0; i < pinned; ++i) {
144                 if (!read_only) {
145                         lock_page(pages[i]);
146                         set_page_dirty(pages[i]);
147                         unlock_page(pages[i]);
148                 }
149
150                 mark_page_accessed(pages[i]);
151         }
152
153 out:
154         release_pages(pages, pinned);
155         kvfree(pages);
156
157         if (!(ret < 0)) {
158                 vma->userptr.notifier_seq = notifier_seq;
159                 if (xe_vma_userptr_check_repin(vma) == -EAGAIN)
160                         goto retry;
161         }
162
163         return ret < 0 ? ret : 0;
164 }
165
166 static bool preempt_fences_waiting(struct xe_vm *vm)
167 {
168         struct xe_engine *e;
169
170         lockdep_assert_held(&vm->lock);
171         xe_vm_assert_held(vm);
172
173         list_for_each_entry(e, &vm->preempt.engines, compute.link) {
174                 if (!e->compute.pfence || (e->compute.pfence &&
175                     test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
176                              &e->compute.pfence->flags))) {
177                         return true;
178                 }
179         }
180
181         return false;
182 }
183
184 static void free_preempt_fences(struct list_head *list)
185 {
186         struct list_head *link, *next;
187
188         list_for_each_safe(link, next, list)
189                 xe_preempt_fence_free(to_preempt_fence_from_link(link));
190 }
191
192 static int alloc_preempt_fences(struct xe_vm *vm, struct list_head *list,
193                                 unsigned int *count)
194 {
195         lockdep_assert_held(&vm->lock);
196         xe_vm_assert_held(vm);
197
198         if (*count >= vm->preempt.num_engines)
199                 return 0;
200
201         for (; *count < vm->preempt.num_engines; ++(*count)) {
202                 struct xe_preempt_fence *pfence = xe_preempt_fence_alloc();
203
204                 if (IS_ERR(pfence))
205                         return PTR_ERR(pfence);
206
207                 list_move_tail(xe_preempt_fence_link(pfence), list);
208         }
209
210         return 0;
211 }
212
213 static int wait_for_existing_preempt_fences(struct xe_vm *vm)
214 {
215         struct xe_engine *e;
216
217         xe_vm_assert_held(vm);
218
219         list_for_each_entry(e, &vm->preempt.engines, compute.link) {
220                 if (e->compute.pfence) {
221                         long timeout = dma_fence_wait(e->compute.pfence, false);
222
223                         if (timeout < 0)
224                                 return -ETIME;
225                         dma_fence_put(e->compute.pfence);
226                         e->compute.pfence = NULL;
227                 }
228         }
229
230         return 0;
231 }
232
233 static bool xe_vm_is_idle(struct xe_vm *vm)
234 {
235         struct xe_engine *e;
236
237         xe_vm_assert_held(vm);
238         list_for_each_entry(e, &vm->preempt.engines, compute.link) {
239                 if (!xe_engine_is_idle(e))
240                         return false;
241         }
242
243         return true;
244 }
245
246 static void arm_preempt_fences(struct xe_vm *vm, struct list_head *list)
247 {
248         struct list_head *link;
249         struct xe_engine *e;
250
251         list_for_each_entry(e, &vm->preempt.engines, compute.link) {
252                 struct dma_fence *fence;
253
254                 link = list->next;
255                 XE_BUG_ON(link == list);
256
257                 fence = xe_preempt_fence_arm(to_preempt_fence_from_link(link),
258                                              e, e->compute.context,
259                                              ++e->compute.seqno);
260                 dma_fence_put(e->compute.pfence);
261                 e->compute.pfence = fence;
262         }
263 }
264
265 static int add_preempt_fences(struct xe_vm *vm, struct xe_bo *bo)
266 {
267         struct xe_engine *e;
268         struct ww_acquire_ctx ww;
269         int err;
270
271         err = xe_bo_lock(bo, &ww, vm->preempt.num_engines, true);
272         if (err)
273                 return err;
274
275         list_for_each_entry(e, &vm->preempt.engines, compute.link)
276                 if (e->compute.pfence) {
277                         dma_resv_add_fence(bo->ttm.base.resv,
278                                            e->compute.pfence,
279                                            DMA_RESV_USAGE_BOOKKEEP);
280                 }
281
282         xe_bo_unlock(bo, &ww);
283         return 0;
284 }
285
286 /**
287  * xe_vm_fence_all_extobjs() - Add a fence to vm's external objects' resv
288  * @vm: The vm.
289  * @fence: The fence to add.
290  * @usage: The resv usage for the fence.
291  *
292  * Loops over all of the vm's external object bindings and adds a @fence
293  * with the given @usage to all of the external objects' reservation
294  * objects.
295  */
296 void xe_vm_fence_all_extobjs(struct xe_vm *vm, struct dma_fence *fence,
297                              enum dma_resv_usage usage)
298 {
299         struct xe_vma *vma;
300
301         list_for_each_entry(vma, &vm->extobj.list, extobj.link)
302                 dma_resv_add_fence(xe_vma_bo(vma)->ttm.base.resv, fence, usage);
303 }
304
305 static void resume_and_reinstall_preempt_fences(struct xe_vm *vm)
306 {
307         struct xe_engine *e;
308
309         lockdep_assert_held(&vm->lock);
310         xe_vm_assert_held(vm);
311
312         list_for_each_entry(e, &vm->preempt.engines, compute.link) {
313                 e->ops->resume(e);
314
315                 dma_resv_add_fence(xe_vm_resv(vm), e->compute.pfence,
316                                    DMA_RESV_USAGE_BOOKKEEP);
317                 xe_vm_fence_all_extobjs(vm, e->compute.pfence,
318                                         DMA_RESV_USAGE_BOOKKEEP);
319         }
320 }
321
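
/*
 * Add an engine to a compute-mode VM: create its preempt fence and install it
 * in the VM's and all external objects' reservation objects. If a preemption
 * or userptr invalidation is already in flight, signaling is enabled on the
 * new fence immediately so it syncs with the other preempt fences.
 */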
322 int xe_vm_add_compute_engine(struct xe_vm *vm, struct xe_engine *e)
323 {
324         struct ttm_validate_buffer tv_onstack[XE_ONSTACK_TV];
325         struct ttm_validate_buffer *tv;
326         struct ww_acquire_ctx ww;
327         struct list_head objs;
328         struct dma_fence *pfence;
329         int err;
330         bool wait;
331
332         XE_BUG_ON(!xe_vm_in_compute_mode(vm));
333
334         down_write(&vm->lock);
335
336         err = xe_vm_lock_dma_resv(vm, &ww, tv_onstack, &tv, &objs, true, 1);
337         if (err)
338                 goto out_unlock_outer;
339
340         pfence = xe_preempt_fence_create(e, e->compute.context,
341                                          ++e->compute.seqno);
342         if (!pfence) {
343                 err = -ENOMEM;
344                 goto out_unlock;
345         }
346
347         list_add(&e->compute.link, &vm->preempt.engines);
348         ++vm->preempt.num_engines;
349         e->compute.pfence = pfence;
350
351         down_read(&vm->userptr.notifier_lock);
352
353         dma_resv_add_fence(xe_vm_resv(vm), pfence,
354                            DMA_RESV_USAGE_BOOKKEEP);
355
356         xe_vm_fence_all_extobjs(vm, pfence, DMA_RESV_USAGE_BOOKKEEP);
357
358         /*
359          * Check to see if a preemption on the VM or a userptr invalidation is
360          * in flight; if so, trigger this preempt fence to sync state with the
361          * other preempt fences on the VM.
362          */
363         wait = __xe_vm_userptr_needs_repin(vm) || preempt_fences_waiting(vm);
364         if (wait)
365                 dma_fence_enable_sw_signaling(pfence);
366
367         up_read(&vm->userptr.notifier_lock);
368
369 out_unlock:
370         xe_vm_unlock_dma_resv(vm, tv_onstack, tv, &ww, &objs);
371 out_unlock_outer:
372         up_write(&vm->lock);
373
374         return err;
375 }
376
377 /**
378  * __xe_vm_userptr_needs_repin() - Check whether the VM does have userptrs
379  * that need repinning.
380  * @vm: The VM.
381  *
382  * This function checks for whether the VM has userptrs that need repinning,
383  * and provides a release-type barrier on the userptr.notifier_lock after
384  * checking.
385  *
386  * Return: 0 if there are no userptrs needing repinning, -EAGAIN if there are.
387  */
388 int __xe_vm_userptr_needs_repin(struct xe_vm *vm)
389 {
390         lockdep_assert_held_read(&vm->userptr.notifier_lock);
391
392         return (list_empty(&vm->userptr.repin_list) &&
393                 list_empty(&vm->userptr.invalidated)) ? 0 : -EAGAIN;
394 }
395
396 /**
397  * xe_vm_lock_dma_resv() - Lock the vm dma_resv object and the dma_resv
398  * objects of the vm's external buffer objects.
399  * @vm: The vm.
400  * @ww: Pointer to a struct ww_acquire_ctx locking context.
401  * @tv_onstack: Array size XE_ONSTACK_TV of storage for the struct
402  * ttm_validate_buffers used for locking.
403  * @tv: Pointer to a pointer that on output contains the actual storage used.
404  * @objs: List head for the buffer objects locked.
405  * @intr: Whether to lock interruptible.
406  * @num_shared: Number of dma-fence slots to reserve in the locked objects.
407  *
408  * Locks the vm dma-resv objects and all the dma-resv objects of the
409  * buffer objects on the vm external object list. The TTM utilities require
410  * a list of struct ttm_validate_buffers pointing to the actual buffer
411  * objects to lock. Storage for those struct ttm_validate_buffers should
412  * be provided in @tv_onstack, and is typically reserved on the stack
413  * of the caller. If the size of @tv_onstack isn't sufficient, then
414  * storage will be allocated internally using kvmalloc().
415  *
416  * The function performs deadlock handling internally, and after a
417  * successful return the ww locking transaction should be considered
418  * sealed.
419  *
420  * Return: 0 on success, Negative error code on error. In particular if
421  * @intr is set to true, -EINTR or -ERESTARTSYS may be returned. In case
422  * of error, any locking performed has been reverted.
423  */
424 int xe_vm_lock_dma_resv(struct xe_vm *vm, struct ww_acquire_ctx *ww,
425                         struct ttm_validate_buffer *tv_onstack,
426                         struct ttm_validate_buffer **tv,
427                         struct list_head *objs,
428                         bool intr,
429                         unsigned int num_shared)
430 {
431         struct ttm_validate_buffer *tv_vm, *tv_bo;
432         struct xe_vma *vma, *next;
433         LIST_HEAD(dups);
434         int err;
435
436         lockdep_assert_held(&vm->lock);
437
438         if (vm->extobj.entries < XE_ONSTACK_TV) {
439                 tv_vm = tv_onstack;
440         } else {
441                 tv_vm = kvmalloc_array(vm->extobj.entries + 1, sizeof(*tv_vm),
442                                        GFP_KERNEL);
443                 if (!tv_vm)
444                         return -ENOMEM;
445         }
446         tv_bo = tv_vm + 1;
447
448         INIT_LIST_HEAD(objs);
449         list_for_each_entry(vma, &vm->extobj.list, extobj.link) {
450                 tv_bo->num_shared = num_shared;
451                 tv_bo->bo = &xe_vma_bo(vma)->ttm;
452
453                 list_add_tail(&tv_bo->head, objs);
454                 tv_bo++;
455         }
456         tv_vm->num_shared = num_shared;
457         tv_vm->bo = xe_vm_ttm_bo(vm);
458         list_add_tail(&tv_vm->head, objs);
459         err = ttm_eu_reserve_buffers(ww, objs, intr, &dups);
460         if (err)
461                 goto out_err;
462
463         spin_lock(&vm->notifier.list_lock);
464         list_for_each_entry_safe(vma, next, &vm->notifier.rebind_list,
465                                  notifier.rebind_link) {
466                 xe_bo_assert_held(xe_vma_bo(vma));
467
468                 list_del_init(&vma->notifier.rebind_link);
469                 if (vma->tile_present && !(vma->gpuva.flags & XE_VMA_DESTROYED))
470                         list_move_tail(&vma->rebind_link, &vm->rebind_list);
471         }
472         spin_unlock(&vm->notifier.list_lock);
473
474         *tv = tv_vm;
475         return 0;
476
477 out_err:
478         if (tv_vm != tv_onstack)
479                 kvfree(tv_vm);
480
481         return err;
482 }
483
484 /**
485  * xe_vm_unlock_dma_resv() - Unlock reservation objects locked by
486  * xe_vm_lock_dma_resv()
487  * @vm: The vm.
488  * @tv_onstack: The @tv_onstack array given to xe_vm_lock_dma_resv().
489  * @tv: The value of *@tv given by xe_vm_lock_dma_resv().
490  * @ww: The ww_acquire_context used for locking.
491  * @objs: The list returned from xe_vm_lock_dma_resv().
492  *
493  * Unlocks the reservation objects and frees any memory allocated by
494  * xe_vm_lock_dma_resv().
495  */
496 void xe_vm_unlock_dma_resv(struct xe_vm *vm,
497                            struct ttm_validate_buffer *tv_onstack,
498                            struct ttm_validate_buffer *tv,
499                            struct ww_acquire_ctx *ww,
500                            struct list_head *objs)
501 {
502         /*
503          * Nothing should've been able to enter the list while we were locked,
504          * since we've held the dma-resvs of all the vm's external objects,
505          * and holding the dma_resv of an object is required for list
506          * addition, and we shouldn't add ourselves.
507          */
508         XE_WARN_ON(!list_empty(&vm->notifier.rebind_list));
509
510         ttm_eu_backoff_reservation(ww, objs);
511         if (tv && tv != tv_onstack)
512                 kvfree(tv);
513 }
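
/*
 * Typical usage of the lock/unlock pair above, sketched from the callers in
 * this file (error handling abbreviated):
 *
 *	struct ttm_validate_buffer tv_onstack[XE_ONSTACK_TV], *tv;
 *	struct ww_acquire_ctx ww;
 *	struct list_head objs;
 *
 *	err = xe_vm_lock_dma_resv(vm, &ww, tv_onstack, &tv, &objs, true, 1);
 *	if (err)
 *		return err;
 *	... validate / bind under the reservations ...
 *	xe_vm_unlock_dma_resv(vm, tv_onstack, tv, &ww, &objs);
 */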
514
515 #define XE_VM_REBIND_RETRY_TIMEOUT_MS 1000
516
517 static void xe_vm_kill(struct xe_vm *vm)
518 {
519         struct ww_acquire_ctx ww;
520         struct xe_engine *e;
521
522         lockdep_assert_held(&vm->lock);
523
524         xe_vm_lock(vm, &ww, 0, false);
525         vm->flags |= XE_VM_FLAG_BANNED;
526         trace_xe_vm_kill(vm);
527
528         list_for_each_entry(e, &vm->preempt.engines, compute.link)
529                 e->ops->kill(e);
530         xe_vm_unlock(vm, &ww);
531
532         /* TODO: Inform user the VM is banned */
533 }
534
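
/*
 * Rebind worker for compute-mode VMs: waits for the existing preempt fences,
 * revalidates evicted BOs, rebinds the VMAs on the rebind list, and then
 * re-arms and reinstalls fresh preempt fences before resuming the engines.
 */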
535 static void preempt_rebind_work_func(struct work_struct *w)
536 {
537         struct xe_vm *vm = container_of(w, struct xe_vm, preempt.rebind_work);
538         struct xe_vma *vma;
539         struct ttm_validate_buffer tv_onstack[XE_ONSTACK_TV];
540         struct ttm_validate_buffer *tv;
541         struct ww_acquire_ctx ww;
542         struct list_head objs;
543         struct dma_fence *rebind_fence;
544         unsigned int fence_count = 0;
545         LIST_HEAD(preempt_fences);
546         ktime_t end = 0;
547         int err;
548         long wait;
549         int __maybe_unused tries = 0;
550
551         XE_BUG_ON(!xe_vm_in_compute_mode(vm));
552         trace_xe_vm_rebind_worker_enter(vm);
553
554         down_write(&vm->lock);
555
556         if (xe_vm_is_closed_or_banned(vm)) {
557                 up_write(&vm->lock);
558                 trace_xe_vm_rebind_worker_exit(vm);
559                 return;
560         }
561
562 retry:
563         if (vm->async_ops.error)
564                 goto out_unlock_outer;
565
566         /*
567          * Extreme corner case where we exit a VM error state with a munmap-style
568          * VM unbind in flight which requires a rebind. In this case the rebind
569          * needs to install some fences into the dma-resv slots. The worker to do
570          * this is queued; let that worker make progress by dropping vm->lock and
571          * trying this again.
572          */
573         if (vm->async_ops.munmap_rebind_inflight) {
574                 up_write(&vm->lock);
575                 flush_work(&vm->async_ops.work);
576                 goto retry;
577         }
578
579         if (xe_vm_userptr_check_repin(vm)) {
580                 err = xe_vm_userptr_pin(vm);
581                 if (err)
582                         goto out_unlock_outer;
583         }
584
585         err = xe_vm_lock_dma_resv(vm, &ww, tv_onstack, &tv, &objs,
586                                   false, vm->preempt.num_engines);
587         if (err)
588                 goto out_unlock_outer;
589
590         if (xe_vm_is_idle(vm)) {
591                 vm->preempt.rebind_deactivated = true;
592                 goto out_unlock;
593         }
594
595         /* Fresh preempt fences already installed. Everything is running. */
596         if (!preempt_fences_waiting(vm))
597                 goto out_unlock;
598
599         /*
600          * This makes sure the vm is completely suspended and also balances
601          * xe_engine suspend and resume; we resume *all* vm engines below.
602          */
603         err = wait_for_existing_preempt_fences(vm);
604         if (err)
605                 goto out_unlock;
606
607         err = alloc_preempt_fences(vm, &preempt_fences, &fence_count);
608         if (err)
609                 goto out_unlock;
610
611         list_for_each_entry(vma, &vm->rebind_list, rebind_link) {
612                 if (xe_vma_has_no_bo(vma) ||
613                     vma->gpuva.flags & XE_VMA_DESTROYED)
614                         continue;
615
616                 err = xe_bo_validate(xe_vma_bo(vma), vm, false);
617                 if (err)
618                         goto out_unlock;
619         }
620
621         rebind_fence = xe_vm_rebind(vm, true);
622         if (IS_ERR(rebind_fence)) {
623                 err = PTR_ERR(rebind_fence);
624                 goto out_unlock;
625         }
626
627         if (rebind_fence) {
628                 dma_fence_wait(rebind_fence, false);
629                 dma_fence_put(rebind_fence);
630         }
631
632         /* Wait on munmap style VM unbinds */
633         wait = dma_resv_wait_timeout(xe_vm_resv(vm),
634                                      DMA_RESV_USAGE_KERNEL,
635                                      false, MAX_SCHEDULE_TIMEOUT);
636         if (wait <= 0) {
637                 err = -ETIME;
638                 goto out_unlock;
639         }
640
641 #define retry_required(__tries, __vm) \
642         (IS_ENABLED(CONFIG_DRM_XE_USERPTR_INVAL_INJECT) ? \
643         (!(__tries)++ || __xe_vm_userptr_needs_repin(__vm)) : \
644         __xe_vm_userptr_needs_repin(__vm))
645
646         down_read(&vm->userptr.notifier_lock);
647         if (retry_required(tries, vm)) {
648                 up_read(&vm->userptr.notifier_lock);
649                 err = -EAGAIN;
650                 goto out_unlock;
651         }
652
653 #undef retry_required
654
655         spin_lock(&vm->xe->ttm.lru_lock);
656         ttm_lru_bulk_move_tail(&vm->lru_bulk_move);
657         spin_unlock(&vm->xe->ttm.lru_lock);
658
659         /* Point of no return. */
660         arm_preempt_fences(vm, &preempt_fences);
661         resume_and_reinstall_preempt_fences(vm);
662         up_read(&vm->userptr.notifier_lock);
663
664 out_unlock:
665         xe_vm_unlock_dma_resv(vm, tv_onstack, tv, &ww, &objs);
666 out_unlock_outer:
667         if (err == -EAGAIN) {
668                 trace_xe_vm_rebind_worker_retry(vm);
669                 goto retry;
670         }
671
672         /*
673          * With multiple active VMs, under memory pressure, it is possible that
674          * ttm_bo_validate() runs into -EDEADLK and in such a case returns -ENOMEM.
675          * Until ttm properly handles locking in such scenarios, the best the
676          * driver can do is retry with a timeout. Killing the VM or putting it
677          * in error state after timeout or other error scenarios is still TBD.
678          */
679         if (err == -ENOMEM) {
680                 ktime_t cur = ktime_get();
681
682                 end = end ? : ktime_add_ms(cur, XE_VM_REBIND_RETRY_TIMEOUT_MS);
683                 if (ktime_before(cur, end)) {
684                         msleep(20);
685                         trace_xe_vm_rebind_worker_retry(vm);
686                         goto retry;
687                 }
688         }
689         if (err) {
690                 drm_warn(&vm->xe->drm, "VM worker error: %d\n", err);
691                 xe_vm_kill(vm);
692         }
693         up_write(&vm->lock);
694
695         free_preempt_fences(&preempt_fences);
696
697         trace_xe_vm_rebind_worker_exit(vm);
698 }
699
700 static bool vma_userptr_invalidate(struct mmu_interval_notifier *mni,
701                                    const struct mmu_notifier_range *range,
702                                    unsigned long cur_seq)
703 {
704         struct xe_vma *vma = container_of(mni, struct xe_vma, userptr.notifier);
705         struct xe_vm *vm = xe_vma_vm(vma);
706         struct dma_resv_iter cursor;
707         struct dma_fence *fence;
708         long err;
709
710         XE_BUG_ON(!xe_vma_is_userptr(vma));
711         trace_xe_vma_userptr_invalidate(vma);
712
713         if (!mmu_notifier_range_blockable(range))
714                 return false;
715
716         down_write(&vm->userptr.notifier_lock);
717         mmu_interval_set_seq(mni, cur_seq);
718
719         /* No need to stop gpu access if the userptr is not yet bound. */
720         if (!vma->userptr.initial_bind) {
721                 up_write(&vm->userptr.notifier_lock);
722                 return true;
723         }
724
725         /*
726          * Tell exec and rebind worker they need to repin and rebind this
727          * userptr.
728          */
729         if (!xe_vm_in_fault_mode(vm) &&
730             !(vma->gpuva.flags & XE_VMA_DESTROYED) && vma->tile_present) {
731                 spin_lock(&vm->userptr.invalidated_lock);
732                 list_move_tail(&vma->userptr.invalidate_link,
733                                &vm->userptr.invalidated);
734                 spin_unlock(&vm->userptr.invalidated_lock);
735         }
736
737         up_write(&vm->userptr.notifier_lock);
738
739         /*
740          * Preempt fences turn into schedule disables, pipeline these.
741          * Note that even in fault mode, we need to wait for binds and
742          * unbinds to complete, and those are attached as BOOKKEEP fences
743          * to the vm.
744          */
745         dma_resv_iter_begin(&cursor, xe_vm_resv(vm),
746                             DMA_RESV_USAGE_BOOKKEEP);
747         dma_resv_for_each_fence_unlocked(&cursor, fence)
748                 dma_fence_enable_sw_signaling(fence);
749         dma_resv_iter_end(&cursor);
750
751         err = dma_resv_wait_timeout(xe_vm_resv(vm),
752                                     DMA_RESV_USAGE_BOOKKEEP,
753                                     false, MAX_SCHEDULE_TIMEOUT);
754         XE_WARN_ON(err <= 0);
755
756         if (xe_vm_in_fault_mode(vm)) {
757                 err = xe_vm_invalidate_vma(vma);
758                 XE_WARN_ON(err);
759         }
760
761         trace_xe_vma_userptr_invalidate_complete(vma);
762
763         return true;
764 }
765
766 static const struct mmu_interval_notifier_ops vma_userptr_notifier_ops = {
767         .invalidate = vma_userptr_invalidate,
768 };
769
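
/*
 * Repin all invalidated userptr VMAs on a VM and move them to the rebind
 * list. Called with vm->lock held in write mode.
 */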
770 int xe_vm_userptr_pin(struct xe_vm *vm)
771 {
772         struct xe_vma *vma, *next;
773         int err = 0;
774         LIST_HEAD(tmp_evict);
775
776         lockdep_assert_held_write(&vm->lock);
777
778         /* Collect invalidated userptrs */
779         spin_lock(&vm->userptr.invalidated_lock);
780         list_for_each_entry_safe(vma, next, &vm->userptr.invalidated,
781                                  userptr.invalidate_link) {
782                 list_del_init(&vma->userptr.invalidate_link);
783                 list_move_tail(&vma->userptr_link, &vm->userptr.repin_list);
784         }
785         spin_unlock(&vm->userptr.invalidated_lock);
786
787         /* Pin and move to temporary list */
788         list_for_each_entry_safe(vma, next, &vm->userptr.repin_list, userptr_link) {
789                 err = xe_vma_userptr_pin_pages(vma);
790                 if (err < 0)
791                         goto out_err;
792
793                 list_move_tail(&vma->userptr_link, &tmp_evict);
794         }
795
796         /* Take lock and move to rebind_list for rebinding. */
797         err = dma_resv_lock_interruptible(xe_vm_resv(vm), NULL);
798         if (err)
799                 goto out_err;
800
801         list_for_each_entry_safe(vma, next, &tmp_evict, userptr_link) {
802                 list_del_init(&vma->userptr_link);
803                 list_move_tail(&vma->rebind_link, &vm->rebind_list);
804         }
805
806         dma_resv_unlock(xe_vm_resv(vm));
807
808         return 0;
809
810 out_err:
811         list_splice_tail(&tmp_evict, &vm->userptr.repin_list);
812
813         return err;
814 }
815
816 /**
817  * xe_vm_userptr_check_repin() - Check whether the VM might have userptrs
818  * that need repinning.
819  * @vm: The VM.
820  *
821  * This function does an advisory check for whether the VM has userptrs that
822  * need repinning.
823  *
824  * Return: 0 if there are no indications of userptrs needing repinning,
825  * -EAGAIN if there are.
826  */
827 int xe_vm_userptr_check_repin(struct xe_vm *vm)
828 {
829         return (list_empty_careful(&vm->userptr.repin_list) &&
830                 list_empty_careful(&vm->userptr.invalidated)) ? 0 : -EAGAIN;
831 }
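
/*
 * Usage sketch (mirrors the rebind worker above): do the advisory VM-level
 * check before taking further locks, and repin if it indicates so:
 *
 *	if (xe_vm_userptr_check_repin(vm)) {
 *		err = xe_vm_userptr_pin(vm);
 *		if (err)
 *			return err;
 *	}
 */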
832
833 static struct dma_fence *
834 xe_vm_bind_vma(struct xe_vma *vma, struct xe_engine *e,
835                struct xe_sync_entry *syncs, u32 num_syncs,
836                bool first_op, bool last_op);
837
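
/*
 * Rebind all VMAs on the VM's rebind list, returning the fence of the last
 * rebind, NULL if there was nothing to rebind, or an error pointer on failure.
 */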
838 struct dma_fence *xe_vm_rebind(struct xe_vm *vm, bool rebind_worker)
839 {
840         struct dma_fence *fence = NULL;
841         struct xe_vma *vma, *next;
842
843         lockdep_assert_held(&vm->lock);
844         if (xe_vm_no_dma_fences(vm) && !rebind_worker)
845                 return NULL;
846
847         xe_vm_assert_held(vm);
848         list_for_each_entry_safe(vma, next, &vm->rebind_list, rebind_link) {
849                 XE_WARN_ON(!vma->tile_present);
850
851                 list_del_init(&vma->rebind_link);
852                 dma_fence_put(fence);
853                 if (rebind_worker)
854                         trace_xe_vma_rebind_worker(vma);
855                 else
856                         trace_xe_vma_rebind_exec(vma);
857                 fence = xe_vm_bind_vma(vma, NULL, NULL, 0, false, false);
858                 if (IS_ERR(fence))
859                         return fence;
860         }
861
862         return fence;
863 }
864
865 static struct xe_vma *xe_vma_create(struct xe_vm *vm,
866                                     struct xe_bo *bo,
867                                     u64 bo_offset_or_userptr,
868                                     u64 start, u64 end,
869                                     bool read_only,
870                                     bool is_null,
871                                     u64 tile_mask)
872 {
873         struct xe_vma *vma;
874         struct xe_tile *tile;
875         u8 id;
876
877         XE_BUG_ON(start >= end);
878         XE_BUG_ON(end >= vm->size);
879
880         vma = kzalloc(sizeof(*vma), GFP_KERNEL);
881         if (!vma) {
882                 vma = ERR_PTR(-ENOMEM);
883                 return vma;
884         }
885
886         INIT_LIST_HEAD(&vma->rebind_link);
887         INIT_LIST_HEAD(&vma->unbind_link);
888         INIT_LIST_HEAD(&vma->userptr_link);
889         INIT_LIST_HEAD(&vma->userptr.invalidate_link);
890         INIT_LIST_HEAD(&vma->notifier.rebind_link);
891         INIT_LIST_HEAD(&vma->extobj.link);
892
893         INIT_LIST_HEAD(&vma->gpuva.gem.entry);
894         vma->gpuva.vm = &vm->gpuvm;
895         vma->gpuva.va.addr = start;
896         vma->gpuva.va.range = end - start + 1;
897         if (read_only)
898                 vma->gpuva.flags |= XE_VMA_READ_ONLY;
899         if (is_null)
900                 vma->gpuva.flags |= DRM_GPUVA_SPARSE;
901
902         if (tile_mask) {
903                 vma->tile_mask = tile_mask;
904         } else {
905                 for_each_tile(tile, vm->xe, id)
906                         vma->tile_mask |= 0x1 << id;
907         }
908
909         if (vm->xe->info.platform == XE_PVC)
910                 vma->gpuva.flags |= XE_VMA_ATOMIC_PTE_BIT;
911
912         if (bo) {
913                 struct drm_gpuvm_bo *vm_bo;
914
915                 xe_bo_assert_held(bo);
916
917                 vm_bo = drm_gpuvm_bo_obtain(vma->gpuva.vm, &bo->ttm.base);
918                 if (IS_ERR(vm_bo)) {
919                         kfree(vma);
920                         return ERR_CAST(vm_bo);
921                 }
922
923                 drm_gem_object_get(&bo->ttm.base);
924                 vma->gpuva.gem.obj = &bo->ttm.base;
925                 vma->gpuva.gem.offset = bo_offset_or_userptr;
926                 drm_gpuva_link(&vma->gpuva, vm_bo);
927                 drm_gpuvm_bo_put(vm_bo);
928         } else /* userptr or null */ {
929                 if (!is_null) {
930                         u64 size = end - start + 1;
931                         int err;
932
933                         vma->gpuva.gem.offset = bo_offset_or_userptr;
934
935                         err = mmu_interval_notifier_insert(&vma->userptr.notifier,
936                                                            current->mm,
937                                                            xe_vma_userptr(vma), size,
938                                                            &vma_userptr_notifier_ops);
939                         if (err) {
940                                 kfree(vma);
941                                 vma = ERR_PTR(err);
942                                 return vma;
943                         }
944
945                         vma->userptr.notifier_seq = LONG_MAX;
946                 }
947
948                 xe_vm_get(vm);
949         }
950
951         return vma;
952 }
953
954 static bool vm_remove_extobj(struct xe_vma *vma)
955 {
956         if (!list_empty(&vma->extobj.link)) {
957                 xe_vma_vm(vma)->extobj.entries--;
958                 list_del_init(&vma->extobj.link);
959                 return true;
960         }
961         return false;
962 }
963
964 static void xe_vma_destroy_late(struct xe_vma *vma)
965 {
966         struct xe_vm *vm = xe_vma_vm(vma);
967         struct xe_device *xe = vm->xe;
968         bool read_only = xe_vma_read_only(vma);
969
970         if (xe_vma_is_userptr(vma)) {
971                 if (vma->userptr.sg) {
972                         dma_unmap_sgtable(xe->drm.dev,
973                                           vma->userptr.sg,
974                                           read_only ? DMA_TO_DEVICE :
975                                           DMA_BIDIRECTIONAL, 0);
976                         sg_free_table(vma->userptr.sg);
977                         vma->userptr.sg = NULL;
978                 }
979
980                 /*
981                  * Since userptr pages are not pinned, we can't remove
982                  * the notifier until we're sure the GPU is not accessing
983                  * them anymore
984                  */
985                 mmu_interval_notifier_remove(&vma->userptr.notifier);
986                 xe_vm_put(vm);
987         } else if (xe_vma_is_null(vma)) {
988                 xe_vm_put(vm);
989         } else {
990                 xe_bo_put(xe_vma_bo(vma));
991         }
992
993         kfree(vma);
994 }
995
996 static void vma_destroy_work_func(struct work_struct *w)
997 {
998         struct xe_vma *vma =
999                 container_of(w, struct xe_vma, destroy_work);
1000
1001         xe_vma_destroy_late(vma);
1002 }
1003
1004 static struct xe_vma *
1005 bo_has_vm_references_locked(struct xe_bo *bo, struct xe_vm *vm,
1006                             struct xe_vma *ignore)
1007 {
1008         struct drm_gpuvm_bo *vm_bo;
1009         struct drm_gpuva *va;
1010         struct drm_gem_object *obj = &bo->ttm.base;
1011
1012         xe_bo_assert_held(bo);
1013
1014         drm_gem_for_each_gpuvm_bo(vm_bo, obj) {
1015                 drm_gpuvm_bo_for_each_va(va, vm_bo) {
1016                         struct xe_vma *vma = gpuva_to_vma(va);
1017
1018                         if (vma != ignore && xe_vma_vm(vma) == vm)
1019                                 return vma;
1020                 }
1021         }
1022
1023         return NULL;
1024 }
1025
1026 static bool bo_has_vm_references(struct xe_bo *bo, struct xe_vm *vm,
1027                                  struct xe_vma *ignore)
1028 {
1029         struct ww_acquire_ctx ww;
1030         bool ret;
1031
1032         xe_bo_lock(bo, &ww, 0, false);
1033         ret = !!bo_has_vm_references_locked(bo, vm, ignore);
1034         xe_bo_unlock(bo, &ww);
1035
1036         return ret;
1037 }
1038
1039 static void __vm_insert_extobj(struct xe_vm *vm, struct xe_vma *vma)
1040 {
1041         lockdep_assert_held_write(&vm->lock);
1042
1043         list_add(&vma->extobj.link, &vm->extobj.list);
1044         vm->extobj.entries++;
1045 }
1046
1047 static void vm_insert_extobj(struct xe_vm *vm, struct xe_vma *vma)
1048 {
1049         struct xe_bo *bo = xe_vma_bo(vma);
1050
1051         lockdep_assert_held_write(&vm->lock);
1052
1053         if (bo_has_vm_references(bo, vm, vma))
1054                 return;
1055
1056         __vm_insert_extobj(vm, vma);
1057 }
1058
1059 static void vma_destroy_cb(struct dma_fence *fence,
1060                            struct dma_fence_cb *cb)
1061 {
1062         struct xe_vma *vma = container_of(cb, struct xe_vma, destroy_cb);
1063
1064         INIT_WORK(&vma->destroy_work, vma_destroy_work_func);
1065         queue_work(system_unbound_wq, &vma->destroy_work);
1066 }
1067
1068 static void xe_vma_destroy(struct xe_vma *vma, struct dma_fence *fence)
1069 {
1070         struct xe_vm *vm = xe_vma_vm(vma);
1071
1072         lockdep_assert_held_write(&vm->lock);
1073         XE_BUG_ON(!list_empty(&vma->unbind_link));
1074
1075         if (xe_vma_is_userptr(vma)) {
1076                 XE_WARN_ON(!(vma->gpuva.flags & XE_VMA_DESTROYED));
1077
1078                 spin_lock(&vm->userptr.invalidated_lock);
1079                 list_del_init(&vma->userptr.invalidate_link);
1080                 spin_unlock(&vm->userptr.invalidated_lock);
1081                 list_del(&vma->userptr_link);
1082         } else if (!xe_vma_is_null(vma)) {
1083                 xe_bo_assert_held(xe_vma_bo(vma));
1084
1085                 spin_lock(&vm->notifier.list_lock);
1086                 list_del(&vma->notifier.rebind_link);
1087                 spin_unlock(&vm->notifier.list_lock);
1088
1089                 drm_gpuva_unlink(&vma->gpuva);
1090
1091                 if (!xe_vma_bo(vma)->vm && vm_remove_extobj(vma)) {
1092                         struct xe_vma *other;
1093
1094                         other = bo_has_vm_references_locked(xe_vma_bo(vma), vm, NULL);
1095
1096                         if (other)
1097                                 __vm_insert_extobj(vm, other);
1098                 }
1099         }
1100
1101         xe_vm_assert_held(vm);
1102         if (!list_empty(&vma->rebind_link))
1103                 list_del(&vma->rebind_link);
1104
1105         if (fence) {
1106                 int ret = dma_fence_add_callback(fence, &vma->destroy_cb,
1107                                                  vma_destroy_cb);
1108
1109                 if (ret) {
1110                         XE_WARN_ON(ret != -ENOENT);
1111                         xe_vma_destroy_late(vma);
1112                 }
1113         } else {
1114                 xe_vma_destroy_late(vma);
1115         }
1116 }
1117
1118 static void xe_vma_destroy_unlocked(struct xe_vma *vma)
1119 {
1120         struct ttm_validate_buffer tv[2];
1121         struct ww_acquire_ctx ww;
1122         struct xe_bo *bo = xe_vma_bo(vma);
1123         LIST_HEAD(objs);
1124         LIST_HEAD(dups);
1125         int err;
1126
1127         memset(tv, 0, sizeof(tv));
1128         tv[0].bo = xe_vm_ttm_bo(xe_vma_vm(vma));
1129         list_add(&tv[0].head, &objs);
1130
1131         if (bo) {
1132                 tv[1].bo = &xe_bo_get(bo)->ttm;
1133                 list_add(&tv[1].head, &objs);
1134         }
1135         err = ttm_eu_reserve_buffers(&ww, &objs, false, &dups);
1136         XE_WARN_ON(err);
1137
1138         xe_vma_destroy(vma, NULL);
1139
1140         ttm_eu_backoff_reservation(&ww, &objs);
1141         if (bo)
1142                 xe_bo_put(bo);
1143 }
1144
1145 struct xe_vma *
1146 xe_vm_find_overlapping_vma(struct xe_vm *vm, u64 start, u64 range)
1147 {
1148         struct drm_gpuva *gpuva;
1149
1150         lockdep_assert_held(&vm->lock);
1151
1152         if (xe_vm_is_closed_or_banned(vm))
1153                 return NULL;
1154
1155         XE_BUG_ON(start + range > vm->size);
1156
1157         gpuva = drm_gpuva_find_first(&vm->gpuvm, start, range);
1158
1159         return gpuva ? gpuva_to_vma(gpuva) : NULL;
1160 }
1161
1162 static int xe_vm_insert_vma(struct xe_vm *vm, struct xe_vma *vma)
1163 {
1164         int err;
1165
1166         XE_BUG_ON(xe_vma_vm(vma) != vm);
1167         lockdep_assert_held(&vm->lock);
1168
1169         err = drm_gpuva_insert(&vm->gpuvm, &vma->gpuva);
1170         XE_WARN_ON(err);        /* Shouldn't be possible */
1171
1172         return err;
1173 }
1174
1175 static void xe_vm_remove_vma(struct xe_vm *vm, struct xe_vma *vma)
1176 {
1177         XE_BUG_ON(xe_vma_vm(vma) != vm);
1178         lockdep_assert_held(&vm->lock);
1179
1180         drm_gpuva_remove(&vma->gpuva);
1181         if (vm->usm.last_fault_vma == vma)
1182                 vm->usm.last_fault_vma = NULL;
1183 }
1184
1185 static struct drm_gpuva_op *xe_vm_op_alloc(void)
1186 {
1187         struct xe_vma_op *op;
1188
1189         op = kzalloc(sizeof(*op), GFP_KERNEL);
1190
1191         if (unlikely(!op))
1192                 return NULL;
1193
1194         return &op->base;
1195 }
1196
1197 static void xe_vm_free(struct drm_gpuvm *gpuvm);
1198
1199 static struct drm_gpuvm_ops gpuvm_ops = {
1200         .op_alloc = xe_vm_op_alloc,
1201         .vm_free = xe_vm_free,
1202 };
1203
1204 static void xe_vma_op_work_func(struct work_struct *w);
1205 static void vm_destroy_work_func(struct work_struct *w);
1206
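
/*
 * Create and initialize a VM: set up the GPUVM, the per-tile page-table roots
 * (and scratch tables if requested) and, unless this is a migration VM, a
 * VM-private copy engine per tile.
 */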
1207 struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags)
1208 {
1209         struct drm_gem_object *vm_resv_obj;
1210         struct xe_vm *vm;
1211         int err, number_tiles = 0;
1212         struct xe_tile *tile;
1213         u8 id;
1214
1215         vm = kzalloc(sizeof(*vm), GFP_KERNEL);
1216         if (!vm)
1217                 return ERR_PTR(-ENOMEM);
1218
1219         vm->xe = xe;
1220
1221         vm->size = 1ull << xe_pt_shift(xe->info.vm_max_level + 1);
1222
1223         vm->flags = flags;
1224
1225         init_rwsem(&vm->lock);
1226
1227         INIT_LIST_HEAD(&vm->rebind_list);
1228
1229         INIT_LIST_HEAD(&vm->userptr.repin_list);
1230         INIT_LIST_HEAD(&vm->userptr.invalidated);
1231         init_rwsem(&vm->userptr.notifier_lock);
1232         spin_lock_init(&vm->userptr.invalidated_lock);
1233
1234         INIT_LIST_HEAD(&vm->notifier.rebind_list);
1235         spin_lock_init(&vm->notifier.list_lock);
1236
1237         INIT_LIST_HEAD(&vm->async_ops.pending);
1238         INIT_WORK(&vm->async_ops.work, xe_vma_op_work_func);
1239         spin_lock_init(&vm->async_ops.lock);
1240
1241         INIT_WORK(&vm->destroy_work, vm_destroy_work_func);
1242
1243         INIT_LIST_HEAD(&vm->preempt.engines);
1244         vm->preempt.min_run_period_ms = 10;     /* FIXME: Wire up to uAPI */
1245
1246         INIT_LIST_HEAD(&vm->extobj.list);
1247
1248         if (!(flags & XE_VM_FLAG_MIGRATION)) {
1249                 /* We need to immediately exit from any D3 state */
1250                 xe_pm_runtime_get(xe);
1251                 xe_device_mem_access_get(xe);
1252         }
1253
1254         vm_resv_obj = drm_gpuvm_resv_object_alloc(&xe->drm);
1255         if (!vm_resv_obj) {
1256                 err = -ENOMEM;
1257                 goto err_no_resv;
1258         }
1259
1260         drm_gpuvm_init(&vm->gpuvm, "Xe VM", 0, &xe->drm, vm_resv_obj,
1261                        0, vm->size, 0, 0, &gpuvm_ops);
1262
1263         drm_gem_object_put(vm_resv_obj);
1264
1265         err = dma_resv_lock_interruptible(xe_vm_resv(vm), NULL);
1266         if (err)
1267                 goto err_close;
1268
1269         if (IS_DGFX(xe) && xe->info.vram_flags & XE_VRAM_FLAGS_NEED64K)
1270                 vm->flags |= XE_VM_FLAGS_64K;
1271
1272         for_each_tile(tile, xe, id) {
1273                 if (flags & XE_VM_FLAG_MIGRATION &&
1274                     tile->id != XE_VM_FLAG_GT_ID(flags))
1275                         continue;
1276
1277                 vm->pt_root[id] = xe_pt_create(vm, tile, xe->info.vm_max_level);
1278                 if (IS_ERR(vm->pt_root[id])) {
1279                         err = PTR_ERR(vm->pt_root[id]);
1280                         vm->pt_root[id] = NULL;
1281                         goto err_unlock_close;
1282                 }
1283         }
1284
1285         if (flags & XE_VM_FLAG_SCRATCH_PAGE) {
1286                 for_each_tile(tile, xe, id) {
1287                         if (!vm->pt_root[id])
1288                                 continue;
1289
1290                         err = xe_pt_create_scratch(xe, tile, vm);
1291                         if (err)
1292                                 goto err_unlock_close;
1293                 }
1294                 vm->batch_invalidate_tlb = true;
1295         }
1296
1297         if (flags & XE_VM_FLAG_COMPUTE_MODE) {
1298                 INIT_WORK(&vm->preempt.rebind_work, preempt_rebind_work_func);
1299                 vm->flags |= XE_VM_FLAG_COMPUTE_MODE;
1300                 vm->batch_invalidate_tlb = false;
1301         }
1302
1303         if (flags & XE_VM_FLAG_ASYNC_BIND_OPS) {
1304                 vm->async_ops.fence.context = dma_fence_context_alloc(1);
1305                 vm->flags |= XE_VM_FLAG_ASYNC_BIND_OPS;
1306         }
1307
1308         /* Fill pt_root after allocating scratch tables */
1309         for_each_tile(tile, xe, id) {
1310                 if (!vm->pt_root[id])
1311                         continue;
1312
1313                 xe_pt_populate_empty(tile, vm, vm->pt_root[id]);
1314         }
1315         dma_resv_unlock(xe_vm_resv(vm));
1316
1317         /* Kernel migration VM shouldn't have a circular loop... */
1318         if (!(flags & XE_VM_FLAG_MIGRATION)) {
1319                 for_each_tile(tile, xe, id) {
1320                         struct xe_gt *gt = tile->primary_gt;
1321                         struct xe_vm *migrate_vm;
1322                         struct xe_engine *eng;
1323
1324                         if (!vm->pt_root[id])
1325                                 continue;
1326
1327                         migrate_vm = xe_migrate_get_vm(tile->migrate);
1328                         eng = xe_engine_create_class(xe, gt, migrate_vm,
1329                                                      XE_ENGINE_CLASS_COPY,
1330                                                      ENGINE_FLAG_VM);
1331                         xe_vm_put(migrate_vm);
1332                         if (IS_ERR(eng)) {
1333                                 err = PTR_ERR(eng);
1334                                 goto err_close;
1335                         }
1336                         vm->eng[id] = eng;
1337                         number_tiles++;
1338                 }
1339         }
1340
1341         if (number_tiles > 1)
1342                 vm->composite_fence_ctx = dma_fence_context_alloc(1);
1343
1344         mutex_lock(&xe->usm.lock);
1345         if (flags & XE_VM_FLAG_FAULT_MODE)
1346                 xe->usm.num_vm_in_fault_mode++;
1347         else if (!(flags & XE_VM_FLAG_MIGRATION))
1348                 xe->usm.num_vm_in_non_fault_mode++;
1349         mutex_unlock(&xe->usm.lock);
1350
1351         trace_xe_vm_create(vm);
1352
1353         return vm;
1354
1355 err_unlock_close:
1356         dma_resv_unlock(xe_vm_resv(vm));
1357 err_close:
1358         xe_vm_close_and_put(vm);
1359         return ERR_PTR(err);
1360
1361 err_no_resv:
1362         kfree(vm);
1363         if (!(flags & XE_VM_FLAG_MIGRATION)) {
1364                 xe_device_mem_access_put(xe);
1365                 xe_pm_runtime_put(xe);
1366         }
1367         return ERR_PTR(err);
1368 }
1369
1370 static void flush_async_ops(struct xe_vm *vm)
1371 {
1372         queue_work(system_unbound_wq, &vm->async_ops.work);
1373         flush_work(&vm->async_ops.work);
1374 }
1375
1376 static void vm_error_capture(struct xe_vm *vm, int err,
1377                              u32 op, u64 addr, u64 size)
1378 {
1379         struct drm_xe_vm_bind_op_error_capture capture;
1380         u64 __user *address =
1381                 u64_to_user_ptr(vm->async_ops.error_capture.addr);
1382         bool in_kthread = !current->mm;
1383
1384         capture.error = err;
1385         capture.op = op;
1386         capture.addr = addr;
1387         capture.size = size;
1388
1389         if (in_kthread) {
1390                 if (!mmget_not_zero(vm->async_ops.error_capture.mm))
1391                         goto mm_closed;
1392                 kthread_use_mm(vm->async_ops.error_capture.mm);
1393         }
1394
1395         if (copy_to_user(address, &capture, sizeof(capture)))
1396                 XE_WARN_ON("Copy to user failed");
1397
1398         if (in_kthread) {
1399                 kthread_unuse_mm(vm->async_ops.error_capture.mm);
1400                 mmput(vm->async_ops.error_capture.mm);
1401         }
1402
1403 mm_closed:
1404         wake_up_all(&vm->async_ops.error_capture.wq);
1405 }
1406
1407 static void xe_vm_close(struct xe_vm *vm)
1408 {
1409         down_write(&vm->lock);
1410         vm->size = 0;
1411         up_write(&vm->lock);
1412 }
1413
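
/*
 * Close a VM and drop a reference: flush async bind ops and the rebind
 * worker, kill and release the per-tile engines, destroy all VMAs, scratch
 * tables and page-table roots, then put the VM.
 */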
1414 void xe_vm_close_and_put(struct xe_vm *vm)
1415 {
1416         LIST_HEAD(contested);
1417         struct ww_acquire_ctx ww;
1418         struct xe_device *xe = vm->xe;
1419         struct xe_tile *tile;
1420         struct xe_vma *vma, *next_vma;
1421         struct drm_gpuva *gpuva, *next;
1422         u8 id;
1423
1424         XE_BUG_ON(vm->preempt.num_engines);
1425
1426         xe_vm_close(vm);
1427         flush_async_ops(vm);
1428         if (xe_vm_in_compute_mode(vm))
1429                 flush_work(&vm->preempt.rebind_work);
1430
1431         for_each_tile(tile, xe, id) {
1432                 if (vm->eng[id]) {
1433                         xe_engine_kill(vm->eng[id]);
1434                         xe_engine_put(vm->eng[id]);
1435                         vm->eng[id] = NULL;
1436                 }
1437         }
1438
1439         down_write(&vm->lock);
1440         xe_vm_lock(vm, &ww, 0, false);
1441         drm_gpuvm_for_each_va_safe(gpuva, next, &vm->gpuvm) {
1442                 vma = gpuva_to_vma(gpuva);
1443
1444                 if (xe_vma_has_no_bo(vma)) {
1445                         down_read(&vm->userptr.notifier_lock);
1446                         vma->gpuva.flags |= XE_VMA_DESTROYED;
1447                         up_read(&vm->userptr.notifier_lock);
1448                 }
1449
1450                 xe_vm_remove_vma(vm, vma);
1451
1452                 /* easy case, remove from VMA? */
1453                 if (xe_vma_has_no_bo(vma) || xe_vma_bo(vma)->vm) {
1454                         xe_vma_destroy(vma, NULL);
1455                         continue;
1456                 }
1457
1458                 list_add_tail(&vma->unbind_link, &contested);
1459         }
1460
1461         /*
1462          * All vm operations will add shared fences to resv.
1463          * The only exception is eviction for a shared object,
1464          * but even so, the unbind when evicted would still
1465          * install a fence to resv. Hence it's safe to
1466          * destroy the pagetables immediately.
1467          */
1468         for_each_tile(tile, xe, id) {
1469                 if (vm->scratch_bo[id]) {
1470                         u32 i;
1471
1472                         xe_bo_unpin(vm->scratch_bo[id]);
1473                         xe_bo_put(vm->scratch_bo[id]);
1474                         for (i = 0; i < vm->pt_root[id]->level; i++)
1475                                 xe_pt_destroy(vm->scratch_pt[id][i], vm->flags,
1476                                               NULL);
1477                 }
1478                 if (vm->pt_root[id]) {
1479                         xe_pt_destroy(vm->pt_root[id], vm->flags, NULL);
1480                         vm->pt_root[id] = NULL;
1481                 }
1482         }
1483         xe_vm_unlock(vm, &ww);
1484
1485         /*
1486          * VM is now dead, cannot re-add nodes to vm->vmas if it's NULL
1487          * Since we hold a refcount to the bo, we can remove and free
1488          * the members safely without locking.
1489          */
1490         list_for_each_entry_safe(vma, next_vma, &contested, unbind_link) {
1491                 list_del_init(&vma->unbind_link);
1492                 xe_vma_destroy_unlocked(vma);
1493         }
1494
1495         if (vm->async_ops.error_capture.addr)
1496                 wake_up_all(&vm->async_ops.error_capture.wq);
1497
1498         XE_WARN_ON(!list_empty(&vm->extobj.list));
1499         up_write(&vm->lock);
1500
1501         mutex_lock(&xe->usm.lock);
1502         if (vm->flags & XE_VM_FLAG_FAULT_MODE)
1503                 xe->usm.num_vm_in_fault_mode--;
1504         else if (!(vm->flags & XE_VM_FLAG_MIGRATION))
1505                 xe->usm.num_vm_in_non_fault_mode--;
1506         mutex_unlock(&xe->usm.lock);
1507
1508         xe_vm_put(vm);
1509 }
1510
1511 static void vm_destroy_work_func(struct work_struct *w)
1512 {
1513         struct xe_vm *vm =
1514                 container_of(w, struct xe_vm, destroy_work);
1515         struct xe_device *xe = vm->xe;
1516         struct xe_tile *tile;
1517         u8 id;
1518         void *lookup;
1519
1520         /* xe_vm_close_and_put was not called? */
1521         XE_WARN_ON(vm->size);
1522
1523         if (!(vm->flags & XE_VM_FLAG_MIGRATION)) {
1524                 xe_device_mem_access_put(xe);
1525                 xe_pm_runtime_put(xe);
1526
1527                 if (xe->info.has_asid) {
1528                         mutex_lock(&xe->usm.lock);
1529                         lookup = xa_erase(&xe->usm.asid_to_vm, vm->usm.asid);
1530                         XE_WARN_ON(lookup != vm);
1531                         mutex_unlock(&xe->usm.lock);
1532                 }
1533         }
1534
1535         for_each_tile(tile, xe, id)
1536                 XE_WARN_ON(vm->pt_root[id]);
1537
1538         trace_xe_vm_free(vm);
1539         dma_fence_put(vm->rebind_fence);
1540         kfree(vm);
1541 }
1542
1543 static void xe_vm_free(struct drm_gpuvm *gpuvm)
1544 {
1545         struct xe_vm *vm = container_of(gpuvm, struct xe_vm, gpuvm);
1546
1547         /* To destroy the VM we need to be able to sleep */
1548         queue_work(system_unbound_wq, &vm->destroy_work);
1549 }
1550
1551 struct xe_vm *xe_vm_lookup(struct xe_file *xef, u32 id)
1552 {
1553         struct xe_vm *vm;
1554
1555         mutex_lock(&xef->vm.lock);
1556         vm = xa_load(&xef->vm.xa, id);
1557         if (vm)
1558                 xe_vm_get(vm);
1559         mutex_unlock(&xef->vm.lock);
1560
1561         return vm;
1562 }
1563
1564 u64 xe_vm_pdp4_descriptor(struct xe_vm *vm, struct xe_tile *tile)
1565 {
1566         return xe_pde_encode(vm->pt_root[tile->id]->bo, 0,
1567                              XE_CACHE_WB);
1568 }
1569
1570 static struct dma_fence *
1571 xe_vm_unbind_vma(struct xe_vma *vma, struct xe_engine *e,
1572                  struct xe_sync_entry *syncs, u32 num_syncs,
1573                  bool first_op, bool last_op)
1574 {
1575         struct xe_tile *tile;
1576         struct dma_fence *fence = NULL;
1577         struct dma_fence **fences = NULL;
1578         struct dma_fence_array *cf = NULL;
1579         struct xe_vm *vm = xe_vma_vm(vma);
1580         int cur_fence = 0, i;
1581         int number_tiles = hweight_long(vma->tile_present);
1582         int err;
1583         u8 id;
1584
1585         trace_xe_vma_unbind(vma);
1586
1587         if (number_tiles > 1) {
1588                 fences = kmalloc_array(number_tiles, sizeof(*fences),
1589                                        GFP_KERNEL);
1590                 if (!fences)
1591                         return ERR_PTR(-ENOMEM);
1592         }
1593
1594         for_each_tile(tile, vm->xe, id) {
1595                 if (!(vma->tile_present & BIT(id)))
1596                         goto next;
1597
1598                 fence = __xe_pt_unbind_vma(tile, vma, e, first_op ? syncs : NULL,
1599                                            first_op ? num_syncs : 0);
1600                 if (IS_ERR(fence)) {
1601                         err = PTR_ERR(fence);
1602                         goto err_fences;
1603                 }
1604
1605                 if (fences)
1606                         fences[cur_fence++] = fence;
1607
1608 next:
1609                 if (e && vm->pt_root[id] && !list_empty(&e->multi_gt_list))
1610                         e = list_next_entry(e, multi_gt_list);
1611         }
1612
1613         if (fences) {
1614                 cf = dma_fence_array_create(number_tiles, fences,
1615                                             vm->composite_fence_ctx,
1616                                             vm->composite_fence_seqno++,
1617                                             false);
1618                 if (!cf) {
1619                         --vm->composite_fence_seqno;
1620                         err = -ENOMEM;
1621                         goto err_fences;
1622                 }
1623         }
1624
1625         if (last_op) {
1626                 for (i = 0; i < num_syncs; i++)
1627                         xe_sync_entry_signal(&syncs[i], NULL,
1628                                              cf ? &cf->base : fence);
1629         }
1630
1631         return cf ? &cf->base : !fence ? dma_fence_get_stub() : fence;
1632
1633 err_fences:
1634         if (fences) {
1635                 while (cur_fence) {
1636                         /* FIXME: Rewind the previous binds? */
1637                         dma_fence_put(fences[--cur_fence]);
1638                 }
1639                 kfree(fences);
1640         }
1641
1642         return ERR_PTR(err);
1643 }
1644
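/*
 * Bind @vma on every tile in its tile_mask, telling __xe_pt_bind_vma() whether
 * the tile already has the VMA present (a rebind).  As with unbind, multiple
 * per-tile fences are aggregated into a single dma_fence_array.
 */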
1645 static struct dma_fence *
1646 xe_vm_bind_vma(struct xe_vma *vma, struct xe_engine *e,
1647                struct xe_sync_entry *syncs, u32 num_syncs,
1648                bool first_op, bool last_op)
1649 {
1650         struct xe_tile *tile;
1651         struct dma_fence *fence;
1652         struct dma_fence **fences = NULL;
1653         struct dma_fence_array *cf = NULL;
1654         struct xe_vm *vm = xe_vma_vm(vma);
1655         int cur_fence = 0, i;
1656         int number_tiles = hweight_long(vma->tile_mask);
1657         int err;
1658         u8 id;
1659
1660         trace_xe_vma_bind(vma);
1661
1662         if (number_tiles > 1) {
1663                 fences = kmalloc_array(number_tiles, sizeof(*fences),
1664                                        GFP_KERNEL);
1665                 if (!fences)
1666                         return ERR_PTR(-ENOMEM);
1667         }
1668
1669         for_each_tile(tile, vm->xe, id) {
1670                 if (!(vma->tile_mask & BIT(id)))
1671                         goto next;
1672
1673                 fence = __xe_pt_bind_vma(tile, vma, e, first_op ? syncs : NULL,
1674                                          first_op ? num_syncs : 0,
1675                                          vma->tile_present & BIT(id));
1676                 if (IS_ERR(fence)) {
1677                         err = PTR_ERR(fence);
1678                         goto err_fences;
1679                 }
1680
1681                 if (fences)
1682                         fences[cur_fence++] = fence;
1683
1684 next:
1685                 if (e && vm->pt_root[id] && !list_empty(&e->multi_gt_list))
1686                         e = list_next_entry(e, multi_gt_list);
1687         }
1688
1689         if (fences) {
1690                 cf = dma_fence_array_create(number_tiles, fences,
1691                                             vm->composite_fence_ctx,
1692                                             vm->composite_fence_seqno++,
1693                                             false);
1694                 if (!cf) {
1695                         --vm->composite_fence_seqno;
1696                         err = -ENOMEM;
1697                         goto err_fences;
1698                 }
1699         }
1700
1701         if (last_op) {
1702                 for (i = 0; i < num_syncs; i++)
1703                         xe_sync_entry_signal(&syncs[i], NULL,
1704                                              cf ? &cf->base : fence);
1705         }
1706
1707         return cf ? &cf->base : fence;
1708
1709 err_fences:
1710         if (fences) {
1711                 while (cur_fence) {
1712                         /* FIXME: Rewind the previous binds? */
1713                         dma_fence_put(fences[--cur_fence]);
1714                 }
1715                 kfree(fences);
1716         }
1717
1718         return ERR_PTR(err);
1719 }
1720
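/*
 * Fence published to user syncs for asynchronous bind operations.  It is
 * signalled from async_op_fence_cb() once @wait_fence (the fence of the
 * underlying bind/unbind) signals, inheriting its error; @wq and @started are
 * only used on VMs that use dma-fences, see xe_vm_async_fence_wait_start().
 */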
1721 struct async_op_fence {
1722         struct dma_fence fence;
1723         struct dma_fence *wait_fence;
1724         struct dma_fence_cb cb;
1725         struct xe_vm *vm;
1726         wait_queue_head_t wq;
1727         bool started;
1728 };
1729
1730 static const char *async_op_fence_get_driver_name(struct dma_fence *dma_fence)
1731 {
1732         return "xe";
1733 }
1734
1735 static const char *
1736 async_op_fence_get_timeline_name(struct dma_fence *dma_fence)
1737 {
1738         return "async_op_fence";
1739 }
1740
1741 static const struct dma_fence_ops async_op_fence_ops = {
1742         .get_driver_name = async_op_fence_get_driver_name,
1743         .get_timeline_name = async_op_fence_get_timeline_name,
1744 };
1745
1746 static void async_op_fence_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
1747 {
1748         struct async_op_fence *afence =
1749                 container_of(cb, struct async_op_fence, cb);
1750
1751         afence->fence.error = afence->wait_fence->error;
1752         dma_fence_signal(&afence->fence);
1753         xe_vm_put(afence->vm);
1754         dma_fence_put(afence->wait_fence);
1755         dma_fence_put(&afence->fence);
1756 }
1757
1758 static void add_async_op_fence_cb(struct xe_vm *vm,
1759                                   struct dma_fence *fence,
1760                                   struct async_op_fence *afence)
1761 {
1762         int ret;
1763
1764         if (!xe_vm_no_dma_fences(vm)) {
1765                 afence->started = true;
1766                 smp_wmb();
1767                 wake_up_all(&afence->wq);
1768         }
1769
1770         afence->wait_fence = dma_fence_get(fence);
1771         afence->vm = xe_vm_get(vm);
1772         dma_fence_get(&afence->fence);
1773         ret = dma_fence_add_callback(fence, &afence->cb, async_op_fence_cb);
1774         if (ret == -ENOENT) {
1775                 afence->fence.error = afence->wait_fence->error;
1776                 dma_fence_signal(&afence->fence);
1777         }
1778         if (ret) {
1779                 xe_vm_put(vm);
1780                 dma_fence_put(afence->wait_fence);
1781                 dma_fence_put(&afence->fence);
1782         }
1783         XE_WARN_ON(ret && ret != -ENOENT);
1784 }
1785
1786 int xe_vm_async_fence_wait_start(struct dma_fence *fence)
1787 {
1788         if (fence->ops == &async_op_fence_ops) {
1789                 struct async_op_fence *afence =
1790                         container_of(fence, struct async_op_fence, fence);
1791
1792                 XE_BUG_ON(xe_vm_no_dma_fences(afence->vm));
1793
1794                 smp_rmb();
1795                 return wait_event_interruptible(afence->wq, afence->started);
1796         }
1797
1798         return 0;
1799 }
1800
1801 static int __xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma,
1802                         struct xe_engine *e, struct xe_sync_entry *syncs,
1803                         u32 num_syncs, struct async_op_fence *afence,
1804                         bool immediate, bool first_op, bool last_op)
1805 {
1806         struct dma_fence *fence;
1807
1808         xe_vm_assert_held(vm);
1809
1810         if (immediate) {
1811                 fence = xe_vm_bind_vma(vma, e, syncs, num_syncs, first_op,
1812                                        last_op);
1813                 if (IS_ERR(fence))
1814                         return PTR_ERR(fence);
1815         } else {
1816                 int i;
1817
1818                 XE_BUG_ON(!xe_vm_in_fault_mode(vm));
1819
1820                 fence = dma_fence_get_stub();
1821                 if (last_op) {
1822                         for (i = 0; i < num_syncs; i++)
1823                                 xe_sync_entry_signal(&syncs[i], NULL, fence);
1824                 }
1825         }
1826         if (afence)
1827                 add_async_op_fence_cb(vm, fence, afence);
1828
1829         dma_fence_put(fence);
1830         return 0;
1831 }
1832
1833 static int xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma, struct xe_engine *e,
1834                       struct xe_bo *bo, struct xe_sync_entry *syncs,
1835                       u32 num_syncs, struct async_op_fence *afence,
1836                       bool immediate, bool first_op, bool last_op)
1837 {
1838         int err;
1839
1840         xe_vm_assert_held(vm);
1841         xe_bo_assert_held(bo);
1842
1843         if (bo && immediate) {
1844                 err = xe_bo_validate(bo, vm, true);
1845                 if (err)
1846                         return err;
1847         }
1848
1849         return __xe_vm_bind(vm, vma, e, syncs, num_syncs, afence, immediate,
1850                             first_op, last_op);
1851 }
1852
1853 static int xe_vm_unbind(struct xe_vm *vm, struct xe_vma *vma,
1854                         struct xe_engine *e, struct xe_sync_entry *syncs,
1855                         u32 num_syncs, struct async_op_fence *afence,
1856                         bool first_op, bool last_op)
1857 {
1858         struct dma_fence *fence;
1859
1860         xe_vm_assert_held(vm);
1861         xe_bo_assert_held(xe_vma_bo(vma));
1862
1863         fence = xe_vm_unbind_vma(vma, e, syncs, num_syncs, first_op, last_op);
1864         if (IS_ERR(fence))
1865                 return PTR_ERR(fence);
1866         if (afence)
1867                 add_async_op_fence_cb(vm, fence, afence);
1868
1869         xe_vma_destroy(vma, fence);
1870         dma_fence_put(fence);
1871
1872         return 0;
1873 }
1874
1875 static int vm_set_error_capture_address(struct xe_device *xe, struct xe_vm *vm,
1876                                         u64 value)
1877 {
1878         if (XE_IOCTL_ERR(xe, !value))
1879                 return -EINVAL;
1880
1881         if (XE_IOCTL_ERR(xe, !(vm->flags & XE_VM_FLAG_ASYNC_BIND_OPS)))
1882                 return -EOPNOTSUPP;
1883
1884         if (XE_IOCTL_ERR(xe, vm->async_ops.error_capture.addr))
1885                 return -EOPNOTSUPP;
1886
1887         vm->async_ops.error_capture.mm = current->mm;
1888         vm->async_ops.error_capture.addr = value;
1889         init_waitqueue_head(&vm->async_ops.error_capture.wq);
1890
1891         return 0;
1892 }
1893
1894 typedef int (*xe_vm_set_property_fn)(struct xe_device *xe, struct xe_vm *vm,
1895                                      u64 value);
1896
1897 static const xe_vm_set_property_fn vm_set_property_funcs[] = {
1898         [XE_VM_PROPERTY_BIND_OP_ERROR_CAPTURE_ADDRESS] =
1899                 vm_set_error_capture_address,
1900 };
1901
1902 static int vm_user_ext_set_property(struct xe_device *xe, struct xe_vm *vm,
1903                                     u64 extension)
1904 {
1905         u64 __user *address = u64_to_user_ptr(extension);
1906         struct drm_xe_ext_vm_set_property ext;
1907         int err;
1908
1909         err = __copy_from_user(&ext, address, sizeof(ext));
1910         if (XE_IOCTL_ERR(xe, err))
1911                 return -EFAULT;
1912
1913         if (XE_IOCTL_ERR(xe, ext.property >=
1914                          ARRAY_SIZE(vm_set_property_funcs)) ||
1915             XE_IOCTL_ERR(xe, ext.pad) ||
1916             XE_IOCTL_ERR(xe, ext.reserved[0] || ext.reserved[1]))
1917                 return -EINVAL;
1918
1919         return vm_set_property_funcs[ext.property](xe, vm, ext.value);
1920 }
1921
1922 typedef int (*xe_vm_user_extension_fn)(struct xe_device *xe, struct xe_vm *vm,
1923                                        u64 extension);
1924
1925 static const xe_vm_user_extension_fn vm_user_extension_funcs[] = {
1926         [XE_VM_EXTENSION_SET_PROPERTY] = vm_user_ext_set_property,
1927 };
1928
1929 #define MAX_USER_EXTENSIONS     16
1930 static int vm_user_extensions(struct xe_device *xe, struct xe_vm *vm,
1931                               u64 extensions, int ext_number)
1932 {
1933         u64 __user *address = u64_to_user_ptr(extensions);
1934         struct xe_user_extension ext;
1935         int err;
1936
1937         if (XE_IOCTL_ERR(xe, ext_number >= MAX_USER_EXTENSIONS))
1938                 return -E2BIG;
1939
1940         err = __copy_from_user(&ext, address, sizeof(ext));
1941         if (XE_IOCTL_ERR(xe, err))
1942                 return -EFAULT;
1943
1944         if (XE_IOCTL_ERR(xe, ext.pad) ||
1945             XE_IOCTL_ERR(xe, ext.name >=
1946                          ARRAY_SIZE(vm_user_extension_funcs)))
1947                 return -EINVAL;
1948
1949         err = vm_user_extension_funcs[ext.name](xe, vm, extensions);
1950         if (XE_IOCTL_ERR(xe, err))
1951                 return err;
1952
1953         if (ext.next_extension)
1954                 return vm_user_extensions(xe, vm, ext.next_extension,
1955                                           ++ext_number);
1956
1957         return 0;
1958 }
1959
1960 #define ALL_DRM_XE_VM_CREATE_FLAGS (DRM_XE_VM_CREATE_SCRATCH_PAGE | \
1961                                     DRM_XE_VM_CREATE_COMPUTE_MODE | \
1962                                     DRM_XE_VM_CREATE_ASYNC_BIND_OPS | \
1963                                     DRM_XE_VM_CREATE_FAULT_MODE)
1964
1965 int xe_vm_create_ioctl(struct drm_device *dev, void *data,
1966                        struct drm_file *file)
1967 {
1968         struct xe_device *xe = to_xe_device(dev);
1969         struct xe_file *xef = to_xe_file(file);
1970         struct drm_xe_vm_create *args = data;
1971         struct xe_vm *vm;
1972         u32 id, asid;
1973         int err;
1974         u32 flags = 0;
1975
1976         if (XE_IOCTL_ERR(xe, args->reserved[0] || args->reserved[1]))
1977                 return -EINVAL;
1978
1979         if (XE_IOCTL_ERR(xe, args->flags & ~ALL_DRM_XE_VM_CREATE_FLAGS))
1980                 return -EINVAL;
1981
1982         if (XE_IOCTL_ERR(xe, args->flags & DRM_XE_VM_CREATE_SCRATCH_PAGE &&
1983                          args->flags & DRM_XE_VM_CREATE_FAULT_MODE))
1984                 return -EINVAL;
1985
1986         if (XE_IOCTL_ERR(xe, args->flags & DRM_XE_VM_CREATE_COMPUTE_MODE &&
1987                          args->flags & DRM_XE_VM_CREATE_FAULT_MODE))
1988                 return -EINVAL;
1989
1990         if (XE_IOCTL_ERR(xe, args->flags & DRM_XE_VM_CREATE_FAULT_MODE &&
1991                          xe_device_in_non_fault_mode(xe)))
1992                 return -EINVAL;
1993
1994         if (XE_IOCTL_ERR(xe, !(args->flags & DRM_XE_VM_CREATE_FAULT_MODE) &&
1995                          xe_device_in_fault_mode(xe)))
1996                 return -EINVAL;
1997
1998         if (XE_IOCTL_ERR(xe, args->flags & DRM_XE_VM_CREATE_FAULT_MODE &&
1999                          !xe->info.supports_usm))
2000                 return -EINVAL;
2001
2002         if (args->flags & DRM_XE_VM_CREATE_SCRATCH_PAGE)
2003                 flags |= XE_VM_FLAG_SCRATCH_PAGE;
2004         if (args->flags & DRM_XE_VM_CREATE_COMPUTE_MODE)
2005                 flags |= XE_VM_FLAG_COMPUTE_MODE;
2006         if (args->flags & DRM_XE_VM_CREATE_ASYNC_BIND_OPS)
2007                 flags |= XE_VM_FLAG_ASYNC_BIND_OPS;
2008         if (args->flags & DRM_XE_VM_CREATE_FAULT_MODE)
2009                 flags |= XE_VM_FLAG_FAULT_MODE;
2010
2011         vm = xe_vm_create(xe, flags);
2012         if (IS_ERR(vm))
2013                 return PTR_ERR(vm);
2014
2015         if (args->extensions) {
2016                 err = vm_user_extensions(xe, vm, args->extensions, 0);
2017                 if (XE_IOCTL_ERR(xe, err)) {
2018                         xe_vm_close_and_put(vm);
2019                         return err;
2020                 }
2021         }
2022
2023         mutex_lock(&xef->vm.lock);
2024         err = xa_alloc(&xef->vm.xa, &id, vm, xa_limit_32b, GFP_KERNEL);
2025         mutex_unlock(&xef->vm.lock);
2026         if (err) {
2027                 xe_vm_close_and_put(vm);
2028                 return err;
2029         }
2030
2031         if (xe->info.has_asid) {
2032                 mutex_lock(&xe->usm.lock);
2033                 err = xa_alloc_cyclic(&xe->usm.asid_to_vm, &asid, vm,
2034                                       XA_LIMIT(0, XE_MAX_ASID - 1),
2035                                       &xe->usm.next_asid, GFP_KERNEL);
2036                 mutex_unlock(&xe->usm.lock);
2037                 if (err) {
2038                         xe_vm_close_and_put(vm);
2039                         return err;
2040                 }
2041                 vm->usm.asid = asid;
2042         }
2043
2044         args->vm_id = id;
2045
2046 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG_MEM)
2047         /* Warning: Security issue - never enable by default */
2048         args->reserved[0] = xe_bo_main_addr(vm->pt_root[0]->bo, XE_PAGE_SIZE);
2049 #endif
2050
2051         return 0;
2052 }
2053
2054 int xe_vm_destroy_ioctl(struct drm_device *dev, void *data,
2055                         struct drm_file *file)
2056 {
2057         struct xe_device *xe = to_xe_device(dev);
2058         struct xe_file *xef = to_xe_file(file);
2059         struct drm_xe_vm_destroy *args = data;
2060         struct xe_vm *vm;
2061         int err = 0;
2062
2063         if (XE_IOCTL_ERR(xe, args->pad) ||
2064             XE_IOCTL_ERR(xe, args->reserved[0] || args->reserved[1]))
2065                 return -EINVAL;
2066
2067         mutex_lock(&xef->vm.lock);
2068         vm = xa_load(&xef->vm.xa, args->vm_id);
2069         if (XE_IOCTL_ERR(xe, !vm))
2070                 err = -ENOENT;
2071         else if (XE_IOCTL_ERR(xe, vm->preempt.num_engines))
2072                 err = -EBUSY;
2073         else
2074                 xa_erase(&xef->vm.xa, args->vm_id);
2075         mutex_unlock(&xef->vm.lock);
2076
2077         if (!err)
2078                 xe_vm_close_and_put(vm);
2079
2080         return err;
2081 }
2082
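/* Maps the UAPI prefetch region index onto a TTM memory placement. */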
2083 static const u32 region_to_mem_type[] = {
2084         XE_PL_TT,
2085         XE_PL_VRAM0,
2086         XE_PL_VRAM1,
2087 };
2088
2089 static int xe_vm_prefetch(struct xe_vm *vm, struct xe_vma *vma,
2090                           struct xe_engine *e, u32 region,
2091                           struct xe_sync_entry *syncs, u32 num_syncs,
2092                           struct async_op_fence *afence, bool first_op,
2093                           bool last_op)
2094 {
2095         int err;
2096
2097         XE_BUG_ON(region >= ARRAY_SIZE(region_to_mem_type));
2098
2099         if (!xe_vma_has_no_bo(vma)) {
2100                 err = xe_bo_migrate(xe_vma_bo(vma), region_to_mem_type[region]);
2101                 if (err)
2102                         return err;
2103         }
2104
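	/*
	 * Only issue a (re)bind if some tile in the VMA's tile_mask is not
	 * already bound or has had its binding invalidated; otherwise there
	 * is nothing to do and the syncs/fence are signalled immediately.
	 */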
2105         if (vma->tile_mask != (vma->tile_present & ~vma->usm.tile_invalidated)) {
2106                 return xe_vm_bind(vm, vma, e, xe_vma_bo(vma), syncs, num_syncs,
2107                                   afence, true, first_op, last_op);
2108         } else {
2109                 int i;
2110
2111                 /* Nothing to do, signal fences now */
2112                 if (last_op) {
2113                         for (i = 0; i < num_syncs; i++)
2114                                 xe_sync_entry_signal(&syncs[i], NULL,
2115                                                      dma_fence_get_stub());
2116                 }
2117                 if (afence)
2118                         dma_fence_signal(&afence->fence);
2119                 return 0;
2120         }
2121 }
2122
2123 #define VM_BIND_OP(op)  ((op) & 0xffff)
2124
2125 struct ttm_buffer_object *xe_vm_ttm_bo(struct xe_vm *vm)
2126 {
2127         int idx = vm->flags & XE_VM_FLAG_MIGRATION ?
2128                 XE_VM_FLAG_GT_ID(vm->flags) : 0;
2129
2130         /* Safe to use index 0 as all BOs in the VM share a single dma-resv lock */
2131         return &vm->pt_root[idx]->bo->ttm;
2132 }
2133
2134 static void xe_vm_tv_populate(struct xe_vm *vm, struct ttm_validate_buffer *tv)
2135 {
2136         tv->num_shared = 1;
2137         tv->bo = xe_vm_ttm_bo(vm);
2138 }
2139
2140 static void vm_set_async_error(struct xe_vm *vm, int err)
2141 {
2142         lockdep_assert_held(&vm->lock);
2143         vm->async_ops.error = err;
2144 }
2145
2146 static int vm_bind_ioctl_lookup_vma(struct xe_vm *vm, struct xe_bo *bo,
2147                                     u64 addr, u64 range, u32 op)
2148 {
2149         struct xe_device *xe = vm->xe;
2150         struct xe_vma *vma;
2151         bool async = !!(op & XE_VM_BIND_FLAG_ASYNC);
2152
2153         lockdep_assert_held(&vm->lock);
2154
2155         switch (VM_BIND_OP(op)) {
2156         case XE_VM_BIND_OP_MAP:
2157         case XE_VM_BIND_OP_MAP_USERPTR:
2158                 vma = xe_vm_find_overlapping_vma(vm, addr, range);
2159                 if (XE_IOCTL_ERR(xe, vma && !async))
2160                         return -EBUSY;
2161                 break;
2162         case XE_VM_BIND_OP_UNMAP:
2163         case XE_VM_BIND_OP_PREFETCH:
2164                 vma = xe_vm_find_overlapping_vma(vm, addr, range);
2165                 if (XE_IOCTL_ERR(xe, !vma))
2166                         return -ENODATA;        /* Not an actual error, IOCTL
2167                                                    cleans up and returns 0 */
2168                 if (XE_IOCTL_ERR(xe, (xe_vma_start(vma) != addr ||
2169                                  xe_vma_end(vma) != addr + range) && !async))
2170                         return -EINVAL;
2171                 break;
2172         case XE_VM_BIND_OP_UNMAP_ALL:
2173                 if (XE_IOCTL_ERR(xe, list_empty(&bo->ttm.base.gpuva.list)))
2174                         return -ENODATA;        /* Not an actual error, IOCTL
2175                                                    cleans up and returns 0 */
2176                 break;
2177         default:
2178                 XE_BUG_ON("NOT POSSIBLE");
2179                 return -EINVAL;
2180         }
2181
2182         return 0;
2183 }
2184
2185 static void prep_vma_destroy(struct xe_vm *vm, struct xe_vma *vma,
2186                              bool post_commit)
2187 {
2188         down_read(&vm->userptr.notifier_lock);
2189         vma->gpuva.flags |= XE_VMA_DESTROYED;
2190         up_read(&vm->userptr.notifier_lock);
2191         if (post_commit)
2192                 xe_vm_remove_vma(vm, vma);
2193 }
2194
2195 #undef ULL
2196 #define ULL     unsigned long long
2197
2198 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG_VM)
2199 static void print_op(struct xe_device *xe, struct drm_gpuva_op *op)
2200 {
2201         struct xe_vma *vma;
2202
2203         switch (op->op) {
2204         case DRM_GPUVA_OP_MAP:
2205                 vm_dbg(&xe->drm, "MAP: addr=0x%016llx, range=0x%016llx",
2206                        (ULL)op->map.va.addr, (ULL)op->map.va.range);
2207                 break;
2208         case DRM_GPUVA_OP_REMAP:
2209                 vma = gpuva_to_vma(op->remap.unmap->va);
2210                 vm_dbg(&xe->drm, "REMAP:UNMAP: addr=0x%016llx, range=0x%016llx, keep=%d",
2211                        (ULL)xe_vma_start(vma), (ULL)xe_vma_size(vma),
2212                        op->remap.unmap->keep ? 1 : 0);
2213                 if (op->remap.prev)
2214                         vm_dbg(&xe->drm,
2215                                "REMAP:PREV: addr=0x%016llx, range=0x%016llx",
2216                                (ULL)op->remap.prev->va.addr,
2217                                (ULL)op->remap.prev->va.range);
2218                 if (op->remap.next)
2219                         vm_dbg(&xe->drm,
2220                                "REMAP:NEXT: addr=0x%016llx, range=0x%016llx",
2221                                (ULL)op->remap.next->va.addr,
2222                                (ULL)op->remap.next->va.range);
2223                 break;
2224         case DRM_GPUVA_OP_UNMAP:
2225                 vma = gpuva_to_vma(op->unmap.va);
2226                 vm_dbg(&xe->drm, "UNMAP: addr=0x%016llx, range=0x%016llx, keep=%d",
2227                        (ULL)xe_vma_start(vma), (ULL)xe_vma_size(vma),
2228                        op->unmap.keep ? 1 : 0);
2229                 break;
2230         case DRM_GPUVA_OP_PREFETCH:
2231                 vma = gpuva_to_vma(op->prefetch.va);
2232                 vm_dbg(&xe->drm, "PREFETCH: addr=0x%016llx, range=0x%016llx",
2233                        (ULL)xe_vma_start(vma), (ULL)xe_vma_size(vma));
2234                 break;
2235         default:
2236                 XE_BUG_ON("NOT POSSIBLE");
2237         }
2238 }
2239 #else
2240 static void print_op(struct xe_device *xe, struct drm_gpuva_op *op)
2241 {
2242 }
2243 #endif
2244
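/*
 * Rough flow of a VM bind (a sketch; the IOCTL entry point itself follows
 * later in this file): each drm_xe_vm_bind_op is first turned into a
 * drm_gpuva_ops list by vm_bind_ioctl_ops_create(), vm_bind_ioctl_ops_parse()
 * then allocates VMAs/resources and builds the xe_vma_op list, and
 * vm_bind_ioctl_ops_commit() either executes it synchronously or queues it
 * for xe_vma_op_work_func().
 */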
2245 /*
2246  * Create the operations list from the IOCTL arguments and set up the operation
2247  * fields so that parse and commit are decoupled from them. This step can fail.
2248  */
2249 static struct drm_gpuva_ops *
2250 vm_bind_ioctl_ops_create(struct xe_vm *vm, struct xe_bo *bo,
2251                          u64 bo_offset_or_userptr, u64 addr, u64 range,
2252                          u32 operation, u64 tile_mask, u32 region)
2253 {
2254         struct drm_gem_object *obj = bo ? &bo->ttm.base : NULL;
2255         struct ww_acquire_ctx ww;
2256         struct drm_gpuva_ops *ops;
2257         struct drm_gpuva_op *__op;
2258         struct xe_vma_op *op;
2259         struct drm_gpuvm_bo *vm_bo;
2260         int err;
2261
2262         lockdep_assert_held_write(&vm->lock);
2263
2264         vm_dbg(&vm->xe->drm,
2265                "op=%d, addr=0x%016llx, range=0x%016llx, bo_offset_or_userptr=0x%016llx",
2266                VM_BIND_OP(operation), (ULL)addr, (ULL)range,
2267                (ULL)bo_offset_or_userptr);
2268
2269         switch (VM_BIND_OP(operation)) {
2270         case XE_VM_BIND_OP_MAP:
2271         case XE_VM_BIND_OP_MAP_USERPTR:
2272                 ops = drm_gpuvm_sm_map_ops_create(&vm->gpuvm, addr, range,
2273                                                   obj, bo_offset_or_userptr);
2274                 if (IS_ERR(ops))
2275                         return ops;
2276
2277                 drm_gpuva_for_each_op(__op, ops) {
2278                         struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2279
2280                         op->tile_mask = tile_mask;
2281                         op->map.immediate =
2282                                 operation & XE_VM_BIND_FLAG_IMMEDIATE;
2283                         op->map.read_only =
2284                                 operation & XE_VM_BIND_FLAG_READONLY;
2285                         op->map.is_null = operation & XE_VM_BIND_FLAG_NULL;
2286                 }
2287                 break;
2288         case XE_VM_BIND_OP_UNMAP:
2289                 ops = drm_gpuvm_sm_unmap_ops_create(&vm->gpuvm, addr, range);
2290                 if (IS_ERR(ops))
2291                         return ops;
2292
2293                 drm_gpuva_for_each_op(__op, ops) {
2294                         struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2295
2296                         op->tile_mask = tile_mask;
2297                 }
2298                 break;
2299         case XE_VM_BIND_OP_PREFETCH:
2300                 ops = drm_gpuvm_prefetch_ops_create(&vm->gpuvm, addr, range);
2301                 if (IS_ERR(ops))
2302                         return ops;
2303
2304                 drm_gpuva_for_each_op(__op, ops) {
2305                         struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2306
2307                         op->tile_mask = tile_mask;
2308                         op->prefetch.region = region;
2309                 }
2310                 break;
2311         case XE_VM_BIND_OP_UNMAP_ALL:
2312                 XE_BUG_ON(!bo);
2313
2314                 err = xe_bo_lock(bo, &ww, 0, true);
2315                 if (err)
2316                         return ERR_PTR(err);
2317
2318                 vm_bo = drm_gpuvm_bo_find(&vm->gpuvm, obj);
2319                 if (!vm_bo) {
2320                         xe_bo_unlock(bo, &ww);
                             return ERR_PTR(-ENODATA);
                     }
2321
2322                 ops = drm_gpuvm_bo_unmap_ops_create(vm_bo);
2323                 drm_gpuvm_bo_put(vm_bo);
2324                 xe_bo_unlock(bo, &ww);
2325                 if (IS_ERR(ops))
2326                         return ops;
2327
2328                 drm_gpuva_for_each_op(__op, ops) {
2329                         struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2330
2331                         op->tile_mask = tile_mask;
2332                 }
2333                 break;
2334         default:
2335                 XE_BUG_ON("NOT POSSIBLE");
2336                 ops = ERR_PTR(-EINVAL);
2337         }
2338
2339 #ifdef TEST_VM_ASYNC_OPS_ERROR
2340         if (!IS_ERR(ops) && (operation & FORCE_ASYNC_OP_ERROR)) {
2341                 op = list_first_entry_or_null(&ops->list, struct xe_vma_op,
2342                                               base.entry);
2343                 if (op)
2344                         op->inject_error = true;
2345         }
2346 #endif
2347
2348         if (!IS_ERR(ops))
2349                 drm_gpuva_for_each_op(__op, ops)
2350                         print_op(vm->xe, __op);
2351
2352         return ops;
2353 }
2354
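/*
 * Allocate an xe_vma for a MAP (or REMAP prev/next) GPUVA op.  Userptr VMAs
 * get their pages pinned here; BOs not private to the VM are added to the
 * external-object list and receive the VM's preempt fences.
 */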
2355 static struct xe_vma *new_vma(struct xe_vm *vm, struct drm_gpuva_op_map *op,
2356                               u64 tile_mask, bool read_only, bool is_null)
2357 {
2358         struct xe_bo *bo = op->gem.obj ? gem_to_xe_bo(op->gem.obj) : NULL;
2359         struct xe_vma *vma;
2360         struct ww_acquire_ctx ww;
2361         int err;
2362
2363         lockdep_assert_held_write(&vm->lock);
2364
2365         if (bo) {
2366                 err = xe_bo_lock(bo, &ww, 0, true);
2367                 if (err)
2368                         return ERR_PTR(err);
2369         }
2370         vma = xe_vma_create(vm, bo, op->gem.offset,
2371                             op->va.addr, op->va.addr +
2372                             op->va.range - 1, read_only, is_null,
2373                             tile_mask);
2374         if (bo)
2375                 xe_bo_unlock(bo, &ww);
2376
2377         if (xe_vma_is_userptr(vma)) {
2378                 err = xe_vma_userptr_pin_pages(vma);
2379                 if (err) {
2380                         prep_vma_destroy(vm, vma, false);
2381                         xe_vma_destroy_unlocked(vma);
2382                         return ERR_PTR(err);
2383                 }
2384         } else if (!xe_vma_has_no_bo(vma) && !bo->vm) {
2385                 vm_insert_extobj(vm, vma);
2386                 err = add_preempt_fences(vm, bo);
2387                 if (err) {
2388                         prep_vma_destroy(vm, vma, false);
2389                         xe_vma_destroy_unlocked(vma);
2390                         return ERR_PTR(err);
2391                 }
2392         }
2393
2394         return vma;
2395 }
2396
2397 /*
2398  * Parse the operations list and create any resources needed for the operations
2399  * prior to fully committing to them. This setup can fail.
2400  */
2401 static int vm_bind_ioctl_ops_parse(struct xe_vm *vm, struct xe_engine *e,
2402                                    struct drm_gpuva_ops **ops, int num_ops_list,
2403                                    struct xe_sync_entry *syncs, u32 num_syncs,
2404                                    struct list_head *ops_list, bool async)
2405 {
2406         struct xe_vma_op *last_op = NULL;
2407         struct list_head *async_list = NULL;
2408         struct async_op_fence *fence = NULL;
2409         int err, i;
2410
2411         lockdep_assert_held_write(&vm->lock);
2412         XE_BUG_ON(num_ops_list > 1 && !async);
2413
2414         if (num_syncs && async) {
2415                 u64 seqno;
2416
2417                 fence = kmalloc(sizeof(*fence), GFP_KERNEL);
2418                 if (!fence)
2419                         return -ENOMEM;
2420
2421                 seqno = e ? ++e->bind.fence_seqno : ++vm->async_ops.fence.seqno;
2422                 dma_fence_init(&fence->fence, &async_op_fence_ops,
2423                                &vm->async_ops.lock, e ? e->bind.fence_ctx :
2424                                vm->async_ops.fence.context, seqno);
2425
2426                 if (!xe_vm_no_dma_fences(vm)) {
2427                         fence->vm = vm;
2428                         fence->started = false;
2429                         init_waitqueue_head(&fence->wq);
2430                 }
2431         }
2432
2433         for (i = 0; i < num_ops_list; ++i) {
2434                 struct drm_gpuva_ops *__ops = ops[i];
2435                 struct drm_gpuva_op *__op;
2436
2437                 drm_gpuva_for_each_op(__op, __ops) {
2438                         struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2439                         bool first = !async_list;
2440
2441                         XE_BUG_ON(!first && !async);
2442
2443                         INIT_LIST_HEAD(&op->link);
2444                         if (first)
2445                                 async_list = ops_list;
2446                         list_add_tail(&op->link, async_list);
2447
2448                         if (first) {
2449                                 op->flags |= XE_VMA_OP_FIRST;
2450                                 op->num_syncs = num_syncs;
2451                                 op->syncs = syncs;
2452                         }
2453
2454                         op->engine = e;
2455
2456                         switch (op->base.op) {
2457                         case DRM_GPUVA_OP_MAP:
2458                         {
2459                                 struct xe_vma *vma;
2460
2461                                 vma = new_vma(vm, &op->base.map,
2462                                               op->tile_mask, op->map.read_only,
2463                                               op->map.is_null);
2464                                 if (IS_ERR(vma)) {
2465                                         err = PTR_ERR(vma);
2466                                         goto free_fence;
2467                                 }
2468
2469                                 op->map.vma = vma;
2470                                 break;
2471                         }
2472                         case DRM_GPUVA_OP_REMAP:
2473                                 if (op->base.remap.prev) {
2474                                         struct xe_vma *vma;
2475                                         bool read_only =
2476                                                 op->base.remap.unmap->va->flags &
2477                                                 XE_VMA_READ_ONLY;
2478                                         bool is_null =
2479                                                 op->base.remap.unmap->va->flags &
2480                                                 DRM_GPUVA_SPARSE;
2481
2482                                         vma = new_vma(vm, op->base.remap.prev,
2483                                                       op->tile_mask, read_only,
2484                                                       is_null);
2485                                         if (IS_ERR(vma)) {
2486                                                 err = PTR_ERR(vma);
2487                                                 goto free_fence;
2488                                         }
2489
2490                                         op->remap.prev = vma;
2491                                 }
2492
2493                                 if (op->base.remap.next) {
2494                                         struct xe_vma *vma;
2495                                         bool read_only =
2496                                                 op->base.remap.unmap->va->flags &
2497                                                 XE_VMA_READ_ONLY;
2498
2499                                         bool is_null =
2500                                                 op->base.remap.unmap->va->flags &
2501                                                 DRM_GPUVA_SPARSE;
2502
2503                                         vma = new_vma(vm, op->base.remap.next,
2504                                                       op->tile_mask, read_only,
2505                                                       is_null);
2506                                         if (IS_ERR(vma)) {
2507                                                 err = PTR_ERR(vma);
2508                                                 goto free_fence;
2509                                         }
2510
2511                                         op->remap.next = vma;
2512                                 }
2513
2514                                 /* XXX: Support not doing remaps */
2515                                 op->remap.start =
2516                                         xe_vma_start(gpuva_to_vma(op->base.remap.unmap->va));
2517                                 op->remap.range =
2518                                         xe_vma_size(gpuva_to_vma(op->base.remap.unmap->va));
2519                                 break;
2520                         case DRM_GPUVA_OP_UNMAP:
2521                                 op->unmap.start =
2522                                         xe_vma_start(gpuva_to_vma(op->base.unmap.va));
2523                                 op->unmap.range =
2524                                         xe_vma_size(gpuva_to_vma(op->base.unmap.va));
2525                                 break;
2526                         case DRM_GPUVA_OP_PREFETCH:
2527                                 /* Nothing to do */
2528                                 break;
2529                         default:
2530                                 XE_BUG_ON("NOT POSSIBLE");
2531                         }
2532
2533                         last_op = op;
2534                 }
2535
2536                 last_op->ops = __ops;
2537         }
2538
2539         if (!last_op)
2540                 return -ENODATA;
2541
2542         last_op->flags |= XE_VMA_OP_LAST;
2543         last_op->num_syncs = num_syncs;
2544         last_op->syncs = syncs;
2545         last_op->fence = fence;
2546
2547         return 0;
2548
2549 free_fence:
2550         kfree(fence);
2551         return err;
2552 }
2553
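/* Commit a parsed operation into the VM's GPUVA tree, with vm->lock held for writing. */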
2554 static int xe_vma_op_commit(struct xe_vm *vm, struct xe_vma_op *op)
2555 {
2556         int err = 0;
2557
2558         lockdep_assert_held_write(&vm->lock);
2559
2560         switch (op->base.op) {
2561         case DRM_GPUVA_OP_MAP:
2562                 err |= xe_vm_insert_vma(vm, op->map.vma);
2563                 break;
2564         case DRM_GPUVA_OP_REMAP:
2565                 prep_vma_destroy(vm, gpuva_to_vma(op->base.remap.unmap->va),
2566                                  true);
2567                 if (op->remap.prev)
2568                         err |= xe_vm_insert_vma(vm, op->remap.prev);
2569                 if (op->remap.next)
2570                         err |= xe_vm_insert_vma(vm, op->remap.next);
2571                 break;
2572         case DRM_GPUVA_OP_UNMAP:
2573                 prep_vma_destroy(vm, gpuva_to_vma(op->base.unmap.va), true);
2574                 break;
2575         case DRM_GPUVA_OP_PREFETCH:
2576                 /* Nothing to do */
2577                 break;
2578         default:
2579                 XE_BUG_ON("NOT POSSIBLE");
2580         }
2581
2582         op->flags |= XE_VMA_OP_COMMITTED;
2583         return err;
2584 }
2585
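/*
 * Execute a single operation against the GPU page tables.  The VM's (and, for
 * external BOs, the BO's) dma-resv locks are taken via ttm_eu, and userptr
 * invalidation (-EAGAIN) is handled by repinning and retrying.
 */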
2586 static int __xe_vma_op_execute(struct xe_vm *vm, struct xe_vma *vma,
2587                                struct xe_vma_op *op)
2588 {
2589         LIST_HEAD(objs);
2590         LIST_HEAD(dups);
2591         struct ttm_validate_buffer tv_bo, tv_vm;
2592         struct ww_acquire_ctx ww;
2593         struct xe_bo *vbo;
2594         int err;
2595
2596         lockdep_assert_held_write(&vm->lock);
2597
2598         xe_vm_tv_populate(vm, &tv_vm);
2599         list_add_tail(&tv_vm.head, &objs);
2600         vbo = xe_vma_bo(vma);
2601         if (vbo) {
2602                 /*
2603                  * An unbind can drop the last reference to the BO, but
2604                  * the BO is still needed for ttm_eu_backoff_reservation(),
2605                  * so take a reference here.
2606                  */
2607                 xe_bo_get(vbo);
2608
2609                 if (!vbo->vm) {
2610                         tv_bo.bo = &vbo->ttm;
2611                         tv_bo.num_shared = 1;
2612                         list_add(&tv_bo.head, &objs);
2613                 }
2614         }
2615
2616 again:
2617         err = ttm_eu_reserve_buffers(&ww, &objs, true, &dups);
2618         if (err) {
2619                 xe_bo_put(vbo);
2620                 return err;
2621         }
2622
2623         xe_vm_assert_held(vm);
2624         xe_bo_assert_held(xe_vma_bo(vma));
2625
2626         switch (op->base.op) {
2627         case DRM_GPUVA_OP_MAP:
2628                 err = xe_vm_bind(vm, vma, op->engine, xe_vma_bo(vma),
2629                                  op->syncs, op->num_syncs, op->fence,
2630                                  op->map.immediate || !xe_vm_in_fault_mode(vm),
2631                                  op->flags & XE_VMA_OP_FIRST,
2632                                  op->flags & XE_VMA_OP_LAST);
2633                 break;
2634         case DRM_GPUVA_OP_REMAP:
2635         {
2636                 bool prev = !!op->remap.prev;
2637                 bool next = !!op->remap.next;
2638
2639                 if (!op->remap.unmap_done) {
2640                         vm->async_ops.munmap_rebind_inflight = true;
2641                         if (prev || next)
2642                                 vma->gpuva.flags |= XE_VMA_FIRST_REBIND;
2643                         err = xe_vm_unbind(vm, vma, op->engine, op->syncs,
2644                                            op->num_syncs,
2645                                            !prev && !next ? op->fence : NULL,
2646                                            op->flags & XE_VMA_OP_FIRST,
2647                                            op->flags & XE_VMA_OP_LAST && !prev &&
2648                                            !next);
2649                         if (err)
2650                                 break;
2651                         op->remap.unmap_done = true;
2652                 }
2653
2654                 if (prev) {
2655                         op->remap.prev->gpuva.flags |= XE_VMA_LAST_REBIND;
2656                         err = xe_vm_bind(vm, op->remap.prev, op->engine,
2657                                          xe_vma_bo(op->remap.prev), op->syncs,
2658                                          op->num_syncs,
2659                                          !next ? op->fence : NULL, true, false,
2660                                          op->flags & XE_VMA_OP_LAST && !next);
2661                         op->remap.prev->gpuva.flags &= ~XE_VMA_LAST_REBIND;
2662                         if (err)
2663                                 break;
2664                         op->remap.prev = NULL;
2665                 }
2666
2667                 if (next) {
2668                         op->remap.next->gpuva.flags |= XE_VMA_LAST_REBIND;
2669                         err = xe_vm_bind(vm, op->remap.next, op->engine,
2670                                          xe_vma_bo(op->remap.next),
2671                                          op->syncs, op->num_syncs,
2672                                          op->fence, true, false,
2673                                          op->flags & XE_VMA_OP_LAST);
2674                         op->remap.next->gpuva.flags &= ~XE_VMA_LAST_REBIND;
2675                         if (err)
2676                                 break;
2677                         op->remap.next = NULL;
2678                 }
2679                 vm->async_ops.munmap_rebind_inflight = false;
2680
2681                 break;
2682         }
2683         case DRM_GPUVA_OP_UNMAP:
2684                 err = xe_vm_unbind(vm, vma, op->engine, op->syncs,
2685                                    op->num_syncs, op->fence,
2686                                    op->flags & XE_VMA_OP_FIRST,
2687                                    op->flags & XE_VMA_OP_LAST);
2688                 break;
2689         case DRM_GPUVA_OP_PREFETCH:
2690                 err = xe_vm_prefetch(vm, vma, op->engine, op->prefetch.region,
2691                                      op->syncs, op->num_syncs, op->fence,
2692                                      op->flags & XE_VMA_OP_FIRST,
2693                                      op->flags & XE_VMA_OP_LAST);
2694                 break;
2695         default:
2696                 XE_BUG_ON("NOT POSSIBLE");
2697         }
2698
2699         ttm_eu_backoff_reservation(&ww, &objs);
2700         if (err == -EAGAIN && xe_vma_is_userptr(vma)) {
2701                 lockdep_assert_held_write(&vm->lock);
2702                 err = xe_vma_userptr_pin_pages(vma);
2703                 if (!err)
2704                         goto again;
2705         }
2706         xe_bo_put(vbo);
2707
2708         if (err)
2709                 trace_xe_vma_fail(vma);
2710
2711         return err;
2712 }
2713
2714 static int xe_vma_op_execute(struct xe_vm *vm, struct xe_vma_op *op)
2715 {
2716         int ret = 0;
2717
2718         lockdep_assert_held_write(&vm->lock);
2719
2720 #ifdef TEST_VM_ASYNC_OPS_ERROR
2721         if (op->inject_error) {
2722                 op->inject_error = false;
2723                 return -ENOMEM;
2724         }
2725 #endif
2726
2727         switch (op->base.op) {
2728         case DRM_GPUVA_OP_MAP:
2729                 ret = __xe_vma_op_execute(vm, op->map.vma, op);
2730                 break;
2731         case DRM_GPUVA_OP_REMAP:
2732         {
2733                 struct xe_vma *vma;
2734
2735                 if (!op->remap.unmap_done)
2736                         vma = gpuva_to_vma(op->base.remap.unmap->va);
2737                 else if (op->remap.prev)
2738                         vma = op->remap.prev;
2739                 else
2740                         vma = op->remap.next;
2741
2742                 ret = __xe_vma_op_execute(vm, vma, op);
2743                 break;
2744         }
2745         case DRM_GPUVA_OP_UNMAP:
2746                 ret = __xe_vma_op_execute(vm, gpuva_to_vma(op->base.unmap.va),
2747                                           op);
2748                 break;
2749         case DRM_GPUVA_OP_PREFETCH:
2750                 ret = __xe_vma_op_execute(vm,
2751                                           gpuva_to_vma(op->base.prefetch.va),
2752                                           op);
2753                 break;
2754         default:
2755                 XE_BUG_ON("NOT POSSIBLE");
2756         }
2757
2758         return ret;
2759 }
2760
2761 static void xe_vma_op_cleanup(struct xe_vm *vm, struct xe_vma_op *op)
2762 {
2763         bool last = op->flags & XE_VMA_OP_LAST;
2764
2765         if (last) {
2766                 while (op->num_syncs--)
2767                         xe_sync_entry_cleanup(&op->syncs[op->num_syncs]);
2768                 kfree(op->syncs);
2769                 if (op->engine)
2770                         xe_engine_put(op->engine);
2771                 if (op->fence)
2772                         dma_fence_put(&op->fence->fence);
2773         }
2774         if (!list_empty(&op->link)) {
2775                 spin_lock_irq(&vm->async_ops.lock);
2776                 list_del(&op->link);
2777                 spin_unlock_irq(&vm->async_ops.lock);
2778         }
2779         if (op->ops)
2780                 drm_gpuva_ops_free(&vm->gpuvm, op->ops);
2781         if (last)
2782                 xe_vm_put(vm);
2783 }
2784
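/* Undo the side effects of a parsed (and possibly committed) operation. */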
2785 static void xe_vma_op_unwind(struct xe_vm *vm, struct xe_vma_op *op,
2786                              bool post_commit)
2787 {
2788         lockdep_assert_held_write(&vm->lock);
2789
2790         switch (op->base.op) {
2791         case DRM_GPUVA_OP_MAP:
2792                 if (op->map.vma) {
2793                         prep_vma_destroy(vm, op->map.vma, post_commit);
2794                         xe_vma_destroy_unlocked(op->map.vma);
2795                 }
2796                 break;
2797         case DRM_GPUVA_OP_UNMAP:
2798         {
2799                 struct xe_vma *vma = gpuva_to_vma(op->base.unmap.va);
2800
2801                 down_read(&vm->userptr.notifier_lock);
2802                 vma->gpuva.flags &= ~XE_VMA_DESTROYED;
2803                 up_read(&vm->userptr.notifier_lock);
2804                 if (post_commit)
2805                         xe_vm_insert_vma(vm, vma);
2806                 break;
2807         }
2808         case DRM_GPUVA_OP_REMAP:
2809         {
2810                 struct xe_vma *vma = gpuva_to_vma(op->base.remap.unmap->va);
2811
2812                 if (op->remap.prev) {
2813                         prep_vma_destroy(vm, op->remap.prev, post_commit);
2814                         xe_vma_destroy_unlocked(op->remap.prev);
2815                 }
2816                 if (op->remap.next) {
2817                         prep_vma_destroy(vm, op->remap.next, post_commit);
2818                         xe_vma_destroy_unlocked(op->remap.next);
2819                 }
2820                 down_read(&vm->userptr.notifier_lock);
2821                 vma->gpuva.flags &= ~XE_VMA_DESTROYED;
2822                 up_read(&vm->userptr.notifier_lock);
2823                 if (post_commit)
2824                         xe_vm_insert_vma(vm, vma);
2825                 break;
2826         }
2827         case DRM_GPUVA_OP_PREFETCH:
2828                 /* Nothing to do */
2829                 break;
2830         default:
2831                 XE_BUG_ON("NOT POSSIBLE");
2832         }
2833 }
2834
2835 static struct xe_vma_op *next_vma_op(struct xe_vm *vm)
2836 {
2837         return list_first_entry_or_null(&vm->async_ops.pending,
2838                                         struct xe_vma_op, link);
2839 }
2840
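/*
 * Worker that drains vm->async_ops.pending: each queued operation is executed
 * under vm->lock, or, once the VM is closed, merely flushed and its fence
 * signalled.  On failure the error is latched with vm_set_async_error() and
 * processing stops until the error is resolved.
 */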
2841 static void xe_vma_op_work_func(struct work_struct *w)
2842 {
2843         struct xe_vm *vm = container_of(w, struct xe_vm, async_ops.work);
2844
2845         for (;;) {
2846                 struct xe_vma_op *op;
2847                 int err;
2848
2849                 if (vm->async_ops.error && !xe_vm_is_closed(vm))
2850                         break;
2851
2852                 spin_lock_irq(&vm->async_ops.lock);
2853                 op = next_vma_op(vm);
2854                 spin_unlock_irq(&vm->async_ops.lock);
2855
2856                 if (!op)
2857                         break;
2858
2859                 if (!xe_vm_is_closed(vm)) {
2860                         down_write(&vm->lock);
2861                         err = xe_vma_op_execute(vm, op);
2862                         if (err) {
2863                                 drm_warn(&vm->xe->drm,
2864                                          "Async VM op(%d) failed with %d",
2865                                          op->base.op, err);
2866                                 vm_set_async_error(vm, err);
2867                                 up_write(&vm->lock);
2868
2869                                 if (vm->async_ops.error_capture.addr)
2870                                         vm_error_capture(vm, err, 0, 0, 0);
2871                                 break;
2872                         }
2873                         up_write(&vm->lock);
2874                 } else {
2875                         struct xe_vma *vma;
2876
2877                         switch (op->base.op) {
2878                         case DRM_GPUVA_OP_REMAP:
2879                                 vma = gpuva_to_vma(op->base.remap.unmap->va);
2880                                 trace_xe_vma_flush(vma);
2881
2882                                 down_write(&vm->lock);
2883                                 xe_vma_destroy_unlocked(vma);
2884                                 up_write(&vm->lock);
2885                                 break;
2886                         case DRM_GPUVA_OP_UNMAP:
2887                                 vma = gpuva_to_vma(op->base.unmap.va);
2888                                 trace_xe_vma_flush(vma);
2889
2890                                 down_write(&vm->lock);
2891                                 xe_vma_destroy_unlocked(vma);
2892                                 up_write(&vm->lock);
2893                                 break;
2894                         default:
2895                                 /* Nothing to do */
2896                                 break;
2897                         }
2898
2899                         if (op->fence && !test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
2900                                                    &op->fence->fence.flags)) {
2901                                 if (!xe_vm_no_dma_fences(vm)) {
2902                                         op->fence->started = true;
2903                                         wake_up_all(&op->fence->wq);
2904                                 }
2905                                 dma_fence_signal(&op->fence->fence);
2906                         }
2907                 }
2908
2909                 xe_vma_op_cleanup(vm, op);
2910         }
2911 }
2912
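/*
 * Commit the parsed operations and either execute the final operation
 * synchronously or splice the whole list onto the async worker's queue; on
 * failure every committed operation is unwound.
 */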
2913 static int vm_bind_ioctl_ops_commit(struct xe_vm *vm,
2914                                     struct list_head *ops_list, bool async)
2915 {
2916         struct xe_vma_op *op, *last_op, *next;
2917         int err;
2918
2919         lockdep_assert_held_write(&vm->lock);
2920
2921         list_for_each_entry(op, ops_list, link) {
2922                 last_op = op;
2923                 err = xe_vma_op_commit(vm, op);
2924                 if (err)
2925                         goto unwind;
2926         }
2927
2928         if (!async) {
2929                 err = xe_vma_op_execute(vm, last_op);
2930                 if (err)
2931                         goto unwind;
2932                 xe_vma_op_cleanup(vm, last_op);
2933         } else {
2934                 int i;
2935                 bool installed = false;
2936
2937                 for (i = 0; i < last_op->num_syncs; i++)
2938                         installed |= xe_sync_entry_signal(&last_op->syncs[i],
2939                                                           NULL,
2940                                                           &last_op->fence->fence);
2941                 if (!installed && last_op->fence)
2942                         dma_fence_signal(&last_op->fence->fence);
2943
2944                 spin_lock_irq(&vm->async_ops.lock);
2945                 list_splice_tail(ops_list, &vm->async_ops.pending);
2946                 spin_unlock_irq(&vm->async_ops.lock);
2947
2948                 if (!vm->async_ops.error)
2949                         queue_work(system_unbound_wq, &vm->async_ops.work);
2950         }
2951
2952         return 0;
2953
2954 unwind:
2955         list_for_each_entry_reverse(op, ops_list, link)
2956                 xe_vma_op_unwind(vm, op, op->flags & XE_VMA_OP_COMMITTED);
2957         list_for_each_entry_safe(op, next, ops_list, link)
2958                 xe_vma_op_cleanup(vm, op);
2959
2960         return err;
2961 }
2962
2963 /*
2964  * Unwind operations list, called after a failure of vm_bind_ioctl_ops_create or
2965  * vm_bind_ioctl_ops_parse.
2966  */
2967 static void vm_bind_ioctl_ops_unwind(struct xe_vm *vm,
2968                                      struct drm_gpuva_ops **ops,
2969                                      int num_ops_list)
2970 {
2971         int i;
2972
2973         for (i = 0; i < num_ops_list; ++i) {
2974                 struct drm_gpuva_ops *__ops = ops[i];
2975                 struct drm_gpuva_op *__op;
2976
2977                 if (!__ops)
2978                         continue;
2979
2980                 drm_gpuva_for_each_op(__op, __ops) {
2981                         struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2982
2983                         xe_vma_op_unwind(vm, op, false);
2984                 }
2985         }
2986 }
2987
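/*
 * VM_BIND_OP() extracts the opcode from the low 16 bits of the 'op' field,
 * so 0xffff is included in the supported mask; the opcode itself is
 * range-checked separately below. TEST_VM_ASYNC_OPS_ERROR additionally
 * accepts the error-injection flag used to exercise the async error path.
 */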
2988 #ifdef TEST_VM_ASYNC_OPS_ERROR
2989 #define SUPPORTED_FLAGS \
2990         (FORCE_ASYNC_OP_ERROR | XE_VM_BIND_FLAG_ASYNC | \
2991          XE_VM_BIND_FLAG_READONLY | XE_VM_BIND_FLAG_IMMEDIATE | \
2992          XE_VM_BIND_FLAG_NULL | 0xffff)
2993 #else
2994 #define SUPPORTED_FLAGS \
2995         (XE_VM_BIND_FLAG_ASYNC | XE_VM_BIND_FLAG_READONLY | \
2996          XE_VM_BIND_FLAG_IMMEDIATE | XE_VM_BIND_FLAG_NULL | 0xffff)
2997 #endif
2998 #define XE_64K_PAGE_MASK 0xffffull
2999
3000 #define MAX_BINDS       512     /* FIXME: Picking random upper limit */
3001
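/*
 * Validate the bind ioctl arguments. A single bind is embedded in the ioctl
 * struct itself; multiple binds are copied from the user-supplied vector.
 * Returns the bind op array in @bind_ops and whether the binds are
 * asynchronous in @async.
 */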
3002 static int vm_bind_ioctl_check_args(struct xe_device *xe,
3003                                     struct drm_xe_vm_bind *args,
3004                                     struct drm_xe_vm_bind_op **bind_ops,
3005                                     bool *async)
3006 {
3007         int err;
3008         int i;
3009
3010         if (XE_IOCTL_ERR(xe, args->extensions) ||
3011             XE_IOCTL_ERR(xe, !args->num_binds) ||
3012             XE_IOCTL_ERR(xe, args->num_binds > MAX_BINDS))
3013                 return -EINVAL;
3014
3015         if (args->num_binds > 1) {
3016                 u64 __user *bind_user =
3017                         u64_to_user_ptr(args->vector_of_binds);
3018
3019                 *bind_ops = kmalloc_array(args->num_binds,
3020                                           sizeof(struct drm_xe_vm_bind_op), GFP_KERNEL);
3021                 if (!*bind_ops)
3022                         return -ENOMEM;
3023
3024                 err = copy_from_user(*bind_ops, bind_user,
3025                                      sizeof(struct drm_xe_vm_bind_op) *
3026                                      args->num_binds);
3027                 if (XE_IOCTL_ERR(xe, err)) {
3028                         err = -EFAULT;
3029                         goto free_bind_ops;
3030                 }
3031         } else {
3032                 *bind_ops = &args->bind;
3033         }
3034
3035         for (i = 0; i < args->num_binds; ++i) {
3036                 u64 range = (*bind_ops)[i].range;
3037                 u64 addr = (*bind_ops)[i].addr;
3038                 u32 op = (*bind_ops)[i].op;
3039                 u32 obj = (*bind_ops)[i].obj;
3040                 u64 obj_offset = (*bind_ops)[i].obj_offset;
3041                 u32 region = (*bind_ops)[i].region;
3042                 bool is_null = op & XE_VM_BIND_FLAG_NULL;
3043
3044                 if (i == 0) {
3045                         *async = !!(op & XE_VM_BIND_FLAG_ASYNC);
3046                 } else if (XE_IOCTL_ERR(xe, !*async) ||
3047                            XE_IOCTL_ERR(xe, !(op & XE_VM_BIND_FLAG_ASYNC)) ||
3048                            XE_IOCTL_ERR(xe, VM_BIND_OP(op) ==
3049                                         XE_VM_BIND_OP_RESTART)) {
3050                         err = -EINVAL;
3051                         goto free_bind_ops;
3052                 }
3053
3054                 if (XE_IOCTL_ERR(xe, !*async &&
3055                                  VM_BIND_OP(op) == XE_VM_BIND_OP_UNMAP_ALL)) {
3056                         err = -EINVAL;
3057                         goto free_bind_ops;
3058                 }
3059
3060                 if (XE_IOCTL_ERR(xe, !*async &&
3061                                  VM_BIND_OP(op) == XE_VM_BIND_OP_PREFETCH)) {
3062                         err = -EINVAL;
3063                         goto free_bind_ops;
3064                 }
3065
3066                 if (XE_IOCTL_ERR(xe, VM_BIND_OP(op) >
3067                                  XE_VM_BIND_OP_PREFETCH) ||
3068                     XE_IOCTL_ERR(xe, op & ~SUPPORTED_FLAGS) ||
3069                     XE_IOCTL_ERR(xe, obj && is_null) ||
3070                     XE_IOCTL_ERR(xe, obj_offset && is_null) ||
3071                     XE_IOCTL_ERR(xe, VM_BIND_OP(op) != XE_VM_BIND_OP_MAP &&
3072                                  is_null) ||
3073                     XE_IOCTL_ERR(xe, !obj &&
3074                                  VM_BIND_OP(op) == XE_VM_BIND_OP_MAP &&
3075                                  !is_null) ||
3076                     XE_IOCTL_ERR(xe, !obj &&
3077                                  VM_BIND_OP(op) == XE_VM_BIND_OP_UNMAP_ALL) ||
3078                     XE_IOCTL_ERR(xe, addr &&
3079                                  VM_BIND_OP(op) == XE_VM_BIND_OP_UNMAP_ALL) ||
3080                     XE_IOCTL_ERR(xe, range &&
3081                                  VM_BIND_OP(op) == XE_VM_BIND_OP_UNMAP_ALL) ||
3082                     XE_IOCTL_ERR(xe, obj &&
3083                                  VM_BIND_OP(op) == XE_VM_BIND_OP_MAP_USERPTR) ||
3084                     XE_IOCTL_ERR(xe, obj &&
3085                                  VM_BIND_OP(op) == XE_VM_BIND_OP_PREFETCH) ||
3086                     XE_IOCTL_ERR(xe, region &&
3087                                  VM_BIND_OP(op) != XE_VM_BIND_OP_PREFETCH) ||
3088                     XE_IOCTL_ERR(xe, !(BIT(region) &
3089                                        xe->info.mem_region_mask)) ||
3090                     XE_IOCTL_ERR(xe, obj &&
3091                                  VM_BIND_OP(op) == XE_VM_BIND_OP_UNMAP)) {
3092                         err = -EINVAL;
3093                         goto free_bind_ops;
3094                 }
3095
3096                 if (XE_IOCTL_ERR(xe, obj_offset & ~PAGE_MASK) ||
3097                     XE_IOCTL_ERR(xe, addr & ~PAGE_MASK) ||
3098                     XE_IOCTL_ERR(xe, range & ~PAGE_MASK) ||
3099                     XE_IOCTL_ERR(xe, !range && VM_BIND_OP(op) !=
3100                                  XE_VM_BIND_OP_RESTART &&
3101                                  VM_BIND_OP(op) != XE_VM_BIND_OP_UNMAP_ALL)) {
3102                         err = -EINVAL;
3103                         goto free_bind_ops;
3104                 }
3105         }
3106
3107         return 0;
3108
3109 free_bind_ops:
3110         if (args->num_binds > 1)
3111                 kfree(*bind_ops);
3112         return err;
3113 }
3114
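/*
 * Main VM bind ioctl: validate the arguments, look up the engine, VM, BOs
 * and syncs, build GPUVA ops for every bind, parse them into a single VMA
 * operation list and commit it (executed inline for synchronous binds,
 * queued to the async worker otherwise).
 */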
3115 int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
3116 {
3117         struct xe_device *xe = to_xe_device(dev);
3118         struct xe_file *xef = to_xe_file(file);
3119         struct drm_xe_vm_bind *args = data;
3120         struct drm_xe_sync __user *syncs_user;
3121         struct xe_bo **bos = NULL;
3122         struct drm_gpuva_ops **ops = NULL;
3123         struct xe_vm *vm;
3124         struct xe_engine *e = NULL;
3125         u32 num_syncs;
3126         struct xe_sync_entry *syncs = NULL;
3127         struct drm_xe_vm_bind_op *bind_ops;
3128         LIST_HEAD(ops_list);
3129         bool async;
3130         int err;
3131         int i;
3132
3133         err = vm_bind_ioctl_check_args(xe, args, &bind_ops, &async);
3134         if (err)
3135                 return err;
3136
3137         if (args->engine_id) {
3138                 e = xe_engine_lookup(xef, args->engine_id);
3139                 if (XE_IOCTL_ERR(xe, !e)) {
3140                         err = -ENOENT;
3141                         goto free_objs;
3142                 }
3143
3144                 if (XE_IOCTL_ERR(xe, !(e->flags & ENGINE_FLAG_VM))) {
3145                         err = -EINVAL;
3146                         goto put_engine;
3147                 }
3148         }
3149
3150         vm = xe_vm_lookup(xef, args->vm_id);
3151         if (XE_IOCTL_ERR(xe, !vm)) {
3152                 err = -EINVAL;
3153                 goto put_engine;
3154         }
3155
3156         err = down_write_killable(&vm->lock);
3157         if (err)
3158                 goto put_vm;
3159
3160         if (XE_IOCTL_ERR(xe, xe_vm_is_closed_or_banned(vm))) {
3161                 err = -ENOENT;
3162                 goto release_vm_lock;
3163         }
3164
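	/*
	 * A RESTART op clears a previously reported async bind error and
	 * re-kicks the worker so the remaining pending operations can run;
	 * it may not carry syncs of its own.
	 */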
3165         if (VM_BIND_OP(bind_ops[0].op) == XE_VM_BIND_OP_RESTART) {
3166                 if (XE_IOCTL_ERR(xe, !(vm->flags & XE_VM_FLAG_ASYNC_BIND_OPS)))
3167                         err = -EOPNOTSUPP;
3168                 if (XE_IOCTL_ERR(xe, !err && args->num_syncs))
3169                         err = -EINVAL;
3170                 if (XE_IOCTL_ERR(xe, !err && !vm->async_ops.error))
3171                         err = -EPROTO;
3172
3173                 if (!err) {
3174                         trace_xe_vm_restart(vm);
3175                         vm_set_async_error(vm, 0);
3176
3177                         queue_work(system_unbound_wq, &vm->async_ops.work);
3178
3179                         /* Rebinds may have been blocked, give worker a kick */
3180                         if (xe_vm_in_compute_mode(vm))
3181                                 xe_vm_queue_rebind_worker(vm);
3182                 }
3183
3184                 goto release_vm_lock;
3185         }
3186
3187         if (XE_IOCTL_ERR(xe, !vm->async_ops.error &&
3188                          async != !!(vm->flags & XE_VM_FLAG_ASYNC_BIND_OPS))) {
3189                 err = -EOPNOTSUPP;
3190                 goto release_vm_lock;
3191         }
3192
3193         for (i = 0; i < args->num_binds; ++i) {
3194                 u64 range = bind_ops[i].range;
3195                 u64 addr = bind_ops[i].addr;
3196
3197                 if (XE_IOCTL_ERR(xe, range > vm->size) ||
3198                     XE_IOCTL_ERR(xe, addr > vm->size - range)) {
3199                         err = -EINVAL;
3200                         goto release_vm_lock;
3201                 }
3202
3203                 if (bind_ops[i].tile_mask) {
3204                         u64 valid_tiles = BIT(xe->info.tile_count) - 1;
3205
3206                         if (XE_IOCTL_ERR(xe, bind_ops[i].tile_mask &
3207                                          ~valid_tiles)) {
3208                                 err = -EINVAL;
3209                                 goto release_vm_lock;
3210                         }
3211                 }
3212         }
3213
3214         bos = kcalloc(args->num_binds, sizeof(*bos), GFP_KERNEL);
3215         if (!bos) {
3216                 err = -ENOMEM;
3217                 goto release_vm_lock;
3218         }
3219
3220         ops = kcalloc(args->num_binds, sizeof(*ops), GFP_KERNEL);
3221         if (!ops) {
3222                 err = -ENOMEM;
3223                 goto release_vm_lock;
3224         }
3225
3226         for (i = 0; i < args->num_binds; ++i) {
3227                 struct drm_gem_object *gem_obj;
3228                 u64 range = bind_ops[i].range;
3229                 u64 addr = bind_ops[i].addr;
3230                 u32 obj = bind_ops[i].obj;
3231                 u64 obj_offset = bind_ops[i].obj_offset;
3232
3233                 if (!obj)
3234                         continue;
3235
3236                 gem_obj = drm_gem_object_lookup(file, obj);
3237                 if (XE_IOCTL_ERR(xe, !gem_obj)) {
3238                         err = -ENOENT;
3239                         goto put_obj;
3240                 }
3241                 bos[i] = gem_to_xe_bo(gem_obj);
3242
3243                 if (XE_IOCTL_ERR(xe, range > bos[i]->size) ||
3244                     XE_IOCTL_ERR(xe, obj_offset >
3245                                  bos[i]->size - range)) {
3246                         err = -EINVAL;
3247                         goto put_obj;
3248                 }
3249
3250                 if (bos[i]->flags & XE_BO_INTERNAL_64K) {
3251                         if (XE_IOCTL_ERR(xe, obj_offset &
3252                                          XE_64K_PAGE_MASK) ||
3253                             XE_IOCTL_ERR(xe, addr & XE_64K_PAGE_MASK) ||
3254                             XE_IOCTL_ERR(xe, range & XE_64K_PAGE_MASK)) {
3255                                 err = -EINVAL;
3256                                 goto put_obj;
3257                         }
3258                 }
3259         }
3260
3261         if (args->num_syncs) {
3262                 syncs = kcalloc(args->num_syncs, sizeof(*syncs), GFP_KERNEL);
3263                 if (!syncs) {
3264                         err = -ENOMEM;
3265                         goto put_obj;
3266                 }
3267         }
3268
3269         syncs_user = u64_to_user_ptr(args->syncs);
3270         for (num_syncs = 0; num_syncs < args->num_syncs; num_syncs++) {
3271                 err = xe_sync_entry_parse(xe, xef, &syncs[num_syncs],
3272                                           &syncs_user[num_syncs], false,
3273                                           xe_vm_no_dma_fences(vm));
3274                 if (err)
3275                         goto free_syncs;
3276         }
3277
3278         /* Do some error checking first to make the unwind easier */
3279         for (i = 0; i < args->num_binds; ++i) {
3280                 u64 range = bind_ops[i].range;
3281                 u64 addr = bind_ops[i].addr;
3282                 u32 op = bind_ops[i].op;
3283
3284                 err = vm_bind_ioctl_lookup_vma(vm, bos[i], addr, range, op);
3285                 if (err)
3286                         goto free_syncs;
3287         }
3288
3289         for (i = 0; i < args->num_binds; ++i) {
3290                 u64 range = bind_ops[i].range;
3291                 u64 addr = bind_ops[i].addr;
3292                 u32 op = bind_ops[i].op;
3293                 u64 obj_offset = bind_ops[i].obj_offset;
3294                 u64 tile_mask = bind_ops[i].tile_mask;
3295                 u32 region = bind_ops[i].region;
3296
3297                 ops[i] = vm_bind_ioctl_ops_create(vm, bos[i], obj_offset,
3298                                                   addr, range, op, tile_mask,
3299                                                   region);
3300                 if (IS_ERR(ops[i])) {
3301                         err = PTR_ERR(ops[i]);
3302                         ops[i] = NULL;
3303                         goto unwind_ops;
3304                 }
3305         }
3306
3307         err = vm_bind_ioctl_ops_parse(vm, e, ops, args->num_binds,
3308                                       syncs, num_syncs, &ops_list, async);
3309         if (err)
3310                 goto unwind_ops;
3311
3312         err = vm_bind_ioctl_ops_commit(vm, &ops_list, async);
3313         up_write(&vm->lock);
3314
3315         for (i = 0; i < args->num_binds; ++i)
3316                 xe_bo_put(bos[i]);
3317
3318         kfree(bos);
3319         kfree(ops);
3320         if (args->num_binds > 1)
3321                 kfree(bind_ops);
3322
3323         return err;
3324
3325 unwind_ops:
3326         vm_bind_ioctl_ops_unwind(vm, ops, args->num_binds);
3327 free_syncs:
3328         while (num_syncs--)
3329                 xe_sync_entry_cleanup(&syncs[num_syncs]);
3330
3331         kfree(syncs);
3332 put_obj:
3333         for (i = 0; i < args->num_binds; ++i)
3334                 xe_bo_put(bos[i]);
3335 release_vm_lock:
3336         up_write(&vm->lock);
3337 put_vm:
3338         xe_vm_put(vm);
3339 put_engine:
3340         if (e)
3341                 xe_engine_put(e);
3342 free_objs:
3343         kfree(bos);
3344         kfree(ops);
3345         if (args->num_binds > 1)
3346                 kfree(bind_ops);
3347         return err == -ENODATA ? 0 : err;
3348 }
3349
3350 /*
3351  * XXX: Using the TTM wrappers for now, likely can call into dma-resv code
3352  * directly to optimize. Also this likely should be an inline function.
3353  */
3354 int xe_vm_lock(struct xe_vm *vm, struct ww_acquire_ctx *ww,
3355                int num_resv, bool intr)
3356 {
3357         struct ttm_validate_buffer tv_vm;
3358         LIST_HEAD(objs);
3359         LIST_HEAD(dups);
3360
3361         XE_BUG_ON(!ww);
3362
3363         tv_vm.num_shared = num_resv;
3364         tv_vm.bo = xe_vm_ttm_bo(vm);
3365         list_add_tail(&tv_vm.head, &objs);
3366
3367         return ttm_eu_reserve_buffers(ww, &objs, intr, &dups);
3368 }
3369
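/*
 * Counterpart to xe_vm_lock(): drop the VM reservation lock and finish the
 * ww acquire context.
 */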
3370 void xe_vm_unlock(struct xe_vm *vm, struct ww_acquire_ctx *ww)
3371 {
3372         dma_resv_unlock(xe_vm_resv(vm));
3373         ww_acquire_fini(ww);
3374 }
3375
3376 /**
3377  * xe_vm_invalidate_vma - invalidate GPU mappings for VMA without a lock
3378  * @vma: VMA to invalidate
3379  *
3380  * Walks the page-table leaves, zeroing the entries owned by this VMA,
3381  * invalidates the TLBs, and blocks until the TLB invalidation has
3382  * completed.
3383  *
3384  * Returns 0 for success, negative error code otherwise.
3385  */
3386 int xe_vm_invalidate_vma(struct xe_vma *vma)
3387 {
3388         struct xe_device *xe = xe_vma_vm(vma)->xe;
3389         struct xe_tile *tile;
3390         u32 tile_needs_invalidate = 0;
3391         int seqno[XE_MAX_TILES_PER_DEVICE];
3392         u8 id;
3393         int ret;
3394
3395         XE_BUG_ON(!xe_vm_in_fault_mode(xe_vma_vm(vma)));
3396         XE_WARN_ON(xe_vma_is_null(vma));
3397         trace_xe_vma_usm_invalidate(vma);
3398
3399         /* Check that we don't race with page-table updates */
3400         if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
3401                 if (xe_vma_is_userptr(vma)) {
3402                         WARN_ON_ONCE(!mmu_interval_check_retry
3403                                      (&vma->userptr.notifier,
3404                                       vma->userptr.notifier_seq));
3405                         WARN_ON_ONCE(!dma_resv_test_signaled(xe_vm_resv(xe_vma_vm(vma)),
3406                                                              DMA_RESV_USAGE_BOOKKEEP));
3407
3408                 } else {
3409                         xe_bo_assert_held(xe_vma_bo(vma));
3410                 }
3411         }
3412
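	/*
	 * Zap the PTEs covering this VMA on every tile and issue a TLB
	 * invalidation for each tile that actually had entries; the second
	 * loop below waits for those invalidations to complete.
	 */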
3413         for_each_tile(tile, xe, id) {
3414                 if (xe_pt_zap_ptes(tile, vma)) {
3415                         tile_needs_invalidate |= BIT(id);
3416                         xe_device_wmb(xe);
3417                         /*
3418                          * FIXME: We potentially need to invalidate multiple
3419                          * GTs within the tile
3420                          */
3421                         seqno[id] = xe_gt_tlb_invalidation_vma(tile->primary_gt, NULL, vma);
3422                         if (seqno[id] < 0)
3423                                 return seqno[id];
3424                 }
3425         }
3426
3427         for_each_tile(tile, xe, id) {
3428                 if (tile_needs_invalidate & BIT(id)) {
3429                         ret = xe_gt_tlb_invalidation_wait(tile->primary_gt, seqno[id]);
3430                         if (ret < 0)
3431                                 return ret;
3432                 }
3433         }
3434
3435         vma->usm.tile_invalidated = vma->tile_mask;
3436
3437         return 0;
3438 }
3439
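/*
 * Debug helper: dump the VM page-table root plus every VMA (address range,
 * size, first backing address and placement) to the given printer.
 */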
3440 int xe_analyze_vm(struct drm_printer *p, struct xe_vm *vm, int gt_id)
3441 {
3442         struct drm_gpuva *gpuva;
3443         bool is_vram;
3444         u64 addr;
3445
3446         if (!down_read_trylock(&vm->lock)) {
3447                 drm_printf(p, " Failed to acquire VM lock to dump capture\n");
3448                 return 0;
3449         }
3450         if (vm->pt_root[gt_id]) {
3451                 addr = xe_bo_addr(vm->pt_root[gt_id]->bo, 0, XE_PAGE_SIZE,
3452                                   &is_vram);
3453                 drm_printf(p, " VM root: A:0x%llx %s\n", addr, is_vram ? "VRAM" : "SYS");
3454         }
3455
3456         drm_gpuvm_for_each_va(gpuva, &vm->gpuvm) {
3457                 struct xe_vma *vma = gpuva_to_vma(gpuva);
3458                 bool is_userptr = xe_vma_is_userptr(vma);
3459                 bool is_null = xe_vma_is_null(vma);
3460
3461                 if (is_null) {
3462                         addr = 0;
3463                 } else if (is_userptr) {
3464                         struct xe_res_cursor cur;
3465
3466                         if (vma->userptr.sg) {
3467                                 xe_res_first_sg(vma->userptr.sg, 0, XE_PAGE_SIZE,
3468                                                 &cur);
3469                                 addr = xe_res_dma(&cur);
3470                         } else {
3471                                 addr = 0;
3472                         }
3473                 } else {
3474                         addr = __xe_bo_addr(xe_vma_bo(vma), 0, XE_PAGE_SIZE, &is_vram);
3475                 }
3476                 drm_printf(p, " [%016llx-%016llx] S:0x%016llx A:%016llx %s\n",
3477                            xe_vma_start(vma), xe_vma_end(vma) - 1,
3478                            xe_vma_size(vma),
3479                            addr, is_null ? "NULL" : is_userptr ? "USR" :
3480                            is_vram ? "VRAM" : "SYS");
3481         }
3482         up_read(&vm->lock);
3483
3484         return 0;
3485 }