drm/xe: Fixup unwind on VM ops errors
drivers/gpu/drm/xe/xe_vm.c
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2021 Intel Corporation
4  */
5
6 #include "xe_vm.h"
7
8 #include <linux/dma-fence-array.h>
9
10 #include <drm/drm_print.h>
11 #include <drm/ttm/ttm_execbuf_util.h>
12 #include <drm/ttm/ttm_tt.h>
13 #include <drm/xe_drm.h>
14 #include <linux/delay.h>
15 #include <linux/kthread.h>
16 #include <linux/mm.h>
17 #include <linux/swap.h>
18
19 #include "xe_bo.h"
20 #include "xe_device.h"
21 #include "xe_exec_queue.h"
22 #include "xe_gt.h"
23 #include "xe_gt_pagefault.h"
24 #include "xe_gt_tlb_invalidation.h"
25 #include "xe_migrate.h"
26 #include "xe_pm.h"
27 #include "xe_preempt_fence.h"
28 #include "xe_pt.h"
29 #include "xe_res_cursor.h"
30 #include "xe_sync.h"
31 #include "xe_trace.h"
32 #include "generated/xe_wa_oob.h"
33 #include "xe_wa.h"
34
35 #define TEST_VM_ASYNC_OPS_ERROR
36
37 /**
38  * xe_vma_userptr_check_repin() - Advisory check for repin needed
39  * @vma: The userptr vma
40  *
41  * Check if the userptr vma has been invalidated since last successful
42  * repin. The check is advisory only and the function can be called
43  * without the vm->userptr.notifier_lock held. There is no guarantee that the
44  * vma userptr will remain valid after a lockless check, so typically
45  * the call needs to be followed by a proper check under the notifier_lock.
46  *
47  * Return: 0 if userptr vma is valid, -EAGAIN otherwise; repin recommended.
48  */
49 int xe_vma_userptr_check_repin(struct xe_vma *vma)
50 {
51         return mmu_interval_check_retry(&vma->userptr.notifier,
52                                         vma->userptr.notifier_seq) ?
53                 -EAGAIN : 0;
54 }
55
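/*
 * Pin the pages backing a userptr vma and rebuild its DMA mapping.
 *
 * Called with vm->lock held. If the MMU interval notifier shows the range
 * has been invalidated since the last successful pin, the old sg table is
 * unmapped and freed, the pages are re-pinned with get_user_pages_fast()
 * (borrowing the notifier's mm when running from a kthread), and a new sg
 * table is built and DMA-mapped. The page references are dropped again once
 * the sg table is set up, and the whole sequence is retried if another
 * invalidation raced with the pin.
 */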
56 int xe_vma_userptr_pin_pages(struct xe_vma *vma)
57 {
58         struct xe_vm *vm = xe_vma_vm(vma);
59         struct xe_device *xe = vm->xe;
60         const unsigned long num_pages = xe_vma_size(vma) >> PAGE_SHIFT;
61         struct page **pages;
62         bool in_kthread = !current->mm;
63         unsigned long notifier_seq;
64         int pinned, ret, i;
65         bool read_only = xe_vma_read_only(vma);
66
67         lockdep_assert_held(&vm->lock);
68         XE_WARN_ON(!xe_vma_is_userptr(vma));
69 retry:
70         if (vma->gpuva.flags & XE_VMA_DESTROYED)
71                 return 0;
72
73         notifier_seq = mmu_interval_read_begin(&vma->userptr.notifier);
74         if (notifier_seq == vma->userptr.notifier_seq)
75                 return 0;
76
77         pages = kvmalloc_array(num_pages, sizeof(*pages), GFP_KERNEL);
78         if (!pages)
79                 return -ENOMEM;
80
81         if (vma->userptr.sg) {
82                 dma_unmap_sgtable(xe->drm.dev,
83                                   vma->userptr.sg,
84                                   read_only ? DMA_TO_DEVICE :
85                                   DMA_BIDIRECTIONAL, 0);
86                 sg_free_table(vma->userptr.sg);
87                 vma->userptr.sg = NULL;
88         }
89
90         pinned = ret = 0;
91         if (in_kthread) {
92                 if (!mmget_not_zero(vma->userptr.notifier.mm)) {
93                         ret = -EFAULT;
94                         goto mm_closed;
95                 }
96                 kthread_use_mm(vma->userptr.notifier.mm);
97         }
98
99         while (pinned < num_pages) {
100                 ret = get_user_pages_fast(xe_vma_userptr(vma) +
101                                           pinned * PAGE_SIZE,
102                                           num_pages - pinned,
103                                           read_only ? 0 : FOLL_WRITE,
104                                           &pages[pinned]);
105                 if (ret < 0) {
106                         if (in_kthread)
107                                 ret = 0;
108                         break;
109                 }
110
111                 pinned += ret;
112                 ret = 0;
113         }
114
115         if (in_kthread) {
116                 kthread_unuse_mm(vma->userptr.notifier.mm);
117                 mmput(vma->userptr.notifier.mm);
118         }
119 mm_closed:
120         if (ret)
121                 goto out;
122
123         ret = sg_alloc_table_from_pages_segment(&vma->userptr.sgt, pages,
124                                                 pinned, 0,
125                                                 (u64)pinned << PAGE_SHIFT,
126                                                 xe_sg_segment_size(xe->drm.dev),
127                                                 GFP_KERNEL);
128         if (ret) {
129                 vma->userptr.sg = NULL;
130                 goto out;
131         }
132         vma->userptr.sg = &vma->userptr.sgt;
133
134         ret = dma_map_sgtable(xe->drm.dev, vma->userptr.sg,
135                               read_only ? DMA_TO_DEVICE :
136                               DMA_BIDIRECTIONAL,
137                               DMA_ATTR_SKIP_CPU_SYNC |
138                               DMA_ATTR_NO_KERNEL_MAPPING);
139         if (ret) {
140                 sg_free_table(vma->userptr.sg);
141                 vma->userptr.sg = NULL;
142                 goto out;
143         }
144
145         for (i = 0; i < pinned; ++i) {
146                 if (!read_only) {
147                         lock_page(pages[i]);
148                         set_page_dirty(pages[i]);
149                         unlock_page(pages[i]);
150                 }
151
152                 mark_page_accessed(pages[i]);
153         }
154
155 out:
156         release_pages(pages, pinned);
157         kvfree(pages);
158
159         if (!(ret < 0)) {
160                 vma->userptr.notifier_seq = notifier_seq;
161                 if (xe_vma_userptr_check_repin(vma) == -EAGAIN)
162                         goto retry;
163         }
164
165         return ret < 0 ? ret : 0;
166 }
167
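/*
 * Return true if any compute exec queue on the vm either has no preempt
 * fence installed or already has signaling enabled on it, i.e. a
 * preemption is pending or in flight.
 */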
168 static bool preempt_fences_waiting(struct xe_vm *vm)
169 {
170         struct xe_exec_queue *q;
171
172         lockdep_assert_held(&vm->lock);
173         xe_vm_assert_held(vm);
174
175         list_for_each_entry(q, &vm->preempt.exec_queues, compute.link) {
176                 if (!q->compute.pfence ||
177                     (q->compute.pfence && test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
178                                                    &q->compute.pfence->flags))) {
179                         return true;
180                 }
181         }
182
183         return false;
184 }
185
186 static void free_preempt_fences(struct list_head *list)
187 {
188         struct list_head *link, *next;
189
190         list_for_each_safe(link, next, list)
191                 xe_preempt_fence_free(to_preempt_fence_from_link(link));
192 }
193
194 static int alloc_preempt_fences(struct xe_vm *vm, struct list_head *list,
195                                 unsigned int *count)
196 {
197         lockdep_assert_held(&vm->lock);
198         xe_vm_assert_held(vm);
199
200         if (*count >= vm->preempt.num_exec_queues)
201                 return 0;
202
203         for (; *count < vm->preempt.num_exec_queues; ++(*count)) {
204                 struct xe_preempt_fence *pfence = xe_preempt_fence_alloc();
205
206                 if (IS_ERR(pfence))
207                         return PTR_ERR(pfence);
208
209                 list_move_tail(xe_preempt_fence_link(pfence), list);
210         }
211
212         return 0;
213 }
214
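/*
 * Wait for every currently installed preempt fence to signal and drop the
 * references to them, leaving all compute exec queues on the vm suspended.
 */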
215 static int wait_for_existing_preempt_fences(struct xe_vm *vm)
216 {
217         struct xe_exec_queue *q;
218
219         xe_vm_assert_held(vm);
220
221         list_for_each_entry(q, &vm->preempt.exec_queues, compute.link) {
222                 if (q->compute.pfence) {
223                         long timeout = dma_fence_wait(q->compute.pfence, false);
224
225                         if (timeout < 0)
226                                 return -ETIME;
227                         dma_fence_put(q->compute.pfence);
228                         q->compute.pfence = NULL;
229                 }
230         }
231
232         return 0;
233 }
234
235 static bool xe_vm_is_idle(struct xe_vm *vm)
236 {
237         struct xe_exec_queue *q;
238
239         xe_vm_assert_held(vm);
240         list_for_each_entry(q, &vm->preempt.exec_queues, compute.link) {
241                 if (!xe_exec_queue_is_idle(q))
242                         return false;
243         }
244
245         return true;
246 }
247
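/*
 * Arm one pre-allocated preempt fence per compute exec queue, consuming
 * fences from the front of @list and replacing each queue's previous
 * fence.
 */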
248 static void arm_preempt_fences(struct xe_vm *vm, struct list_head *list)
249 {
250         struct list_head *link;
251         struct xe_exec_queue *q;
252
253         list_for_each_entry(q, &vm->preempt.exec_queues, compute.link) {
254                 struct dma_fence *fence;
255
256                 link = list->next;
257                 XE_WARN_ON(link == list);
258
259                 fence = xe_preempt_fence_arm(to_preempt_fence_from_link(link),
260                                              q, q->compute.context,
261                                              ++q->compute.seqno);
262                 dma_fence_put(q->compute.pfence);
263                 q->compute.pfence = fence;
264         }
265 }
266
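/*
 * Lock @bo and add the preempt fence of every compute exec queue on the
 * vm to its reservation object with BOOKKEEP usage.
 */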
267 static int add_preempt_fences(struct xe_vm *vm, struct xe_bo *bo)
268 {
269         struct xe_exec_queue *q;
270         struct ww_acquire_ctx ww;
271         int err;
272
273         err = xe_bo_lock(bo, &ww, vm->preempt.num_exec_queues, true);
274         if (err)
275                 return err;
276
277         list_for_each_entry(q, &vm->preempt.exec_queues, compute.link)
278                 if (q->compute.pfence) {
279                         dma_resv_add_fence(bo->ttm.base.resv,
280                                            q->compute.pfence,
281                                            DMA_RESV_USAGE_BOOKKEEP);
282                 }
283
284         xe_bo_unlock(bo, &ww);
285         return 0;
286 }
287
288 /**
289  * xe_vm_fence_all_extobjs() - Add a fence to vm's external objects' resv
290  * @vm: The vm.
291  * @fence: The fence to add.
292  * @usage: The resv usage for the fence.
293  *
294  * Loops over all of the vm's external object bindings and adds a @fence
295  * with the given @usage to all of the external object's reservation
296  * objects.
297  */
298 void xe_vm_fence_all_extobjs(struct xe_vm *vm, struct dma_fence *fence,
299                              enum dma_resv_usage usage)
300 {
301         struct xe_vma *vma;
302
303         list_for_each_entry(vma, &vm->extobj.list, extobj.link)
304                 dma_resv_add_fence(xe_vma_bo(vma)->ttm.base.resv, fence, usage);
305 }
306
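/*
 * Resume all compute exec queues on the vm and re-add their preempt
 * fences to the vm's reservation object and to those of all external
 * objects bound in the vm.
 */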
307 static void resume_and_reinstall_preempt_fences(struct xe_vm *vm)
308 {
309         struct xe_exec_queue *q;
310
311         lockdep_assert_held(&vm->lock);
312         xe_vm_assert_held(vm);
313
314         list_for_each_entry(q, &vm->preempt.exec_queues, compute.link) {
315                 q->ops->resume(q);
316
317                 dma_resv_add_fence(xe_vm_resv(vm), q->compute.pfence,
318                                    DMA_RESV_USAGE_BOOKKEEP);
319                 xe_vm_fence_all_extobjs(vm, q->compute.pfence,
320                                         DMA_RESV_USAGE_BOOKKEEP);
321         }
322 }
323
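/*
 * Register @q as a compute exec queue on @vm: create its preempt fence,
 * install that fence in the vm's and all external objects' reservation
 * objects, and enable signaling on it right away if a userptr repin or a
 * preemption is already pending.
 */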
324 int xe_vm_add_compute_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
325 {
326         struct ttm_validate_buffer tv_onstack[XE_ONSTACK_TV];
327         struct ttm_validate_buffer *tv;
328         struct ww_acquire_ctx ww;
329         struct list_head objs;
330         struct dma_fence *pfence;
331         int err;
332         bool wait;
333
334         XE_WARN_ON(!xe_vm_in_compute_mode(vm));
335
336         down_write(&vm->lock);
337
338         err = xe_vm_lock_dma_resv(vm, &ww, tv_onstack, &tv, &objs, true, 1);
339         if (err)
340                 goto out_unlock_outer;
341
342         pfence = xe_preempt_fence_create(q, q->compute.context,
343                                          ++q->compute.seqno);
344         if (!pfence) {
345                 err = -ENOMEM;
346                 goto out_unlock;
347         }
348
349         list_add(&q->compute.link, &vm->preempt.exec_queues);
350         ++vm->preempt.num_exec_queues;
351         q->compute.pfence = pfence;
352
353         down_read(&vm->userptr.notifier_lock);
354
355         dma_resv_add_fence(xe_vm_resv(vm), pfence,
356                            DMA_RESV_USAGE_BOOKKEEP);
357
358         xe_vm_fence_all_extobjs(vm, pfence, DMA_RESV_USAGE_BOOKKEEP);
359
360         /*
361          * Check to see if a preemption on the VM or a userptr invalidation
362          * is in flight; if so, trigger this preempt fence to sync state with
363          * the other preempt fences on the VM.
364          */
365         wait = __xe_vm_userptr_needs_repin(vm) || preempt_fences_waiting(vm);
366         if (wait)
367                 dma_fence_enable_sw_signaling(pfence);
368
369         up_read(&vm->userptr.notifier_lock);
370
371 out_unlock:
372         xe_vm_unlock_dma_resv(vm, tv_onstack, tv, &ww, &objs);
373 out_unlock_outer:
374         up_write(&vm->lock);
375
376         return err;
377 }
378
379 /**
380  * __xe_vm_userptr_needs_repin() - Check whether the VM does have userptrs
381  * that need repinning.
382  * @vm: The VM.
383  *
384  * This function checks whether the VM has userptrs that need repinning,
385  * and provides a release-type barrier on the userptr.notifier_lock after
386  * checking.
387  *
388  * Return: 0 if there are no userptrs needing repinning, -EAGAIN if there are.
389  */
390 int __xe_vm_userptr_needs_repin(struct xe_vm *vm)
391 {
392         lockdep_assert_held_read(&vm->userptr.notifier_lock);
393
394         return (list_empty(&vm->userptr.repin_list) &&
395                 list_empty(&vm->userptr.invalidated)) ? 0 : -EAGAIN;
396 }
397
398 /**
399  * xe_vm_lock_dma_resv() - Lock the vm dma_resv object and the dma_resv
400  * objects of the vm's external buffer objects.
401  * @vm: The vm.
402  * @ww: Pointer to a struct ww_acquire_ctx locking context.
403  * @tv_onstack: Array size XE_ONSTACK_TV of storage for the struct
404  * ttm_validate_buffers used for locking.
405  * @tv: Pointer to a pointer that on output contains the actual storage used.
406  * @objs: List head for the buffer objects locked.
407  * @intr: Whether to lock interruptible.
408  * @num_shared: Number of dma-fence slots to reserve in the locked objects.
409  *
410  * Locks the vm dma-resv objects and all the dma-resv objects of the
411  * buffer objects on the vm external object list. The TTM utilities require
412  * a list of struct ttm_validate_buffers pointing to the actual buffer
413  * objects to lock. Storage for those struct ttm_validate_buffers should
414  * be provided in @tv_onstack, and is typically reserved on the stack
415  * of the caller. If the size of @tv_onstack isn't sufficient, then
416  * storage will be allocated internally using kvmalloc().
417  *
418  * The function performs deadlock handling internally, and after a
419  * successful return the ww locking transaction should be considered
420  * sealed.
421  *
422  * Return: 0 on success, Negative error code on error. In particular if
423  * @intr is set to true, -EINTR or -ERESTARTSYS may be returned. In case
424  * of error, any locking performed has been reverted.
425  */
426 int xe_vm_lock_dma_resv(struct xe_vm *vm, struct ww_acquire_ctx *ww,
427                         struct ttm_validate_buffer *tv_onstack,
428                         struct ttm_validate_buffer **tv,
429                         struct list_head *objs,
430                         bool intr,
431                         unsigned int num_shared)
432 {
433         struct ttm_validate_buffer *tv_vm, *tv_bo;
434         struct xe_vma *vma, *next;
435         LIST_HEAD(dups);
436         int err;
437
438         lockdep_assert_held(&vm->lock);
439
440         if (vm->extobj.entries < XE_ONSTACK_TV) {
441                 tv_vm = tv_onstack;
442         } else {
443                 tv_vm = kvmalloc_array(vm->extobj.entries + 1, sizeof(*tv_vm),
444                                        GFP_KERNEL);
445                 if (!tv_vm)
446                         return -ENOMEM;
447         }
448         tv_bo = tv_vm + 1;
449
450         INIT_LIST_HEAD(objs);
451         list_for_each_entry(vma, &vm->extobj.list, extobj.link) {
452                 tv_bo->num_shared = num_shared;
453                 tv_bo->bo = &xe_vma_bo(vma)->ttm;
454
455                 list_add_tail(&tv_bo->head, objs);
456                 tv_bo++;
457         }
458         tv_vm->num_shared = num_shared;
459         tv_vm->bo = xe_vm_ttm_bo(vm);
460         list_add_tail(&tv_vm->head, objs);
461         err = ttm_eu_reserve_buffers(ww, objs, intr, &dups);
462         if (err)
463                 goto out_err;
464
465         spin_lock(&vm->notifier.list_lock);
466         list_for_each_entry_safe(vma, next, &vm->notifier.rebind_list,
467                                  notifier.rebind_link) {
468                 xe_bo_assert_held(xe_vma_bo(vma));
469
470                 list_del_init(&vma->notifier.rebind_link);
471                 if (vma->tile_present && !(vma->gpuva.flags & XE_VMA_DESTROYED))
472                         list_move_tail(&vma->combined_links.rebind,
473                                        &vm->rebind_list);
474         }
475         spin_unlock(&vm->notifier.list_lock);
476
477         *tv = tv_vm;
478         return 0;
479
480 out_err:
481         if (tv_vm != tv_onstack)
482                 kvfree(tv_vm);
483
484         return err;
485 }
486
487 /**
488  * xe_vm_unlock_dma_resv() - Unlock reservation objects locked by
489  * xe_vm_lock_dma_resv()
490  * @vm: The vm.
491  * @tv_onstack: The @tv_onstack array given to xe_vm_lock_dma_resv().
492  * @tv: The value of *@tv given by xe_vm_lock_dma_resv().
493  * @ww: The ww_acquire_context used for locking.
494  * @objs: The list returned from xe_vm_lock_dma_resv().
495  *
496  * Unlocks the reservation objects and frees any memory allocated by
497  * xe_vm_lock_dma_resv().
498  */
499 void xe_vm_unlock_dma_resv(struct xe_vm *vm,
500                            struct ttm_validate_buffer *tv_onstack,
501                            struct ttm_validate_buffer *tv,
502                            struct ww_acquire_ctx *ww,
503                            struct list_head *objs)
504 {
505         /*
506          * Nothing should've been able to enter the list while we were locked,
507          * since we've held the dma-resvs of all the vm's external objects,
508          * and holding the dma_resv of an object is required for list
509          * addition, and we shouldn't add ourselves.
510          */
511         XE_WARN_ON(!list_empty(&vm->notifier.rebind_list));
512
513         ttm_eu_backoff_reservation(ww, objs);
514         if (tv && tv != tv_onstack)
515                 kvfree(tv);
516 }
517
518 #define XE_VM_REBIND_RETRY_TIMEOUT_MS 1000
519
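/*
 * Mark the vm as banned under its dma-resv lock and kill all of its
 * compute exec queues. Called with vm->lock held, e.g. when the rebind
 * worker hits an unrecoverable error.
 */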
520 static void xe_vm_kill(struct xe_vm *vm)
521 {
522         struct ww_acquire_ctx ww;
523         struct xe_exec_queue *q;
524
525         lockdep_assert_held(&vm->lock);
526
527         xe_vm_lock(vm, &ww, 0, false);
528         vm->flags |= XE_VM_FLAG_BANNED;
529         trace_xe_vm_kill(vm);
530
531         list_for_each_entry(q, &vm->preempt.exec_queues, compute.link)
532                 q->ops->kill(q);
533         xe_vm_unlock(vm, &ww);
534
535         /* TODO: Inform user the VM is banned */
536 }
537
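/*
 * Rebind worker for compute-mode VMs. With vm->lock and all relevant
 * dma-resv locks held it repins invalidated userptrs, revalidates evicted
 * BOs, rebinds them and waits for the rebinds and pending unbinds to
 * complete, then arms fresh preempt fences and resumes the exec queues.
 * Retries on -EAGAIN and, for a limited time, on -ENOMEM; any other error
 * kills the vm.
 */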
538 static void preempt_rebind_work_func(struct work_struct *w)
539 {
540         struct xe_vm *vm = container_of(w, struct xe_vm, preempt.rebind_work);
541         struct xe_vma *vma;
542         struct ttm_validate_buffer tv_onstack[XE_ONSTACK_TV];
543         struct ttm_validate_buffer *tv;
544         struct ww_acquire_ctx ww;
545         struct list_head objs;
546         struct dma_fence *rebind_fence;
547         unsigned int fence_count = 0;
548         LIST_HEAD(preempt_fences);
549         ktime_t end = 0;
550         int err;
551         long wait;
552         int __maybe_unused tries = 0;
553
554         XE_WARN_ON(!xe_vm_in_compute_mode(vm));
555         trace_xe_vm_rebind_worker_enter(vm);
556
557         down_write(&vm->lock);
558
559         if (xe_vm_is_closed_or_banned(vm)) {
560                 up_write(&vm->lock);
561                 trace_xe_vm_rebind_worker_exit(vm);
562                 return;
563         }
564
565 retry:
566         if (vm->async_ops.error)
567                 goto out_unlock_outer;
568
569         /*
570          * Extreme corner where we exit a VM error state with a munmap style VM
571          * unbind inflight which requires a rebind. In this case the rebind
572          * needs to install some fences into the dma-resv slots. The worker to
573          * do this is queued; let that worker make progress by dropping vm->lock
574          * and trying this again.
575          */
576         if (vm->async_ops.munmap_rebind_inflight) {
577                 up_write(&vm->lock);
578                 flush_work(&vm->async_ops.work);
579                 goto retry;
580         }
581
582         if (xe_vm_userptr_check_repin(vm)) {
583                 err = xe_vm_userptr_pin(vm);
584                 if (err)
585                         goto out_unlock_outer;
586         }
587
588         err = xe_vm_lock_dma_resv(vm, &ww, tv_onstack, &tv, &objs,
589                                   false, vm->preempt.num_exec_queues);
590         if (err)
591                 goto out_unlock_outer;
592
593         if (xe_vm_is_idle(vm)) {
594                 vm->preempt.rebind_deactivated = true;
595                 goto out_unlock;
596         }
597
598         /* Fresh preempt fences already installed. Everything is running. */
599         if (!preempt_fences_waiting(vm))
600                 goto out_unlock;
601
602         /*
603          * This makes sure vm is completely suspended and also balances
604          * This makes sure the vm is completely suspended and also balances
605          * exec queue suspend and resume; we resume *all* vm exec queues below.
606         err = wait_for_existing_preempt_fences(vm);
607         if (err)
608                 goto out_unlock;
609
610         err = alloc_preempt_fences(vm, &preempt_fences, &fence_count);
611         if (err)
612                 goto out_unlock;
613
614         list_for_each_entry(vma, &vm->rebind_list, combined_links.rebind) {
615                 if (xe_vma_has_no_bo(vma) ||
616                     vma->gpuva.flags & XE_VMA_DESTROYED)
617                         continue;
618
619                 err = xe_bo_validate(xe_vma_bo(vma), vm, false);
620                 if (err)
621                         goto out_unlock;
622         }
623
624         rebind_fence = xe_vm_rebind(vm, true);
625         if (IS_ERR(rebind_fence)) {
626                 err = PTR_ERR(rebind_fence);
627                 goto out_unlock;
628         }
629
630         if (rebind_fence) {
631                 dma_fence_wait(rebind_fence, false);
632                 dma_fence_put(rebind_fence);
633         }
634
635         /* Wait on munmap style VM unbinds */
636         wait = dma_resv_wait_timeout(xe_vm_resv(vm),
637                                      DMA_RESV_USAGE_KERNEL,
638                                      false, MAX_SCHEDULE_TIMEOUT);
639         if (wait <= 0) {
640                 err = -ETIME;
641                 goto out_unlock;
642         }
643
644 #define retry_required(__tries, __vm) \
645         (IS_ENABLED(CONFIG_DRM_XE_USERPTR_INVAL_INJECT) ? \
646         (!(__tries)++ || __xe_vm_userptr_needs_repin(__vm)) : \
647         __xe_vm_userptr_needs_repin(__vm))
648
649         down_read(&vm->userptr.notifier_lock);
650         if (retry_required(tries, vm)) {
651                 up_read(&vm->userptr.notifier_lock);
652                 err = -EAGAIN;
653                 goto out_unlock;
654         }
655
656 #undef retry_required
657
658         spin_lock(&vm->xe->ttm.lru_lock);
659         ttm_lru_bulk_move_tail(&vm->lru_bulk_move);
660         spin_unlock(&vm->xe->ttm.lru_lock);
661
662         /* Point of no return. */
663         arm_preempt_fences(vm, &preempt_fences);
664         resume_and_reinstall_preempt_fences(vm);
665         up_read(&vm->userptr.notifier_lock);
666
667 out_unlock:
668         xe_vm_unlock_dma_resv(vm, tv_onstack, tv, &ww, &objs);
669 out_unlock_outer:
670         if (err == -EAGAIN) {
671                 trace_xe_vm_rebind_worker_retry(vm);
672                 goto retry;
673         }
674
675         /*
676          * With multiple active VMs, under memory pressure, it is possible that
677          * ttm_bo_validate() runs into -EDEADLK and in such a case returns -ENOMEM.
678          * Until ttm properly handles locking in such scenarios, the best thing the
679          * driver can do is retry with a timeout. Killing the VM or putting it
680          * in error state after timeout or other error scenarios is still TBD.
681          */
682         if (err == -ENOMEM) {
683                 ktime_t cur = ktime_get();
684
685                 end = end ? : ktime_add_ms(cur, XE_VM_REBIND_RETRY_TIMEOUT_MS);
686                 if (ktime_before(cur, end)) {
687                         msleep(20);
688                         trace_xe_vm_rebind_worker_retry(vm);
689                         goto retry;
690                 }
691         }
692         if (err) {
693                 drm_warn(&vm->xe->drm, "VM worker error: %d\n", err);
694                 xe_vm_kill(vm);
695         }
696         up_write(&vm->lock);
697
698         free_preempt_fences(&preempt_fences);
699
700         trace_xe_vm_rebind_worker_exit(vm);
701 }
702
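/*
 * MMU interval notifier callback for userptr vmas. Bumps the notifier
 * sequence number and, outside of fault mode, moves the vma onto the vm's
 * invalidated list so the exec and rebind paths repin it. It then waits
 * for GPU access tracked in the vm's reservation object to finish; in
 * fault mode the vma is also invalidated immediately.
 */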
703 static bool vma_userptr_invalidate(struct mmu_interval_notifier *mni,
704                                    const struct mmu_notifier_range *range,
705                                    unsigned long cur_seq)
706 {
707         struct xe_vma *vma = container_of(mni, struct xe_vma, userptr.notifier);
708         struct xe_vm *vm = xe_vma_vm(vma);
709         struct dma_resv_iter cursor;
710         struct dma_fence *fence;
711         long err;
712
713         XE_WARN_ON(!xe_vma_is_userptr(vma));
714         trace_xe_vma_userptr_invalidate(vma);
715
716         if (!mmu_notifier_range_blockable(range))
717                 return false;
718
719         down_write(&vm->userptr.notifier_lock);
720         mmu_interval_set_seq(mni, cur_seq);
721
722         /* No need to stop gpu access if the userptr is not yet bound. */
723         if (!vma->userptr.initial_bind) {
724                 up_write(&vm->userptr.notifier_lock);
725                 return true;
726         }
727
728         /*
729          * Tell exec and rebind worker they need to repin and rebind this
730          * userptr.
731          */
732         if (!xe_vm_in_fault_mode(vm) &&
733             !(vma->gpuva.flags & XE_VMA_DESTROYED) && vma->tile_present) {
734                 spin_lock(&vm->userptr.invalidated_lock);
735                 list_move_tail(&vma->userptr.invalidate_link,
736                                &vm->userptr.invalidated);
737                 spin_unlock(&vm->userptr.invalidated_lock);
738         }
739
740         up_write(&vm->userptr.notifier_lock);
741
742         /*
743          * Preempt fences turn into schedule disables, pipeline these.
744          * Note that even in fault mode, we need to wait for binds and
745          * unbinds to complete, and those are attached as BOOKKEEP fences
746          * to the vm.
747          */
748         dma_resv_iter_begin(&cursor, xe_vm_resv(vm),
749                             DMA_RESV_USAGE_BOOKKEEP);
750         dma_resv_for_each_fence_unlocked(&cursor, fence)
751                 dma_fence_enable_sw_signaling(fence);
752         dma_resv_iter_end(&cursor);
753
754         err = dma_resv_wait_timeout(xe_vm_resv(vm),
755                                     DMA_RESV_USAGE_BOOKKEEP,
756                                     false, MAX_SCHEDULE_TIMEOUT);
757         XE_WARN_ON(err <= 0);
758
759         if (xe_vm_in_fault_mode(vm)) {
760                 err = xe_vm_invalidate_vma(vma);
761                 XE_WARN_ON(err);
762         }
763
764         trace_xe_vma_userptr_invalidate_complete(vma);
765
766         return true;
767 }
768
769 static const struct mmu_interval_notifier_ops vma_userptr_notifier_ops = {
770         .invalidate = vma_userptr_invalidate,
771 };
772
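/*
 * Pin pages for all userptr vmas that have been invalidated since their
 * last successful pin and move them to the vm's rebind list. On error the
 * collected vmas are put back on the repin list so a later call can retry.
 */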
773 int xe_vm_userptr_pin(struct xe_vm *vm)
774 {
775         struct xe_vma *vma, *next;
776         int err = 0;
777         LIST_HEAD(tmp_evict);
778
779         lockdep_assert_held_write(&vm->lock);
780
781         /* Collect invalidated userptrs */
782         spin_lock(&vm->userptr.invalidated_lock);
783         list_for_each_entry_safe(vma, next, &vm->userptr.invalidated,
784                                  userptr.invalidate_link) {
785                 list_del_init(&vma->userptr.invalidate_link);
786                 if (list_empty(&vma->combined_links.userptr))
787                         list_move_tail(&vma->combined_links.userptr,
788                                        &vm->userptr.repin_list);
789         }
790         spin_unlock(&vm->userptr.invalidated_lock);
791
792         /* Pin and move to temporary list */
793         list_for_each_entry_safe(vma, next, &vm->userptr.repin_list,
794                                  combined_links.userptr) {
795                 err = xe_vma_userptr_pin_pages(vma);
796                 if (err < 0)
797                         goto out_err;
798
799                 list_move_tail(&vma->combined_links.userptr, &tmp_evict);
800         }
801
802         /* Take lock and move to rebind_list for rebinding. */
803         err = dma_resv_lock_interruptible(xe_vm_resv(vm), NULL);
804         if (err)
805                 goto out_err;
806
807         list_for_each_entry_safe(vma, next, &tmp_evict, combined_links.userptr)
808                 list_move_tail(&vma->combined_links.rebind, &vm->rebind_list);
809
810         dma_resv_unlock(xe_vm_resv(vm));
811
812         return 0;
813
814 out_err:
815         list_splice_tail(&tmp_evict, &vm->userptr.repin_list);
816
817         return err;
818 }
819
820 /**
821  * xe_vm_userptr_check_repin() - Check whether the VM might have userptrs
822  * that need repinning.
823  * @vm: The VM.
824  *
825  * This function does an advisory check for whether the VM has userptrs that
826  * need repinning.
827  *
828  * Return: 0 if there are no indications of userptrs needing repinning,
829  * -EAGAIN if there are.
830  */
831 int xe_vm_userptr_check_repin(struct xe_vm *vm)
832 {
833         return (list_empty_careful(&vm->userptr.repin_list) &&
834                 list_empty_careful(&vm->userptr.invalidated)) ? 0 : -EAGAIN;
835 }
836
837 static struct dma_fence *
838 xe_vm_bind_vma(struct xe_vma *vma, struct xe_exec_queue *q,
839                struct xe_sync_entry *syncs, u32 num_syncs,
840                bool first_op, bool last_op);
841
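/*
 * Rebind everything on the vm's rebind list, either from the exec path or
 * from the compute-mode rebind worker. Returns the fence of the last
 * rebind issued, NULL if there was nothing to do, or an ERR_PTR on
 * failure.
 */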
842 struct dma_fence *xe_vm_rebind(struct xe_vm *vm, bool rebind_worker)
843 {
844         struct dma_fence *fence = NULL;
845         struct xe_vma *vma, *next;
846
847         lockdep_assert_held(&vm->lock);
848         if (xe_vm_no_dma_fences(vm) && !rebind_worker)
849                 return NULL;
850
851         xe_vm_assert_held(vm);
852         list_for_each_entry_safe(vma, next, &vm->rebind_list,
853                                  combined_links.rebind) {
854                 XE_WARN_ON(!vma->tile_present);
855
856                 list_del_init(&vma->combined_links.rebind);
857                 dma_fence_put(fence);
858                 if (rebind_worker)
859                         trace_xe_vma_rebind_worker(vma);
860                 else
861                         trace_xe_vma_rebind_exec(vma);
862                 fence = xe_vm_bind_vma(vma, NULL, NULL, 0, false, false);
863                 if (IS_ERR(fence))
864                         return fence;
865         }
866
867         return fence;
868 }
869
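/*
 * Allocate and initialize a vma covering [start, end]. BO-backed vmas take
 * a reference on the object and are linked to its drm_gpuvm_bo; userptr
 * vmas instead register an MMU interval notifier on the current mm; NULL
 * (sparse) bindings carry neither.
 */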
870 static struct xe_vma *xe_vma_create(struct xe_vm *vm,
871                                     struct xe_bo *bo,
872                                     u64 bo_offset_or_userptr,
873                                     u64 start, u64 end,
874                                     bool read_only,
875                                     bool is_null,
876                                     u8 tile_mask)
877 {
878         struct xe_vma *vma;
879         struct xe_tile *tile;
880         u8 id;
881
882         XE_WARN_ON(start >= end);
883         XE_WARN_ON(end >= vm->size);
884
885         if (!bo && !is_null)    /* userptr */
886                 vma = kzalloc(sizeof(*vma), GFP_KERNEL);
887         else
888                 vma = kzalloc(sizeof(*vma) - sizeof(struct xe_userptr),
889                               GFP_KERNEL);
890         if (!vma) {
891                 vma = ERR_PTR(-ENOMEM);
892                 return vma;
893         }
894
895         INIT_LIST_HEAD(&vma->combined_links.rebind);
896         INIT_LIST_HEAD(&vma->notifier.rebind_link);
897         INIT_LIST_HEAD(&vma->extobj.link);
898
899         INIT_LIST_HEAD(&vma->gpuva.gem.entry);
900         vma->gpuva.vm = &vm->gpuvm;
901         vma->gpuva.va.addr = start;
902         vma->gpuva.va.range = end - start + 1;
903         if (read_only)
904                 vma->gpuva.flags |= XE_VMA_READ_ONLY;
905         if (is_null)
906                 vma->gpuva.flags |= DRM_GPUVA_SPARSE;
907
908         if (tile_mask) {
909                 vma->tile_mask = tile_mask;
910         } else {
911                 for_each_tile(tile, vm->xe, id)
912                         vma->tile_mask |= 0x1 << id;
913         }
914
915         if (vm->xe->info.platform == XE_PVC)
916                 vma->gpuva.flags |= XE_VMA_ATOMIC_PTE_BIT;
917
918         if (bo) {
919                 struct drm_gpuvm_bo *vm_bo;
920
921                 xe_bo_assert_held(bo);
922
923                 vm_bo = drm_gpuvm_bo_obtain(vma->gpuva.vm, &bo->ttm.base);
924                 if (IS_ERR(vm_bo)) {
925                         kfree(vma);
926                         return ERR_CAST(vm_bo);
927                 }
928
929                 drm_gem_object_get(&bo->ttm.base);
930                 vma->gpuva.gem.obj = &bo->ttm.base;
931                 vma->gpuva.gem.offset = bo_offset_or_userptr;
932                 drm_gpuva_link(&vma->gpuva, vm_bo);
933                 drm_gpuvm_bo_put(vm_bo);
934         } else /* userptr or null */ {
935                 if (!is_null) {
936                         u64 size = end - start + 1;
937                         int err;
938
939                         INIT_LIST_HEAD(&vma->userptr.invalidate_link);
940                         vma->gpuva.gem.offset = bo_offset_or_userptr;
941
942                         err = mmu_interval_notifier_insert(&vma->userptr.notifier,
943                                                            current->mm,
944                                                            xe_vma_userptr(vma), size,
945                                                            &vma_userptr_notifier_ops);
946                         if (err) {
947                                 kfree(vma);
948                                 vma = ERR_PTR(err);
949                                 return vma;
950                         }
951
952                         vma->userptr.notifier_seq = LONG_MAX;
953                 }
954
955                 xe_vm_get(vm);
956         }
957
958         return vma;
959 }
960
961 static bool vm_remove_extobj(struct xe_vma *vma)
962 {
963         if (!list_empty(&vma->extobj.link)) {
964                 xe_vma_vm(vma)->extobj.entries--;
965                 list_del_init(&vma->extobj.link);
966                 return true;
967         }
968         return false;
969 }
970
971 static void xe_vma_destroy_late(struct xe_vma *vma)
972 {
973         struct xe_vm *vm = xe_vma_vm(vma);
974         struct xe_device *xe = vm->xe;
975         bool read_only = xe_vma_read_only(vma);
976
977         if (xe_vma_is_userptr(vma)) {
978                 if (vma->userptr.sg) {
979                         dma_unmap_sgtable(xe->drm.dev,
980                                           vma->userptr.sg,
981                                           read_only ? DMA_TO_DEVICE :
982                                           DMA_BIDIRECTIONAL, 0);
983                         sg_free_table(vma->userptr.sg);
984                         vma->userptr.sg = NULL;
985                 }
986
987                 /*
988                  * Since userptr pages are not pinned, we can't remove
989                  * the notifier until we're sure the GPU is not accessing
990                  * them anymore.
991                  */
992                 mmu_interval_notifier_remove(&vma->userptr.notifier);
993                 xe_vm_put(vm);
994         } else if (xe_vma_is_null(vma)) {
995                 xe_vm_put(vm);
996         } else {
997                 xe_bo_put(xe_vma_bo(vma));
998         }
999
1000         kfree(vma);
1001 }
1002
1003 static void vma_destroy_work_func(struct work_struct *w)
1004 {
1005         struct xe_vma *vma =
1006                 container_of(w, struct xe_vma, destroy_work);
1007
1008         xe_vma_destroy_late(vma);
1009 }
1010
1011 static struct xe_vma *
1012 bo_has_vm_references_locked(struct xe_bo *bo, struct xe_vm *vm,
1013                             struct xe_vma *ignore)
1014 {
1015         struct drm_gpuvm_bo *vm_bo;
1016         struct drm_gpuva *va;
1017         struct drm_gem_object *obj = &bo->ttm.base;
1018
1019         xe_bo_assert_held(bo);
1020
1021         drm_gem_for_each_gpuvm_bo(vm_bo, obj) {
1022                 drm_gpuvm_bo_for_each_va(va, vm_bo) {
1023                         struct xe_vma *vma = gpuva_to_vma(va);
1024
1025                         if (vma != ignore && xe_vma_vm(vma) == vm)
1026                                 return vma;
1027                 }
1028         }
1029
1030         return NULL;
1031 }
1032
1033 static bool bo_has_vm_references(struct xe_bo *bo, struct xe_vm *vm,
1034                                  struct xe_vma *ignore)
1035 {
1036         struct ww_acquire_ctx ww;
1037         bool ret;
1038
1039         xe_bo_lock(bo, &ww, 0, false);
1040         ret = !!bo_has_vm_references_locked(bo, vm, ignore);
1041         xe_bo_unlock(bo, &ww);
1042
1043         return ret;
1044 }
1045
1046 static void __vm_insert_extobj(struct xe_vm *vm, struct xe_vma *vma)
1047 {
1048         lockdep_assert_held_write(&vm->lock);
1049
1050         list_add(&vma->extobj.link, &vm->extobj.list);
1051         vm->extobj.entries++;
1052 }
1053
1054 static void vm_insert_extobj(struct xe_vm *vm, struct xe_vma *vma)
1055 {
1056         struct xe_bo *bo = xe_vma_bo(vma);
1057
1058         lockdep_assert_held_write(&vm->lock);
1059
1060         if (bo_has_vm_references(bo, vm, vma))
1061                 return;
1062
1063         __vm_insert_extobj(vm, vma);
1064 }
1065
1066 static void vma_destroy_cb(struct dma_fence *fence,
1067                            struct dma_fence_cb *cb)
1068 {
1069         struct xe_vma *vma = container_of(cb, struct xe_vma, destroy_cb);
1070
1071         INIT_WORK(&vma->destroy_work, vma_destroy_work_func);
1072         queue_work(system_unbound_wq, &vma->destroy_work);
1073 }
1074
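/*
 * Tear down a vma's bookkeeping (userptr invalidation link, notifier
 * rebind link, gpuva link, external object tracking). The final freeing is
 * done immediately, or, if @fence is given, deferred to a worker that runs
 * once the fence has signaled.
 */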
1075 static void xe_vma_destroy(struct xe_vma *vma, struct dma_fence *fence)
1076 {
1077         struct xe_vm *vm = xe_vma_vm(vma);
1078
1079         lockdep_assert_held_write(&vm->lock);
1080         XE_WARN_ON(!list_empty(&vma->combined_links.destroy));
1081
1082         if (xe_vma_is_userptr(vma)) {
1083                 XE_WARN_ON(!(vma->gpuva.flags & XE_VMA_DESTROYED));
1084
1085                 spin_lock(&vm->userptr.invalidated_lock);
1086                 list_del(&vma->userptr.invalidate_link);
1087                 spin_unlock(&vm->userptr.invalidated_lock);
1088         } else if (!xe_vma_is_null(vma)) {
1089                 xe_bo_assert_held(xe_vma_bo(vma));
1090
1091                 spin_lock(&vm->notifier.list_lock);
1092                 list_del(&vma->notifier.rebind_link);
1093                 spin_unlock(&vm->notifier.list_lock);
1094
1095                 drm_gpuva_unlink(&vma->gpuva);
1096
1097                 if (!xe_vma_bo(vma)->vm && vm_remove_extobj(vma)) {
1098                         struct xe_vma *other;
1099
1100                         other = bo_has_vm_references_locked(xe_vma_bo(vma), vm, NULL);
1101
1102                         if (other)
1103                                 __vm_insert_extobj(vm, other);
1104                 }
1105         }
1106
1107         xe_vm_assert_held(vm);
1108         if (fence) {
1109                 int ret = dma_fence_add_callback(fence, &vma->destroy_cb,
1110                                                  vma_destroy_cb);
1111
1112                 if (ret) {
1113                         XE_WARN_ON(ret != -ENOENT);
1114                         xe_vma_destroy_late(vma);
1115                 }
1116         } else {
1117                 xe_vma_destroy_late(vma);
1118         }
1119 }
1120
1121 static void xe_vma_destroy_unlocked(struct xe_vma *vma)
1122 {
1123         struct ttm_validate_buffer tv[2];
1124         struct ww_acquire_ctx ww;
1125         struct xe_bo *bo = xe_vma_bo(vma);
1126         LIST_HEAD(objs);
1127         LIST_HEAD(dups);
1128         int err;
1129
1130         memset(tv, 0, sizeof(tv));
1131         tv[0].bo = xe_vm_ttm_bo(xe_vma_vm(vma));
1132         list_add(&tv[0].head, &objs);
1133
1134         if (bo) {
1135                 tv[1].bo = &xe_bo_get(bo)->ttm;
1136                 list_add(&tv[1].head, &objs);
1137         }
1138         err = ttm_eu_reserve_buffers(&ww, &objs, false, &dups);
1139         XE_WARN_ON(err);
1140
1141         xe_vma_destroy(vma, NULL);
1142
1143         ttm_eu_backoff_reservation(&ww, &objs);
1144         if (bo)
1145                 xe_bo_put(bo);
1146 }
1147
1148 struct xe_vma *
1149 xe_vm_find_overlapping_vma(struct xe_vm *vm, u64 start, u64 range)
1150 {
1151         struct drm_gpuva *gpuva;
1152
1153         lockdep_assert_held(&vm->lock);
1154
1155         if (xe_vm_is_closed_or_banned(vm))
1156                 return NULL;
1157
1158         XE_WARN_ON(start + range > vm->size);
1159
1160         gpuva = drm_gpuva_find_first(&vm->gpuvm, start, range);
1161
1162         return gpuva ? gpuva_to_vma(gpuva) : NULL;
1163 }
1164
1165 static int xe_vm_insert_vma(struct xe_vm *vm, struct xe_vma *vma)
1166 {
1167         int err;
1168
1169         XE_WARN_ON(xe_vma_vm(vma) != vm);
1170         lockdep_assert_held(&vm->lock);
1171
1172         err = drm_gpuva_insert(&vm->gpuvm, &vma->gpuva);
1173         XE_WARN_ON(err);        /* Shouldn't be possible */
1174
1175         return err;
1176 }
1177
1178 static void xe_vm_remove_vma(struct xe_vm *vm, struct xe_vma *vma)
1179 {
1180         XE_WARN_ON(xe_vma_vm(vma) != vm);
1181         lockdep_assert_held(&vm->lock);
1182
1183         drm_gpuva_remove(&vma->gpuva);
1184         if (vm->usm.last_fault_vma == vma)
1185                 vm->usm.last_fault_vma = NULL;
1186 }
1187
1188 static struct drm_gpuva_op *xe_vm_op_alloc(void)
1189 {
1190         struct xe_vma_op *op;
1191
1192         op = kzalloc(sizeof(*op), GFP_KERNEL);
1193
1194         if (unlikely(!op))
1195                 return NULL;
1196
1197         return &op->base;
1198 }
1199
1200 static void xe_vm_free(struct drm_gpuvm *gpuvm);
1201
1202 static struct drm_gpuvm_ops gpuvm_ops = {
1203         .op_alloc = xe_vm_op_alloc,
1204         .vm_free = xe_vm_free,
1205 };
1206
1207 static void xe_vma_op_work_func(struct work_struct *w);
1208 static void vm_destroy_work_func(struct work_struct *w);
1209
1210 struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags)
1211 {
1212         struct drm_gem_object *vm_resv_obj;
1213         struct xe_vm *vm;
1214         int err, number_tiles = 0;
1215         struct xe_tile *tile;
1216         u8 id;
1217
1218         vm = kzalloc(sizeof(*vm), GFP_KERNEL);
1219         if (!vm)
1220                 return ERR_PTR(-ENOMEM);
1221
1222         vm->xe = xe;
1223
1224         vm->size = 1ull << xe->info.va_bits;
1225
1226         vm->flags = flags;
1227
1228         init_rwsem(&vm->lock);
1229
1230         INIT_LIST_HEAD(&vm->rebind_list);
1231
1232         INIT_LIST_HEAD(&vm->userptr.repin_list);
1233         INIT_LIST_HEAD(&vm->userptr.invalidated);
1234         init_rwsem(&vm->userptr.notifier_lock);
1235         spin_lock_init(&vm->userptr.invalidated_lock);
1236
1237         INIT_LIST_HEAD(&vm->notifier.rebind_list);
1238         spin_lock_init(&vm->notifier.list_lock);
1239
1240         INIT_LIST_HEAD(&vm->async_ops.pending);
1241         INIT_WORK(&vm->async_ops.work, xe_vma_op_work_func);
1242         spin_lock_init(&vm->async_ops.lock);
1243
1244         INIT_WORK(&vm->destroy_work, vm_destroy_work_func);
1245
1246         INIT_LIST_HEAD(&vm->preempt.exec_queues);
1247         vm->preempt.min_run_period_ms = 10;     /* FIXME: Wire up to uAPI */
1248
1249         for_each_tile(tile, xe, id)
1250                 xe_range_fence_tree_init(&vm->rftree[id]);
1251
1252         INIT_LIST_HEAD(&vm->extobj.list);
1253
1254         if (!(flags & XE_VM_FLAG_MIGRATION))
1255                 xe_device_mem_access_get(xe);
1256
1257         vm_resv_obj = drm_gpuvm_resv_object_alloc(&xe->drm);
1258         if (!vm_resv_obj) {
1259                 err = -ENOMEM;
1260                 goto err_no_resv;
1261         }
1262
1263         drm_gpuvm_init(&vm->gpuvm, "Xe VM", 0, &xe->drm, vm_resv_obj,
1264                        0, vm->size, 0, 0, &gpuvm_ops);
1265
1266         drm_gem_object_put(vm_resv_obj);
1267
1268         err = dma_resv_lock_interruptible(xe_vm_resv(vm), NULL);
1269         if (err)
1270                 goto err_close;
1271
1272         if (IS_DGFX(xe) && xe->info.vram_flags & XE_VRAM_FLAGS_NEED64K)
1273                 vm->flags |= XE_VM_FLAG_64K;
1274
1275         for_each_tile(tile, xe, id) {
1276                 if (flags & XE_VM_FLAG_MIGRATION &&
1277                     tile->id != XE_VM_FLAG_TILE_ID(flags))
1278                         continue;
1279
1280                 vm->pt_root[id] = xe_pt_create(vm, tile, xe->info.vm_max_level);
1281                 if (IS_ERR(vm->pt_root[id])) {
1282                         err = PTR_ERR(vm->pt_root[id]);
1283                         vm->pt_root[id] = NULL;
1284                         goto err_unlock_close;
1285                 }
1286         }
1287
1288         if (flags & XE_VM_FLAG_SCRATCH_PAGE) {
1289                 for_each_tile(tile, xe, id) {
1290                         if (!vm->pt_root[id])
1291                                 continue;
1292
1293                         err = xe_pt_create_scratch(xe, tile, vm);
1294                         if (err)
1295                                 goto err_unlock_close;
1296                 }
1297                 vm->batch_invalidate_tlb = true;
1298         }
1299
1300         if (flags & XE_VM_FLAG_COMPUTE_MODE) {
1301                 INIT_WORK(&vm->preempt.rebind_work, preempt_rebind_work_func);
1302                 vm->flags |= XE_VM_FLAG_COMPUTE_MODE;
1303                 vm->batch_invalidate_tlb = false;
1304         }
1305
1306         if (flags & XE_VM_FLAG_ASYNC_BIND_OPS) {
1307                 vm->async_ops.fence.context = dma_fence_context_alloc(1);
1308                 vm->flags |= XE_VM_FLAG_ASYNC_BIND_OPS;
1309         }
1310
1311         /* Fill pt_root after allocating scratch tables */
1312         for_each_tile(tile, xe, id) {
1313                 if (!vm->pt_root[id])
1314                         continue;
1315
1316                 xe_pt_populate_empty(tile, vm, vm->pt_root[id]);
1317         }
1318         dma_resv_unlock(xe_vm_resv(vm));
1319
1320         /* Kernel migration VM shouldn't have a circular loop. */
1321         if (!(flags & XE_VM_FLAG_MIGRATION)) {
1322                 for_each_tile(tile, xe, id) {
1323                         struct xe_gt *gt = tile->primary_gt;
1324                         struct xe_vm *migrate_vm;
1325                         struct xe_exec_queue *q;
1326
1327                         if (!vm->pt_root[id])
1328                                 continue;
1329
1330                         migrate_vm = xe_migrate_get_vm(tile->migrate);
1331                         q = xe_exec_queue_create_class(xe, gt, migrate_vm,
1332                                                        XE_ENGINE_CLASS_COPY,
1333                                                        EXEC_QUEUE_FLAG_VM);
1334                         xe_vm_put(migrate_vm);
1335                         if (IS_ERR(q)) {
1336                                 err = PTR_ERR(q);
1337                                 goto err_close;
1338                         }
1339                         vm->q[id] = q;
1340                         number_tiles++;
1341                 }
1342         }
1343
1344         if (number_tiles > 1)
1345                 vm->composite_fence_ctx = dma_fence_context_alloc(1);
1346
1347         mutex_lock(&xe->usm.lock);
1348         if (flags & XE_VM_FLAG_FAULT_MODE)
1349                 xe->usm.num_vm_in_fault_mode++;
1350         else if (!(flags & XE_VM_FLAG_MIGRATION))
1351                 xe->usm.num_vm_in_non_fault_mode++;
1352         mutex_unlock(&xe->usm.lock);
1353
1354         trace_xe_vm_create(vm);
1355
1356         return vm;
1357
1358 err_unlock_close:
1359         dma_resv_unlock(xe_vm_resv(vm));
1360 err_close:
1361         xe_vm_close_and_put(vm);
1362         return ERR_PTR(err);
1363
1364 err_no_resv:
1365         for_each_tile(tile, xe, id)
1366                 xe_range_fence_tree_fini(&vm->rftree[id]);
1367         kfree(vm);
1368         if (!(flags & XE_VM_FLAG_MIGRATION))
1369                 xe_device_mem_access_put(xe);
1370         return ERR_PTR(err);
1371 }
1372
1373 static void flush_async_ops(struct xe_vm *vm)
1374 {
1375         queue_work(system_unbound_wq, &vm->async_ops.work);
1376         flush_work(&vm->async_ops.work);
1377 }
1378
1379 static void vm_error_capture(struct xe_vm *vm, int err,
1380                              u32 op, u64 addr, u64 size)
1381 {
1382         struct drm_xe_vm_bind_op_error_capture capture;
1383         u64 __user *address =
1384                 u64_to_user_ptr(vm->async_ops.error_capture.addr);
1385         bool in_kthread = !current->mm;
1386
1387         capture.error = err;
1388         capture.op = op;
1389         capture.addr = addr;
1390         capture.size = size;
1391
1392         if (in_kthread) {
1393                 if (!mmget_not_zero(vm->async_ops.error_capture.mm))
1394                         goto mm_closed;
1395                 kthread_use_mm(vm->async_ops.error_capture.mm);
1396         }
1397
1398         if (copy_to_user(address, &capture, sizeof(capture)))
1399                 XE_WARN_ON("Copy to user failed");
1400
1401         if (in_kthread) {
1402                 kthread_unuse_mm(vm->async_ops.error_capture.mm);
1403                 mmput(vm->async_ops.error_capture.mm);
1404         }
1405
1406 mm_closed:
1407         wake_up_all(&vm->async_ops.error_capture.wq);
1408 }
1409
1410 static void xe_vm_close(struct xe_vm *vm)
1411 {
1412         down_write(&vm->lock);
1413         vm->size = 0;
1414         up_write(&vm->lock);
1415 }
1416
1417 void xe_vm_close_and_put(struct xe_vm *vm)
1418 {
1419         LIST_HEAD(contested);
1420         struct ww_acquire_ctx ww;
1421         struct xe_device *xe = vm->xe;
1422         struct xe_tile *tile;
1423         struct xe_vma *vma, *next_vma;
1424         struct drm_gpuva *gpuva, *next;
1425         u8 id;
1426
1427         XE_WARN_ON(vm->preempt.num_exec_queues);
1428
1429         xe_vm_close(vm);
1430         flush_async_ops(vm);
1431         if (xe_vm_in_compute_mode(vm))
1432                 flush_work(&vm->preempt.rebind_work);
1433
1434         for_each_tile(tile, xe, id) {
1435                 if (vm->q[id]) {
1436                         xe_exec_queue_kill(vm->q[id]);
1437                         xe_exec_queue_put(vm->q[id]);
1438                         vm->q[id] = NULL;
1439                 }
1440         }
1441
1442         down_write(&vm->lock);
1443         xe_vm_lock(vm, &ww, 0, false);
1444         drm_gpuvm_for_each_va_safe(gpuva, next, &vm->gpuvm) {
1445                 vma = gpuva_to_vma(gpuva);
1446
1447                 if (xe_vma_has_no_bo(vma)) {
1448                         down_read(&vm->userptr.notifier_lock);
1449                         vma->gpuva.flags |= XE_VMA_DESTROYED;
1450                         up_read(&vm->userptr.notifier_lock);
1451                 }
1452
1453                 xe_vm_remove_vma(vm, vma);
1454
1455                 /* easy case, remove from VMA? */
1456                 if (xe_vma_has_no_bo(vma) || xe_vma_bo(vma)->vm) {
1457                         list_del_init(&vma->combined_links.rebind);
1458                         xe_vma_destroy(vma, NULL);
1459                         continue;
1460                 }
1461
1462                 list_move_tail(&vma->combined_links.destroy, &contested);
1463                 vma->gpuva.flags |= XE_VMA_DESTROYED;
1464         }
1465
1466         /*
1467          * All vm operations will add shared fences to resv.
1468          * The only exception is eviction for a shared object,
1469          * but even so, the unbind when evicted would still
1470          * install a fence to resv. Hence it's safe to
1471          * destroy the pagetables immediately.
1472          */
1473         for_each_tile(tile, xe, id) {
1474                 if (vm->scratch_bo[id]) {
1475                         u32 i;
1476
1477                         xe_bo_unpin(vm->scratch_bo[id]);
1478                         xe_bo_put(vm->scratch_bo[id]);
1479                         for (i = 0; i < vm->pt_root[id]->level; i++)
1480                                 xe_pt_destroy(vm->scratch_pt[id][i], vm->flags,
1481                                               NULL);
1482                 }
1483                 if (vm->pt_root[id]) {
1484                         xe_pt_destroy(vm->pt_root[id], vm->flags, NULL);
1485                         vm->pt_root[id] = NULL;
1486                 }
1487         }
1488         xe_vm_unlock(vm, &ww);
1489
1490         /*
1491          * VM is now dead, cannot re-add nodes to vm->vmas if it's NULL
1492          * Since we hold a refcount to the bo, we can remove and free
1493          * the members safely without locking.
1494          */
1495         list_for_each_entry_safe(vma, next_vma, &contested,
1496                                  combined_links.destroy) {
1497                 list_del_init(&vma->combined_links.destroy);
1498                 xe_vma_destroy_unlocked(vma);
1499         }
1500
1501         if (vm->async_ops.error_capture.addr)
1502                 wake_up_all(&vm->async_ops.error_capture.wq);
1503
1504         XE_WARN_ON(!list_empty(&vm->extobj.list));
1505         up_write(&vm->lock);
1506
1507         mutex_lock(&xe->usm.lock);
1508         if (vm->flags & XE_VM_FLAG_FAULT_MODE)
1509                 xe->usm.num_vm_in_fault_mode--;
1510         else if (!(vm->flags & XE_VM_FLAG_MIGRATION))
1511                 xe->usm.num_vm_in_non_fault_mode--;
1512         mutex_unlock(&xe->usm.lock);
1513
1514         for_each_tile(tile, xe, id)
1515                 xe_range_fence_tree_fini(&vm->rftree[id]);
1516
1517         xe_vm_put(vm);
1518 }
1519
1520 static void vm_destroy_work_func(struct work_struct *w)
1521 {
1522         struct xe_vm *vm =
1523                 container_of(w, struct xe_vm, destroy_work);
1524         struct xe_device *xe = vm->xe;
1525         struct xe_tile *tile;
1526         u8 id;
1527         void *lookup;
1528
1529         /* xe_vm_close_and_put was not called? */
1530         XE_WARN_ON(vm->size);
1531
1532         if (!(vm->flags & XE_VM_FLAG_MIGRATION)) {
1533                 xe_device_mem_access_put(xe);
1534
1535                 if (xe->info.has_asid) {
1536                         mutex_lock(&xe->usm.lock);
1537                         lookup = xa_erase(&xe->usm.asid_to_vm, vm->usm.asid);
1538                         XE_WARN_ON(lookup != vm);
1539                         mutex_unlock(&xe->usm.lock);
1540                 }
1541         }
1542
1543         for_each_tile(tile, xe, id)
1544                 XE_WARN_ON(vm->pt_root[id]);
1545
1546         trace_xe_vm_free(vm);
1547         dma_fence_put(vm->rebind_fence);
1548         kfree(vm);
1549 }
1550
1551 static void xe_vm_free(struct drm_gpuvm *gpuvm)
1552 {
1553         struct xe_vm *vm = container_of(gpuvm, struct xe_vm, gpuvm);
1554
1555         /* To destroy the VM we need to be able to sleep */
1556         queue_work(system_unbound_wq, &vm->destroy_work);
1557 }
1558
1559 struct xe_vm *xe_vm_lookup(struct xe_file *xef, u32 id)
1560 {
1561         struct xe_vm *vm;
1562
1563         mutex_lock(&xef->vm.lock);
1564         vm = xa_load(&xef->vm.xa, id);
1565         if (vm)
1566                 xe_vm_get(vm);
1567         mutex_unlock(&xef->vm.lock);
1568
1569         return vm;
1570 }
1571
1572 u64 xe_vm_pdp4_descriptor(struct xe_vm *vm, struct xe_tile *tile)
1573 {
1574         return xe_pde_encode(vm->pt_root[tile->id]->bo, 0,
1575                              XE_CACHE_WB);
1576 }
1577
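     /*
      * Unbind a VMA from every tile it is currently bound to. When more than
      * one tile is involved, the per-tile unbind fences are combined into a
      * single dma_fence_array, and on the last op in a sequence the syncs are
      * signalled with the resulting fence.
      */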
1578 static struct dma_fence *
1579 xe_vm_unbind_vma(struct xe_vma *vma, struct xe_exec_queue *q,
1580                  struct xe_sync_entry *syncs, u32 num_syncs,
1581                  bool first_op, bool last_op)
1582 {
1583         struct xe_tile *tile;
1584         struct dma_fence *fence = NULL;
1585         struct dma_fence **fences = NULL;
1586         struct dma_fence_array *cf = NULL;
1587         struct xe_vm *vm = xe_vma_vm(vma);
1588         int cur_fence = 0, i;
1589         int number_tiles = hweight8(vma->tile_present);
1590         int err;
1591         u8 id;
1592
1593         trace_xe_vma_unbind(vma);
1594
1595         if (number_tiles > 1) {
1596                 fences = kmalloc_array(number_tiles, sizeof(*fences),
1597                                        GFP_KERNEL);
1598                 if (!fences)
1599                         return ERR_PTR(-ENOMEM);
1600         }
1601
1602         for_each_tile(tile, vm->xe, id) {
1603                 if (!(vma->tile_present & BIT(id)))
1604                         goto next;
1605
1606                 fence = __xe_pt_unbind_vma(tile, vma, q, first_op ? syncs : NULL,
1607                                            first_op ? num_syncs : 0);
1608                 if (IS_ERR(fence)) {
1609                         err = PTR_ERR(fence);
1610                         goto err_fences;
1611                 }
1612
1613                 if (fences)
1614                         fences[cur_fence++] = fence;
1615
1616 next:
1617                 if (q && vm->pt_root[id] && !list_empty(&q->multi_gt_list))
1618                         q = list_next_entry(q, multi_gt_list);
1619         }
1620
1621         if (fences) {
1622                 cf = dma_fence_array_create(number_tiles, fences,
1623                                             vm->composite_fence_ctx,
1624                                             vm->composite_fence_seqno++,
1625                                             false);
1626                 if (!cf) {
1627                         --vm->composite_fence_seqno;
1628                         err = -ENOMEM;
1629                         goto err_fences;
1630                 }
1631         }
1632
1633         if (last_op) {
1634                 for (i = 0; i < num_syncs; i++)
1635                         xe_sync_entry_signal(&syncs[i], NULL,
1636                                              cf ? &cf->base : fence);
1637         }
1638
1639         return cf ? &cf->base : !fence ? dma_fence_get_stub() : fence;
1640
1641 err_fences:
1642         if (fences) {
1643                 while (cur_fence) {
1644                         /* FIXME: Rewind the previous binds? */
1645                         dma_fence_put(fences[--cur_fence]);
1646                 }
1647                 kfree(fences);
1648         }
1649
1650         return ERR_PTR(err);
1651 }
1652
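     /*
      * Bind a VMA on every tile in its tile_mask, falling back to the VM's
      * default bind queue for a tile when no exec queue is supplied. As with
      * unbind, multiple per-tile fences are wrapped in a dma_fence_array and
      * the syncs are signalled on the last op.
      */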
1653 static struct dma_fence *
1654 xe_vm_bind_vma(struct xe_vma *vma, struct xe_exec_queue *q,
1655                struct xe_sync_entry *syncs, u32 num_syncs,
1656                bool first_op, bool last_op)
1657 {
1658         struct xe_tile *tile;
1659         struct dma_fence *fence;
1660         struct dma_fence **fences = NULL;
1661         struct dma_fence_array *cf = NULL;
1662         struct xe_vm *vm = xe_vma_vm(vma);
1663         int cur_fence = 0, i;
1664         int number_tiles = hweight8(vma->tile_mask);
1665         int err;
1666         u8 id;
1667
1668         trace_xe_vma_bind(vma);
1669
1670         if (number_tiles > 1) {
1671                 fences = kmalloc_array(number_tiles, sizeof(*fences),
1672                                        GFP_KERNEL);
1673                 if (!fences)
1674                         return ERR_PTR(-ENOMEM);
1675         }
1676
1677         for_each_tile(tile, vm->xe, id) {
1678                 if (!(vma->tile_mask & BIT(id)))
1679                         goto next;
1680
1681                 fence = __xe_pt_bind_vma(tile, vma, q ? q : vm->q[id],
1682                                          first_op ? syncs : NULL,
1683                                          first_op ? num_syncs : 0,
1684                                          vma->tile_present & BIT(id));
1685                 if (IS_ERR(fence)) {
1686                         err = PTR_ERR(fence);
1687                         goto err_fences;
1688                 }
1689
1690                 if (fences)
1691                         fences[cur_fence++] = fence;
1692
1693 next:
1694                 if (q && vm->pt_root[id] && !list_empty(&q->multi_gt_list))
1695                         q = list_next_entry(q, multi_gt_list);
1696         }
1697
1698         if (fences) {
1699                 cf = dma_fence_array_create(number_tiles, fences,
1700                                             vm->composite_fence_ctx,
1701                                             vm->composite_fence_seqno++,
1702                                             false);
1703                 if (!cf) {
1704                         --vm->composite_fence_seqno;
1705                         err = -ENOMEM;
1706                         goto err_fences;
1707                 }
1708         }
1709
1710         if (last_op) {
1711                 for (i = 0; i < num_syncs; i++)
1712                         xe_sync_entry_signal(&syncs[i], NULL,
1713                                              cf ? &cf->base : fence);
1714         }
1715
1716         return cf ? &cf->base : fence;
1717
1718 err_fences:
1719         if (fences) {
1720                 while (cur_fence) {
1721                         /* FIXME: Rewind the previous binds? */
1722                         dma_fence_put(fences[--cur_fence]);
1723                 }
1724                 kfree(fences);
1725         }
1726
1727         return ERR_PTR(err);
1728 }
1729
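     /*
      * Fence handed back through the user syncs for asynchronous bind
      * operations. It is signalled from a callback on the underlying
      * bind/unbind fence (wait_fence) and propagates that fence's error
      * status.
      */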
1730 struct async_op_fence {
1731         struct dma_fence fence;
1732         struct dma_fence *wait_fence;
1733         struct dma_fence_cb cb;
1734         struct xe_vm *vm;
1735         wait_queue_head_t wq;
1736         bool started;
1737 };
1738
1739 static const char *async_op_fence_get_driver_name(struct dma_fence *dma_fence)
1740 {
1741         return "xe";
1742 }
1743
1744 static const char *
1745 async_op_fence_get_timeline_name(struct dma_fence *dma_fence)
1746 {
1747         return "async_op_fence";
1748 }
1749
1750 static const struct dma_fence_ops async_op_fence_ops = {
1751         .get_driver_name = async_op_fence_get_driver_name,
1752         .get_timeline_name = async_op_fence_get_timeline_name,
1753 };
1754
1755 static void async_op_fence_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
1756 {
1757         struct async_op_fence *afence =
1758                 container_of(cb, struct async_op_fence, cb);
1759
1760         afence->fence.error = afence->wait_fence->error;
1761         dma_fence_signal(&afence->fence);
1762         xe_vm_put(afence->vm);
1763         dma_fence_put(afence->wait_fence);
1764         dma_fence_put(&afence->fence);
1765 }
1766
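     /*
      * Attach the async op fence to the real bind/unbind fence. If that fence
      * has already signalled (dma_fence_add_callback() returns -ENOENT), the
      * error is propagated and the async fence is signalled immediately, with
      * the references taken for the callback dropped again.
      */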
1767 static void add_async_op_fence_cb(struct xe_vm *vm,
1768                                   struct dma_fence *fence,
1769                                   struct async_op_fence *afence)
1770 {
1771         int ret;
1772
1773         if (!xe_vm_no_dma_fences(vm)) {
1774                 afence->started = true;
1775                 smp_wmb();
1776                 wake_up_all(&afence->wq);
1777         }
1778
1779         afence->wait_fence = dma_fence_get(fence);
1780         afence->vm = xe_vm_get(vm);
1781         dma_fence_get(&afence->fence);
1782         ret = dma_fence_add_callback(fence, &afence->cb, async_op_fence_cb);
1783         if (ret == -ENOENT) {
1784                 afence->fence.error = afence->wait_fence->error;
1785                 dma_fence_signal(&afence->fence);
1786         }
1787         if (ret) {
1788                 xe_vm_put(vm);
1789                 dma_fence_put(afence->wait_fence);
1790                 dma_fence_put(&afence->fence);
1791         }
1792         XE_WARN_ON(ret && ret != -ENOENT);
1793 }
1794
1795 int xe_vm_async_fence_wait_start(struct dma_fence *fence)
1796 {
1797         if (fence->ops == &async_op_fence_ops) {
1798                 struct async_op_fence *afence =
1799                         container_of(fence, struct async_op_fence, fence);
1800
1801                 XE_WARN_ON(xe_vm_no_dma_fences(afence->vm));
1802
1803                 smp_rmb();
1804                 return wait_event_interruptible(afence->wq, afence->started);
1805         }
1806
1807         return 0;
1808 }
1809
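     /*
      * Common bind helper: either perform the bind now via xe_vm_bind_vma(),
      * or, for deferred binds in fault mode, simply signal the syncs with a
      * stub fence. The optional async fence is attached to whichever fence
      * results.
      */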
1810 static int __xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma,
1811                         struct xe_exec_queue *q, struct xe_sync_entry *syncs,
1812                         u32 num_syncs, struct async_op_fence *afence,
1813                         bool immediate, bool first_op, bool last_op)
1814 {
1815         struct dma_fence *fence;
1816
1817         xe_vm_assert_held(vm);
1818
1819         if (immediate) {
1820                 fence = xe_vm_bind_vma(vma, q, syncs, num_syncs, first_op,
1821                                        last_op);
1822                 if (IS_ERR(fence))
1823                         return PTR_ERR(fence);
1824         } else {
1825                 int i;
1826
1827                 XE_WARN_ON(!xe_vm_in_fault_mode(vm));
1828
1829                 fence = dma_fence_get_stub();
1830                 if (last_op) {
1831                         for (i = 0; i < num_syncs; i++)
1832                                 xe_sync_entry_signal(&syncs[i], NULL, fence);
1833                 }
1834         }
1835         if (afence)
1836                 add_async_op_fence_cb(vm, fence, afence);
1837
1838         dma_fence_put(fence);
1839         return 0;
1840 }
1841
1842 static int xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma, struct xe_exec_queue *q,
1843                       struct xe_bo *bo, struct xe_sync_entry *syncs,
1844                       u32 num_syncs, struct async_op_fence *afence,
1845                       bool immediate, bool first_op, bool last_op)
1846 {
1847         int err;
1848
1849         xe_vm_assert_held(vm);
1850         xe_bo_assert_held(bo);
1851
1852         if (bo && immediate) {
1853                 err = xe_bo_validate(bo, vm, true);
1854                 if (err)
1855                         return err;
1856         }
1857
1858         return __xe_vm_bind(vm, vma, q, syncs, num_syncs, afence, immediate,
1859                             first_op, last_op);
1860 }
1861
1862 static int xe_vm_unbind(struct xe_vm *vm, struct xe_vma *vma,
1863                         struct xe_exec_queue *q, struct xe_sync_entry *syncs,
1864                         u32 num_syncs, struct async_op_fence *afence,
1865                         bool first_op, bool last_op)
1866 {
1867         struct dma_fence *fence;
1868
1869         xe_vm_assert_held(vm);
1870         xe_bo_assert_held(xe_vma_bo(vma));
1871
1872         fence = xe_vm_unbind_vma(vma, q, syncs, num_syncs, first_op, last_op);
1873         if (IS_ERR(fence))
1874                 return PTR_ERR(fence);
1875         if (afence)
1876                 add_async_op_fence_cb(vm, fence, afence);
1877
1878         xe_vma_destroy(vma, fence);
1879         dma_fence_put(fence);
1880
1881         return 0;
1882 }
1883
1884 static int vm_set_error_capture_address(struct xe_device *xe, struct xe_vm *vm,
1885                                         u64 value)
1886 {
1887         if (XE_IOCTL_DBG(xe, !value))
1888                 return -EINVAL;
1889
1890         if (XE_IOCTL_DBG(xe, !(vm->flags & XE_VM_FLAG_ASYNC_BIND_OPS)))
1891                 return -EOPNOTSUPP;
1892
1893         if (XE_IOCTL_DBG(xe, vm->async_ops.error_capture.addr))
1894                 return -EOPNOTSUPP;
1895
1896         vm->async_ops.error_capture.mm = current->mm;
1897         vm->async_ops.error_capture.addr = value;
1898         init_waitqueue_head(&vm->async_ops.error_capture.wq);
1899
1900         return 0;
1901 }
1902
1903 typedef int (*xe_vm_set_property_fn)(struct xe_device *xe, struct xe_vm *vm,
1904                                      u64 value);
1905
1906 static const xe_vm_set_property_fn vm_set_property_funcs[] = {
1907         [XE_VM_PROPERTY_BIND_OP_ERROR_CAPTURE_ADDRESS] =
1908                 vm_set_error_capture_address,
1909 };
1910
1911 static int vm_user_ext_set_property(struct xe_device *xe, struct xe_vm *vm,
1912                                     u64 extension)
1913 {
1914         u64 __user *address = u64_to_user_ptr(extension);
1915         struct drm_xe_ext_vm_set_property ext;
1916         int err;
1917
1918         err = __copy_from_user(&ext, address, sizeof(ext));
1919         if (XE_IOCTL_DBG(xe, err))
1920                 return -EFAULT;
1921
1922         if (XE_IOCTL_DBG(xe, ext.property >=
1923                          ARRAY_SIZE(vm_set_property_funcs)) ||
1924             XE_IOCTL_DBG(xe, ext.pad) ||
1925             XE_IOCTL_DBG(xe, ext.reserved[0] || ext.reserved[1]))
1926                 return -EINVAL;
1927
1928         return vm_set_property_funcs[ext.property](xe, vm, ext.value);
1929 }
1930
1931 typedef int (*xe_vm_user_extension_fn)(struct xe_device *xe, struct xe_vm *vm,
1932                                        u64 extension);
1933
1934 static const xe_vm_user_extension_fn vm_user_extension_funcs[] = {
1935         [XE_VM_EXTENSION_SET_PROPERTY] = vm_user_ext_set_property,
1936 };
1937
1938 #define MAX_USER_EXTENSIONS     16
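     /*
      * Walk the chain of user extensions, dispatching each extension to its
      * handler and capping the chain length at MAX_USER_EXTENSIONS.
      */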
1939 static int vm_user_extensions(struct xe_device *xe, struct xe_vm *vm,
1940                               u64 extensions, int ext_number)
1941 {
1942         u64 __user *address = u64_to_user_ptr(extensions);
1943         struct xe_user_extension ext;
1944         int err;
1945
1946         if (XE_IOCTL_DBG(xe, ext_number >= MAX_USER_EXTENSIONS))
1947                 return -E2BIG;
1948
1949         err = __copy_from_user(&ext, address, sizeof(ext));
1950         if (XE_IOCTL_DBG(xe, err))
1951                 return -EFAULT;
1952
1953         if (XE_IOCTL_DBG(xe, ext.pad) ||
1954             XE_IOCTL_DBG(xe, ext.name >=
1955                          ARRAY_SIZE(vm_user_extension_funcs)))
1956                 return -EINVAL;
1957
1958         err = vm_user_extension_funcs[ext.name](xe, vm, extensions);
1959         if (XE_IOCTL_DBG(xe, err))
1960                 return err;
1961
1962         if (ext.next_extension)
1963                 return vm_user_extensions(xe, vm, ext.next_extension,
1964                                           ++ext_number);
1965
1966         return 0;
1967 }
1968
1969 #define ALL_DRM_XE_VM_CREATE_FLAGS (DRM_XE_VM_CREATE_SCRATCH_PAGE | \
1970                                     DRM_XE_VM_CREATE_COMPUTE_MODE | \
1971                                     DRM_XE_VM_CREATE_ASYNC_BIND_OPS | \
1972                                     DRM_XE_VM_CREATE_FAULT_MODE)
1973
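     /*
      * VM_CREATE IOCTL: validate the creation flags and their mutual
      * constraints, create the VM, process any user extensions, allocate the
      * per-file VM id and, on devices with ASID support, an ASID for the VM.
      */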
1974 int xe_vm_create_ioctl(struct drm_device *dev, void *data,
1975                        struct drm_file *file)
1976 {
1977         struct xe_device *xe = to_xe_device(dev);
1978         struct xe_file *xef = to_xe_file(file);
1979         struct drm_xe_vm_create *args = data;
1980         struct xe_vm *vm;
1981         u32 id, asid;
1982         int err;
1983         u32 flags = 0;
1984
1985         if (XE_WA(xe_root_mmio_gt(xe), 14016763929))
1986                 args->flags |= DRM_XE_VM_CREATE_SCRATCH_PAGE;
1987
1988         if (XE_IOCTL_DBG(xe, args->flags & DRM_XE_VM_CREATE_FAULT_MODE &&
1989                          !xe->info.supports_usm))
1990                 return -EINVAL;
1991
1992         if (XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
1993                 return -EINVAL;
1994
1995         if (XE_IOCTL_DBG(xe, args->flags & ~ALL_DRM_XE_VM_CREATE_FLAGS))
1996                 return -EINVAL;
1997
1998         if (XE_IOCTL_DBG(xe, args->flags & DRM_XE_VM_CREATE_SCRATCH_PAGE &&
1999                          args->flags & DRM_XE_VM_CREATE_FAULT_MODE))
2000                 return -EINVAL;
2001
2002         if (XE_IOCTL_DBG(xe, args->flags & DRM_XE_VM_CREATE_COMPUTE_MODE &&
2003                          args->flags & DRM_XE_VM_CREATE_FAULT_MODE))
2004                 return -EINVAL;
2005
2006         if (XE_IOCTL_DBG(xe, args->flags & DRM_XE_VM_CREATE_FAULT_MODE &&
2007                          xe_device_in_non_fault_mode(xe)))
2008                 return -EINVAL;
2009
2010         if (XE_IOCTL_DBG(xe, !(args->flags & DRM_XE_VM_CREATE_FAULT_MODE) &&
2011                          xe_device_in_fault_mode(xe)))
2012                 return -EINVAL;
2013
2014         if (args->flags & DRM_XE_VM_CREATE_SCRATCH_PAGE)
2015                 flags |= XE_VM_FLAG_SCRATCH_PAGE;
2016         if (args->flags & DRM_XE_VM_CREATE_COMPUTE_MODE)
2017                 flags |= XE_VM_FLAG_COMPUTE_MODE;
2018         if (args->flags & DRM_XE_VM_CREATE_ASYNC_BIND_OPS)
2019                 flags |= XE_VM_FLAG_ASYNC_BIND_OPS;
2020         if (args->flags & DRM_XE_VM_CREATE_FAULT_MODE)
2021                 flags |= XE_VM_FLAG_FAULT_MODE;
2022
2023         vm = xe_vm_create(xe, flags);
2024         if (IS_ERR(vm))
2025                 return PTR_ERR(vm);
2026
2027         if (args->extensions) {
2028                 err = vm_user_extensions(xe, vm, args->extensions, 0);
2029                 if (XE_IOCTL_DBG(xe, err)) {
2030                         xe_vm_close_and_put(vm);
2031                         return err;
2032                 }
2033         }
2034
2035         mutex_lock(&xef->vm.lock);
2036         err = xa_alloc(&xef->vm.xa, &id, vm, xa_limit_32b, GFP_KERNEL);
2037         mutex_unlock(&xef->vm.lock);
2038         if (err) {
2039                 xe_vm_close_and_put(vm);
2040                 return err;
2041         }
2042
2043         if (xe->info.has_asid) {
2044                 mutex_lock(&xe->usm.lock);
2045                 err = xa_alloc_cyclic(&xe->usm.asid_to_vm, &asid, vm,
2046                                       XA_LIMIT(0, XE_MAX_ASID - 1),
2047                                       &xe->usm.next_asid, GFP_KERNEL);
2048                 mutex_unlock(&xe->usm.lock);
2049                 if (err) {
2050                         xe_vm_close_and_put(vm);
2051                         return err;
2052                 }
2053                 vm->usm.asid = asid;
2054         }
2055
2056         args->vm_id = id;
2057
2058 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG_MEM)
2059         /* Warning: Security issue - never enable by default */
2060         args->reserved[0] = xe_bo_main_addr(vm->pt_root[0]->bo, XE_PAGE_SIZE);
2061 #endif
2062
2063         return 0;
2064 }
2065
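     /*
      * VM_DESTROY IOCTL: remove the VM from the file's xarray, refusing while
      * compute exec queues are still attached, then close and put the VM.
      */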
2066 int xe_vm_destroy_ioctl(struct drm_device *dev, void *data,
2067                         struct drm_file *file)
2068 {
2069         struct xe_device *xe = to_xe_device(dev);
2070         struct xe_file *xef = to_xe_file(file);
2071         struct drm_xe_vm_destroy *args = data;
2072         struct xe_vm *vm;
2073         int err = 0;
2074
2075         if (XE_IOCTL_DBG(xe, args->pad) ||
2076             XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
2077                 return -EINVAL;
2078
2079         mutex_lock(&xef->vm.lock);
2080         vm = xa_load(&xef->vm.xa, args->vm_id);
2081         if (XE_IOCTL_DBG(xe, !vm))
2082                 err = -ENOENT;
2083         else if (XE_IOCTL_DBG(xe, vm->preempt.num_exec_queues))
2084                 err = -EBUSY;
2085         else
2086                 xa_erase(&xef->vm.xa, args->vm_id);
2087         mutex_unlock(&xef->vm.lock);
2088
2089         if (!err)
2090                 xe_vm_close_and_put(vm);
2091
2092         return err;
2093 }
2094
2095 static const u32 region_to_mem_type[] = {
2096         XE_PL_TT,
2097         XE_PL_VRAM0,
2098         XE_PL_VRAM1,
2099 };
2100
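     /*
      * Prefetch: migrate the backing BO (if any) to the requested memory
      * region, then rebind the VMA only if a tile mapping is missing or has
      * been invalidated; otherwise there is nothing to do and the fences are
      * signalled directly.
      */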
2101 static int xe_vm_prefetch(struct xe_vm *vm, struct xe_vma *vma,
2102                           struct xe_exec_queue *q, u32 region,
2103                           struct xe_sync_entry *syncs, u32 num_syncs,
2104                           struct async_op_fence *afence, bool first_op,
2105                           bool last_op)
2106 {
2107         int err;
2108
2109         XE_WARN_ON(region >= ARRAY_SIZE(region_to_mem_type));
2110
2111         if (!xe_vma_has_no_bo(vma)) {
2112                 err = xe_bo_migrate(xe_vma_bo(vma), region_to_mem_type[region]);
2113                 if (err)
2114                         return err;
2115         }
2116
2117         if (vma->tile_mask != (vma->tile_present & ~vma->usm.tile_invalidated)) {
2118                 return xe_vm_bind(vm, vma, q, xe_vma_bo(vma), syncs, num_syncs,
2119                                   afence, true, first_op, last_op);
2120         } else {
2121                 int i;
2122
2123                 /* Nothing to do, signal fences now */
2124                 if (last_op) {
2125                         for (i = 0; i < num_syncs; i++)
2126                                 xe_sync_entry_signal(&syncs[i], NULL,
2127                                                      dma_fence_get_stub());
2128                 }
2129                 if (afence)
2130                         dma_fence_signal(&afence->fence);
2131                 return 0;
2132         }
2133 }
2134
2135 #define VM_BIND_OP(op)  (op & 0xffff)
2136
2137 struct ttm_buffer_object *xe_vm_ttm_bo(struct xe_vm *vm)
2138 {
2139         int idx = vm->flags & XE_VM_FLAG_MIGRATION ?
2140                 XE_VM_FLAG_TILE_ID(vm->flags) : 0;
2141
2142         /* Safe to use index 0 as all BOs in the VM share a single dma-resv lock */
2143         return &vm->pt_root[idx]->bo->ttm;
2144 }
2145
2146 static void xe_vm_tv_populate(struct xe_vm *vm, struct ttm_validate_buffer *tv)
2147 {
2148         tv->num_shared = 1;
2149         tv->bo = xe_vm_ttm_bo(vm);
2150 }
2151
2152 static void vm_set_async_error(struct xe_vm *vm, int err)
2153 {
2154         lockdep_assert_held(&vm->lock);
2155         vm->async_ops.error = err;
2156 }
2157
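     /*
      * Sanity check a bind IOCTL operation against the existing VMAs before
      * any GPUVA ops are created: synchronous maps may not overlap an existing
      * VMA, and unmap/prefetch must hit an existing mapping (an exact range
      * match when synchronous).
      */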
2158 static int vm_bind_ioctl_lookup_vma(struct xe_vm *vm, struct xe_bo *bo,
2159                                     u64 addr, u64 range, u32 op)
2160 {
2161         struct xe_device *xe = vm->xe;
2162         struct xe_vma *vma;
2163         bool async = !!(op & XE_VM_BIND_FLAG_ASYNC);
2164
2165         lockdep_assert_held(&vm->lock);
2166
2167         switch (VM_BIND_OP(op)) {
2168         case XE_VM_BIND_OP_MAP:
2169         case XE_VM_BIND_OP_MAP_USERPTR:
2170                 vma = xe_vm_find_overlapping_vma(vm, addr, range);
2171                 if (XE_IOCTL_DBG(xe, vma && !async))
2172                         return -EBUSY;
2173                 break;
2174         case XE_VM_BIND_OP_UNMAP:
2175         case XE_VM_BIND_OP_PREFETCH:
2176                 vma = xe_vm_find_overlapping_vma(vm, addr, range);
2177                 if (XE_IOCTL_DBG(xe, !vma))
2178                         /* Not an actual error; the IOCTL cleans up and returns 0 */
2179                         return -ENODATA;
2180                 if (XE_IOCTL_DBG(xe, (xe_vma_start(vma) != addr ||
2181                                       xe_vma_end(vma) != addr + range) && !async))
2182                         return -EINVAL;
2183                 break;
2184         case XE_VM_BIND_OP_UNMAP_ALL:
2185                 if (XE_IOCTL_DBG(xe, list_empty(&bo->ttm.base.gpuva.list)))
2186                         /* Not an actual error; the IOCTL cleans up and returns 0 */
2187                         return -ENODATA;
2188                 break;
2189         default:
2190                 XE_WARN_ON("NOT POSSIBLE");
2191                 return -EINVAL;
2192         }
2193
2194         return 0;
2195 }
2196
2197 static void prep_vma_destroy(struct xe_vm *vm, struct xe_vma *vma,
2198                              bool post_commit)
2199 {
2200         down_read(&vm->userptr.notifier_lock);
2201         vma->gpuva.flags |= XE_VMA_DESTROYED;
2202         up_read(&vm->userptr.notifier_lock);
2203         if (post_commit)
2204                 xe_vm_remove_vma(vm, vma);
2205 }
2206
2207 #undef ULL
2208 #define ULL     unsigned long long
2209
2210 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG_VM)
2211 static void print_op(struct xe_device *xe, struct drm_gpuva_op *op)
2212 {
2213         struct xe_vma *vma;
2214
2215         switch (op->op) {
2216         case DRM_GPUVA_OP_MAP:
2217                 vm_dbg(&xe->drm, "MAP: addr=0x%016llx, range=0x%016llx",
2218                        (ULL)op->map.va.addr, (ULL)op->map.va.range);
2219                 break;
2220         case DRM_GPUVA_OP_REMAP:
2221                 vma = gpuva_to_vma(op->remap.unmap->va);
2222                 vm_dbg(&xe->drm, "REMAP:UNMAP: addr=0x%016llx, range=0x%016llx, keep=%d",
2223                        (ULL)xe_vma_start(vma), (ULL)xe_vma_size(vma),
2224                        op->unmap.keep ? 1 : 0);
2225                 if (op->remap.prev)
2226                         vm_dbg(&xe->drm,
2227                                "REMAP:PREV: addr=0x%016llx, range=0x%016llx",
2228                                (ULL)op->remap.prev->va.addr,
2229                                (ULL)op->remap.prev->va.range);
2230                 if (op->remap.next)
2231                         vm_dbg(&xe->drm,
2232                                "REMAP:NEXT: addr=0x%016llx, range=0x%016llx",
2233                                (ULL)op->remap.next->va.addr,
2234                                (ULL)op->remap.next->va.range);
2235                 break;
2236         case DRM_GPUVA_OP_UNMAP:
2237                 vma = gpuva_to_vma(op->unmap.va);
2238                 vm_dbg(&xe->drm, "UNMAP: addr=0x%016llx, range=0x%016llx, keep=%d",
2239                        (ULL)xe_vma_start(vma), (ULL)xe_vma_size(vma),
2240                        op->unmap.keep ? 1 : 0);
2241                 break;
2242         case DRM_GPUVA_OP_PREFETCH:
2243                 vma = gpuva_to_vma(op->prefetch.va);
2244                 vm_dbg(&xe->drm, "PREFETCH: addr=0x%016llx, range=0x%016llx",
2245                        (ULL)xe_vma_start(vma), (ULL)xe_vma_size(vma));
2246                 break;
2247         default:
2248                 XE_WARN_ON("NOT POSSIBLE");
2249         }
2250 }
2251 #else
2252 static void print_op(struct xe_device *xe, struct drm_gpuva_op *op)
2253 {
2254 }
2255 #endif
2256
2257 /*
2258  * Create the operations list from the IOCTL arguments, and set up operation
2259  * fields so the parse and commit steps are decoupled from them. This can fail.
2260  */
2261 static struct drm_gpuva_ops *
2262 vm_bind_ioctl_ops_create(struct xe_vm *vm, struct xe_bo *bo,
2263                          u64 bo_offset_or_userptr, u64 addr, u64 range,
2264                          u32 operation, u8 tile_mask, u32 region)
2265 {
2266         struct drm_gem_object *obj = bo ? &bo->ttm.base : NULL;
2267         struct ww_acquire_ctx ww;
2268         struct drm_gpuva_ops *ops;
2269         struct drm_gpuva_op *__op;
2270         struct xe_vma_op *op;
2271         struct drm_gpuvm_bo *vm_bo;
2272         int err;
2273
2274         lockdep_assert_held_write(&vm->lock);
2275
2276         vm_dbg(&vm->xe->drm,
2277                "op=%d, addr=0x%016llx, range=0x%016llx, bo_offset_or_userptr=0x%016llx",
2278                VM_BIND_OP(operation), (ULL)addr, (ULL)range,
2279                (ULL)bo_offset_or_userptr);
2280
2281         switch (VM_BIND_OP(operation)) {
2282         case XE_VM_BIND_OP_MAP:
2283         case XE_VM_BIND_OP_MAP_USERPTR:
2284                 ops = drm_gpuvm_sm_map_ops_create(&vm->gpuvm, addr, range,
2285                                                   obj, bo_offset_or_userptr);
2286                 if (IS_ERR(ops))
2287                         return ops;
2288
2289                 drm_gpuva_for_each_op(__op, ops) {
2290                         struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2291
2292                         op->tile_mask = tile_mask;
2293                         op->map.immediate =
2294                                 operation & XE_VM_BIND_FLAG_IMMEDIATE;
2295                         op->map.read_only =
2296                                 operation & XE_VM_BIND_FLAG_READONLY;
2297                         op->map.is_null = operation & XE_VM_BIND_FLAG_NULL;
2298                 }
2299                 break;
2300         case XE_VM_BIND_OP_UNMAP:
2301                 ops = drm_gpuvm_sm_unmap_ops_create(&vm->gpuvm, addr, range);
2302                 if (IS_ERR(ops))
2303                         return ops;
2304
2305                 drm_gpuva_for_each_op(__op, ops) {
2306                         struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2307
2308                         op->tile_mask = tile_mask;
2309                 }
2310                 break;
2311         case XE_VM_BIND_OP_PREFETCH:
2312                 ops = drm_gpuvm_prefetch_ops_create(&vm->gpuvm, addr, range);
2313                 if (IS_ERR(ops))
2314                         return ops;
2315
2316                 drm_gpuva_for_each_op(__op, ops) {
2317                         struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2318
2319                         op->tile_mask = tile_mask;
2320                         op->prefetch.region = region;
2321                 }
2322                 break;
2323         case XE_VM_BIND_OP_UNMAP_ALL:
2324                 XE_WARN_ON(!bo);
2325
2326                 err = xe_bo_lock(bo, &ww, 0, true);
2327                 if (err)
2328                         return ERR_PTR(err);
2329
2330                 vm_bo = drm_gpuvm_bo_find(&vm->gpuvm, obj);
2331                 if (!vm_bo) {
2332                         /* No mappings for this BO; nothing to unmap */
                             xe_bo_unlock(bo, &ww);
                             ops = ERR_PTR(-ENODATA);
                             break;
                     }
2333
2334                 ops = drm_gpuvm_bo_unmap_ops_create(vm_bo);
2335                 drm_gpuvm_bo_put(vm_bo);
2336                 xe_bo_unlock(bo, &ww);
2337                 if (IS_ERR(ops))
2338                         return ops;
2339
2340                 drm_gpuva_for_each_op(__op, ops) {
2341                         struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2342
2343                         op->tile_mask = tile_mask;
2344                 }
2345                 break;
2346         default:
2347                 XE_WARN_ON("NOT POSSIBLE");
2348                 ops = ERR_PTR(-EINVAL);
2349         }
2350
2351 #ifdef TEST_VM_ASYNC_OPS_ERROR
2352         if (operation & FORCE_ASYNC_OP_ERROR) {
2353                 op = list_first_entry_or_null(&ops->list, struct xe_vma_op,
2354                                               base.entry);
2355                 if (op)
2356                         op->inject_error = true;
2357         }
2358 #endif
2359
2360         if (!IS_ERR(ops))
2361                 drm_gpuva_for_each_op(__op, ops)
2362                         print_op(vm->xe, __op);
2363
2364         return ops;
2365 }
2366
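     /*
      * Create the xe_vma backing a GPUVA map op. Userptr VMAs have their pages
      * pinned up front; VMAs backed by external (non-VM-private) BOs are added
      * to the VM's extobj list and get preempt fences installed. On failure
      * the partially constructed VMA is destroyed again.
      */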
2367 static struct xe_vma *new_vma(struct xe_vm *vm, struct drm_gpuva_op_map *op,
2368                               u8 tile_mask, bool read_only, bool is_null)
2369 {
2370         struct xe_bo *bo = op->gem.obj ? gem_to_xe_bo(op->gem.obj) : NULL;
2371         struct xe_vma *vma;
2372         struct ww_acquire_ctx ww;
2373         int err;
2374
2375         lockdep_assert_held_write(&vm->lock);
2376
2377         if (bo) {
2378                 err = xe_bo_lock(bo, &ww, 0, true);
2379                 if (err)
2380                         return ERR_PTR(err);
2381         }
2382         vma = xe_vma_create(vm, bo, op->gem.offset,
2383                             op->va.addr, op->va.addr +
2384                             op->va.range - 1, read_only, is_null,
2385                             tile_mask);
2386         if (bo)
2387                 xe_bo_unlock(bo, &ww);
2388
2389         if (xe_vma_is_userptr(vma)) {
2390                 err = xe_vma_userptr_pin_pages(vma);
2391                 if (err) {
2392                         prep_vma_destroy(vm, vma, false);
2393                         xe_vma_destroy_unlocked(vma);
2394                         return ERR_PTR(err);
2395                 }
2396         } else if (!xe_vma_has_no_bo(vma) && !bo->vm) {
2397                 vm_insert_extobj(vm, vma);
2398                 err = add_preempt_fences(vm, bo);
2399                 if (err) {
2400                         prep_vma_destroy(vm, vma, false);
2401                         xe_vma_destroy_unlocked(vma);
2402                         return ERR_PTR(err);
2403                 }
2404         }
2405
2406         return vma;
2407 }
2408
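     /*
      * Helpers for the VMA's recorded maximum PTE size (tracked in the gpuva
      * flags). On remap this is used to decide whether the untouched prev/next
      * portion is aligned to that size and can therefore skip the rebind
      * (skip_prev / skip_next).
      */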
2409 static u64 xe_vma_max_pte_size(struct xe_vma *vma)
2410 {
2411         if (vma->gpuva.flags & XE_VMA_PTE_1G)
2412                 return SZ_1G;
2413         else if (vma->gpuva.flags & XE_VMA_PTE_2M)
2414                 return SZ_2M;
2415
2416         return SZ_4K;
2417 }
2418
2419 static u64 xe_vma_set_pte_size(struct xe_vma *vma, u64 size)
2420 {
2421         switch (size) {
2422         case SZ_1G:
2423                 vma->gpuva.flags |= XE_VMA_PTE_1G;
2424                 break;
2425         case SZ_2M:
2426                 vma->gpuva.flags |= XE_VMA_PTE_2M;
2427                 break;
2428         }
2429
2430         return SZ_4K;
2431 }
2432
2433 /*
2434  * Parse operations list and create any resources needed for the operations
2435  * prior to fully committing to the operations. This setup can fail.
2436  */
2437 static int vm_bind_ioctl_ops_parse(struct xe_vm *vm, struct xe_exec_queue *q,
2438                                    struct drm_gpuva_ops **ops, int num_ops_list,
2439                                    struct xe_sync_entry *syncs, u32 num_syncs,
2440                                    struct list_head *ops_list, bool async)
2441 {
2442         struct xe_vma_op *last_op = NULL;
2443         struct list_head *async_list = NULL;
2444         struct async_op_fence *fence = NULL;
2445         int err, i;
2446
2447         lockdep_assert_held_write(&vm->lock);
2448         XE_WARN_ON(num_ops_list > 1 && !async);
2449
2450         if (num_syncs && async) {
2451                 u64 seqno;
2452
2453                 fence = kmalloc(sizeof(*fence), GFP_KERNEL);
2454                 if (!fence)
2455                         return -ENOMEM;
2456
2457                 seqno = q ? ++q->bind.fence_seqno : ++vm->async_ops.fence.seqno;
2458                 dma_fence_init(&fence->fence, &async_op_fence_ops,
2459                                &vm->async_ops.lock, q ? q->bind.fence_ctx :
2460                                vm->async_ops.fence.context, seqno);
2461
2462                 if (!xe_vm_no_dma_fences(vm)) {
2463                         fence->vm = vm;
2464                         fence->started = false;
2465                         init_waitqueue_head(&fence->wq);
2466                 }
2467         }
2468
2469         for (i = 0; i < num_ops_list; ++i) {
2470                 struct drm_gpuva_ops *__ops = ops[i];
2471                 struct drm_gpuva_op *__op;
2472
2473                 drm_gpuva_for_each_op(__op, __ops) {
2474                         struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2475                         bool first = !async_list;
2476
2477                         XE_WARN_ON(!first && !async);
2478
2479                         INIT_LIST_HEAD(&op->link);
2480                         if (first)
2481                                 async_list = ops_list;
2482                         list_add_tail(&op->link, async_list);
2483
2484                         if (first) {
2485                                 op->flags |= XE_VMA_OP_FIRST;
2486                                 op->num_syncs = num_syncs;
2487                                 op->syncs = syncs;
2488                         }
2489
2490                         op->q = q;
2491
2492                         switch (op->base.op) {
2493                         case DRM_GPUVA_OP_MAP:
2494                         {
2495                                 struct xe_vma *vma;
2496
2497                                 vma = new_vma(vm, &op->base.map,
2498                                               op->tile_mask, op->map.read_only,
2499                                               op->map.is_null);
2500                                 if (IS_ERR(vma)) {
2501                                         err = PTR_ERR(vma);
2502                                         goto free_fence;
2503                                 }
2504
2505                                 op->map.vma = vma;
2506                                 break;
2507                         }
2508                         case DRM_GPUVA_OP_REMAP:
2509                         {
2510                                 struct xe_vma *old =
2511                                         gpuva_to_vma(op->base.remap.unmap->va);
2512
2513                                 op->remap.start = xe_vma_start(old);
2514                                 op->remap.range = xe_vma_size(old);
2515
2516                                 if (op->base.remap.prev) {
2517                                         struct xe_vma *vma;
2518                                         bool read_only =
2519                                                 op->base.remap.unmap->va->flags &
2520                                                 XE_VMA_READ_ONLY;
2521                                         bool is_null =
2522                                                 op->base.remap.unmap->va->flags &
2523                                                 DRM_GPUVA_SPARSE;
2524
2525                                         vma = new_vma(vm, op->base.remap.prev,
2526                                                       op->tile_mask, read_only,
2527                                                       is_null);
2528                                         if (IS_ERR(vma)) {
2529                                                 err = PTR_ERR(vma);
2530                                                 goto free_fence;
2531                                         }
2532
2533                                         op->remap.prev = vma;
2534
2535                                         /*
2536                                          * Userptr creates a new SG mapping so
2537                                          * we must also rebind.
2538                                          */
2539                                         op->remap.skip_prev = !xe_vma_is_userptr(old) &&
2540                                                 IS_ALIGNED(xe_vma_end(vma),
2541                                                            xe_vma_max_pte_size(old));
2542                                         if (op->remap.skip_prev) {
2543                                                 xe_vma_set_pte_size(vma, xe_vma_max_pte_size(old));
2544                                                 op->remap.range -=
2545                                                         xe_vma_end(vma) -
2546                                                         xe_vma_start(old);
2547                                                 op->remap.start = xe_vma_end(vma);
2548                                         }
2549                                 }
2550
2551                                 if (op->base.remap.next) {
2552                                         struct xe_vma *vma;
2553                                         bool read_only =
2554                                                 op->base.remap.unmap->va->flags &
2555                                                 XE_VMA_READ_ONLY;
2556
2557                                         bool is_null =
2558                                                 op->base.remap.unmap->va->flags &
2559                                                 DRM_GPUVA_SPARSE;
2560
2561                                         vma = new_vma(vm, op->base.remap.next,
2562                                                       op->tile_mask, read_only,
2563                                                       is_null);
2564                                         if (IS_ERR(vma)) {
2565                                                 err = PTR_ERR(vma);
2566                                                 goto free_fence;
2567                                         }
2568
2569                                         op->remap.next = vma;
2570
2571                                         /*
2572                                          * Userptr creates a new SG mapping so
2573                                          * we must also rebind.
2574                                          */
2575                                         op->remap.skip_next = !xe_vma_is_userptr(old) &&
2576                                                 IS_ALIGNED(xe_vma_start(vma),
2577                                                            xe_vma_max_pte_size(old));
2578                                         if (op->remap.skip_next) {
2579                                                 xe_vma_set_pte_size(vma, xe_vma_max_pte_size(old));
2580                                                 op->remap.range -=
2581                                                         xe_vma_end(old) -
2582                                                         xe_vma_start(vma);
2583                                         }
2584                                 }
2585                                 break;
2586                         }
2587                         case DRM_GPUVA_OP_UNMAP:
2588                         case DRM_GPUVA_OP_PREFETCH:
2589                                 /* Nothing to do */
2590                                 break;
2591                         default:
2592                                 XE_WARN_ON("NOT POSSIBLE");
2593                         }
2594
2595                         last_op = op;
2596                 }
2597
2598                 last_op->ops = __ops;
2599         }
2600
2601         if (!last_op)
2602                 return -ENODATA;
2603
2604         last_op->flags |= XE_VMA_OP_LAST;
2605         last_op->num_syncs = num_syncs;
2606         last_op->syncs = syncs;
2607         last_op->fence = fence;
2608
2609         return 0;
2610
2611 free_fence:
2612         kfree(fence);
2613         return err;
2614 }
2615
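     /*
      * Commit a parsed operation into the VM's VMA tree. The *_COMMITTED flags
      * record how far the commit got so that xe_vma_op_unwind() knows exactly
      * what to undo on error.
      */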
2616 static int xe_vma_op_commit(struct xe_vm *vm, struct xe_vma_op *op)
2617 {
2618         int err = 0;
2619
2620         lockdep_assert_held_write(&vm->lock);
2621
2622         switch (op->base.op) {
2623         case DRM_GPUVA_OP_MAP:
2624                 err |= xe_vm_insert_vma(vm, op->map.vma);
2625                 if (!err)
2626                         op->flags |= XE_VMA_OP_COMMITTED;
2627                 break;
2628         case DRM_GPUVA_OP_REMAP:
2629                 prep_vma_destroy(vm, gpuva_to_vma(op->base.remap.unmap->va),
2630                                  true);
2631                 op->flags |= XE_VMA_OP_COMMITTED;
2632
2633                 if (op->remap.prev) {
2634                         err |= xe_vm_insert_vma(vm, op->remap.prev);
2635                         if (!err)
2636                                 op->flags |= XE_VMA_OP_PREV_COMMITTED;
2637                         if (!err && op->remap.skip_prev)
2638                                 op->remap.prev = NULL;
2639                 }
2640                 if (op->remap.next) {
2641                         err |= xe_vm_insert_vma(vm, op->remap.next);
2642                         if (!err)
2643                                 op->flags |= XE_VMA_OP_NEXT_COMMITTED;
2644                         if (!err && op->remap.skip_next)
2645                                 op->remap.next = NULL;
2646                 }
2647
2648                 /* Adjust for partial unbind after removing VMA from VM */
2649                 if (!err) {
2650                         op->base.remap.unmap->va->va.addr = op->remap.start;
2651                         op->base.remap.unmap->va->va.range = op->remap.range;
2652                 }
2653                 break;
2654         case DRM_GPUVA_OP_UNMAP:
2655                 prep_vma_destroy(vm, gpuva_to_vma(op->base.unmap.va), true);
2656                 op->flags |= XE_VMA_OP_COMMITTED;
2657                 break;
2658         case DRM_GPUVA_OP_PREFETCH:
2659                 op->flags |= XE_VMA_OP_COMMITTED;
2660                 break;
2661         default:
2662                 XE_WARN_ON("NOT POSSIBLE");
2663         }
2664
2665         return err;
2666 }
2667
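     /*
      * Execute a single committed operation: reserve the VM (and, for external
      * BOs, the BO itself) with ttm_eu_reserve_buffers(), then issue the
      * bind/unbind/prefetch. A userptr -EAGAIN triggers a repin and a retry of
      * the reservation.
      */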
2668 static int __xe_vma_op_execute(struct xe_vm *vm, struct xe_vma *vma,
2669                                struct xe_vma_op *op)
2670 {
2671         LIST_HEAD(objs);
2672         LIST_HEAD(dups);
2673         struct ttm_validate_buffer tv_bo, tv_vm;
2674         struct ww_acquire_ctx ww;
2675         struct xe_bo *vbo;
2676         int err;
2677
2678         lockdep_assert_held_write(&vm->lock);
2679
2680         xe_vm_tv_populate(vm, &tv_vm);
2681         list_add_tail(&tv_vm.head, &objs);
2682         vbo = xe_vma_bo(vma);
2683         if (vbo) {
2684                 /*
2685                  * An unbind can drop the last reference to the BO, and the
2686                  * BO is needed for ttm_eu_backoff_reservation(), so take a
2687                  * reference here.
2688                  */
2689                 xe_bo_get(vbo);
2690
2691                 if (!vbo->vm) {
2692                         tv_bo.bo = &vbo->ttm;
2693                         tv_bo.num_shared = 1;
2694                         list_add(&tv_bo.head, &objs);
2695                 }
2696         }
2697
2698 again:
2699         err = ttm_eu_reserve_buffers(&ww, &objs, true, &dups);
2700         if (err) {
2701                 xe_bo_put(vbo);
2702                 return err;
2703         }
2704
2705         xe_vm_assert_held(vm);
2706         xe_bo_assert_held(xe_vma_bo(vma));
2707
2708         switch (op->base.op) {
2709         case DRM_GPUVA_OP_MAP:
2710                 err = xe_vm_bind(vm, vma, op->q, xe_vma_bo(vma),
2711                                  op->syncs, op->num_syncs, op->fence,
2712                                  op->map.immediate || !xe_vm_in_fault_mode(vm),
2713                                  op->flags & XE_VMA_OP_FIRST,
2714                                  op->flags & XE_VMA_OP_LAST);
2715                 break;
2716         case DRM_GPUVA_OP_REMAP:
2717         {
2718                 bool prev = !!op->remap.prev;
2719                 bool next = !!op->remap.next;
2720
2721                 if (!op->remap.unmap_done) {
2722                         if (prev || next) {
2723                                 vm->async_ops.munmap_rebind_inflight = true;
2724                                 vma->gpuva.flags |= XE_VMA_FIRST_REBIND;
2725                         }
2726                         err = xe_vm_unbind(vm, vma, op->q, op->syncs,
2727                                            op->num_syncs,
2728                                            !prev && !next ? op->fence : NULL,
2729                                            op->flags & XE_VMA_OP_FIRST,
2730                                            op->flags & XE_VMA_OP_LAST && !prev &&
2731                                            !next);
2732                         if (err)
2733                                 break;
2734                         op->remap.unmap_done = true;
2735                 }
2736
2737                 if (prev) {
2738                         op->remap.prev->gpuva.flags |= XE_VMA_LAST_REBIND;
2739                         err = xe_vm_bind(vm, op->remap.prev, op->q,
2740                                          xe_vma_bo(op->remap.prev), op->syncs,
2741                                          op->num_syncs,
2742                                          !next ? op->fence : NULL, true, false,
2743                                          op->flags & XE_VMA_OP_LAST && !next);
2744                         op->remap.prev->gpuva.flags &= ~XE_VMA_LAST_REBIND;
2745                         if (err)
2746                                 break;
2747                         op->remap.prev = NULL;
2748                 }
2749
2750                 if (next) {
2751                         op->remap.next->gpuva.flags |= XE_VMA_LAST_REBIND;
2752                         err = xe_vm_bind(vm, op->remap.next, op->q,
2753                                          xe_vma_bo(op->remap.next),
2754                                          op->syncs, op->num_syncs,
2755                                          op->fence, true, false,
2756                                          op->flags & XE_VMA_OP_LAST);
2757                         op->remap.next->gpuva.flags &= ~XE_VMA_LAST_REBIND;
2758                         if (err)
2759                                 break;
2760                         op->remap.next = NULL;
2761                 }
2762                 vm->async_ops.munmap_rebind_inflight = false;
2763
2764                 break;
2765         }
2766         case DRM_GPUVA_OP_UNMAP:
2767                 err = xe_vm_unbind(vm, vma, op->q, op->syncs,
2768                                    op->num_syncs, op->fence,
2769                                    op->flags & XE_VMA_OP_FIRST,
2770                                    op->flags & XE_VMA_OP_LAST);
2771                 break;
2772         case DRM_GPUVA_OP_PREFETCH:
2773                 err = xe_vm_prefetch(vm, vma, op->q, op->prefetch.region,
2774                                      op->syncs, op->num_syncs, op->fence,
2775                                      op->flags & XE_VMA_OP_FIRST,
2776                                      op->flags & XE_VMA_OP_LAST);
2777                 break;
2778         default:
2779                 XE_WARN_ON("NOT POSSIBLE");
2780         }
2781
2782         ttm_eu_backoff_reservation(&ww, &objs);
2783         if (err == -EAGAIN && xe_vma_is_userptr(vma)) {
2784                 lockdep_assert_held_write(&vm->lock);
2785                 err = xe_vma_userptr_pin_pages(vma);
2786                 if (!err)
2787                         goto again;
2788         }
2789         xe_bo_put(vbo);
2790
2791         if (err)
2792                 trace_xe_vma_fail(vma);
2793
2794         return err;
2795 }
2796
2797 static int xe_vma_op_execute(struct xe_vm *vm, struct xe_vma_op *op)
2798 {
2799         int ret = 0;
2800
2801         lockdep_assert_held_write(&vm->lock);
2802
2803 #ifdef TEST_VM_ASYNC_OPS_ERROR
2804         if (op->inject_error) {
2805                 op->inject_error = false;
2806                 return -ENOMEM;
2807         }
2808 #endif
2809
2810         switch (op->base.op) {
2811         case DRM_GPUVA_OP_MAP:
2812                 ret = __xe_vma_op_execute(vm, op->map.vma, op);
2813                 break;
2814         case DRM_GPUVA_OP_REMAP:
2815         {
2816                 struct xe_vma *vma;
2817
2818                 if (!op->remap.unmap_done)
2819                         vma = gpuva_to_vma(op->base.remap.unmap->va);
2820                 else if (op->remap.prev)
2821                         vma = op->remap.prev;
2822                 else
2823                         vma = op->remap.next;
2824
2825                 ret = __xe_vma_op_execute(vm, vma, op);
2826                 break;
2827         }
2828         case DRM_GPUVA_OP_UNMAP:
2829                 ret = __xe_vma_op_execute(vm, gpuva_to_vma(op->base.unmap.va),
2830                                           op);
2831                 break;
2832         case DRM_GPUVA_OP_PREFETCH:
2833                 ret = __xe_vma_op_execute(vm,
2834                                           gpuva_to_vma(op->base.prefetch.va),
2835                                           op);
2836                 break;
2837         default:
2838                 XE_WARN_ON("NOT POSSIBLE");
2839         }
2840
2841         return ret;
2842 }
2843
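     /*
      * Release the resources attached to an operation once it has run or been
      * flushed: on the last op the syncs, exec queue and async fence
      * references; plus removal from the pending list and freeing of the GPUVA
      * ops the operation belongs to.
      */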
2844 static void xe_vma_op_cleanup(struct xe_vm *vm, struct xe_vma_op *op)
2845 {
2846         bool last = op->flags & XE_VMA_OP_LAST;
2847
2848         if (last) {
2849                 while (op->num_syncs--)
2850                         xe_sync_entry_cleanup(&op->syncs[op->num_syncs]);
2851                 kfree(op->syncs);
2852                 if (op->q)
2853                         xe_exec_queue_put(op->q);
2854                 if (op->fence)
2855                         dma_fence_put(&op->fence->fence);
2856         }
2857         if (!list_empty(&op->link)) {
2858                 spin_lock_irq(&vm->async_ops.lock);
2859                 list_del(&op->link);
2860                 spin_unlock_irq(&vm->async_ops.lock);
2861         }
2862         if (op->ops)
2863                 drm_gpuva_ops_free(&vm->gpuvm, op->ops);
2864         if (last)
2865                 xe_vm_put(vm);
2866 }
2867
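     /*
      * Reverse what xe_vma_op_commit() did for an operation when the bind must
      * be unwound: destroy VMAs created for map/remap, clear the DESTROYED
      * flag on VMAs that were about to be unmapped and reinsert them in the VM
      * if they had already been removed.
      */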
2868 static void xe_vma_op_unwind(struct xe_vm *vm, struct xe_vma_op *op,
2869                              bool post_commit, bool prev_post_commit,
2870                              bool next_post_commit)
2871 {
2872         lockdep_assert_held_write(&vm->lock);
2873
2874         switch (op->base.op) {
2875         case DRM_GPUVA_OP_MAP:
2876                 if (op->map.vma) {
2877                         prep_vma_destroy(vm, op->map.vma, post_commit);
2878                         xe_vma_destroy_unlocked(op->map.vma);
2879                 }
2880                 break;
2881         case DRM_GPUVA_OP_UNMAP:
2882         {
2883                 struct xe_vma *vma = gpuva_to_vma(op->base.unmap.va);
2884
2885                 down_read(&vm->userptr.notifier_lock);
2886                 vma->gpuva.flags &= ~XE_VMA_DESTROYED;
2887                 up_read(&vm->userptr.notifier_lock);
2888                 if (post_commit)
2889                         xe_vm_insert_vma(vm, vma);
2890                 break;
2891         }
2892         case DRM_GPUVA_OP_REMAP:
2893         {
2894                 struct xe_vma *vma = gpuva_to_vma(op->base.remap.unmap->va);
2895
2896                 if (op->remap.prev) {
2897                         prep_vma_destroy(vm, op->remap.prev, prev_post_commit);
2898                         xe_vma_destroy_unlocked(op->remap.prev);
2899                 }
2900                 if (op->remap.next) {
2901                         prep_vma_destroy(vm, op->remap.next, next_post_commit);
2902                         xe_vma_destroy_unlocked(op->remap.next);
2903                 }
2904                 down_read(&vm->userptr.notifier_lock);
2905                 vma->gpuva.flags &= ~XE_VMA_DESTROYED;
2906                 up_read(&vm->userptr.notifier_lock);
2907                 if (post_commit)
2908                         xe_vm_insert_vma(vm, vma);
2909                 break;
2910         }
2911         case DRM_GPUVA_OP_PREFETCH:
2912                 /* Nothing to do */
2913                 break;
2914         default:
2915                 XE_WARN_ON("NOT POSSIBLE");
2916         }
2917 }
2918
2919 static struct xe_vma_op *next_vma_op(struct xe_vm *vm)
2920 {
2921         return list_first_entry_or_null(&vm->async_ops.pending,
2922                                         struct xe_vma_op, link);
2923 }
2924
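     /*
      * Worker that drains the VM's async op queue. Execution stops at the
      * first error while the VM is still open; once the VM is closed the
      * remaining ops are flushed by destroying their VMAs and signalling their
      * fences.
      */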
2925 static void xe_vma_op_work_func(struct work_struct *w)
2926 {
2927         struct xe_vm *vm = container_of(w, struct xe_vm, async_ops.work);
2928
2929         for (;;) {
2930                 struct xe_vma_op *op;
2931                 int err;
2932
2933                 if (vm->async_ops.error && !xe_vm_is_closed(vm))
2934                         break;
2935
2936                 spin_lock_irq(&vm->async_ops.lock);
2937                 op = next_vma_op(vm);
2938                 spin_unlock_irq(&vm->async_ops.lock);
2939
2940                 if (!op)
2941                         break;
2942
2943                 if (!xe_vm_is_closed(vm)) {
2944                         down_write(&vm->lock);
2945                         err = xe_vma_op_execute(vm, op);
2946                         if (err) {
2947                                 drm_warn(&vm->xe->drm,
2948                                          "Async VM op(%d) failed with %d",
2949                                          op->base.op, err);
2950                                 vm_set_async_error(vm, err);
2951                                 up_write(&vm->lock);
2952
2953                                 if (vm->async_ops.error_capture.addr)
2954                                         vm_error_capture(vm, err, 0, 0, 0);
2955                                 break;
2956                         }
2957                         up_write(&vm->lock);
2958                 } else {
2959                         struct xe_vma *vma;
2960
2961                         switch (op->base.op) {
2962                         case DRM_GPUVA_OP_REMAP:
2963                                 vma = gpuva_to_vma(op->base.remap.unmap->va);
2964                                 trace_xe_vma_flush(vma);
2965
2966                                 down_write(&vm->lock);
2967                                 xe_vma_destroy_unlocked(vma);
2968                                 up_write(&vm->lock);
2969                                 break;
2970                         case DRM_GPUVA_OP_UNMAP:
2971                                 vma = gpuva_to_vma(op->base.unmap.va);
2972                                 trace_xe_vma_flush(vma);
2973
2974                                 down_write(&vm->lock);
2975                                 xe_vma_destroy_unlocked(vma);
2976                                 up_write(&vm->lock);
2977                                 break;
2978                         default:
2979                                 /* Nothing to do */
2980                                 break;
2981                         }
2982
2983                         if (op->fence && !test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
2984                                                    &op->fence->fence.flags)) {
2985                                 if (!xe_vm_no_dma_fences(vm)) {
2986                                         op->fence->started = true;
2987                                         wake_up_all(&op->fence->wq);
2988                                 }
2989                                 dma_fence_signal(&op->fence->fence);
2990                         }
2991                 }
2992
2993                 xe_vma_op_cleanup(vm, op);
2994         }
2995 }
2996
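/*
 * Commit all ops to the VM state, then either execute the final op
 * synchronously or splice the list onto the async worker's pending queue,
 * installing the final op's fence in the out-syncs (or signalling it
 * immediately if no sync consumes it). A commit failure unwinds the
 * already-committed ops in reverse order.
 */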
2997 static int vm_bind_ioctl_ops_commit(struct xe_vm *vm,
2998                                     struct list_head *ops_list, bool async)
2999 {
3000         struct xe_vma_op *op, *last_op, *next;
3001         int err;
3002
3003         lockdep_assert_held_write(&vm->lock);
3004
3005         list_for_each_entry(op, ops_list, link) {
3006                 last_op = op;
3007                 err = xe_vma_op_commit(vm, op);
3008                 if (err)
3009                         goto unwind;
3010         }
3011
3012         if (!async) {
3013                 err = xe_vma_op_execute(vm, last_op);
3014                 if (err)
3015                         goto unwind;
3016                 xe_vma_op_cleanup(vm, last_op);
3017         } else {
3018                 int i;
3019                 bool installed = false;
3020
3021                 for (i = 0; i < last_op->num_syncs; i++)
3022                         installed |= xe_sync_entry_signal(&last_op->syncs[i],
3023                                                           NULL,
3024                                                           &last_op->fence->fence);
3025                 if (!installed && last_op->fence)
3026                         dma_fence_signal(&last_op->fence->fence);
3027
3028                 spin_lock_irq(&vm->async_ops.lock);
3029                 list_splice_tail(ops_list, &vm->async_ops.pending);
3030                 spin_unlock_irq(&vm->async_ops.lock);
3031
3032                 if (!vm->async_ops.error)
3033                         queue_work(system_unbound_wq, &vm->async_ops.work);
3034         }
3035
3036         return 0;
3037
3038 unwind:
3039         list_for_each_entry_reverse(op, ops_list, link)
3040                 xe_vma_op_unwind(vm, op, op->flags & XE_VMA_OP_COMMITTED,
3041                                  op->flags & XE_VMA_OP_PREV_COMMITTED,
3042                                  op->flags & XE_VMA_OP_NEXT_COMMITTED);
3043         list_for_each_entry_safe(op, next, ops_list, link)
3044                 xe_vma_op_cleanup(vm, op);
3045
3046         return err;
3047 }
3048
3049 /*
3050  * Unwind the GPUVA operations lists; called after a failure of
3051  * vm_bind_ioctl_ops_create or vm_bind_ioctl_ops_parse.
3052  */
3053 static void vm_bind_ioctl_ops_unwind(struct xe_vm *vm,
3054                                      struct drm_gpuva_ops **ops,
3055                                      int num_ops_list)
3056 {
3057         int i;
3058
3059         for (i = 0; i < num_ops_list; ++i) {
3060                 struct drm_gpuva_ops *__ops = ops[i];
3061                 struct drm_gpuva_op *__op;
3062
3063                 if (!__ops)
3064                         continue;
3065
3066                 drm_gpuva_for_each_op(__op, __ops) {
3067                         struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
3068
3069                         xe_vma_op_unwind(vm, op, false, false, false);
3070                 }
3071         }
3072 }
3073
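/*
 * Bind op flags accepted from userspace; the low 16 bits are left unmasked
 * since they carry the opcode extracted via VM_BIND_OP(). With
 * TEST_VM_ASYNC_OPS_ERROR defined, FORCE_ASYNC_OP_ERROR is also accepted for
 * error injection.
 */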
3074 #ifdef TEST_VM_ASYNC_OPS_ERROR
3075 #define SUPPORTED_FLAGS \
3076         (FORCE_ASYNC_OP_ERROR | XE_VM_BIND_FLAG_ASYNC | \
3077          XE_VM_BIND_FLAG_READONLY | XE_VM_BIND_FLAG_IMMEDIATE | \
3078          XE_VM_BIND_FLAG_NULL | 0xffff)
3079 #else
3080 #define SUPPORTED_FLAGS \
3081         (XE_VM_BIND_FLAG_ASYNC | XE_VM_BIND_FLAG_READONLY | \
3082          XE_VM_BIND_FLAG_IMMEDIATE | XE_VM_BIND_FLAG_NULL | 0xffff)
3083 #endif
3084 #define XE_64K_PAGE_MASK 0xffffull
3085
3086 #define MAX_BINDS       512     /* FIXME: Picking an arbitrary upper limit */
3087
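/*
 * Validate the array of bind ops. With more than one bind the array is
 * copied from userspace into a kernel allocation that the caller must free;
 * with a single bind, args->bind is used in place.
 */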
3088 static int vm_bind_ioctl_check_args(struct xe_device *xe,
3089                                     struct drm_xe_vm_bind *args,
3090                                     struct drm_xe_vm_bind_op **bind_ops,
3091                                     bool *async)
3092 {
3093         int err;
3094         int i;
3095
3096         if (XE_IOCTL_DBG(xe, args->extensions) ||
3097             XE_IOCTL_DBG(xe, !args->num_binds) ||
3098             XE_IOCTL_DBG(xe, args->num_binds > MAX_BINDS))
3099                 return -EINVAL;
3100
3101         if (args->num_binds > 1) {
3102                 u64 __user *bind_user =
3103                         u64_to_user_ptr(args->vector_of_binds);
3104
3105                 *bind_ops = kmalloc_array(args->num_binds,
3106                                           sizeof(struct drm_xe_vm_bind_op), GFP_KERNEL);
3107                 if (!*bind_ops)
3108                         return -ENOMEM;
3109
3110                 err = copy_from_user(*bind_ops, bind_user,
3111                                      sizeof(struct drm_xe_vm_bind_op) *
3112                                      args->num_binds);
3113                 if (XE_IOCTL_DBG(xe, err)) {
3114                         err = -EFAULT;
3115                         goto free_bind_ops;
3116                 }
3117         } else {
3118                 *bind_ops = &args->bind;
3119         }
3120
3121         for (i = 0; i < args->num_binds; ++i) {
3122                 u64 range = (*bind_ops)[i].range;
3123                 u64 addr = (*bind_ops)[i].addr;
3124                 u32 op = (*bind_ops)[i].op;
3125                 u32 obj = (*bind_ops)[i].obj;
3126                 u64 obj_offset = (*bind_ops)[i].obj_offset;
3127                 u32 region = (*bind_ops)[i].region;
3128                 bool is_null = op & XE_VM_BIND_FLAG_NULL;
3129
3130                 if (i == 0) {
3131                         *async = !!(op & XE_VM_BIND_FLAG_ASYNC);
3132                 } else if (XE_IOCTL_DBG(xe, !*async) ||
3133                            XE_IOCTL_DBG(xe, !(op & XE_VM_BIND_FLAG_ASYNC)) ||
3134                            XE_IOCTL_DBG(xe, VM_BIND_OP(op) ==
3135                                         XE_VM_BIND_OP_RESTART)) {
3136                         err = -EINVAL;
3137                         goto free_bind_ops;
3138                 }
3139
3140                 if (XE_IOCTL_DBG(xe, !*async &&
3141                                  VM_BIND_OP(op) == XE_VM_BIND_OP_UNMAP_ALL)) {
3142                         err = -EINVAL;
3143                         goto free_bind_ops;
3144                 }
3145
3146                 if (XE_IOCTL_DBG(xe, !*async &&
3147                                  VM_BIND_OP(op) == XE_VM_BIND_OP_PREFETCH)) {
3148                         err = -EINVAL;
3149                         goto free_bind_ops;
3150                 }
3151
3152                 if (XE_IOCTL_DBG(xe, VM_BIND_OP(op) >
3153                                  XE_VM_BIND_OP_PREFETCH) ||
3154                     XE_IOCTL_DBG(xe, op & ~SUPPORTED_FLAGS) ||
3155                     XE_IOCTL_DBG(xe, obj && is_null) ||
3156                     XE_IOCTL_DBG(xe, obj_offset && is_null) ||
3157                     XE_IOCTL_DBG(xe, VM_BIND_OP(op) != XE_VM_BIND_OP_MAP &&
3158                                  is_null) ||
3159                     XE_IOCTL_DBG(xe, !obj &&
3160                                  VM_BIND_OP(op) == XE_VM_BIND_OP_MAP &&
3161                                  !is_null) ||
3162                     XE_IOCTL_DBG(xe, !obj &&
3163                                  VM_BIND_OP(op) == XE_VM_BIND_OP_UNMAP_ALL) ||
3164                     XE_IOCTL_DBG(xe, addr &&
3165                                  VM_BIND_OP(op) == XE_VM_BIND_OP_UNMAP_ALL) ||
3166                     XE_IOCTL_DBG(xe, range &&
3167                                  VM_BIND_OP(op) == XE_VM_BIND_OP_UNMAP_ALL) ||
3168                     XE_IOCTL_DBG(xe, obj &&
3169                                  VM_BIND_OP(op) == XE_VM_BIND_OP_MAP_USERPTR) ||
3170                     XE_IOCTL_DBG(xe, obj &&
3171                                  VM_BIND_OP(op) == XE_VM_BIND_OP_PREFETCH) ||
3172                     XE_IOCTL_DBG(xe, region &&
3173                                  VM_BIND_OP(op) != XE_VM_BIND_OP_PREFETCH) ||
3174                     XE_IOCTL_DBG(xe, !(BIT(region) &
3175                                        xe->info.mem_region_mask)) ||
3176                     XE_IOCTL_DBG(xe, obj &&
3177                                  VM_BIND_OP(op) == XE_VM_BIND_OP_UNMAP)) {
3178                         err = -EINVAL;
3179                         goto free_bind_ops;
3180                 }
3181
3182                 if (XE_IOCTL_DBG(xe, obj_offset & ~PAGE_MASK) ||
3183                     XE_IOCTL_DBG(xe, addr & ~PAGE_MASK) ||
3184                     XE_IOCTL_DBG(xe, range & ~PAGE_MASK) ||
3185                     XE_IOCTL_DBG(xe, !range && VM_BIND_OP(op) !=
3186                                  XE_VM_BIND_OP_RESTART &&
3187                                  VM_BIND_OP(op) != XE_VM_BIND_OP_UNMAP_ALL)) {
3188                         err = -EINVAL;
3189                         goto free_bind_ops;
3190                 }
3191         }
3192
3193         return 0;
3194
3195 free_bind_ops:
3196         if (args->num_binds > 1)
3197                 kfree(*bind_ops);
3198         return err;
3199 }
3200
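/*
 * VM_BIND ioctl entry point: validate the bind ops, look up the exec queue
 * and VM, range-check every bind against the VM and its BO, parse the syncs,
 * then build, parse and commit the GPUVA operation lists either synchronously
 * or through the async worker.
 */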
3201 int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
3202 {
3203         struct xe_device *xe = to_xe_device(dev);
3204         struct xe_file *xef = to_xe_file(file);
3205         struct drm_xe_vm_bind *args = data;
3206         struct drm_xe_sync __user *syncs_user;
3207         struct xe_bo **bos = NULL;
3208         struct drm_gpuva_ops **ops = NULL;
3209         struct xe_vm *vm;
3210         struct xe_exec_queue *q = NULL;
3211         u32 num_syncs;
3212         struct xe_sync_entry *syncs = NULL;
3213         struct drm_xe_vm_bind_op *bind_ops;
3214         LIST_HEAD(ops_list);
3215         bool async;
3216         int err;
3217         int i;
3218
3219         err = vm_bind_ioctl_check_args(xe, args, &bind_ops, &async);
3220         if (err)
3221                 return err;
3222
3223         if (args->exec_queue_id) {
3224                 q = xe_exec_queue_lookup(xef, args->exec_queue_id);
3225                 if (XE_IOCTL_DBG(xe, !q)) {
3226                         err = -ENOENT;
3227                         goto free_objs;
3228                 }
3229
3230                 if (XE_IOCTL_DBG(xe, !(q->flags & EXEC_QUEUE_FLAG_VM))) {
3231                         err = -EINVAL;
3232                         goto put_exec_queue;
3233                 }
3234         }
3235
3236         vm = xe_vm_lookup(xef, args->vm_id);
3237         if (XE_IOCTL_DBG(xe, !vm)) {
3238                 err = -EINVAL;
3239                 goto put_exec_queue;
3240         }
3241
3242         err = down_write_killable(&vm->lock);
3243         if (err)
3244                 goto put_vm;
3245
3246         if (XE_IOCTL_DBG(xe, xe_vm_is_closed_or_banned(vm))) {
3247                 err = -ENOENT;
3248                 goto release_vm_lock;
3249         }
3250
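        /*
         * A RESTART op resumes async bind processing after a reported error:
         * only valid on an async-bind VM, with no syncs and a pending error.
         * It clears the error and re-queues the worker (plus the rebind
         * worker for compute-mode VMs).
         */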
3251         if (VM_BIND_OP(bind_ops[0].op) == XE_VM_BIND_OP_RESTART) {
3252                 if (XE_IOCTL_DBG(xe, !(vm->flags & XE_VM_FLAG_ASYNC_BIND_OPS)))
3253                         err = -EOPNOTSUPP;
3254                 if (XE_IOCTL_DBG(xe, !err && args->num_syncs))
3255                         err = -EINVAL;
3256                 if (XE_IOCTL_DBG(xe, !err && !vm->async_ops.error))
3257                         err = -EPROTO;
3258
3259                 if (!err) {
3260                         trace_xe_vm_restart(vm);
3261                         vm_set_async_error(vm, 0);
3262
3263                         queue_work(system_unbound_wq, &vm->async_ops.work);
3264
3265                         /* Rebinds may have been blocked, give worker a kick */
3266                         if (xe_vm_in_compute_mode(vm))
3267                                 xe_vm_queue_rebind_worker(vm);
3268                 }
3269
3270                 goto release_vm_lock;
3271         }
3272
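        /*
         * Unless an async error is pending, the bind's sync/async mode must
         * match the mode the VM was created with.
         */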
3273         if (XE_IOCTL_DBG(xe, !vm->async_ops.error &&
3274                          async != !!(vm->flags & XE_VM_FLAG_ASYNC_BIND_OPS))) {
3275                 err = -EOPNOTSUPP;
3276                 goto release_vm_lock;
3277         }
3278
3279         for (i = 0; i < args->num_binds; ++i) {
3280                 u64 range = bind_ops[i].range;
3281                 u64 addr = bind_ops[i].addr;
3282
3283                 if (XE_IOCTL_DBG(xe, range > vm->size) ||
3284                     XE_IOCTL_DBG(xe, addr > vm->size - range)) {
3285                         err = -EINVAL;
3286                         goto release_vm_lock;
3287                 }
3288
3289                 if (bind_ops[i].tile_mask) {
3290                         u64 valid_tiles = BIT(xe->info.tile_count) - 1;
3291
3292                         if (XE_IOCTL_DBG(xe, bind_ops[i].tile_mask &
3293                                          ~valid_tiles)) {
3294                                 err = -EINVAL;
3295                                 goto release_vm_lock;
3296                         }
3297                 }
3298         }
3299
3300         bos = kcalloc(args->num_binds, sizeof(*bos), GFP_KERNEL);
3301         if (!bos) {
3302                 err = -ENOMEM;
3303                 goto release_vm_lock;
3304         }
3305
3306         ops = kcalloc(args->num_binds, sizeof(*ops), GFP_KERNEL);
3307         if (!ops) {
3308                 err = -ENOMEM;
3309                 goto release_vm_lock;
3310         }
3311
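        /* Resolve the GEM object backing each bind and range-check it */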
3312         for (i = 0; i < args->num_binds; ++i) {
3313                 struct drm_gem_object *gem_obj;
3314                 u64 range = bind_ops[i].range;
3315                 u64 addr = bind_ops[i].addr;
3316                 u32 obj = bind_ops[i].obj;
3317                 u64 obj_offset = bind_ops[i].obj_offset;
3318
3319                 if (!obj)
3320                         continue;
3321
3322                 gem_obj = drm_gem_object_lookup(file, obj);
3323                 if (XE_IOCTL_DBG(xe, !gem_obj)) {
3324                         err = -ENOENT;
3325                         goto put_obj;
3326                 }
3327                 bos[i] = gem_to_xe_bo(gem_obj);
3328
3329                 if (XE_IOCTL_DBG(xe, range > bos[i]->size) ||
3330                     XE_IOCTL_DBG(xe, obj_offset >
3331                                  bos[i]->size - range)) {
3332                         err = -EINVAL;
3333                         goto put_obj;
3334                 }
3335
3336                 if (bos[i]->flags & XE_BO_INTERNAL_64K) {
3337                         if (XE_IOCTL_DBG(xe, obj_offset &
3338                                          XE_64K_PAGE_MASK) ||
3339                             XE_IOCTL_DBG(xe, addr & XE_64K_PAGE_MASK) ||
3340                             XE_IOCTL_DBG(xe, range & XE_64K_PAGE_MASK)) {
3341                                 err = -EINVAL;
3342                                 goto put_obj;
3343                         }
3344                 }
3345         }
3346
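        /* Parse the user-supplied in/out syncs */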
3347         if (args->num_syncs) {
3348                 syncs = kcalloc(args->num_syncs, sizeof(*syncs), GFP_KERNEL);
3349                 if (!syncs) {
3350                         err = -ENOMEM;
3351                         goto put_obj;
3352                 }
3353         }
3354
3355         syncs_user = u64_to_user_ptr(args->syncs);
3356         for (num_syncs = 0; num_syncs < args->num_syncs; num_syncs++) {
3357                 err = xe_sync_entry_parse(xe, xef, &syncs[num_syncs],
3358                                           &syncs_user[num_syncs], false,
3359                                           xe_vm_no_dma_fences(vm));
3360                 if (err)
3361                         goto free_syncs;
3362         }
3363
3364         /* Do some error checking first to make the unwind easier */
3365         for (i = 0; i < args->num_binds; ++i) {
3366                 u64 range = bind_ops[i].range;
3367                 u64 addr = bind_ops[i].addr;
3368                 u32 op = bind_ops[i].op;
3369
3370                 err = vm_bind_ioctl_lookup_vma(vm, bos[i], addr, range, op);
3371                 if (err)
3372                         goto free_syncs;
3373         }
3374
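        /* Build a GPUVA operations list for each bind */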
3375         for (i = 0; i < args->num_binds; ++i) {
3376                 u64 range = bind_ops[i].range;
3377                 u64 addr = bind_ops[i].addr;
3378                 u32 op = bind_ops[i].op;
3379                 u64 obj_offset = bind_ops[i].obj_offset;
3380                 u8 tile_mask = bind_ops[i].tile_mask;
3381                 u32 region = bind_ops[i].region;
3382
3383                 ops[i] = vm_bind_ioctl_ops_create(vm, bos[i], obj_offset,
3384                                                   addr, range, op, tile_mask,
3385                                                   region);
3386                 if (IS_ERR(ops[i])) {
3387                         err = PTR_ERR(ops[i]);
3388                         ops[i] = NULL;
3389                         goto unwind_ops;
3390                 }
3391         }
3392
3393         err = vm_bind_ioctl_ops_parse(vm, q, ops, args->num_binds,
3394                                       syncs, num_syncs, &ops_list, async);
3395         if (err)
3396                 goto unwind_ops;
3397
3398         err = vm_bind_ioctl_ops_commit(vm, &ops_list, async);
3399         up_write(&vm->lock);
3400
3401         for (i = 0; i < args->num_binds; ++i)
3402                 xe_bo_put(bos[i]);
3403
3404         kfree(bos);
3405         kfree(ops);
3406         if (args->num_binds > 1)
3407                 kfree(bind_ops);
3408
3409         return err;
3410
3411 unwind_ops:
3412         vm_bind_ioctl_ops_unwind(vm, ops, args->num_binds);
3413 free_syncs:
3414         for (i = 0; err == -ENODATA && i < num_syncs; i++)
3415                 xe_sync_entry_signal(&syncs[i], NULL, dma_fence_get_stub());
3416         while (num_syncs--)
3417                 xe_sync_entry_cleanup(&syncs[num_syncs]);
3418
3419         kfree(syncs);
3420 put_obj:
3421         for (i = 0; i < args->num_binds; ++i)
3422                 xe_bo_put(bos[i]);
3423 release_vm_lock:
3424         up_write(&vm->lock);
3425 put_vm:
3426         xe_vm_put(vm);
3427 put_exec_queue:
3428         if (q)
3429                 xe_exec_queue_put(q);
3430 free_objs:
3431         kfree(bos);
3432         kfree(ops);
3433         if (args->num_binds > 1)
3434                 kfree(bind_ops);
3435         return err == -ENODATA ? 0 : err;
3436 }
3437
3438 /*
3439  * XXX: Using the TTM wrappers for now, likely can call into dma-resv code
3440  * directly to optimize. Also this likely should be an inline function.
3441  */
3442 int xe_vm_lock(struct xe_vm *vm, struct ww_acquire_ctx *ww,
3443                int num_resv, bool intr)
3444 {
3445         struct ttm_validate_buffer tv_vm;
3446         LIST_HEAD(objs);
3447         LIST_HEAD(dups);
3448
3449         XE_WARN_ON(!ww);
3450
3451         tv_vm.num_shared = num_resv;
3452         tv_vm.bo = xe_vm_ttm_bo(vm);
3453         list_add_tail(&tv_vm.head, &objs);
3454
3455         return ttm_eu_reserve_buffers(ww, &objs, intr, &dups);
3456 }
3457
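/* Drop the VM's dma-resv lock and finish the ww acquire context. */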
3458 void xe_vm_unlock(struct xe_vm *vm, struct ww_acquire_ctx *ww)
3459 {
3460         dma_resv_unlock(xe_vm_resv(vm));
3461         ww_acquire_fini(ww);
3462 }
3463
3464 /**
3465  * xe_vm_invalidate_vma - invalidate GPU mappings for VMA without a lock
3466  * @vma: VMA to invalidate
3467  *
3468  * Walks the list of page table leaves, zeroing the entries owned by this
3469  * VMA, invalidates the TLBs and blocks until the TLB invalidation has
3470  * completed.
3471  *
3472  * Return: 0 on success, negative error code otherwise.
3473  */
3474 int xe_vm_invalidate_vma(struct xe_vma *vma)
3475 {
3476         struct xe_device *xe = xe_vma_vm(vma)->xe;
3477         struct xe_tile *tile;
3478         u32 tile_needs_invalidate = 0;
3479         int seqno[XE_MAX_TILES_PER_DEVICE];
3480         u8 id;
3481         int ret;
3482
3483         XE_WARN_ON(!xe_vm_in_fault_mode(xe_vma_vm(vma)));
3484         XE_WARN_ON(xe_vma_is_null(vma));
3485         trace_xe_vma_usm_invalidate(vma);
3486
3487         /* Check that we don't race with page-table updates */
3488         if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
3489                 if (xe_vma_is_userptr(vma)) {
3490                         WARN_ON_ONCE(!mmu_interval_check_retry
3491                                      (&vma->userptr.notifier,
3492                                       vma->userptr.notifier_seq));
3493                         WARN_ON_ONCE(!dma_resv_test_signaled(xe_vm_resv(xe_vma_vm(vma)),
3494                                                              DMA_RESV_USAGE_BOOKKEEP));
3495
3496                 } else {
3497                         xe_bo_assert_held(xe_vma_bo(vma));
3498                 }
3499         }
3500
3501         for_each_tile(tile, xe, id) {
3502                 if (xe_pt_zap_ptes(tile, vma)) {
3503                         tile_needs_invalidate |= BIT(id);
3504                         xe_device_wmb(xe);
3505                         /*
3506                          * FIXME: We potentially need to invalidate multiple
3507                          * GTs within the tile
3508                          */
3509                         seqno[id] = xe_gt_tlb_invalidation_vma(tile->primary_gt, NULL, vma);
3510                         if (seqno[id] < 0)
3511                                 return seqno[id];
3512                 }
3513         }
3514
3515         for_each_tile(tile, xe, id) {
3516                 if (tile_needs_invalidate & BIT(id)) {
3517                         ret = xe_gt_tlb_invalidation_wait(tile->primary_gt, seqno[id]);
3518                         if (ret < 0)
3519                                 return ret;
3520                 }
3521         }
3522
3523         vma->usm.tile_invalidated = vma->tile_mask;
3524
3525         return 0;
3526 }
3527
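/*
 * Debug helper: print the VM's page-table root and, for each VMA, its range,
 * size and backing address, tagged as NULL, userptr (USR), VRAM or SYS.
 */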
3528 int xe_analyze_vm(struct drm_printer *p, struct xe_vm *vm, int gt_id)
3529 {
3530         struct drm_gpuva *gpuva;
3531         bool is_vram;
3532         u64 addr;
3533
3534         if (!down_read_trylock(&vm->lock)) {
3535                 drm_printf(p, " Failed to acquire VM lock to dump capture\n");
3536                 return 0;
3537         }
3538         if (vm->pt_root[gt_id]) {
3539                 addr = xe_bo_addr(vm->pt_root[gt_id]->bo, 0, XE_PAGE_SIZE);
3540                 is_vram = xe_bo_is_vram(vm->pt_root[gt_id]->bo);
3541                 drm_printf(p, " VM root: A:0x%llx %s\n", addr,
3542                            is_vram ? "VRAM" : "SYS");
3543         }
3544
3545         drm_gpuvm_for_each_va(gpuva, &vm->gpuvm) {
3546                 struct xe_vma *vma = gpuva_to_vma(gpuva);
3547                 bool is_userptr = xe_vma_is_userptr(vma);
3548                 bool is_null = xe_vma_is_null(vma);
3549
3550                 if (is_null) {
3551                         addr = 0;
3552                 } else if (is_userptr) {
3553                         struct xe_res_cursor cur;
3554
3555                         if (vma->userptr.sg) {
3556                                 xe_res_first_sg(vma->userptr.sg, 0, XE_PAGE_SIZE,
3557                                                 &cur);
3558                                 addr = xe_res_dma(&cur);
3559                         } else {
3560                                 addr = 0;
3561                         }
3562                 } else {
3563                         addr = __xe_bo_addr(xe_vma_bo(vma), 0, XE_PAGE_SIZE);
3564                         is_vram = xe_bo_is_vram(xe_vma_bo(vma));
3565                 }
3566                 drm_printf(p, " [%016llx-%016llx] S:0x%016llx A:%016llx %s\n",
3567                            xe_vma_start(vma), xe_vma_end(vma) - 1,
3568                            xe_vma_size(vma),
3569                            addr, is_null ? "NULL" : is_userptr ? "USR" :
3570                            is_vram ? "VRAM" : "SYS");
3571         }
3572         up_read(&vm->lock);
3573
3574         return 0;
3575 }