1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2021 Intel Corporation
4  */
5
6 #include "xe_vm.h"
7
8 #include <linux/dma-fence-array.h>
9
10 #include <drm/drm_print.h>
11 #include <drm/ttm/ttm_execbuf_util.h>
12 #include <drm/ttm/ttm_tt.h>
13 #include <drm/xe_drm.h>
14 #include <linux/delay.h>
15 #include <linux/kthread.h>
16 #include <linux/mm.h>
17 #include <linux/swap.h>
18
19 #include "xe_bo.h"
20 #include "xe_device.h"
21 #include "xe_exec_queue.h"
22 #include "xe_gt.h"
23 #include "xe_gt_pagefault.h"
24 #include "xe_gt_tlb_invalidation.h"
25 #include "xe_migrate.h"
26 #include "xe_pm.h"
27 #include "xe_preempt_fence.h"
28 #include "xe_pt.h"
29 #include "xe_res_cursor.h"
30 #include "xe_sync.h"
31 #include "xe_trace.h"
32 #include "generated/xe_wa_oob.h"
33 #include "xe_wa.h"
34
35 #define TEST_VM_ASYNC_OPS_ERROR
36
37 /**
38  * xe_vma_userptr_check_repin() - Advisory check for repin needed
39  * @vma: The userptr vma
40  *
41  * Check if the userptr vma has been invalidated since last successful
42  * repin. The check is advisory only and the function can be called
43  * without the vm->userptr.notifier_lock held. There is no guarantee that the
44  * vma userptr will remain valid after a lockless check, so typically
45  * the call needs to be followed by a proper check under the notifier_lock.
46  *
47  * Return: 0 if userptr vma is valid, -EAGAIN otherwise; repin recommended.
48  */
49 int xe_vma_userptr_check_repin(struct xe_vma *vma)
50 {
51         return mmu_interval_check_retry(&vma->userptr.notifier,
52                                         vma->userptr.notifier_seq) ?
53                 -EAGAIN : 0;
54 }
55
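/**
 * xe_vma_userptr_pin_pages() - Set up the backing pages of a userptr vma
 * @vma: The userptr vma.
 *
 * Takes references on the current user pages backing @vma, builds and
 * DMA-maps an sg table from them and then drops the page references again,
 * relying on the MMU interval notifier for coherency. If the range is
 * invalidated while this is in progress, the function retries. Must be
 * called with the vm->lock held.
 *
 * Return: 0 on success, negative error code on failure.
 */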
56 int xe_vma_userptr_pin_pages(struct xe_vma *vma)
57 {
58         struct xe_vm *vm = xe_vma_vm(vma);
59         struct xe_device *xe = vm->xe;
60         const unsigned long num_pages = xe_vma_size(vma) >> PAGE_SHIFT;
61         struct page **pages;
62         bool in_kthread = !current->mm;
63         unsigned long notifier_seq;
64         int pinned, ret, i;
65         bool read_only = xe_vma_read_only(vma);
66
67         lockdep_assert_held(&vm->lock);
68         XE_WARN_ON(!xe_vma_is_userptr(vma));
69 retry:
70         if (vma->gpuva.flags & XE_VMA_DESTROYED)
71                 return 0;
72
73         notifier_seq = mmu_interval_read_begin(&vma->userptr.notifier);
74         if (notifier_seq == vma->userptr.notifier_seq)
75                 return 0;
76
77         pages = kvmalloc_array(num_pages, sizeof(*pages), GFP_KERNEL);
78         if (!pages)
79                 return -ENOMEM;
80
81         if (vma->userptr.sg) {
82                 dma_unmap_sgtable(xe->drm.dev,
83                                   vma->userptr.sg,
84                                   read_only ? DMA_TO_DEVICE :
85                                   DMA_BIDIRECTIONAL, 0);
86                 sg_free_table(vma->userptr.sg);
87                 vma->userptr.sg = NULL;
88         }
89
90         pinned = ret = 0;
91         if (in_kthread) {
92                 if (!mmget_not_zero(vma->userptr.notifier.mm)) {
93                         ret = -EFAULT;
94                         goto mm_closed;
95                 }
96                 kthread_use_mm(vma->userptr.notifier.mm);
97         }
98
99         while (pinned < num_pages) {
100                 ret = get_user_pages_fast(xe_vma_userptr(vma) +
101                                           pinned * PAGE_SIZE,
102                                           num_pages - pinned,
103                                           read_only ? 0 : FOLL_WRITE,
104                                           &pages[pinned]);
105                 if (ret < 0) {
106                         if (in_kthread)
107                                 ret = 0;
108                         break;
109                 }
110
111                 pinned += ret;
112                 ret = 0;
113         }
114
115         if (in_kthread) {
116                 kthread_unuse_mm(vma->userptr.notifier.mm);
117                 mmput(vma->userptr.notifier.mm);
118         }
119 mm_closed:
120         if (ret)
121                 goto out;
122
123         ret = sg_alloc_table_from_pages_segment(&vma->userptr.sgt, pages,
124                                                 pinned, 0,
125                                                 (u64)pinned << PAGE_SHIFT,
126                                                 xe_sg_segment_size(xe->drm.dev),
127                                                 GFP_KERNEL);
128         if (ret) {
129                 vma->userptr.sg = NULL;
130                 goto out;
131         }
132         vma->userptr.sg = &vma->userptr.sgt;
133
134         ret = dma_map_sgtable(xe->drm.dev, vma->userptr.sg,
135                               read_only ? DMA_TO_DEVICE :
136                               DMA_BIDIRECTIONAL,
137                               DMA_ATTR_SKIP_CPU_SYNC |
138                               DMA_ATTR_NO_KERNEL_MAPPING);
139         if (ret) {
140                 sg_free_table(vma->userptr.sg);
141                 vma->userptr.sg = NULL;
142                 goto out;
143         }
144
145         for (i = 0; i < pinned; ++i) {
146                 if (!read_only) {
147                         lock_page(pages[i]);
148                         set_page_dirty(pages[i]);
149                         unlock_page(pages[i]);
150                 }
151
152                 mark_page_accessed(pages[i]);
153         }
154
155 out:
156         release_pages(pages, pinned);
157         kvfree(pages);
158
159         if (!(ret < 0)) {
160                 vma->userptr.notifier_seq = notifier_seq;
161                 if (xe_vma_userptr_check_repin(vma) == -EAGAIN)
162                         goto retry;
163         }
164
165         return ret < 0 ? ret : 0;
166 }
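
/*
 * Illustrative sketch (not part of the driver): the advisory check above is
 * typically paired with a repin and a re-check under the notifier lock. The
 * helper name below is hypothetical; the calls mirror the functions above.
 */
#if 0
static int example_revalidate_userptr(struct xe_vm *vm, struct xe_vma *vma)
{
	int err;

	lockdep_assert_held(&vm->lock);

	/* Lockless, advisory check; a stale result is caught below. */
	if (!xe_vma_userptr_check_repin(vma))
		return 0;

	/* Repin the pages outside the notifier lock... */
	err = xe_vma_userptr_pin_pages(vma);
	if (err)
		return err;

	/* ...and only trust them after a check under the notifier lock. */
	down_read(&vm->userptr.notifier_lock);
	err = __xe_vm_userptr_needs_repin(vm);
	up_read(&vm->userptr.notifier_lock);

	return err;	/* -EAGAIN means the caller should retry. */
}
#endif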
167
168 static bool preempt_fences_waiting(struct xe_vm *vm)
169 {
170         struct xe_exec_queue *q;
171
172         lockdep_assert_held(&vm->lock);
173         xe_vm_assert_held(vm);
174
175         list_for_each_entry(q, &vm->preempt.exec_queues, compute.link) {
176                 if (!q->compute.pfence ||
177                     (q->compute.pfence && test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
178                                                    &q->compute.pfence->flags))) {
179                         return true;
180                 }
181         }
182
183         return false;
184 }
185
186 static void free_preempt_fences(struct list_head *list)
187 {
188         struct list_head *link, *next;
189
190         list_for_each_safe(link, next, list)
191                 xe_preempt_fence_free(to_preempt_fence_from_link(link));
192 }
193
194 static int alloc_preempt_fences(struct xe_vm *vm, struct list_head *list,
195                                 unsigned int *count)
196 {
197         lockdep_assert_held(&vm->lock);
198         xe_vm_assert_held(vm);
199
200         if (*count >= vm->preempt.num_exec_queues)
201                 return 0;
202
203         for (; *count < vm->preempt.num_exec_queues; ++(*count)) {
204                 struct xe_preempt_fence *pfence = xe_preempt_fence_alloc();
205
206                 if (IS_ERR(pfence))
207                         return PTR_ERR(pfence);
208
209                 list_move_tail(xe_preempt_fence_link(pfence), list);
210         }
211
212         return 0;
213 }
214
215 static int wait_for_existing_preempt_fences(struct xe_vm *vm)
216 {
217         struct xe_exec_queue *q;
218
219         xe_vm_assert_held(vm);
220
221         list_for_each_entry(q, &vm->preempt.exec_queues, compute.link) {
222                 if (q->compute.pfence) {
223                         long timeout = dma_fence_wait(q->compute.pfence, false);
224
225                         if (timeout < 0)
226                                 return -ETIME;
227                         dma_fence_put(q->compute.pfence);
228                         q->compute.pfence = NULL;
229                 }
230         }
231
232         return 0;
233 }
234
235 static bool xe_vm_is_idle(struct xe_vm *vm)
236 {
237         struct xe_exec_queue *q;
238
239         xe_vm_assert_held(vm);
240         list_for_each_entry(q, &vm->preempt.exec_queues, compute.link) {
241                 if (!xe_exec_queue_is_idle(q))
242                         return false;
243         }
244
245         return true;
246 }
247
248 static void arm_preempt_fences(struct xe_vm *vm, struct list_head *list)
249 {
250         struct list_head *link;
251         struct xe_exec_queue *q;
252
253         list_for_each_entry(q, &vm->preempt.exec_queues, compute.link) {
254                 struct dma_fence *fence;
255
256                 link = list->next;
257                 XE_WARN_ON(link == list);
258
259                 fence = xe_preempt_fence_arm(to_preempt_fence_from_link(link),
260                                              q, q->compute.context,
261                                              ++q->compute.seqno);
262                 dma_fence_put(q->compute.pfence);
263                 q->compute.pfence = fence;
264         }
265 }
266
267 static int add_preempt_fences(struct xe_vm *vm, struct xe_bo *bo)
268 {
269         struct xe_exec_queue *q;
270         int err;
271
272         err = xe_bo_lock(bo, true);
273         if (err)
274                 return err;
275
276         err = dma_resv_reserve_fences(bo->ttm.base.resv, vm->preempt.num_exec_queues);
277         if (err)
278                 goto out_unlock;
279
280         list_for_each_entry(q, &vm->preempt.exec_queues, compute.link)
281                 if (q->compute.pfence) {
282                         dma_resv_add_fence(bo->ttm.base.resv,
283                                            q->compute.pfence,
284                                            DMA_RESV_USAGE_BOOKKEEP);
285                 }
286
287 out_unlock:
288         xe_bo_unlock(bo);
289         return err;
290 }
291
292 /**
293  * xe_vm_fence_all_extobjs() - Add a fence to vm's external objects' resv
294  * @vm: The vm.
295  * @fence: The fence to add.
296  * @usage: The resv usage for the fence.
297  *
298  * Loops over all of the vm's external object bindings and adds a @fence
299  * with the given @usage to all of the external object's reservation
300  * objects.
301  */
302 void xe_vm_fence_all_extobjs(struct xe_vm *vm, struct dma_fence *fence,
303                              enum dma_resv_usage usage)
304 {
305         struct xe_vma *vma;
306
307         list_for_each_entry(vma, &vm->extobj.list, extobj.link)
308                 dma_resv_add_fence(xe_vma_bo(vma)->ttm.base.resv, fence, usage);
309 }
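
/*
 * Illustrative sketch (not part of the driver): a fence that gates further
 * GPU access to the VM is typically added both to the vm's own resv and,
 * via the helper above, to all external objects. The helper name below is
 * hypothetical and assumes the caller already holds the relevant resvs.
 */
#if 0
static void example_publish_fence(struct xe_vm *vm, struct dma_fence *fence)
{
	xe_vm_assert_held(vm);	/* all extobj resvs assumed locked as well */

	dma_resv_add_fence(xe_vm_resv(vm), fence, DMA_RESV_USAGE_BOOKKEEP);
	xe_vm_fence_all_extobjs(vm, fence, DMA_RESV_USAGE_BOOKKEEP);
}
#endif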
310
311 static void resume_and_reinstall_preempt_fences(struct xe_vm *vm)
312 {
313         struct xe_exec_queue *q;
314
315         lockdep_assert_held(&vm->lock);
316         xe_vm_assert_held(vm);
317
318         list_for_each_entry(q, &vm->preempt.exec_queues, compute.link) {
319                 q->ops->resume(q);
320
321                 dma_resv_add_fence(xe_vm_resv(vm), q->compute.pfence,
322                                    DMA_RESV_USAGE_BOOKKEEP);
323                 xe_vm_fence_all_extobjs(vm, q->compute.pfence,
324                                         DMA_RESV_USAGE_BOOKKEEP);
325         }
326 }
327
328 int xe_vm_add_compute_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
329 {
330         struct ttm_validate_buffer tv_onstack[XE_ONSTACK_TV];
331         struct ttm_validate_buffer *tv;
332         struct ww_acquire_ctx ww;
333         struct list_head objs;
334         struct dma_fence *pfence;
335         int err;
336         bool wait;
337
338         XE_WARN_ON(!xe_vm_in_compute_mode(vm));
339
340         down_write(&vm->lock);
341
342         err = xe_vm_lock_dma_resv(vm, &ww, tv_onstack, &tv, &objs, true, 1);
343         if (err)
344                 goto out_unlock_outer;
345
346         pfence = xe_preempt_fence_create(q, q->compute.context,
347                                          ++q->compute.seqno);
348         if (!pfence) {
349                 err = -ENOMEM;
350                 goto out_unlock;
351         }
352
353         list_add(&q->compute.link, &vm->preempt.exec_queues);
354         ++vm->preempt.num_exec_queues;
355         q->compute.pfence = pfence;
356
357         down_read(&vm->userptr.notifier_lock);
358
359         dma_resv_add_fence(xe_vm_resv(vm), pfence,
360                            DMA_RESV_USAGE_BOOKKEEP);
361
362         xe_vm_fence_all_extobjs(vm, pfence, DMA_RESV_USAGE_BOOKKEEP);
363
364         /*
365          * Check to see if a preemption on VM is in flight or userptr
366          * invalidation, if so trigger this preempt fence to sync state with
367          * other preempt fences on the VM.
368          */
369         wait = __xe_vm_userptr_needs_repin(vm) || preempt_fences_waiting(vm);
370         if (wait)
371                 dma_fence_enable_sw_signaling(pfence);
372
373         up_read(&vm->userptr.notifier_lock);
374
375 out_unlock:
376         xe_vm_unlock_dma_resv(vm, tv_onstack, tv, &ww, &objs);
377 out_unlock_outer:
378         up_write(&vm->lock);
379
380         return err;
381 }
382
383 /**
384  * __xe_vm_userptr_needs_repin() - Check whether the VM does have userptrs
385  * that need repinning.
386  * @vm: The VM.
387  *
388  * This function checks for whether the VM has userptrs that need repinning,
389  * and provides a release-type barrier on the userptr.notifier_lock after
390  * checking.
391  *
392  * Return: 0 if there are no userptrs needing repinning, -EAGAIN if there are.
393  */
394 int __xe_vm_userptr_needs_repin(struct xe_vm *vm)
395 {
396         lockdep_assert_held_read(&vm->userptr.notifier_lock);
397
398         return (list_empty(&vm->userptr.repin_list) &&
399                 list_empty(&vm->userptr.invalidated)) ? 0 : -EAGAIN;
400 }
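
/*
 * Illustrative sketch (not part of the driver): callers about to commit GPU
 * work typically take the notifier lock, bail out with -EAGAIN if a repin is
 * needed, and otherwise keep the lock held until the work is published. The
 * function name below is hypothetical.
 */
#if 0
static int example_commit_with_userptr_check(struct xe_vm *vm)
{
	int err;

	down_read(&vm->userptr.notifier_lock);
	err = __xe_vm_userptr_needs_repin(vm);
	if (err) {
		up_read(&vm->userptr.notifier_lock);
		return err;	/* -EAGAIN: repin and retry */
	}

	/* ... install fences / submit while the lock is held ... */

	up_read(&vm->userptr.notifier_lock);
	return 0;
}
#endif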
401
402 /**
403  * xe_vm_lock_dma_resv() - Lock the vm dma_resv object and the dma_resv
404  * objects of the vm's external buffer objects.
405  * @vm: The vm.
406  * @ww: Pointer to a struct ww_acquire_ctx locking context.
407  * @tv_onstack: Array size XE_ONSTACK_TV of storage for the struct
408  * ttm_validate_buffers used for locking.
409  * @tv: Pointer to a pointer that on output contains the actual storage used.
410  * @objs: List head for the buffer objects locked.
411  * @intr: Whether to lock interruptible.
412  * @num_shared: Number of dma-fence slots to reserve in the locked objects.
413  *
414  * Locks the vm dma-resv objects and all the dma-resv objects of the
415  * buffer objects on the vm external object list. The TTM utilities require
416  * a list of struct ttm_validate_buffers pointing to the actual buffer
417  * objects to lock. Storage for those struct ttm_validate_buffers should
418  * be provided in @tv_onstack, and is typically reserved on the stack
419  * of the caller. If the size of @tv_onstack isn't sufficient, then
420  * storage will be allocated internally using kvmalloc().
421  *
422  * The function performs deadlock handling internally, and after a
423  * successful return the ww locking transaction should be considered
424  * sealed.
425  *
426  * Return: 0 on success, negative error code on error. In particular if
427  * @intr is set to true, -EINTR or -ERESTARTSYS may be returned. In case
428  * of error, any locking performed has been reverted.
429  */
430 int xe_vm_lock_dma_resv(struct xe_vm *vm, struct ww_acquire_ctx *ww,
431                         struct ttm_validate_buffer *tv_onstack,
432                         struct ttm_validate_buffer **tv,
433                         struct list_head *objs,
434                         bool intr,
435                         unsigned int num_shared)
436 {
437         struct ttm_validate_buffer *tv_vm, *tv_bo;
438         struct xe_vma *vma, *next;
439         LIST_HEAD(dups);
440         int err;
441
442         lockdep_assert_held(&vm->lock);
443
444         if (vm->extobj.entries < XE_ONSTACK_TV) {
445                 tv_vm = tv_onstack;
446         } else {
447                 tv_vm = kvmalloc_array(vm->extobj.entries + 1, sizeof(*tv_vm),
448                                        GFP_KERNEL);
449                 if (!tv_vm)
450                         return -ENOMEM;
451         }
452         tv_bo = tv_vm + 1;
453
454         INIT_LIST_HEAD(objs);
455         list_for_each_entry(vma, &vm->extobj.list, extobj.link) {
456                 tv_bo->num_shared = num_shared;
457                 tv_bo->bo = &xe_vma_bo(vma)->ttm;
458
459                 list_add_tail(&tv_bo->head, objs);
460                 tv_bo++;
461         }
462         tv_vm->num_shared = num_shared;
463         tv_vm->bo = xe_vm_ttm_bo(vm);
464         list_add_tail(&tv_vm->head, objs);
465         err = ttm_eu_reserve_buffers(ww, objs, intr, &dups);
466         if (err)
467                 goto out_err;
468
469         spin_lock(&vm->notifier.list_lock);
470         list_for_each_entry_safe(vma, next, &vm->notifier.rebind_list,
471                                  notifier.rebind_link) {
472                 xe_bo_assert_held(xe_vma_bo(vma));
473
474                 list_del_init(&vma->notifier.rebind_link);
475                 if (vma->tile_present && !(vma->gpuva.flags & XE_VMA_DESTROYED))
476                         list_move_tail(&vma->combined_links.rebind,
477                                        &vm->rebind_list);
478         }
479         spin_unlock(&vm->notifier.list_lock);
480
481         *tv = tv_vm;
482         return 0;
483
484 out_err:
485         if (tv_vm != tv_onstack)
486                 kvfree(tv_vm);
487
488         return err;
489 }
490
491 /**
492  * xe_vm_unlock_dma_resv() - Unlock reservation objects locked by
493  * xe_vm_lock_dma_resv()
494  * @vm: The vm.
495  * @tv_onstack: The @tv_onstack array given to xe_vm_lock_dma_resv().
496  * @tv: The value of *@tv given by xe_vm_lock_dma_resv().
497  * @ww: The ww_acquire_context used for locking.
498  * @objs: The list returned from xe_vm_lock_dma_resv().
499  *
500  * Unlocks the reservation objects and frees any memory allocated by
501  * xe_vm_lock_dma_resv().
502  */
503 void xe_vm_unlock_dma_resv(struct xe_vm *vm,
504                            struct ttm_validate_buffer *tv_onstack,
505                            struct ttm_validate_buffer *tv,
506                            struct ww_acquire_ctx *ww,
507                            struct list_head *objs)
508 {
509         /*
510          * Nothing should've been able to enter the list while we were locked:
511          * we've held the dma-resvs of all the vm's external objects, holding
512          * an object's dma_resv is required for list addition, and we don't
513          * add entries ourselves here.
514          */
515         XE_WARN_ON(!list_empty(&vm->notifier.rebind_list));
516
517         ttm_eu_backoff_reservation(ww, objs);
518         if (tv && tv != tv_onstack)
519                 kvfree(tv);
520 }
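
/*
 * Illustrative sketch (not part of the driver): xe_vm_lock_dma_resv() and
 * xe_vm_unlock_dma_resv() are used as a pair around work that needs the vm
 * resv and all external-object resvs held, with on-stack tv storage for the
 * common case. The caller below is hypothetical.
 */
#if 0
static int example_with_all_resvs_locked(struct xe_vm *vm)
{
	struct ttm_validate_buffer tv_onstack[XE_ONSTACK_TV];
	struct ttm_validate_buffer *tv;
	struct ww_acquire_ctx ww;
	struct list_head objs;
	int err;

	lockdep_assert_held(&vm->lock);

	err = xe_vm_lock_dma_resv(vm, &ww, tv_onstack, &tv, &objs, true, 1);
	if (err)
		return err;

	/* ... validate BOs, install fences, etc. ... */

	xe_vm_unlock_dma_resv(vm, tv_onstack, tv, &ww, &objs);
	return 0;
}
#endif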
521
522 #define XE_VM_REBIND_RETRY_TIMEOUT_MS 1000
523
524 static void xe_vm_kill(struct xe_vm *vm)
525 {
526         struct xe_exec_queue *q;
527
528         lockdep_assert_held(&vm->lock);
529
530         xe_vm_lock(vm, false);
531         vm->flags |= XE_VM_FLAG_BANNED;
532         trace_xe_vm_kill(vm);
533
534         list_for_each_entry(q, &vm->preempt.exec_queues, compute.link)
535                 q->ops->kill(q);
536         xe_vm_unlock(vm);
537
538         /* TODO: Inform user the VM is banned */
539 }
540
541 static void preempt_rebind_work_func(struct work_struct *w)
542 {
543         struct xe_vm *vm = container_of(w, struct xe_vm, preempt.rebind_work);
544         struct xe_vma *vma;
545         struct ttm_validate_buffer tv_onstack[XE_ONSTACK_TV];
546         struct ttm_validate_buffer *tv;
547         struct ww_acquire_ctx ww;
548         struct list_head objs;
549         struct dma_fence *rebind_fence;
550         unsigned int fence_count = 0;
551         LIST_HEAD(preempt_fences);
552         ktime_t end = 0;
553         int err;
554         long wait;
555         int __maybe_unused tries = 0;
556
557         XE_WARN_ON(!xe_vm_in_compute_mode(vm));
558         trace_xe_vm_rebind_worker_enter(vm);
559
560         down_write(&vm->lock);
561
562         if (xe_vm_is_closed_or_banned(vm)) {
563                 up_write(&vm->lock);
564                 trace_xe_vm_rebind_worker_exit(vm);
565                 return;
566         }
567
568 retry:
569         if (vm->async_ops.error)
570                 goto out_unlock_outer;
571
572         /*
573          * Extreme corner where we exit a VM error state with a munmap style VM
574          * needs to install some fences into the dma-resv slots. The worker to
575          * do this is queued; let that worker make progress by dropping vm->lock
576          * and trying this again.
577          * and trying this again.
578          */
579         if (vm->async_ops.munmap_rebind_inflight) {
580                 up_write(&vm->lock);
581                 flush_work(&vm->async_ops.work);
582                 goto retry;
583         }
584
585         if (xe_vm_userptr_check_repin(vm)) {
586                 err = xe_vm_userptr_pin(vm);
587                 if (err)
588                         goto out_unlock_outer;
589         }
590
591         err = xe_vm_lock_dma_resv(vm, &ww, tv_onstack, &tv, &objs,
592                                   false, vm->preempt.num_exec_queues);
593         if (err)
594                 goto out_unlock_outer;
595
596         if (xe_vm_is_idle(vm)) {
597                 vm->preempt.rebind_deactivated = true;
598                 goto out_unlock;
599         }
600
601         /* Fresh preempt fences already installed. Everything is running. */
602         if (!preempt_fences_waiting(vm))
603                 goto out_unlock;
604
605         /*
606          * This makes sure the vm is completely suspended and also balances
607          * xe_exec_queue suspend and resume; we resume *all* vm exec queues below.
608          */
609         err = wait_for_existing_preempt_fences(vm);
610         if (err)
611                 goto out_unlock;
612
613         err = alloc_preempt_fences(vm, &preempt_fences, &fence_count);
614         if (err)
615                 goto out_unlock;
616
617         list_for_each_entry(vma, &vm->rebind_list, combined_links.rebind) {
618                 if (xe_vma_has_no_bo(vma) ||
619                     vma->gpuva.flags & XE_VMA_DESTROYED)
620                         continue;
621
622                 err = xe_bo_validate(xe_vma_bo(vma), vm, false);
623                 if (err)
624                         goto out_unlock;
625         }
626
627         rebind_fence = xe_vm_rebind(vm, true);
628         if (IS_ERR(rebind_fence)) {
629                 err = PTR_ERR(rebind_fence);
630                 goto out_unlock;
631         }
632
633         if (rebind_fence) {
634                 dma_fence_wait(rebind_fence, false);
635                 dma_fence_put(rebind_fence);
636         }
637
638         /* Wait on munmap style VM unbinds */
639         wait = dma_resv_wait_timeout(xe_vm_resv(vm),
640                                      DMA_RESV_USAGE_KERNEL,
641                                      false, MAX_SCHEDULE_TIMEOUT);
642         if (wait <= 0) {
643                 err = -ETIME;
644                 goto out_unlock;
645         }
646
647 #define retry_required(__tries, __vm) \
648         (IS_ENABLED(CONFIG_DRM_XE_USERPTR_INVAL_INJECT) ? \
649         (!(__tries)++ || __xe_vm_userptr_needs_repin(__vm)) : \
650         __xe_vm_userptr_needs_repin(__vm))
651
652         down_read(&vm->userptr.notifier_lock);
653         if (retry_required(tries, vm)) {
654                 up_read(&vm->userptr.notifier_lock);
655                 err = -EAGAIN;
656                 goto out_unlock;
657         }
658
659 #undef retry_required
660
661         spin_lock(&vm->xe->ttm.lru_lock);
662         ttm_lru_bulk_move_tail(&vm->lru_bulk_move);
663         spin_unlock(&vm->xe->ttm.lru_lock);
664
665         /* Point of no return. */
666         arm_preempt_fences(vm, &preempt_fences);
667         resume_and_reinstall_preempt_fences(vm);
668         up_read(&vm->userptr.notifier_lock);
669
670 out_unlock:
671         xe_vm_unlock_dma_resv(vm, tv_onstack, tv, &ww, &objs);
672 out_unlock_outer:
673         if (err == -EAGAIN) {
674                 trace_xe_vm_rebind_worker_retry(vm);
675                 goto retry;
676         }
677
678         /*
679          * With multiple active VMs, under memory pressure, it is possible that
680          * ttm_bo_validate() runs into -EDEADLK and in such a case returns -ENOMEM.
681          * Until ttm properly handles locking in such scenarios, the best thing the
682          * driver can do is retry with a timeout. Killing the VM or putting it
683          * in error state after timeout or other error scenarios is still TBD.
684          */
685         if (err == -ENOMEM) {
686                 ktime_t cur = ktime_get();
687
688                 end = end ? : ktime_add_ms(cur, XE_VM_REBIND_RETRY_TIMEOUT_MS);
689                 if (ktime_before(cur, end)) {
690                         msleep(20);
691                         trace_xe_vm_rebind_worker_retry(vm);
692                         goto retry;
693                 }
694         }
695         if (err) {
696                 drm_warn(&vm->xe->drm, "VM worker error: %d\n", err);
697                 xe_vm_kill(vm);
698         }
699         up_write(&vm->lock);
700
701         free_preempt_fences(&preempt_fences);
702
703         trace_xe_vm_rebind_worker_exit(vm);
704 }
705
706 static bool vma_userptr_invalidate(struct mmu_interval_notifier *mni,
707                                    const struct mmu_notifier_range *range,
708                                    unsigned long cur_seq)
709 {
710         struct xe_vma *vma = container_of(mni, struct xe_vma, userptr.notifier);
711         struct xe_vm *vm = xe_vma_vm(vma);
712         struct dma_resv_iter cursor;
713         struct dma_fence *fence;
714         long err;
715
716         XE_WARN_ON(!xe_vma_is_userptr(vma));
717         trace_xe_vma_userptr_invalidate(vma);
718
719         if (!mmu_notifier_range_blockable(range))
720                 return false;
721
722         down_write(&vm->userptr.notifier_lock);
723         mmu_interval_set_seq(mni, cur_seq);
724
725         /* No need to stop gpu access if the userptr is not yet bound. */
726         if (!vma->userptr.initial_bind) {
727                 up_write(&vm->userptr.notifier_lock);
728                 return true;
729         }
730
731         /*
732          * Tell exec and rebind worker they need to repin and rebind this
733          * userptr.
734          */
735         if (!xe_vm_in_fault_mode(vm) &&
736             !(vma->gpuva.flags & XE_VMA_DESTROYED) && vma->tile_present) {
737                 spin_lock(&vm->userptr.invalidated_lock);
738                 list_move_tail(&vma->userptr.invalidate_link,
739                                &vm->userptr.invalidated);
740                 spin_unlock(&vm->userptr.invalidated_lock);
741         }
742
743         up_write(&vm->userptr.notifier_lock);
744
745         /*
746          * Preempt fences turn into schedule disables, pipeline these.
747          * Note that even in fault mode, we need to wait for binds and
748          * unbinds to complete, and those are attached as BOOKKEEP fences
749          * to the vm.
750          */
751         dma_resv_iter_begin(&cursor, xe_vm_resv(vm),
752                             DMA_RESV_USAGE_BOOKKEEP);
753         dma_resv_for_each_fence_unlocked(&cursor, fence)
754                 dma_fence_enable_sw_signaling(fence);
755         dma_resv_iter_end(&cursor);
756
757         err = dma_resv_wait_timeout(xe_vm_resv(vm),
758                                     DMA_RESV_USAGE_BOOKKEEP,
759                                     false, MAX_SCHEDULE_TIMEOUT);
760         XE_WARN_ON(err <= 0);
761
762         if (xe_vm_in_fault_mode(vm)) {
763                 err = xe_vm_invalidate_vma(vma);
764                 XE_WARN_ON(err);
765         }
766
767         trace_xe_vma_userptr_invalidate_complete(vma);
768
769         return true;
770 }
771
772 static const struct mmu_interval_notifier_ops vma_userptr_notifier_ops = {
773         .invalidate = vma_userptr_invalidate,
774 };
775
776 int xe_vm_userptr_pin(struct xe_vm *vm)
777 {
778         struct xe_vma *vma, *next;
779         int err = 0;
780         LIST_HEAD(tmp_evict);
781
782         lockdep_assert_held_write(&vm->lock);
783
784         /* Collect invalidated userptrs */
785         spin_lock(&vm->userptr.invalidated_lock);
786         list_for_each_entry_safe(vma, next, &vm->userptr.invalidated,
787                                  userptr.invalidate_link) {
788                 list_del_init(&vma->userptr.invalidate_link);
789                 if (list_empty(&vma->combined_links.userptr))
790                         list_move_tail(&vma->combined_links.userptr,
791                                        &vm->userptr.repin_list);
792         }
793         spin_unlock(&vm->userptr.invalidated_lock);
794
795         /* Pin and move to temporary list */
796         list_for_each_entry_safe(vma, next, &vm->userptr.repin_list,
797                                  combined_links.userptr) {
798                 err = xe_vma_userptr_pin_pages(vma);
799                 if (err < 0)
800                         goto out_err;
801
802                 list_move_tail(&vma->combined_links.userptr, &tmp_evict);
803         }
804
805         /* Take lock and move to rebind_list for rebinding. */
806         err = dma_resv_lock_interruptible(xe_vm_resv(vm), NULL);
807         if (err)
808                 goto out_err;
809
810         list_for_each_entry_safe(vma, next, &tmp_evict, combined_links.userptr)
811                 list_move_tail(&vma->combined_links.rebind, &vm->rebind_list);
812
813         dma_resv_unlock(xe_vm_resv(vm));
814
815         return 0;
816
817 out_err:
818         list_splice_tail(&tmp_evict, &vm->userptr.repin_list);
819
820         return err;
821 }
822
823 /**
824  * xe_vm_userptr_check_repin() - Check whether the VM might have userptrs
825  * that need repinning.
826  * @vm: The VM.
827  *
828  * This function does an advisory check for whether the VM has userptrs that
829  * need repinning.
830  *
831  * Return: 0 if there are no indications of userptrs needing repinning,
832  * -EAGAIN if there are.
833  */
834 int xe_vm_userptr_check_repin(struct xe_vm *vm)
835 {
836         return (list_empty_careful(&vm->userptr.repin_list) &&
837                 list_empty_careful(&vm->userptr.invalidated)) ? 0 : -EAGAIN;
838 }
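
/*
 * Illustrative sketch (not part of the driver): the vm-level advisory check
 * is typically used to decide whether to run the (potentially expensive)
 * xe_vm_userptr_pin() pass at all. The helper name below is hypothetical.
 */
#if 0
static int example_maybe_repin(struct xe_vm *vm)
{
	lockdep_assert_held_write(&vm->lock);

	/* Advisory: skip the pin pass when nothing looks invalidated. */
	if (!xe_vm_userptr_check_repin(vm))
		return 0;

	return xe_vm_userptr_pin(vm);
}
#endif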
839
840 static struct dma_fence *
841 xe_vm_bind_vma(struct xe_vma *vma, struct xe_exec_queue *q,
842                struct xe_sync_entry *syncs, u32 num_syncs,
843                bool first_op, bool last_op);
844
845 struct dma_fence *xe_vm_rebind(struct xe_vm *vm, bool rebind_worker)
846 {
847         struct dma_fence *fence = NULL;
848         struct xe_vma *vma, *next;
849
850         lockdep_assert_held(&vm->lock);
851         if (xe_vm_no_dma_fences(vm) && !rebind_worker)
852                 return NULL;
853
854         xe_vm_assert_held(vm);
855         list_for_each_entry_safe(vma, next, &vm->rebind_list,
856                                  combined_links.rebind) {
857                 XE_WARN_ON(!vma->tile_present);
858
859                 list_del_init(&vma->combined_links.rebind);
860                 dma_fence_put(fence);
861                 if (rebind_worker)
862                         trace_xe_vma_rebind_worker(vma);
863                 else
864                         trace_xe_vma_rebind_exec(vma);
865                 fence = xe_vm_bind_vma(vma, NULL, NULL, 0, false, false);
866                 if (IS_ERR(fence))
867                         return fence;
868         }
869
870         return fence;
871 }
872
873 static struct xe_vma *xe_vma_create(struct xe_vm *vm,
874                                     struct xe_bo *bo,
875                                     u64 bo_offset_or_userptr,
876                                     u64 start, u64 end,
877                                     bool read_only,
878                                     bool is_null,
879                                     u8 tile_mask)
880 {
881         struct xe_vma *vma;
882         struct xe_tile *tile;
883         u8 id;
884
885         XE_WARN_ON(start >= end);
886         XE_WARN_ON(end >= vm->size);
887
888         if (!bo && !is_null)    /* userptr */
889                 vma = kzalloc(sizeof(*vma), GFP_KERNEL);
890         else
891                 vma = kzalloc(sizeof(*vma) - sizeof(struct xe_userptr),
892                               GFP_KERNEL);
893         if (!vma) {
894                 vma = ERR_PTR(-ENOMEM);
895                 return vma;
896         }
897
898         INIT_LIST_HEAD(&vma->combined_links.rebind);
899         INIT_LIST_HEAD(&vma->notifier.rebind_link);
900         INIT_LIST_HEAD(&vma->extobj.link);
901
902         INIT_LIST_HEAD(&vma->gpuva.gem.entry);
903         vma->gpuva.vm = &vm->gpuvm;
904         vma->gpuva.va.addr = start;
905         vma->gpuva.va.range = end - start + 1;
906         if (read_only)
907                 vma->gpuva.flags |= XE_VMA_READ_ONLY;
908         if (is_null)
909                 vma->gpuva.flags |= DRM_GPUVA_SPARSE;
910
911         if (tile_mask) {
912                 vma->tile_mask = tile_mask;
913         } else {
914                 for_each_tile(tile, vm->xe, id)
915                         vma->tile_mask |= 0x1 << id;
916         }
917
918         if (vm->xe->info.platform == XE_PVC)
919                 vma->gpuva.flags |= XE_VMA_ATOMIC_PTE_BIT;
920
921         if (bo) {
922                 struct drm_gpuvm_bo *vm_bo;
923
924                 xe_bo_assert_held(bo);
925
926                 vm_bo = drm_gpuvm_bo_obtain(vma->gpuva.vm, &bo->ttm.base);
927                 if (IS_ERR(vm_bo)) {
928                         kfree(vma);
929                         return ERR_CAST(vm_bo);
930                 }
931
932                 drm_gem_object_get(&bo->ttm.base);
933                 vma->gpuva.gem.obj = &bo->ttm.base;
934                 vma->gpuva.gem.offset = bo_offset_or_userptr;
935                 drm_gpuva_link(&vma->gpuva, vm_bo);
936                 drm_gpuvm_bo_put(vm_bo);
937         } else /* userptr or null */ {
938                 if (!is_null) {
939                         u64 size = end - start + 1;
940                         int err;
941
942                         INIT_LIST_HEAD(&vma->userptr.invalidate_link);
943                         vma->gpuva.gem.offset = bo_offset_or_userptr;
944
945                         err = mmu_interval_notifier_insert(&vma->userptr.notifier,
946                                                            current->mm,
947                                                            xe_vma_userptr(vma), size,
948                                                            &vma_userptr_notifier_ops);
949                         if (err) {
950                                 kfree(vma);
951                                 vma = ERR_PTR(err);
952                                 return vma;
953                         }
954
955                         vma->userptr.notifier_seq = LONG_MAX;
956                 }
957
958                 xe_vm_get(vm);
959         }
960
961         return vma;
962 }
963
964 static bool vm_remove_extobj(struct xe_vma *vma)
965 {
966         if (!list_empty(&vma->extobj.link)) {
967                 xe_vma_vm(vma)->extobj.entries--;
968                 list_del_init(&vma->extobj.link);
969                 return true;
970         }
971         return false;
972 }
973
974 static void xe_vma_destroy_late(struct xe_vma *vma)
975 {
976         struct xe_vm *vm = xe_vma_vm(vma);
977         struct xe_device *xe = vm->xe;
978         bool read_only = xe_vma_read_only(vma);
979
980         if (xe_vma_is_userptr(vma)) {
981                 if (vma->userptr.sg) {
982                         dma_unmap_sgtable(xe->drm.dev,
983                                           vma->userptr.sg,
984                                           read_only ? DMA_TO_DEVICE :
985                                           DMA_BIDIRECTIONAL, 0);
986                         sg_free_table(vma->userptr.sg);
987                         vma->userptr.sg = NULL;
988                 }
989
990                 /*
991                  * Since userptr pages are not pinned, we can't remove
992                  * the notifier until we're sure the GPU is not accessing
993                  * them anymore.
994                  */
995                 mmu_interval_notifier_remove(&vma->userptr.notifier);
996                 xe_vm_put(vm);
997         } else if (xe_vma_is_null(vma)) {
998                 xe_vm_put(vm);
999         } else {
1000                 xe_bo_put(xe_vma_bo(vma));
1001         }
1002
1003         kfree(vma);
1004 }
1005
1006 static void vma_destroy_work_func(struct work_struct *w)
1007 {
1008         struct xe_vma *vma =
1009                 container_of(w, struct xe_vma, destroy_work);
1010
1011         xe_vma_destroy_late(vma);
1012 }
1013
1014 static struct xe_vma *
1015 bo_has_vm_references_locked(struct xe_bo *bo, struct xe_vm *vm,
1016                             struct xe_vma *ignore)
1017 {
1018         struct drm_gpuvm_bo *vm_bo;
1019         struct drm_gpuva *va;
1020         struct drm_gem_object *obj = &bo->ttm.base;
1021
1022         xe_bo_assert_held(bo);
1023
1024         drm_gem_for_each_gpuvm_bo(vm_bo, obj) {
1025                 drm_gpuvm_bo_for_each_va(va, vm_bo) {
1026                         struct xe_vma *vma = gpuva_to_vma(va);
1027
1028                         if (vma != ignore && xe_vma_vm(vma) == vm)
1029                                 return vma;
1030                 }
1031         }
1032
1033         return NULL;
1034 }
1035
1036 static bool bo_has_vm_references(struct xe_bo *bo, struct xe_vm *vm,
1037                                  struct xe_vma *ignore)
1038 {
1039         bool ret;
1040
1041         xe_bo_lock(bo, false);
1042         ret = !!bo_has_vm_references_locked(bo, vm, ignore);
1043         xe_bo_unlock(bo);
1044
1045         return ret;
1046 }
1047
1048 static void __vm_insert_extobj(struct xe_vm *vm, struct xe_vma *vma)
1049 {
1050         lockdep_assert_held_write(&vm->lock);
1051
1052         list_add(&vma->extobj.link, &vm->extobj.list);
1053         vm->extobj.entries++;
1054 }
1055
1056 static void vm_insert_extobj(struct xe_vm *vm, struct xe_vma *vma)
1057 {
1058         struct xe_bo *bo = xe_vma_bo(vma);
1059
1060         lockdep_assert_held_write(&vm->lock);
1061
1062         if (bo_has_vm_references(bo, vm, vma))
1063                 return;
1064
1065         __vm_insert_extobj(vm, vma);
1066 }
1067
1068 static void vma_destroy_cb(struct dma_fence *fence,
1069                            struct dma_fence_cb *cb)
1070 {
1071         struct xe_vma *vma = container_of(cb, struct xe_vma, destroy_cb);
1072
1073         INIT_WORK(&vma->destroy_work, vma_destroy_work_func);
1074         queue_work(system_unbound_wq, &vma->destroy_work);
1075 }
1076
1077 static void xe_vma_destroy(struct xe_vma *vma, struct dma_fence *fence)
1078 {
1079         struct xe_vm *vm = xe_vma_vm(vma);
1080
1081         lockdep_assert_held_write(&vm->lock);
1082         XE_WARN_ON(!list_empty(&vma->combined_links.destroy));
1083
1084         if (xe_vma_is_userptr(vma)) {
1085                 XE_WARN_ON(!(vma->gpuva.flags & XE_VMA_DESTROYED));
1086
1087                 spin_lock(&vm->userptr.invalidated_lock);
1088                 list_del(&vma->userptr.invalidate_link);
1089                 spin_unlock(&vm->userptr.invalidated_lock);
1090         } else if (!xe_vma_is_null(vma)) {
1091                 xe_bo_assert_held(xe_vma_bo(vma));
1092
1093                 spin_lock(&vm->notifier.list_lock);
1094                 list_del(&vma->notifier.rebind_link);
1095                 spin_unlock(&vm->notifier.list_lock);
1096
1097                 drm_gpuva_unlink(&vma->gpuva);
1098
1099                 if (!xe_vma_bo(vma)->vm && vm_remove_extobj(vma)) {
1100                         struct xe_vma *other;
1101
1102                         other = bo_has_vm_references_locked(xe_vma_bo(vma), vm, NULL);
1103
1104                         if (other)
1105                                 __vm_insert_extobj(vm, other);
1106                 }
1107         }
1108
1109         xe_vm_assert_held(vm);
1110         if (fence) {
1111                 int ret = dma_fence_add_callback(fence, &vma->destroy_cb,
1112                                                  vma_destroy_cb);
1113
1114                 if (ret) {
1115                         XE_WARN_ON(ret != -ENOENT);
1116                         xe_vma_destroy_late(vma);
1117                 }
1118         } else {
1119                 xe_vma_destroy_late(vma);
1120         }
1121 }
1122
1123 static void xe_vma_destroy_unlocked(struct xe_vma *vma)
1124 {
1125         struct ttm_validate_buffer tv[2];
1126         struct ww_acquire_ctx ww;
1127         struct xe_bo *bo = xe_vma_bo(vma);
1128         LIST_HEAD(objs);
1129         LIST_HEAD(dups);
1130         int err;
1131
1132         memset(tv, 0, sizeof(tv));
1133         tv[0].bo = xe_vm_ttm_bo(xe_vma_vm(vma));
1134         list_add(&tv[0].head, &objs);
1135
1136         if (bo) {
1137                 tv[1].bo = &xe_bo_get(bo)->ttm;
1138                 list_add(&tv[1].head, &objs);
1139         }
1140         err = ttm_eu_reserve_buffers(&ww, &objs, false, &dups);
1141         XE_WARN_ON(err);
1142
1143         xe_vma_destroy(vma, NULL);
1144
1145         ttm_eu_backoff_reservation(&ww, &objs);
1146         if (bo)
1147                 xe_bo_put(bo);
1148 }
1149
1150 struct xe_vma *
1151 xe_vm_find_overlapping_vma(struct xe_vm *vm, u64 start, u64 range)
1152 {
1153         struct drm_gpuva *gpuva;
1154
1155         lockdep_assert_held(&vm->lock);
1156
1157         if (xe_vm_is_closed_or_banned(vm))
1158                 return NULL;
1159
1160         XE_WARN_ON(start + range > vm->size);
1161
1162         gpuva = drm_gpuva_find_first(&vm->gpuvm, start, range);
1163
1164         return gpuva ? gpuva_to_vma(gpuva) : NULL;
1165 }
1166
1167 static int xe_vm_insert_vma(struct xe_vm *vm, struct xe_vma *vma)
1168 {
1169         int err;
1170
1171         XE_WARN_ON(xe_vma_vm(vma) != vm);
1172         lockdep_assert_held(&vm->lock);
1173
1174         err = drm_gpuva_insert(&vm->gpuvm, &vma->gpuva);
1175         XE_WARN_ON(err);        /* Shouldn't be possible */
1176
1177         return err;
1178 }
1179
1180 static void xe_vm_remove_vma(struct xe_vm *vm, struct xe_vma *vma)
1181 {
1182         XE_WARN_ON(xe_vma_vm(vma) != vm);
1183         lockdep_assert_held(&vm->lock);
1184
1185         drm_gpuva_remove(&vma->gpuva);
1186         if (vm->usm.last_fault_vma == vma)
1187                 vm->usm.last_fault_vma = NULL;
1188 }
1189
1190 static struct drm_gpuva_op *xe_vm_op_alloc(void)
1191 {
1192         struct xe_vma_op *op;
1193
1194         op = kzalloc(sizeof(*op), GFP_KERNEL);
1195
1196         if (unlikely(!op))
1197                 return NULL;
1198
1199         return &op->base;
1200 }
1201
1202 static void xe_vm_free(struct drm_gpuvm *gpuvm);
1203
1204 static struct drm_gpuvm_ops gpuvm_ops = {
1205         .op_alloc = xe_vm_op_alloc,
1206         .vm_free = xe_vm_free,
1207 };
1208
1209 static void xe_vma_op_work_func(struct work_struct *w);
1210 static void vm_destroy_work_func(struct work_struct *w);
1211
1212 struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags)
1213 {
1214         struct drm_gem_object *vm_resv_obj;
1215         struct xe_vm *vm;
1216         int err, number_tiles = 0;
1217         struct xe_tile *tile;
1218         u8 id;
1219
1220         vm = kzalloc(sizeof(*vm), GFP_KERNEL);
1221         if (!vm)
1222                 return ERR_PTR(-ENOMEM);
1223
1224         vm->xe = xe;
1225
1226         vm->size = 1ull << xe->info.va_bits;
1227
1228         vm->flags = flags;
1229
1230         init_rwsem(&vm->lock);
1231
1232         INIT_LIST_HEAD(&vm->rebind_list);
1233
1234         INIT_LIST_HEAD(&vm->userptr.repin_list);
1235         INIT_LIST_HEAD(&vm->userptr.invalidated);
1236         init_rwsem(&vm->userptr.notifier_lock);
1237         spin_lock_init(&vm->userptr.invalidated_lock);
1238
1239         INIT_LIST_HEAD(&vm->notifier.rebind_list);
1240         spin_lock_init(&vm->notifier.list_lock);
1241
1242         INIT_LIST_HEAD(&vm->async_ops.pending);
1243         INIT_WORK(&vm->async_ops.work, xe_vma_op_work_func);
1244         spin_lock_init(&vm->async_ops.lock);
1245
1246         INIT_WORK(&vm->destroy_work, vm_destroy_work_func);
1247
1248         INIT_LIST_HEAD(&vm->preempt.exec_queues);
1249         vm->preempt.min_run_period_ms = 10;     /* FIXME: Wire up to uAPI */
1250
1251         for_each_tile(tile, xe, id)
1252                 xe_range_fence_tree_init(&vm->rftree[id]);
1253
1254         INIT_LIST_HEAD(&vm->extobj.list);
1255
1256         if (!(flags & XE_VM_FLAG_MIGRATION))
1257                 xe_device_mem_access_get(xe);
1258
1259         vm_resv_obj = drm_gpuvm_resv_object_alloc(&xe->drm);
1260         if (!vm_resv_obj) {
1261                 err = -ENOMEM;
1262                 goto err_no_resv;
1263         }
1264
1265         drm_gpuvm_init(&vm->gpuvm, "Xe VM", 0, &xe->drm, vm_resv_obj,
1266                        0, vm->size, 0, 0, &gpuvm_ops);
1267
1268         drm_gem_object_put(vm_resv_obj);
1269
1270         err = dma_resv_lock_interruptible(xe_vm_resv(vm), NULL);
1271         if (err)
1272                 goto err_close;
1273
1274         if (IS_DGFX(xe) && xe->info.vram_flags & XE_VRAM_FLAGS_NEED64K)
1275                 vm->flags |= XE_VM_FLAG_64K;
1276
1277         for_each_tile(tile, xe, id) {
1278                 if (flags & XE_VM_FLAG_MIGRATION &&
1279                     tile->id != XE_VM_FLAG_TILE_ID(flags))
1280                         continue;
1281
1282                 vm->pt_root[id] = xe_pt_create(vm, tile, xe->info.vm_max_level);
1283                 if (IS_ERR(vm->pt_root[id])) {
1284                         err = PTR_ERR(vm->pt_root[id]);
1285                         vm->pt_root[id] = NULL;
1286                         goto err_unlock_close;
1287                 }
1288         }
1289
1290         if (flags & XE_VM_FLAG_SCRATCH_PAGE) {
1291                 for_each_tile(tile, xe, id) {
1292                         if (!vm->pt_root[id])
1293                                 continue;
1294
1295                         err = xe_pt_create_scratch(xe, tile, vm);
1296                         if (err)
1297                                 goto err_unlock_close;
1298                 }
1299                 vm->batch_invalidate_tlb = true;
1300         }
1301
1302         if (flags & XE_VM_FLAG_COMPUTE_MODE) {
1303                 INIT_WORK(&vm->preempt.rebind_work, preempt_rebind_work_func);
1304                 vm->flags |= XE_VM_FLAG_COMPUTE_MODE;
1305                 vm->batch_invalidate_tlb = false;
1306         }
1307
1308         if (flags & XE_VM_FLAG_ASYNC_BIND_OPS) {
1309                 vm->async_ops.fence.context = dma_fence_context_alloc(1);
1310                 vm->flags |= XE_VM_FLAG_ASYNC_BIND_OPS;
1311         }
1312
1313         /* Fill pt_root after allocating scratch tables */
1314         for_each_tile(tile, xe, id) {
1315                 if (!vm->pt_root[id])
1316                         continue;
1317
1318                 xe_pt_populate_empty(tile, vm, vm->pt_root[id]);
1319         }
1320         dma_resv_unlock(xe_vm_resv(vm));
1321
1322         /* Kernel migration VM shouldn't have a circular loop. */
1323         if (!(flags & XE_VM_FLAG_MIGRATION)) {
1324                 for_each_tile(tile, xe, id) {
1325                         struct xe_gt *gt = tile->primary_gt;
1326                         struct xe_vm *migrate_vm;
1327                         struct xe_exec_queue *q;
1328
1329                         if (!vm->pt_root[id])
1330                                 continue;
1331
1332                         migrate_vm = xe_migrate_get_vm(tile->migrate);
1333                         q = xe_exec_queue_create_class(xe, gt, migrate_vm,
1334                                                        XE_ENGINE_CLASS_COPY,
1335                                                        EXEC_QUEUE_FLAG_VM);
1336                         xe_vm_put(migrate_vm);
1337                         if (IS_ERR(q)) {
1338                                 err = PTR_ERR(q);
1339                                 goto err_close;
1340                         }
1341                         vm->q[id] = q;
1342                         number_tiles++;
1343                 }
1344         }
1345
1346         if (number_tiles > 1)
1347                 vm->composite_fence_ctx = dma_fence_context_alloc(1);
1348
1349         mutex_lock(&xe->usm.lock);
1350         if (flags & XE_VM_FLAG_FAULT_MODE)
1351                 xe->usm.num_vm_in_fault_mode++;
1352         else if (!(flags & XE_VM_FLAG_MIGRATION))
1353                 xe->usm.num_vm_in_non_fault_mode++;
1354         mutex_unlock(&xe->usm.lock);
1355
1356         trace_xe_vm_create(vm);
1357
1358         return vm;
1359
1360 err_unlock_close:
1361         dma_resv_unlock(xe_vm_resv(vm));
1362 err_close:
1363         xe_vm_close_and_put(vm);
1364         return ERR_PTR(err);
1365
1366 err_no_resv:
1367         for_each_tile(tile, xe, id)
1368                 xe_range_fence_tree_fini(&vm->rftree[id]);
1369         kfree(vm);
1370         if (!(flags & XE_VM_FLAG_MIGRATION))
1371                 xe_device_mem_access_put(xe);
1372         return ERR_PTR(err);
1373 }
1374
1375 static void flush_async_ops(struct xe_vm *vm)
1376 {
1377         queue_work(system_unbound_wq, &vm->async_ops.work);
1378         flush_work(&vm->async_ops.work);
1379 }
1380
1381 static void vm_error_capture(struct xe_vm *vm, int err,
1382                              u32 op, u64 addr, u64 size)
1383 {
1384         struct drm_xe_vm_bind_op_error_capture capture;
1385         u64 __user *address =
1386                 u64_to_user_ptr(vm->async_ops.error_capture.addr);
1387         bool in_kthread = !current->mm;
1388
1389         capture.error = err;
1390         capture.op = op;
1391         capture.addr = addr;
1392         capture.size = size;
1393
1394         if (in_kthread) {
1395                 if (!mmget_not_zero(vm->async_ops.error_capture.mm))
1396                         goto mm_closed;
1397                 kthread_use_mm(vm->async_ops.error_capture.mm);
1398         }
1399
1400         if (copy_to_user(address, &capture, sizeof(capture)))
1401                 XE_WARN_ON("Copy to user failed");
1402
1403         if (in_kthread) {
1404                 kthread_unuse_mm(vm->async_ops.error_capture.mm);
1405                 mmput(vm->async_ops.error_capture.mm);
1406         }
1407
1408 mm_closed:
1409         wake_up_all(&vm->async_ops.error_capture.wq);
1410 }
1411
1412 static void xe_vm_close(struct xe_vm *vm)
1413 {
1414         down_write(&vm->lock);
1415         vm->size = 0;
1416         up_write(&vm->lock);
1417 }
1418
1419 void xe_vm_close_and_put(struct xe_vm *vm)
1420 {
1421         LIST_HEAD(contested);
1422         struct xe_device *xe = vm->xe;
1423         struct xe_tile *tile;
1424         struct xe_vma *vma, *next_vma;
1425         struct drm_gpuva *gpuva, *next;
1426         u8 id;
1427
1428         XE_WARN_ON(vm->preempt.num_exec_queues);
1429
1430         xe_vm_close(vm);
1431         flush_async_ops(vm);
1432         if (xe_vm_in_compute_mode(vm))
1433                 flush_work(&vm->preempt.rebind_work);
1434
1435         for_each_tile(tile, xe, id) {
1436                 if (vm->q[id]) {
1437                         xe_exec_queue_kill(vm->q[id]);
1438                         xe_exec_queue_put(vm->q[id]);
1439                         vm->q[id] = NULL;
1440                 }
1441         }
1442
1443         down_write(&vm->lock);
1444         xe_vm_lock(vm, false);
1445         drm_gpuvm_for_each_va_safe(gpuva, next, &vm->gpuvm) {
1446                 vma = gpuva_to_vma(gpuva);
1447
1448                 if (xe_vma_has_no_bo(vma)) {
1449                         down_read(&vm->userptr.notifier_lock);
1450                         vma->gpuva.flags |= XE_VMA_DESTROYED;
1451                         up_read(&vm->userptr.notifier_lock);
1452                 }
1453
1454                 xe_vm_remove_vma(vm, vma);
1455
1456                 /* easy case, remove from VMA? */
1457                 if (xe_vma_has_no_bo(vma) || xe_vma_bo(vma)->vm) {
1458                         list_del_init(&vma->combined_links.rebind);
1459                         xe_vma_destroy(vma, NULL);
1460                         continue;
1461                 }
1462
1463                 list_move_tail(&vma->combined_links.destroy, &contested);
1464                 vma->gpuva.flags |= XE_VMA_DESTROYED;
1465         }
1466
1467         /*
1468          * All vm operations will add shared fences to resv.
1469          * The only exception is eviction for a shared object,
1470          * but even so, the unbind when evicted would still
1471          * install a fence to resv. Hence it's safe to
1472          * destroy the pagetables immediately.
1473          */
1474         for_each_tile(tile, xe, id) {
1475                 if (vm->scratch_bo[id]) {
1476                         u32 i;
1477
1478                         xe_bo_unpin(vm->scratch_bo[id]);
1479                         xe_bo_put(vm->scratch_bo[id]);
1480                         for (i = 0; i < vm->pt_root[id]->level; i++)
1481                                 xe_pt_destroy(vm->scratch_pt[id][i], vm->flags,
1482                                               NULL);
1483                 }
1484                 if (vm->pt_root[id]) {
1485                         xe_pt_destroy(vm->pt_root[id], vm->flags, NULL);
1486                         vm->pt_root[id] = NULL;
1487                 }
1488         }
1489         xe_vm_unlock(vm);
1490
1491         /*
1492          * The VM is now dead, so no new mappings can be added to it.
1493          * Since we hold a refcount to the bo, we can remove and free
1494          * the members safely without locking.
1495          */
1496         list_for_each_entry_safe(vma, next_vma, &contested,
1497                                  combined_links.destroy) {
1498                 list_del_init(&vma->combined_links.destroy);
1499                 xe_vma_destroy_unlocked(vma);
1500         }
1501
1502         if (vm->async_ops.error_capture.addr)
1503                 wake_up_all(&vm->async_ops.error_capture.wq);
1504
1505         XE_WARN_ON(!list_empty(&vm->extobj.list));
1506         up_write(&vm->lock);
1507
1508         mutex_lock(&xe->usm.lock);
1509         if (vm->flags & XE_VM_FLAG_FAULT_MODE)
1510                 xe->usm.num_vm_in_fault_mode--;
1511         else if (!(vm->flags & XE_VM_FLAG_MIGRATION))
1512                 xe->usm.num_vm_in_non_fault_mode--;
1513         mutex_unlock(&xe->usm.lock);
1514
1515         for_each_tile(tile, xe, id)
1516                 xe_range_fence_tree_fini(&vm->rftree[id]);
1517
1518         xe_vm_put(vm);
1519 }
1520
1521 static void vm_destroy_work_func(struct work_struct *w)
1522 {
1523         struct xe_vm *vm =
1524                 container_of(w, struct xe_vm, destroy_work);
1525         struct xe_device *xe = vm->xe;
1526         struct xe_tile *tile;
1527         u8 id;
1528         void *lookup;
1529
1530         /* xe_vm_close_and_put was not called? */
1531         XE_WARN_ON(vm->size);
1532
1533         if (!(vm->flags & XE_VM_FLAG_MIGRATION)) {
1534                 xe_device_mem_access_put(xe);
1535
1536                 if (xe->info.has_asid) {
1537                         mutex_lock(&xe->usm.lock);
1538                         lookup = xa_erase(&xe->usm.asid_to_vm, vm->usm.asid);
1539                         XE_WARN_ON(lookup != vm);
1540                         mutex_unlock(&xe->usm.lock);
1541                 }
1542         }
1543
1544         for_each_tile(tile, xe, id)
1545                 XE_WARN_ON(vm->pt_root[id]);
1546
1547         trace_xe_vm_free(vm);
1548         dma_fence_put(vm->rebind_fence);
1549         kfree(vm);
1550 }
1551
1552 static void xe_vm_free(struct drm_gpuvm *gpuvm)
1553 {
1554         struct xe_vm *vm = container_of(gpuvm, struct xe_vm, gpuvm);
1555
1556         /* To destroy the VM we need to be able to sleep */
1557         queue_work(system_unbound_wq, &vm->destroy_work);
1558 }
1559
1560 struct xe_vm *xe_vm_lookup(struct xe_file *xef, u32 id)
1561 {
1562         struct xe_vm *vm;
1563
1564         mutex_lock(&xef->vm.lock);
1565         vm = xa_load(&xef->vm.xa, id);
1566         if (vm)
1567                 xe_vm_get(vm);
1568         mutex_unlock(&xef->vm.lock);
1569
1570         return vm;
1571 }
1572
1573 u64 xe_vm_pdp4_descriptor(struct xe_vm *vm, struct xe_tile *tile)
1574 {
1575         return xe_pde_encode(vm->pt_root[tile->id]->bo, 0,
1576                              XE_CACHE_WB);
1577 }
1578
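/*
 * Unbind @vma from every tile it is currently bound to. When the VMA is
 * present on more than one tile the per-tile unbind fences are combined into
 * a dma_fence_array so the caller only has to track a single fence. On the
 * last operation of an IOCTL the user syncs in @syncs are signalled with
 * that fence.
 */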
1579 static struct dma_fence *
1580 xe_vm_unbind_vma(struct xe_vma *vma, struct xe_exec_queue *q,
1581                  struct xe_sync_entry *syncs, u32 num_syncs,
1582                  bool first_op, bool last_op)
1583 {
1584         struct xe_tile *tile;
1585         struct dma_fence *fence = NULL;
1586         struct dma_fence **fences = NULL;
1587         struct dma_fence_array *cf = NULL;
1588         struct xe_vm *vm = xe_vma_vm(vma);
1589         int cur_fence = 0, i;
1590         int number_tiles = hweight8(vma->tile_present);
1591         int err;
1592         u8 id;
1593
1594         trace_xe_vma_unbind(vma);
1595
1596         if (number_tiles > 1) {
1597                 fences = kmalloc_array(number_tiles, sizeof(*fences),
1598                                        GFP_KERNEL);
1599                 if (!fences)
1600                         return ERR_PTR(-ENOMEM);
1601         }
1602
1603         for_each_tile(tile, vm->xe, id) {
1604                 if (!(vma->tile_present & BIT(id)))
1605                         goto next;
1606
1607                 fence = __xe_pt_unbind_vma(tile, vma, q, first_op ? syncs : NULL,
1608                                            first_op ? num_syncs : 0);
1609                 if (IS_ERR(fence)) {
1610                         err = PTR_ERR(fence);
1611                         goto err_fences;
1612                 }
1613
1614                 if (fences)
1615                         fences[cur_fence++] = fence;
1616
1617 next:
1618                 if (q && vm->pt_root[id] && !list_empty(&q->multi_gt_list))
1619                         q = list_next_entry(q, multi_gt_list);
1620         }
1621
1622         if (fences) {
1623                 cf = dma_fence_array_create(number_tiles, fences,
1624                                             vm->composite_fence_ctx,
1625                                             vm->composite_fence_seqno++,
1626                                             false);
1627                 if (!cf) {
1628                         --vm->composite_fence_seqno;
1629                         err = -ENOMEM;
1630                         goto err_fences;
1631                 }
1632         }
1633
1634         if (last_op) {
1635                 for (i = 0; i < num_syncs; i++)
1636                         xe_sync_entry_signal(&syncs[i], NULL,
1637                                              cf ? &cf->base : fence);
1638         }
1639
1640         return cf ? &cf->base : !fence ? dma_fence_get_stub() : fence;
1641
1642 err_fences:
1643         if (fences) {
1644                 while (cur_fence) {
1645                         /* FIXME: Rewind the previous binds? */
1646                         dma_fence_put(fences[--cur_fence]);
1647                 }
1648                 kfree(fences);
1649         }
1650
1651         return ERR_PTR(err);
1652 }
1653
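/*
 * (Re)bind @vma on every tile in its tile_mask, telling the page-table code
 * whether the tile already has the VMA present (rebind). As with unbind,
 * multiple per-tile fences are wrapped in a dma_fence_array and the user
 * syncs are signalled on the last operation.
 */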
1654 static struct dma_fence *
1655 xe_vm_bind_vma(struct xe_vma *vma, struct xe_exec_queue *q,
1656                struct xe_sync_entry *syncs, u32 num_syncs,
1657                bool first_op, bool last_op)
1658 {
1659         struct xe_tile *tile;
1660         struct dma_fence *fence;
1661         struct dma_fence **fences = NULL;
1662         struct dma_fence_array *cf = NULL;
1663         struct xe_vm *vm = xe_vma_vm(vma);
1664         int cur_fence = 0, i;
1665         int number_tiles = hweight8(vma->tile_mask);
1666         int err;
1667         u8 id;
1668
1669         trace_xe_vma_bind(vma);
1670
1671         if (number_tiles > 1) {
1672                 fences = kmalloc_array(number_tiles, sizeof(*fences),
1673                                        GFP_KERNEL);
1674                 if (!fences)
1675                         return ERR_PTR(-ENOMEM);
1676         }
1677
1678         for_each_tile(tile, vm->xe, id) {
1679                 if (!(vma->tile_mask & BIT(id)))
1680                         goto next;
1681
1682                 fence = __xe_pt_bind_vma(tile, vma, q ? q : vm->q[id],
1683                                          first_op ? syncs : NULL,
1684                                          first_op ? num_syncs : 0,
1685                                          vma->tile_present & BIT(id));
1686                 if (IS_ERR(fence)) {
1687                         err = PTR_ERR(fence);
1688                         goto err_fences;
1689                 }
1690
1691                 if (fences)
1692                         fences[cur_fence++] = fence;
1693
1694 next:
1695                 if (q && vm->pt_root[id] && !list_empty(&q->multi_gt_list))
1696                         q = list_next_entry(q, multi_gt_list);
1697         }
1698
1699         if (fences) {
1700                 cf = dma_fence_array_create(number_tiles, fences,
1701                                             vm->composite_fence_ctx,
1702                                             vm->composite_fence_seqno++,
1703                                             false);
1704                 if (!cf) {
1705                         --vm->composite_fence_seqno;
1706                         err = -ENOMEM;
1707                         goto err_fences;
1708                 }
1709         }
1710
1711         if (last_op) {
1712                 for (i = 0; i < num_syncs; i++)
1713                         xe_sync_entry_signal(&syncs[i], NULL,
1714                                              cf ? &cf->base : fence);
1715         }
1716
1717         return cf ? &cf->base : fence;
1718
1719 err_fences:
1720         if (fences) {
1721                 while (cur_fence) {
1722                         /* FIXME: Rewind the previous binds? */
1723                         dma_fence_put(fences[--cur_fence]);
1724                 }
1725                 kfree(fences);
1726         }
1727
1728         return ERR_PTR(err);
1729 }
1730
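/*
 * Fence handed out to user syncs for asynchronous VM bind operations. It is
 * signalled from async_op_fence_cb() once the underlying bind/unbind fence
 * (@wait_fence) signals, inheriting its error code. @started and @wq let
 * waiters block until the operation has actually been picked up, for VMs
 * that use dma-fences.
 */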
1731 struct async_op_fence {
1732         struct dma_fence fence;
1733         struct dma_fence *wait_fence;
1734         struct dma_fence_cb cb;
1735         struct xe_vm *vm;
1736         wait_queue_head_t wq;
1737         bool started;
1738 };
1739
1740 static const char *async_op_fence_get_driver_name(struct dma_fence *dma_fence)
1741 {
1742         return "xe";
1743 }
1744
1745 static const char *
1746 async_op_fence_get_timeline_name(struct dma_fence *dma_fence)
1747 {
1748         return "async_op_fence";
1749 }
1750
1751 static const struct dma_fence_ops async_op_fence_ops = {
1752         .get_driver_name = async_op_fence_get_driver_name,
1753         .get_timeline_name = async_op_fence_get_timeline_name,
1754 };
1755
1756 static void async_op_fence_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
1757 {
1758         struct async_op_fence *afence =
1759                 container_of(cb, struct async_op_fence, cb);
1760
1761         afence->fence.error = afence->wait_fence->error;
1762         dma_fence_signal(&afence->fence);
1763         xe_vm_put(afence->vm);
1764         dma_fence_put(afence->wait_fence);
1765         dma_fence_put(&afence->fence);
1766 }
1767
1768 static void add_async_op_fence_cb(struct xe_vm *vm,
1769                                   struct dma_fence *fence,
1770                                   struct async_op_fence *afence)
1771 {
1772         int ret;
1773
1774         if (!xe_vm_no_dma_fences(vm)) {
1775                 afence->started = true;
1776                 smp_wmb();
1777                 wake_up_all(&afence->wq);
1778         }
1779
1780         afence->wait_fence = dma_fence_get(fence);
1781         afence->vm = xe_vm_get(vm);
1782         dma_fence_get(&afence->fence);
1783         ret = dma_fence_add_callback(fence, &afence->cb, async_op_fence_cb);
1784         if (ret == -ENOENT) {
1785                 afence->fence.error = afence->wait_fence->error;
1786                 dma_fence_signal(&afence->fence);
1787         }
1788         if (ret) {
1789                 xe_vm_put(vm);
1790                 dma_fence_put(afence->wait_fence);
1791                 dma_fence_put(&afence->fence);
1792         }
1793         XE_WARN_ON(ret && ret != -ENOENT);
1794 }
1795
1796 int xe_vm_async_fence_wait_start(struct dma_fence *fence)
1797 {
1798         if (fence->ops == &async_op_fence_ops) {
1799                 struct async_op_fence *afence =
1800                         container_of(fence, struct async_op_fence, fence);
1801
1802                 XE_WARN_ON(xe_vm_no_dma_fences(afence->vm));
1803
1804                 smp_rmb();
1805                 return wait_event_interruptible(afence->wq, afence->started);
1806         }
1807
1808         return 0;
1809 }
1810
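/*
 * Issue the actual (re)bind, or, for a non-immediate bind on a fault-mode
 * VM, skip the GPU work entirely and signal the syncs right away since the
 * pages will be bound on demand from the page-fault handler.
 */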
1811 static int __xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma,
1812                         struct xe_exec_queue *q, struct xe_sync_entry *syncs,
1813                         u32 num_syncs, struct async_op_fence *afence,
1814                         bool immediate, bool first_op, bool last_op)
1815 {
1816         struct dma_fence *fence;
1817
1818         xe_vm_assert_held(vm);
1819
1820         if (immediate) {
1821                 fence = xe_vm_bind_vma(vma, q, syncs, num_syncs, first_op,
1822                                        last_op);
1823                 if (IS_ERR(fence))
1824                         return PTR_ERR(fence);
1825         } else {
1826                 int i;
1827
1828                 XE_WARN_ON(!xe_vm_in_fault_mode(vm));
1829
1830                 fence = dma_fence_get_stub();
1831                 if (last_op) {
1832                         for (i = 0; i < num_syncs; i++)
1833                                 xe_sync_entry_signal(&syncs[i], NULL, fence);
1834                 }
1835         }
1836         if (afence)
1837                 add_async_op_fence_cb(vm, fence, afence);
1838
1839         dma_fence_put(fence);
1840         return 0;
1841 }
1842
1843 static int xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma, struct xe_exec_queue *q,
1844                       struct xe_bo *bo, struct xe_sync_entry *syncs,
1845                       u32 num_syncs, struct async_op_fence *afence,
1846                       bool immediate, bool first_op, bool last_op)
1847 {
1848         int err;
1849
1850         xe_vm_assert_held(vm);
1851         xe_bo_assert_held(bo);
1852
1853         if (bo && immediate) {
1854                 err = xe_bo_validate(bo, vm, true);
1855                 if (err)
1856                         return err;
1857         }
1858
1859         return __xe_vm_bind(vm, vma, q, syncs, num_syncs, afence, immediate,
1860                             first_op, last_op);
1861 }
1862
1863 static int xe_vm_unbind(struct xe_vm *vm, struct xe_vma *vma,
1864                         struct xe_exec_queue *q, struct xe_sync_entry *syncs,
1865                         u32 num_syncs, struct async_op_fence *afence,
1866                         bool first_op, bool last_op)
1867 {
1868         struct dma_fence *fence;
1869
1870         xe_vm_assert_held(vm);
1871         xe_bo_assert_held(xe_vma_bo(vma));
1872
1873         fence = xe_vm_unbind_vma(vma, q, syncs, num_syncs, first_op, last_op);
1874         if (IS_ERR(fence))
1875                 return PTR_ERR(fence);
1876         if (afence)
1877                 add_async_op_fence_cb(vm, fence, afence);
1878
1879         xe_vma_destroy(vma, fence);
1880         dma_fence_put(fence);
1881
1882         return 0;
1883 }
1884
1885 static int vm_set_error_capture_address(struct xe_device *xe, struct xe_vm *vm,
1886                                         u64 value)
1887 {
1888         if (XE_IOCTL_DBG(xe, !value))
1889                 return -EINVAL;
1890
1891         if (XE_IOCTL_DBG(xe, !(vm->flags & XE_VM_FLAG_ASYNC_BIND_OPS)))
1892                 return -EOPNOTSUPP;
1893
1894         if (XE_IOCTL_DBG(xe, vm->async_ops.error_capture.addr))
1895                 return -EOPNOTSUPP;
1896
1897         vm->async_ops.error_capture.mm = current->mm;
1898         vm->async_ops.error_capture.addr = value;
1899         init_waitqueue_head(&vm->async_ops.error_capture.wq);
1900
1901         return 0;
1902 }
1903
1904 typedef int (*xe_vm_set_property_fn)(struct xe_device *xe, struct xe_vm *vm,
1905                                      u64 value);
1906
1907 static const xe_vm_set_property_fn vm_set_property_funcs[] = {
1908         [XE_VM_PROPERTY_BIND_OP_ERROR_CAPTURE_ADDRESS] =
1909                 vm_set_error_capture_address,
1910 };
1911
1912 static int vm_user_ext_set_property(struct xe_device *xe, struct xe_vm *vm,
1913                                     u64 extension)
1914 {
1915         u64 __user *address = u64_to_user_ptr(extension);
1916         struct drm_xe_ext_vm_set_property ext;
1917         int err;
1918
1919         err = __copy_from_user(&ext, address, sizeof(ext));
1920         if (XE_IOCTL_DBG(xe, err))
1921                 return -EFAULT;
1922
1923         if (XE_IOCTL_DBG(xe, ext.property >=
1924                          ARRAY_SIZE(vm_set_property_funcs)) ||
1925             XE_IOCTL_DBG(xe, ext.pad) ||
1926             XE_IOCTL_DBG(xe, ext.reserved[0] || ext.reserved[1]))
1927                 return -EINVAL;
1928
1929         return vm_set_property_funcs[ext.property](xe, vm, ext.value);
1930 }
1931
1932 typedef int (*xe_vm_user_extension_fn)(struct xe_device *xe, struct xe_vm *vm,
1933                                        u64 extension);
1934
1935 static const xe_vm_user_extension_fn vm_user_extension_funcs[] = {
1936         [XE_VM_EXTENSION_SET_PROPERTY] = vm_user_ext_set_property,
1937 };
1938
1939 #define MAX_USER_EXTENSIONS     16
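/*
 * Illustrative only, not part of the driver: userspace chains extensions via
 * xe_user_extension.next_extension and passes the head pointer in
 * drm_xe_vm_create.extensions. Roughly (assuming drm_xe_ext_vm_set_property
 * embeds its xe_user_extension as "base", per xe_drm.h):
 *
 *	struct drm_xe_ext_vm_set_property ext = {
 *		.base.name = XE_VM_EXTENSION_SET_PROPERTY,
 *		.base.next_extension = 0,
 *		.property = XE_VM_PROPERTY_BIND_OP_ERROR_CAPTURE_ADDRESS,
 *		.value = (uintptr_t)capture_addr,
 *	};
 *	create.extensions = (uintptr_t)&ext;
 *
 * where capture_addr is a hypothetical user buffer and a zero next_extension
 * terminates the chain. vm_user_extensions() below walks that chain
 * recursively, capping it at MAX_USER_EXTENSIONS entries.
 */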
1940 static int vm_user_extensions(struct xe_device *xe, struct xe_vm *vm,
1941                               u64 extensions, int ext_number)
1942 {
1943         u64 __user *address = u64_to_user_ptr(extensions);
1944         struct xe_user_extension ext;
1945         int err;
1946
1947         if (XE_IOCTL_DBG(xe, ext_number >= MAX_USER_EXTENSIONS))
1948                 return -E2BIG;
1949
1950         err = __copy_from_user(&ext, address, sizeof(ext));
1951         if (XE_IOCTL_DBG(xe, err))
1952                 return -EFAULT;
1953
1954         if (XE_IOCTL_DBG(xe, ext.pad) ||
1955             XE_IOCTL_DBG(xe, ext.name >=
1956                          ARRAY_SIZE(vm_user_extension_funcs)))
1957                 return -EINVAL;
1958
1959         err = vm_user_extension_funcs[ext.name](xe, vm, extensions);
1960         if (XE_IOCTL_DBG(xe, err))
1961                 return err;
1962
1963         if (ext.next_extension)
1964                 return vm_user_extensions(xe, vm, ext.next_extension,
1965                                           ++ext_number);
1966
1967         return 0;
1968 }
1969
1970 #define ALL_DRM_XE_VM_CREATE_FLAGS (DRM_XE_VM_CREATE_SCRATCH_PAGE | \
1971                                     DRM_XE_VM_CREATE_COMPUTE_MODE | \
1972                                     DRM_XE_VM_CREATE_ASYNC_BIND_OPS | \
1973                                     DRM_XE_VM_CREATE_FAULT_MODE)
1974
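/*
 * Illustrative only, not part of the driver: a minimal userspace VM create,
 * assuming the DRM_IOCTL_XE_VM_CREATE wrapper from xe_drm.h:
 *
 *	struct drm_xe_vm_create create = {
 *		.flags = DRM_XE_VM_CREATE_ASYNC_BIND_OPS,
 *	};
 *
 *	if (!ioctl(fd, DRM_IOCTL_XE_VM_CREATE, &create))
 *		vm_id = create.vm_id;
 *
 * Mutually exclusive combinations (scratch page or compute mode together
 * with fault mode) are rejected with -EINVAL below.
 */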
1975 int xe_vm_create_ioctl(struct drm_device *dev, void *data,
1976                        struct drm_file *file)
1977 {
1978         struct xe_device *xe = to_xe_device(dev);
1979         struct xe_file *xef = to_xe_file(file);
1980         struct drm_xe_vm_create *args = data;
1981         struct xe_vm *vm;
1982         u32 id, asid;
1983         int err;
1984         u32 flags = 0;
1985
1986         if (XE_WA(xe_root_mmio_gt(xe), 14016763929))
1987                 args->flags |= DRM_XE_VM_CREATE_SCRATCH_PAGE;
1988
1989         if (XE_IOCTL_DBG(xe, args->flags & DRM_XE_VM_CREATE_FAULT_MODE &&
1990                          !xe->info.supports_usm))
1991                 return -EINVAL;
1992
1993         if (XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
1994                 return -EINVAL;
1995
1996         if (XE_IOCTL_DBG(xe, args->flags & ~ALL_DRM_XE_VM_CREATE_FLAGS))
1997                 return -EINVAL;
1998
1999         if (XE_IOCTL_DBG(xe, args->flags & DRM_XE_VM_CREATE_SCRATCH_PAGE &&
2000                          args->flags & DRM_XE_VM_CREATE_FAULT_MODE))
2001                 return -EINVAL;
2002
2003         if (XE_IOCTL_DBG(xe, args->flags & DRM_XE_VM_CREATE_COMPUTE_MODE &&
2004                          args->flags & DRM_XE_VM_CREATE_FAULT_MODE))
2005                 return -EINVAL;
2006
2007         if (XE_IOCTL_DBG(xe, args->flags & DRM_XE_VM_CREATE_FAULT_MODE &&
2008                          xe_device_in_non_fault_mode(xe)))
2009                 return -EINVAL;
2010
2011         if (XE_IOCTL_DBG(xe, !(args->flags & DRM_XE_VM_CREATE_FAULT_MODE) &&
2012                          xe_device_in_fault_mode(xe)))
2013                 return -EINVAL;
2014
2015         if (args->flags & DRM_XE_VM_CREATE_SCRATCH_PAGE)
2016                 flags |= XE_VM_FLAG_SCRATCH_PAGE;
2017         if (args->flags & DRM_XE_VM_CREATE_COMPUTE_MODE)
2018                 flags |= XE_VM_FLAG_COMPUTE_MODE;
2019         if (args->flags & DRM_XE_VM_CREATE_ASYNC_BIND_OPS)
2020                 flags |= XE_VM_FLAG_ASYNC_BIND_OPS;
2021         if (args->flags & DRM_XE_VM_CREATE_FAULT_MODE)
2022                 flags |= XE_VM_FLAG_FAULT_MODE;
2023
2024         vm = xe_vm_create(xe, flags);
2025         if (IS_ERR(vm))
2026                 return PTR_ERR(vm);
2027
2028         if (args->extensions) {
2029                 err = vm_user_extensions(xe, vm, args->extensions, 0);
2030                 if (XE_IOCTL_DBG(xe, err)) {
2031                         xe_vm_close_and_put(vm);
2032                         return err;
2033                 }
2034         }
2035
2036         mutex_lock(&xef->vm.lock);
2037         err = xa_alloc(&xef->vm.xa, &id, vm, xa_limit_32b, GFP_KERNEL);
2038         mutex_unlock(&xef->vm.lock);
2039         if (err) {
2040                 xe_vm_close_and_put(vm);
2041                 return err;
2042         }
2043
2044         if (xe->info.has_asid) {
2045                 mutex_lock(&xe->usm.lock);
2046                 err = xa_alloc_cyclic(&xe->usm.asid_to_vm, &asid, vm,
2047                                       XA_LIMIT(0, XE_MAX_ASID - 1),
2048                                       &xe->usm.next_asid, GFP_KERNEL);
2049                 mutex_unlock(&xe->usm.lock);
2050                 if (err) {
2051                         xe_vm_close_and_put(vm);
2052                         return err;
2053                 }
2054                 vm->usm.asid = asid;
2055         }
2056
2057         args->vm_id = id;
2058
2059 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG_MEM)
2060         /* Warning: Security issue - never enable by default */
2061         args->reserved[0] = xe_bo_main_addr(vm->pt_root[0]->bo, XE_PAGE_SIZE);
2062 #endif
2063
2064         return 0;
2065 }
2066
2067 int xe_vm_destroy_ioctl(struct drm_device *dev, void *data,
2068                         struct drm_file *file)
2069 {
2070         struct xe_device *xe = to_xe_device(dev);
2071         struct xe_file *xef = to_xe_file(file);
2072         struct drm_xe_vm_destroy *args = data;
2073         struct xe_vm *vm;
2074         int err = 0;
2075
2076         if (XE_IOCTL_DBG(xe, args->pad) ||
2077             XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
2078                 return -EINVAL;
2079
2080         mutex_lock(&xef->vm.lock);
2081         vm = xa_load(&xef->vm.xa, args->vm_id);
2082         if (XE_IOCTL_DBG(xe, !vm))
2083                 err = -ENOENT;
2084         else if (XE_IOCTL_DBG(xe, vm->preempt.num_exec_queues))
2085                 err = -EBUSY;
2086         else
2087                 xa_erase(&xef->vm.xa, args->vm_id);
2088         mutex_unlock(&xef->vm.lock);
2089
2090         if (!err)
2091                 xe_vm_close_and_put(vm);
2092
2093         return err;
2094 }
2095
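/*
 * Memory region index used by XE_VM_BIND_OP_PREFETCH, mapped to TTM
 * placements: 0 = system memory (TT), 1 = VRAM0, 2 = VRAM1.
 */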
2096 static const u32 region_to_mem_type[] = {
2097         XE_PL_TT,
2098         XE_PL_VRAM0,
2099         XE_PL_VRAM1,
2100 };
2101
2102 static int xe_vm_prefetch(struct xe_vm *vm, struct xe_vma *vma,
2103                           struct xe_exec_queue *q, u32 region,
2104                           struct xe_sync_entry *syncs, u32 num_syncs,
2105                           struct async_op_fence *afence, bool first_op,
2106                           bool last_op)
2107 {
2108         int err;
2109
2110         XE_WARN_ON(region >= ARRAY_SIZE(region_to_mem_type));
2111
2112         if (!xe_vma_has_no_bo(vma)) {
2113                 err = xe_bo_migrate(xe_vma_bo(vma), region_to_mem_type[region]);
2114                 if (err)
2115                         return err;
2116         }
2117
2118         if (vma->tile_mask != (vma->tile_present & ~vma->usm.tile_invalidated)) {
2119                 return xe_vm_bind(vm, vma, q, xe_vma_bo(vma), syncs, num_syncs,
2120                                   afence, true, first_op, last_op);
2121         } else {
2122                 int i;
2123
2124                 /* Nothing to do, signal fences now */
2125                 if (last_op) {
2126                         for (i = 0; i < num_syncs; i++)
2127                                 xe_sync_entry_signal(&syncs[i], NULL,
2128                                                      dma_fence_get_stub());
2129                 }
2130                 if (afence)
2131                         dma_fence_signal(&afence->fence);
2132                 return 0;
2133         }
2134 }
2135
2136 #define VM_BIND_OP(op)  ((op) & 0xffff)
2137
2138 struct ttm_buffer_object *xe_vm_ttm_bo(struct xe_vm *vm)
2139 {
2140         int idx = vm->flags & XE_VM_FLAG_MIGRATION ?
2141                 XE_VM_FLAG_TILE_ID(vm->flags) : 0;
2142
2143         /* Safe to use index 0 as all BOs in the VM share a single dma-resv lock */
2144         return &vm->pt_root[idx]->bo->ttm;
2145 }
2146
2147 static void xe_vm_tv_populate(struct xe_vm *vm, struct ttm_validate_buffer *tv)
2148 {
2149         tv->num_shared = 1;
2150         tv->bo = xe_vm_ttm_bo(vm);
2151 }
2152
2153 static void vm_set_async_error(struct xe_vm *vm, int err)
2154 {
2155         lockdep_assert_held(&vm->lock);
2156         vm->async_ops.error = err;
2157 }
2158
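/*
 * Early sanity check of a bind operation's range against the current VMA
 * tree: a MAP may only overlap an existing mapping if the bind is async,
 * UNMAP/PREFETCH must hit an existing mapping (and match it exactly for sync
 * binds), and UNMAP_ALL requires the BO to be mapped at least once.
 * -ENODATA means there is nothing to do.
 */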
2159 static int vm_bind_ioctl_lookup_vma(struct xe_vm *vm, struct xe_bo *bo,
2160                                     u64 addr, u64 range, u32 op)
2161 {
2162         struct xe_device *xe = vm->xe;
2163         struct xe_vma *vma;
2164         bool async = !!(op & XE_VM_BIND_FLAG_ASYNC);
2165
2166         lockdep_assert_held(&vm->lock);
2167
2168         switch (VM_BIND_OP(op)) {
2169         case XE_VM_BIND_OP_MAP:
2170         case XE_VM_BIND_OP_MAP_USERPTR:
2171                 vma = xe_vm_find_overlapping_vma(vm, addr, range);
2172                 if (XE_IOCTL_DBG(xe, vma && !async))
2173                         return -EBUSY;
2174                 break;
2175         case XE_VM_BIND_OP_UNMAP:
2176         case XE_VM_BIND_OP_PREFETCH:
2177                 vma = xe_vm_find_overlapping_vma(vm, addr, range);
2178                 if (XE_IOCTL_DBG(xe, !vma))
2179                         /* Not an actual error, the IOCTL cleans up and returns 0 */
2180                         return -ENODATA;
2181                 if (XE_IOCTL_DBG(xe, (xe_vma_start(vma) != addr ||
2182                                       xe_vma_end(vma) != addr + range) && !async))
2183                         return -EINVAL;
2184                 break;
2185         case XE_VM_BIND_OP_UNMAP_ALL:
2186                 if (XE_IOCTL_DBG(xe, list_empty(&bo->ttm.base.gpuva.list)))
2187                         /* Not an actual error, the IOCTL cleans up and returns 0 */
2188                         return -ENODATA;
2189                 break;
2190         default:
2191                 XE_WARN_ON("NOT POSSIBLE");
2192                 return -EINVAL;
2193         }
2194
2195         return 0;
2196 }
2197
2198 static void prep_vma_destroy(struct xe_vm *vm, struct xe_vma *vma,
2199                              bool post_commit)
2200 {
2201         down_read(&vm->userptr.notifier_lock);
2202         vma->gpuva.flags |= XE_VMA_DESTROYED;
2203         up_read(&vm->userptr.notifier_lock);
2204         if (post_commit)
2205                 xe_vm_remove_vma(vm, vma);
2206 }
2207
2208 #undef ULL
2209 #define ULL     unsigned long long
2210
2211 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG_VM)
2212 static void print_op(struct xe_device *xe, struct drm_gpuva_op *op)
2213 {
2214         struct xe_vma *vma;
2215
2216         switch (op->op) {
2217         case DRM_GPUVA_OP_MAP:
2218                 vm_dbg(&xe->drm, "MAP: addr=0x%016llx, range=0x%016llx",
2219                        (ULL)op->map.va.addr, (ULL)op->map.va.range);
2220                 break;
2221         case DRM_GPUVA_OP_REMAP:
2222                 vma = gpuva_to_vma(op->remap.unmap->va);
2223                 vm_dbg(&xe->drm, "REMAP:UNMAP: addr=0x%016llx, range=0x%016llx, keep=%d",
2224                        (ULL)xe_vma_start(vma), (ULL)xe_vma_size(vma),
2225                        op->remap.unmap->keep ? 1 : 0);
2226                 if (op->remap.prev)
2227                         vm_dbg(&xe->drm,
2228                                "REMAP:PREV: addr=0x%016llx, range=0x%016llx",
2229                                (ULL)op->remap.prev->va.addr,
2230                                (ULL)op->remap.prev->va.range);
2231                 if (op->remap.next)
2232                         vm_dbg(&xe->drm,
2233                                "REMAP:NEXT: addr=0x%016llx, range=0x%016llx",
2234                                (ULL)op->remap.next->va.addr,
2235                                (ULL)op->remap.next->va.range);
2236                 break;
2237         case DRM_GPUVA_OP_UNMAP:
2238                 vma = gpuva_to_vma(op->unmap.va);
2239                 vm_dbg(&xe->drm, "UNMAP: addr=0x%016llx, range=0x%016llx, keep=%d",
2240                        (ULL)xe_vma_start(vma), (ULL)xe_vma_size(vma),
2241                        op->unmap.keep ? 1 : 0);
2242                 break;
2243         case DRM_GPUVA_OP_PREFETCH:
2244                 vma = gpuva_to_vma(op->prefetch.va);
2245                 vm_dbg(&xe->drm, "PREFETCH: addr=0x%016llx, range=0x%016llx",
2246                        (ULL)xe_vma_start(vma), (ULL)xe_vma_size(vma));
2247                 break;
2248         default:
2249                 XE_WARN_ON("NOT POSSIBLE");
2250         }
2251 }
2252 #else
2253 static void print_op(struct xe_device *xe, struct drm_gpuva_op *op)
2254 {
2255 }
2256 #endif
2257
2258 /*
2259  * Create the operations list from the IOCTL arguments and set up the operation
2260  * fields so the parse and commit steps are decoupled from them. This step can fail.
2261  */
2262 static struct drm_gpuva_ops *
2263 vm_bind_ioctl_ops_create(struct xe_vm *vm, struct xe_bo *bo,
2264                          u64 bo_offset_or_userptr, u64 addr, u64 range,
2265                          u32 operation, u8 tile_mask, u32 region)
2266 {
2267         struct drm_gem_object *obj = bo ? &bo->ttm.base : NULL;
2268         struct drm_gpuva_ops *ops;
2269         struct drm_gpuva_op *__op;
2270         struct xe_vma_op *op;
2271         struct drm_gpuvm_bo *vm_bo;
2272         int err;
2273
2274         lockdep_assert_held_write(&vm->lock);
2275
2276         vm_dbg(&vm->xe->drm,
2277                "op=%d, addr=0x%016llx, range=0x%016llx, bo_offset_or_userptr=0x%016llx",
2278                VM_BIND_OP(operation), (ULL)addr, (ULL)range,
2279                (ULL)bo_offset_or_userptr);
2280
2281         switch (VM_BIND_OP(operation)) {
2282         case XE_VM_BIND_OP_MAP:
2283         case XE_VM_BIND_OP_MAP_USERPTR:
2284                 ops = drm_gpuvm_sm_map_ops_create(&vm->gpuvm, addr, range,
2285                                                   obj, bo_offset_or_userptr);
2286                 if (IS_ERR(ops))
2287                         return ops;
2288
2289                 drm_gpuva_for_each_op(__op, ops) {
2290                         struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2291
2292                         op->tile_mask = tile_mask;
2293                         op->map.immediate =
2294                                 operation & XE_VM_BIND_FLAG_IMMEDIATE;
2295                         op->map.read_only =
2296                                 operation & XE_VM_BIND_FLAG_READONLY;
2297                         op->map.is_null = operation & XE_VM_BIND_FLAG_NULL;
2298                 }
2299                 break;
2300         case XE_VM_BIND_OP_UNMAP:
2301                 ops = drm_gpuvm_sm_unmap_ops_create(&vm->gpuvm, addr, range);
2302                 if (IS_ERR(ops))
2303                         return ops;
2304
2305                 drm_gpuva_for_each_op(__op, ops) {
2306                         struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2307
2308                         op->tile_mask = tile_mask;
2309                 }
2310                 break;
2311         case XE_VM_BIND_OP_PREFETCH:
2312                 ops = drm_gpuvm_prefetch_ops_create(&vm->gpuvm, addr, range);
2313                 if (IS_ERR(ops))
2314                         return ops;
2315
2316                 drm_gpuva_for_each_op(__op, ops) {
2317                         struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2318
2319                         op->tile_mask = tile_mask;
2320                         op->prefetch.region = region;
2321                 }
2322                 break;
2323         case XE_VM_BIND_OP_UNMAP_ALL:
2324                 XE_WARN_ON(!bo);
2325
2326                 err = xe_bo_lock(bo, true);
2327                 if (err)
2328                         return ERR_PTR(err);
2329
2330                 vm_bo = drm_gpuvm_bo_find(&vm->gpuvm, obj);
2331                 if (!vm_bo) {
2332                         /* Nothing mapped: don't use 'ops' uninitialized */
                              xe_bo_unlock(bo);
                              return ERR_PTR(-ENODATA);
                      }
2333
2334                 ops = drm_gpuvm_bo_unmap_ops_create(vm_bo);
2335                 drm_gpuvm_bo_put(vm_bo);
2336                 xe_bo_unlock(bo);
2337                 if (IS_ERR(ops))
2338                         return ops;
2339
2340                 drm_gpuva_for_each_op(__op, ops) {
2341                         struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2342
2343                         op->tile_mask = tile_mask;
2344                 }
2345                 break;
2346         default:
2347                 XE_WARN_ON("NOT POSSIBLE");
2348                 ops = ERR_PTR(-EINVAL);
2349         }
2350
2351 #ifdef TEST_VM_ASYNC_OPS_ERROR
2352         if (!IS_ERR(ops) && (operation & FORCE_ASYNC_OP_ERROR)) {
2353                 op = list_first_entry_or_null(&ops->list, struct xe_vma_op,
2354                                               base.entry);
2355                 if (op)
2356                         op->inject_error = true;
2357         }
2358 #endif
2359
2360         if (!IS_ERR(ops))
2361                 drm_gpuva_for_each_op(__op, ops)
2362                         print_op(vm->xe, __op);
2363
2364         return ops;
2365 }
2366
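/*
 * Create the VMA for a MAP operation (or for the prev/next remnant of a
 * REMAP): userptr VMAs get their pages pinned immediately, while VMAs backed
 * by an external BO are added to the VM's external-object list and pick up
 * the VM's preempt fences.
 */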
2367 static struct xe_vma *new_vma(struct xe_vm *vm, struct drm_gpuva_op_map *op,
2368                               u8 tile_mask, bool read_only, bool is_null)
2369 {
2370         struct xe_bo *bo = op->gem.obj ? gem_to_xe_bo(op->gem.obj) : NULL;
2371         struct xe_vma *vma;
2372         int err;
2373
2374         lockdep_assert_held_write(&vm->lock);
2375
2376         if (bo) {
2377                 err = xe_bo_lock(bo, true);
2378                 if (err)
2379                         return ERR_PTR(err);
2380         }
2381         vma = xe_vma_create(vm, bo, op->gem.offset,
2382                             op->va.addr, op->va.addr +
2383                             op->va.range - 1, read_only, is_null,
2384                             tile_mask);
2385         if (bo)
2386                 xe_bo_unlock(bo);
2387
2388         if (xe_vma_is_userptr(vma)) {
2389                 err = xe_vma_userptr_pin_pages(vma);
2390                 if (err) {
2391                         prep_vma_destroy(vm, vma, false);
2392                         xe_vma_destroy_unlocked(vma);
2393                         return ERR_PTR(err);
2394                 }
2395         } else if (!xe_vma_has_no_bo(vma) && !bo->vm) {
2396                 vm_insert_extobj(vm, vma);
2397                 err = add_preempt_fences(vm, bo);
2398                 if (err) {
2399                         prep_vma_destroy(vm, vma, false);
2400                         xe_vma_destroy_unlocked(vma);
2401                         return ERR_PTR(err);
2402                 }
2403         }
2404
2405         return vma;
2406 }
2407
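/*
 * Largest page size the VMA is currently mapped with. The REMAP parsing
 * below uses this to decide whether the leading/trailing remnant of a split
 * mapping can keep its existing page tables: e.g. splitting a 2M-mapped VMA
 * on a 2M-aligned boundary leaves the surviving half with nothing to rebind.
 */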
2408 static u64 xe_vma_max_pte_size(struct xe_vma *vma)
2409 {
2410         if (vma->gpuva.flags & XE_VMA_PTE_1G)
2411                 return SZ_1G;
2412         else if (vma->gpuva.flags & XE_VMA_PTE_2M)
2413                 return SZ_2M;
2414
2415         return SZ_4K;
2416 }
2417
2418 static u64 xe_vma_set_pte_size(struct xe_vma *vma, u64 size)
2419 {
2420         switch (size) {
2421         case SZ_1G:
2422                 vma->gpuva.flags |= XE_VMA_PTE_1G;
2423                 break;
2424         case SZ_2M:
2425                 vma->gpuva.flags |= XE_VMA_PTE_2M;
2426                 break;
2427         }
2428
2429         return SZ_4K;
2430 }
2431
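/*
 * Publish an operation's effect on the VMA tree: insert newly created VMAs
 * and mark unmapped ones as destroyed, so later operations in the same list
 * see a consistent view. The XE_VMA_OP_*COMMITTED flags record what has to
 * be rolled back by xe_vma_op_unwind() on failure.
 */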
2432 static int xe_vma_op_commit(struct xe_vm *vm, struct xe_vma_op *op)
2433 {
2434         int err = 0;
2435
2436         lockdep_assert_held_write(&vm->lock);
2437
2438         switch (op->base.op) {
2439         case DRM_GPUVA_OP_MAP:
2440                 err |= xe_vm_insert_vma(vm, op->map.vma);
2441                 if (!err)
2442                         op->flags |= XE_VMA_OP_COMMITTED;
2443                 break;
2444         case DRM_GPUVA_OP_REMAP:
2445                 prep_vma_destroy(vm, gpuva_to_vma(op->base.remap.unmap->va),
2446                                  true);
2447                 op->flags |= XE_VMA_OP_COMMITTED;
2448
2449                 if (op->remap.prev) {
2450                         err |= xe_vm_insert_vma(vm, op->remap.prev);
2451                         if (!err)
2452                                 op->flags |= XE_VMA_OP_PREV_COMMITTED;
2453                         if (!err && op->remap.skip_prev)
2454                                 op->remap.prev = NULL;
2455                 }
2456                 if (op->remap.next) {
2457                         err |= xe_vm_insert_vma(vm, op->remap.next);
2458                         if (!err)
2459                                 op->flags |= XE_VMA_OP_NEXT_COMMITTED;
2460                         if (!err && op->remap.skip_next)
2461                                 op->remap.next = NULL;
2462                 }
2463
2464                 /* Adjust for partial unbind after removing the VMA from the VM */
2465                 if (!err) {
2466                         op->base.remap.unmap->va->va.addr = op->remap.start;
2467                         op->base.remap.unmap->va->va.range = op->remap.range;
2468                 }
2469                 break;
2470         case DRM_GPUVA_OP_UNMAP:
2471                 prep_vma_destroy(vm, gpuva_to_vma(op->base.unmap.va), true);
2472                 op->flags |= XE_VMA_OP_COMMITTED;
2473                 break;
2474         case DRM_GPUVA_OP_PREFETCH:
2475                 op->flags |= XE_VMA_OP_COMMITTED;
2476                 break;
2477         default:
2478                 XE_WARN_ON("NOT POSSIBLE");
2479         }
2480
2481         return err;
2482 }
2483
2484
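/*
 * Turn a drm_gpuva_ops list into executable xe_vma_ops: create the new VMAs,
 * work out which REMAP remnants can be skipped, commit everything to the VMA
 * tree and, for the last list of an async IOCTL, allocate the async_op_fence
 * that will be installed into the user syncs.
 */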
2485 static int vm_bind_ioctl_ops_parse(struct xe_vm *vm, struct xe_exec_queue *q,
2486                                    struct drm_gpuva_ops *ops,
2487                                    struct xe_sync_entry *syncs, u32 num_syncs,
2488                                    struct list_head *ops_list, bool last,
2489                                    bool async)
2490 {
2491         struct xe_vma_op *last_op = NULL;
2492         struct async_op_fence *fence = NULL;
2493         struct drm_gpuva_op *__op;
2494         int err = 0;
2495
2496         lockdep_assert_held_write(&vm->lock);
2497
2498         if (last && num_syncs && async) {
2499                 u64 seqno;
2500
2501                 fence = kmalloc(sizeof(*fence), GFP_KERNEL);
2502                 if (!fence)
2503                         return -ENOMEM;
2504
2505                 seqno = q ? ++q->bind.fence_seqno : ++vm->async_ops.fence.seqno;
2506                 dma_fence_init(&fence->fence, &async_op_fence_ops,
2507                                &vm->async_ops.lock, q ? q->bind.fence_ctx :
2508                                vm->async_ops.fence.context, seqno);
2509
2510                 if (!xe_vm_no_dma_fences(vm)) {
2511                         fence->vm = vm;
2512                         fence->started = false;
2513                         init_waitqueue_head(&fence->wq);
2514                 }
2515         }
2516
2517         drm_gpuva_for_each_op(__op, ops) {
2518                 struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2519                 bool first = list_empty(ops_list);
2520
2521                 XE_WARN_ON(!first && !async);
2522
2523                 INIT_LIST_HEAD(&op->link);
2524                 list_add_tail(&op->link, ops_list);
2525
2526                 if (first) {
2527                         op->flags |= XE_VMA_OP_FIRST;
2528                         op->num_syncs = num_syncs;
2529                         op->syncs = syncs;
2530                 }
2531
2532                 op->q = q;
2533
2534                 switch (op->base.op) {
2535                 case DRM_GPUVA_OP_MAP:
2536                 {
2537                         struct xe_vma *vma;
2538
2539                         vma = new_vma(vm, &op->base.map,
2540                                       op->tile_mask, op->map.read_only,
2541                                       op->map.is_null);
2542                         if (IS_ERR(vma)) {
2543                                 err = PTR_ERR(vma);
2544                                 goto free_fence;
2545                         }
2546
2547                         op->map.vma = vma;
2548                         break;
2549                 }
2550                 case DRM_GPUVA_OP_REMAP:
2551                 {
2552                         struct xe_vma *old =
2553                                 gpuva_to_vma(op->base.remap.unmap->va);
2554
2555                         op->remap.start = xe_vma_start(old);
2556                         op->remap.range = xe_vma_size(old);
2557
2558                         if (op->base.remap.prev) {
2559                                 struct xe_vma *vma;
2560                                 bool read_only =
2561                                         op->base.remap.unmap->va->flags &
2562                                         XE_VMA_READ_ONLY;
2563                                 bool is_null =
2564                                         op->base.remap.unmap->va->flags &
2565                                         DRM_GPUVA_SPARSE;
2566
2567                                 vma = new_vma(vm, op->base.remap.prev,
2568                                               op->tile_mask, read_only,
2569                                               is_null);
2570                                 if (IS_ERR(vma)) {
2571                                         err = PTR_ERR(vma);
2572                                         goto free_fence;
2573                                 }
2574
2575                                 op->remap.prev = vma;
2576
2577                                 /*
2578                                  * Userptr creates a new SG mapping so
2579                                  * we must also rebind.
2580                                  */
2581                                 op->remap.skip_prev = !xe_vma_is_userptr(old) &&
2582                                         IS_ALIGNED(xe_vma_end(vma),
2583                                                    xe_vma_max_pte_size(old));
2584                                 if (op->remap.skip_prev) {
2585                                         xe_vma_set_pte_size(vma, xe_vma_max_pte_size(old));
2586                                         op->remap.range -=
2587                                                 xe_vma_end(vma) -
2588                                                 xe_vma_start(old);
2589                                         op->remap.start = xe_vma_end(vma);
2590                                 }
2591                         }
2592
2593                         if (op->base.remap.next) {
2594                                 struct xe_vma *vma;
2595                                 bool read_only =
2596                                         op->base.remap.unmap->va->flags &
2597                                         XE_VMA_READ_ONLY;
2598
2599                                 bool is_null =
2600                                         op->base.remap.unmap->va->flags &
2601                                         DRM_GPUVA_SPARSE;
2602
2603                                 vma = new_vma(vm, op->base.remap.next,
2604                                               op->tile_mask, read_only,
2605                                               is_null);
2606                                 if (IS_ERR(vma)) {
2607                                         err = PTR_ERR(vma);
2608                                         goto free_fence;
2609                                 }
2610
2611                                 op->remap.next = vma;
2612
2613                                 /*
2614                                  * Userptr creates a new SG mapping so
2615                                  * we must also rebind.
2616                                  */
2617                                 op->remap.skip_next = !xe_vma_is_userptr(old) &&
2618                                         IS_ALIGNED(xe_vma_start(vma),
2619                                                    xe_vma_max_pte_size(old));
2620                                 if (op->remap.skip_next) {
2621                                         xe_vma_set_pte_size(vma, xe_vma_max_pte_size(old));
2622                                         op->remap.range -=
2623                                                 xe_vma_end(old) -
2624                                                 xe_vma_start(vma);
2625                                 }
2626                         }
2627                         break;
2628                 }
2629                 case DRM_GPUVA_OP_UNMAP:
2630                 case DRM_GPUVA_OP_PREFETCH:
2631                         /* Nothing to do */
2632                         break;
2633                 default:
2634                         XE_WARN_ON("NOT POSSIBLE");
2635                 }
2636
2637                 last_op = op;
2638
2639                 err = xe_vma_op_commit(vm, op);
2640                 if (err)
2641                         goto free_fence;
2642         }
2643
2644         /* FIXME: Unhandled corner case */
2645         XE_WARN_ON(!last_op && last && !list_empty(ops_list));
2646
2647         if (!last_op)
2648                 goto free_fence;
2649         last_op->ops = ops;
2650         if (last) {
2651                 last_op->flags |= XE_VMA_OP_LAST;
2652                 last_op->num_syncs = num_syncs;
2653                 last_op->syncs = syncs;
2654                 last_op->fence = fence;
2655         }
2656
2657         return 0;
2658
2659 free_fence:
2660         kfree(fence);
2661         return err;
2662 }
2663
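/*
 * Execute a single operation with the VM's dma-resv (and that of an external
 * BO, if any) reserved. A userptr invalidation racing with the bind shows up
 * as -EAGAIN, in which case the pages are re-pinned and the operation
 * retried.
 */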
2664 static int __xe_vma_op_execute(struct xe_vm *vm, struct xe_vma *vma,
2665                                struct xe_vma_op *op)
2666 {
2667         LIST_HEAD(objs);
2668         LIST_HEAD(dups);
2669         struct ttm_validate_buffer tv_bo, tv_vm;
2670         struct ww_acquire_ctx ww;
2671         struct xe_bo *vbo;
2672         int err;
2673
2674         lockdep_assert_held_write(&vm->lock);
2675
2676         xe_vm_tv_populate(vm, &tv_vm);
2677         list_add_tail(&tv_vm.head, &objs);
2678         vbo = xe_vma_bo(vma);
2679         if (vbo) {
2680                 /*
2681                  * An unbind can drop the last reference to the BO and
2682                  * the BO is needed for ttm_eu_backoff_reservation so
2683                  * take a reference here.
2684                  */
2685                 xe_bo_get(vbo);
2686
2687                 if (!vbo->vm) {
2688                         tv_bo.bo = &vbo->ttm;
2689                         tv_bo.num_shared = 1;
2690                         list_add(&tv_bo.head, &objs);
2691                 }
2692         }
2693
2694 again:
2695         err = ttm_eu_reserve_buffers(&ww, &objs, true, &dups);
2696         if (err) {
2697                 xe_bo_put(vbo);
2698                 return err;
2699         }
2700
2701         xe_vm_assert_held(vm);
2702         xe_bo_assert_held(xe_vma_bo(vma));
2703
2704         switch (op->base.op) {
2705         case DRM_GPUVA_OP_MAP:
2706                 err = xe_vm_bind(vm, vma, op->q, xe_vma_bo(vma),
2707                                  op->syncs, op->num_syncs, op->fence,
2708                                  op->map.immediate || !xe_vm_in_fault_mode(vm),
2709                                  op->flags & XE_VMA_OP_FIRST,
2710                                  op->flags & XE_VMA_OP_LAST);
2711                 break;
2712         case DRM_GPUVA_OP_REMAP:
2713         {
2714                 bool prev = !!op->remap.prev;
2715                 bool next = !!op->remap.next;
2716
2717                 if (!op->remap.unmap_done) {
2718                         if (prev || next) {
2719                                 vm->async_ops.munmap_rebind_inflight = true;
2720                                 vma->gpuva.flags |= XE_VMA_FIRST_REBIND;
2721                         }
2722                         err = xe_vm_unbind(vm, vma, op->q, op->syncs,
2723                                            op->num_syncs,
2724                                            !prev && !next ? op->fence : NULL,
2725                                            op->flags & XE_VMA_OP_FIRST,
2726                                            op->flags & XE_VMA_OP_LAST && !prev &&
2727                                            !next);
2728                         if (err)
2729                                 break;
2730                         op->remap.unmap_done = true;
2731                 }
2732
2733                 if (prev) {
2734                         op->remap.prev->gpuva.flags |= XE_VMA_LAST_REBIND;
2735                         err = xe_vm_bind(vm, op->remap.prev, op->q,
2736                                          xe_vma_bo(op->remap.prev), op->syncs,
2737                                          op->num_syncs,
2738                                          !next ? op->fence : NULL, true, false,
2739                                          op->flags & XE_VMA_OP_LAST && !next);
2740                         op->remap.prev->gpuva.flags &= ~XE_VMA_LAST_REBIND;
2741                         if (err)
2742                                 break;
2743                         op->remap.prev = NULL;
2744                 }
2745
2746                 if (next) {
2747                         op->remap.next->gpuva.flags |= XE_VMA_LAST_REBIND;
2748                         err = xe_vm_bind(vm, op->remap.next, op->q,
2749                                          xe_vma_bo(op->remap.next),
2750                                          op->syncs, op->num_syncs,
2751                                          op->fence, true, false,
2752                                          op->flags & XE_VMA_OP_LAST);
2753                         op->remap.next->gpuva.flags &= ~XE_VMA_LAST_REBIND;
2754                         if (err)
2755                                 break;
2756                         op->remap.next = NULL;
2757                 }
2758                 vm->async_ops.munmap_rebind_inflight = false;
2759
2760                 break;
2761         }
2762         case DRM_GPUVA_OP_UNMAP:
2763                 err = xe_vm_unbind(vm, vma, op->q, op->syncs,
2764                                    op->num_syncs, op->fence,
2765                                    op->flags & XE_VMA_OP_FIRST,
2766                                    op->flags & XE_VMA_OP_LAST);
2767                 break;
2768         case DRM_GPUVA_OP_PREFETCH:
2769                 err = xe_vm_prefetch(vm, vma, op->q, op->prefetch.region,
2770                                      op->syncs, op->num_syncs, op->fence,
2771                                      op->flags & XE_VMA_OP_FIRST,
2772                                      op->flags & XE_VMA_OP_LAST);
2773                 break;
2774         default:
2775                 XE_WARN_ON("NOT POSSIBLE");
2776         }
2777
2778         ttm_eu_backoff_reservation(&ww, &objs);
2779         if (err == -EAGAIN && xe_vma_is_userptr(vma)) {
2780                 lockdep_assert_held_write(&vm->lock);
2781                 err = xe_vma_userptr_pin_pages(vma);
2782                 if (!err)
2783                         goto again;
2784         }
2785         xe_bo_put(vbo);
2786
2787         if (err)
2788                 trace_xe_vma_fail(vma);
2789
2790         return err;
2791 }
2792
2793 static int xe_vma_op_execute(struct xe_vm *vm, struct xe_vma_op *op)
2794 {
2795         int ret = 0;
2796
2797         lockdep_assert_held_write(&vm->lock);
2798
2799 #ifdef TEST_VM_ASYNC_OPS_ERROR
2800         if (op->inject_error) {
2801                 op->inject_error = false;
2802                 return -ENOMEM;
2803         }
2804 #endif
2805
2806         switch (op->base.op) {
2807         case DRM_GPUVA_OP_MAP:
2808                 ret = __xe_vma_op_execute(vm, op->map.vma, op);
2809                 break;
2810         case DRM_GPUVA_OP_REMAP:
2811         {
2812                 struct xe_vma *vma;
2813
2814                 if (!op->remap.unmap_done)
2815                         vma = gpuva_to_vma(op->base.remap.unmap->va);
2816                 else if (op->remap.prev)
2817                         vma = op->remap.prev;
2818                 else
2819                         vma = op->remap.next;
2820
2821                 ret = __xe_vma_op_execute(vm, vma, op);
2822                 break;
2823         }
2824         case DRM_GPUVA_OP_UNMAP:
2825                 ret = __xe_vma_op_execute(vm, gpuva_to_vma(op->base.unmap.va),
2826                                           op);
2827                 break;
2828         case DRM_GPUVA_OP_PREFETCH:
2829                 ret = __xe_vma_op_execute(vm,
2830                                           gpuva_to_vma(op->base.prefetch.va),
2831                                           op);
2832                 break;
2833         default:
2834                 XE_WARN_ON("NOT POSSIBLE");
2835         }
2836
2837         return ret;
2838 }
2839
2840 static void xe_vma_op_cleanup(struct xe_vm *vm, struct xe_vma_op *op)
2841 {
2842         bool last = op->flags & XE_VMA_OP_LAST;
2843
2844         if (last) {
2845                 while (op->num_syncs--)
2846                         xe_sync_entry_cleanup(&op->syncs[op->num_syncs]);
2847                 kfree(op->syncs);
2848                 if (op->q)
2849                         xe_exec_queue_put(op->q);
2850                 if (op->fence)
2851                         dma_fence_put(&op->fence->fence);
2852         }
2853         if (!list_empty(&op->link)) {
2854                 spin_lock_irq(&vm->async_ops.lock);
2855                 list_del(&op->link);
2856                 spin_unlock_irq(&vm->async_ops.lock);
2857         }
2858         if (op->ops)
2859                 drm_gpuva_ops_free(&vm->gpuvm, op->ops);
2860         if (last)
2861                 xe_vm_put(vm);
2862 }
2863
2864 static void xe_vma_op_unwind(struct xe_vm *vm, struct xe_vma_op *op,
2865                              bool post_commit, bool prev_post_commit,
2866                              bool next_post_commit)
2867 {
2868         lockdep_assert_held_write(&vm->lock);
2869
2870         switch (op->base.op) {
2871         case DRM_GPUVA_OP_MAP:
2872                 if (op->map.vma) {
2873                         prep_vma_destroy(vm, op->map.vma, post_commit);
2874                         xe_vma_destroy_unlocked(op->map.vma);
2875                 }
2876                 break;
2877         case DRM_GPUVA_OP_UNMAP:
2878         {
2879                 struct xe_vma *vma = gpuva_to_vma(op->base.unmap.va);
2880
2881                 if (vma) {
2882                         down_read(&vm->userptr.notifier_lock);
2883                         vma->gpuva.flags &= ~XE_VMA_DESTROYED;
2884                         up_read(&vm->userptr.notifier_lock);
2885                         if (post_commit)
2886                                 xe_vm_insert_vma(vm, vma);
2887                 }
2888                 break;
2889         }
2890         case DRM_GPUVA_OP_REMAP:
2891         {
2892                 struct xe_vma *vma = gpuva_to_vma(op->base.remap.unmap->va);
2893
2894                 if (op->remap.prev) {
2895                         prep_vma_destroy(vm, op->remap.prev, prev_post_commit);
2896                         xe_vma_destroy_unlocked(op->remap.prev);
2897                 }
2898                 if (op->remap.next) {
2899                         prep_vma_destroy(vm, op->remap.next, next_post_commit);
2900                         xe_vma_destroy_unlocked(op->remap.next);
2901                 }
2902                 if (vma) {
2903                         down_read(&vm->userptr.notifier_lock);
2904                         vma->gpuva.flags &= ~XE_VMA_DESTROYED;
2905                         up_read(&vm->userptr.notifier_lock);
2906                         if (post_commit)
2907                                 xe_vm_insert_vma(vm, vma);
2908                 }
2909                 break;
2910         }
2911         case DRM_GPUVA_OP_PREFETCH:
2912                 /* Nothing to do */
2913                 break;
2914         default:
2915                 XE_WARN_ON("NOT POSSIBLE");
2916         }
2917 }
2918
2919 static struct xe_vma_op *next_vma_op(struct xe_vm *vm)
2920 {
2921         return list_first_entry_or_null(&vm->async_ops.pending,
2922                                         struct xe_vma_op, link);
2923 }
2924
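/*
 * Worker draining vm->async_ops.pending. On the first failure the VM enters
 * an error state (vm->async_ops.error) and processing stops; once the VM is
 * closed the remaining operations are only cleaned up and their fences
 * signalled.
 */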
2925 static void xe_vma_op_work_func(struct work_struct *w)
2926 {
2927         struct xe_vm *vm = container_of(w, struct xe_vm, async_ops.work);
2928
2929         for (;;) {
2930                 struct xe_vma_op *op;
2931                 int err;
2932
2933                 if (vm->async_ops.error && !xe_vm_is_closed(vm))
2934                         break;
2935
2936                 spin_lock_irq(&vm->async_ops.lock);
2937                 op = next_vma_op(vm);
2938                 spin_unlock_irq(&vm->async_ops.lock);
2939
2940                 if (!op)
2941                         break;
2942
2943                 if (!xe_vm_is_closed(vm)) {
2944                         down_write(&vm->lock);
2945                         err = xe_vma_op_execute(vm, op);
2946                         if (err) {
2947                                 drm_warn(&vm->xe->drm,
2948                                          "Async VM op(%d) failed with %d",
2949                                          op->base.op, err);
2950                                 vm_set_async_error(vm, err);
2951                                 up_write(&vm->lock);
2952
2953                                 if (vm->async_ops.error_capture.addr)
2954                                         vm_error_capture(vm, err, 0, 0, 0);
2955                                 break;
2956                         }
2957                         up_write(&vm->lock);
2958                 } else {
2959                         struct xe_vma *vma;
2960
2961                         switch (op->base.op) {
2962                         case DRM_GPUVA_OP_REMAP:
2963                                 vma = gpuva_to_vma(op->base.remap.unmap->va);
2964                                 trace_xe_vma_flush(vma);
2965
2966                                 down_write(&vm->lock);
2967                                 xe_vma_destroy_unlocked(vma);
2968                                 up_write(&vm->lock);
2969                                 break;
2970                         case DRM_GPUVA_OP_UNMAP:
2971                                 vma = gpuva_to_vma(op->base.unmap.va);
2972                                 trace_xe_vma_flush(vma);
2973
2974                                 down_write(&vm->lock);
2975                                 xe_vma_destroy_unlocked(vma);
2976                                 up_write(&vm->lock);
2977                                 break;
2978                         default:
2979                                 /* Nothing to do */
2980                                 break;
2981                         }
2982
2983                         if (op->fence && !test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
2984                                                    &op->fence->fence.flags)) {
2985                                 if (!xe_vm_no_dma_fences(vm)) {
2986                                         op->fence->started = true;
2987                                         wake_up_all(&op->fence->wq);
2988                                 }
2989                                 dma_fence_signal(&op->fence->fence);
2990                         }
2991                 }
2992
2993                 xe_vma_op_cleanup(vm, op);
2994         }
2995 }
2996
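/*
 * Execute the prepared bind operations: for a synchronous bind the last
 * operation in the list is run directly, while for an async bind the bind
 * fence is hooked up to the user out-syncs and the whole list is queued to
 * the async worker.
 */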
2997 static int vm_bind_ioctl_ops_execute(struct xe_vm *vm,
2998                                      struct list_head *ops_list, bool async)
2999 {
3000         struct xe_vma_op *op, *last_op, *next;
3001         int err;
3002
3003         lockdep_assert_held_write(&vm->lock);
3004
3005         list_for_each_entry(op, ops_list, link)
3006                 last_op = op;
3007
3008         if (!async) {
3009                 err = xe_vma_op_execute(vm, last_op);
3010                 if (err)
3011                         goto unwind;
3012                 xe_vma_op_cleanup(vm, last_op);
3013         } else {
3014                 int i;
3015                 bool installed = false;
3016
3017                 for (i = 0; i < last_op->num_syncs; i++)
3018                         installed |= xe_sync_entry_signal(&last_op->syncs[i],
3019                                                           NULL,
3020                                                           &last_op->fence->fence);
3021                 if (!installed && last_op->fence)
3022                         dma_fence_signal(&last_op->fence->fence);
3023
3024                 spin_lock_irq(&vm->async_ops.lock);
3025                 list_splice_tail(ops_list, &vm->async_ops.pending);
3026                 spin_unlock_irq(&vm->async_ops.lock);
3027
3028                 if (!vm->async_ops.error)
3029                         queue_work(system_unbound_wq, &vm->async_ops.work);
3030         }
3031
3032         return 0;
3033
3034 unwind:
3035         list_for_each_entry_reverse(op, ops_list, link)
3036                 xe_vma_op_unwind(vm, op, op->flags & XE_VMA_OP_COMMITTED,
3037                                  op->flags & XE_VMA_OP_PREV_COMMITTED,
3038                                  op->flags & XE_VMA_OP_NEXT_COMMITTED);
3039         list_for_each_entry_safe(op, next, ops_list, link)
3040                 xe_vma_op_cleanup(vm, op);
3041
3042         return err;
3043 }
3044
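/*
 * Unwind the VMA state changes of any (partially) committed GPUVA op lists
 * in reverse order and free the op lists.
 */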
3045 static void vm_bind_ioctl_ops_unwind(struct xe_vm *vm,
3046                                      struct drm_gpuva_ops **ops,
3047                                      int num_ops_list)
3048 {
3049         int i;
3050
3051         for (i = num_ops_list - 1; i >= 0; --i) {
3052                 struct drm_gpuva_ops *__ops = ops[i];
3053                 struct drm_gpuva_op *__op;
3054
3055                 if (!__ops)
3056                         continue;
3057
3058                 drm_gpuva_for_each_op_reverse(__op, __ops) {
3059                         struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
3060
3061                         xe_vma_op_unwind(vm, op,
3062                                          op->flags & XE_VMA_OP_COMMITTED,
3063                                          op->flags & XE_VMA_OP_PREV_COMMITTED,
3064                                          op->flags & XE_VMA_OP_NEXT_COMMITTED);
3065                 }
3066
3067                 drm_gpuva_ops_free(&vm->gpuvm, __ops);
3068         }
3069 }
3070
3071 #ifdef TEST_VM_ASYNC_OPS_ERROR
3072 #define SUPPORTED_FLAGS \
3073         (FORCE_ASYNC_OP_ERROR | XE_VM_BIND_FLAG_ASYNC | \
3074          XE_VM_BIND_FLAG_READONLY | XE_VM_BIND_FLAG_IMMEDIATE | \
3075          XE_VM_BIND_FLAG_NULL | 0xffff)
3076 #else
3077 #define SUPPORTED_FLAGS \
3078         (XE_VM_BIND_FLAG_ASYNC | XE_VM_BIND_FLAG_READONLY | \
3079          XE_VM_BIND_FLAG_IMMEDIATE | XE_VM_BIND_FLAG_NULL | 0xffff)
3080 #endif
3081 #define XE_64K_PAGE_MASK 0xffffull
3082
3083 #define MAX_BINDS       512     /* FIXME: Picking random upper limit */
3084
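/*
 * Copy in and sanity-check the array of bind ops from userspace. On success,
 * *bind_ops points either at args->bind or at an allocated copy which the
 * caller must kfree() when args->num_binds > 1.
 */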
3085 static int vm_bind_ioctl_check_args(struct xe_device *xe,
3086                                     struct drm_xe_vm_bind *args,
3087                                     struct drm_xe_vm_bind_op **bind_ops,
3088                                     bool *async)
3089 {
3090         int err;
3091         int i;
3092
3093         if (XE_IOCTL_DBG(xe, args->extensions) ||
3094             XE_IOCTL_DBG(xe, !args->num_binds) ||
3095             XE_IOCTL_DBG(xe, args->num_binds > MAX_BINDS))
3096                 return -EINVAL;
3097
3098         if (args->num_binds > 1) {
3099                 u64 __user *bind_user =
3100                         u64_to_user_ptr(args->vector_of_binds);
3101
3102                 *bind_ops = kmalloc(sizeof(struct drm_xe_vm_bind_op) *
3103                                     args->num_binds, GFP_KERNEL);
3104                 if (!*bind_ops)
3105                         return -ENOMEM;
3106
3107                 err = __copy_from_user(*bind_ops, bind_user,
3108                                        sizeof(struct drm_xe_vm_bind_op) *
3109                                        args->num_binds);
3110                 if (XE_IOCTL_DBG(xe, err)) {
3111                         err = -EFAULT;
3112                         goto free_bind_ops;
3113                 }
3114         } else {
3115                 *bind_ops = &args->bind;
3116         }
3117
3118         for (i = 0; i < args->num_binds; ++i) {
3119                 u64 range = (*bind_ops)[i].range;
3120                 u64 addr = (*bind_ops)[i].addr;
3121                 u32 op = (*bind_ops)[i].op;
3122                 u32 obj = (*bind_ops)[i].obj;
3123                 u64 obj_offset = (*bind_ops)[i].obj_offset;
3124                 u32 region = (*bind_ops)[i].region;
3125                 bool is_null = op & XE_VM_BIND_FLAG_NULL;
3126
3127                 if (i == 0) {
3128                         *async = !!(op & XE_VM_BIND_FLAG_ASYNC);
3129                 } else if (XE_IOCTL_DBG(xe, !*async) ||
3130                            XE_IOCTL_DBG(xe, !(op & XE_VM_BIND_FLAG_ASYNC)) ||
3131                            XE_IOCTL_DBG(xe, VM_BIND_OP(op) ==
3132                                         XE_VM_BIND_OP_RESTART)) {
3133                         err = -EINVAL;
3134                         goto free_bind_ops;
3135                 }
3136
3137                 if (XE_IOCTL_DBG(xe, !*async &&
3138                                  VM_BIND_OP(op) == XE_VM_BIND_OP_UNMAP_ALL)) {
3139                         err = -EINVAL;
3140                         goto free_bind_ops;
3141                 }
3142
3143                 if (XE_IOCTL_DBG(xe, !*async &&
3144                                  VM_BIND_OP(op) == XE_VM_BIND_OP_PREFETCH)) {
3145                         err = -EINVAL;
3146                         goto free_bind_ops;
3147                 }
3148
3149                 if (XE_IOCTL_DBG(xe, VM_BIND_OP(op) >
3150                                  XE_VM_BIND_OP_PREFETCH) ||
3151                     XE_IOCTL_DBG(xe, op & ~SUPPORTED_FLAGS) ||
3152                     XE_IOCTL_DBG(xe, obj && is_null) ||
3153                     XE_IOCTL_DBG(xe, obj_offset && is_null) ||
3154                     XE_IOCTL_DBG(xe, VM_BIND_OP(op) != XE_VM_BIND_OP_MAP &&
3155                                  is_null) ||
3156                     XE_IOCTL_DBG(xe, !obj &&
3157                                  VM_BIND_OP(op) == XE_VM_BIND_OP_MAP &&
3158                                  !is_null) ||
3159                     XE_IOCTL_DBG(xe, !obj &&
3160                                  VM_BIND_OP(op) == XE_VM_BIND_OP_UNMAP_ALL) ||
3161                     XE_IOCTL_DBG(xe, addr &&
3162                                  VM_BIND_OP(op) == XE_VM_BIND_OP_UNMAP_ALL) ||
3163                     XE_IOCTL_DBG(xe, range &&
3164                                  VM_BIND_OP(op) == XE_VM_BIND_OP_UNMAP_ALL) ||
3165                     XE_IOCTL_DBG(xe, obj &&
3166                                  VM_BIND_OP(op) == XE_VM_BIND_OP_MAP_USERPTR) ||
3167                     XE_IOCTL_DBG(xe, obj &&
3168                                  VM_BIND_OP(op) == XE_VM_BIND_OP_PREFETCH) ||
3169                     XE_IOCTL_DBG(xe, region &&
3170                                  VM_BIND_OP(op) != XE_VM_BIND_OP_PREFETCH) ||
3171                     XE_IOCTL_DBG(xe, !(BIT(region) &
3172                                        xe->info.mem_region_mask)) ||
3173                     XE_IOCTL_DBG(xe, obj &&
3174                                  VM_BIND_OP(op) == XE_VM_BIND_OP_UNMAP)) {
3175                         err = -EINVAL;
3176                         goto free_bind_ops;
3177                 }
3178
3179                 if (XE_IOCTL_DBG(xe, obj_offset & ~PAGE_MASK) ||
3180                     XE_IOCTL_DBG(xe, addr & ~PAGE_MASK) ||
3181                     XE_IOCTL_DBG(xe, range & ~PAGE_MASK) ||
3182                     XE_IOCTL_DBG(xe, !range && VM_BIND_OP(op) !=
3183                                  XE_VM_BIND_OP_RESTART &&
3184                                  VM_BIND_OP(op) != XE_VM_BIND_OP_UNMAP_ALL)) {
3185                         err = -EINVAL;
3186                         goto free_bind_ops;
3187                 }
3188         }
3189
3190         return 0;
3191
3192 free_bind_ops:
3193         if (args->num_binds > 1)
3194                 kfree(*bind_ops);
3195         return err;
3196 }
3197
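/*
 * Illustrative sketch only (not part of the driver): roughly how userspace
 * might submit a single map through this ioctl for a VM created without
 * async bind ops. Handle and address values are placeholders; addr/range
 * must be page aligned and lie within the VM. For a VM created with async
 * binds, XE_VM_BIND_FLAG_ASYNC would additionally be set in .bind.op.
 *
 *	struct drm_xe_vm_bind bind = {
 *		.vm_id = vm_id,
 *		.num_binds = 1,
 *		.bind.obj = bo_handle,
 *		.bind.obj_offset = 0,
 *		.bind.addr = 0x200000,
 *		.bind.range = 0x10000,
 *		.bind.op = XE_VM_BIND_OP_MAP,
 *	};
 *
 *	err = ioctl(drm_fd, DRM_IOCTL_XE_VM_BIND, &bind);
 */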
3198 int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
3199 {
3200         struct xe_device *xe = to_xe_device(dev);
3201         struct xe_file *xef = to_xe_file(file);
3202         struct drm_xe_vm_bind *args = data;
3203         struct drm_xe_sync __user *syncs_user;
3204         struct xe_bo **bos = NULL;
3205         struct drm_gpuva_ops **ops = NULL;
3206         struct xe_vm *vm;
3207         struct xe_exec_queue *q = NULL;
3208         u32 num_syncs;
3209         struct xe_sync_entry *syncs = NULL;
3210         struct drm_xe_vm_bind_op *bind_ops;
3211         LIST_HEAD(ops_list);
3212         bool async;
3213         int err;
3214         int i;
3215
3216         err = vm_bind_ioctl_check_args(xe, args, &bind_ops, &async);
3217         if (err)
3218                 return err;
3219
3220         if (args->exec_queue_id) {
3221                 q = xe_exec_queue_lookup(xef, args->exec_queue_id);
3222                 if (XE_IOCTL_DBG(xe, !q)) {
3223                         err = -ENOENT;
3224                         goto free_objs;
3225                 }
3226
3227                 if (XE_IOCTL_DBG(xe, !(q->flags & EXEC_QUEUE_FLAG_VM))) {
3228                         err = -EINVAL;
3229                         goto put_exec_queue;
3230                 }
3231         }
3232
3233         vm = xe_vm_lookup(xef, args->vm_id);
3234         if (XE_IOCTL_DBG(xe, !vm)) {
3235                 err = -EINVAL;
3236                 goto put_exec_queue;
3237         }
3238
3239         err = down_write_killable(&vm->lock);
3240         if (err)
3241                 goto put_vm;
3242
3243         if (XE_IOCTL_DBG(xe, xe_vm_is_closed_or_banned(vm))) {
3244                 err = -ENOENT;
3245                 goto release_vm_lock;
3246         }
3247
3248         if (VM_BIND_OP(bind_ops[0].op) == XE_VM_BIND_OP_RESTART) {
3249                 if (XE_IOCTL_DBG(xe, !(vm->flags & XE_VM_FLAG_ASYNC_BIND_OPS)))
3250                         err = -EOPNOTSUPP;
3251                 if (XE_IOCTL_DBG(xe, !err && args->num_syncs))
3252                         err = -EINVAL;
3253                 if (XE_IOCTL_DBG(xe, !err && !vm->async_ops.error))
3254                         err = -EPROTO;
3255
3256                 if (!err) {
3257                         trace_xe_vm_restart(vm);
3258                         vm_set_async_error(vm, 0);
3259
3260                         queue_work(system_unbound_wq, &vm->async_ops.work);
3261
3262                         /* Rebinds may have been blocked, give worker a kick */
3263                         if (xe_vm_in_compute_mode(vm))
3264                                 xe_vm_queue_rebind_worker(vm);
3265                 }
3266
3267                 goto release_vm_lock;
3268         }
3269
3270         if (XE_IOCTL_DBG(xe, !vm->async_ops.error &&
3271                          async != !!(vm->flags & XE_VM_FLAG_ASYNC_BIND_OPS))) {
3272                 err = -EOPNOTSUPP;
3273                 goto release_vm_lock;
3274         }
3275
3276         for (i = 0; i < args->num_binds; ++i) {
3277                 u64 range = bind_ops[i].range;
3278                 u64 addr = bind_ops[i].addr;
3279
3280                 if (XE_IOCTL_DBG(xe, range > vm->size) ||
3281                     XE_IOCTL_DBG(xe, addr > vm->size - range)) {
3282                         err = -EINVAL;
3283                         goto release_vm_lock;
3284                 }
3285
3286                 if (bind_ops[i].tile_mask) {
3287                         u64 valid_tiles = BIT(xe->info.tile_count) - 1;
3288
3289                         if (XE_IOCTL_DBG(xe, bind_ops[i].tile_mask &
3290                                          ~valid_tiles)) {
3291                                 err = -EINVAL;
3292                                 goto release_vm_lock;
3293                         }
3294                 }
3295         }
3296
3297         bos = kcalloc(args->num_binds, sizeof(*bos), GFP_KERNEL);
3298         if (!bos) {
3299                 err = -ENOMEM;
3300                 goto release_vm_lock;
3301         }
3302
3303         ops = kcalloc(args->num_binds, sizeof(*ops), GFP_KERNEL);
3304         if (!ops) {
3305                 err = -ENOMEM;
3306                 goto release_vm_lock;
3307         }
3308
3309         for (i = 0; i < args->num_binds; ++i) {
3310                 struct drm_gem_object *gem_obj;
3311                 u64 range = bind_ops[i].range;
3312                 u64 addr = bind_ops[i].addr;
3313                 u32 obj = bind_ops[i].obj;
3314                 u64 obj_offset = bind_ops[i].obj_offset;
3315
3316                 if (!obj)
3317                         continue;
3318
3319                 gem_obj = drm_gem_object_lookup(file, obj);
3320                 if (XE_IOCTL_DBG(xe, !gem_obj)) {
3321                         err = -ENOENT;
3322                         goto put_obj;
3323                 }
3324                 bos[i] = gem_to_xe_bo(gem_obj);
3325
3326                 if (XE_IOCTL_DBG(xe, range > bos[i]->size) ||
3327                     XE_IOCTL_DBG(xe, obj_offset >
3328                                  bos[i]->size - range)) {
3329                         err = -EINVAL;
3330                         goto put_obj;
3331                 }
3332
3333                 if (bos[i]->flags & XE_BO_INTERNAL_64K) {
3334                         if (XE_IOCTL_DBG(xe, obj_offset &
3335                                          XE_64K_PAGE_MASK) ||
3336                             XE_IOCTL_DBG(xe, addr & XE_64K_PAGE_MASK) ||
3337                             XE_IOCTL_DBG(xe, range & XE_64K_PAGE_MASK)) {
3338                                 err = -EINVAL;
3339                                 goto put_obj;
3340                         }
3341                 }
3342         }
3343
3344         if (args->num_syncs) {
3345                 syncs = kcalloc(args->num_syncs, sizeof(*syncs), GFP_KERNEL);
3346                 if (!syncs) {
3347                         err = -ENOMEM;
3348                         goto put_obj;
3349                 }
3350         }
3351
3352         syncs_user = u64_to_user_ptr(args->syncs);
3353         for (num_syncs = 0; num_syncs < args->num_syncs; num_syncs++) {
3354                 err = xe_sync_entry_parse(xe, xef, &syncs[num_syncs],
3355                                           &syncs_user[num_syncs], false,
3356                                           xe_vm_no_dma_fences(vm));
3357                 if (err)
3358                         goto free_syncs;
3359         }
3360
3361         /* Do some error checking first to make the unwind easier */
3362         for (i = 0; i < args->num_binds; ++i) {
3363                 u64 range = bind_ops[i].range;
3364                 u64 addr = bind_ops[i].addr;
3365                 u32 op = bind_ops[i].op;
3366
3367                 err = vm_bind_ioctl_lookup_vma(vm, bos[i], addr, range, op);
3368                 if (err)
3369                         goto free_syncs;
3370         }
3371
3372         for (i = 0; i < args->num_binds; ++i) {
3373                 u64 range = bind_ops[i].range;
3374                 u64 addr = bind_ops[i].addr;
3375                 u32 op = bind_ops[i].op;
3376                 u64 obj_offset = bind_ops[i].obj_offset;
3377                 u8 tile_mask = bind_ops[i].tile_mask;
3378                 u32 region = bind_ops[i].region;
3379
3380                 ops[i] = vm_bind_ioctl_ops_create(vm, bos[i], obj_offset,
3381                                                   addr, range, op, tile_mask,
3382                                                   region);
3383                 if (IS_ERR(ops[i])) {
3384                         err = PTR_ERR(ops[i]);
3385                         ops[i] = NULL;
3386                         goto unwind_ops;
3387                 }
3388
3389                 err = vm_bind_ioctl_ops_parse(vm, q, ops[i], syncs, num_syncs,
3390                                               &ops_list,
3391                                               i == args->num_binds - 1,
3392                                               async);
3393                 if (err)
3394                         goto unwind_ops;
3395         }
3396
3397         /* Nothing to do */
3398         if (list_empty(&ops_list)) {
3399                 err = -ENODATA;
3400                 goto unwind_ops;
3401         }
3402
3403         err = vm_bind_ioctl_ops_execute(vm, &ops_list, async);
3404         up_write(&vm->lock);
3405
3406         for (i = 0; i < args->num_binds; ++i)
3407                 xe_bo_put(bos[i]);
3408
3409         kfree(bos);
3410         kfree(ops);
3411         if (args->num_binds > 1)
3412                 kfree(bind_ops);
3413
3414         return err;
3415
3416 unwind_ops:
3417         vm_bind_ioctl_ops_unwind(vm, ops, args->num_binds);
3418 free_syncs:
3419         for (i = 0; err == -ENODATA && i < num_syncs; i++)
3420                 xe_sync_entry_signal(&syncs[i], NULL, dma_fence_get_stub());
3421         while (num_syncs--)
3422                 xe_sync_entry_cleanup(&syncs[num_syncs]);
3423
3424         kfree(syncs);
3425 put_obj:
3426         for (i = 0; i < args->num_binds; ++i)
3427                 xe_bo_put(bos[i]);
3428 release_vm_lock:
3429         up_write(&vm->lock);
3430 put_vm:
3431         xe_vm_put(vm);
3432 put_exec_queue:
3433         if (q)
3434                 xe_exec_queue_put(q);
3435 free_objs:
3436         kfree(bos);
3437         kfree(ops);
3438         if (args->num_binds > 1)
3439                 kfree(bind_ops);
3440         return err == -ENODATA ? 0 : err;
3441 }
3442
3443 /**
3444  * xe_vm_lock() - Lock the vm's dma_resv object
3445  * @vm: The struct xe_vm whose lock is to be locked
3446  * @intr: Whether to perform any waits interruptibly
3447  *
3448  * Return: 0 on success, -EINTR if @intr is true and the wait for a
3449  * contended lock was interrupted. If @intr is false, the function
3450  * always returns 0.
3451  */
3452 int xe_vm_lock(struct xe_vm *vm, bool intr)
3453 {
3454         if (intr)
3455                 return dma_resv_lock_interruptible(xe_vm_resv(vm), NULL);
3456
3457         return dma_resv_lock(xe_vm_resv(vm), NULL);
3458 }
3459
3460 /**
3461  * xe_vm_unlock() - Unlock the vm's dma_resv object
3462  * @vm: The struct xe_vm whose lock is to be released.
3463  *
3464  * Unlock the vm's dma_resv object, previously locked by xe_vm_lock().
3465  */
3466 void xe_vm_unlock(struct xe_vm *vm)
3467 {
3468         dma_resv_unlock(xe_vm_resv(vm));
3469 }
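/*
 * Illustrative sketch only (not part of the driver): callers typically pair
 * xe_vm_lock() and xe_vm_unlock() around a short section that needs the vm's
 * dma_resv held, forwarding -EINTR when the interruptible variant is used.
 * example_touch_vm() below is a placeholder, not a real function.
 *
 *	static int example_touch_vm(struct xe_vm *vm)
 *	{
 *		int err;
 *
 *		err = xe_vm_lock(vm, true);
 *		if (err)
 *			return err;
 *
 *		(do work that requires the vm's dma_resv to be held)
 *
 *		xe_vm_unlock(vm);
 *		return 0;
 *	}
 */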
3470
3471 /**
3472  * xe_vm_invalidate_vma() - invalidate GPU mappings for a VMA without a lock
3473  * @vma: VMA to invalidate
3474  *
3475  * Walks the list of page-table leaves, zeroing the entries owned by this
3476  * VMA, then invalidates the TLBs and blocks until the TLB invalidation is
3477  * complete.
3478  *
3479  * Returns 0 for success, negative error code otherwise.
3480  */
3481 int xe_vm_invalidate_vma(struct xe_vma *vma)
3482 {
3483         struct xe_device *xe = xe_vma_vm(vma)->xe;
3484         struct xe_tile *tile;
3485         u32 tile_needs_invalidate = 0;
3486         int seqno[XE_MAX_TILES_PER_DEVICE];
3487         u8 id;
3488         int ret;
3489
3490         XE_WARN_ON(!xe_vm_in_fault_mode(xe_vma_vm(vma)));
3491         XE_WARN_ON(xe_vma_is_null(vma));
3492         trace_xe_vma_usm_invalidate(vma);
3493
3494         /* Check that we don't race with page-table updates */
3495         if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
3496                 if (xe_vma_is_userptr(vma)) {
3497                         WARN_ON_ONCE(!mmu_interval_check_retry
3498                                      (&vma->userptr.notifier,
3499                                       vma->userptr.notifier_seq));
3500                         WARN_ON_ONCE(!dma_resv_test_signaled(xe_vm_resv(xe_vma_vm(vma)),
3501                                                              DMA_RESV_USAGE_BOOKKEEP));
3502
3503                 } else {
3504                         xe_bo_assert_held(xe_vma_bo(vma));
3505                 }
3506         }
3507
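        /* Zap the PTEs covering this VMA on each tile and issue a TLB invalidation. */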
3508         for_each_tile(tile, xe, id) {
3509                 if (xe_pt_zap_ptes(tile, vma)) {
3510                         tile_needs_invalidate |= BIT(id);
3511                         xe_device_wmb(xe);
3512                         /*
3513                          * FIXME: We potentially need to invalidate multiple
3514                          * GTs within the tile
3515                          */
3516                         seqno[id] = xe_gt_tlb_invalidation_vma(tile->primary_gt, NULL, vma);
3517                         if (seqno[id] < 0)
3518                                 return seqno[id];
3519                 }
3520         }
3521
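        /* Wait for the issued TLB invalidations to complete. */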
3522         for_each_tile(tile, xe, id) {
3523                 if (tile_needs_invalidate & BIT(id)) {
3524                         ret = xe_gt_tlb_invalidation_wait(tile->primary_gt, seqno[id]);
3525                         if (ret < 0)
3526                                 return ret;
3527                 }
3528         }
3529
3530         vma->usm.tile_invalidated = vma->tile_mask;
3531
3532         return 0;
3533 }
3534
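/*
 * Dump the VM's page-table root and GPU VA mappings to the given drm_printer
 * for debug and error capture.
 */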
3535 int xe_analyze_vm(struct drm_printer *p, struct xe_vm *vm, int gt_id)
3536 {
3537         struct drm_gpuva *gpuva;
3538         bool is_vram;
3539         u64 addr;
3540
3541         if (!down_read_trylock(&vm->lock)) {
3542                 drm_printf(p, " Failed to acquire VM lock to dump capture\n");
3543                 return 0;
3544         }
3545         if (vm->pt_root[gt_id]) {
3546                 addr = xe_bo_addr(vm->pt_root[gt_id]->bo, 0, XE_PAGE_SIZE);
3547                 is_vram = xe_bo_is_vram(vm->pt_root[gt_id]->bo);
3548                 drm_printf(p, " VM root: A:0x%llx %s\n", addr,
3549                            is_vram ? "VRAM" : "SYS");
3550         }
3551
3552         drm_gpuvm_for_each_va(gpuva, &vm->gpuvm) {
3553                 struct xe_vma *vma = gpuva_to_vma(gpuva);
3554                 bool is_userptr = xe_vma_is_userptr(vma);
3555                 bool is_null = xe_vma_is_null(vma);
3556
3557                 if (is_null) {
3558                         addr = 0;
3559                 } else if (is_userptr) {
3560                         struct xe_res_cursor cur;
3561
3562                         if (vma->userptr.sg) {
3563                                 xe_res_first_sg(vma->userptr.sg, 0, XE_PAGE_SIZE,
3564                                                 &cur);
3565                                 addr = xe_res_dma(&cur);
3566                         } else {
3567                                 addr = 0;
3568                         }
3569                 } else {
3570                         addr = __xe_bo_addr(xe_vma_bo(vma), 0, XE_PAGE_SIZE);
3571                         is_vram = xe_bo_is_vram(xe_vma_bo(vma));
3572                 }
3573                 drm_printf(p, " [%016llx-%016llx] S:0x%016llx A:%016llx %s\n",
3574                            xe_vma_start(vma), xe_vma_end(vma) - 1,
3575                            xe_vma_size(vma),
3576                            addr, is_null ? "NULL" : is_userptr ? "USR" :
3577                            is_vram ? "VRAM" : "SYS");
3578         }
3579         up_read(&vm->lock);
3580
3581         return 0;
3582 }