drm/xe/vm: Remove VM_BIND_OP macro
[linux-2.6-microblaze.git] drivers/gpu/drm/xe/xe_vm.c
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2021 Intel Corporation
4  */
5
6 #include "xe_vm.h"
7
8 #include <linux/dma-fence-array.h>
9
10 #include <drm/drm_exec.h>
11 #include <drm/drm_print.h>
12 #include <drm/ttm/ttm_execbuf_util.h>
13 #include <drm/ttm/ttm_tt.h>
14 #include <drm/xe_drm.h>
15 #include <linux/delay.h>
16 #include <linux/kthread.h>
17 #include <linux/mm.h>
18 #include <linux/swap.h>
19
20 #include "xe_assert.h"
21 #include "xe_bo.h"
22 #include "xe_device.h"
23 #include "xe_drm_client.h"
24 #include "xe_exec_queue.h"
25 #include "xe_gt.h"
26 #include "xe_gt_pagefault.h"
27 #include "xe_gt_tlb_invalidation.h"
28 #include "xe_migrate.h"
29 #include "xe_pm.h"
30 #include "xe_preempt_fence.h"
31 #include "xe_pt.h"
32 #include "xe_res_cursor.h"
33 #include "xe_sync.h"
34 #include "xe_trace.h"
35 #include "generated/xe_wa_oob.h"
36 #include "xe_wa.h"
37
38 #define TEST_VM_ASYNC_OPS_ERROR
39
40 static struct drm_gem_object *xe_vm_obj(struct xe_vm *vm)
41 {
42         return vm->gpuvm.r_obj;
43 }
44
45 /**
46  * xe_vma_userptr_check_repin() - Advisory check for repin needed
47  * @vma: The userptr vma
48  *
49  * Check if the userptr vma has been invalidated since last successful
50  * repin. The check is advisory only and the function can be called
51  * without the vm->userptr.notifier_lock held. There is no guarantee that the
52  * vma userptr will remain valid after a lockless check, so typically
53  * the call needs to be followed by a proper check under the notifier_lock.
54  *
55  * Return: 0 if userptr vma is valid, -EAGAIN otherwise; repin recommended.
56  */
57 int xe_vma_userptr_check_repin(struct xe_vma *vma)
58 {
59         return mmu_interval_check_retry(&vma->userptr.notifier,
60                                         vma->userptr.notifier_seq) ?
61                 -EAGAIN : 0;
62 }
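/*
 * Illustrative usage note (not a prescribed flow): callers typically pair the
 * lockless check above with xe_vma_userptr_pin_pages() and only trust the
 * result after a final check under vm->userptr.notifier_lock; see the
 * VM-level pattern in the rebind worker further down, where
 * xe_vm_userptr_check_repin() is followed by __xe_vm_userptr_needs_repin()
 * under the notifier lock.
 */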
63
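/*
 * xe_vma_userptr_pin_pages() - refresh the DMA mapping of a userptr vma: if
 * the MMU notifier sequence shows the range was invalidated, drop any old
 * sg-table mapping, fault the pages in with get_user_pages_fast() (switching
 * to the notifier's mm when called from a kthread), build and dma-map a new
 * sg table, then drop the page references again; coherency is maintained via
 * the MMU notifier rather than a long-term pin. Retries if the range is
 * invalidated again while repinning.
 */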
64 int xe_vma_userptr_pin_pages(struct xe_vma *vma)
65 {
66         struct xe_vm *vm = xe_vma_vm(vma);
67         struct xe_device *xe = vm->xe;
68         const unsigned long num_pages = xe_vma_size(vma) >> PAGE_SHIFT;
69         struct page **pages;
70         bool in_kthread = !current->mm;
71         unsigned long notifier_seq;
72         int pinned, ret, i;
73         bool read_only = xe_vma_read_only(vma);
74
75         lockdep_assert_held(&vm->lock);
76         xe_assert(xe, xe_vma_is_userptr(vma));
77 retry:
78         if (vma->gpuva.flags & XE_VMA_DESTROYED)
79                 return 0;
80
81         notifier_seq = mmu_interval_read_begin(&vma->userptr.notifier);
82         if (notifier_seq == vma->userptr.notifier_seq)
83                 return 0;
84
85         pages = kvmalloc_array(num_pages, sizeof(*pages), GFP_KERNEL);
86         if (!pages)
87                 return -ENOMEM;
88
89         if (vma->userptr.sg) {
90                 dma_unmap_sgtable(xe->drm.dev,
91                                   vma->userptr.sg,
92                                   read_only ? DMA_TO_DEVICE :
93                                   DMA_BIDIRECTIONAL, 0);
94                 sg_free_table(vma->userptr.sg);
95                 vma->userptr.sg = NULL;
96         }
97
98         pinned = ret = 0;
99         if (in_kthread) {
100                 if (!mmget_not_zero(vma->userptr.notifier.mm)) {
101                         ret = -EFAULT;
102                         goto mm_closed;
103                 }
104                 kthread_use_mm(vma->userptr.notifier.mm);
105         }
106
107         while (pinned < num_pages) {
108                 ret = get_user_pages_fast(xe_vma_userptr(vma) +
109                                           pinned * PAGE_SIZE,
110                                           num_pages - pinned,
111                                           read_only ? 0 : FOLL_WRITE,
112                                           &pages[pinned]);
113                 if (ret < 0) {
114                         if (in_kthread)
115                                 ret = 0;
116                         break;
117                 }
118
119                 pinned += ret;
120                 ret = 0;
121         }
122
123         if (in_kthread) {
124                 kthread_unuse_mm(vma->userptr.notifier.mm);
125                 mmput(vma->userptr.notifier.mm);
126         }
127 mm_closed:
128         if (ret)
129                 goto out;
130
131         ret = sg_alloc_table_from_pages_segment(&vma->userptr.sgt, pages,
132                                                 pinned, 0,
133                                                 (u64)pinned << PAGE_SHIFT,
134                                                 xe_sg_segment_size(xe->drm.dev),
135                                                 GFP_KERNEL);
136         if (ret) {
137                 vma->userptr.sg = NULL;
138                 goto out;
139         }
140         vma->userptr.sg = &vma->userptr.sgt;
141
142         ret = dma_map_sgtable(xe->drm.dev, vma->userptr.sg,
143                               read_only ? DMA_TO_DEVICE :
144                               DMA_BIDIRECTIONAL,
145                               DMA_ATTR_SKIP_CPU_SYNC |
146                               DMA_ATTR_NO_KERNEL_MAPPING);
147         if (ret) {
148                 sg_free_table(vma->userptr.sg);
149                 vma->userptr.sg = NULL;
150                 goto out;
151         }
152
153         for (i = 0; i < pinned; ++i) {
154                 if (!read_only) {
155                         lock_page(pages[i]);
156                         set_page_dirty(pages[i]);
157                         unlock_page(pages[i]);
158                 }
159
160                 mark_page_accessed(pages[i]);
161         }
162
163 out:
164         release_pages(pages, pinned);
165         kvfree(pages);
166
167         if (!(ret < 0)) {
168                 vma->userptr.notifier_seq = notifier_seq;
169                 if (xe_vma_userptr_check_repin(vma) == -EAGAIN)
170                         goto retry;
171         }
172
173         return ret < 0 ? ret : 0;
174 }
175
176 static bool preempt_fences_waiting(struct xe_vm *vm)
177 {
178         struct xe_exec_queue *q;
179
180         lockdep_assert_held(&vm->lock);
181         xe_vm_assert_held(vm);
182
183         list_for_each_entry(q, &vm->preempt.exec_queues, compute.link) {
184                 if (!q->compute.pfence ||
185                     (q->compute.pfence && test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
186                                                    &q->compute.pfence->flags))) {
187                         return true;
188                 }
189         }
190
191         return false;
192 }
193
194 static void free_preempt_fences(struct list_head *list)
195 {
196         struct list_head *link, *next;
197
198         list_for_each_safe(link, next, list)
199                 xe_preempt_fence_free(to_preempt_fence_from_link(link));
200 }
201
202 static int alloc_preempt_fences(struct xe_vm *vm, struct list_head *list,
203                                 unsigned int *count)
204 {
205         lockdep_assert_held(&vm->lock);
206         xe_vm_assert_held(vm);
207
208         if (*count >= vm->preempt.num_exec_queues)
209                 return 0;
210
211         for (; *count < vm->preempt.num_exec_queues; ++(*count)) {
212                 struct xe_preempt_fence *pfence = xe_preempt_fence_alloc();
213
214                 if (IS_ERR(pfence))
215                         return PTR_ERR(pfence);
216
217                 list_move_tail(xe_preempt_fence_link(pfence), list);
218         }
219
220         return 0;
221 }
222
223 static int wait_for_existing_preempt_fences(struct xe_vm *vm)
224 {
225         struct xe_exec_queue *q;
226
227         xe_vm_assert_held(vm);
228
229         list_for_each_entry(q, &vm->preempt.exec_queues, compute.link) {
230                 if (q->compute.pfence) {
231                         long timeout = dma_fence_wait(q->compute.pfence, false);
232
233                         if (timeout < 0)
234                                 return -ETIME;
235                         dma_fence_put(q->compute.pfence);
236                         q->compute.pfence = NULL;
237                 }
238         }
239
240         return 0;
241 }
242
243 static bool xe_vm_is_idle(struct xe_vm *vm)
244 {
245         struct xe_exec_queue *q;
246
247         xe_vm_assert_held(vm);
248         list_for_each_entry(q, &vm->preempt.exec_queues, compute.link) {
249                 if (!xe_exec_queue_is_idle(q))
250                         return false;
251         }
252
253         return true;
254 }
255
256 static void arm_preempt_fences(struct xe_vm *vm, struct list_head *list)
257 {
258         struct list_head *link;
259         struct xe_exec_queue *q;
260
261         list_for_each_entry(q, &vm->preempt.exec_queues, compute.link) {
262                 struct dma_fence *fence;
263
264                 link = list->next;
265                 xe_assert(vm->xe, link != list);
266
267                 fence = xe_preempt_fence_arm(to_preempt_fence_from_link(link),
268                                              q, q->compute.context,
269                                              ++q->compute.seqno);
270                 dma_fence_put(q->compute.pfence);
271                 q->compute.pfence = fence;
272         }
273 }
274
275 static int add_preempt_fences(struct xe_vm *vm, struct xe_bo *bo)
276 {
277         struct xe_exec_queue *q;
278         int err;
279
280         err = xe_bo_lock(bo, true);
281         if (err)
282                 return err;
283
284         err = dma_resv_reserve_fences(bo->ttm.base.resv, vm->preempt.num_exec_queues);
285         if (err)
286                 goto out_unlock;
287
288         list_for_each_entry(q, &vm->preempt.exec_queues, compute.link)
289                 if (q->compute.pfence) {
290                         dma_resv_add_fence(bo->ttm.base.resv,
291                                            q->compute.pfence,
292                                            DMA_RESV_USAGE_BOOKKEEP);
293                 }
294
295 out_unlock:
296         xe_bo_unlock(bo);
297         return err;
298 }
299
300 /**
301  * xe_vm_fence_all_extobjs() - Add a fence to vm's external objects' resv
302  * @vm: The vm.
303  * @fence: The fence to add.
304  * @usage: The resv usage for the fence.
305  *
306  * Loops over all of the vm's external object bindings and adds a @fence
307  * with the given @usage to all of the external objects' reservation
308  * objects.
309  */
310 void xe_vm_fence_all_extobjs(struct xe_vm *vm, struct dma_fence *fence,
311                              enum dma_resv_usage usage)
312 {
313         struct xe_vma *vma;
314
315         list_for_each_entry(vma, &vm->extobj.list, extobj.link)
316                 dma_resv_add_fence(xe_vma_bo(vma)->ttm.base.resv, fence, usage);
317 }
318
319 static void resume_and_reinstall_preempt_fences(struct xe_vm *vm)
320 {
321         struct xe_exec_queue *q;
322
323         lockdep_assert_held(&vm->lock);
324         xe_vm_assert_held(vm);
325
326         list_for_each_entry(q, &vm->preempt.exec_queues, compute.link) {
327                 q->ops->resume(q);
328
329                 dma_resv_add_fence(xe_vm_resv(vm), q->compute.pfence,
330                                    DMA_RESV_USAGE_BOOKKEEP);
331                 xe_vm_fence_all_extobjs(vm, q->compute.pfence,
332                                         DMA_RESV_USAGE_BOOKKEEP);
333         }
334 }
335
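/*
 * xe_vm_add_compute_exec_queue() registers an exec queue for compute-mode
 * (long-running) use of the VM: it allocates a preempt fence for the queue,
 * installs that fence in the VM resv and in all external-object resvs, and
 * adds the queue to vm->preempt.exec_queues so the rebind worker can suspend
 * and resume it. If a repin or preemption is already pending, signaling on
 * the new fence is enabled immediately to keep it in sync.
 */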
336 int xe_vm_add_compute_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
337 {
338         struct drm_exec exec;
339         struct dma_fence *pfence;
340         int err;
341         bool wait;
342
343         xe_assert(vm->xe, xe_vm_in_compute_mode(vm));
344
345         down_write(&vm->lock);
346         drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT);
347         drm_exec_until_all_locked(&exec) {
348                 err = xe_vm_lock_dma_resv(vm, &exec, 1, true);
349                 drm_exec_retry_on_contention(&exec);
350                 if (err)
351                         goto out_unlock;
352         }
353
354         pfence = xe_preempt_fence_create(q, q->compute.context,
355                                          ++q->compute.seqno);
356         if (!pfence) {
357                 err = -ENOMEM;
358                 goto out_unlock;
359         }
360
361         list_add(&q->compute.link, &vm->preempt.exec_queues);
362         ++vm->preempt.num_exec_queues;
363         q->compute.pfence = pfence;
364
365         down_read(&vm->userptr.notifier_lock);
366
367         dma_resv_add_fence(xe_vm_resv(vm), pfence,
368                            DMA_RESV_USAGE_BOOKKEEP);
369
370         xe_vm_fence_all_extobjs(vm, pfence, DMA_RESV_USAGE_BOOKKEEP);
371
372         /*
373          * Check to see if a preemption on VM is in flight or userptr
374          * invalidation, if so trigger this preempt fence to sync state with
375          * other preempt fences on the VM.
376          */
377         wait = __xe_vm_userptr_needs_repin(vm) || preempt_fences_waiting(vm);
378         if (wait)
379                 dma_fence_enable_sw_signaling(pfence);
380
381         up_read(&vm->userptr.notifier_lock);
382
383 out_unlock:
384         drm_exec_fini(&exec);
385         up_write(&vm->lock);
386
387         return err;
388 }
389
390 /**
391  * __xe_vm_userptr_needs_repin() - Check whether the VM does have userptrs
392  * that need repinning.
393  * @vm: The VM.
394  *
395  * This function checks for whether the VM has userptrs that need repinning,
396  * and provides a release-type barrier on the userptr.notifier_lock after
397  * checking.
398  *
399  * Return: 0 if there are no userptrs needing repinning, -EAGAIN if there are.
400  */
401 int __xe_vm_userptr_needs_repin(struct xe_vm *vm)
402 {
403         lockdep_assert_held_read(&vm->userptr.notifier_lock);
404
405         return (list_empty(&vm->userptr.repin_list) &&
406                 list_empty(&vm->userptr.invalidated)) ? 0 : -EAGAIN;
407 }
408
409 /**
410  * xe_vm_lock_dma_resv() - Lock the vm dma_resv object and the dma_resv
411  * objects of the vm's external buffer objects.
412  * @vm: The vm.
413  * @exec: Pointer to a struct drm_exec locking context.
414  * @num_shared: Number of dma-fence slots to reserve in the locked objects.
415  * @lock_vm: Lock also the vm's dma_resv.
416  *
417  * Locks the vm dma-resv objects and all the dma-resv objects of the
418  * buffer objects on the vm external object list.
419  *
420  * Return: 0 on success, negative error code on error. In particular,
421  * -EINTR or -ERESTARTSYS may be returned if @exec uses interruptible waits.
422  */
423 int xe_vm_lock_dma_resv(struct xe_vm *vm, struct drm_exec *exec,
424                         unsigned int num_shared, bool lock_vm)
425 {
426         struct xe_vma *vma, *next;
427         int err = 0;
428
429         lockdep_assert_held(&vm->lock);
430
431         if (lock_vm) {
432                 err = drm_exec_prepare_obj(exec, xe_vm_obj(vm), num_shared);
433                 if (err)
434                         return err;
435         }
436
437         list_for_each_entry(vma, &vm->extobj.list, extobj.link) {
438                 err = drm_exec_prepare_obj(exec, &xe_vma_bo(vma)->ttm.base, num_shared);
439                 if (err)
440                         return err;
441         }
442
443         spin_lock(&vm->notifier.list_lock);
444         list_for_each_entry_safe(vma, next, &vm->notifier.rebind_list,
445                                  notifier.rebind_link) {
446                 xe_bo_assert_held(xe_vma_bo(vma));
447
448                 list_del_init(&vma->notifier.rebind_link);
449                 if (vma->tile_present && !(vma->gpuva.flags & XE_VMA_DESTROYED))
450                         list_move_tail(&vma->combined_links.rebind,
451                                        &vm->rebind_list);
452         }
453         spin_unlock(&vm->notifier.list_lock);
454
455         return 0;
456 }
457
458 #define XE_VM_REBIND_RETRY_TIMEOUT_MS 1000
459
460 static void xe_vm_kill(struct xe_vm *vm)
461 {
462         struct xe_exec_queue *q;
463
464         lockdep_assert_held(&vm->lock);
465
466         xe_vm_lock(vm, false);
467         vm->flags |= XE_VM_FLAG_BANNED;
468         trace_xe_vm_kill(vm);
469
470         list_for_each_entry(q, &vm->preempt.exec_queues, compute.link)
471                 q->ops->kill(q);
472         xe_vm_unlock(vm);
473
474         /* TODO: Inform user the VM is banned */
475 }
476
477 /**
478  * xe_vm_validate_should_retry() - Whether to retry after a validate error.
479  * @exec: The drm_exec object used for locking before validation.
480  * @err: The error returned from ttm_bo_validate().
481  * @end: A ktime_t cookie that should be set to 0 before first use and
482  * that should be reused on subsequent calls.
483  *
484  * With multiple active VMs, under memory pressure, it is possible that
485  * ttm_bo_validate() runs into -EDEADLK and in such a case returns -ENOMEM.
486  * Until ttm properly handles locking in such scenarios, the best thing the
487  * driver can do is retry with a timeout. Check if that is necessary, and
488  * if so unlock the drm_exec's objects while keeping the ticket to prepare
489  * for a rerun.
490  *
491  * Return: true if a retry after drm_exec_init() is recommended;
492  * false otherwise.
493  */
494 bool xe_vm_validate_should_retry(struct drm_exec *exec, int err, ktime_t *end)
495 {
496         ktime_t cur;
497
498         if (err != -ENOMEM)
499                 return false;
500
501         cur = ktime_get();
502         *end = *end ? : ktime_add_ms(cur, XE_VM_REBIND_RETRY_TIMEOUT_MS);
503         if (!ktime_before(cur, *end))
504                 return false;
505
506         /*
507          * We would like to keep the ticket here with
508          * drm_exec_unlock_all(), but WW mutex asserts currently
509          * stop us from that. In any case this function could go away
510          * with proper TTM -EDEADLK handling.
511          */
512         drm_exec_fini(exec);
513
514         msleep(20);
515         return true;
516 }
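/*
 * Minimal usage sketch for xe_vm_validate_should_retry() (the real in-tree
 * user is preempt_rebind_work_func() below). lock_and_validate() is a
 * hypothetical placeholder for caller-specific locking and validation. The
 * @end cookie must start at 0 and be reused across attempts, and on a "retry"
 * return the drm_exec has already been finalized, so only re-init it:
 *
 *	struct drm_exec exec;
 *	ktime_t end = 0;
 *	int err;
 *
 * retry:
 *	drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT);
 *	drm_exec_until_all_locked(&exec) {
 *		err = lock_and_validate(&exec);
 *		drm_exec_retry_on_contention(&exec);
 *	}
 *	if (err && xe_vm_validate_should_retry(&exec, err, &end))
 *		goto retry;
 *	if (!err)
 *		... use the locked objects ...
 *	drm_exec_fini(&exec);
 */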
517
518 static int xe_preempt_work_begin(struct drm_exec *exec, struct xe_vm *vm,
519                                  bool *done)
520 {
521         struct xe_vma *vma;
522         int err;
523
524         /*
525          * 1 fence for each preempt fence plus a fence for each tile from a
526          * possible rebind
527          */
528         err = drm_exec_prepare_obj(exec, xe_vm_obj(vm),
529                                    vm->preempt.num_exec_queues +
530                                    vm->xe->info.tile_count);
531         if (err)
532                 return err;
533
534         if (xe_vm_is_idle(vm)) {
535                 vm->preempt.rebind_deactivated = true;
536                 *done = true;
537                 return 0;
538         }
539
540         if (!preempt_fences_waiting(vm)) {
541                 *done = true;
542                 return 0;
543         }
544
545         err = xe_vm_lock_dma_resv(vm, exec, vm->preempt.num_exec_queues, false);
546         if (err)
547                 return err;
548
549         err = wait_for_existing_preempt_fences(vm);
550         if (err)
551                 return err;
552
553         list_for_each_entry(vma, &vm->rebind_list, combined_links.rebind) {
554                 if (xe_vma_has_no_bo(vma) ||
555                     vma->gpuva.flags & XE_VMA_DESTROYED)
556                         continue;
557
558                 err = xe_bo_validate(xe_vma_bo(vma), vm, false);
559                 if (err)
560                         break;
561         }
562
563         return err;
564 }
565
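/*
 * preempt_rebind_work_func() is the compute-mode rebind worker: with vm->lock
 * held in write mode it repins invalidated userptrs, locks the VM and
 * external-BO resvs, revalidates evicted BOs, issues rebinds and waits for
 * them, and finally, under the userptr notifier lock, arms fresh preempt
 * fences and resumes the exec queues. On -EAGAIN (contention, a new userptr
 * invalidation, or a validation retry) the whole sequence is restarted.
 */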
566 static void preempt_rebind_work_func(struct work_struct *w)
567 {
568         struct xe_vm *vm = container_of(w, struct xe_vm, preempt.rebind_work);
569         struct drm_exec exec;
570         struct dma_fence *rebind_fence;
571         unsigned int fence_count = 0;
572         LIST_HEAD(preempt_fences);
573         ktime_t end = 0;
574         int err;
575         long wait;
576         int __maybe_unused tries = 0;
577
578         xe_assert(vm->xe, xe_vm_in_compute_mode(vm));
579         trace_xe_vm_rebind_worker_enter(vm);
580
581         down_write(&vm->lock);
582
583         if (xe_vm_is_closed_or_banned(vm)) {
584                 up_write(&vm->lock);
585                 trace_xe_vm_rebind_worker_exit(vm);
586                 return;
587         }
588
589 retry:
590         if (vm->async_ops.error)
591                 goto out_unlock_outer;
592
593         /*
594          * Extreme corner where we exit a VM error state with a munmap style VM
595          * unbind inflight which requires a rebind. In this case the rebind
596          * needs to install some fences into the dma-resv slots. The worker to
597          * do this is queued; let that worker make progress by dropping vm->lock
598          * and trying this again.
599          */
600         if (vm->async_ops.munmap_rebind_inflight) {
601                 up_write(&vm->lock);
602                 flush_work(&vm->async_ops.work);
603                 goto retry;
604         }
605
606         if (xe_vm_userptr_check_repin(vm)) {
607                 err = xe_vm_userptr_pin(vm);
608                 if (err)
609                         goto out_unlock_outer;
610         }
611
612         drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT);
613
614         drm_exec_until_all_locked(&exec) {
615                 bool done = false;
616
617                 err = xe_preempt_work_begin(&exec, vm, &done);
618                 drm_exec_retry_on_contention(&exec);
619                 if (err && xe_vm_validate_should_retry(&exec, err, &end)) {
620                         err = -EAGAIN;
621                         goto out_unlock_outer;
622                 }
623                 if (err || done)
624                         goto out_unlock;
625         }
626
627         err = alloc_preempt_fences(vm, &preempt_fences, &fence_count);
628         if (err)
629                 goto out_unlock;
630
631         rebind_fence = xe_vm_rebind(vm, true);
632         if (IS_ERR(rebind_fence)) {
633                 err = PTR_ERR(rebind_fence);
634                 goto out_unlock;
635         }
636
637         if (rebind_fence) {
638                 dma_fence_wait(rebind_fence, false);
639                 dma_fence_put(rebind_fence);
640         }
641
642         /* Wait on munmap style VM unbinds */
643         wait = dma_resv_wait_timeout(xe_vm_resv(vm),
644                                      DMA_RESV_USAGE_KERNEL,
645                                      false, MAX_SCHEDULE_TIMEOUT);
646         if (wait <= 0) {
647                 err = -ETIME;
648                 goto out_unlock;
649         }
650
651 #define retry_required(__tries, __vm) \
652         (IS_ENABLED(CONFIG_DRM_XE_USERPTR_INVAL_INJECT) ? \
653         (!(__tries)++ || __xe_vm_userptr_needs_repin(__vm)) : \
654         __xe_vm_userptr_needs_repin(__vm))
655
656         down_read(&vm->userptr.notifier_lock);
657         if (retry_required(tries, vm)) {
658                 up_read(&vm->userptr.notifier_lock);
659                 err = -EAGAIN;
660                 goto out_unlock;
661         }
662
663 #undef retry_required
664
665         spin_lock(&vm->xe->ttm.lru_lock);
666         ttm_lru_bulk_move_tail(&vm->lru_bulk_move);
667         spin_unlock(&vm->xe->ttm.lru_lock);
668
669         /* Point of no return. */
670         arm_preempt_fences(vm, &preempt_fences);
671         resume_and_reinstall_preempt_fences(vm);
672         up_read(&vm->userptr.notifier_lock);
673
674 out_unlock:
675         drm_exec_fini(&exec);
676 out_unlock_outer:
677         if (err == -EAGAIN) {
678                 trace_xe_vm_rebind_worker_retry(vm);
679                 goto retry;
680         }
681
682         if (err) {
683                 drm_warn(&vm->xe->drm, "VM worker error: %d\n", err);
684                 xe_vm_kill(vm);
685         }
686         up_write(&vm->lock);
687
688         free_preempt_fences(&preempt_fences);
689
690         trace_xe_vm_rebind_worker_exit(vm);
691 }
692
693 static bool vma_userptr_invalidate(struct mmu_interval_notifier *mni,
694                                    const struct mmu_notifier_range *range,
695                                    unsigned long cur_seq)
696 {
697         struct xe_vma *vma = container_of(mni, struct xe_vma, userptr.notifier);
698         struct xe_vm *vm = xe_vma_vm(vma);
699         struct dma_resv_iter cursor;
700         struct dma_fence *fence;
701         long err;
702
703         xe_assert(vm->xe, xe_vma_is_userptr(vma));
704         trace_xe_vma_userptr_invalidate(vma);
705
706         if (!mmu_notifier_range_blockable(range))
707                 return false;
708
709         down_write(&vm->userptr.notifier_lock);
710         mmu_interval_set_seq(mni, cur_seq);
711
712         /* No need to stop gpu access if the userptr is not yet bound. */
713         if (!vma->userptr.initial_bind) {
714                 up_write(&vm->userptr.notifier_lock);
715                 return true;
716         }
717
718         /*
719          * Tell exec and rebind worker they need to repin and rebind this
720          * userptr.
721          */
722         if (!xe_vm_in_fault_mode(vm) &&
723             !(vma->gpuva.flags & XE_VMA_DESTROYED) && vma->tile_present) {
724                 spin_lock(&vm->userptr.invalidated_lock);
725                 list_move_tail(&vma->userptr.invalidate_link,
726                                &vm->userptr.invalidated);
727                 spin_unlock(&vm->userptr.invalidated_lock);
728         }
729
730         up_write(&vm->userptr.notifier_lock);
731
732         /*
733          * Preempt fences turn into schedule disables, pipeline these.
734          * Note that even in fault mode, we need to wait for binds and
735          * unbinds to complete, and those are attached as BOOKKEEP fences
736          * to the vm.
737          */
738         dma_resv_iter_begin(&cursor, xe_vm_resv(vm),
739                             DMA_RESV_USAGE_BOOKKEEP);
740         dma_resv_for_each_fence_unlocked(&cursor, fence)
741                 dma_fence_enable_sw_signaling(fence);
742         dma_resv_iter_end(&cursor);
743
744         err = dma_resv_wait_timeout(xe_vm_resv(vm),
745                                     DMA_RESV_USAGE_BOOKKEEP,
746                                     false, MAX_SCHEDULE_TIMEOUT);
747         XE_WARN_ON(err <= 0);
748
749         if (xe_vm_in_fault_mode(vm)) {
750                 err = xe_vm_invalidate_vma(vma);
751                 XE_WARN_ON(err);
752         }
753
754         trace_xe_vma_userptr_invalidate_complete(vma);
755
756         return true;
757 }
758
759 static const struct mmu_interval_notifier_ops vma_userptr_notifier_ops = {
760         .invalidate = vma_userptr_invalidate,
761 };
762
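/*
 * xe_vm_userptr_pin() repins all invalidated userptr VMAs of the VM: it
 * collects them from the invalidated list, repins their pages via
 * xe_vma_userptr_pin_pages(), and, under the VM resv lock, moves them to the
 * rebind list so the next exec or rebind-worker pass re-establishes their GPU
 * bindings.
 */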
763 int xe_vm_userptr_pin(struct xe_vm *vm)
764 {
765         struct xe_vma *vma, *next;
766         int err = 0;
767         LIST_HEAD(tmp_evict);
768
769         lockdep_assert_held_write(&vm->lock);
770
771         /* Collect invalidated userptrs */
772         spin_lock(&vm->userptr.invalidated_lock);
773         list_for_each_entry_safe(vma, next, &vm->userptr.invalidated,
774                                  userptr.invalidate_link) {
775                 list_del_init(&vma->userptr.invalidate_link);
776                 if (list_empty(&vma->combined_links.userptr))
777                         list_move_tail(&vma->combined_links.userptr,
778                                        &vm->userptr.repin_list);
779         }
780         spin_unlock(&vm->userptr.invalidated_lock);
781
782         /* Pin and move to temporary list */
783         list_for_each_entry_safe(vma, next, &vm->userptr.repin_list,
784                                  combined_links.userptr) {
785                 err = xe_vma_userptr_pin_pages(vma);
786                 if (err < 0)
787                         goto out_err;
788
789                 list_move_tail(&vma->combined_links.userptr, &tmp_evict);
790         }
791
792         /* Take lock and move to rebind_list for rebinding. */
793         err = dma_resv_lock_interruptible(xe_vm_resv(vm), NULL);
794         if (err)
795                 goto out_err;
796
797         list_for_each_entry_safe(vma, next, &tmp_evict, combined_links.userptr)
798                 list_move_tail(&vma->combined_links.rebind, &vm->rebind_list);
799
800         dma_resv_unlock(xe_vm_resv(vm));
801
802         return 0;
803
804 out_err:
805         list_splice_tail(&tmp_evict, &vm->userptr.repin_list);
806
807         return err;
808 }
809
810 /**
811  * xe_vm_userptr_check_repin() - Check whether the VM might have userptrs
812  * that need repinning.
813  * @vm: The VM.
814  *
815  * This function does an advisory check for whether the VM has userptrs that
816  * need repinning.
817  *
818  * Return: 0 if there are no indications of userptrs needing repinning,
819  * -EAGAIN if there are.
820  */
821 int xe_vm_userptr_check_repin(struct xe_vm *vm)
822 {
823         return (list_empty_careful(&vm->userptr.repin_list) &&
824                 list_empty_careful(&vm->userptr.invalidated)) ? 0 : -EAGAIN;
825 }
826
827 static struct dma_fence *
828 xe_vm_bind_vma(struct xe_vma *vma, struct xe_exec_queue *q,
829                struct xe_sync_entry *syncs, u32 num_syncs,
830                bool first_op, bool last_op);
831
832 struct dma_fence *xe_vm_rebind(struct xe_vm *vm, bool rebind_worker)
833 {
834         struct dma_fence *fence = NULL;
835         struct xe_vma *vma, *next;
836
837         lockdep_assert_held(&vm->lock);
838         if (xe_vm_no_dma_fences(vm) && !rebind_worker)
839                 return NULL;
840
841         xe_vm_assert_held(vm);
842         list_for_each_entry_safe(vma, next, &vm->rebind_list,
843                                  combined_links.rebind) {
844                 xe_assert(vm->xe, vma->tile_present);
845
846                 list_del_init(&vma->combined_links.rebind);
847                 dma_fence_put(fence);
848                 if (rebind_worker)
849                         trace_xe_vma_rebind_worker(vma);
850                 else
851                         trace_xe_vma_rebind_exec(vma);
852                 fence = xe_vm_bind_vma(vma, NULL, NULL, 0, false, false);
853                 if (IS_ERR(fence))
854                         return fence;
855         }
856
857         return fence;
858 }
859
860 static struct xe_vma *xe_vma_create(struct xe_vm *vm,
861                                     struct xe_bo *bo,
862                                     u64 bo_offset_or_userptr,
863                                     u64 start, u64 end,
864                                     bool read_only,
865                                     bool is_null,
866                                     u8 tile_mask)
867 {
868         struct xe_vma *vma;
869         struct xe_tile *tile;
870         u8 id;
871
872         xe_assert(vm->xe, start < end);
873         xe_assert(vm->xe, end < vm->size);
874
875         if (!bo && !is_null)    /* userptr */
876                 vma = kzalloc(sizeof(*vma), GFP_KERNEL);
877         else
878                 vma = kzalloc(sizeof(*vma) - sizeof(struct xe_userptr),
879                               GFP_KERNEL);
880         if (!vma) {
881                 vma = ERR_PTR(-ENOMEM);
882                 return vma;
883         }
884
885         INIT_LIST_HEAD(&vma->combined_links.rebind);
886         INIT_LIST_HEAD(&vma->notifier.rebind_link);
887         INIT_LIST_HEAD(&vma->extobj.link);
888
889         INIT_LIST_HEAD(&vma->gpuva.gem.entry);
890         vma->gpuva.vm = &vm->gpuvm;
891         vma->gpuva.va.addr = start;
892         vma->gpuva.va.range = end - start + 1;
893         if (read_only)
894                 vma->gpuva.flags |= XE_VMA_READ_ONLY;
895         if (is_null)
896                 vma->gpuva.flags |= DRM_GPUVA_SPARSE;
897
898         if (tile_mask) {
899                 vma->tile_mask = tile_mask;
900         } else {
901                 for_each_tile(tile, vm->xe, id)
902                         vma->tile_mask |= 0x1 << id;
903         }
904
905         if (GRAPHICS_VER(vm->xe) >= 20 || vm->xe->info.platform == XE_PVC)
906                 vma->gpuva.flags |= XE_VMA_ATOMIC_PTE_BIT;
907
908         if (bo) {
909                 struct drm_gpuvm_bo *vm_bo;
910
911                 xe_bo_assert_held(bo);
912
913                 vm_bo = drm_gpuvm_bo_obtain(vma->gpuva.vm, &bo->ttm.base);
914                 if (IS_ERR(vm_bo)) {
915                         kfree(vma);
916                         return ERR_CAST(vm_bo);
917                 }
918
919                 drm_gem_object_get(&bo->ttm.base);
920                 vma->gpuva.gem.obj = &bo->ttm.base;
921                 vma->gpuva.gem.offset = bo_offset_or_userptr;
922                 drm_gpuva_link(&vma->gpuva, vm_bo);
923                 drm_gpuvm_bo_put(vm_bo);
924         } else /* userptr or null */ {
925                 if (!is_null) {
926                         u64 size = end - start + 1;
927                         int err;
928
929                         INIT_LIST_HEAD(&vma->userptr.invalidate_link);
930                         vma->gpuva.gem.offset = bo_offset_or_userptr;
931
932                         err = mmu_interval_notifier_insert(&vma->userptr.notifier,
933                                                            current->mm,
934                                                            xe_vma_userptr(vma), size,
935                                                            &vma_userptr_notifier_ops);
936                         if (err) {
937                                 kfree(vma);
938                                 vma = ERR_PTR(err);
939                                 return vma;
940                         }
941
942                         vma->userptr.notifier_seq = LONG_MAX;
943                 }
944
945                 xe_vm_get(vm);
946         }
947
948         return vma;
949 }
950
951 static bool vm_remove_extobj(struct xe_vma *vma)
952 {
953         if (!list_empty(&vma->extobj.link)) {
954                 xe_vma_vm(vma)->extobj.entries--;
955                 list_del_init(&vma->extobj.link);
956                 return true;
957         }
958         return false;
959 }
960
961 static void xe_vma_destroy_late(struct xe_vma *vma)
962 {
963         struct xe_vm *vm = xe_vma_vm(vma);
964         struct xe_device *xe = vm->xe;
965         bool read_only = xe_vma_read_only(vma);
966
967         if (xe_vma_is_userptr(vma)) {
968                 if (vma->userptr.sg) {
969                         dma_unmap_sgtable(xe->drm.dev,
970                                           vma->userptr.sg,
971                                           read_only ? DMA_TO_DEVICE :
972                                           DMA_BIDIRECTIONAL, 0);
973                         sg_free_table(vma->userptr.sg);
974                         vma->userptr.sg = NULL;
975                 }
976
977                 /*
978                  * Since userptr pages are not pinned, we can't remove
979                  * the notifier until we're sure the GPU is not accessing
980                  * them anymore
981                  */
982                 mmu_interval_notifier_remove(&vma->userptr.notifier);
983                 xe_vm_put(vm);
984         } else if (xe_vma_is_null(vma)) {
985                 xe_vm_put(vm);
986         } else {
987                 xe_bo_put(xe_vma_bo(vma));
988         }
989
990         kfree(vma);
991 }
992
993 static void vma_destroy_work_func(struct work_struct *w)
994 {
995         struct xe_vma *vma =
996                 container_of(w, struct xe_vma, destroy_work);
997
998         xe_vma_destroy_late(vma);
999 }
1000
1001 static struct xe_vma *
1002 bo_has_vm_references_locked(struct xe_bo *bo, struct xe_vm *vm,
1003                             struct xe_vma *ignore)
1004 {
1005         struct drm_gpuvm_bo *vm_bo;
1006         struct drm_gpuva *va;
1007         struct drm_gem_object *obj = &bo->ttm.base;
1008
1009         xe_bo_assert_held(bo);
1010
1011         drm_gem_for_each_gpuvm_bo(vm_bo, obj) {
1012                 drm_gpuvm_bo_for_each_va(va, vm_bo) {
1013                         struct xe_vma *vma = gpuva_to_vma(va);
1014
1015                         if (vma != ignore && xe_vma_vm(vma) == vm)
1016                                 return vma;
1017                 }
1018         }
1019
1020         return NULL;
1021 }
1022
1023 static bool bo_has_vm_references(struct xe_bo *bo, struct xe_vm *vm,
1024                                  struct xe_vma *ignore)
1025 {
1026         bool ret;
1027
1028         xe_bo_lock(bo, false);
1029         ret = !!bo_has_vm_references_locked(bo, vm, ignore);
1030         xe_bo_unlock(bo);
1031
1032         return ret;
1033 }
1034
1035 static void __vm_insert_extobj(struct xe_vm *vm, struct xe_vma *vma)
1036 {
1037         lockdep_assert_held_write(&vm->lock);
1038
1039         list_add(&vma->extobj.link, &vm->extobj.list);
1040         vm->extobj.entries++;
1041 }
1042
1043 static void vm_insert_extobj(struct xe_vm *vm, struct xe_vma *vma)
1044 {
1045         struct xe_bo *bo = xe_vma_bo(vma);
1046
1047         lockdep_assert_held_write(&vm->lock);
1048
1049         if (bo_has_vm_references(bo, vm, vma))
1050                 return;
1051
1052         __vm_insert_extobj(vm, vma);
1053 }
1054
1055 static void vma_destroy_cb(struct dma_fence *fence,
1056                            struct dma_fence_cb *cb)
1057 {
1058         struct xe_vma *vma = container_of(cb, struct xe_vma, destroy_cb);
1059
1060         INIT_WORK(&vma->destroy_work, vma_destroy_work_func);
1061         queue_work(system_unbound_wq, &vma->destroy_work);
1062 }
1063
1064 static void xe_vma_destroy(struct xe_vma *vma, struct dma_fence *fence)
1065 {
1066         struct xe_vm *vm = xe_vma_vm(vma);
1067
1068         lockdep_assert_held_write(&vm->lock);
1069         xe_assert(vm->xe, list_empty(&vma->combined_links.destroy));
1070
1071         if (xe_vma_is_userptr(vma)) {
1072                 xe_assert(vm->xe, vma->gpuva.flags & XE_VMA_DESTROYED);
1073
1074                 spin_lock(&vm->userptr.invalidated_lock);
1075                 list_del(&vma->userptr.invalidate_link);
1076                 spin_unlock(&vm->userptr.invalidated_lock);
1077         } else if (!xe_vma_is_null(vma)) {
1078                 xe_bo_assert_held(xe_vma_bo(vma));
1079
1080                 spin_lock(&vm->notifier.list_lock);
1081                 list_del(&vma->notifier.rebind_link);
1082                 spin_unlock(&vm->notifier.list_lock);
1083
1084                 drm_gpuva_unlink(&vma->gpuva);
1085
1086                 if (!xe_vma_bo(vma)->vm && vm_remove_extobj(vma)) {
1087                         struct xe_vma *other;
1088
1089                         other = bo_has_vm_references_locked(xe_vma_bo(vma), vm, NULL);
1090
1091                         if (other)
1092                                 __vm_insert_extobj(vm, other);
1093                 }
1094         }
1095
1096         xe_vm_assert_held(vm);
1097         if (fence) {
1098                 int ret = dma_fence_add_callback(fence, &vma->destroy_cb,
1099                                                  vma_destroy_cb);
1100
1101                 if (ret) {
1102                         XE_WARN_ON(ret != -ENOENT);
1103                         xe_vma_destroy_late(vma);
1104                 }
1105         } else {
1106                 xe_vma_destroy_late(vma);
1107         }
1108 }
1109
1110 /**
1111  * xe_vm_prepare_vma() - drm_exec utility to lock a vma
1112  * @exec: The drm_exec object we're currently locking for.
1113  * @vma: The vma for which we want to lock the vm resv and any attached
1114  * object's resv.
1115  * @num_shared: The number of dma-fence slots to pre-allocate in the
1116  * objects' reservation objects.
1117  *
1118  * Return: 0 on success, negative error code on error. In particular
1119  * may return -EDEADLK on WW transaction contention and -EINTR if
1120  * an interruptible wait is terminated by a signal.
1121  */
1122 int xe_vm_prepare_vma(struct drm_exec *exec, struct xe_vma *vma,
1123                       unsigned int num_shared)
1124 {
1125         struct xe_vm *vm = xe_vma_vm(vma);
1126         struct xe_bo *bo = xe_vma_bo(vma);
1127         int err;
1128
1129         XE_WARN_ON(!vm);
1130         err = drm_exec_prepare_obj(exec, xe_vm_obj(vm), num_shared);
1131         if (!err && bo && !bo->vm)
1132                 err = drm_exec_prepare_obj(exec, &bo->ttm.base, num_shared);
1133
1134         return err;
1135 }
1136
1137 static void xe_vma_destroy_unlocked(struct xe_vma *vma)
1138 {
1139         struct drm_exec exec;
1140         int err;
1141
1142         drm_exec_init(&exec, 0);
1143         drm_exec_until_all_locked(&exec) {
1144                 err = xe_vm_prepare_vma(&exec, vma, 0);
1145                 drm_exec_retry_on_contention(&exec);
1146                 if (XE_WARN_ON(err))
1147                         break;
1148         }
1149
1150         xe_vma_destroy(vma, NULL);
1151
1152         drm_exec_fini(&exec);
1153 }
1154
1155 struct xe_vma *
1156 xe_vm_find_overlapping_vma(struct xe_vm *vm, u64 start, u64 range)
1157 {
1158         struct drm_gpuva *gpuva;
1159
1160         lockdep_assert_held(&vm->lock);
1161
1162         if (xe_vm_is_closed_or_banned(vm))
1163                 return NULL;
1164
1165         xe_assert(vm->xe, start + range <= vm->size);
1166
1167         gpuva = drm_gpuva_find_first(&vm->gpuvm, start, range);
1168
1169         return gpuva ? gpuva_to_vma(gpuva) : NULL;
1170 }
1171
1172 static int xe_vm_insert_vma(struct xe_vm *vm, struct xe_vma *vma)
1173 {
1174         int err;
1175
1176         xe_assert(vm->xe, xe_vma_vm(vma) == vm);
1177         lockdep_assert_held(&vm->lock);
1178
1179         err = drm_gpuva_insert(&vm->gpuvm, &vma->gpuva);
1180         XE_WARN_ON(err);        /* Shouldn't be possible */
1181
1182         return err;
1183 }
1184
1185 static void xe_vm_remove_vma(struct xe_vm *vm, struct xe_vma *vma)
1186 {
1187         xe_assert(vm->xe, xe_vma_vm(vma) == vm);
1188         lockdep_assert_held(&vm->lock);
1189
1190         drm_gpuva_remove(&vma->gpuva);
1191         if (vm->usm.last_fault_vma == vma)
1192                 vm->usm.last_fault_vma = NULL;
1193 }
1194
1195 static struct drm_gpuva_op *xe_vm_op_alloc(void)
1196 {
1197         struct xe_vma_op *op;
1198
1199         op = kzalloc(sizeof(*op), GFP_KERNEL);
1200
1201         if (unlikely(!op))
1202                 return NULL;
1203
1204         return &op->base;
1205 }
1206
1207 static void xe_vm_free(struct drm_gpuvm *gpuvm);
1208
1209 static struct drm_gpuvm_ops gpuvm_ops = {
1210         .op_alloc = xe_vm_op_alloc,
1211         .vm_free = xe_vm_free,
1212 };
1213
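/*
 * The helpers below translate a platform PAT index and page-table level into
 * the scattered PAT/PS bits of the XeLP PDE/PTE layout; they back the
 * xelp_pt_ops encode callbacks defined further down.
 */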
1214 static u64 pde_encode_pat_index(struct xe_device *xe, u16 pat_index)
1215 {
1216         u64 pte = 0;
1217
1218         if (pat_index & BIT(0))
1219                 pte |= XE_PPGTT_PTE_PAT0;
1220
1221         if (pat_index & BIT(1))
1222                 pte |= XE_PPGTT_PTE_PAT1;
1223
1224         return pte;
1225 }
1226
1227 static u64 pte_encode_pat_index(struct xe_device *xe, u16 pat_index)
1228 {
1229         u64 pte = 0;
1230
1231         if (pat_index & BIT(0))
1232                 pte |= XE_PPGTT_PTE_PAT0;
1233
1234         if (pat_index & BIT(1))
1235                 pte |= XE_PPGTT_PTE_PAT1;
1236
1237         if (pat_index & BIT(2))
1238                 pte |= XE_PPGTT_PTE_PAT2;
1239
1240         if (pat_index & BIT(3))
1241                 pte |= XELPG_PPGTT_PTE_PAT3;
1242
1243         if (pat_index & (BIT(4)))
1244                 pte |= XE2_PPGTT_PTE_PAT4;
1245
1246         return pte;
1247 }
1248
1249 static u64 pte_encode_ps(u32 pt_level)
1250 {
1251         XE_WARN_ON(pt_level > 2);
1252
1253         if (pt_level == 1)
1254                 return XE_PDE_PS_2M;
1255         else if (pt_level == 2)
1256                 return XE_PDPE_PS_1G;
1257
1258         return 0;
1259 }
1260
1261 static u64 xelp_pde_encode_bo(struct xe_bo *bo, u64 bo_offset,
1262                               const u16 pat_index)
1263 {
1264         struct xe_device *xe = xe_bo_device(bo);
1265         u64 pde;
1266
1267         pde = xe_bo_addr(bo, bo_offset, XE_PAGE_SIZE);
1268         pde |= XE_PAGE_PRESENT | XE_PAGE_RW;
1269         pde |= pde_encode_pat_index(xe, pat_index);
1270
1271         return pde;
1272 }
1273
1274 static u64 xelp_pte_encode_bo(struct xe_bo *bo, u64 bo_offset,
1275                               u16 pat_index, u32 pt_level)
1276 {
1277         struct xe_device *xe = xe_bo_device(bo);
1278         u64 pte;
1279
1280         pte = xe_bo_addr(bo, bo_offset, XE_PAGE_SIZE);
1281         pte |= XE_PAGE_PRESENT | XE_PAGE_RW;
1282         pte |= pte_encode_pat_index(xe, pat_index);
1283         pte |= pte_encode_ps(pt_level);
1284
1285         if (xe_bo_is_vram(bo) || xe_bo_is_stolen_devmem(bo))
1286                 pte |= XE_PPGTT_PTE_DM;
1287
1288         return pte;
1289 }
1290
1291 static u64 xelp_pte_encode_vma(u64 pte, struct xe_vma *vma,
1292                                u16 pat_index, u32 pt_level)
1293 {
1294         struct xe_device *xe = xe_vma_vm(vma)->xe;
1295
1296         pte |= XE_PAGE_PRESENT;
1297
1298         if (likely(!xe_vma_read_only(vma)))
1299                 pte |= XE_PAGE_RW;
1300
1301         pte |= pte_encode_pat_index(xe, pat_index);
1302         pte |= pte_encode_ps(pt_level);
1303
1304         if (unlikely(xe_vma_is_null(vma)))
1305                 pte |= XE_PTE_NULL;
1306
1307         return pte;
1308 }
1309
1310 static u64 xelp_pte_encode_addr(struct xe_device *xe, u64 addr,
1311                                 u16 pat_index,
1312                                 u32 pt_level, bool devmem, u64 flags)
1313 {
1314         u64 pte;
1315
1316         /* Avoid passing random bits directly as flags */
1317         xe_assert(xe, !(flags & ~XE_PTE_PS64));
1318
1319         pte = addr;
1320         pte |= XE_PAGE_PRESENT | XE_PAGE_RW;
1321         pte |= pte_encode_pat_index(xe, pat_index);
1322         pte |= pte_encode_ps(pt_level);
1323
1324         if (devmem)
1325                 pte |= XE_PPGTT_PTE_DM;
1326
1327         pte |= flags;
1328
1329         return pte;
1330 }
1331
1332 static const struct xe_pt_ops xelp_pt_ops = {
1333         .pte_encode_bo = xelp_pte_encode_bo,
1334         .pte_encode_vma = xelp_pte_encode_vma,
1335         .pte_encode_addr = xelp_pte_encode_addr,
1336         .pde_encode_bo = xelp_pde_encode_bo,
1337 };
1338
1339 static void xe_vma_op_work_func(struct work_struct *w);
1340 static void vm_destroy_work_func(struct work_struct *w);
1341
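/*
 * xe_vm_create() allocates and initializes a VM: the bookkeeping lists and
 * locks, the GPUVM/resv backing object, per-tile page-table roots and
 * optional scratch tables, compute-mode and async-bind state, and (for
 * non-migration VMs) a per-tile copy-class exec queue used for binds.
 */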
1342 struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags)
1343 {
1344         struct drm_gem_object *vm_resv_obj;
1345         struct xe_vm *vm;
1346         int err, number_tiles = 0;
1347         struct xe_tile *tile;
1348         u8 id;
1349
1350         vm = kzalloc(sizeof(*vm), GFP_KERNEL);
1351         if (!vm)
1352                 return ERR_PTR(-ENOMEM);
1353
1354         vm->xe = xe;
1355
1356         vm->size = 1ull << xe->info.va_bits;
1357
1358         vm->flags = flags;
1359
1360         init_rwsem(&vm->lock);
1361
1362         INIT_LIST_HEAD(&vm->rebind_list);
1363
1364         INIT_LIST_HEAD(&vm->userptr.repin_list);
1365         INIT_LIST_HEAD(&vm->userptr.invalidated);
1366         init_rwsem(&vm->userptr.notifier_lock);
1367         spin_lock_init(&vm->userptr.invalidated_lock);
1368
1369         INIT_LIST_HEAD(&vm->notifier.rebind_list);
1370         spin_lock_init(&vm->notifier.list_lock);
1371
1372         INIT_LIST_HEAD(&vm->async_ops.pending);
1373         INIT_WORK(&vm->async_ops.work, xe_vma_op_work_func);
1374         spin_lock_init(&vm->async_ops.lock);
1375
1376         INIT_WORK(&vm->destroy_work, vm_destroy_work_func);
1377
1378         INIT_LIST_HEAD(&vm->preempt.exec_queues);
1379         vm->preempt.min_run_period_ms = 10;     /* FIXME: Wire up to uAPI */
1380
1381         for_each_tile(tile, xe, id)
1382                 xe_range_fence_tree_init(&vm->rftree[id]);
1383
1384         INIT_LIST_HEAD(&vm->extobj.list);
1385
1386         vm->pt_ops = &xelp_pt_ops;
1387
1388         if (!(flags & XE_VM_FLAG_MIGRATION))
1389                 xe_device_mem_access_get(xe);
1390
1391         vm_resv_obj = drm_gpuvm_resv_object_alloc(&xe->drm);
1392         if (!vm_resv_obj) {
1393                 err = -ENOMEM;
1394                 goto err_no_resv;
1395         }
1396
1397         drm_gpuvm_init(&vm->gpuvm, "Xe VM", 0, &xe->drm, vm_resv_obj,
1398                        0, vm->size, 0, 0, &gpuvm_ops);
1399
1400         drm_gem_object_put(vm_resv_obj);
1401
1402         err = dma_resv_lock_interruptible(xe_vm_resv(vm), NULL);
1403         if (err)
1404                 goto err_close;
1405
1406         if (IS_DGFX(xe) && xe->info.vram_flags & XE_VRAM_FLAGS_NEED64K)
1407                 vm->flags |= XE_VM_FLAG_64K;
1408
1409         for_each_tile(tile, xe, id) {
1410                 if (flags & XE_VM_FLAG_MIGRATION &&
1411                     tile->id != XE_VM_FLAG_TILE_ID(flags))
1412                         continue;
1413
1414                 vm->pt_root[id] = xe_pt_create(vm, tile, xe->info.vm_max_level);
1415                 if (IS_ERR(vm->pt_root[id])) {
1416                         err = PTR_ERR(vm->pt_root[id]);
1417                         vm->pt_root[id] = NULL;
1418                         goto err_unlock_close;
1419                 }
1420         }
1421
1422         if (flags & XE_VM_FLAG_SCRATCH_PAGE) {
1423                 for_each_tile(tile, xe, id) {
1424                         if (!vm->pt_root[id])
1425                                 continue;
1426
1427                         err = xe_pt_create_scratch(xe, tile, vm);
1428                         if (err)
1429                                 goto err_unlock_close;
1430                 }
1431                 vm->batch_invalidate_tlb = true;
1432         }
1433
1434         if (flags & XE_VM_FLAG_COMPUTE_MODE) {
1435                 INIT_WORK(&vm->preempt.rebind_work, preempt_rebind_work_func);
1436                 vm->flags |= XE_VM_FLAG_COMPUTE_MODE;
1437                 vm->batch_invalidate_tlb = false;
1438         }
1439
1440         if (flags & XE_VM_FLAG_ASYNC_BIND_OPS) {
1441                 vm->async_ops.fence.context = dma_fence_context_alloc(1);
1442                 vm->flags |= XE_VM_FLAG_ASYNC_BIND_OPS;
1443         }
1444
1445         /* Fill pt_root after allocating scratch tables */
1446         for_each_tile(tile, xe, id) {
1447                 if (!vm->pt_root[id])
1448                         continue;
1449
1450                 xe_pt_populate_empty(tile, vm, vm->pt_root[id]);
1451         }
1452         dma_resv_unlock(xe_vm_resv(vm));
1453
1454         /* Kernel migration VM shouldn't have a circular loop. */
1455         if (!(flags & XE_VM_FLAG_MIGRATION)) {
1456                 for_each_tile(tile, xe, id) {
1457                         struct xe_gt *gt = tile->primary_gt;
1458                         struct xe_vm *migrate_vm;
1459                         struct xe_exec_queue *q;
1460
1461                         if (!vm->pt_root[id])
1462                                 continue;
1463
1464                         migrate_vm = xe_migrate_get_vm(tile->migrate);
1465                         q = xe_exec_queue_create_class(xe, gt, migrate_vm,
1466                                                        XE_ENGINE_CLASS_COPY,
1467                                                        EXEC_QUEUE_FLAG_VM);
1468                         xe_vm_put(migrate_vm);
1469                         if (IS_ERR(q)) {
1470                                 err = PTR_ERR(q);
1471                                 goto err_close;
1472                         }
1473                         vm->q[id] = q;
1474                         number_tiles++;
1475                 }
1476         }
1477
1478         if (number_tiles > 1)
1479                 vm->composite_fence_ctx = dma_fence_context_alloc(1);
1480
1481         mutex_lock(&xe->usm.lock);
1482         if (flags & XE_VM_FLAG_FAULT_MODE)
1483                 xe->usm.num_vm_in_fault_mode++;
1484         else if (!(flags & XE_VM_FLAG_MIGRATION))
1485                 xe->usm.num_vm_in_non_fault_mode++;
1486         mutex_unlock(&xe->usm.lock);
1487
1488         trace_xe_vm_create(vm);
1489
1490         return vm;
1491
1492 err_unlock_close:
1493         dma_resv_unlock(xe_vm_resv(vm));
1494 err_close:
1495         xe_vm_close_and_put(vm);
1496         return ERR_PTR(err);
1497
1498 err_no_resv:
1499         for_each_tile(tile, xe, id)
1500                 xe_range_fence_tree_fini(&vm->rftree[id]);
1501         kfree(vm);
1502         if (!(flags & XE_VM_FLAG_MIGRATION))
1503                 xe_device_mem_access_put(xe);
1504         return ERR_PTR(err);
1505 }
1506
1507 static void flush_async_ops(struct xe_vm *vm)
1508 {
1509         queue_work(system_unbound_wq, &vm->async_ops.work);
1510         flush_work(&vm->async_ops.work);
1511 }
1512
1513 static void vm_error_capture(struct xe_vm *vm, int err,
1514                              u32 op, u64 addr, u64 size)
1515 {
1516         struct drm_xe_vm_bind_op_error_capture capture;
1517         u64 __user *address =
1518                 u64_to_user_ptr(vm->async_ops.error_capture.addr);
1519         bool in_kthread = !current->mm;
1520
1521         capture.error = err;
1522         capture.op = op;
1523         capture.addr = addr;
1524         capture.size = size;
1525
1526         if (in_kthread) {
1527                 if (!mmget_not_zero(vm->async_ops.error_capture.mm))
1528                         goto mm_closed;
1529                 kthread_use_mm(vm->async_ops.error_capture.mm);
1530         }
1531
1532         if (copy_to_user(address, &capture, sizeof(capture)))
1533                 drm_warn(&vm->xe->drm, "Copy to user failed");
1534
1535         if (in_kthread) {
1536                 kthread_unuse_mm(vm->async_ops.error_capture.mm);
1537                 mmput(vm->async_ops.error_capture.mm);
1538         }
1539
1540 mm_closed:
1541         wake_up_all(&vm->async_ops.error_capture.wq);
1542 }
1543
1544 static void xe_vm_close(struct xe_vm *vm)
1545 {
1546         down_write(&vm->lock);
1547         vm->size = 0;
1548         up_write(&vm->lock);
1549 }
1550
1551 void xe_vm_close_and_put(struct xe_vm *vm)
1552 {
1553         LIST_HEAD(contested);
1554         struct xe_device *xe = vm->xe;
1555         struct xe_tile *tile;
1556         struct xe_vma *vma, *next_vma;
1557         struct drm_gpuva *gpuva, *next;
1558         u8 id;
1559
1560         xe_assert(xe, !vm->preempt.num_exec_queues);
1561
1562         xe_vm_close(vm);
1563         flush_async_ops(vm);
1564         if (xe_vm_in_compute_mode(vm))
1565                 flush_work(&vm->preempt.rebind_work);
1566
1567         for_each_tile(tile, xe, id) {
1568                 if (vm->q[id]) {
1569                         xe_exec_queue_kill(vm->q[id]);
1570                         xe_exec_queue_put(vm->q[id]);
1571                         vm->q[id] = NULL;
1572                 }
1573         }
1574
1575         down_write(&vm->lock);
1576         xe_vm_lock(vm, false);
1577         drm_gpuvm_for_each_va_safe(gpuva, next, &vm->gpuvm) {
1578                 vma = gpuva_to_vma(gpuva);
1579
1580                 if (xe_vma_has_no_bo(vma)) {
1581                         down_read(&vm->userptr.notifier_lock);
1582                         vma->gpuva.flags |= XE_VMA_DESTROYED;
1583                         up_read(&vm->userptr.notifier_lock);
1584                 }
1585
1586                 xe_vm_remove_vma(vm, vma);
1587
1588                 /* easy case, remove from VMA? */
1589                 if (xe_vma_has_no_bo(vma) || xe_vma_bo(vma)->vm) {
1590                         list_del_init(&vma->combined_links.rebind);
1591                         xe_vma_destroy(vma, NULL);
1592                         continue;
1593                 }
1594
1595                 list_move_tail(&vma->combined_links.destroy, &contested);
1596                 vma->gpuva.flags |= XE_VMA_DESTROYED;
1597         }
1598
1599         /*
1600          * All vm operations will add shared fences to resv.
1601          * The only exception is eviction for a shared object,
1602          * but even so, the unbind when evicted would still
1603          * install a fence to resv. Hence it's safe to
1604          * destroy the pagetables immediately.
1605          */
1606         for_each_tile(tile, xe, id) {
1607                 if (vm->scratch_bo[id]) {
1608                         u32 i;
1609
1610                         xe_bo_unpin(vm->scratch_bo[id]);
1611                         xe_bo_put(vm->scratch_bo[id]);
1612                         for (i = 0; i < vm->pt_root[id]->level; i++)
1613                                 xe_pt_destroy(vm->scratch_pt[id][i], vm->flags,
1614                                               NULL);
1615                 }
1616                 if (vm->pt_root[id]) {
1617                         xe_pt_destroy(vm->pt_root[id], vm->flags, NULL);
1618                         vm->pt_root[id] = NULL;
1619                 }
1620         }
1621         xe_vm_unlock(vm);
1622
1623         /*
1624          * VM is now dead, so no new VMAs can be added to it.
1625          * Since we hold a refcount to the bo, we can remove and free
1626          * the members safely without locking.
1627          */
1628         list_for_each_entry_safe(vma, next_vma, &contested,
1629                                  combined_links.destroy) {
1630                 list_del_init(&vma->combined_links.destroy);
1631                 xe_vma_destroy_unlocked(vma);
1632         }
1633
1634         if (vm->async_ops.error_capture.addr)
1635                 wake_up_all(&vm->async_ops.error_capture.wq);
1636
1637         xe_assert(xe, list_empty(&vm->extobj.list));
1638         up_write(&vm->lock);
1639
1640         mutex_lock(&xe->usm.lock);
1641         if (vm->flags & XE_VM_FLAG_FAULT_MODE)
1642                 xe->usm.num_vm_in_fault_mode--;
1643         else if (!(vm->flags & XE_VM_FLAG_MIGRATION))
1644                 xe->usm.num_vm_in_non_fault_mode--;
1645         mutex_unlock(&xe->usm.lock);
1646
1647         for_each_tile(tile, xe, id)
1648                 xe_range_fence_tree_fini(&vm->rftree[id]);
1649
1650         xe_vm_put(vm);
1651 }
1652
1653 static void vm_destroy_work_func(struct work_struct *w)
1654 {
1655         struct xe_vm *vm =
1656                 container_of(w, struct xe_vm, destroy_work);
1657         struct xe_device *xe = vm->xe;
1658         struct xe_tile *tile;
1659         u8 id;
1660         void *lookup;
1661
1662         /* xe_vm_close_and_put was not called? */
1663         xe_assert(xe, !vm->size);
1664
1665         if (!(vm->flags & XE_VM_FLAG_MIGRATION)) {
1666                 xe_device_mem_access_put(xe);
1667
1668                 if (xe->info.has_asid) {
1669                         mutex_lock(&xe->usm.lock);
1670                         lookup = xa_erase(&xe->usm.asid_to_vm, vm->usm.asid);
1671                         xe_assert(xe, lookup == vm);
1672                         mutex_unlock(&xe->usm.lock);
1673                 }
1674         }
1675
1676         for_each_tile(tile, xe, id)
1677                 XE_WARN_ON(vm->pt_root[id]);
1678
1679         trace_xe_vm_free(vm);
1680         dma_fence_put(vm->rebind_fence);
1681         kfree(vm);
1682 }
1683
1684 static void xe_vm_free(struct drm_gpuvm *gpuvm)
1685 {
1686         struct xe_vm *vm = container_of(gpuvm, struct xe_vm, gpuvm);
1687
1688         /* To destroy the VM we need to be able to sleep */
1689         queue_work(system_unbound_wq, &vm->destroy_work);
1690 }
1691
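/**
 * xe_vm_lookup() - Look up a VM by its handle
 * @xef: The file private the VM was created against.
 * @id: The VM handle returned by the VM create IOCTL.
 *
 * Return: The VM with an extra reference taken, or NULL if no VM with that
 * handle exists. The caller is expected to drop the reference with
 * xe_vm_put() when done, e.g.:
 *
 *	vm = xe_vm_lookup(xef, args->vm_id);
 *	if (!vm)
 *		return -ENOENT;
 *	...
 *	xe_vm_put(vm);
 */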
1692 struct xe_vm *xe_vm_lookup(struct xe_file *xef, u32 id)
1693 {
1694         struct xe_vm *vm;
1695
1696         mutex_lock(&xef->vm.lock);
1697         vm = xa_load(&xef->vm.xa, id);
1698         if (vm)
1699                 xe_vm_get(vm);
1700         mutex_unlock(&xef->vm.lock);
1701
1702         return vm;
1703 }
1704
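/**
 * xe_vm_pdp4_descriptor() - Page-directory descriptor for a VM's root page table
 * @vm: The VM.
 * @tile: The tile whose root page table to encode.
 *
 * Return: The VM's root page-table BO for @tile encoded as a page-directory
 * entry using the device's write-back PAT index.
 */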
1705 u64 xe_vm_pdp4_descriptor(struct xe_vm *vm, struct xe_tile *tile)
1706 {
1707         return vm->pt_ops->pde_encode_bo(vm->pt_root[tile->id]->bo, 0,
1708                                          tile_to_xe(tile)->pat.idx[XE_CACHE_WB]);
1709 }
1710
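/*
 * Unbind a VMA from every tile it is currently bound to. When more than one
 * tile is involved, the per-tile unbind fences are collected into a
 * dma_fence_array. The in-syncs are only attached to the first tile's job,
 * and the out-syncs are signalled with the resulting fence when this is the
 * last operation of the IOCTL.
 */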
1711 static struct dma_fence *
1712 xe_vm_unbind_vma(struct xe_vma *vma, struct xe_exec_queue *q,
1713                  struct xe_sync_entry *syncs, u32 num_syncs,
1714                  bool first_op, bool last_op)
1715 {
1716         struct xe_tile *tile;
1717         struct dma_fence *fence = NULL;
1718         struct dma_fence **fences = NULL;
1719         struct dma_fence_array *cf = NULL;
1720         struct xe_vm *vm = xe_vma_vm(vma);
1721         int cur_fence = 0, i;
1722         int number_tiles = hweight8(vma->tile_present);
1723         int err;
1724         u8 id;
1725
1726         trace_xe_vma_unbind(vma);
1727
1728         if (number_tiles > 1) {
1729                 fences = kmalloc_array(number_tiles, sizeof(*fences),
1730                                        GFP_KERNEL);
1731                 if (!fences)
1732                         return ERR_PTR(-ENOMEM);
1733         }
1734
1735         for_each_tile(tile, vm->xe, id) {
1736                 if (!(vma->tile_present & BIT(id)))
1737                         goto next;
1738
1739                 fence = __xe_pt_unbind_vma(tile, vma, q ? q : vm->q[id],
1740                                            first_op ? syncs : NULL,
1741                                            first_op ? num_syncs : 0);
1742                 if (IS_ERR(fence)) {
1743                         err = PTR_ERR(fence);
1744                         goto err_fences;
1745                 }
1746
1747                 if (fences)
1748                         fences[cur_fence++] = fence;
1749
1750 next:
1751                 if (q && vm->pt_root[id] && !list_empty(&q->multi_gt_list))
1752                         q = list_next_entry(q, multi_gt_list);
1753         }
1754
1755         if (fences) {
1756                 cf = dma_fence_array_create(number_tiles, fences,
1757                                             vm->composite_fence_ctx,
1758                                             vm->composite_fence_seqno++,
1759                                             false);
1760                 if (!cf) {
1761                         --vm->composite_fence_seqno;
1762                         err = -ENOMEM;
1763                         goto err_fences;
1764                 }
1765         }
1766
1767         if (last_op) {
1768                 for (i = 0; i < num_syncs; i++)
1769                         xe_sync_entry_signal(&syncs[i], NULL,
1770                                              cf ? &cf->base : fence);
1771         }
1772
1773         return cf ? &cf->base : !fence ? dma_fence_get_stub() : fence;
1774
1775 err_fences:
1776         if (fences) {
1777                 while (cur_fence) {
1778                         /* FIXME: Rewind the previous unbinds? */
1779                         dma_fence_put(fences[--cur_fence]);
1780                 }
1781                 kfree(fences);
1782         }
1783
1784         return ERR_PTR(err);
1785 }
1786
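/*
 * Bind a VMA on every tile in its tile_mask. As for unbinds, the per-tile
 * bind fences are wrapped in a dma_fence_array when several tiles are
 * involved, the in-syncs are only passed to the first tile's job, and the
 * out-syncs are signalled with the resulting fence when this is the last
 * operation of the IOCTL.
 */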
1787 static struct dma_fence *
1788 xe_vm_bind_vma(struct xe_vma *vma, struct xe_exec_queue *q,
1789                struct xe_sync_entry *syncs, u32 num_syncs,
1790                bool first_op, bool last_op)
1791 {
1792         struct xe_tile *tile;
1793         struct dma_fence *fence;
1794         struct dma_fence **fences = NULL;
1795         struct dma_fence_array *cf = NULL;
1796         struct xe_vm *vm = xe_vma_vm(vma);
1797         int cur_fence = 0, i;
1798         int number_tiles = hweight8(vma->tile_mask);
1799         int err;
1800         u8 id;
1801
1802         trace_xe_vma_bind(vma);
1803
1804         if (number_tiles > 1) {
1805                 fences = kmalloc_array(number_tiles, sizeof(*fences),
1806                                        GFP_KERNEL);
1807                 if (!fences)
1808                         return ERR_PTR(-ENOMEM);
1809         }
1810
1811         for_each_tile(tile, vm->xe, id) {
1812                 if (!(vma->tile_mask & BIT(id)))
1813                         goto next;
1814
1815                 fence = __xe_pt_bind_vma(tile, vma, q ? q : vm->q[id],
1816                                          first_op ? syncs : NULL,
1817                                          first_op ? num_syncs : 0,
1818                                          vma->tile_present & BIT(id));
1819                 if (IS_ERR(fence)) {
1820                         err = PTR_ERR(fence);
1821                         goto err_fences;
1822                 }
1823
1824                 if (fences)
1825                         fences[cur_fence++] = fence;
1826
1827 next:
1828                 if (q && vm->pt_root[id] && !list_empty(&q->multi_gt_list))
1829                         q = list_next_entry(q, multi_gt_list);
1830         }
1831
1832         if (fences) {
1833                 cf = dma_fence_array_create(number_tiles, fences,
1834                                             vm->composite_fence_ctx,
1835                                             vm->composite_fence_seqno++,
1836                                             false);
1837                 if (!cf) {
1838                         --vm->composite_fence_seqno;
1839                         err = -ENOMEM;
1840                         goto err_fences;
1841                 }
1842         }
1843
1844         if (last_op) {
1845                 for (i = 0; i < num_syncs; i++)
1846                         xe_sync_entry_signal(&syncs[i], NULL,
1847                                              cf ? &cf->base : fence);
1848         }
1849
1850         return cf ? &cf->base : fence;
1851
1852 err_fences:
1853         if (fences) {
1854                 while (cur_fence) {
1855                         /* FIXME: Rewind the previous binds? */
1856                         dma_fence_put(fences[--cur_fence]);
1857                 }
1858                 kfree(fences);
1859         }
1860
1861         return ERR_PTR(err);
1862 }
1863
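/*
 * Software fence for an asynchronous bind operation: it is signalled, with
 * any error propagated, once the underlying bind/unbind fence signals.
 * @wq and @started let waiters block until the async worker has started
 * processing the operation.
 */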
1864 struct async_op_fence {
1865         struct dma_fence fence;
1866         struct dma_fence *wait_fence;
1867         struct dma_fence_cb cb;
1868         struct xe_vm *vm;
1869         wait_queue_head_t wq;
1870         bool started;
1871 };
1872
1873 static const char *async_op_fence_get_driver_name(struct dma_fence *dma_fence)
1874 {
1875         return "xe";
1876 }
1877
1878 static const char *
1879 async_op_fence_get_timeline_name(struct dma_fence *dma_fence)
1880 {
1881         return "async_op_fence";
1882 }
1883
1884 static const struct dma_fence_ops async_op_fence_ops = {
1885         .get_driver_name = async_op_fence_get_driver_name,
1886         .get_timeline_name = async_op_fence_get_timeline_name,
1887 };
1888
1889 static void async_op_fence_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
1890 {
1891         struct async_op_fence *afence =
1892                 container_of(cb, struct async_op_fence, cb);
1893
1894         afence->fence.error = afence->wait_fence->error;
1895         dma_fence_signal(&afence->fence);
1896         xe_vm_put(afence->vm);
1897         dma_fence_put(afence->wait_fence);
1898         dma_fence_put(&afence->fence);
1899 }
1900
1901 static void add_async_op_fence_cb(struct xe_vm *vm,
1902                                   struct dma_fence *fence,
1903                                   struct async_op_fence *afence)
1904 {
1905         int ret;
1906
1907         if (!xe_vm_no_dma_fences(vm)) {
1908                 afence->started = true;
1909                 smp_wmb();
1910                 wake_up_all(&afence->wq);
1911         }
1912
1913         afence->wait_fence = dma_fence_get(fence);
1914         afence->vm = xe_vm_get(vm);
1915         dma_fence_get(&afence->fence);
1916         ret = dma_fence_add_callback(fence, &afence->cb, async_op_fence_cb);
1917         if (ret == -ENOENT) {
1918                 afence->fence.error = afence->wait_fence->error;
1919                 dma_fence_signal(&afence->fence);
1920         }
1921         if (ret) {
1922                 xe_vm_put(vm);
1923                 dma_fence_put(afence->wait_fence);
1924                 dma_fence_put(&afence->fence);
1925         }
1926         XE_WARN_ON(ret && ret != -ENOENT);
1927 }
1928
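/**
 * xe_vm_async_fence_wait_start() - Wait for an async bind fence to start
 * @fence: The fence to wait on.
 *
 * If @fence belongs to an async bind operation, wait interruptibly until the
 * async worker has started processing that operation. Other fence types
 * return immediately.
 *
 * Return: 0 on success, -ERESTARTSYS if the wait was interrupted.
 */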
1929 int xe_vm_async_fence_wait_start(struct dma_fence *fence)
1930 {
1931         if (fence->ops == &async_op_fence_ops) {
1932                 struct async_op_fence *afence =
1933                         container_of(fence, struct async_op_fence, fence);
1934
1935                 xe_assert(afence->vm->xe, !xe_vm_no_dma_fences(afence->vm));
1936
1937                 smp_rmb();
1938                 return wait_event_interruptible(afence->wq, afence->started);
1939         }
1940
1941         return 0;
1942 }
1943
1944 static int __xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma,
1945                         struct xe_exec_queue *q, struct xe_sync_entry *syncs,
1946                         u32 num_syncs, struct async_op_fence *afence,
1947                         bool immediate, bool first_op, bool last_op)
1948 {
1949         struct dma_fence *fence;
1950
1951         xe_vm_assert_held(vm);
1952
1953         if (immediate) {
1954                 fence = xe_vm_bind_vma(vma, q, syncs, num_syncs, first_op,
1955                                        last_op);
1956                 if (IS_ERR(fence))
1957                         return PTR_ERR(fence);
1958         } else {
1959                 int i;
1960
1961                 xe_assert(vm->xe, xe_vm_in_fault_mode(vm));
1962
1963                 fence = dma_fence_get_stub();
1964                 if (last_op) {
1965                         for (i = 0; i < num_syncs; i++)
1966                                 xe_sync_entry_signal(&syncs[i], NULL, fence);
1967                 }
1968         }
1969         if (afence)
1970                 add_async_op_fence_cb(vm, fence, afence);
1971
1972         dma_fence_put(fence);
1973         return 0;
1974 }
1975
1976 static int xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma, struct xe_exec_queue *q,
1977                       struct xe_bo *bo, struct xe_sync_entry *syncs,
1978                       u32 num_syncs, struct async_op_fence *afence,
1979                       bool immediate, bool first_op, bool last_op)
1980 {
1981         int err;
1982
1983         xe_vm_assert_held(vm);
1984         xe_bo_assert_held(bo);
1985
1986         if (bo && immediate) {
1987                 err = xe_bo_validate(bo, vm, true);
1988                 if (err)
1989                         return err;
1990         }
1991
1992         return __xe_vm_bind(vm, vma, q, syncs, num_syncs, afence, immediate,
1993                             first_op, last_op);
1994 }
1995
1996 static int xe_vm_unbind(struct xe_vm *vm, struct xe_vma *vma,
1997                         struct xe_exec_queue *q, struct xe_sync_entry *syncs,
1998                         u32 num_syncs, struct async_op_fence *afence,
1999                         bool first_op, bool last_op)
2000 {
2001         struct dma_fence *fence;
2002
2003         xe_vm_assert_held(vm);
2004         xe_bo_assert_held(xe_vma_bo(vma));
2005
2006         fence = xe_vm_unbind_vma(vma, q, syncs, num_syncs, first_op, last_op);
2007         if (IS_ERR(fence))
2008                 return PTR_ERR(fence);
2009         if (afence)
2010                 add_async_op_fence_cb(vm, fence, afence);
2011
2012         xe_vma_destroy(vma, fence);
2013         dma_fence_put(fence);
2014
2015         return 0;
2016 }
2017
2018 static int vm_set_error_capture_address(struct xe_device *xe, struct xe_vm *vm,
2019                                         u64 value)
2020 {
2021         if (XE_IOCTL_DBG(xe, !value))
2022                 return -EINVAL;
2023
2024         if (XE_IOCTL_DBG(xe, !(vm->flags & XE_VM_FLAG_ASYNC_BIND_OPS)))
2025                 return -EOPNOTSUPP;
2026
2027         if (XE_IOCTL_DBG(xe, vm->async_ops.error_capture.addr))
2028                 return -EOPNOTSUPP;
2029
2030         vm->async_ops.error_capture.mm = current->mm;
2031         vm->async_ops.error_capture.addr = value;
2032         init_waitqueue_head(&vm->async_ops.error_capture.wq);
2033
2034         return 0;
2035 }
2036
2037 typedef int (*xe_vm_set_property_fn)(struct xe_device *xe, struct xe_vm *vm,
2038                                      u64 value);
2039
2040 static const xe_vm_set_property_fn vm_set_property_funcs[] = {
2041         [XE_VM_PROPERTY_BIND_OP_ERROR_CAPTURE_ADDRESS] =
2042                 vm_set_error_capture_address,
2043 };
2044
2045 static int vm_user_ext_set_property(struct xe_device *xe, struct xe_vm *vm,
2046                                     u64 extension)
2047 {
2048         u64 __user *address = u64_to_user_ptr(extension);
2049         struct drm_xe_ext_vm_set_property ext;
2050         int err;
2051
2052         err = copy_from_user(&ext, address, sizeof(ext));
2053         if (XE_IOCTL_DBG(xe, err))
2054                 return -EFAULT;
2055
2056         if (XE_IOCTL_DBG(xe, ext.property >=
2057                          ARRAY_SIZE(vm_set_property_funcs)) ||
2058             XE_IOCTL_DBG(xe, ext.pad) ||
2059             XE_IOCTL_DBG(xe, ext.reserved[0] || ext.reserved[1]))
2060                 return -EINVAL;
2061
2062         return vm_set_property_funcs[ext.property](xe, vm, ext.value);
2063 }
2064
2065 typedef int (*xe_vm_user_extension_fn)(struct xe_device *xe, struct xe_vm *vm,
2066                                        u64 extension);
2067
2068 static const xe_vm_user_extension_fn vm_user_extension_funcs[] = {
2069         [XE_VM_EXTENSION_SET_PROPERTY] = vm_user_ext_set_property,
2070 };
2071
2072 #define MAX_USER_EXTENSIONS     16
2073 static int vm_user_extensions(struct xe_device *xe, struct xe_vm *vm,
2074                               u64 extensions, int ext_number)
2075 {
2076         u64 __user *address = u64_to_user_ptr(extensions);
2077         struct xe_user_extension ext;
2078         int err;
2079
2080         if (XE_IOCTL_DBG(xe, ext_number >= MAX_USER_EXTENSIONS))
2081                 return -E2BIG;
2082
2083         err = copy_from_user(&ext, address, sizeof(ext));
2084         if (XE_IOCTL_DBG(xe, err))
2085                 return -EFAULT;
2086
2087         if (XE_IOCTL_DBG(xe, ext.pad) ||
2088             XE_IOCTL_DBG(xe, ext.name >=
2089                          ARRAY_SIZE(vm_user_extension_funcs)))
2090                 return -EINVAL;
2091
2092         err = vm_user_extension_funcs[ext.name](xe, vm, extensions);
2093         if (XE_IOCTL_DBG(xe, err))
2094                 return err;
2095
2096         if (ext.next_extension)
2097                 return vm_user_extensions(xe, vm, ext.next_extension,
2098                                           ++ext_number);
2099
2100         return 0;
2101 }
2102
2103 #define ALL_DRM_XE_VM_CREATE_FLAGS (DRM_XE_VM_CREATE_SCRATCH_PAGE | \
2104                                     DRM_XE_VM_CREATE_COMPUTE_MODE | \
2105                                     DRM_XE_VM_CREATE_ASYNC_BIND_OPS | \
2106                                     DRM_XE_VM_CREATE_FAULT_MODE)
2107
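/**
 * xe_vm_create_ioctl() - Create a new VM
 * @dev: DRM device.
 * @data: struct drm_xe_vm_create IOCTL arguments.
 * @file: DRM file.
 *
 * Validates the creation flags, creates the VM, processes any user
 * extensions, allocates a per-file handle and, when supported, an ASID, and
 * returns the handle in @data->vm_id.
 *
 * Return: 0 on success, negative error code on failure.
 */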
2108 int xe_vm_create_ioctl(struct drm_device *dev, void *data,
2109                        struct drm_file *file)
2110 {
2111         struct xe_device *xe = to_xe_device(dev);
2112         struct xe_file *xef = to_xe_file(file);
2113         struct drm_xe_vm_create *args = data;
2114         struct xe_tile *tile;
2115         struct xe_vm *vm;
2116         u32 id, asid;
2117         int err;
2118         u32 flags = 0;
2119
2120         if (XE_WA(xe_root_mmio_gt(xe), 14016763929))
2121                 args->flags |= DRM_XE_VM_CREATE_SCRATCH_PAGE;
2122
2123         if (XE_IOCTL_DBG(xe, args->flags & DRM_XE_VM_CREATE_FAULT_MODE &&
2124                          !xe->info.supports_usm))
2125                 return -EINVAL;
2126
2127         if (XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
2128                 return -EINVAL;
2129
2130         if (XE_IOCTL_DBG(xe, args->flags & ~ALL_DRM_XE_VM_CREATE_FLAGS))
2131                 return -EINVAL;
2132
2133         if (XE_IOCTL_DBG(xe, args->flags & DRM_XE_VM_CREATE_SCRATCH_PAGE &&
2134                          args->flags & DRM_XE_VM_CREATE_FAULT_MODE))
2135                 return -EINVAL;
2136
2137         if (XE_IOCTL_DBG(xe, args->flags & DRM_XE_VM_CREATE_COMPUTE_MODE &&
2138                          args->flags & DRM_XE_VM_CREATE_FAULT_MODE))
2139                 return -EINVAL;
2140
2141         if (XE_IOCTL_DBG(xe, args->flags & DRM_XE_VM_CREATE_FAULT_MODE &&
2142                          xe_device_in_non_fault_mode(xe)))
2143                 return -EINVAL;
2144
2145         if (XE_IOCTL_DBG(xe, !(args->flags & DRM_XE_VM_CREATE_FAULT_MODE) &&
2146                          xe_device_in_fault_mode(xe)))
2147                 return -EINVAL;
2148
2149         if (args->flags & DRM_XE_VM_CREATE_SCRATCH_PAGE)
2150                 flags |= XE_VM_FLAG_SCRATCH_PAGE;
2151         if (args->flags & DRM_XE_VM_CREATE_COMPUTE_MODE)
2152                 flags |= XE_VM_FLAG_COMPUTE_MODE;
2153         if (args->flags & DRM_XE_VM_CREATE_ASYNC_BIND_OPS)
2154                 flags |= XE_VM_FLAG_ASYNC_BIND_OPS;
2155         if (args->flags & DRM_XE_VM_CREATE_FAULT_MODE)
2156                 flags |= XE_VM_FLAG_FAULT_MODE;
2157
2158         vm = xe_vm_create(xe, flags);
2159         if (IS_ERR(vm))
2160                 return PTR_ERR(vm);
2161
2162         if (args->extensions) {
2163                 err = vm_user_extensions(xe, vm, args->extensions, 0);
2164                 if (XE_IOCTL_DBG(xe, err)) {
2165                         xe_vm_close_and_put(vm);
2166                         return err;
2167                 }
2168         }
2169
2170         mutex_lock(&xef->vm.lock);
2171         err = xa_alloc(&xef->vm.xa, &id, vm, xa_limit_32b, GFP_KERNEL);
2172         mutex_unlock(&xef->vm.lock);
2173         if (err) {
2174                 xe_vm_close_and_put(vm);
2175                 return err;
2176         }
2177
2178         if (xe->info.has_asid) {
2179                 mutex_lock(&xe->usm.lock);
2180                 err = xa_alloc_cyclic(&xe->usm.asid_to_vm, &asid, vm,
2181                                       XA_LIMIT(0, XE_MAX_ASID - 1),
2182                                       &xe->usm.next_asid, GFP_KERNEL);
2183                 mutex_unlock(&xe->usm.lock);
2184                 if (err) {
2185                         xe_vm_close_and_put(vm);
2186                         return err;
2187                 }
2188                 vm->usm.asid = asid;
2189         }
2190
2191         args->vm_id = id;
2192         vm->xef = xef;
2193
2194         /* Record the VM page-table BOs against the client for memory accounting */
2195         for_each_tile(tile, xe, id)
2196                 if (vm->pt_root[id])
2197                         xe_drm_client_add_bo(vm->xef->client, vm->pt_root[id]->bo);
2198
2199 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG_MEM)
2200         /* Warning: Security issue - never enable by default */
2201         args->reserved[0] = xe_bo_main_addr(vm->pt_root[0]->bo, XE_PAGE_SIZE);
2202 #endif
2203
2204         return 0;
2205 }
2206
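/**
 * xe_vm_destroy_ioctl() - Destroy a VM
 * @dev: DRM device.
 * @data: struct drm_xe_vm_destroy IOCTL arguments.
 * @file: DRM file.
 *
 * Looks up the VM by handle, removes the handle and tears the VM down via
 * xe_vm_close_and_put().
 *
 * Return: 0 on success, -ENOENT if the handle is unknown, -EBUSY if exec
 * queues are still registered against the VM.
 */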
2207 int xe_vm_destroy_ioctl(struct drm_device *dev, void *data,
2208                         struct drm_file *file)
2209 {
2210         struct xe_device *xe = to_xe_device(dev);
2211         struct xe_file *xef = to_xe_file(file);
2212         struct drm_xe_vm_destroy *args = data;
2213         struct xe_vm *vm;
2214         int err = 0;
2215
2216         if (XE_IOCTL_DBG(xe, args->pad) ||
2217             XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
2218                 return -EINVAL;
2219
2220         mutex_lock(&xef->vm.lock);
2221         vm = xa_load(&xef->vm.xa, args->vm_id);
2222         if (XE_IOCTL_DBG(xe, !vm))
2223                 err = -ENOENT;
2224         else if (XE_IOCTL_DBG(xe, vm->preempt.num_exec_queues))
2225                 err = -EBUSY;
2226         else
2227                 xa_erase(&xef->vm.xa, args->vm_id);
2228         mutex_unlock(&xef->vm.lock);
2229
2230         if (!err)
2231                 xe_vm_close_and_put(vm);
2232
2233         return err;
2234 }
2235
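/* Map a prefetch region index from the bind IOCTL to a TTM memory placement */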
2236 static const u32 region_to_mem_type[] = {
2237         XE_PL_TT,
2238         XE_PL_VRAM0,
2239         XE_PL_VRAM1,
2240 };
2241
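/*
 * Migrate the VMA's backing store to the requested memory region and, if the
 * VMA is not already bound and valid on every tile it should be, rebind it.
 * If there is nothing to do, the syncs and the async fence are signalled
 * immediately.
 */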
2242 static int xe_vm_prefetch(struct xe_vm *vm, struct xe_vma *vma,
2243                           struct xe_exec_queue *q, u32 region,
2244                           struct xe_sync_entry *syncs, u32 num_syncs,
2245                           struct async_op_fence *afence, bool first_op,
2246                           bool last_op)
2247 {
2248         int err;
2249
2250         xe_assert(vm->xe, region < ARRAY_SIZE(region_to_mem_type));
2251
2252         if (!xe_vma_has_no_bo(vma)) {
2253                 err = xe_bo_migrate(xe_vma_bo(vma), region_to_mem_type[region]);
2254                 if (err)
2255                         return err;
2256         }
2257
2258         if (vma->tile_mask != (vma->tile_present & ~vma->usm.tile_invalidated)) {
2259                 return xe_vm_bind(vm, vma, q, xe_vma_bo(vma), syncs, num_syncs,
2260                                   afence, true, first_op, last_op);
2261         } else {
2262                 int i;
2263
2264                 /* Nothing to do, signal fences now */
2265                 if (last_op) {
2266                         for (i = 0; i < num_syncs; i++)
2267                                 xe_sync_entry_signal(&syncs[i], NULL,
2268                                                      dma_fence_get_stub());
2269                 }
2270                 if (afence)
2271                         dma_fence_signal(&afence->fence);
2272                 return 0;
2273         }
2274 }
2275
2276 static void vm_set_async_error(struct xe_vm *vm, int err)
2277 {
2278         lockdep_assert_held(&vm->lock);
2279         vm->async_ops.error = err;
2280 }
2281
2282 static int vm_bind_ioctl_lookup_vma(struct xe_vm *vm, struct xe_bo *bo,
2283                                     u64 addr, u64 range, u32 op, u32 flags)
2284 {
2285         struct xe_device *xe = vm->xe;
2286         struct xe_vma *vma;
2287         bool async = !!(flags & XE_VM_BIND_FLAG_ASYNC);
2288
2289         lockdep_assert_held(&vm->lock);
2290
2291         switch (op) {
2292         case XE_VM_BIND_OP_MAP:
2293         case XE_VM_BIND_OP_MAP_USERPTR:
2294                 vma = xe_vm_find_overlapping_vma(vm, addr, range);
2295                 if (XE_IOCTL_DBG(xe, vma && !async))
2296                         return -EBUSY;
2297                 break;
2298         case XE_VM_BIND_OP_UNMAP:
2299         case XE_VM_BIND_OP_PREFETCH:
2300                 vma = xe_vm_find_overlapping_vma(vm, addr, range);
2301                 if (XE_IOCTL_DBG(xe, !vma))
2302                         /* Not an actual error, IOCTL cleans up and returns 0 */
2303                         return -ENODATA;
2304                 if (XE_IOCTL_DBG(xe, (xe_vma_start(vma) != addr ||
2305                                       xe_vma_end(vma) != addr + range) && !async))
2306                         return -EINVAL;
2307                 break;
2308         case XE_VM_BIND_OP_UNMAP_ALL:
2309                 if (XE_IOCTL_DBG(xe, list_empty(&bo->ttm.base.gpuva.list)))
2310                         /* Not an actual error, IOCTL cleans up and returns 0 */
2311                         return -ENODATA;
2312                 break;
2313         default:
2314                 drm_warn(&xe->drm, "NOT POSSIBLE");
2315                 return -EINVAL;
2316         }
2317
2318         return 0;
2319 }
2320
2321 static void prep_vma_destroy(struct xe_vm *vm, struct xe_vma *vma,
2322                              bool post_commit)
2323 {
2324         down_read(&vm->userptr.notifier_lock);
2325         vma->gpuva.flags |= XE_VMA_DESTROYED;
2326         up_read(&vm->userptr.notifier_lock);
2327         if (post_commit)
2328                 xe_vm_remove_vma(vm, vma);
2329 }
2330
2331 #undef ULL
2332 #define ULL     unsigned long long
2333
2334 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG_VM)
2335 static void print_op(struct xe_device *xe, struct drm_gpuva_op *op)
2336 {
2337         struct xe_vma *vma;
2338
2339         switch (op->op) {
2340         case DRM_GPUVA_OP_MAP:
2341                 vm_dbg(&xe->drm, "MAP: addr=0x%016llx, range=0x%016llx",
2342                        (ULL)op->map.va.addr, (ULL)op->map.va.range);
2343                 break;
2344         case DRM_GPUVA_OP_REMAP:
2345                 vma = gpuva_to_vma(op->remap.unmap->va);
2346                 vm_dbg(&xe->drm, "REMAP:UNMAP: addr=0x%016llx, range=0x%016llx, keep=%d",
2347                        (ULL)xe_vma_start(vma), (ULL)xe_vma_size(vma),
2348                        op->remap.unmap->keep ? 1 : 0);
2349                 if (op->remap.prev)
2350                         vm_dbg(&xe->drm,
2351                                "REMAP:PREV: addr=0x%016llx, range=0x%016llx",
2352                                (ULL)op->remap.prev->va.addr,
2353                                (ULL)op->remap.prev->va.range);
2354                 if (op->remap.next)
2355                         vm_dbg(&xe->drm,
2356                                "REMAP:NEXT: addr=0x%016llx, range=0x%016llx",
2357                                (ULL)op->remap.next->va.addr,
2358                                (ULL)op->remap.next->va.range);
2359                 break;
2360         case DRM_GPUVA_OP_UNMAP:
2361                 vma = gpuva_to_vma(op->unmap.va);
2362                 vm_dbg(&xe->drm, "UNMAP: addr=0x%016llx, range=0x%016llx, keep=%d",
2363                        (ULL)xe_vma_start(vma), (ULL)xe_vma_size(vma),
2364                        op->unmap.keep ? 1 : 0);
2365                 break;
2366         case DRM_GPUVA_OP_PREFETCH:
2367                 vma = gpuva_to_vma(op->prefetch.va);
2368                 vm_dbg(&xe->drm, "PREFETCH: addr=0x%016llx, range=0x%016llx",
2369                        (ULL)xe_vma_start(vma), (ULL)xe_vma_size(vma));
2370                 break;
2371         default:
2372                 drm_warn(&xe->drm, "NOT POSSIBLE");
2373         }
2374 }
2375 #else
2376 static void print_op(struct xe_device *xe, struct drm_gpuva_op *op)
2377 {
2378 }
2379 #endif
2380
2381 /*
2382  * Create the operations list from the IOCTL arguments and set up operation fields
2383  * so the parse and commit steps are decoupled from the IOCTL arguments. This step can fail.
2384  */
2385 static struct drm_gpuva_ops *
2386 vm_bind_ioctl_ops_create(struct xe_vm *vm, struct xe_bo *bo,
2387                          u64 bo_offset_or_userptr, u64 addr, u64 range,
2388                          u32 operation, u32 flags, u8 tile_mask, u32 region)
2389 {
2390         struct drm_gem_object *obj = bo ? &bo->ttm.base : NULL;
2391         struct drm_gpuva_ops *ops;
2392         struct drm_gpuva_op *__op;
2393         struct xe_vma_op *op;
2394         struct drm_gpuvm_bo *vm_bo;
2395         int err;
2396
2397         lockdep_assert_held_write(&vm->lock);
2398
2399         vm_dbg(&vm->xe->drm,
2400                "op=%d, addr=0x%016llx, range=0x%016llx, bo_offset_or_userptr=0x%016llx",
2401                operation, (ULL)addr, (ULL)range,
2402                (ULL)bo_offset_or_userptr);
2403
2404         switch (operation) {
2405         case XE_VM_BIND_OP_MAP:
2406         case XE_VM_BIND_OP_MAP_USERPTR:
2407                 ops = drm_gpuvm_sm_map_ops_create(&vm->gpuvm, addr, range,
2408                                                   obj, bo_offset_or_userptr);
2409                 if (IS_ERR(ops))
2410                         return ops;
2411
2412                 drm_gpuva_for_each_op(__op, ops) {
2413                         struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2414
2415                         op->tile_mask = tile_mask;
2416                         op->map.immediate =
2417                                 flags & XE_VM_BIND_FLAG_IMMEDIATE;
2418                         op->map.read_only =
2419                                 flags & XE_VM_BIND_FLAG_READONLY;
2420                         op->map.is_null = flags & XE_VM_BIND_FLAG_NULL;
2421                 }
2422                 break;
2423         case XE_VM_BIND_OP_UNMAP:
2424                 ops = drm_gpuvm_sm_unmap_ops_create(&vm->gpuvm, addr, range);
2425                 if (IS_ERR(ops))
2426                         return ops;
2427
2428                 drm_gpuva_for_each_op(__op, ops) {
2429                         struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2430
2431                         op->tile_mask = tile_mask;
2432                 }
2433                 break;
2434         case XE_VM_BIND_OP_PREFETCH:
2435                 ops = drm_gpuvm_prefetch_ops_create(&vm->gpuvm, addr, range);
2436                 if (IS_ERR(ops))
2437                         return ops;
2438
2439                 drm_gpuva_for_each_op(__op, ops) {
2440                         struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2441
2442                         op->tile_mask = tile_mask;
2443                         op->prefetch.region = region;
2444                 }
2445                 break;
2446         case XE_VM_BIND_OP_UNMAP_ALL:
2447                 xe_assert(vm->xe, bo);
2448
2449                 err = xe_bo_lock(bo, true);
2450                 if (err)
2451                         return ERR_PTR(err);
2452
2453                 vm_bo = drm_gpuvm_bo_find(&vm->gpuvm, obj);
2454                 if (!vm_bo) {
2455                         xe_bo_unlock(bo);
                             return ERR_PTR(-ENODATA);
                     }
2456
2457                 ops = drm_gpuvm_bo_unmap_ops_create(vm_bo);
2458                 drm_gpuvm_bo_put(vm_bo);
2459                 xe_bo_unlock(bo);
2460                 if (IS_ERR(ops))
2461                         return ops;
2462
2463                 drm_gpuva_for_each_op(__op, ops) {
2464                         struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2465
2466                         op->tile_mask = tile_mask;
2467                 }
2468                 break;
2469         default:
2470                 drm_warn(&vm->xe->drm, "NOT POSSIBLE");
2471                 ops = ERR_PTR(-EINVAL);
2472         }
2473
2474 #ifdef TEST_VM_ASYNC_OPS_ERROR
2475         if (operation & FORCE_ASYNC_OP_ERROR) {
2476                 op = list_first_entry_or_null(&ops->list, struct xe_vma_op,
2477                                               base.entry);
2478                 if (op)
2479                         op->inject_error = true;
2480         }
2481 #endif
2482
2483         if (!IS_ERR(ops))
2484                 drm_gpuva_for_each_op(__op, ops)
2485                         print_op(vm->xe, __op);
2486
2487         return ops;
2488 }
2489
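/*
 * Allocate and initialize a VMA for a GPUVA map operation. Userptr VMAs get
 * their pages pinned here; VMAs backed by an external (non VM-private) BO
 * are added to the VM's external object list and receive the VM's preempt
 * fences. On failure the partially constructed VMA is destroyed again.
 */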
2490 static struct xe_vma *new_vma(struct xe_vm *vm, struct drm_gpuva_op_map *op,
2491                               u8 tile_mask, bool read_only, bool is_null)
2492 {
2493         struct xe_bo *bo = op->gem.obj ? gem_to_xe_bo(op->gem.obj) : NULL;
2494         struct xe_vma *vma;
2495         int err;
2496
2497         lockdep_assert_held_write(&vm->lock);
2498
2499         if (bo) {
2500                 err = xe_bo_lock(bo, true);
2501                 if (err)
2502                         return ERR_PTR(err);
2503         }
2504         vma = xe_vma_create(vm, bo, op->gem.offset,
2505                             op->va.addr, op->va.addr +
2506                             op->va.range - 1, read_only, is_null,
2507                             tile_mask);
2508         if (bo)
2509                 xe_bo_unlock(bo);
2510
             if (IS_ERR(vma))
                     return vma;

2511         if (xe_vma_is_userptr(vma)) {
2512                 err = xe_vma_userptr_pin_pages(vma);
2513                 if (err) {
2514                         prep_vma_destroy(vm, vma, false);
2515                         xe_vma_destroy_unlocked(vma);
2516                         return ERR_PTR(err);
2517                 }
2518         } else if (!xe_vma_has_no_bo(vma) && !bo->vm) {
2519                 vm_insert_extobj(vm, vma);
2520                 err = add_preempt_fences(vm, bo);
2521                 if (err) {
2522                         prep_vma_destroy(vm, vma, false);
2523                         xe_vma_destroy_unlocked(vma);
2524                         return ERR_PTR(err);
2525                 }
2526         }
2527
2528         return vma;
2529 }
2530
2531 static u64 xe_vma_max_pte_size(struct xe_vma *vma)
2532 {
2533         if (vma->gpuva.flags & XE_VMA_PTE_1G)
2534                 return SZ_1G;
2535         else if (vma->gpuva.flags & XE_VMA_PTE_2M)
2536                 return SZ_2M;
2537
2538         return SZ_4K;
2539 }
2540
2541 static u64 xe_vma_set_pte_size(struct xe_vma *vma, u64 size)
2542 {
2543         switch (size) {
2544         case SZ_1G:
2545                 vma->gpuva.flags |= XE_VMA_PTE_1G;
2546                 break;
2547         case SZ_2M:
2548                 vma->gpuva.flags |= XE_VMA_PTE_2M;
2549                 break;
2550         }
2551
2552         return SZ_4K;
2553 }
2554
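/*
 * Commit an operation into the VM's VA tree: insert newly created VMAs, mark
 * VMAs being unmapped as destroyed and remove them, and adjust the recorded
 * unmap range of a REMAP when its prev/next chunks can be skipped.
 */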
2555 static int xe_vma_op_commit(struct xe_vm *vm, struct xe_vma_op *op)
2556 {
2557         int err = 0;
2558
2559         lockdep_assert_held_write(&vm->lock);
2560
2561         switch (op->base.op) {
2562         case DRM_GPUVA_OP_MAP:
2563                 err |= xe_vm_insert_vma(vm, op->map.vma);
2564                 if (!err)
2565                         op->flags |= XE_VMA_OP_COMMITTED;
2566                 break;
2567         case DRM_GPUVA_OP_REMAP:
2568                 prep_vma_destroy(vm, gpuva_to_vma(op->base.remap.unmap->va),
2569                                  true);
2570                 op->flags |= XE_VMA_OP_COMMITTED;
2571
2572                 if (op->remap.prev) {
2573                         err |= xe_vm_insert_vma(vm, op->remap.prev);
2574                         if (!err)
2575                                 op->flags |= XE_VMA_OP_PREV_COMMITTED;
2576                         if (!err && op->remap.skip_prev)
2577                                 op->remap.prev = NULL;
2578                 }
2579                 if (op->remap.next) {
2580                         err |= xe_vm_insert_vma(vm, op->remap.next);
2581                         if (!err)
2582                                 op->flags |= XE_VMA_OP_NEXT_COMMITTED;
2583                         if (!err && op->remap.skip_next)
2584                                 op->remap.next = NULL;
2585                 }
2586
2587                 /* Adjust for partial unbind after removing VMA from VM */
2588                 if (!err) {
2589                         op->base.remap.unmap->va->va.addr = op->remap.start;
2590                         op->base.remap.unmap->va->va.range = op->remap.range;
2591                 }
2592                 break;
2593         case DRM_GPUVA_OP_UNMAP:
2594                 prep_vma_destroy(vm, gpuva_to_vma(op->base.unmap.va), true);
2595                 op->flags |= XE_VMA_OP_COMMITTED;
2596                 break;
2597         case DRM_GPUVA_OP_PREFETCH:
2598                 op->flags |= XE_VMA_OP_COMMITTED;
2599                 break;
2600         default:
2601                 drm_warn(&vm->xe->drm, "NOT POSSIBLE");
2602         }
2603
2604         return err;
2605 }
2606
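/*
 * Parse a list of GPUVA operations into xe_vma_ops: create the new VMAs for
 * MAP and REMAP operations, decide whether the REMAP prev/next chunks can be
 * skipped, commit each operation into the VM's VA tree and, for the last
 * operation of an async bind, set up the async out-fence.
 */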
2608 static int vm_bind_ioctl_ops_parse(struct xe_vm *vm, struct xe_exec_queue *q,
2609                                    struct drm_gpuva_ops *ops,
2610                                    struct xe_sync_entry *syncs, u32 num_syncs,
2611                                    struct list_head *ops_list, bool last,
2612                                    bool async)
2613 {
2614         struct xe_vma_op *last_op = NULL;
2615         struct async_op_fence *fence = NULL;
2616         struct drm_gpuva_op *__op;
2617         int err = 0;
2618
2619         lockdep_assert_held_write(&vm->lock);
2620
2621         if (last && num_syncs && async) {
2622                 u64 seqno;
2623
2624                 fence = kmalloc(sizeof(*fence), GFP_KERNEL);
2625                 if (!fence)
2626                         return -ENOMEM;
2627
2628                 seqno = q ? ++q->bind.fence_seqno : ++vm->async_ops.fence.seqno;
2629                 dma_fence_init(&fence->fence, &async_op_fence_ops,
2630                                &vm->async_ops.lock, q ? q->bind.fence_ctx :
2631                                vm->async_ops.fence.context, seqno);
2632
2633                 if (!xe_vm_no_dma_fences(vm)) {
2634                         fence->vm = vm;
2635                         fence->started = false;
2636                         init_waitqueue_head(&fence->wq);
2637                 }
2638         }
2639
2640         drm_gpuva_for_each_op(__op, ops) {
2641                 struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2642                 bool first = list_empty(ops_list);
2643
2644                 xe_assert(vm->xe, first || async);
2645
2646                 INIT_LIST_HEAD(&op->link);
2647                 list_add_tail(&op->link, ops_list);
2648
2649                 if (first) {
2650                         op->flags |= XE_VMA_OP_FIRST;
2651                         op->num_syncs = num_syncs;
2652                         op->syncs = syncs;
2653                 }
2654
2655                 op->q = q;
2656
2657                 switch (op->base.op) {
2658                 case DRM_GPUVA_OP_MAP:
2659                 {
2660                         struct xe_vma *vma;
2661
2662                         vma = new_vma(vm, &op->base.map,
2663                                       op->tile_mask, op->map.read_only,
2664                                       op->map.is_null);
2665                         if (IS_ERR(vma)) {
2666                                 err = PTR_ERR(vma);
2667                                 goto free_fence;
2668                         }
2669
2670                         op->map.vma = vma;
2671                         break;
2672                 }
2673                 case DRM_GPUVA_OP_REMAP:
2674                 {
2675                         struct xe_vma *old =
2676                                 gpuva_to_vma(op->base.remap.unmap->va);
2677
2678                         op->remap.start = xe_vma_start(old);
2679                         op->remap.range = xe_vma_size(old);
2680
2681                         if (op->base.remap.prev) {
2682                                 struct xe_vma *vma;
2683                                 bool read_only =
2684                                         op->base.remap.unmap->va->flags &
2685                                         XE_VMA_READ_ONLY;
2686                                 bool is_null =
2687                                         op->base.remap.unmap->va->flags &
2688                                         DRM_GPUVA_SPARSE;
2689
2690                                 vma = new_vma(vm, op->base.remap.prev,
2691                                               op->tile_mask, read_only,
2692                                               is_null);
2693                                 if (IS_ERR(vma)) {
2694                                         err = PTR_ERR(vma);
2695                                         goto free_fence;
2696                                 }
2697
2698                                 op->remap.prev = vma;
2699
2700                                 /*
2701                                  * Userptr creates a new SG mapping so
2702                                  * we must also rebind.
2703                                  */
2704                                 op->remap.skip_prev = !xe_vma_is_userptr(old) &&
2705                                         IS_ALIGNED(xe_vma_end(vma),
2706                                                    xe_vma_max_pte_size(old));
2707                                 if (op->remap.skip_prev) {
2708                                         xe_vma_set_pte_size(vma, xe_vma_max_pte_size(old));
2709                                         op->remap.range -=
2710                                                 xe_vma_end(vma) -
2711                                                 xe_vma_start(old);
2712                                         op->remap.start = xe_vma_end(vma);
2713                                 }
2714                         }
2715
2716                         if (op->base.remap.next) {
2717                                 struct xe_vma *vma;
2718                                 bool read_only =
2719                                         op->base.remap.unmap->va->flags &
2720                                         XE_VMA_READ_ONLY;
2721
2722                                 bool is_null =
2723                                         op->base.remap.unmap->va->flags &
2724                                         DRM_GPUVA_SPARSE;
2725
2726                                 vma = new_vma(vm, op->base.remap.next,
2727                                               op->tile_mask, read_only,
2728                                               is_null);
2729                                 if (IS_ERR(vma)) {
2730                                         err = PTR_ERR(vma);
2731                                         goto free_fence;
2732                                 }
2733
2734                                 op->remap.next = vma;
2735
2736                                 /*
2737                                  * Userptr creates a new SG mapping so
2738                                  * we must also rebind.
2739                                  */
2740                                 op->remap.skip_next = !xe_vma_is_userptr(old) &&
2741                                         IS_ALIGNED(xe_vma_start(vma),
2742                                                    xe_vma_max_pte_size(old));
2743                                 if (op->remap.skip_next) {
2744                                         xe_vma_set_pte_size(vma, xe_vma_max_pte_size(old));
2745                                         op->remap.range -=
2746                                                 xe_vma_end(old) -
2747                                                 xe_vma_start(vma);
2748                                 }
2749                         }
2750                         break;
2751                 }
2752                 case DRM_GPUVA_OP_UNMAP:
2753                 case DRM_GPUVA_OP_PREFETCH:
2754                         /* Nothing to do */
2755                         break;
2756                 default:
2757                         drm_warn(&vm->xe->drm, "NOT POSSIBLE");
2758                 }
2759
2760                 last_op = op;
2761
2762                 err = xe_vma_op_commit(vm, op);
2763                 if (err)
2764                         goto free_fence;
2765         }
2766
2767         /* FIXME: Unhandled corner case */
2768         XE_WARN_ON(!last_op && last && !list_empty(ops_list));
2769
2770         if (!last_op)
2771                 goto free_fence;
2772         last_op->ops = ops;
2773         if (last) {
2774                 last_op->flags |= XE_VMA_OP_LAST;
2775                 last_op->num_syncs = num_syncs;
2776                 last_op->syncs = syncs;
2777                 last_op->fence = fence;
2778         }
2779
2780         return 0;
2781
2782 free_fence:
2783         kfree(fence);
2784         return err;
2785 }
2786
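/*
 * Issue the GPU work for a single operation with the VM and the VMA's BO
 * locked: a bind for MAP, an unbind followed by up to two rebinds for REMAP,
 * an unbind for UNMAP, and a migrate plus optional rebind for PREFETCH.
 */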
2787 static int op_execute(struct drm_exec *exec, struct xe_vm *vm,
2788                       struct xe_vma *vma, struct xe_vma_op *op)
2789 {
2790         int err;
2791
2792         lockdep_assert_held_write(&vm->lock);
2793
2794         err = xe_vm_prepare_vma(exec, vma, 1);
2795         if (err)
2796                 return err;
2797
2798         xe_vm_assert_held(vm);
2799         xe_bo_assert_held(xe_vma_bo(vma));
2800
2801         switch (op->base.op) {
2802         case DRM_GPUVA_OP_MAP:
2803                 err = xe_vm_bind(vm, vma, op->q, xe_vma_bo(vma),
2804                                  op->syncs, op->num_syncs, op->fence,
2805                                  op->map.immediate || !xe_vm_in_fault_mode(vm),
2806                                  op->flags & XE_VMA_OP_FIRST,
2807                                  op->flags & XE_VMA_OP_LAST);
2808                 break;
2809         case DRM_GPUVA_OP_REMAP:
2810         {
2811                 bool prev = !!op->remap.prev;
2812                 bool next = !!op->remap.next;
2813
2814                 if (!op->remap.unmap_done) {
2815                         if (prev || next) {
2816                                 vm->async_ops.munmap_rebind_inflight = true;
2817                                 vma->gpuva.flags |= XE_VMA_FIRST_REBIND;
2818                         }
2819                         err = xe_vm_unbind(vm, vma, op->q, op->syncs,
2820                                            op->num_syncs,
2821                                            !prev && !next ? op->fence : NULL,
2822                                            op->flags & XE_VMA_OP_FIRST,
2823                                            op->flags & XE_VMA_OP_LAST && !prev &&
2824                                            !next);
2825                         if (err)
2826                                 break;
2827                         op->remap.unmap_done = true;
2828                 }
2829
2830                 if (prev) {
2831                         op->remap.prev->gpuva.flags |= XE_VMA_LAST_REBIND;
2832                         err = xe_vm_bind(vm, op->remap.prev, op->q,
2833                                          xe_vma_bo(op->remap.prev), op->syncs,
2834                                          op->num_syncs,
2835                                          !next ? op->fence : NULL, true, false,
2836                                          op->flags & XE_VMA_OP_LAST && !next);
2837                         op->remap.prev->gpuva.flags &= ~XE_VMA_LAST_REBIND;
2838                         if (err)
2839                                 break;
2840                         op->remap.prev = NULL;
2841                 }
2842
2843                 if (next) {
2844                         op->remap.next->gpuva.flags |= XE_VMA_LAST_REBIND;
2845                         err = xe_vm_bind(vm, op->remap.next, op->q,
2846                                          xe_vma_bo(op->remap.next),
2847                                          op->syncs, op->num_syncs,
2848                                          op->fence, true, false,
2849                                          op->flags & XE_VMA_OP_LAST);
2850                         op->remap.next->gpuva.flags &= ~XE_VMA_LAST_REBIND;
2851                         if (err)
2852                                 break;
2853                         op->remap.next = NULL;
2854                 }
2855                 vm->async_ops.munmap_rebind_inflight = false;
2856
2857                 break;
2858         }
2859         case DRM_GPUVA_OP_UNMAP:
2860                 err = xe_vm_unbind(vm, vma, op->q, op->syncs,
2861                                    op->num_syncs, op->fence,
2862                                    op->flags & XE_VMA_OP_FIRST,
2863                                    op->flags & XE_VMA_OP_LAST);
2864                 break;
2865         case DRM_GPUVA_OP_PREFETCH:
2866                 err = xe_vm_prefetch(vm, vma, op->q, op->prefetch.region,
2867                                      op->syncs, op->num_syncs, op->fence,
2868                                      op->flags & XE_VMA_OP_FIRST,
2869                                      op->flags & XE_VMA_OP_LAST);
2870                 break;
2871         default:
2872                 drm_warn(&vm->xe->drm, "NOT POSSIBLE");
2873         }
2874
2875         if (err)
2876                 trace_xe_vma_fail(vma);
2877
2878         return err;
2879 }
2880
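/*
 * Execute a single VMA operation under drm_exec, retrying on eviction
 * contention. If a userptr VMA raced with an invalidation (-EAGAIN), re-pin
 * its pages and retry the operation.
 */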
2881 static int __xe_vma_op_execute(struct xe_vm *vm, struct xe_vma *vma,
2882                                struct xe_vma_op *op)
2883 {
2884         struct drm_exec exec;
2885         int err;
2886
2887 retry_userptr:
2888         drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT);
2889         drm_exec_until_all_locked(&exec) {
2890                 err = op_execute(&exec, vm, vma, op);
2891                 drm_exec_retry_on_contention(&exec);
2892                 if (err)
2893                         break;
2894         }
2895         drm_exec_fini(&exec);
2896
2897         if (err == -EAGAIN && xe_vma_is_userptr(vma)) {
2898                 lockdep_assert_held_write(&vm->lock);
2899                 err = xe_vma_userptr_pin_pages(vma);
2900                 if (!err)
2901                         goto retry_userptr;
2902
2903                 trace_xe_vma_fail(vma);
2904         }
2905
2906         return err;
2907 }
2908
2909 static int xe_vma_op_execute(struct xe_vm *vm, struct xe_vma_op *op)
2910 {
2911         int ret = 0;
2912
2913         lockdep_assert_held_write(&vm->lock);
2914
2915 #ifdef TEST_VM_ASYNC_OPS_ERROR
2916         if (op->inject_error) {
2917                 op->inject_error = false;
2918                 return -ENOMEM;
2919         }
2920 #endif
2921
2922         switch (op->base.op) {
2923         case DRM_GPUVA_OP_MAP:
2924                 ret = __xe_vma_op_execute(vm, op->map.vma, op);
2925                 break;
2926         case DRM_GPUVA_OP_REMAP:
2927         {
2928                 struct xe_vma *vma;
2929
2930                 if (!op->remap.unmap_done)
2931                         vma = gpuva_to_vma(op->base.remap.unmap->va);
2932                 else if (op->remap.prev)
2933                         vma = op->remap.prev;
2934                 else
2935                         vma = op->remap.next;
2936
2937                 ret = __xe_vma_op_execute(vm, vma, op);
2938                 break;
2939         }
2940         case DRM_GPUVA_OP_UNMAP:
2941                 ret = __xe_vma_op_execute(vm, gpuva_to_vma(op->base.unmap.va),
2942                                           op);
2943                 break;
2944         case DRM_GPUVA_OP_PREFETCH:
2945                 ret = __xe_vma_op_execute(vm,
2946                                           gpuva_to_vma(op->base.prefetch.va),
2947                                           op);
2948                 break;
2949         default:
2950                 drm_warn(&vm->xe->drm, "NOT POSSIBLE");
2951         }
2952
2953         return ret;
2954 }
2955
2956 static void xe_vma_op_cleanup(struct xe_vm *vm, struct xe_vma_op *op)
2957 {
2958         bool last = op->flags & XE_VMA_OP_LAST;
2959
2960         if (last) {
2961                 while (op->num_syncs--)
2962                         xe_sync_entry_cleanup(&op->syncs[op->num_syncs]);
2963                 kfree(op->syncs);
2964                 if (op->q)
2965                         xe_exec_queue_put(op->q);
2966                 if (op->fence)
2967                         dma_fence_put(&op->fence->fence);
2968         }
2969         if (!list_empty(&op->link)) {
2970                 spin_lock_irq(&vm->async_ops.lock);
2971                 list_del(&op->link);
2972                 spin_unlock_irq(&vm->async_ops.lock);
2973         }
2974         if (op->ops)
2975                 drm_gpuva_ops_free(&vm->gpuvm, op->ops);
2976         if (last)
2977                 xe_vm_put(vm);
2978 }
2979
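/*
 * Undo the effects of a committed (or partially committed) operation after a
 * failure: destroy the VMAs created for MAP/REMAP and re-insert the VMAs
 * that an UNMAP or REMAP had removed from the VM.
 */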
2980 static void xe_vma_op_unwind(struct xe_vm *vm, struct xe_vma_op *op,
2981                              bool post_commit, bool prev_post_commit,
2982                              bool next_post_commit)
2983 {
2984         lockdep_assert_held_write(&vm->lock);
2985
2986         switch (op->base.op) {
2987         case DRM_GPUVA_OP_MAP:
2988                 if (op->map.vma) {
2989                         prep_vma_destroy(vm, op->map.vma, post_commit);
2990                         xe_vma_destroy_unlocked(op->map.vma);
2991                 }
2992                 break;
2993         case DRM_GPUVA_OP_UNMAP:
2994         {
2995                 struct xe_vma *vma = gpuva_to_vma(op->base.unmap.va);
2996
2997                 if (vma) {
2998                         down_read(&vm->userptr.notifier_lock);
2999                         vma->gpuva.flags &= ~XE_VMA_DESTROYED;
3000                         up_read(&vm->userptr.notifier_lock);
3001                         if (post_commit)
3002                                 xe_vm_insert_vma(vm, vma);
3003                 }
3004                 break;
3005         }
3006         case DRM_GPUVA_OP_REMAP:
3007         {
3008                 struct xe_vma *vma = gpuva_to_vma(op->base.remap.unmap->va);
3009
3010                 if (op->remap.prev) {
3011                         prep_vma_destroy(vm, op->remap.prev, prev_post_commit);
3012                         xe_vma_destroy_unlocked(op->remap.prev);
3013                 }
3014                 if (op->remap.next) {
3015                         prep_vma_destroy(vm, op->remap.next, next_post_commit);
3016                         xe_vma_destroy_unlocked(op->remap.next);
3017                 }
3018                 if (vma) {
3019                         down_read(&vm->userptr.notifier_lock);
3020                         vma->gpuva.flags &= ~XE_VMA_DESTROYED;
3021                         up_read(&vm->userptr.notifier_lock);
3022                         if (post_commit)
3023                                 xe_vm_insert_vma(vm, vma);
3024                 }
3025                 break;
3026         }
3027         case DRM_GPUVA_OP_PREFETCH:
3028                 /* Nothing to do */
3029                 break;
3030         default:
3031                 drm_warn(&vm->xe->drm, "NOT POSSIBLE");
3032         }
3033 }
3034
3035 static struct xe_vma_op *next_vma_op(struct xe_vm *vm)
3036 {
3037         return list_first_entry_or_null(&vm->async_ops.pending,
3038                                         struct xe_vma_op, link);
3039 }
3040
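/*
 * xe_vma_op_work_func() - Worker that drains the VM's pending async bind ops.
 * Each op is executed under the VM lock; on failure the VM enters the async
 * error state (optionally notifying the user's error capture address) and
 * processing stops until an XE_VM_BIND_OP_RESTART bind is issued. For a
 * closed VM the remaining ops are flushed: their VMAs are destroyed and any
 * out-fence that has not been signalled yet is signalled.
 */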
3041 static void xe_vma_op_work_func(struct work_struct *w)
3042 {
3043         struct xe_vm *vm = container_of(w, struct xe_vm, async_ops.work);
3044
3045         for (;;) {
3046                 struct xe_vma_op *op;
3047                 int err;
3048
3049                 if (vm->async_ops.error && !xe_vm_is_closed(vm))
3050                         break;
3051
3052                 spin_lock_irq(&vm->async_ops.lock);
3053                 op = next_vma_op(vm);
3054                 spin_unlock_irq(&vm->async_ops.lock);
3055
3056                 if (!op)
3057                         break;
3058
3059                 if (!xe_vm_is_closed(vm)) {
3060                         down_write(&vm->lock);
3061                         err = xe_vma_op_execute(vm, op);
3062                         if (err) {
3063                                 drm_warn(&vm->xe->drm,
3064                                          "Async VM op(%d) failed with %d",
3065                                          op->base.op, err);
3066                                 vm_set_async_error(vm, err);
3067                                 up_write(&vm->lock);
3068
3069                                 if (vm->async_ops.error_capture.addr)
3070                                         vm_error_capture(vm, err, 0, 0, 0);
3071                                 break;
3072                         }
3073                         up_write(&vm->lock);
3074                 } else {
3075                         struct xe_vma *vma;
3076
3077                         switch (op->base.op) {
3078                         case DRM_GPUVA_OP_REMAP:
3079                                 vma = gpuva_to_vma(op->base.remap.unmap->va);
3080                                 trace_xe_vma_flush(vma);
3081
3082                                 down_write(&vm->lock);
3083                                 xe_vma_destroy_unlocked(vma);
3084                                 up_write(&vm->lock);
3085                                 break;
3086                         case DRM_GPUVA_OP_UNMAP:
3087                                 vma = gpuva_to_vma(op->base.unmap.va);
3088                                 trace_xe_vma_flush(vma);
3089
3090                                 down_write(&vm->lock);
3091                                 xe_vma_destroy_unlocked(vma);
3092                                 up_write(&vm->lock);
3093                                 break;
3094                         default:
3095                                 /* Nothing to do */
3096                                 break;
3097                         }
3098
3099                         if (op->fence && !test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
3100                                                    &op->fence->fence.flags)) {
3101                                 if (!xe_vm_no_dma_fences(vm)) {
3102                                         op->fence->started = true;
3103                                         wake_up_all(&op->fence->wq);
3104                                 }
3105                                 dma_fence_signal(&op->fence->fence);
3106                         }
3107                 }
3108
3109                 xe_vma_op_cleanup(vm, op);
3110         }
3111 }
3112
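/*
 * vm_bind_ioctl_ops_execute() - Run or queue the list of parsed VMA ops.
 * Synchronous binds execute the last op on the list directly and clean it up;
 * asynchronous binds install the last op's fence in the user syncs (signalling
 * it at once if no sync takes it), splice the list onto the VM's pending queue
 * and kick the async worker. On a synchronous failure the ops are unwound in
 * reverse order and cleaned up.
 */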
3113 static int vm_bind_ioctl_ops_execute(struct xe_vm *vm,
3114                                      struct list_head *ops_list, bool async)
3115 {
3116         struct xe_vma_op *op, *last_op, *next;
3117         int err;
3118
3119         lockdep_assert_held_write(&vm->lock);
3120
3121         last_op = list_last_entry(ops_list, struct xe_vma_op, link);
3122
3123         if (!async) {
3124                 err = xe_vma_op_execute(vm, last_op);
3125                 if (err)
3126                         goto unwind;
3127                 xe_vma_op_cleanup(vm, last_op);
3128         } else {
3129                 int i;
3130                 bool installed = false;
3131
3132                 for (i = 0; i < last_op->num_syncs; i++)
3133                         installed |= xe_sync_entry_signal(&last_op->syncs[i],
3134                                                           NULL,
3135                                                           &last_op->fence->fence);
3136                 if (!installed && last_op->fence)
3137                         dma_fence_signal(&last_op->fence->fence);
3138
3139                 spin_lock_irq(&vm->async_ops.lock);
3140                 list_splice_tail(ops_list, &vm->async_ops.pending);
3141                 spin_unlock_irq(&vm->async_ops.lock);
3142
3143                 if (!vm->async_ops.error)
3144                         queue_work(system_unbound_wq, &vm->async_ops.work);
3145         }
3146
3147         return 0;
3148
3149 unwind:
3150         list_for_each_entry_reverse(op, ops_list, link)
3151                 xe_vma_op_unwind(vm, op, op->flags & XE_VMA_OP_COMMITTED,
3152                                  op->flags & XE_VMA_OP_PREV_COMMITTED,
3153                                  op->flags & XE_VMA_OP_NEXT_COMMITTED);
3154         list_for_each_entry_safe(op, next, ops_list, link)
3155                 xe_vma_op_cleanup(vm, op);
3156
3157         return err;
3158 }
3159
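/*
 * vm_bind_ioctl_ops_unwind() - Unwind all GPUVA op lists created for a bind
 * IOCTL, walking the lists and the ops within each list in reverse order
 * before freeing the lists themselves.
 */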
3160 static void vm_bind_ioctl_ops_unwind(struct xe_vm *vm,
3161                                      struct drm_gpuva_ops **ops,
3162                                      int num_ops_list)
3163 {
3164         int i;
3165
3166         for (i = num_ops_list - 1; i >= 0; --i) {
3167                 struct drm_gpuva_ops *__ops = ops[i];
3168                 struct drm_gpuva_op *__op;
3169
3170                 if (!__ops)
3171                         continue;
3172
3173                 drm_gpuva_for_each_op_reverse(__op, __ops) {
3174                         struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
3175
3176                         xe_vma_op_unwind(vm, op,
3177                                          op->flags & XE_VMA_OP_COMMITTED,
3178                                          op->flags & XE_VMA_OP_PREV_COMMITTED,
3179                                          op->flags & XE_VMA_OP_NEXT_COMMITTED);
3180                 }
3181
3182                 drm_gpuva_ops_free(&vm->gpuvm, __ops);
3183         }
3184 }
3185
3186 #ifdef TEST_VM_ASYNC_OPS_ERROR
3187 #define SUPPORTED_FLAGS \
3188         (FORCE_ASYNC_OP_ERROR | XE_VM_BIND_FLAG_ASYNC | \
3189          XE_VM_BIND_FLAG_READONLY | XE_VM_BIND_FLAG_IMMEDIATE | \
3190          XE_VM_BIND_FLAG_NULL | 0xffff)
3191 #else
3192 #define SUPPORTED_FLAGS \
3193         (XE_VM_BIND_FLAG_ASYNC | XE_VM_BIND_FLAG_READONLY | \
3194          XE_VM_BIND_FLAG_IMMEDIATE | XE_VM_BIND_FLAG_NULL | 0xffff)
3195 #endif
3196 #define XE_64K_PAGE_MASK 0xffffull
3197
3198 #define MAX_BINDS       512     /* FIXME: Picking random upper limit */
3199
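/*
 * vm_bind_ioctl_check_args() - Copy in and validate the user's bind ops.
 * Rejects unknown flags, invalid op/flag/object combinations, unaligned
 * addresses, ranges and offsets, and mixing of synchronous and asynchronous
 * ops within a single IOCTL.
 */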
3200 static int vm_bind_ioctl_check_args(struct xe_device *xe,
3201                                     struct drm_xe_vm_bind *args,
3202                                     struct drm_xe_vm_bind_op **bind_ops,
3203                                     bool *async)
3204 {
3205         int err;
3206         int i;
3207
3208         if (XE_IOCTL_DBG(xe, args->extensions) ||
3209             XE_IOCTL_DBG(xe, !args->num_binds) ||
3210             XE_IOCTL_DBG(xe, args->num_binds > MAX_BINDS))
3211                 return -EINVAL;
3212
3213         if (args->num_binds > 1) {
3214                 u64 __user *bind_user =
3215                         u64_to_user_ptr(args->vector_of_binds);
3216
3217                 *bind_ops = kmalloc(sizeof(struct drm_xe_vm_bind_op) *
3218                                     args->num_binds, GFP_KERNEL);
3219                 if (!*bind_ops)
3220                         return -ENOMEM;
3221
3222                 err = __copy_from_user(*bind_ops, bind_user,
3223                                        sizeof(struct drm_xe_vm_bind_op) *
3224                                        args->num_binds);
3225                 if (XE_IOCTL_DBG(xe, err)) {
3226                         err = -EFAULT;
3227                         goto free_bind_ops;
3228                 }
3229         } else {
3230                 *bind_ops = &args->bind;
3231         }
3232
3233         for (i = 0; i < args->num_binds; ++i) {
3234                 u64 range = (*bind_ops)[i].range;
3235                 u64 addr = (*bind_ops)[i].addr;
3236                 u32 op = (*bind_ops)[i].op;
3237                 u32 flags = (*bind_ops)[i].flags;
3238                 u32 obj = (*bind_ops)[i].obj;
3239                 u64 obj_offset = (*bind_ops)[i].obj_offset;
3240                 u32 region = (*bind_ops)[i].region;
3241                 bool is_null = flags & XE_VM_BIND_FLAG_NULL;
3242
3243                 if (i == 0) {
3244                         *async = !!(flags & XE_VM_BIND_FLAG_ASYNC);
3245                 } else if (XE_IOCTL_DBG(xe, !*async) ||
3246                            XE_IOCTL_DBG(xe, !(flags & XE_VM_BIND_FLAG_ASYNC)) ||
3247                            XE_IOCTL_DBG(xe, op == XE_VM_BIND_OP_RESTART)) {
3248                         err = -EINVAL;
3249                         goto free_bind_ops;
3250                 }
3251
3252                 if (XE_IOCTL_DBG(xe, !*async &&
3253                                  op == XE_VM_BIND_OP_UNMAP_ALL)) {
3254                         err = -EINVAL;
3255                         goto free_bind_ops;
3256                 }
3257
3258                 if (XE_IOCTL_DBG(xe, !*async &&
3259                                  op == XE_VM_BIND_OP_PREFETCH)) {
3260                         err = -EINVAL;
3261                         goto free_bind_ops;
3262                 }
3263
3264                 if (XE_IOCTL_DBG(xe, op > XE_VM_BIND_OP_PREFETCH) ||
3265                     XE_IOCTL_DBG(xe, flags & ~SUPPORTED_FLAGS) ||
3266                     XE_IOCTL_DBG(xe, obj && is_null) ||
3267                     XE_IOCTL_DBG(xe, obj_offset && is_null) ||
3268                     XE_IOCTL_DBG(xe, op != XE_VM_BIND_OP_MAP &&
3269                                  is_null) ||
3270                     XE_IOCTL_DBG(xe, !obj &&
3271                                  op == XE_VM_BIND_OP_MAP &&
3272                                  !is_null) ||
3273                     XE_IOCTL_DBG(xe, !obj &&
3274                                  op == XE_VM_BIND_OP_UNMAP_ALL) ||
3275                     XE_IOCTL_DBG(xe, addr &&
3276                                  op == XE_VM_BIND_OP_UNMAP_ALL) ||
3277                     XE_IOCTL_DBG(xe, range &&
3278                                  op == XE_VM_BIND_OP_UNMAP_ALL) ||
3279                     XE_IOCTL_DBG(xe, obj &&
3280                                  op == XE_VM_BIND_OP_MAP_USERPTR) ||
3281                     XE_IOCTL_DBG(xe, obj &&
3282                                  op == XE_VM_BIND_OP_PREFETCH) ||
3283                     XE_IOCTL_DBG(xe, region &&
3284                                  op != XE_VM_BIND_OP_PREFETCH) ||
3285                     XE_IOCTL_DBG(xe, !(BIT(region) &
3286                                        xe->info.mem_region_mask)) ||
3287                     XE_IOCTL_DBG(xe, obj &&
3288                                  op == XE_VM_BIND_OP_UNMAP)) {
3289                         err = -EINVAL;
3290                         goto free_bind_ops;
3291                 }
3292
3293                 if (XE_IOCTL_DBG(xe, obj_offset & ~PAGE_MASK) ||
3294                     XE_IOCTL_DBG(xe, addr & ~PAGE_MASK) ||
3295                     XE_IOCTL_DBG(xe, range & ~PAGE_MASK) ||
3296                     XE_IOCTL_DBG(xe, !range && op !=
3297                                  XE_VM_BIND_OP_RESTART &&
3298                                  op != XE_VM_BIND_OP_UNMAP_ALL)) {
3299                         err = -EINVAL;
3300                         goto free_bind_ops;
3301                 }
3302         }
3303
3304         return 0;
3305
3306 free_bind_ops:
3307         if (args->num_binds > 1)
3308                 kfree(*bind_ops);
3309         return err;
3310 }
3311
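/*
 * A rough userspace sketch of a single asynchronous MAP bind that satisfies
 * vm_bind_ioctl_check_args(): addr and range are non-zero and page aligned
 * (64 KiB aligned for 64K BOs) and the ASYNC flag matches how the VM was
 * created. Assumptions, not taken from this file: the DRM_IOCTL_XE_VM_BIND
 * request macro and libdrm's drmIoctl(); fd, vm_id and bo_handle are
 * placeholders, and exec_queue_id is left 0 (no explicit bind queue).
 *
 *      struct drm_xe_vm_bind args = {
 *              .vm_id = vm_id,
 *              .exec_queue_id = 0,
 *              .num_binds = 1,
 *              .bind.obj = bo_handle,
 *              .bind.obj_offset = 0,
 *              .bind.addr = 0x1a0000,
 *              .bind.range = 0x10000,
 *              .bind.op = XE_VM_BIND_OP_MAP,
 *              .bind.flags = XE_VM_BIND_FLAG_ASYNC,
 *              .num_syncs = 0,
 *      };
 *
 *      int err = drmIoctl(fd, DRM_IOCTL_XE_VM_BIND, &args);
 */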
3312 int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
3313 {
3314         struct xe_device *xe = to_xe_device(dev);
3315         struct xe_file *xef = to_xe_file(file);
3316         struct drm_xe_vm_bind *args = data;
3317         struct drm_xe_sync __user *syncs_user;
3318         struct xe_bo **bos = NULL;
3319         struct drm_gpuva_ops **ops = NULL;
3320         struct xe_vm *vm;
3321         struct xe_exec_queue *q = NULL;
3322         u32 num_syncs;
3323         struct xe_sync_entry *syncs = NULL;
3324         struct drm_xe_vm_bind_op *bind_ops;
3325         LIST_HEAD(ops_list);
3326         bool async;
3327         int err;
3328         int i;
3329
3330         err = vm_bind_ioctl_check_args(xe, args, &bind_ops, &async);
3331         if (err)
3332                 return err;
3333
3334         if (args->exec_queue_id) {
3335                 q = xe_exec_queue_lookup(xef, args->exec_queue_id);
3336                 if (XE_IOCTL_DBG(xe, !q)) {
3337                         err = -ENOENT;
3338                         goto free_objs;
3339                 }
3340
3341                 if (XE_IOCTL_DBG(xe, !(q->flags & EXEC_QUEUE_FLAG_VM))) {
3342                         err = -EINVAL;
3343                         goto put_exec_queue;
3344                 }
3345         }
3346
3347         vm = xe_vm_lookup(xef, args->vm_id);
3348         if (XE_IOCTL_DBG(xe, !vm)) {
3349                 err = -EINVAL;
3350                 goto put_exec_queue;
3351         }
3352
3353         err = down_write_killable(&vm->lock);
3354         if (err)
3355                 goto put_vm;
3356
3357         if (XE_IOCTL_DBG(xe, xe_vm_is_closed_or_banned(vm))) {
3358                 err = -ENOENT;
3359                 goto release_vm_lock;
3360         }
3361
3362         if (bind_ops[0].op == XE_VM_BIND_OP_RESTART) {
3363                 if (XE_IOCTL_DBG(xe, !(vm->flags & XE_VM_FLAG_ASYNC_BIND_OPS)))
3364                         err = -EOPNOTSUPP;
3365                 if (XE_IOCTL_DBG(xe, !err && args->num_syncs))
3366                         err = -EINVAL;
3367                 if (XE_IOCTL_DBG(xe, !err && !vm->async_ops.error))
3368                         err = -EPROTO;
3369
3370                 if (!err) {
3371                         trace_xe_vm_restart(vm);
3372                         vm_set_async_error(vm, 0);
3373
3374                         queue_work(system_unbound_wq, &vm->async_ops.work);
3375
3376                         /* Rebinds may have been blocked, give worker a kick */
3377                         if (xe_vm_in_compute_mode(vm))
3378                                 xe_vm_queue_rebind_worker(vm);
3379                 }
3380
3381                 goto release_vm_lock;
3382         }
3383
3384         if (XE_IOCTL_DBG(xe, !vm->async_ops.error &&
3385                          async != !!(vm->flags & XE_VM_FLAG_ASYNC_BIND_OPS))) {
3386                 err = -EOPNOTSUPP;
3387                 goto release_vm_lock;
3388         }
3389
3390         for (i = 0; i < args->num_binds; ++i) {
3391                 u64 range = bind_ops[i].range;
3392                 u64 addr = bind_ops[i].addr;
3393
3394                 if (XE_IOCTL_DBG(xe, range > vm->size) ||
3395                     XE_IOCTL_DBG(xe, addr > vm->size - range)) {
3396                         err = -EINVAL;
3397                         goto release_vm_lock;
3398                 }
3399
3400                 if (bind_ops[i].tile_mask) {
3401                         u64 valid_tiles = BIT(xe->info.tile_count) - 1;
3402
3403                         if (XE_IOCTL_DBG(xe, bind_ops[i].tile_mask &
3404                                          ~valid_tiles)) {
3405                                 err = -EINVAL;
3406                                 goto release_vm_lock;
3407                         }
3408                 }
3409         }
3410
3411         bos = kcalloc(args->num_binds, sizeof(*bos), GFP_KERNEL);
3412         if (!bos) {
3413                 err = -ENOMEM;
3414                 goto release_vm_lock;
3415         }
3416
3417         ops = kcalloc(args->num_binds, sizeof(*ops), GFP_KERNEL);
3418         if (!ops) {
3419                 err = -ENOMEM;
3420                 goto release_vm_lock;
3421         }
3422
3423         for (i = 0; i < args->num_binds; ++i) {
3424                 struct drm_gem_object *gem_obj;
3425                 u64 range = bind_ops[i].range;
3426                 u64 addr = bind_ops[i].addr;
3427                 u32 obj = bind_ops[i].obj;
3428                 u64 obj_offset = bind_ops[i].obj_offset;
3429
3430                 if (!obj)
3431                         continue;
3432
3433                 gem_obj = drm_gem_object_lookup(file, obj);
3434                 if (XE_IOCTL_DBG(xe, !gem_obj)) {
3435                         err = -ENOENT;
3436                         goto put_obj;
3437                 }
3438                 bos[i] = gem_to_xe_bo(gem_obj);
3439
3440                 if (XE_IOCTL_DBG(xe, range > bos[i]->size) ||
3441                     XE_IOCTL_DBG(xe, obj_offset >
3442                                  bos[i]->size - range)) {
3443                         err = -EINVAL;
3444                         goto put_obj;
3445                 }
3446
3447                 if (bos[i]->flags & XE_BO_INTERNAL_64K) {
3448                         if (XE_IOCTL_DBG(xe, obj_offset &
3449                                          XE_64K_PAGE_MASK) ||
3450                             XE_IOCTL_DBG(xe, addr & XE_64K_PAGE_MASK) ||
3451                             XE_IOCTL_DBG(xe, range & XE_64K_PAGE_MASK)) {
3452                                 err = -EINVAL;
3453                                 goto put_obj;
3454                         }
3455                 }
3456         }
3457
3458         if (args->num_syncs) {
3459                 syncs = kcalloc(args->num_syncs, sizeof(*syncs), GFP_KERNEL);
3460                 if (!syncs) {
3461                         err = -ENOMEM;
3462                         goto put_obj;
3463                 }
3464         }
3465
3466         syncs_user = u64_to_user_ptr(args->syncs);
3467         for (num_syncs = 0; num_syncs < args->num_syncs; num_syncs++) {
3468                 err = xe_sync_entry_parse(xe, xef, &syncs[num_syncs],
3469                                           &syncs_user[num_syncs], false,
3470                                           xe_vm_no_dma_fences(vm));
3471                 if (err)
3472                         goto free_syncs;
3473         }
3474
3475         /* Do some error checking first to make the unwind easier */
3476         for (i = 0; i < args->num_binds; ++i) {
3477                 u64 range = bind_ops[i].range;
3478                 u64 addr = bind_ops[i].addr;
3479                 u32 op = bind_ops[i].op;
3480                 u32 flags = bind_ops[i].flags;
3481
3482                 err = vm_bind_ioctl_lookup_vma(vm, bos[i], addr, range, op, flags);
3483                 if (err)
3484                         goto free_syncs;
3485         }
3486
3487         for (i = 0; i < args->num_binds; ++i) {
3488                 u64 range = bind_ops[i].range;
3489                 u64 addr = bind_ops[i].addr;
3490                 u32 op = bind_ops[i].op;
3491                 u32 flags = bind_ops[i].flags;
3492                 u64 obj_offset = bind_ops[i].obj_offset;
3493                 u8 tile_mask = bind_ops[i].tile_mask;
3494                 u32 region = bind_ops[i].region;
3495
3496                 ops[i] = vm_bind_ioctl_ops_create(vm, bos[i], obj_offset,
3497                                                   addr, range, op, flags,
3498                                                   tile_mask, region);
3499                 if (IS_ERR(ops[i])) {
3500                         err = PTR_ERR(ops[i]);
3501                         ops[i] = NULL;
3502                         goto unwind_ops;
3503                 }
3504
3505                 err = vm_bind_ioctl_ops_parse(vm, q, ops[i], syncs, num_syncs,
3506                                               &ops_list,
3507                                               i == args->num_binds - 1,
3508                                               async);
3509                 if (err)
3510                         goto unwind_ops;
3511         }
3512
3513         /* Nothing to do */
3514         if (list_empty(&ops_list)) {
3515                 err = -ENODATA;
3516                 goto unwind_ops;
3517         }
3518
3519         err = vm_bind_ioctl_ops_execute(vm, &ops_list, async);
3520         up_write(&vm->lock);
3521
3522         for (i = 0; i < args->num_binds; ++i)
3523                 xe_bo_put(bos[i]);
3524
3525         kfree(bos);
3526         kfree(ops);
3527         if (args->num_binds > 1)
3528                 kfree(bind_ops);
3529
3530         return err;
3531
3532 unwind_ops:
3533         vm_bind_ioctl_ops_unwind(vm, ops, args->num_binds);
3534 free_syncs:
3535         for (i = 0; err == -ENODATA && i < num_syncs; i++)
3536                 xe_sync_entry_signal(&syncs[i], NULL, dma_fence_get_stub());
3537         while (num_syncs--)
3538                 xe_sync_entry_cleanup(&syncs[num_syncs]);
3539
3540         kfree(syncs);
3541 put_obj:
3542         for (i = 0; i < args->num_binds; ++i)
3543                 xe_bo_put(bos[i]);
3544 release_vm_lock:
3545         up_write(&vm->lock);
3546 put_vm:
3547         xe_vm_put(vm);
3548 put_exec_queue:
3549         if (q)
3550                 xe_exec_queue_put(q);
3551 free_objs:
3552         kfree(bos);
3553         kfree(ops);
3554         if (args->num_binds > 1)
3555                 kfree(bind_ops);
3556         return err == -ENODATA ? 0 : err;
3557 }
3558
3559 /**
3560  * xe_vm_lock() - Lock the vm's dma_resv object
3561  * @vm: The struct xe_vm whose lock is to be locked
3562  * @intr: Whether waits for a contended lock should be interruptible
3563  *
3564  * Return: 0 on success, -EINTR if @intr is true and the wait for a
3565  * contended lock was interrupted. If @intr is false, the function
3566  * always returns 0.
3567  */
3568 int xe_vm_lock(struct xe_vm *vm, bool intr)
3569 {
3570         if (intr)
3571                 return dma_resv_lock_interruptible(xe_vm_resv(vm), NULL);
3572
3573         return dma_resv_lock(xe_vm_resv(vm), NULL);
3574 }
3575
3576 /**
3577  * xe_vm_unlock() - Unlock the vm's dma_resv object
3578  * @vm: The struct xe_vm whose lock is to be released.
3579  *
3580  * Unlock the vm's dma_resv object that was locked by xe_vm_lock().
3581  */
3582 void xe_vm_unlock(struct xe_vm *vm)
3583 {
3584         dma_resv_unlock(xe_vm_resv(vm));
3585 }
3586
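/*
 * A minimal usage sketch (illustrative only, not taken from elsewhere in this
 * file): pairing xe_vm_lock() and xe_vm_unlock() around work that must run
 * with the VM's dma_resv held. xe_vm_lock(vm, true) waits interruptibly and
 * may fail with -EINTR; the helper name is hypothetical and the fence-slot
 * reservation merely stands in for work requiring the resv lock.
 *
 *      int example_touch_vm_resv(struct xe_vm *vm)
 *      {
 *              int err = xe_vm_lock(vm, true);
 *
 *              if (err)
 *                      return err;
 *
 *              err = dma_resv_reserve_fences(xe_vm_resv(vm), 1);
 *
 *              xe_vm_unlock(vm);
 *              return err;
 *      }
 */
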
3587 /**
3588  * xe_vm_invalidate_vma - invalidate GPU mappings for VMA without a lock
3589  * @vma: VMA to invalidate
3590  *
3591  * Walks the page-table leaves and zeroes the entries owned by this VMA,
3592  * invalidates the TLBs, and blocks until the TLB invalidation has
3593  * completed.
3594  *
3595  * Returns 0 for success, negative error code otherwise.
3596  */
3597 int xe_vm_invalidate_vma(struct xe_vma *vma)
3598 {
3599         struct xe_device *xe = xe_vma_vm(vma)->xe;
3600         struct xe_tile *tile;
3601         u32 tile_needs_invalidate = 0;
3602         int seqno[XE_MAX_TILES_PER_DEVICE];
3603         u8 id;
3604         int ret;
3605
3606         xe_assert(xe, xe_vm_in_fault_mode(xe_vma_vm(vma)));
3607         xe_assert(xe, !xe_vma_is_null(vma));
3608         trace_xe_vma_usm_invalidate(vma);
3609
3610         /* Check that we don't race with page-table updates */
3611         if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
3612                 if (xe_vma_is_userptr(vma)) {
3613                         WARN_ON_ONCE(!mmu_interval_check_retry
3614                                      (&vma->userptr.notifier,
3615                                       vma->userptr.notifier_seq));
3616                         WARN_ON_ONCE(!dma_resv_test_signaled(xe_vm_resv(xe_vma_vm(vma)),
3617                                                              DMA_RESV_USAGE_BOOKKEEP));
3618
3619                 } else {
3620                         xe_bo_assert_held(xe_vma_bo(vma));
3621                 }
3622         }
3623
3624         for_each_tile(tile, xe, id) {
3625                 if (xe_pt_zap_ptes(tile, vma)) {
3626                         tile_needs_invalidate |= BIT(id);
3627                         xe_device_wmb(xe);
3628                         /*
3629                          * FIXME: We potentially need to invalidate multiple
3630                          * GTs within the tile
3631                          */
3632                         seqno[id] = xe_gt_tlb_invalidation_vma(tile->primary_gt, NULL, vma);
3633                         if (seqno[id] < 0)
3634                                 return seqno[id];
3635                 }
3636         }
3637
3638         for_each_tile(tile, xe, id) {
3639                 if (tile_needs_invalidate & BIT(id)) {
3640                         ret = xe_gt_tlb_invalidation_wait(tile->primary_gt, seqno[id]);
3641                         if (ret < 0)
3642                                 return ret;
3643                 }
3644         }
3645
3646         vma->usm.tile_invalidated = vma->tile_mask;
3647
3648         return 0;
3649 }
3650
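/*
 * xe_analyze_vm() - Dump the VM's page-table root and all GPUVA mappings to a
 * drm_printer, including each mapping's range, size, backing address and
 * whether the backing is NULL, userptr, VRAM or system memory.
 */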
3651 int xe_analyze_vm(struct drm_printer *p, struct xe_vm *vm, int gt_id)
3652 {
3653         struct drm_gpuva *gpuva;
3654         bool is_vram;
3655         u64 addr;
3656
3657         if (!down_read_trylock(&vm->lock)) {
3658                 drm_printf(p, " Failed to acquire VM lock to dump capture\n");
3659                 return 0;
3660         }
3661         if (vm->pt_root[gt_id]) {
3662                 addr = xe_bo_addr(vm->pt_root[gt_id]->bo, 0, XE_PAGE_SIZE);
3663                 is_vram = xe_bo_is_vram(vm->pt_root[gt_id]->bo);
3664                 drm_printf(p, " VM root: A:0x%llx %s\n", addr,
3665                            is_vram ? "VRAM" : "SYS");
3666         }
3667
3668         drm_gpuvm_for_each_va(gpuva, &vm->gpuvm) {
3669                 struct xe_vma *vma = gpuva_to_vma(gpuva);
3670                 bool is_userptr = xe_vma_is_userptr(vma);
3671                 bool is_null = xe_vma_is_null(vma);
3672
3673                 if (is_null) {
3674                         addr = 0;
3675                 } else if (is_userptr) {
3676                         struct xe_res_cursor cur;
3677
3678                         if (vma->userptr.sg) {
3679                                 xe_res_first_sg(vma->userptr.sg, 0, XE_PAGE_SIZE,
3680                                                 &cur);
3681                                 addr = xe_res_dma(&cur);
3682                         } else {
3683                                 addr = 0;
3684                         }
3685                 } else {
3686                         addr = __xe_bo_addr(xe_vma_bo(vma), 0, XE_PAGE_SIZE);
3687                         is_vram = xe_bo_is_vram(xe_vma_bo(vma));
3688                 }
3689                 drm_printf(p, " [%016llx-%016llx] S:0x%016llx A:%016llx %s\n",
3690                            xe_vma_start(vma), xe_vma_end(vma) - 1,
3691                            xe_vma_size(vma),
3692                            addr, is_null ? "NULL" : is_userptr ? "USR" :
3693                            is_vram ? "VRAM" : "SYS");
3694         }
3695         up_read(&vm->lock);
3696
3697         return 0;
3698 }