drm/xe: Use pat_index to encode pde/pte
drivers/gpu/drm/xe/xe_vm.c
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2021 Intel Corporation
4  */
5
6 #include "xe_vm.h"
7
8 #include <linux/dma-fence-array.h>
9
10 #include <drm/drm_exec.h>
11 #include <drm/drm_print.h>
12 #include <drm/ttm/ttm_execbuf_util.h>
13 #include <drm/ttm/ttm_tt.h>
14 #include <drm/xe_drm.h>
15 #include <linux/delay.h>
16 #include <linux/kthread.h>
17 #include <linux/mm.h>
18 #include <linux/swap.h>
19
20 #include "xe_bo.h"
21 #include "xe_device.h"
22 #include "xe_drm_client.h"
23 #include "xe_exec_queue.h"
24 #include "xe_gt.h"
25 #include "xe_gt_pagefault.h"
26 #include "xe_gt_tlb_invalidation.h"
27 #include "xe_migrate.h"
28 #include "xe_pm.h"
29 #include "xe_preempt_fence.h"
30 #include "xe_pt.h"
31 #include "xe_res_cursor.h"
32 #include "xe_sync.h"
33 #include "xe_trace.h"
34 #include "generated/xe_wa_oob.h"
35 #include "xe_wa.h"
36
37 #define TEST_VM_ASYNC_OPS_ERROR
38
39 static struct drm_gem_object *xe_vm_obj(struct xe_vm *vm)
40 {
41         return vm->gpuvm.r_obj;
42 }
43
44 /**
45  * xe_vma_userptr_check_repin() - Advisory check for repin needed
46  * @vma: The userptr vma
47  *
48  * Check if the userptr vma has been invalidated since last successful
49  * repin. The check is advisory only and the function can be called
50  * without the vm->userptr.notifier_lock held. There is no guarantee that the
51  * vma userptr will remain valid after a lockless check, so typically
52  * the call needs to be followed by a proper check under the notifier_lock.
53  *
54  * Return: 0 if userptr vma is valid, -EAGAIN otherwise; repin recommended.
55  */
56 int xe_vma_userptr_check_repin(struct xe_vma *vma)
57 {
58         return mmu_interval_check_retry(&vma->userptr.notifier,
59                                         vma->userptr.notifier_seq) ?
60                 -EAGAIN : 0;
61 }
62
63 int xe_vma_userptr_pin_pages(struct xe_vma *vma)
64 {
65         struct xe_vm *vm = xe_vma_vm(vma);
66         struct xe_device *xe = vm->xe;
67         const unsigned long num_pages = xe_vma_size(vma) >> PAGE_SHIFT;
68         struct page **pages;
69         bool in_kthread = !current->mm;
70         unsigned long notifier_seq;
71         int pinned, ret, i;
72         bool read_only = xe_vma_read_only(vma);
73
74         lockdep_assert_held(&vm->lock);
75         xe_assert(xe, xe_vma_is_userptr(vma));
76 retry:
77         if (vma->gpuva.flags & XE_VMA_DESTROYED)
78                 return 0;
79
80         notifier_seq = mmu_interval_read_begin(&vma->userptr.notifier);
81         if (notifier_seq == vma->userptr.notifier_seq)
82                 return 0;
83
84         pages = kvmalloc_array(num_pages, sizeof(*pages), GFP_KERNEL);
85         if (!pages)
86                 return -ENOMEM;
87
88         if (vma->userptr.sg) {
89                 dma_unmap_sgtable(xe->drm.dev,
90                                   vma->userptr.sg,
91                                   read_only ? DMA_TO_DEVICE :
92                                   DMA_BIDIRECTIONAL, 0);
93                 sg_free_table(vma->userptr.sg);
94                 vma->userptr.sg = NULL;
95         }
96
97         pinned = ret = 0;
98         if (in_kthread) {
99                 if (!mmget_not_zero(vma->userptr.notifier.mm)) {
100                         ret = -EFAULT;
101                         goto mm_closed;
102                 }
103                 kthread_use_mm(vma->userptr.notifier.mm);
104         }
105
106         while (pinned < num_pages) {
107                 ret = get_user_pages_fast(xe_vma_userptr(vma) +
108                                           pinned * PAGE_SIZE,
109                                           num_pages - pinned,
110                                           read_only ? 0 : FOLL_WRITE,
111                                           &pages[pinned]);
112                 if (ret < 0) {
113                         if (in_kthread)
114                                 ret = 0;
115                         break;
116                 }
117
118                 pinned += ret;
119                 ret = 0;
120         }
121
122         if (in_kthread) {
123                 kthread_unuse_mm(vma->userptr.notifier.mm);
124                 mmput(vma->userptr.notifier.mm);
125         }
126 mm_closed:
127         if (ret)
128                 goto out;
129
130         ret = sg_alloc_table_from_pages_segment(&vma->userptr.sgt, pages,
131                                                 pinned, 0,
132                                                 (u64)pinned << PAGE_SHIFT,
133                                                 xe_sg_segment_size(xe->drm.dev),
134                                                 GFP_KERNEL);
135         if (ret) {
136                 vma->userptr.sg = NULL;
137                 goto out;
138         }
139         vma->userptr.sg = &vma->userptr.sgt;
140
141         ret = dma_map_sgtable(xe->drm.dev, vma->userptr.sg,
142                               read_only ? DMA_TO_DEVICE :
143                               DMA_BIDIRECTIONAL,
144                               DMA_ATTR_SKIP_CPU_SYNC |
145                               DMA_ATTR_NO_KERNEL_MAPPING);
146         if (ret) {
147                 sg_free_table(vma->userptr.sg);
148                 vma->userptr.sg = NULL;
149                 goto out;
150         }
151
152         for (i = 0; i < pinned; ++i) {
153                 if (!read_only) {
154                         lock_page(pages[i]);
155                         set_page_dirty(pages[i]);
156                         unlock_page(pages[i]);
157                 }
158
159                 mark_page_accessed(pages[i]);
160         }
161
162 out:
163         release_pages(pages, pinned);
164         kvfree(pages);
165
166         if (!(ret < 0)) {
167                 vma->userptr.notifier_seq = notifier_seq;
168                 if (xe_vma_userptr_check_repin(vma) == -EAGAIN)
169                         goto retry;
170         }
171
172         return ret < 0 ? ret : 0;
173 }
174
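/*
 * Returns true if any compute exec queue on @vm either has no preempt fence
 * installed or has one with software signaling already enabled, i.e. a
 * preemption is pending or in flight.
 */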
175 static bool preempt_fences_waiting(struct xe_vm *vm)
176 {
177         struct xe_exec_queue *q;
178
179         lockdep_assert_held(&vm->lock);
180         xe_vm_assert_held(vm);
181
182         list_for_each_entry(q, &vm->preempt.exec_queues, compute.link) {
183                 if (!q->compute.pfence ||
184                     (q->compute.pfence && test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
185                                                    &q->compute.pfence->flags))) {
186                         return true;
187                 }
188         }
189
190         return false;
191 }
192
193 static void free_preempt_fences(struct list_head *list)
194 {
195         struct list_head *link, *next;
196
197         list_for_each_safe(link, next, list)
198                 xe_preempt_fence_free(to_preempt_fence_from_link(link));
199 }
200
201 static int alloc_preempt_fences(struct xe_vm *vm, struct list_head *list,
202                                 unsigned int *count)
203 {
204         lockdep_assert_held(&vm->lock);
205         xe_vm_assert_held(vm);
206
207         if (*count >= vm->preempt.num_exec_queues)
208                 return 0;
209
210         for (; *count < vm->preempt.num_exec_queues; ++(*count)) {
211                 struct xe_preempt_fence *pfence = xe_preempt_fence_alloc();
212
213                 if (IS_ERR(pfence))
214                         return PTR_ERR(pfence);
215
216                 list_move_tail(xe_preempt_fence_link(pfence), list);
217         }
218
219         return 0;
220 }
221
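/*
 * Wait for the currently installed preempt fence of every compute exec queue
 * to signal, then drop and clear it. Returns -ETIME if a wait fails.
 */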
222 static int wait_for_existing_preempt_fences(struct xe_vm *vm)
223 {
224         struct xe_exec_queue *q;
225
226         xe_vm_assert_held(vm);
227
228         list_for_each_entry(q, &vm->preempt.exec_queues, compute.link) {
229                 if (q->compute.pfence) {
230                         long timeout = dma_fence_wait(q->compute.pfence, false);
231
232                         if (timeout < 0)
233                                 return -ETIME;
234                         dma_fence_put(q->compute.pfence);
235                         q->compute.pfence = NULL;
236                 }
237         }
238
239         return 0;
240 }
241
242 static bool xe_vm_is_idle(struct xe_vm *vm)
243 {
244         struct xe_exec_queue *q;
245
246         xe_vm_assert_held(vm);
247         list_for_each_entry(q, &vm->preempt.exec_queues, compute.link) {
248                 if (!xe_exec_queue_is_idle(q))
249                         return false;
250         }
251
252         return true;
253 }
254
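/*
 * Consume one pre-allocated fence from @list per compute exec queue, arm it
 * with the queue's fence context and next seqno, and install it as the
 * queue's new preempt fence, dropping the previous one.
 */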
255 static void arm_preempt_fences(struct xe_vm *vm, struct list_head *list)
256 {
257         struct list_head *link;
258         struct xe_exec_queue *q;
259
260         list_for_each_entry(q, &vm->preempt.exec_queues, compute.link) {
261                 struct dma_fence *fence;
262
263                 link = list->next;
264                 xe_assert(vm->xe, link != list);
265
266                 fence = xe_preempt_fence_arm(to_preempt_fence_from_link(link),
267                                              q, q->compute.context,
268                                              ++q->compute.seqno);
269                 dma_fence_put(q->compute.pfence);
270                 q->compute.pfence = fence;
271         }
272 }
273
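/*
 * Install the current preempt fence of every compute exec queue into @bo's
 * reservation object as a DMA_RESV_USAGE_BOOKKEEP fence, reserving the
 * required fence slots first.
 */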
274 static int add_preempt_fences(struct xe_vm *vm, struct xe_bo *bo)
275 {
276         struct xe_exec_queue *q;
277         int err;
278
279         err = xe_bo_lock(bo, true);
280         if (err)
281                 return err;
282
283         err = dma_resv_reserve_fences(bo->ttm.base.resv, vm->preempt.num_exec_queues);
284         if (err)
285                 goto out_unlock;
286
287         list_for_each_entry(q, &vm->preempt.exec_queues, compute.link)
288                 if (q->compute.pfence) {
289                         dma_resv_add_fence(bo->ttm.base.resv,
290                                            q->compute.pfence,
291                                            DMA_RESV_USAGE_BOOKKEEP);
292                 }
293
294 out_unlock:
295         xe_bo_unlock(bo);
296         return err;
297 }
298
299 /**
300  * xe_vm_fence_all_extobjs() - Add a fence to vm's external objects' resv
301  * @vm: The vm.
302  * @fence: The fence to add.
303  * @usage: The resv usage for the fence.
304  *
305  * Loops over all of the vm's external object bindings and adds a @fence
306  * with the given @usage to all of the external objects' reservation
307  * objects.
308  */
309 void xe_vm_fence_all_extobjs(struct xe_vm *vm, struct dma_fence *fence,
310                              enum dma_resv_usage usage)
311 {
312         struct xe_vma *vma;
313
314         list_for_each_entry(vma, &vm->extobj.list, extobj.link)
315                 dma_resv_add_fence(xe_vma_bo(vma)->ttm.base.resv, fence, usage);
316 }
317
318 static void resume_and_reinstall_preempt_fences(struct xe_vm *vm)
319 {
320         struct xe_exec_queue *q;
321
322         lockdep_assert_held(&vm->lock);
323         xe_vm_assert_held(vm);
324
325         list_for_each_entry(q, &vm->preempt.exec_queues, compute.link) {
326                 q->ops->resume(q);
327
328                 dma_resv_add_fence(xe_vm_resv(vm), q->compute.pfence,
329                                    DMA_RESV_USAGE_BOOKKEEP);
330                 xe_vm_fence_all_extobjs(vm, q->compute.pfence,
331                                         DMA_RESV_USAGE_BOOKKEEP);
332         }
333 }
334
335 int xe_vm_add_compute_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
336 {
337         struct drm_exec exec;
338         struct dma_fence *pfence;
339         int err;
340         bool wait;
341
342         xe_assert(vm->xe, xe_vm_in_compute_mode(vm));
343
344         down_write(&vm->lock);
345         drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT);
346         drm_exec_until_all_locked(&exec) {
347                 err = xe_vm_lock_dma_resv(vm, &exec, 1, true);
348                 drm_exec_retry_on_contention(&exec);
349                 if (err)
350                         goto out_unlock;
351         }
352
353         pfence = xe_preempt_fence_create(q, q->compute.context,
354                                          ++q->compute.seqno);
355         if (!pfence) {
356                 err = -ENOMEM;
357                 goto out_unlock;
358         }
359
360         list_add(&q->compute.link, &vm->preempt.exec_queues);
361         ++vm->preempt.num_exec_queues;
362         q->compute.pfence = pfence;
363
364         down_read(&vm->userptr.notifier_lock);
365
366         dma_resv_add_fence(xe_vm_resv(vm), pfence,
367                            DMA_RESV_USAGE_BOOKKEEP);
368
369         xe_vm_fence_all_extobjs(vm, pfence, DMA_RESV_USAGE_BOOKKEEP);
370
371         /*
372          * Check to see if a preemption on VM is in flight or userptr
373          * invalidation, if so trigger this preempt fence to sync state with
374          * other preempt fences on the VM.
375          */
376         wait = __xe_vm_userptr_needs_repin(vm) || preempt_fences_waiting(vm);
377         if (wait)
378                 dma_fence_enable_sw_signaling(pfence);
379
380         up_read(&vm->userptr.notifier_lock);
381
382 out_unlock:
383         drm_exec_fini(&exec);
384         up_write(&vm->lock);
385
386         return err;
387 }
388
389 /**
390  * __xe_vm_userptr_needs_repin() - Check whether the VM does have userptrs
391  * that need repinning.
392  * @vm: The VM.
393  *
394  * This function checks for whether the VM has userptrs that need repinning,
395  * and provides a release-type barrier on the userptr.notifier_lock after
396  * checking.
397  *
398  * Return: 0 if there are no userptrs needing repinning, -EAGAIN if there are.
399  */
400 int __xe_vm_userptr_needs_repin(struct xe_vm *vm)
401 {
402         lockdep_assert_held_read(&vm->userptr.notifier_lock);
403
404         return (list_empty(&vm->userptr.repin_list) &&
405                 list_empty(&vm->userptr.invalidated)) ? 0 : -EAGAIN;
406 }
407
408 /**
409  * xe_vm_lock_dma_resv() - Lock the vm dma_resv object and the dma_resv
410  * objects of the vm's external buffer objects.
411  * @vm: The vm.
412  * @exec: Pointer to a struct drm_exec locking context.
413  * @num_shared: Number of dma-fence slots to reserve in the locked objects.
414  * @lock_vm: Lock also the vm's dma_resv.
415  *
416  * Locks the vm dma-resv object (if @lock_vm is true) and all the dma-resv
417  * objects of the buffer objects on the vm external object list.
418  *
419  * Return: 0 on success, negative error code on error. In particular, if
420  * @exec uses interruptible waits, -EINTR or -ERESTARTSYS may be returned.
421  */
422 int xe_vm_lock_dma_resv(struct xe_vm *vm, struct drm_exec *exec,
423                         unsigned int num_shared, bool lock_vm)
424 {
425         struct xe_vma *vma, *next;
426         int err = 0;
427
428         lockdep_assert_held(&vm->lock);
429
430         if (lock_vm) {
431                 err = drm_exec_prepare_obj(exec, xe_vm_obj(vm), num_shared);
432                 if (err)
433                         return err;
434         }
435
436         list_for_each_entry(vma, &vm->extobj.list, extobj.link) {
437                 err = drm_exec_prepare_obj(exec, &xe_vma_bo(vma)->ttm.base, num_shared);
438                 if (err)
439                         return err;
440         }
441
442         spin_lock(&vm->notifier.list_lock);
443         list_for_each_entry_safe(vma, next, &vm->notifier.rebind_list,
444                                  notifier.rebind_link) {
445                 xe_bo_assert_held(xe_vma_bo(vma));
446
447                 list_del_init(&vma->notifier.rebind_link);
448                 if (vma->tile_present && !(vma->gpuva.flags & XE_VMA_DESTROYED))
449                         list_move_tail(&vma->combined_links.rebind,
450                                        &vm->rebind_list);
451         }
452         spin_unlock(&vm->notifier.list_lock);
453
454         return 0;
455 }
456
457 #define XE_VM_REBIND_RETRY_TIMEOUT_MS 1000
458
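/*
 * Mark @vm as banned and kill all of its compute exec queues.
 */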
459 static void xe_vm_kill(struct xe_vm *vm)
460 {
461         struct xe_exec_queue *q;
462
463         lockdep_assert_held(&vm->lock);
464
465         xe_vm_lock(vm, false);
466         vm->flags |= XE_VM_FLAG_BANNED;
467         trace_xe_vm_kill(vm);
468
469         list_for_each_entry(q, &vm->preempt.exec_queues, compute.link)
470                 q->ops->kill(q);
471         xe_vm_unlock(vm);
472
473         /* TODO: Inform user the VM is banned */
474 }
475
476 /**
477  * xe_vm_validate_should_retry() - Whether to retry after a validate error.
478  * @exec: The drm_exec object used for locking before validation.
479  * @err: The error returned from ttm_bo_validate().
480  * @end: A ktime_t cookie that should be set to 0 before first use and
481  * that should be reused on subsequent calls.
482  *
483  * With multiple active VMs, under memory pressure, it is possible that
484  * ttm_bo_validate() runs into -EDEADLK and in such a case returns -ENOMEM.
485  * Until ttm properly handles locking in such scenarios, the best thing the
486  * driver can do is retry with a timeout. Check if that is necessary, and
487  * if so unlock the drm_exec's objects while keeping the ticket to prepare
488  * for a rerun.
489  *
490  * Return: true if a retry after drm_exec_init() is recommended;
491  * false otherwise.
492  */
493 bool xe_vm_validate_should_retry(struct drm_exec *exec, int err, ktime_t *end)
494 {
495         ktime_t cur;
496
497         if (err != -ENOMEM)
498                 return false;
499
500         cur = ktime_get();
501         *end = *end ? : ktime_add_ms(cur, XE_VM_REBIND_RETRY_TIMEOUT_MS);
502         if (!ktime_before(cur, *end))
503                 return false;
504
505         /*
506          * We would like to keep the ticket here with
507          * drm_exec_unlock_all(), but WW mutex asserts currently
508          * stop us from that. In any case this function could go away
509          * with proper TTM -EDEADLK handling.
510          */
511         drm_exec_fini(exec);
512
513         msleep(20);
514         return true;
515 }
516
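/*
 * Locking and validation step of the rebind worker: reserve fence slots on
 * the VM resv, bail out early (*done = true) if the VM is idle or no preempt
 * fences are waiting, otherwise lock the external BOs, wait for the existing
 * preempt fences and revalidate the BOs on the rebind list.
 */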
517 static int xe_preempt_work_begin(struct drm_exec *exec, struct xe_vm *vm,
518                                  bool *done)
519 {
520         struct xe_vma *vma;
521         int err;
522
523         /*
524          * 1 fence for each preempt fence plus a fence for each tile from a
525          * possible rebind
526          */
527         err = drm_exec_prepare_obj(exec, xe_vm_obj(vm),
528                                    vm->preempt.num_exec_queues +
529                                    vm->xe->info.tile_count);
530         if (err)
531                 return err;
532
533         if (xe_vm_is_idle(vm)) {
534                 vm->preempt.rebind_deactivated = true;
535                 *done = true;
536                 return 0;
537         }
538
539         if (!preempt_fences_waiting(vm)) {
540                 *done = true;
541                 return 0;
542         }
543
544         err = xe_vm_lock_dma_resv(vm, exec, vm->preempt.num_exec_queues, false);
545         if (err)
546                 return err;
547
548         err = wait_for_existing_preempt_fences(vm);
549         if (err)
550                 return err;
551
552         list_for_each_entry(vma, &vm->rebind_list, combined_links.rebind) {
553                 if (xe_vma_has_no_bo(vma) ||
554                     vma->gpuva.flags & XE_VMA_DESTROYED)
555                         continue;
556
557                 err = xe_bo_validate(xe_vma_bo(vma), vm, false);
558                 if (err)
559                         break;
560         }
561
562         return err;
563 }
564
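/*
 * Rebind worker for compute-mode VMs: repins invalidated userptrs,
 * revalidates and rebinds evicted VMAs, then arms and reinstalls fresh
 * preempt fences before resuming the exec queues. Retries on -EAGAIN and
 * bans the VM on unrecoverable errors.
 */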
565 static void preempt_rebind_work_func(struct work_struct *w)
566 {
567         struct xe_vm *vm = container_of(w, struct xe_vm, preempt.rebind_work);
568         struct drm_exec exec;
569         struct dma_fence *rebind_fence;
570         unsigned int fence_count = 0;
571         LIST_HEAD(preempt_fences);
572         ktime_t end = 0;
573         int err;
574         long wait;
575         int __maybe_unused tries = 0;
576
577         xe_assert(vm->xe, xe_vm_in_compute_mode(vm));
578         trace_xe_vm_rebind_worker_enter(vm);
579
580         down_write(&vm->lock);
581
582         if (xe_vm_is_closed_or_banned(vm)) {
583                 up_write(&vm->lock);
584                 trace_xe_vm_rebind_worker_exit(vm);
585                 return;
586         }
587
588 retry:
589         if (vm->async_ops.error)
590                 goto out_unlock_outer;
591
592         /*
593          * Extreme corner case where we exit a VM error state with a munmap
594          * style VM unbind inflight which requires a rebind. In this case the
595          * rebind needs to install some fences into the dma-resv slots. The
596          * worker to do this is already queued, so let that worker make
597          * progress by dropping vm->lock and trying this again.
598          */
599         if (vm->async_ops.munmap_rebind_inflight) {
600                 up_write(&vm->lock);
601                 flush_work(&vm->async_ops.work);
602                 goto retry;
603         }
604
605         if (xe_vm_userptr_check_repin(vm)) {
606                 err = xe_vm_userptr_pin(vm);
607                 if (err)
608                         goto out_unlock_outer;
609         }
610
611         drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT);
612
613         drm_exec_until_all_locked(&exec) {
614                 bool done = false;
615
616                 err = xe_preempt_work_begin(&exec, vm, &done);
617                 drm_exec_retry_on_contention(&exec);
618                 if (err && xe_vm_validate_should_retry(&exec, err, &end)) {
619                         err = -EAGAIN;
620                         goto out_unlock_outer;
621                 }
622                 if (err || done)
623                         goto out_unlock;
624         }
625
626         err = alloc_preempt_fences(vm, &preempt_fences, &fence_count);
627         if (err)
628                 goto out_unlock;
629
630         rebind_fence = xe_vm_rebind(vm, true);
631         if (IS_ERR(rebind_fence)) {
632                 err = PTR_ERR(rebind_fence);
633                 goto out_unlock;
634         }
635
636         if (rebind_fence) {
637                 dma_fence_wait(rebind_fence, false);
638                 dma_fence_put(rebind_fence);
639         }
640
641         /* Wait on munmap style VM unbinds */
642         wait = dma_resv_wait_timeout(xe_vm_resv(vm),
643                                      DMA_RESV_USAGE_KERNEL,
644                                      false, MAX_SCHEDULE_TIMEOUT);
645         if (wait <= 0) {
646                 err = -ETIME;
647                 goto out_unlock;
648         }
649
650 #define retry_required(__tries, __vm) \
651         (IS_ENABLED(CONFIG_DRM_XE_USERPTR_INVAL_INJECT) ? \
652         (!(__tries)++ || __xe_vm_userptr_needs_repin(__vm)) : \
653         __xe_vm_userptr_needs_repin(__vm))
654
655         down_read(&vm->userptr.notifier_lock);
656         if (retry_required(tries, vm)) {
657                 up_read(&vm->userptr.notifier_lock);
658                 err = -EAGAIN;
659                 goto out_unlock;
660         }
661
662 #undef retry_required
663
664         spin_lock(&vm->xe->ttm.lru_lock);
665         ttm_lru_bulk_move_tail(&vm->lru_bulk_move);
666         spin_unlock(&vm->xe->ttm.lru_lock);
667
668         /* Point of no return. */
669         arm_preempt_fences(vm, &preempt_fences);
670         resume_and_reinstall_preempt_fences(vm);
671         up_read(&vm->userptr.notifier_lock);
672
673 out_unlock:
674         drm_exec_fini(&exec);
675 out_unlock_outer:
676         if (err == -EAGAIN) {
677                 trace_xe_vm_rebind_worker_retry(vm);
678                 goto retry;
679         }
680
681         if (err) {
682                 drm_warn(&vm->xe->drm, "VM worker error: %d\n", err);
683                 xe_vm_kill(vm);
684         }
685         up_write(&vm->lock);
686
687         free_preempt_fences(&preempt_fences);
688
689         trace_xe_vm_rebind_worker_exit(vm);
690 }
691
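/*
 * MMU interval notifier callback for userptr VMAs: bump the notifier
 * sequence, queue the VMA on the invalidated list so the exec / rebind
 * worker repins it, pipeline the preempt fences by enabling software
 * signaling, wait for GPU access to finish and, in fault mode, invalidate
 * the GPU mapping directly.
 */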
692 static bool vma_userptr_invalidate(struct mmu_interval_notifier *mni,
693                                    const struct mmu_notifier_range *range,
694                                    unsigned long cur_seq)
695 {
696         struct xe_vma *vma = container_of(mni, struct xe_vma, userptr.notifier);
697         struct xe_vm *vm = xe_vma_vm(vma);
698         struct dma_resv_iter cursor;
699         struct dma_fence *fence;
700         long err;
701
702         xe_assert(vm->xe, xe_vma_is_userptr(vma));
703         trace_xe_vma_userptr_invalidate(vma);
704
705         if (!mmu_notifier_range_blockable(range))
706                 return false;
707
708         down_write(&vm->userptr.notifier_lock);
709         mmu_interval_set_seq(mni, cur_seq);
710
711         /* No need to stop gpu access if the userptr is not yet bound. */
712         if (!vma->userptr.initial_bind) {
713                 up_write(&vm->userptr.notifier_lock);
714                 return true;
715         }
716
717         /*
718          * Tell exec and rebind worker they need to repin and rebind this
719          * userptr.
720          */
721         if (!xe_vm_in_fault_mode(vm) &&
722             !(vma->gpuva.flags & XE_VMA_DESTROYED) && vma->tile_present) {
723                 spin_lock(&vm->userptr.invalidated_lock);
724                 list_move_tail(&vma->userptr.invalidate_link,
725                                &vm->userptr.invalidated);
726                 spin_unlock(&vm->userptr.invalidated_lock);
727         }
728
729         up_write(&vm->userptr.notifier_lock);
730
731         /*
732          * Preempt fences turn into schedule disables, pipeline these.
733          * Note that even in fault mode, we need to wait for binds and
734          * unbinds to complete, and those are attached as BOOKKEEP fences
735          * to the vm.
736          */
737         dma_resv_iter_begin(&cursor, xe_vm_resv(vm),
738                             DMA_RESV_USAGE_BOOKKEEP);
739         dma_resv_for_each_fence_unlocked(&cursor, fence)
740                 dma_fence_enable_sw_signaling(fence);
741         dma_resv_iter_end(&cursor);
742
743         err = dma_resv_wait_timeout(xe_vm_resv(vm),
744                                     DMA_RESV_USAGE_BOOKKEEP,
745                                     false, MAX_SCHEDULE_TIMEOUT);
746         XE_WARN_ON(err <= 0);
747
748         if (xe_vm_in_fault_mode(vm)) {
749                 err = xe_vm_invalidate_vma(vma);
750                 XE_WARN_ON(err);
751         }
752
753         trace_xe_vma_userptr_invalidate_complete(vma);
754
755         return true;
756 }
757
758 static const struct mmu_interval_notifier_ops vma_userptr_notifier_ops = {
759         .invalidate = vma_userptr_invalidate,
760 };
761
762 int xe_vm_userptr_pin(struct xe_vm *vm)
763 {
764         struct xe_vma *vma, *next;
765         int err = 0;
766         LIST_HEAD(tmp_evict);
767
768         lockdep_assert_held_write(&vm->lock);
769
770         /* Collect invalidated userptrs */
771         spin_lock(&vm->userptr.invalidated_lock);
772         list_for_each_entry_safe(vma, next, &vm->userptr.invalidated,
773                                  userptr.invalidate_link) {
774                 list_del_init(&vma->userptr.invalidate_link);
775                 if (list_empty(&vma->combined_links.userptr))
776                         list_move_tail(&vma->combined_links.userptr,
777                                        &vm->userptr.repin_list);
778         }
779         spin_unlock(&vm->userptr.invalidated_lock);
780
781         /* Pin and move to temporary list */
782         list_for_each_entry_safe(vma, next, &vm->userptr.repin_list,
783                                  combined_links.userptr) {
784                 err = xe_vma_userptr_pin_pages(vma);
785                 if (err < 0)
786                         goto out_err;
787
788                 list_move_tail(&vma->combined_links.userptr, &tmp_evict);
789         }
790
791         /* Take lock and move to rebind_list for rebinding. */
792         err = dma_resv_lock_interruptible(xe_vm_resv(vm), NULL);
793         if (err)
794                 goto out_err;
795
796         list_for_each_entry_safe(vma, next, &tmp_evict, combined_links.userptr)
797                 list_move_tail(&vma->combined_links.rebind, &vm->rebind_list);
798
799         dma_resv_unlock(xe_vm_resv(vm));
800
801         return 0;
802
803 out_err:
804         list_splice_tail(&tmp_evict, &vm->userptr.repin_list);
805
806         return err;
807 }
808
809 /**
810  * xe_vm_userptr_check_repin() - Check whether the VM might have userptrs
811  * that need repinning.
812  * @vm: The VM.
813  *
814  * This function does an advisory check for whether the VM has userptrs that
815  * need repinning.
816  *
817  * Return: 0 if there are no indications of userptrs needing repinning,
818  * -EAGAIN if there are.
819  */
820 int xe_vm_userptr_check_repin(struct xe_vm *vm)
821 {
822         return (list_empty_careful(&vm->userptr.repin_list) &&
823                 list_empty_careful(&vm->userptr.invalidated)) ? 0 : -EAGAIN;
824 }
825
826 static struct dma_fence *
827 xe_vm_bind_vma(struct xe_vma *vma, struct xe_exec_queue *q,
828                struct xe_sync_entry *syncs, u32 num_syncs,
829                bool first_op, bool last_op);
830
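/*
 * Rebind every VMA on the VM's rebind list via xe_vm_bind_vma() without
 * syncs, returning the fence of the last rebind or NULL if nothing was done.
 */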
831 struct dma_fence *xe_vm_rebind(struct xe_vm *vm, bool rebind_worker)
832 {
833         struct dma_fence *fence = NULL;
834         struct xe_vma *vma, *next;
835
836         lockdep_assert_held(&vm->lock);
837         if (xe_vm_no_dma_fences(vm) && !rebind_worker)
838                 return NULL;
839
840         xe_vm_assert_held(vm);
841         list_for_each_entry_safe(vma, next, &vm->rebind_list,
842                                  combined_links.rebind) {
843                 xe_assert(vm->xe, vma->tile_present);
844
845                 list_del_init(&vma->combined_links.rebind);
846                 dma_fence_put(fence);
847                 if (rebind_worker)
848                         trace_xe_vma_rebind_worker(vma);
849                 else
850                         trace_xe_vma_rebind_exec(vma);
851                 fence = xe_vm_bind_vma(vma, NULL, NULL, 0, false, false);
852                 if (IS_ERR(fence))
853                         return fence;
854         }
855
856         return fence;
857 }
858
859 static struct xe_vma *xe_vma_create(struct xe_vm *vm,
860                                     struct xe_bo *bo,
861                                     u64 bo_offset_or_userptr,
862                                     u64 start, u64 end,
863                                     bool read_only,
864                                     bool is_null,
865                                     u8 tile_mask)
866 {
867         struct xe_vma *vma;
868         struct xe_tile *tile;
869         u8 id;
870
871         xe_assert(vm->xe, start < end);
872         xe_assert(vm->xe, end < vm->size);
873
874         if (!bo && !is_null)    /* userptr */
875                 vma = kzalloc(sizeof(*vma), GFP_KERNEL);
876         else
877                 vma = kzalloc(sizeof(*vma) - sizeof(struct xe_userptr),
878                               GFP_KERNEL);
879         if (!vma) {
880                 vma = ERR_PTR(-ENOMEM);
881                 return vma;
882         }
883
884         INIT_LIST_HEAD(&vma->combined_links.rebind);
885         INIT_LIST_HEAD(&vma->notifier.rebind_link);
886         INIT_LIST_HEAD(&vma->extobj.link);
887
888         INIT_LIST_HEAD(&vma->gpuva.gem.entry);
889         vma->gpuva.vm = &vm->gpuvm;
890         vma->gpuva.va.addr = start;
891         vma->gpuva.va.range = end - start + 1;
892         if (read_only)
893                 vma->gpuva.flags |= XE_VMA_READ_ONLY;
894         if (is_null)
895                 vma->gpuva.flags |= DRM_GPUVA_SPARSE;
896
897         if (tile_mask) {
898                 vma->tile_mask = tile_mask;
899         } else {
900                 for_each_tile(tile, vm->xe, id)
901                         vma->tile_mask |= 0x1 << id;
902         }
903
904         if (vm->xe->info.platform == XE_PVC)
905                 vma->gpuva.flags |= XE_VMA_ATOMIC_PTE_BIT;
906
907         if (bo) {
908                 struct drm_gpuvm_bo *vm_bo;
909
910                 xe_bo_assert_held(bo);
911
912                 vm_bo = drm_gpuvm_bo_obtain(vma->gpuva.vm, &bo->ttm.base);
913                 if (IS_ERR(vm_bo)) {
914                         kfree(vma);
915                         return ERR_CAST(vm_bo);
916                 }
917
918                 drm_gem_object_get(&bo->ttm.base);
919                 vma->gpuva.gem.obj = &bo->ttm.base;
920                 vma->gpuva.gem.offset = bo_offset_or_userptr;
921                 drm_gpuva_link(&vma->gpuva, vm_bo);
922                 drm_gpuvm_bo_put(vm_bo);
923         } else /* userptr or null */ {
924                 if (!is_null) {
925                         u64 size = end - start + 1;
926                         int err;
927
928                         INIT_LIST_HEAD(&vma->userptr.invalidate_link);
929                         vma->gpuva.gem.offset = bo_offset_or_userptr;
930
931                         err = mmu_interval_notifier_insert(&vma->userptr.notifier,
932                                                            current->mm,
933                                                            xe_vma_userptr(vma), size,
934                                                            &vma_userptr_notifier_ops);
935                         if (err) {
936                                 kfree(vma);
937                                 vma = ERR_PTR(err);
938                                 return vma;
939                         }
940
941                         vma->userptr.notifier_seq = LONG_MAX;
942                 }
943
944                 xe_vm_get(vm);
945         }
946
947         return vma;
948 }
949
950 static bool vm_remove_extobj(struct xe_vma *vma)
951 {
952         if (!list_empty(&vma->extobj.link)) {
953                 xe_vma_vm(vma)->extobj.entries--;
954                 list_del_init(&vma->extobj.link);
955                 return true;
956         }
957         return false;
958 }
959
960 static void xe_vma_destroy_late(struct xe_vma *vma)
961 {
962         struct xe_vm *vm = xe_vma_vm(vma);
963         struct xe_device *xe = vm->xe;
964         bool read_only = xe_vma_read_only(vma);
965
966         if (xe_vma_is_userptr(vma)) {
967                 if (vma->userptr.sg) {
968                         dma_unmap_sgtable(xe->drm.dev,
969                                           vma->userptr.sg,
970                                           read_only ? DMA_TO_DEVICE :
971                                           DMA_BIDIRECTIONAL, 0);
972                         sg_free_table(vma->userptr.sg);
973                         vma->userptr.sg = NULL;
974                 }
975
976                 /*
977                  * Since userptr pages are not pinned, we can't remove
978                  * the notifier until we're sure the GPU is not accessing
979                  * them anymore.
980                  */
981                 mmu_interval_notifier_remove(&vma->userptr.notifier);
982                 xe_vm_put(vm);
983         } else if (xe_vma_is_null(vma)) {
984                 xe_vm_put(vm);
985         } else {
986                 xe_bo_put(xe_vma_bo(vma));
987         }
988
989         kfree(vma);
990 }
991
992 static void vma_destroy_work_func(struct work_struct *w)
993 {
994         struct xe_vma *vma =
995                 container_of(w, struct xe_vma, destroy_work);
996
997         xe_vma_destroy_late(vma);
998 }
999
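/*
 * Walk all gpuvm_bo / gpuva mappings of @bo and return a VMA other than
 * @ignore that maps the BO into @vm, or NULL if there is none. Used to
 * decide whether @bo still needs an entry on the VM's external object list.
 */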
1000 static struct xe_vma *
1001 bo_has_vm_references_locked(struct xe_bo *bo, struct xe_vm *vm,
1002                             struct xe_vma *ignore)
1003 {
1004         struct drm_gpuvm_bo *vm_bo;
1005         struct drm_gpuva *va;
1006         struct drm_gem_object *obj = &bo->ttm.base;
1007
1008         xe_bo_assert_held(bo);
1009
1010         drm_gem_for_each_gpuvm_bo(vm_bo, obj) {
1011                 drm_gpuvm_bo_for_each_va(va, vm_bo) {
1012                         struct xe_vma *vma = gpuva_to_vma(va);
1013
1014                         if (vma != ignore && xe_vma_vm(vma) == vm)
1015                                 return vma;
1016                 }
1017         }
1018
1019         return NULL;
1020 }
1021
1022 static bool bo_has_vm_references(struct xe_bo *bo, struct xe_vm *vm,
1023                                  struct xe_vma *ignore)
1024 {
1025         bool ret;
1026
1027         xe_bo_lock(bo, false);
1028         ret = !!bo_has_vm_references_locked(bo, vm, ignore);
1029         xe_bo_unlock(bo);
1030
1031         return ret;
1032 }
1033
1034 static void __vm_insert_extobj(struct xe_vm *vm, struct xe_vma *vma)
1035 {
1036         lockdep_assert_held_write(&vm->lock);
1037
1038         list_add(&vma->extobj.link, &vm->extobj.list);
1039         vm->extobj.entries++;
1040 }
1041
1042 static void vm_insert_extobj(struct xe_vm *vm, struct xe_vma *vma)
1043 {
1044         struct xe_bo *bo = xe_vma_bo(vma);
1045
1046         lockdep_assert_held_write(&vm->lock);
1047
1048         if (bo_has_vm_references(bo, vm, vma))
1049                 return;
1050
1051         __vm_insert_extobj(vm, vma);
1052 }
1053
1054 static void vma_destroy_cb(struct dma_fence *fence,
1055                            struct dma_fence_cb *cb)
1056 {
1057         struct xe_vma *vma = container_of(cb, struct xe_vma, destroy_cb);
1058
1059         INIT_WORK(&vma->destroy_work, vma_destroy_work_func);
1060         queue_work(system_unbound_wq, &vma->destroy_work);
1061 }
1062
1063 static void xe_vma_destroy(struct xe_vma *vma, struct dma_fence *fence)
1064 {
1065         struct xe_vm *vm = xe_vma_vm(vma);
1066
1067         lockdep_assert_held_write(&vm->lock);
1068         xe_assert(vm->xe, list_empty(&vma->combined_links.destroy));
1069
1070         if (xe_vma_is_userptr(vma)) {
1071                 xe_assert(vm->xe, vma->gpuva.flags & XE_VMA_DESTROYED);
1072
1073                 spin_lock(&vm->userptr.invalidated_lock);
1074                 list_del(&vma->userptr.invalidate_link);
1075                 spin_unlock(&vm->userptr.invalidated_lock);
1076         } else if (!xe_vma_is_null(vma)) {
1077                 xe_bo_assert_held(xe_vma_bo(vma));
1078
1079                 spin_lock(&vm->notifier.list_lock);
1080                 list_del(&vma->notifier.rebind_link);
1081                 spin_unlock(&vm->notifier.list_lock);
1082
1083                 drm_gpuva_unlink(&vma->gpuva);
1084
1085                 if (!xe_vma_bo(vma)->vm && vm_remove_extobj(vma)) {
1086                         struct xe_vma *other;
1087
1088                         other = bo_has_vm_references_locked(xe_vma_bo(vma), vm, NULL);
1089
1090                         if (other)
1091                                 __vm_insert_extobj(vm, other);
1092                 }
1093         }
1094
1095         xe_vm_assert_held(vm);
1096         if (fence) {
1097                 int ret = dma_fence_add_callback(fence, &vma->destroy_cb,
1098                                                  vma_destroy_cb);
1099
1100                 if (ret) {
1101                         XE_WARN_ON(ret != -ENOENT);
1102                         xe_vma_destroy_late(vma);
1103                 }
1104         } else {
1105                 xe_vma_destroy_late(vma);
1106         }
1107 }
1108
1109 /**
1110  * xe_vm_prepare_vma() - drm_exec utility to lock a vma
1111  * @exec: The drm_exec object we're currently locking for.
1112  * @vma: The vma for which we want to lock the vm resv and any attached
1113  * object's resv.
1114  * @num_shared: The number of dma-fence slots to pre-allocate in the
1115  * objects' reservation objects.
1116  *
1117  * Return: 0 on success, negative error code on error. In particular
1118  * may return -EDEADLK on WW transaction contention and -EINTR if
1119  * an interruptible wait is terminated by a signal.
1120  */
1121 int xe_vm_prepare_vma(struct drm_exec *exec, struct xe_vma *vma,
1122                       unsigned int num_shared)
1123 {
1124         struct xe_vm *vm = xe_vma_vm(vma);
1125         struct xe_bo *bo = xe_vma_bo(vma);
1126         int err;
1127
1128         XE_WARN_ON(!vm);
1129         err = drm_exec_prepare_obj(exec, xe_vm_obj(vm), num_shared);
1130         if (!err && bo && !bo->vm)
1131                 err = drm_exec_prepare_obj(exec, &bo->ttm.base, num_shared);
1132
1133         return err;
1134 }
1135
1136 static void xe_vma_destroy_unlocked(struct xe_vma *vma)
1137 {
1138         struct drm_exec exec;
1139         int err;
1140
1141         drm_exec_init(&exec, 0);
1142         drm_exec_until_all_locked(&exec) {
1143                 err = xe_vm_prepare_vma(&exec, vma, 0);
1144                 drm_exec_retry_on_contention(&exec);
1145                 if (XE_WARN_ON(err))
1146                         break;
1147         }
1148
1149         xe_vma_destroy(vma, NULL);
1150
1151         drm_exec_fini(&exec);
1152 }
1153
1154 struct xe_vma *
1155 xe_vm_find_overlapping_vma(struct xe_vm *vm, u64 start, u64 range)
1156 {
1157         struct drm_gpuva *gpuva;
1158
1159         lockdep_assert_held(&vm->lock);
1160
1161         if (xe_vm_is_closed_or_banned(vm))
1162                 return NULL;
1163
1164         xe_assert(vm->xe, start + range <= vm->size);
1165
1166         gpuva = drm_gpuva_find_first(&vm->gpuvm, start, range);
1167
1168         return gpuva ? gpuva_to_vma(gpuva) : NULL;
1169 }
1170
1171 static int xe_vm_insert_vma(struct xe_vm *vm, struct xe_vma *vma)
1172 {
1173         int err;
1174
1175         xe_assert(vm->xe, xe_vma_vm(vma) == vm);
1176         lockdep_assert_held(&vm->lock);
1177
1178         err = drm_gpuva_insert(&vm->gpuvm, &vma->gpuva);
1179         XE_WARN_ON(err);        /* Shouldn't be possible */
1180
1181         return err;
1182 }
1183
1184 static void xe_vm_remove_vma(struct xe_vm *vm, struct xe_vma *vma)
1185 {
1186         xe_assert(vm->xe, xe_vma_vm(vma) == vm);
1187         lockdep_assert_held(&vm->lock);
1188
1189         drm_gpuva_remove(&vma->gpuva);
1190         if (vm->usm.last_fault_vma == vma)
1191                 vm->usm.last_fault_vma = NULL;
1192 }
1193
1194 static struct drm_gpuva_op *xe_vm_op_alloc(void)
1195 {
1196         struct xe_vma_op *op;
1197
1198         op = kzalloc(sizeof(*op), GFP_KERNEL);
1199
1200         if (unlikely(!op))
1201                 return NULL;
1202
1203         return &op->base;
1204 }
1205
1206 static void xe_vm_free(struct drm_gpuvm *gpuvm);
1207
1208 static struct drm_gpuvm_ops gpuvm_ops = {
1209         .op_alloc = xe_vm_op_alloc,
1210         .vm_free = xe_vm_free,
1211 };
1212
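/*
 * Translate a cache level into PAT field bits: look up the per-device
 * pat_index for @cache and scatter its low bits into the PAT0/PAT1 (and,
 * for PTEs, PAT2/PAT3) fields of the page-table entry.
 */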
1213 static u64 pde_encode_cache(struct xe_device *xe, enum xe_cache_level cache)
1214 {
1215         u32 pat_index = xe->pat.idx[cache];
1216         u64 pte = 0;
1217
1218         if (pat_index & BIT(0))
1219                 pte |= XE_PPGTT_PTE_PAT0;
1220
1221         if (pat_index & BIT(1))
1222                 pte |= XE_PPGTT_PTE_PAT1;
1223
1224         return pte;
1225 }
1226
1227 static u64 pte_encode_cache(struct xe_device *xe, enum xe_cache_level cache)
1228 {
1229         u32 pat_index = xe->pat.idx[cache];
1230         u64 pte = 0;
1231
1232         if (pat_index & BIT(0))
1233                 pte |= XE_PPGTT_PTE_PAT0;
1234
1235         if (pat_index & BIT(1))
1236                 pte |= XE_PPGTT_PTE_PAT1;
1237
1238         if (pat_index & BIT(2))
1239                 pte |= XE_PPGTT_PTE_PAT2;
1240
1241         if (pat_index & BIT(3))
1242                 pte |= XELPG_PPGTT_PTE_PAT3;
1243
1244         return pte;
1245 }
1246
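/*
 * Return the huge-page bit for the given page-table level: 2M pages at
 * level 1, 1G pages at level 2, no bit for 4K at level 0.
 */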
1247 static u64 pte_encode_ps(u32 pt_level)
1248 {
1249         XE_WARN_ON(pt_level > 2);
1250
1251         if (pt_level == 1)
1252                 return XE_PDE_PS_2M;
1253         else if (pt_level == 2)
1254                 return XE_PDPE_PS_1G;
1255
1256         return 0;
1257 }
1258
1259 static u64 xelp_pde_encode_bo(struct xe_bo *bo, u64 bo_offset,
1260                               const enum xe_cache_level cache)
1261 {
1262         struct xe_device *xe = xe_bo_device(bo);
1263         u64 pde;
1264
1265         pde = xe_bo_addr(bo, bo_offset, XE_PAGE_SIZE);
1266         pde |= XE_PAGE_PRESENT | XE_PAGE_RW;
1267         pde |= pde_encode_cache(xe, cache);
1268
1269         return pde;
1270 }
1271
1272 static u64 xelp_pte_encode_bo(struct xe_bo *bo, u64 bo_offset,
1273                               enum xe_cache_level cache, u32 pt_level)
1274 {
1275         struct xe_device *xe = xe_bo_device(bo);
1276         u64 pte;
1277
1278         pte = xe_bo_addr(bo, bo_offset, XE_PAGE_SIZE);
1279         pte |= XE_PAGE_PRESENT | XE_PAGE_RW;
1280         pte |= pte_encode_cache(xe, cache);
1281         pte |= pte_encode_ps(pt_level);
1282
1283         if (xe_bo_is_vram(bo) || xe_bo_is_stolen_devmem(bo))
1284                 pte |= XE_PPGTT_PTE_DM;
1285
1286         return pte;
1287 }
1288
1289 static u64 xelp_pte_encode_vma(u64 pte, struct xe_vma *vma,
1290                                enum xe_cache_level cache, u32 pt_level)
1291 {
1292         struct xe_device *xe = xe_vma_vm(vma)->xe;
1293
1294         pte |= XE_PAGE_PRESENT;
1295
1296         if (likely(!xe_vma_read_only(vma)))
1297                 pte |= XE_PAGE_RW;
1298
1299         pte |= pte_encode_cache(xe, cache);
1300         pte |= pte_encode_ps(pt_level);
1301
1302         if (unlikely(xe_vma_is_null(vma)))
1303                 pte |= XE_PTE_NULL;
1304
1305         return pte;
1306 }
1307
1308 static u64 xelp_pte_encode_addr(struct xe_device *xe, u64 addr,
1309                                 enum xe_cache_level cache,
1310                                 u32 pt_level, bool devmem, u64 flags)
1311 {
1312         u64 pte;
1313
1314         /* Avoid passing random bits directly as flags */
1315         XE_WARN_ON(flags & ~XE_PTE_PS64);
1316
1317         pte = addr;
1318         pte |= XE_PAGE_PRESENT | XE_PAGE_RW;
1319         pte |= pte_encode_cache(xe, cache);
1320         pte |= pte_encode_ps(pt_level);
1321
1322         if (devmem)
1323                 pte |= XE_PPGTT_PTE_DM;
1324
1325         pte |= flags;
1326
1327         return pte;
1328 }
1329
1330 static const struct xe_pt_ops xelp_pt_ops = {
1331         .pte_encode_bo = xelp_pte_encode_bo,
1332         .pte_encode_vma = xelp_pte_encode_vma,
1333         .pte_encode_addr = xelp_pte_encode_addr,
1334         .pde_encode_bo = xelp_pde_encode_bo,
1335 };
1336
1337 static void xe_vma_op_work_func(struct work_struct *w);
1338 static void vm_destroy_work_func(struct work_struct *w);
1339
1340 struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags)
1341 {
1342         struct drm_gem_object *vm_resv_obj;
1343         struct xe_vm *vm;
1344         int err, number_tiles = 0;
1345         struct xe_tile *tile;
1346         u8 id;
1347
1348         vm = kzalloc(sizeof(*vm), GFP_KERNEL);
1349         if (!vm)
1350                 return ERR_PTR(-ENOMEM);
1351
1352         vm->xe = xe;
1353
1354         vm->size = 1ull << xe->info.va_bits;
1355
1356         vm->flags = flags;
1357
1358         init_rwsem(&vm->lock);
1359
1360         INIT_LIST_HEAD(&vm->rebind_list);
1361
1362         INIT_LIST_HEAD(&vm->userptr.repin_list);
1363         INIT_LIST_HEAD(&vm->userptr.invalidated);
1364         init_rwsem(&vm->userptr.notifier_lock);
1365         spin_lock_init(&vm->userptr.invalidated_lock);
1366
1367         INIT_LIST_HEAD(&vm->notifier.rebind_list);
1368         spin_lock_init(&vm->notifier.list_lock);
1369
1370         INIT_LIST_HEAD(&vm->async_ops.pending);
1371         INIT_WORK(&vm->async_ops.work, xe_vma_op_work_func);
1372         spin_lock_init(&vm->async_ops.lock);
1373
1374         INIT_WORK(&vm->destroy_work, vm_destroy_work_func);
1375
1376         INIT_LIST_HEAD(&vm->preempt.exec_queues);
1377         vm->preempt.min_run_period_ms = 10;     /* FIXME: Wire up to uAPI */
1378
1379         for_each_tile(tile, xe, id)
1380                 xe_range_fence_tree_init(&vm->rftree[id]);
1381
1382         INIT_LIST_HEAD(&vm->extobj.list);
1383
1384         vm->pt_ops = &xelp_pt_ops;
1385
1386         if (!(flags & XE_VM_FLAG_MIGRATION))
1387                 xe_device_mem_access_get(xe);
1388
1389         vm_resv_obj = drm_gpuvm_resv_object_alloc(&xe->drm);
1390         if (!vm_resv_obj) {
1391                 err = -ENOMEM;
1392                 goto err_no_resv;
1393         }
1394
1395         drm_gpuvm_init(&vm->gpuvm, "Xe VM", 0, &xe->drm, vm_resv_obj,
1396                        0, vm->size, 0, 0, &gpuvm_ops);
1397
1398         drm_gem_object_put(vm_resv_obj);
1399
1400         err = dma_resv_lock_interruptible(xe_vm_resv(vm), NULL);
1401         if (err)
1402                 goto err_close;
1403
1404         if (IS_DGFX(xe) && xe->info.vram_flags & XE_VRAM_FLAGS_NEED64K)
1405                 vm->flags |= XE_VM_FLAG_64K;
1406
1407         for_each_tile(tile, xe, id) {
1408                 if (flags & XE_VM_FLAG_MIGRATION &&
1409                     tile->id != XE_VM_FLAG_TILE_ID(flags))
1410                         continue;
1411
1412                 vm->pt_root[id] = xe_pt_create(vm, tile, xe->info.vm_max_level);
1413                 if (IS_ERR(vm->pt_root[id])) {
1414                         err = PTR_ERR(vm->pt_root[id]);
1415                         vm->pt_root[id] = NULL;
1416                         goto err_unlock_close;
1417                 }
1418         }
1419
1420         if (flags & XE_VM_FLAG_SCRATCH_PAGE) {
1421                 for_each_tile(tile, xe, id) {
1422                         if (!vm->pt_root[id])
1423                                 continue;
1424
1425                         err = xe_pt_create_scratch(xe, tile, vm);
1426                         if (err)
1427                                 goto err_unlock_close;
1428                 }
1429                 vm->batch_invalidate_tlb = true;
1430         }
1431
1432         if (flags & XE_VM_FLAG_COMPUTE_MODE) {
1433                 INIT_WORK(&vm->preempt.rebind_work, preempt_rebind_work_func);
1434                 vm->flags |= XE_VM_FLAG_COMPUTE_MODE;
1435                 vm->batch_invalidate_tlb = false;
1436         }
1437
1438         if (flags & XE_VM_FLAG_ASYNC_BIND_OPS) {
1439                 vm->async_ops.fence.context = dma_fence_context_alloc(1);
1440                 vm->flags |= XE_VM_FLAG_ASYNC_BIND_OPS;
1441         }
1442
1443         /* Fill pt_root after allocating scratch tables */
1444         for_each_tile(tile, xe, id) {
1445                 if (!vm->pt_root[id])
1446                         continue;
1447
1448                 xe_pt_populate_empty(tile, vm, vm->pt_root[id]);
1449         }
1450         dma_resv_unlock(xe_vm_resv(vm));
1451
1452         /* Kernel migration VM shouldn't have a circular loop. */
1453         if (!(flags & XE_VM_FLAG_MIGRATION)) {
1454                 for_each_tile(tile, xe, id) {
1455                         struct xe_gt *gt = tile->primary_gt;
1456                         struct xe_vm *migrate_vm;
1457                         struct xe_exec_queue *q;
1458
1459                         if (!vm->pt_root[id])
1460                                 continue;
1461
1462                         migrate_vm = xe_migrate_get_vm(tile->migrate);
1463                         q = xe_exec_queue_create_class(xe, gt, migrate_vm,
1464                                                        XE_ENGINE_CLASS_COPY,
1465                                                        EXEC_QUEUE_FLAG_VM);
1466                         xe_vm_put(migrate_vm);
1467                         if (IS_ERR(q)) {
1468                                 err = PTR_ERR(q);
1469                                 goto err_close;
1470                         }
1471                         vm->q[id] = q;
1472                         number_tiles++;
1473                 }
1474         }
1475
1476         if (number_tiles > 1)
1477                 vm->composite_fence_ctx = dma_fence_context_alloc(1);
1478
1479         mutex_lock(&xe->usm.lock);
1480         if (flags & XE_VM_FLAG_FAULT_MODE)
1481                 xe->usm.num_vm_in_fault_mode++;
1482         else if (!(flags & XE_VM_FLAG_MIGRATION))
1483                 xe->usm.num_vm_in_non_fault_mode++;
1484         mutex_unlock(&xe->usm.lock);
1485
1486         trace_xe_vm_create(vm);
1487
1488         return vm;
1489
1490 err_unlock_close:
1491         dma_resv_unlock(xe_vm_resv(vm));
1492 err_close:
1493         xe_vm_close_and_put(vm);
1494         return ERR_PTR(err);
1495
1496 err_no_resv:
1497         for_each_tile(tile, xe, id)
1498                 xe_range_fence_tree_fini(&vm->rftree[id]);
1499         kfree(vm);
1500         if (!(flags & XE_VM_FLAG_MIGRATION))
1501                 xe_device_mem_access_put(xe);
1502         return ERR_PTR(err);
1503 }
1504
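/*
 * Kick the async bind-op worker and wait for it to finish running.
 */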
1505 static void flush_async_ops(struct xe_vm *vm)
1506 {
1507         queue_work(system_unbound_wq, &vm->async_ops.work);
1508         flush_work(&vm->async_ops.work);
1509 }
1510
1511 static void vm_error_capture(struct xe_vm *vm, int err,
1512                              u32 op, u64 addr, u64 size)
1513 {
1514         struct drm_xe_vm_bind_op_error_capture capture;
1515         u64 __user *address =
1516                 u64_to_user_ptr(vm->async_ops.error_capture.addr);
1517         bool in_kthread = !current->mm;
1518
1519         capture.error = err;
1520         capture.op = op;
1521         capture.addr = addr;
1522         capture.size = size;
1523
1524         if (in_kthread) {
1525                 if (!mmget_not_zero(vm->async_ops.error_capture.mm))
1526                         goto mm_closed;
1527                 kthread_use_mm(vm->async_ops.error_capture.mm);
1528         }
1529
1530         if (copy_to_user(address, &capture, sizeof(capture)))
1531                 drm_warn(&vm->xe->drm, "Copy to user failed");
1532
1533         if (in_kthread) {
1534                 kthread_unuse_mm(vm->async_ops.error_capture.mm);
1535                 mmput(vm->async_ops.error_capture.mm);
1536         }
1537
1538 mm_closed:
1539         wake_up_all(&vm->async_ops.error_capture.wq);
1540 }
1541
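/*
 * Mark the VM as closed by zeroing its size under the write lock; the
 * closed/banned checks and the destroy worker's assert key off !vm->size.
 */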
1542 static void xe_vm_close(struct xe_vm *vm)
1543 {
1544         down_write(&vm->lock);
1545         vm->size = 0;
1546         up_write(&vm->lock);
1547 }
1548
1549 void xe_vm_close_and_put(struct xe_vm *vm)
1550 {
1551         LIST_HEAD(contested);
1552         struct xe_device *xe = vm->xe;
1553         struct xe_tile *tile;
1554         struct xe_vma *vma, *next_vma;
1555         struct drm_gpuva *gpuva, *next;
1556         u8 id;
1557
1558         xe_assert(xe, !vm->preempt.num_exec_queues);
1559
1560         xe_vm_close(vm);
1561         flush_async_ops(vm);
1562         if (xe_vm_in_compute_mode(vm))
1563                 flush_work(&vm->preempt.rebind_work);
1564
1565         for_each_tile(tile, xe, id) {
1566                 if (vm->q[id]) {
1567                         xe_exec_queue_kill(vm->q[id]);
1568                         xe_exec_queue_put(vm->q[id]);
1569                         vm->q[id] = NULL;
1570                 }
1571         }
1572
1573         down_write(&vm->lock);
1574         xe_vm_lock(vm, false);
1575         drm_gpuvm_for_each_va_safe(gpuva, next, &vm->gpuvm) {
1576                 vma = gpuva_to_vma(gpuva);
1577
1578                 if (xe_vma_has_no_bo(vma)) {
1579                         down_read(&vm->userptr.notifier_lock);
1580                         vma->gpuva.flags |= XE_VMA_DESTROYED;
1581                         up_read(&vm->userptr.notifier_lock);
1582                 }
1583
1584                 xe_vm_remove_vma(vm, vma);
1585
1586                 /* easy case, remove from VMA? */
1587                 if (xe_vma_has_no_bo(vma) || xe_vma_bo(vma)->vm) {
1588                         list_del_init(&vma->combined_links.rebind);
1589                         xe_vma_destroy(vma, NULL);
1590                         continue;
1591                 }
1592
1593                 list_move_tail(&vma->combined_links.destroy, &contested);
1594                 vma->gpuva.flags |= XE_VMA_DESTROYED;
1595         }
1596
1597         /*
1598          * All vm operations will add shared fences to resv.
1599          * The only exception is eviction for a shared object,
1600          * but even so, the unbind when evicted would still
1601          * install a fence to resv. Hence it's safe to
1602          * destroy the pagetables immediately.
1603          */
1604         for_each_tile(tile, xe, id) {
1605                 if (vm->scratch_bo[id]) {
1606                         u32 i;
1607
1608                         xe_bo_unpin(vm->scratch_bo[id]);
1609                         xe_bo_put(vm->scratch_bo[id]);
1610                         for (i = 0; i < vm->pt_root[id]->level; i++)
1611                                 xe_pt_destroy(vm->scratch_pt[id][i], vm->flags,
1612                                               NULL);
1613                 }
1614                 if (vm->pt_root[id]) {
1615                         xe_pt_destroy(vm->pt_root[id], vm->flags, NULL);
1616                         vm->pt_root[id] = NULL;
1617                 }
1618         }
1619         xe_vm_unlock(vm);
1620
1621         /*
1622          * The VM is now dead, so no new VMAs can be added to it.
1623          * Since we hold a refcount to the bo, we can remove and free
1624          * the members safely without locking.
1625          */
1626         list_for_each_entry_safe(vma, next_vma, &contested,
1627                                  combined_links.destroy) {
1628                 list_del_init(&vma->combined_links.destroy);
1629                 xe_vma_destroy_unlocked(vma);
1630         }
1631
1632         if (vm->async_ops.error_capture.addr)
1633                 wake_up_all(&vm->async_ops.error_capture.wq);
1634
1635         xe_assert(xe, list_empty(&vm->extobj.list));
1636         up_write(&vm->lock);
1637
1638         mutex_lock(&xe->usm.lock);
1639         if (vm->flags & XE_VM_FLAG_FAULT_MODE)
1640                 xe->usm.num_vm_in_fault_mode--;
1641         else if (!(vm->flags & XE_VM_FLAG_MIGRATION))
1642                 xe->usm.num_vm_in_non_fault_mode--;
1643         mutex_unlock(&xe->usm.lock);
1644
1645         for_each_tile(tile, xe, id)
1646                 xe_range_fence_tree_fini(&vm->rftree[id]);
1647
1648         xe_vm_put(vm);
1649 }
1650
1651 static void vm_destroy_work_func(struct work_struct *w)
1652 {
1653         struct xe_vm *vm =
1654                 container_of(w, struct xe_vm, destroy_work);
1655         struct xe_device *xe = vm->xe;
1656         struct xe_tile *tile;
1657         u8 id;
1658         void *lookup;
1659
1660         /* xe_vm_close_and_put was not called? */
1661         xe_assert(xe, !vm->size);
1662
1663         if (!(vm->flags & XE_VM_FLAG_MIGRATION)) {
1664                 xe_device_mem_access_put(xe);
1665
1666                 if (xe->info.has_asid) {
1667                         mutex_lock(&xe->usm.lock);
1668                         lookup = xa_erase(&xe->usm.asid_to_vm, vm->usm.asid);
1669                         xe_assert(xe, lookup == vm);
1670                         mutex_unlock(&xe->usm.lock);
1671                 }
1672         }
1673
1674         for_each_tile(tile, xe, id)
1675                 XE_WARN_ON(vm->pt_root[id]);
1676
1677         trace_xe_vm_free(vm);
1678         dma_fence_put(vm->rebind_fence);
1679         kfree(vm);
1680 }
1681
1682 static void xe_vm_free(struct drm_gpuvm *gpuvm)
1683 {
1684         struct xe_vm *vm = container_of(gpuvm, struct xe_vm, gpuvm);
1685
1686         /* To destroy the VM we need to be able to sleep */
1687         queue_work(system_unbound_wq, &vm->destroy_work);
1688 }
1689
1690 struct xe_vm *xe_vm_lookup(struct xe_file *xef, u32 id)
1691 {
1692         struct xe_vm *vm;
1693
1694         mutex_lock(&xef->vm.lock);
1695         vm = xa_load(&xef->vm.xa, id);
1696         if (vm)
1697                 xe_vm_get(vm);
1698         mutex_unlock(&xef->vm.lock);
1699
1700         return vm;
1701 }
1702
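/*
 * xe_vm_pdp4_descriptor() - encode the PDP descriptor for a tile's root PT
 *
 * Encodes the tile's root page-table BO (offset 0, XE_CACHE_WB) as a page
 * directory entry via the vm->pt_ops->pde_encode_bo() hook. The result is
 * assumed to be what callers program as the PPGTT root pointer for that
 * tile (e.g. into a ring/LRC context), roughly:
 *
 *	u64 pdp = xe_vm_pdp4_descriptor(vm, tile);
 */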
1703 u64 xe_vm_pdp4_descriptor(struct xe_vm *vm, struct xe_tile *tile)
1704 {
1705         return vm->pt_ops->pde_encode_bo(vm->pt_root[tile->id]->bo, 0,
1706                                          XE_CACHE_WB);
1707 }
1708
1709 static struct dma_fence *
1710 xe_vm_unbind_vma(struct xe_vma *vma, struct xe_exec_queue *q,
1711                  struct xe_sync_entry *syncs, u32 num_syncs,
1712                  bool first_op, bool last_op)
1713 {
1714         struct xe_tile *tile;
1715         struct dma_fence *fence = NULL;
1716         struct dma_fence **fences = NULL;
1717         struct dma_fence_array *cf = NULL;
1718         struct xe_vm *vm = xe_vma_vm(vma);
1719         int cur_fence = 0, i;
1720         int number_tiles = hweight8(vma->tile_present);
1721         int err;
1722         u8 id;
1723
1724         trace_xe_vma_unbind(vma);
1725
1726         if (number_tiles > 1) {
1727                 fences = kmalloc_array(number_tiles, sizeof(*fences),
1728                                        GFP_KERNEL);
1729                 if (!fences)
1730                         return ERR_PTR(-ENOMEM);
1731         }
1732
1733         for_each_tile(tile, vm->xe, id) {
1734                 if (!(vma->tile_present & BIT(id)))
1735                         goto next;
1736
1737                 fence = __xe_pt_unbind_vma(tile, vma, q, first_op ? syncs : NULL,
1738                                            first_op ? num_syncs : 0);
1739                 if (IS_ERR(fence)) {
1740                         err = PTR_ERR(fence);
1741                         goto err_fences;
1742                 }
1743
1744                 if (fences)
1745                         fences[cur_fence++] = fence;
1746
1747 next:
1748                 if (q && vm->pt_root[id] && !list_empty(&q->multi_gt_list))
1749                         q = list_next_entry(q, multi_gt_list);
1750         }
1751
1752         if (fences) {
1753                 cf = dma_fence_array_create(number_tiles, fences,
1754                                             vm->composite_fence_ctx,
1755                                             vm->composite_fence_seqno++,
1756                                             false);
1757                 if (!cf) {
1758                         --vm->composite_fence_seqno;
1759                         err = -ENOMEM;
1760                         goto err_fences;
1761                 }
1762         }
1763
1764         if (last_op) {
1765                 for (i = 0; i < num_syncs; i++)
1766                         xe_sync_entry_signal(&syncs[i], NULL,
1767                                              cf ? &cf->base : fence);
1768         }
1769
1770         return cf ? &cf->base : !fence ? dma_fence_get_stub() : fence;
1771
1772 err_fences:
1773         if (fences) {
1774                 while (cur_fence) {
1775                         /* FIXME: Rewind the previous binds? */
1776                         dma_fence_put(fences[--cur_fence]);
1777                 }
1778                 kfree(fences);
1779         }
1780
1781         return ERR_PTR(err);
1782 }
1783
1784 static struct dma_fence *
1785 xe_vm_bind_vma(struct xe_vma *vma, struct xe_exec_queue *q,
1786                struct xe_sync_entry *syncs, u32 num_syncs,
1787                bool first_op, bool last_op)
1788 {
1789         struct xe_tile *tile;
1790         struct dma_fence *fence;
1791         struct dma_fence **fences = NULL;
1792         struct dma_fence_array *cf = NULL;
1793         struct xe_vm *vm = xe_vma_vm(vma);
1794         int cur_fence = 0, i;
1795         int number_tiles = hweight8(vma->tile_mask);
1796         int err;
1797         u8 id;
1798
1799         trace_xe_vma_bind(vma);
1800
1801         if (number_tiles > 1) {
1802                 fences = kmalloc_array(number_tiles, sizeof(*fences),
1803                                        GFP_KERNEL);
1804                 if (!fences)
1805                         return ERR_PTR(-ENOMEM);
1806         }
1807
1808         for_each_tile(tile, vm->xe, id) {
1809                 if (!(vma->tile_mask & BIT(id)))
1810                         goto next;
1811
1812                 fence = __xe_pt_bind_vma(tile, vma, q ? q : vm->q[id],
1813                                          first_op ? syncs : NULL,
1814                                          first_op ? num_syncs : 0,
1815                                          vma->tile_present & BIT(id));
1816                 if (IS_ERR(fence)) {
1817                         err = PTR_ERR(fence);
1818                         goto err_fences;
1819                 }
1820
1821                 if (fences)
1822                         fences[cur_fence++] = fence;
1823
1824 next:
1825                 if (q && vm->pt_root[id] && !list_empty(&q->multi_gt_list))
1826                         q = list_next_entry(q, multi_gt_list);
1827         }
1828
1829         if (fences) {
1830                 cf = dma_fence_array_create(number_tiles, fences,
1831                                             vm->composite_fence_ctx,
1832                                             vm->composite_fence_seqno++,
1833                                             false);
1834                 if (!cf) {
1835                         --vm->composite_fence_seqno;
1836                         err = -ENOMEM;
1837                         goto err_fences;
1838                 }
1839         }
1840
1841         if (last_op) {
1842                 for (i = 0; i < num_syncs; i++)
1843                         xe_sync_entry_signal(&syncs[i], NULL,
1844                                              cf ? &cf->base : fence);
1845         }
1846
1847         return cf ? &cf->base : fence;
1848
1849 err_fences:
1850         if (fences) {
1851                 while (cur_fence) {
1852                         /* FIXME: Rewind the previous binds? */
1853                         dma_fence_put(fences[--cur_fence]);
1854                 }
1855                 kfree(fences);
1856         }
1857
1858         return ERR_PTR(err);
1859 }
1860
1861 struct async_op_fence {
1862         struct dma_fence fence;
1863         struct dma_fence *wait_fence;
1864         struct dma_fence_cb cb;
1865         struct xe_vm *vm;
1866         wait_queue_head_t wq;
1867         bool started;
1868 };
1869
1870 static const char *async_op_fence_get_driver_name(struct dma_fence *dma_fence)
1871 {
1872         return "xe";
1873 }
1874
1875 static const char *
1876 async_op_fence_get_timeline_name(struct dma_fence *dma_fence)
1877 {
1878         return "async_op_fence";
1879 }
1880
1881 static const struct dma_fence_ops async_op_fence_ops = {
1882         .get_driver_name = async_op_fence_get_driver_name,
1883         .get_timeline_name = async_op_fence_get_timeline_name,
1884 };
1885
1886 static void async_op_fence_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
1887 {
1888         struct async_op_fence *afence =
1889                 container_of(cb, struct async_op_fence, cb);
1890
1891         afence->fence.error = afence->wait_fence->error;
1892         dma_fence_signal(&afence->fence);
1893         xe_vm_put(afence->vm);
1894         dma_fence_put(afence->wait_fence);
1895         dma_fence_put(&afence->fence);
1896 }
1897
1898 static void add_async_op_fence_cb(struct xe_vm *vm,
1899                                   struct dma_fence *fence,
1900                                   struct async_op_fence *afence)
1901 {
1902         int ret;
1903
1904         if (!xe_vm_no_dma_fences(vm)) {
1905                 afence->started = true;
1906                 smp_wmb();
1907                 wake_up_all(&afence->wq);
1908         }
1909
1910         afence->wait_fence = dma_fence_get(fence);
1911         afence->vm = xe_vm_get(vm);
1912         dma_fence_get(&afence->fence);
1913         ret = dma_fence_add_callback(fence, &afence->cb, async_op_fence_cb);
1914         if (ret == -ENOENT) {
1915                 afence->fence.error = afence->wait_fence->error;
1916                 dma_fence_signal(&afence->fence);
1917         }
1918         if (ret) {
1919                 xe_vm_put(vm);
1920                 dma_fence_put(afence->wait_fence);
1921                 dma_fence_put(&afence->fence);
1922         }
1923         XE_WARN_ON(ret && ret != -ENOENT);
1924 }
1925
1926 int xe_vm_async_fence_wait_start(struct dma_fence *fence)
1927 {
1928         if (fence->ops == &async_op_fence_ops) {
1929                 struct async_op_fence *afence =
1930                         container_of(fence, struct async_op_fence, fence);
1931
1932                 xe_assert(afence->vm->xe, !xe_vm_no_dma_fences(afence->vm));
1933
1934                 smp_rmb();
1935                 return wait_event_interruptible(afence->wq, afence->started);
1936         }
1937
1938         return 0;
1939 }
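
/*
 * Ordering note for the started/wq handshake above: add_async_op_fence_cb()
 * sets afence->started and issues smp_wmb() before waking the queue, which
 * pairs with the smp_rmb() in xe_vm_async_fence_wait_start() before it
 * checks afence->started. A minimal sketch of the two sides:
 *
 *	bind side:				wait side:
 *	  afence->started = true;		  smp_rmb();
 *	  smp_wmb();				  wait_event_interruptible(
 *	  wake_up_all(&afence->wq);			afence->wq, afence->started);
 */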
1940
1941 static int __xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma,
1942                         struct xe_exec_queue *q, struct xe_sync_entry *syncs,
1943                         u32 num_syncs, struct async_op_fence *afence,
1944                         bool immediate, bool first_op, bool last_op)
1945 {
1946         struct dma_fence *fence;
1947
1948         xe_vm_assert_held(vm);
1949
1950         if (immediate) {
1951                 fence = xe_vm_bind_vma(vma, q, syncs, num_syncs, first_op,
1952                                        last_op);
1953                 if (IS_ERR(fence))
1954                         return PTR_ERR(fence);
1955         } else {
1956                 int i;
1957
1958                 xe_assert(vm->xe, xe_vm_in_fault_mode(vm));
1959
1960                 fence = dma_fence_get_stub();
1961                 if (last_op) {
1962                         for (i = 0; i < num_syncs; i++)
1963                                 xe_sync_entry_signal(&syncs[i], NULL, fence);
1964                 }
1965         }
1966         if (afence)
1967                 add_async_op_fence_cb(vm, fence, afence);
1968
1969         dma_fence_put(fence);
1970         return 0;
1971 }
1972
1973 static int xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma, struct xe_exec_queue *q,
1974                       struct xe_bo *bo, struct xe_sync_entry *syncs,
1975                       u32 num_syncs, struct async_op_fence *afence,
1976                       bool immediate, bool first_op, bool last_op)
1977 {
1978         int err;
1979
1980         xe_vm_assert_held(vm);
1981         xe_bo_assert_held(bo);
1982
1983         if (bo && immediate) {
1984                 err = xe_bo_validate(bo, vm, true);
1985                 if (err)
1986                         return err;
1987         }
1988
1989         return __xe_vm_bind(vm, vma, q, syncs, num_syncs, afence, immediate,
1990                             first_op, last_op);
1991 }
1992
1993 static int xe_vm_unbind(struct xe_vm *vm, struct xe_vma *vma,
1994                         struct xe_exec_queue *q, struct xe_sync_entry *syncs,
1995                         u32 num_syncs, struct async_op_fence *afence,
1996                         bool first_op, bool last_op)
1997 {
1998         struct dma_fence *fence;
1999
2000         xe_vm_assert_held(vm);
2001         xe_bo_assert_held(xe_vma_bo(vma));
2002
2003         fence = xe_vm_unbind_vma(vma, q, syncs, num_syncs, first_op, last_op);
2004         if (IS_ERR(fence))
2005                 return PTR_ERR(fence);
2006         if (afence)
2007                 add_async_op_fence_cb(vm, fence, afence);
2008
2009         xe_vma_destroy(vma, fence);
2010         dma_fence_put(fence);
2011
2012         return 0;
2013 }
2014
2015 static int vm_set_error_capture_address(struct xe_device *xe, struct xe_vm *vm,
2016                                         u64 value)
2017 {
2018         if (XE_IOCTL_DBG(xe, !value))
2019                 return -EINVAL;
2020
2021         if (XE_IOCTL_DBG(xe, !(vm->flags & XE_VM_FLAG_ASYNC_BIND_OPS)))
2022                 return -EOPNOTSUPP;
2023
2024         if (XE_IOCTL_DBG(xe, vm->async_ops.error_capture.addr))
2025                 return -EOPNOTSUPP;
2026
2027         vm->async_ops.error_capture.mm = current->mm;
2028         vm->async_ops.error_capture.addr = value;
2029         init_waitqueue_head(&vm->async_ops.error_capture.wq);
2030
2031         return 0;
2032 }
2033
2034 typedef int (*xe_vm_set_property_fn)(struct xe_device *xe, struct xe_vm *vm,
2035                                      u64 value);
2036
2037 static const xe_vm_set_property_fn vm_set_property_funcs[] = {
2038         [XE_VM_PROPERTY_BIND_OP_ERROR_CAPTURE_ADDRESS] =
2039                 vm_set_error_capture_address,
2040 };
2041
2042 static int vm_user_ext_set_property(struct xe_device *xe, struct xe_vm *vm,
2043                                     u64 extension)
2044 {
2045         u64 __user *address = u64_to_user_ptr(extension);
2046         struct drm_xe_ext_vm_set_property ext;
2047         int err;
2048
2049         err = __copy_from_user(&ext, address, sizeof(ext));
2050         if (XE_IOCTL_DBG(xe, err))
2051                 return -EFAULT;
2052
2053         if (XE_IOCTL_DBG(xe, ext.property >=
2054                          ARRAY_SIZE(vm_set_property_funcs)) ||
2055             XE_IOCTL_DBG(xe, ext.pad) ||
2056             XE_IOCTL_DBG(xe, ext.reserved[0] || ext.reserved[1]))
2057                 return -EINVAL;
2058
2059         return vm_set_property_funcs[ext.property](xe, vm, ext.value);
2060 }
2061
2062 typedef int (*xe_vm_user_extension_fn)(struct xe_device *xe, struct xe_vm *vm,
2063                                        u64 extension);
2064
2065 static const xe_vm_user_extension_fn vm_user_extension_funcs[] = {
2066         [XE_VM_EXTENSION_SET_PROPERTY] = vm_user_ext_set_property,
2067 };
2068
2069 #define MAX_USER_EXTENSIONS     16
2070 static int vm_user_extensions(struct xe_device *xe, struct xe_vm *vm,
2071                               u64 extensions, int ext_number)
2072 {
2073         u64 __user *address = u64_to_user_ptr(extensions);
2074         struct xe_user_extension ext;
2075         int err;
2076
2077         if (XE_IOCTL_DBG(xe, ext_number >= MAX_USER_EXTENSIONS))
2078                 return -E2BIG;
2079
2080         err = __copy_from_user(&ext, address, sizeof(ext));
2081         if (XE_IOCTL_DBG(xe, err))
2082                 return -EFAULT;
2083
2084         if (XE_IOCTL_DBG(xe, ext.pad) ||
2085             XE_IOCTL_DBG(xe, ext.name >=
2086                          ARRAY_SIZE(vm_user_extension_funcs)))
2087                 return -EINVAL;
2088
2089         err = vm_user_extension_funcs[ext.name](xe, vm, extensions);
2090         if (XE_IOCTL_DBG(xe, err))
2091                 return err;
2092
2093         if (ext.next_extension)
2094                 return vm_user_extensions(xe, vm, ext.next_extension,
2095                                           ++ext_number);
2096
2097         return 0;
2098 }
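
/*
 * User-space view of the extension chain parsed above (illustrative sketch;
 * structure/field names follow the uAPI types referenced in this file, and
 * capture_buf is a hypothetical user allocation for the error capture):
 *
 *	struct drm_xe_ext_vm_set_property ext = {
 *		.base.name = XE_VM_EXTENSION_SET_PROPERTY,
 *		.property = XE_VM_PROPERTY_BIND_OP_ERROR_CAPTURE_ADDRESS,
 *		.value = (uintptr_t)capture_buf,
 *	};
 *	struct drm_xe_vm_create create = {
 *		.extensions = (uintptr_t)&ext,
 *		.flags = DRM_XE_VM_CREATE_ASYNC_BIND_OPS,
 *	};
 *	ioctl(fd, DRM_IOCTL_XE_VM_CREATE, &create);
 */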
2099
2100 #define ALL_DRM_XE_VM_CREATE_FLAGS (DRM_XE_VM_CREATE_SCRATCH_PAGE | \
2101                                     DRM_XE_VM_CREATE_COMPUTE_MODE | \
2102                                     DRM_XE_VM_CREATE_ASYNC_BIND_OPS | \
2103                                     DRM_XE_VM_CREATE_FAULT_MODE)
2104
2105 int xe_vm_create_ioctl(struct drm_device *dev, void *data,
2106                        struct drm_file *file)
2107 {
2108         struct xe_device *xe = to_xe_device(dev);
2109         struct xe_file *xef = to_xe_file(file);
2110         struct drm_xe_vm_create *args = data;
2111         struct xe_tile *tile;
2112         struct xe_vm *vm;
2113         u32 id, asid;
2114         int err;
2115         u32 flags = 0;
2116
2117         if (XE_WA(xe_root_mmio_gt(xe), 14016763929))
2118                 args->flags |= DRM_XE_VM_CREATE_SCRATCH_PAGE;
2119
2120         if (XE_IOCTL_DBG(xe, args->flags & DRM_XE_VM_CREATE_FAULT_MODE &&
2121                          !xe->info.supports_usm))
2122                 return -EINVAL;
2123
2124         if (XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
2125                 return -EINVAL;
2126
2127         if (XE_IOCTL_DBG(xe, args->flags & ~ALL_DRM_XE_VM_CREATE_FLAGS))
2128                 return -EINVAL;
2129
2130         if (XE_IOCTL_DBG(xe, args->flags & DRM_XE_VM_CREATE_SCRATCH_PAGE &&
2131                          args->flags & DRM_XE_VM_CREATE_FAULT_MODE))
2132                 return -EINVAL;
2133
2134         if (XE_IOCTL_DBG(xe, args->flags & DRM_XE_VM_CREATE_COMPUTE_MODE &&
2135                          args->flags & DRM_XE_VM_CREATE_FAULT_MODE))
2136                 return -EINVAL;
2137
2138         if (XE_IOCTL_DBG(xe, args->flags & DRM_XE_VM_CREATE_FAULT_MODE &&
2139                          xe_device_in_non_fault_mode(xe)))
2140                 return -EINVAL;
2141
2142         if (XE_IOCTL_DBG(xe, !(args->flags & DRM_XE_VM_CREATE_FAULT_MODE) &&
2143                          xe_device_in_fault_mode(xe)))
2144                 return -EINVAL;
2145
2146         if (args->flags & DRM_XE_VM_CREATE_SCRATCH_PAGE)
2147                 flags |= XE_VM_FLAG_SCRATCH_PAGE;
2148         if (args->flags & DRM_XE_VM_CREATE_COMPUTE_MODE)
2149                 flags |= XE_VM_FLAG_COMPUTE_MODE;
2150         if (args->flags & DRM_XE_VM_CREATE_ASYNC_BIND_OPS)
2151                 flags |= XE_VM_FLAG_ASYNC_BIND_OPS;
2152         if (args->flags & DRM_XE_VM_CREATE_FAULT_MODE)
2153                 flags |= XE_VM_FLAG_FAULT_MODE;
2154
2155         vm = xe_vm_create(xe, flags);
2156         if (IS_ERR(vm))
2157                 return PTR_ERR(vm);
2158
2159         if (args->extensions) {
2160                 err = vm_user_extensions(xe, vm, args->extensions, 0);
2161                 if (XE_IOCTL_DBG(xe, err)) {
2162                         xe_vm_close_and_put(vm);
2163                         return err;
2164                 }
2165         }
2166
2167         mutex_lock(&xef->vm.lock);
2168         err = xa_alloc(&xef->vm.xa, &id, vm, xa_limit_32b, GFP_KERNEL);
2169         mutex_unlock(&xef->vm.lock);
2170         if (err) {
2171                 xe_vm_close_and_put(vm);
2172                 return err;
2173         }
2174
2175         if (xe->info.has_asid) {
2176                 mutex_lock(&xe->usm.lock);
2177                 err = xa_alloc_cyclic(&xe->usm.asid_to_vm, &asid, vm,
2178                                       XA_LIMIT(0, XE_MAX_ASID - 1),
2179                                       &xe->usm.next_asid, GFP_KERNEL);
2180                 mutex_unlock(&xe->usm.lock);
2181                 if (err) {
2182                         xe_vm_close_and_put(vm);
2183                         return err;
2184                 }
2185                 vm->usm.asid = asid;
2186         }
2187
2188         args->vm_id = id;
2189         vm->xef = xef;
2190
2191         /* Record the VM pagetable BO memory against the creating client */
2192         for_each_tile(tile, xe, id)
2193                 if (vm->pt_root[id])
2194                         xe_drm_client_add_bo(vm->xef->client, vm->pt_root[id]->bo);
2195
2196 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG_MEM)
2197         /* Warning: Security issue - never enable by default */
2198         args->reserved[0] = xe_bo_main_addr(vm->pt_root[0]->bo, XE_PAGE_SIZE);
2199 #endif
2200
2201         return 0;
2202 }
2203
2204 int xe_vm_destroy_ioctl(struct drm_device *dev, void *data,
2205                         struct drm_file *file)
2206 {
2207         struct xe_device *xe = to_xe_device(dev);
2208         struct xe_file *xef = to_xe_file(file);
2209         struct drm_xe_vm_destroy *args = data;
2210         struct xe_vm *vm;
2211         int err = 0;
2212
2213         if (XE_IOCTL_DBG(xe, args->pad) ||
2214             XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
2215                 return -EINVAL;
2216
2217         mutex_lock(&xef->vm.lock);
2218         vm = xa_load(&xef->vm.xa, args->vm_id);
2219         if (XE_IOCTL_DBG(xe, !vm))
2220                 err = -ENOENT;
2221         else if (XE_IOCTL_DBG(xe, vm->preempt.num_exec_queues))
2222                 err = -EBUSY;
2223         else
2224                 xa_erase(&xef->vm.xa, args->vm_id);
2225         mutex_unlock(&xef->vm.lock);
2226
2227         if (!err)
2228                 xe_vm_close_and_put(vm);
2229
2230         return err;
2231 }
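
/*
 * Illustrative user-space usage of the two ioctls above (sketch only; fd is
 * an open xe render node and error handling is omitted):
 *
 *	struct drm_xe_vm_create create = {
 *		.flags = DRM_XE_VM_CREATE_ASYNC_BIND_OPS |
 *			 DRM_XE_VM_CREATE_SCRATCH_PAGE,
 *	};
 *	ioctl(fd, DRM_IOCTL_XE_VM_CREATE, &create);
 *	...
 *	struct drm_xe_vm_destroy destroy = { .vm_id = create.vm_id };
 *	ioctl(fd, DRM_IOCTL_XE_VM_DESTROY, &destroy);
 */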
2232
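/*
 * Prefetch regions index this table: region 0 selects system/TT memory,
 * regions 1 and 2 select the VRAM placements.
 */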
2233 static const u32 region_to_mem_type[] = {
2234         XE_PL_TT,
2235         XE_PL_VRAM0,
2236         XE_PL_VRAM1,
2237 };
2238
2239 static int xe_vm_prefetch(struct xe_vm *vm, struct xe_vma *vma,
2240                           struct xe_exec_queue *q, u32 region,
2241                           struct xe_sync_entry *syncs, u32 num_syncs,
2242                           struct async_op_fence *afence, bool first_op,
2243                           bool last_op)
2244 {
2245         int err;
2246
2247         xe_assert(vm->xe, region < ARRAY_SIZE(region_to_mem_type));
2248
2249         if (!xe_vma_has_no_bo(vma)) {
2250                 err = xe_bo_migrate(xe_vma_bo(vma), region_to_mem_type[region]);
2251                 if (err)
2252                         return err;
2253         }
2254
2255         if (vma->tile_mask != (vma->tile_present & ~vma->usm.tile_invalidated)) {
2256                 return xe_vm_bind(vm, vma, q, xe_vma_bo(vma), syncs, num_syncs,
2257                                   afence, true, first_op, last_op);
2258         } else {
2259                 int i;
2260
2261                 /* Nothing to do, signal fences now */
2262                 if (last_op) {
2263                         for (i = 0; i < num_syncs; i++)
2264                                 xe_sync_entry_signal(&syncs[i], NULL,
2265                                                      dma_fence_get_stub());
2266                 }
2267                 if (afence)
2268                         dma_fence_signal(&afence->fence);
2269                 return 0;
2270         }
2271 }
2272
2273 #define VM_BIND_OP(op)  (op & 0xffff)
2274
2275 static void vm_set_async_error(struct xe_vm *vm, int err)
2276 {
2277         lockdep_assert_held(&vm->lock);
2278         vm->async_ops.error = err;
2279 }
2280
2281 static int vm_bind_ioctl_lookup_vma(struct xe_vm *vm, struct xe_bo *bo,
2282                                     u64 addr, u64 range, u32 op)
2283 {
2284         struct xe_device *xe = vm->xe;
2285         struct xe_vma *vma;
2286         bool async = !!(op & XE_VM_BIND_FLAG_ASYNC);
2287
2288         lockdep_assert_held(&vm->lock);
2289
2290         switch (VM_BIND_OP(op)) {
2291         case XE_VM_BIND_OP_MAP:
2292         case XE_VM_BIND_OP_MAP_USERPTR:
2293                 vma = xe_vm_find_overlapping_vma(vm, addr, range);
2294                 if (XE_IOCTL_DBG(xe, vma && !async))
2295                         return -EBUSY;
2296                 break;
2297         case XE_VM_BIND_OP_UNMAP:
2298         case XE_VM_BIND_OP_PREFETCH:
2299                 vma = xe_vm_find_overlapping_vma(vm, addr, range);
2300                 if (XE_IOCTL_DBG(xe, !vma))
2301                         /* Not an actual error; the IOCTL cleans up and returns 0 */
2302                         return -ENODATA;
2303                 if (XE_IOCTL_DBG(xe, (xe_vma_start(vma) != addr ||
2304                                       xe_vma_end(vma) != addr + range) && !async))
2305                         return -EINVAL;
2306                 break;
2307         case XE_VM_BIND_OP_UNMAP_ALL:
2308                 if (XE_IOCTL_DBG(xe, list_empty(&bo->ttm.base.gpuva.list)))
2309                         /* Not an actual error; the IOCTL cleans up and returns 0 */
2310                         return -ENODATA;
2311                 break;
2312         default:
2313                 drm_warn(&xe->drm, "NOT POSSIBLE");
2314                 return -EINVAL;
2315         }
2316
2317         return 0;
2318 }
2319
2320 static void prep_vma_destroy(struct xe_vm *vm, struct xe_vma *vma,
2321                              bool post_commit)
2322 {
2323         down_read(&vm->userptr.notifier_lock);
2324         vma->gpuva.flags |= XE_VMA_DESTROYED;
2325         up_read(&vm->userptr.notifier_lock);
2326         if (post_commit)
2327                 xe_vm_remove_vma(vm, vma);
2328 }
2329
2330 #undef ULL
2331 #define ULL     unsigned long long
2332
2333 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG_VM)
2334 static void print_op(struct xe_device *xe, struct drm_gpuva_op *op)
2335 {
2336         struct xe_vma *vma;
2337
2338         switch (op->op) {
2339         case DRM_GPUVA_OP_MAP:
2340                 vm_dbg(&xe->drm, "MAP: addr=0x%016llx, range=0x%016llx",
2341                        (ULL)op->map.va.addr, (ULL)op->map.va.range);
2342                 break;
2343         case DRM_GPUVA_OP_REMAP:
2344                 vma = gpuva_to_vma(op->remap.unmap->va);
2345                 vm_dbg(&xe->drm, "REMAP:UNMAP: addr=0x%016llx, range=0x%016llx, keep=%d",
2346                        (ULL)xe_vma_start(vma), (ULL)xe_vma_size(vma),
2347                        op->remap.unmap->keep ? 1 : 0);
2348                 if (op->remap.prev)
2349                         vm_dbg(&xe->drm,
2350                                "REMAP:PREV: addr=0x%016llx, range=0x%016llx",
2351                                (ULL)op->remap.prev->va.addr,
2352                                (ULL)op->remap.prev->va.range);
2353                 if (op->remap.next)
2354                         vm_dbg(&xe->drm,
2355                                "REMAP:NEXT: addr=0x%016llx, range=0x%016llx",
2356                                (ULL)op->remap.next->va.addr,
2357                                (ULL)op->remap.next->va.range);
2358                 break;
2359         case DRM_GPUVA_OP_UNMAP:
2360                 vma = gpuva_to_vma(op->unmap.va);
2361                 vm_dbg(&xe->drm, "UNMAP: addr=0x%016llx, range=0x%016llx, keep=%d",
2362                        (ULL)xe_vma_start(vma), (ULL)xe_vma_size(vma),
2363                        op->unmap.keep ? 1 : 0);
2364                 break;
2365         case DRM_GPUVA_OP_PREFETCH:
2366                 vma = gpuva_to_vma(op->prefetch.va);
2367                 vm_dbg(&xe->drm, "PREFETCH: addr=0x%016llx, range=0x%016llx",
2368                        (ULL)xe_vma_start(vma), (ULL)xe_vma_size(vma));
2369                 break;
2370         default:
2371                 drm_warn(&xe->drm, "NOT POSSIBLE");
2372         }
2373 }
2374 #else
2375 static void print_op(struct xe_device *xe, struct drm_gpuva_op *op)
2376 {
2377 }
2378 #endif
2379
2380 /*
2381  * Create operations list from IOCTL arguments, setup operations fields so parse
2382  * and commit steps are decoupled from IOCTL arguments. This step can fail.
2383  */
2384 static struct drm_gpuva_ops *
2385 vm_bind_ioctl_ops_create(struct xe_vm *vm, struct xe_bo *bo,
2386                          u64 bo_offset_or_userptr, u64 addr, u64 range,
2387                          u32 operation, u8 tile_mask, u32 region)
2388 {
2389         struct drm_gem_object *obj = bo ? &bo->ttm.base : NULL;
2390         struct drm_gpuva_ops *ops;
2391         struct drm_gpuva_op *__op;
2392         struct xe_vma_op *op;
2393         struct drm_gpuvm_bo *vm_bo;
2394         int err;
2395
2396         lockdep_assert_held_write(&vm->lock);
2397
2398         vm_dbg(&vm->xe->drm,
2399                "op=%d, addr=0x%016llx, range=0x%016llx, bo_offset_or_userptr=0x%016llx",
2400                VM_BIND_OP(operation), (ULL)addr, (ULL)range,
2401                (ULL)bo_offset_or_userptr);
2402
2403         switch (VM_BIND_OP(operation)) {
2404         case XE_VM_BIND_OP_MAP:
2405         case XE_VM_BIND_OP_MAP_USERPTR:
2406                 ops = drm_gpuvm_sm_map_ops_create(&vm->gpuvm, addr, range,
2407                                                   obj, bo_offset_or_userptr);
2408                 if (IS_ERR(ops))
2409                         return ops;
2410
2411                 drm_gpuva_for_each_op(__op, ops) {
2412                         struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2413
2414                         op->tile_mask = tile_mask;
2415                         op->map.immediate =
2416                                 operation & XE_VM_BIND_FLAG_IMMEDIATE;
2417                         op->map.read_only =
2418                                 operation & XE_VM_BIND_FLAG_READONLY;
2419                         op->map.is_null = operation & XE_VM_BIND_FLAG_NULL;
2420                 }
2421                 break;
2422         case XE_VM_BIND_OP_UNMAP:
2423                 ops = drm_gpuvm_sm_unmap_ops_create(&vm->gpuvm, addr, range);
2424                 if (IS_ERR(ops))
2425                         return ops;
2426
2427                 drm_gpuva_for_each_op(__op, ops) {
2428                         struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2429
2430                         op->tile_mask = tile_mask;
2431                 }
2432                 break;
2433         case XE_VM_BIND_OP_PREFETCH:
2434                 ops = drm_gpuvm_prefetch_ops_create(&vm->gpuvm, addr, range);
2435                 if (IS_ERR(ops))
2436                         return ops;
2437
2438                 drm_gpuva_for_each_op(__op, ops) {
2439                         struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2440
2441                         op->tile_mask = tile_mask;
2442                         op->prefetch.region = region;
2443                 }
2444                 break;
2445         case XE_VM_BIND_OP_UNMAP_ALL:
2446                 xe_assert(vm->xe, bo);
2447
2448                 err = xe_bo_lock(bo, true);
2449                 if (err)
2450                         return ERR_PTR(err);
2451
2452                 vm_bo = drm_gpuvm_bo_find(&vm->gpuvm, obj);
2453                 if (!vm_bo) {
                             xe_bo_unlock(bo);
                             ops = ERR_PTR(-ENODATA);
                             break;
                     }
2455
2456                 ops = drm_gpuvm_bo_unmap_ops_create(vm_bo);
2457                 drm_gpuvm_bo_put(vm_bo);
2458                 xe_bo_unlock(bo);
2459                 if (IS_ERR(ops))
2460                         return ops;
2461
2462                 drm_gpuva_for_each_op(__op, ops) {
2463                         struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2464
2465                         op->tile_mask = tile_mask;
2466                 }
2467                 break;
2468         default:
2469                 drm_warn(&vm->xe->drm, "NOT POSSIBLE");
2470                 ops = ERR_PTR(-EINVAL);
2471         }
2472
2473 #ifdef TEST_VM_ASYNC_OPS_ERROR
2474         if (operation & FORCE_ASYNC_OP_ERROR) {
2475                 op = list_first_entry_or_null(&ops->list, struct xe_vma_op,
2476                                               base.entry);
2477                 if (op)
2478                         op->inject_error = true;
2479         }
2480 #endif
2481
2482         if (!IS_ERR(ops))
2483                 drm_gpuva_for_each_op(__op, ops)
2484                         print_op(vm->xe, __op);
2485
2486         return ops;
2487 }
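
/*
 * The ops list produced above feeds the later stages of the bind path; a
 * minimal sketch of the intended sequence, assuming the usual bind IOCTL
 * flow (error handling and unwinding omitted):
 *
 *	ops = vm_bind_ioctl_ops_create(vm, bo, offset, addr, range,
 *				       operation, tile_mask, region);
 *	err = vm_bind_ioctl_ops_parse(vm, q, ops, syncs, num_syncs,
 *				      &ops_list, last, async);
 *	err = vm_bind_ioctl_ops_execute(vm, &ops_list, async);
 */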
2488
2489 static struct xe_vma *new_vma(struct xe_vm *vm, struct drm_gpuva_op_map *op,
2490                               u8 tile_mask, bool read_only, bool is_null)
2491 {
2492         struct xe_bo *bo = op->gem.obj ? gem_to_xe_bo(op->gem.obj) : NULL;
2493         struct xe_vma *vma;
2494         int err;
2495
2496         lockdep_assert_held_write(&vm->lock);
2497
2498         if (bo) {
2499                 err = xe_bo_lock(bo, true);
2500                 if (err)
2501                         return ERR_PTR(err);
2502         }
2503         vma = xe_vma_create(vm, bo, op->gem.offset,
2504                             op->va.addr, op->va.addr +
2505                             op->va.range - 1, read_only, is_null,
2506                             tile_mask);
2507         if (bo)
2508                 xe_bo_unlock(bo);
2509
2510         if (xe_vma_is_userptr(vma)) {
2511                 err = xe_vma_userptr_pin_pages(vma);
2512                 if (err) {
2513                         prep_vma_destroy(vm, vma, false);
2514                         xe_vma_destroy_unlocked(vma);
2515                         return ERR_PTR(err);
2516                 }
2517         } else if (!xe_vma_has_no_bo(vma) && !bo->vm) {
2518                 vm_insert_extobj(vm, vma);
2519                 err = add_preempt_fences(vm, bo);
2520                 if (err) {
2521                         prep_vma_destroy(vm, vma, false);
2522                         xe_vma_destroy_unlocked(vma);
2523                         return ERR_PTR(err);
2524                 }
2525         }
2526
2527         return vma;
2528 }
2529
2530 static u64 xe_vma_max_pte_size(struct xe_vma *vma)
2531 {
2532         if (vma->gpuva.flags & XE_VMA_PTE_1G)
2533                 return SZ_1G;
2534         else if (vma->gpuva.flags & XE_VMA_PTE_2M)
2535                 return SZ_2M;
2536
2537         return SZ_4K;
2538 }
2539
2540 static u64 xe_vma_set_pte_size(struct xe_vma *vma, u64 size)
2541 {
2542         switch (size) {
2543         case SZ_1G:
2544                 vma->gpuva.flags |= XE_VMA_PTE_1G;
2545                 break;
2546         case SZ_2M:
2547                 vma->gpuva.flags |= XE_VMA_PTE_2M;
2548                 break;
2549         }
2550
2551         return SZ_4K;
2552 }
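
/*
 * These helpers let the REMAP parsing below skip rebinding an untouched
 * prev/next fragment only when doing so cannot split a huge PTE. Worked
 * example, assuming a non-userptr VMA previously bound with 2M PTEs: a prev
 * fragment whose end is 2M aligned keeps its mappings and is skipped, while
 * one ending mid-2M must be rebound (with 4K PTEs at the boundary):
 *
 *	skip_prev = !xe_vma_is_userptr(old) &&
 *		    IS_ALIGNED(xe_vma_end(prev), xe_vma_max_pte_size(old));
 */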
2553
2554 static int xe_vma_op_commit(struct xe_vm *vm, struct xe_vma_op *op)
2555 {
2556         int err = 0;
2557
2558         lockdep_assert_held_write(&vm->lock);
2559
2560         switch (op->base.op) {
2561         case DRM_GPUVA_OP_MAP:
2562                 err |= xe_vm_insert_vma(vm, op->map.vma);
2563                 if (!err)
2564                         op->flags |= XE_VMA_OP_COMMITTED;
2565                 break;
2566         case DRM_GPUVA_OP_REMAP:
2567                 prep_vma_destroy(vm, gpuva_to_vma(op->base.remap.unmap->va),
2568                                  true);
2569                 op->flags |= XE_VMA_OP_COMMITTED;
2570
2571                 if (op->remap.prev) {
2572                         err |= xe_vm_insert_vma(vm, op->remap.prev);
2573                         if (!err)
2574                                 op->flags |= XE_VMA_OP_PREV_COMMITTED;
2575                         if (!err && op->remap.skip_prev)
2576                                 op->remap.prev = NULL;
2577                 }
2578                 if (op->remap.next) {
2579                         err |= xe_vm_insert_vma(vm, op->remap.next);
2580                         if (!err)
2581                                 op->flags |= XE_VMA_OP_NEXT_COMMITTED;
2582                         if (!err && op->remap.skip_next)
2583                                 op->remap.next = NULL;
2584                 }
2585
2586                 /* Adjust for partial unbind after removing VMA from VM */
2587                 if (!err) {
2588                         op->base.remap.unmap->va->va.addr = op->remap.start;
2589                         op->base.remap.unmap->va->va.range = op->remap.range;
2590                 }
2591                 break;
2592         case DRM_GPUVA_OP_UNMAP:
2593                 prep_vma_destroy(vm, gpuva_to_vma(op->base.unmap.va), true);
2594                 op->flags |= XE_VMA_OP_COMMITTED;
2595                 break;
2596         case DRM_GPUVA_OP_PREFETCH:
2597                 op->flags |= XE_VMA_OP_COMMITTED;
2598                 break;
2599         default:
2600                 drm_warn(&vm->xe->drm, "NOT POSSIBLE");
2601         }
2602
2603         return err;
2604 }
2605
2606
2607 static int vm_bind_ioctl_ops_parse(struct xe_vm *vm, struct xe_exec_queue *q,
2608                                    struct drm_gpuva_ops *ops,
2609                                    struct xe_sync_entry *syncs, u32 num_syncs,
2610                                    struct list_head *ops_list, bool last,
2611                                    bool async)
2612 {
2613         struct xe_vma_op *last_op = NULL;
2614         struct async_op_fence *fence = NULL;
2615         struct drm_gpuva_op *__op;
2616         int err = 0;
2617
2618         lockdep_assert_held_write(&vm->lock);
2619
2620         if (last && num_syncs && async) {
2621                 u64 seqno;
2622
2623                 fence = kmalloc(sizeof(*fence), GFP_KERNEL);
2624                 if (!fence)
2625                         return -ENOMEM;
2626
2627                 seqno = q ? ++q->bind.fence_seqno : ++vm->async_ops.fence.seqno;
2628                 dma_fence_init(&fence->fence, &async_op_fence_ops,
2629                                &vm->async_ops.lock, q ? q->bind.fence_ctx :
2630                                vm->async_ops.fence.context, seqno);
2631
2632                 if (!xe_vm_no_dma_fences(vm)) {
2633                         fence->vm = vm;
2634                         fence->started = false;
2635                         init_waitqueue_head(&fence->wq);
2636                 }
2637         }
2638
2639         drm_gpuva_for_each_op(__op, ops) {
2640                 struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2641                 bool first = list_empty(ops_list);
2642
2643                 xe_assert(vm->xe, first || async);
2644
2645                 INIT_LIST_HEAD(&op->link);
2646                 list_add_tail(&op->link, ops_list);
2647
2648                 if (first) {
2649                         op->flags |= XE_VMA_OP_FIRST;
2650                         op->num_syncs = num_syncs;
2651                         op->syncs = syncs;
2652                 }
2653
2654                 op->q = q;
2655
2656                 switch (op->base.op) {
2657                 case DRM_GPUVA_OP_MAP:
2658                 {
2659                         struct xe_vma *vma;
2660
2661                         vma = new_vma(vm, &op->base.map,
2662                                       op->tile_mask, op->map.read_only,
2663                                       op->map.is_null);
2664                         if (IS_ERR(vma)) {
2665                                 err = PTR_ERR(vma);
2666                                 goto free_fence;
2667                         }
2668
2669                         op->map.vma = vma;
2670                         break;
2671                 }
2672                 case DRM_GPUVA_OP_REMAP:
2673                 {
2674                         struct xe_vma *old =
2675                                 gpuva_to_vma(op->base.remap.unmap->va);
2676
2677                         op->remap.start = xe_vma_start(old);
2678                         op->remap.range = xe_vma_size(old);
2679
2680                         if (op->base.remap.prev) {
2681                                 struct xe_vma *vma;
2682                                 bool read_only =
2683                                         op->base.remap.unmap->va->flags &
2684                                         XE_VMA_READ_ONLY;
2685                                 bool is_null =
2686                                         op->base.remap.unmap->va->flags &
2687                                         DRM_GPUVA_SPARSE;
2688
2689                                 vma = new_vma(vm, op->base.remap.prev,
2690                                               op->tile_mask, read_only,
2691                                               is_null);
2692                                 if (IS_ERR(vma)) {
2693                                         err = PTR_ERR(vma);
2694                                         goto free_fence;
2695                                 }
2696
2697                                 op->remap.prev = vma;
2698
2699                                 /*
2700                                  * Userptr creates a new SG mapping so
2701                                  * we must also rebind.
2702                                  */
2703                                 op->remap.skip_prev = !xe_vma_is_userptr(old) &&
2704                                         IS_ALIGNED(xe_vma_end(vma),
2705                                                    xe_vma_max_pte_size(old));
2706                                 if (op->remap.skip_prev) {
2707                                         xe_vma_set_pte_size(vma, xe_vma_max_pte_size(old));
2708                                         op->remap.range -=
2709                                                 xe_vma_end(vma) -
2710                                                 xe_vma_start(old);
2711                                         op->remap.start = xe_vma_end(vma);
2712                                 }
2713                         }
2714
2715                         if (op->base.remap.next) {
2716                                 struct xe_vma *vma;
2717                                 bool read_only =
2718                                         op->base.remap.unmap->va->flags &
2719                                         XE_VMA_READ_ONLY;
2720
2721                                 bool is_null =
2722                                         op->base.remap.unmap->va->flags &
2723                                         DRM_GPUVA_SPARSE;
2724
2725                                 vma = new_vma(vm, op->base.remap.next,
2726                                               op->tile_mask, read_only,
2727                                               is_null);
2728                                 if (IS_ERR(vma)) {
2729                                         err = PTR_ERR(vma);
2730                                         goto free_fence;
2731                                 }
2732
2733                                 op->remap.next = vma;
2734
2735                                 /*
2736                                  * Userptr creates a new SG mapping so
2737                                  * we must also rebind.
2738                                  */
2739                                 op->remap.skip_next = !xe_vma_is_userptr(old) &&
2740                                         IS_ALIGNED(xe_vma_start(vma),
2741                                                    xe_vma_max_pte_size(old));
2742                                 if (op->remap.skip_next) {
2743                                         xe_vma_set_pte_size(vma, xe_vma_max_pte_size(old));
2744                                         op->remap.range -=
2745                                                 xe_vma_end(old) -
2746                                                 xe_vma_start(vma);
2747                                 }
2748                         }
2749                         break;
2750                 }
2751                 case DRM_GPUVA_OP_UNMAP:
2752                 case DRM_GPUVA_OP_PREFETCH:
2753                         /* Nothing to do */
2754                         break;
2755                 default:
2756                         drm_warn(&vm->xe->drm, "NOT POSSIBLE");
2757                 }
2758
2759                 last_op = op;
2760
2761                 err = xe_vma_op_commit(vm, op);
2762                 if (err)
2763                         goto free_fence;
2764         }
2765
2766         /* FIXME: Unhandled corner case */
2767         XE_WARN_ON(!last_op && last && !list_empty(ops_list));
2768
2769         if (!last_op)
2770                 goto free_fence;
2771         last_op->ops = ops;
2772         if (last) {
2773                 last_op->flags |= XE_VMA_OP_LAST;
2774                 last_op->num_syncs = num_syncs;
2775                 last_op->syncs = syncs;
2776                 last_op->fence = fence;
2777         }
2778
2779         return 0;
2780
2781 free_fence:
2782         kfree(fence);
2783         return err;
2784 }
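
/*
 * XE_VMA_OP_FIRST / XE_VMA_OP_LAST set above steer sync handling during
 * execution: only the first op in a list consumes the input syncs as waits
 * and only the last op signals them (and the async fence), so a multi-op
 * list waits once up front and signals once at the end.
 */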
2785
2786 static int op_execute(struct drm_exec *exec, struct xe_vm *vm,
2787                       struct xe_vma *vma, struct xe_vma_op *op)
2788 {
2789         int err;
2790
2791         lockdep_assert_held_write(&vm->lock);
2792
2793         err = xe_vm_prepare_vma(exec, vma, 1);
2794         if (err)
2795                 return err;
2796
2797         xe_vm_assert_held(vm);
2798         xe_bo_assert_held(xe_vma_bo(vma));
2799
2800         switch (op->base.op) {
2801         case DRM_GPUVA_OP_MAP:
2802                 err = xe_vm_bind(vm, vma, op->q, xe_vma_bo(vma),
2803                                  op->syncs, op->num_syncs, op->fence,
2804                                  op->map.immediate || !xe_vm_in_fault_mode(vm),
2805                                  op->flags & XE_VMA_OP_FIRST,
2806                                  op->flags & XE_VMA_OP_LAST);
2807                 break;
2808         case DRM_GPUVA_OP_REMAP:
2809         {
2810                 bool prev = !!op->remap.prev;
2811                 bool next = !!op->remap.next;
2812
2813                 if (!op->remap.unmap_done) {
2814                         if (prev || next) {
2815                                 vm->async_ops.munmap_rebind_inflight = true;
2816                                 vma->gpuva.flags |= XE_VMA_FIRST_REBIND;
2817                         }
2818                         err = xe_vm_unbind(vm, vma, op->q, op->syncs,
2819                                            op->num_syncs,
2820                                            !prev && !next ? op->fence : NULL,
2821                                            op->flags & XE_VMA_OP_FIRST,
2822                                            op->flags & XE_VMA_OP_LAST && !prev &&
2823                                            !next);
2824                         if (err)
2825                                 break;
2826                         op->remap.unmap_done = true;
2827                 }
2828
2829                 if (prev) {
2830                         op->remap.prev->gpuva.flags |= XE_VMA_LAST_REBIND;
2831                         err = xe_vm_bind(vm, op->remap.prev, op->q,
2832                                          xe_vma_bo(op->remap.prev), op->syncs,
2833                                          op->num_syncs,
2834                                          !next ? op->fence : NULL, true, false,
2835                                          op->flags & XE_VMA_OP_LAST && !next);
2836                         op->remap.prev->gpuva.flags &= ~XE_VMA_LAST_REBIND;
2837                         if (err)
2838                                 break;
2839                         op->remap.prev = NULL;
2840                 }
2841
2842                 if (next) {
2843                         op->remap.next->gpuva.flags |= XE_VMA_LAST_REBIND;
2844                         err = xe_vm_bind(vm, op->remap.next, op->q,
2845                                          xe_vma_bo(op->remap.next),
2846                                          op->syncs, op->num_syncs,
2847                                          op->fence, true, false,
2848                                          op->flags & XE_VMA_OP_LAST);
2849                         op->remap.next->gpuva.flags &= ~XE_VMA_LAST_REBIND;
2850                         if (err)
2851                                 break;
2852                         op->remap.next = NULL;
2853                 }
2854                 vm->async_ops.munmap_rebind_inflight = false;
2855
2856                 break;
2857         }
2858         case DRM_GPUVA_OP_UNMAP:
2859                 err = xe_vm_unbind(vm, vma, op->q, op->syncs,
2860                                    op->num_syncs, op->fence,
2861                                    op->flags & XE_VMA_OP_FIRST,
2862                                    op->flags & XE_VMA_OP_LAST);
2863                 break;
2864         case DRM_GPUVA_OP_PREFETCH:
2865                 err = xe_vm_prefetch(vm, vma, op->q, op->prefetch.region,
2866                                      op->syncs, op->num_syncs, op->fence,
2867                                      op->flags & XE_VMA_OP_FIRST,
2868                                      op->flags & XE_VMA_OP_LAST);
2869                 break;
2870         default:
2871                 drm_warn(&vm->xe->drm, "NOT POSSIBLE");
2872         }
2873
2874         if (err)
2875                 trace_xe_vma_fail(vma);
2876
2877         return err;
2878 }
2879
2880 static int __xe_vma_op_execute(struct xe_vm *vm, struct xe_vma *vma,
2881                                struct xe_vma_op *op)
2882 {
2883         struct drm_exec exec;
2884         int err;
2885
2886 retry_userptr:
2887         drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT);
2888         drm_exec_until_all_locked(&exec) {
2889                 err = op_execute(&exec, vm, vma, op);
2890                 drm_exec_retry_on_contention(&exec);
2891                 if (err)
2892                         break;
2893         }
2894         drm_exec_fini(&exec);
2895
2896         if (err == -EAGAIN && xe_vma_is_userptr(vma)) {
2897                 lockdep_assert_held_write(&vm->lock);
2898                 err = xe_vma_userptr_pin_pages(vma);
2899                 if (!err)
2900                         goto retry_userptr;
2901
2902                 trace_xe_vma_fail(vma);
2903         }
2904
2905         return err;
2906 }
2907
2908 static int xe_vma_op_execute(struct xe_vm *vm, struct xe_vma_op *op)
2909 {
2910         int ret = 0;
2911
2912         lockdep_assert_held_write(&vm->lock);
2913
2914 #ifdef TEST_VM_ASYNC_OPS_ERROR
2915         if (op->inject_error) {
2916                 op->inject_error = false;
2917                 return -ENOMEM;
2918         }
2919 #endif
2920
2921         switch (op->base.op) {
2922         case DRM_GPUVA_OP_MAP:
2923                 ret = __xe_vma_op_execute(vm, op->map.vma, op);
2924                 break;
2925         case DRM_GPUVA_OP_REMAP:
2926         {
2927                 struct xe_vma *vma;
2928
2929                 if (!op->remap.unmap_done)
2930                         vma = gpuva_to_vma(op->base.remap.unmap->va);
2931                 else if (op->remap.prev)
2932                         vma = op->remap.prev;
2933                 else
2934                         vma = op->remap.next;
2935
2936                 ret = __xe_vma_op_execute(vm, vma, op);
2937                 break;
2938         }
2939         case DRM_GPUVA_OP_UNMAP:
2940                 ret = __xe_vma_op_execute(vm, gpuva_to_vma(op->base.unmap.va),
2941                                           op);
2942                 break;
2943         case DRM_GPUVA_OP_PREFETCH:
2944                 ret = __xe_vma_op_execute(vm,
2945                                           gpuva_to_vma(op->base.prefetch.va),
2946                                           op);
2947                 break;
2948         default:
2949                 drm_warn(&vm->xe->drm, "NOT POSSIBLE");
2950         }
2951
2952         return ret;
2953 }
2954
2955 static void xe_vma_op_cleanup(struct xe_vm *vm, struct xe_vma_op *op)
2956 {
2957         bool last = op->flags & XE_VMA_OP_LAST;
2958
2959         if (last) {
2960                 while (op->num_syncs--)
2961                         xe_sync_entry_cleanup(&op->syncs[op->num_syncs]);
2962                 kfree(op->syncs);
2963                 if (op->q)
2964                         xe_exec_queue_put(op->q);
2965                 if (op->fence)
2966                         dma_fence_put(&op->fence->fence);
2967         }
2968         if (!list_empty(&op->link)) {
2969                 spin_lock_irq(&vm->async_ops.lock);
2970                 list_del(&op->link);
2971                 spin_unlock_irq(&vm->async_ops.lock);
2972         }
2973         if (op->ops)
2974                 drm_gpuva_ops_free(&vm->gpuvm, op->ops);
2975         if (last)
2976                 xe_vm_put(vm);
2977 }
2978
2979 static void xe_vma_op_unwind(struct xe_vm *vm, struct xe_vma_op *op,
2980                              bool post_commit, bool prev_post_commit,
2981                              bool next_post_commit)
2982 {
2983         lockdep_assert_held_write(&vm->lock);
2984
2985         switch (op->base.op) {
2986         case DRM_GPUVA_OP_MAP:
2987                 if (op->map.vma) {
2988                         prep_vma_destroy(vm, op->map.vma, post_commit);
2989                         xe_vma_destroy_unlocked(op->map.vma);
2990                 }
2991                 break;
2992         case DRM_GPUVA_OP_UNMAP:
2993         {
2994                 struct xe_vma *vma = gpuva_to_vma(op->base.unmap.va);
2995
2996                 if (vma) {
2997                         down_read(&vm->userptr.notifier_lock);
2998                         vma->gpuva.flags &= ~XE_VMA_DESTROYED;
2999                         up_read(&vm->userptr.notifier_lock);
3000                         if (post_commit)
3001                                 xe_vm_insert_vma(vm, vma);
3002                 }
3003                 break;
3004         }
3005         case DRM_GPUVA_OP_REMAP:
3006         {
3007                 struct xe_vma *vma = gpuva_to_vma(op->base.remap.unmap->va);
3008
3009                 if (op->remap.prev) {
3010                         prep_vma_destroy(vm, op->remap.prev, prev_post_commit);
3011                         xe_vma_destroy_unlocked(op->remap.prev);
3012                 }
3013                 if (op->remap.next) {
3014                         prep_vma_destroy(vm, op->remap.next, next_post_commit);
3015                         xe_vma_destroy_unlocked(op->remap.next);
3016                 }
3017                 if (vma) {
3018                         down_read(&vm->userptr.notifier_lock);
3019                         vma->gpuva.flags &= ~XE_VMA_DESTROYED;
3020                         up_read(&vm->userptr.notifier_lock);
3021                         if (post_commit)
3022                                 xe_vm_insert_vma(vm, vma);
3023                 }
3024                 break;
3025         }
3026         case DRM_GPUVA_OP_PREFETCH:
3027                 /* Nothing to do */
3028                 break;
3029         default:
3030                 drm_warn(&vm->xe->drm, "NOT POSSIBLE");
3031         }
3032 }
3033
3034 static struct xe_vma_op *next_vma_op(struct xe_vm *vm)
3035 {
3036         return list_first_entry_or_null(&vm->async_ops.pending,
3037                                         struct xe_vma_op, link);
3038 }
3039
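/*
 * Async bind worker: drains vm->async_ops.pending one op at a time. A stored
 * async error stops processing (until a RESTART clears it), unless the VM is
 * already closed, in which case the remaining ops only tear down their VMAs
 * and signal any attached bind fence.
 */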
3040 static void xe_vma_op_work_func(struct work_struct *w)
3041 {
3042         struct xe_vm *vm = container_of(w, struct xe_vm, async_ops.work);
3043
3044         for (;;) {
3045                 struct xe_vma_op *op;
3046                 int err;
3047
3048                 if (vm->async_ops.error && !xe_vm_is_closed(vm))
3049                         break;
3050
3051                 spin_lock_irq(&vm->async_ops.lock);
3052                 op = next_vma_op(vm);
3053                 spin_unlock_irq(&vm->async_ops.lock);
3054
3055                 if (!op)
3056                         break;
3057
3058                 if (!xe_vm_is_closed(vm)) {
3059                         down_write(&vm->lock);
3060                         err = xe_vma_op_execute(vm, op);
3061                         if (err) {
3062                                 drm_warn(&vm->xe->drm,
3063                                          "Async VM op(%d) failed with %d",
3064                                          op->base.op, err);
3065                                 vm_set_async_error(vm, err);
3066                                 up_write(&vm->lock);
3067
3068                                 if (vm->async_ops.error_capture.addr)
3069                                         vm_error_capture(vm, err, 0, 0, 0);
3070                                 break;
3071                         }
3072                         up_write(&vm->lock);
3073                 } else {
3074                         struct xe_vma *vma;
3075
3076                         switch (op->base.op) {
3077                         case DRM_GPUVA_OP_REMAP:
3078                                 vma = gpuva_to_vma(op->base.remap.unmap->va);
3079                                 trace_xe_vma_flush(vma);
3080
3081                                 down_write(&vm->lock);
3082                                 xe_vma_destroy_unlocked(vma);
3083                                 up_write(&vm->lock);
3084                                 break;
3085                         case DRM_GPUVA_OP_UNMAP:
3086                                 vma = gpuva_to_vma(op->base.unmap.va);
3087                                 trace_xe_vma_flush(vma);
3088
3089                                 down_write(&vm->lock);
3090                                 xe_vma_destroy_unlocked(vma);
3091                                 up_write(&vm->lock);
3092                                 break;
3093                         default:
3094                                 /* Nothing to do */
3095                                 break;
3096                         }
3097
3098                         if (op->fence && !test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
3099                                                    &op->fence->fence.flags)) {
3100                                 if (!xe_vm_no_dma_fences(vm)) {
3101                                         op->fence->started = true;
3102                                         wake_up_all(&op->fence->wq);
3103                                 }
3104                                 dma_fence_signal(&op->fence->fence);
3105                         }
3106                 }
3107
3108                 xe_vma_op_cleanup(vm, op);
3109         }
3110 }
3111
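/*
 * Synchronous binds execute the last op on the list here; asynchronous binds
 * install the bind fence into the user sync objects (signalling it right away
 * if no sync consumed it), splice the list onto async_ops.pending and kick
 * the worker. On error the committed ops are unwound in reverse order and
 * then cleaned up.
 */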
3112 static int vm_bind_ioctl_ops_execute(struct xe_vm *vm,
3113                                      struct list_head *ops_list, bool async)
3114 {
3115         struct xe_vma_op *op, *last_op, *next;
3116         int err;
3117
3118         lockdep_assert_held_write(&vm->lock);
3119
3120         list_for_each_entry(op, ops_list, link)
3121                 last_op = op;
3122
3123         if (!async) {
3124                 err = xe_vma_op_execute(vm, last_op);
3125                 if (err)
3126                         goto unwind;
3127                 xe_vma_op_cleanup(vm, last_op);
3128         } else {
3129                 int i;
3130                 bool installed = false;
3131
3132                 for (i = 0; i < last_op->num_syncs; i++)
3133                         installed |= xe_sync_entry_signal(&last_op->syncs[i],
3134                                                           NULL,
3135                                                           &last_op->fence->fence);
3136                 if (!installed && last_op->fence)
3137                         dma_fence_signal(&last_op->fence->fence);
3138
3139                 spin_lock_irq(&vm->async_ops.lock);
3140                 list_splice_tail(ops_list, &vm->async_ops.pending);
3141                 spin_unlock_irq(&vm->async_ops.lock);
3142
3143                 if (!vm->async_ops.error)
3144                         queue_work(system_unbound_wq, &vm->async_ops.work);
3145         }
3146
3147         return 0;
3148
3149 unwind:
3150         list_for_each_entry_reverse(op, ops_list, link)
3151                 xe_vma_op_unwind(vm, op, op->flags & XE_VMA_OP_COMMITTED,
3152                                  op->flags & XE_VMA_OP_PREV_COMMITTED,
3153                                  op->flags & XE_VMA_OP_NEXT_COMMITTED);
3154         list_for_each_entry_safe(op, next, ops_list, link)
3155                 xe_vma_op_cleanup(vm, op);
3156
3157         return err;
3158 }
3159
3160 static void vm_bind_ioctl_ops_unwind(struct xe_vm *vm,
3161                                      struct drm_gpuva_ops **ops,
3162                                      int num_ops_list)
3163 {
3164         int i;
3165
3166         for (i = num_ops_list - 1; i >= 0; --i) {
3167                 struct drm_gpuva_ops *__ops = ops[i];
3168                 struct drm_gpuva_op *__op;
3169
3170                 if (!__ops)
3171                         continue;
3172
3173                 drm_gpuva_for_each_op_reverse(__op, __ops) {
3174                         struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
3175
3176                         xe_vma_op_unwind(vm, op,
3177                                          op->flags & XE_VMA_OP_COMMITTED,
3178                                          op->flags & XE_VMA_OP_PREV_COMMITTED,
3179                                          op->flags & XE_VMA_OP_NEXT_COMMITTED);
3180                 }
3181
3182                 drm_gpuva_ops_free(&vm->gpuvm, __ops);
3183         }
3184 }
3185
3186 #ifdef TEST_VM_ASYNC_OPS_ERROR
3187 #define SUPPORTED_FLAGS \
3188         (FORCE_ASYNC_OP_ERROR | XE_VM_BIND_FLAG_ASYNC | \
3189          XE_VM_BIND_FLAG_READONLY | XE_VM_BIND_FLAG_IMMEDIATE | \
3190          XE_VM_BIND_FLAG_NULL | 0xffff)
3191 #else
3192 #define SUPPORTED_FLAGS \
3193         (XE_VM_BIND_FLAG_ASYNC | XE_VM_BIND_FLAG_READONLY | \
3194          XE_VM_BIND_FLAG_IMMEDIATE | XE_VM_BIND_FLAG_NULL | 0xffff)
3195 #endif
3196 #define XE_64K_PAGE_MASK 0xffffull
3197
3198 #define MAX_BINDS       512     /* FIXME: Picking random upper limit */
3199
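/*
 * Copy and sanity-check the user's bind op array: a single op lives inline in
 * the ioctl args, multiple ops are copied from the user vector. Flag/op
 * combinations, memory regions and page alignment are validated here, and the
 * async mode of the first op is reported back to the caller.
 */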
3200 static int vm_bind_ioctl_check_args(struct xe_device *xe,
3201                                     struct drm_xe_vm_bind *args,
3202                                     struct drm_xe_vm_bind_op **bind_ops,
3203                                     bool *async)
3204 {
3205         int err;
3206         int i;
3207
3208         if (XE_IOCTL_DBG(xe, args->extensions) ||
3209             XE_IOCTL_DBG(xe, !args->num_binds) ||
3210             XE_IOCTL_DBG(xe, args->num_binds > MAX_BINDS))
3211                 return -EINVAL;
3212
3213         if (args->num_binds > 1) {
3214                 u64 __user *bind_user =
3215                         u64_to_user_ptr(args->vector_of_binds);
3216
3217                 *bind_ops = kmalloc(sizeof(struct drm_xe_vm_bind_op) *
3218                                     args->num_binds, GFP_KERNEL);
3219                 if (!*bind_ops)
3220                         return -ENOMEM;
3221
3222                 err = copy_from_user(*bind_ops, bind_user,
3223                                      sizeof(struct drm_xe_vm_bind_op) *
3224                                      args->num_binds);
3225                 if (XE_IOCTL_DBG(xe, err)) {
3226                         err = -EFAULT;
3227                         goto free_bind_ops;
3228                 }
3229         } else {
3230                 *bind_ops = &args->bind;
3231         }
3232
3233         for (i = 0; i < args->num_binds; ++i) {
3234                 u64 range = (*bind_ops)[i].range;
3235                 u64 addr = (*bind_ops)[i].addr;
3236                 u32 op = (*bind_ops)[i].op;
3237                 u32 obj = (*bind_ops)[i].obj;
3238                 u64 obj_offset = (*bind_ops)[i].obj_offset;
3239                 u32 region = (*bind_ops)[i].region;
3240                 bool is_null = op & XE_VM_BIND_FLAG_NULL;
3241
3242                 if (i == 0) {
3243                         *async = !!(op & XE_VM_BIND_FLAG_ASYNC);
3244                 } else if (XE_IOCTL_DBG(xe, !*async) ||
3245                            XE_IOCTL_DBG(xe, !(op & XE_VM_BIND_FLAG_ASYNC)) ||
3246                            XE_IOCTL_DBG(xe, VM_BIND_OP(op) ==
3247                                         XE_VM_BIND_OP_RESTART)) {
3248                         err = -EINVAL;
3249                         goto free_bind_ops;
3250                 }
3251
3252                 if (XE_IOCTL_DBG(xe, !*async &&
3253                                  VM_BIND_OP(op) == XE_VM_BIND_OP_UNMAP_ALL)) {
3254                         err = -EINVAL;
3255                         goto free_bind_ops;
3256                 }
3257
3258                 if (XE_IOCTL_DBG(xe, !*async &&
3259                                  VM_BIND_OP(op) == XE_VM_BIND_OP_PREFETCH)) {
3260                         err = -EINVAL;
3261                         goto free_bind_ops;
3262                 }
3263
3264                 if (XE_IOCTL_DBG(xe, VM_BIND_OP(op) >
3265                                  XE_VM_BIND_OP_PREFETCH) ||
3266                     XE_IOCTL_DBG(xe, op & ~SUPPORTED_FLAGS) ||
3267                     XE_IOCTL_DBG(xe, obj && is_null) ||
3268                     XE_IOCTL_DBG(xe, obj_offset && is_null) ||
3269                     XE_IOCTL_DBG(xe, VM_BIND_OP(op) != XE_VM_BIND_OP_MAP &&
3270                                  is_null) ||
3271                     XE_IOCTL_DBG(xe, !obj &&
3272                                  VM_BIND_OP(op) == XE_VM_BIND_OP_MAP &&
3273                                  !is_null) ||
3274                     XE_IOCTL_DBG(xe, !obj &&
3275                                  VM_BIND_OP(op) == XE_VM_BIND_OP_UNMAP_ALL) ||
3276                     XE_IOCTL_DBG(xe, addr &&
3277                                  VM_BIND_OP(op) == XE_VM_BIND_OP_UNMAP_ALL) ||
3278                     XE_IOCTL_DBG(xe, range &&
3279                                  VM_BIND_OP(op) == XE_VM_BIND_OP_UNMAP_ALL) ||
3280                     XE_IOCTL_DBG(xe, obj &&
3281                                  VM_BIND_OP(op) == XE_VM_BIND_OP_MAP_USERPTR) ||
3282                     XE_IOCTL_DBG(xe, obj &&
3283                                  VM_BIND_OP(op) == XE_VM_BIND_OP_PREFETCH) ||
3284                     XE_IOCTL_DBG(xe, region &&
3285                                  VM_BIND_OP(op) != XE_VM_BIND_OP_PREFETCH) ||
3286                     XE_IOCTL_DBG(xe, !(BIT(region) &
3287                                        xe->info.mem_region_mask)) ||
3288                     XE_IOCTL_DBG(xe, obj &&
3289                                  VM_BIND_OP(op) == XE_VM_BIND_OP_UNMAP)) {
3290                         err = -EINVAL;
3291                         goto free_bind_ops;
3292                 }
3293
3294                 if (XE_IOCTL_DBG(xe, obj_offset & ~PAGE_MASK) ||
3295                     XE_IOCTL_DBG(xe, addr & ~PAGE_MASK) ||
3296                     XE_IOCTL_DBG(xe, range & ~PAGE_MASK) ||
3297                     XE_IOCTL_DBG(xe, !range && VM_BIND_OP(op) !=
3298                                  XE_VM_BIND_OP_RESTART &&
3299                                  VM_BIND_OP(op) != XE_VM_BIND_OP_UNMAP_ALL)) {
3300                         err = -EINVAL;
3301                         goto free_bind_ops;
3302                 }
3303         }
3304
3305         return 0;
3306
3307 free_bind_ops:
3308         if (args->num_binds > 1)
3309                 kfree(*bind_ops);
3310         return err;
3311 }
3312
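/*
 * VM_BIND ioctl entry point: validate the bind ops, look up the exec queue
 * and the VM, range-check every bind against the VM size and the backing BOs,
 * parse the user syncs, then build drm_gpuva ops per bind and hand the
 * resulting list to vm_bind_ioctl_ops_execute(), either directly or via the
 * async worker. -ENODATA from an empty op list is squashed to 0 on return.
 */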
3313 int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
3314 {
3315         struct xe_device *xe = to_xe_device(dev);
3316         struct xe_file *xef = to_xe_file(file);
3317         struct drm_xe_vm_bind *args = data;
3318         struct drm_xe_sync __user *syncs_user;
3319         struct xe_bo **bos = NULL;
3320         struct drm_gpuva_ops **ops = NULL;
3321         struct xe_vm *vm;
3322         struct xe_exec_queue *q = NULL;
3323         u32 num_syncs;
3324         struct xe_sync_entry *syncs = NULL;
3325         struct drm_xe_vm_bind_op *bind_ops;
3326         LIST_HEAD(ops_list);
3327         bool async;
3328         int err;
3329         int i;
3330
3331         err = vm_bind_ioctl_check_args(xe, args, &bind_ops, &async);
3332         if (err)
3333                 return err;
3334
3335         if (args->exec_queue_id) {
3336                 q = xe_exec_queue_lookup(xef, args->exec_queue_id);
3337                 if (XE_IOCTL_DBG(xe, !q)) {
3338                         err = -ENOENT;
3339                         goto free_objs;
3340                 }
3341
3342                 if (XE_IOCTL_DBG(xe, !(q->flags & EXEC_QUEUE_FLAG_VM))) {
3343                         err = -EINVAL;
3344                         goto put_exec_queue;
3345                 }
3346         }
3347
3348         vm = xe_vm_lookup(xef, args->vm_id);
3349         if (XE_IOCTL_DBG(xe, !vm)) {
3350                 err = -EINVAL;
3351                 goto put_exec_queue;
3352         }
3353
3354         err = down_write_killable(&vm->lock);
3355         if (err)
3356                 goto put_vm;
3357
3358         if (XE_IOCTL_DBG(xe, xe_vm_is_closed_or_banned(vm))) {
3359                 err = -ENOENT;
3360                 goto release_vm_lock;
3361         }
3362
3363         if (VM_BIND_OP(bind_ops[0].op) == XE_VM_BIND_OP_RESTART) {
3364                 if (XE_IOCTL_DBG(xe, !(vm->flags & XE_VM_FLAG_ASYNC_BIND_OPS)))
3365                         err = -EOPNOTSUPP;
3366                 if (XE_IOCTL_DBG(xe, !err && args->num_syncs))
3367                         err = -EINVAL;
3368                 if (XE_IOCTL_DBG(xe, !err && !vm->async_ops.error))
3369                         err = -EPROTO;
3370
3371                 if (!err) {
3372                         trace_xe_vm_restart(vm);
3373                         vm_set_async_error(vm, 0);
3374
3375                         queue_work(system_unbound_wq, &vm->async_ops.work);
3376
3377                         /* Rebinds may have been blocked, give worker a kick */
3378                         if (xe_vm_in_compute_mode(vm))
3379                                 xe_vm_queue_rebind_worker(vm);
3380                 }
3381
3382                 goto release_vm_lock;
3383         }
3384
3385         if (XE_IOCTL_DBG(xe, !vm->async_ops.error &&
3386                          async != !!(vm->flags & XE_VM_FLAG_ASYNC_BIND_OPS))) {
3387                 err = -EOPNOTSUPP;
3388                 goto release_vm_lock;
3389         }
3390
3391         for (i = 0; i < args->num_binds; ++i) {
3392                 u64 range = bind_ops[i].range;
3393                 u64 addr = bind_ops[i].addr;
3394
3395                 if (XE_IOCTL_DBG(xe, range > vm->size) ||
3396                     XE_IOCTL_DBG(xe, addr > vm->size - range)) {
3397                         err = -EINVAL;
3398                         goto release_vm_lock;
3399                 }
3400
3401                 if (bind_ops[i].tile_mask) {
3402                         u64 valid_tiles = BIT(xe->info.tile_count) - 1;
3403
3404                         if (XE_IOCTL_DBG(xe, bind_ops[i].tile_mask &
3405                                          ~valid_tiles)) {
3406                                 err = -EINVAL;
3407                                 goto release_vm_lock;
3408                         }
3409                 }
3410         }
3411
3412         bos = kzalloc(sizeof(*bos) * args->num_binds, GFP_KERNEL);
3413         if (!bos) {
3414                 err = -ENOMEM;
3415                 goto release_vm_lock;
3416         }
3417
3418         ops = kzalloc(sizeof(*ops) * args->num_binds, GFP_KERNEL);
3419         if (!ops) {
3420                 err = -ENOMEM;
3421                 goto release_vm_lock;
3422         }
3423
3424         for (i = 0; i < args->num_binds; ++i) {
3425                 struct drm_gem_object *gem_obj;
3426                 u64 range = bind_ops[i].range;
3427                 u64 addr = bind_ops[i].addr;
3428                 u32 obj = bind_ops[i].obj;
3429                 u64 obj_offset = bind_ops[i].obj_offset;
3430
3431                 if (!obj)
3432                         continue;
3433
3434                 gem_obj = drm_gem_object_lookup(file, obj);
3435                 if (XE_IOCTL_DBG(xe, !gem_obj)) {
3436                         err = -ENOENT;
3437                         goto put_obj;
3438                 }
3439                 bos[i] = gem_to_xe_bo(gem_obj);
3440
3441                 if (XE_IOCTL_DBG(xe, range > bos[i]->size) ||
3442                     XE_IOCTL_DBG(xe, obj_offset >
3443                                  bos[i]->size - range)) {
3444                         err = -EINVAL;
3445                         goto put_obj;
3446                 }
3447
3448                 if (bos[i]->flags & XE_BO_INTERNAL_64K) {
3449                         if (XE_IOCTL_DBG(xe, obj_offset &
3450                                          XE_64K_PAGE_MASK) ||
3451                             XE_IOCTL_DBG(xe, addr & XE_64K_PAGE_MASK) ||
3452                             XE_IOCTL_DBG(xe, range & XE_64K_PAGE_MASK)) {
3453                                 err = -EINVAL;
3454                                 goto put_obj;
3455                         }
3456                 }
3457         }
3458
3459         if (args->num_syncs) {
3460                 syncs = kcalloc(args->num_syncs, sizeof(*syncs), GFP_KERNEL);
3461                 if (!syncs) {
3462                         err = -ENOMEM;
3463                         goto put_obj;
3464                 }
3465         }
3466
3467         syncs_user = u64_to_user_ptr(args->syncs);
3468         for (num_syncs = 0; num_syncs < args->num_syncs; num_syncs++) {
3469                 err = xe_sync_entry_parse(xe, xef, &syncs[num_syncs],
3470                                           &syncs_user[num_syncs], false,
3471                                           xe_vm_no_dma_fences(vm));
3472                 if (err)
3473                         goto free_syncs;
3474         }
3475
3476         /* Do some error checking first to make the unwind easier */
3477         for (i = 0; i < args->num_binds; ++i) {
3478                 u64 range = bind_ops[i].range;
3479                 u64 addr = bind_ops[i].addr;
3480                 u32 op = bind_ops[i].op;
3481
3482                 err = vm_bind_ioctl_lookup_vma(vm, bos[i], addr, range, op);
3483                 if (err)
3484                         goto free_syncs;
3485         }
3486
3487         for (i = 0; i < args->num_binds; ++i) {
3488                 u64 range = bind_ops[i].range;
3489                 u64 addr = bind_ops[i].addr;
3490                 u32 op = bind_ops[i].op;
3491                 u64 obj_offset = bind_ops[i].obj_offset;
3492                 u8 tile_mask = bind_ops[i].tile_mask;
3493                 u32 region = bind_ops[i].region;
3494
3495                 ops[i] = vm_bind_ioctl_ops_create(vm, bos[i], obj_offset,
3496                                                   addr, range, op, tile_mask,
3497                                                   region);
3498                 if (IS_ERR(ops[i])) {
3499                         err = PTR_ERR(ops[i]);
3500                         ops[i] = NULL;
3501                         goto unwind_ops;
3502                 }
3503
3504                 err = vm_bind_ioctl_ops_parse(vm, q, ops[i], syncs, num_syncs,
3505                                               &ops_list,
3506                                               i == args->num_binds - 1,
3507                                               async);
3508                 if (err)
3509                         goto unwind_ops;
3510         }
3511
3512         /* Nothing to do */
3513         if (list_empty(&ops_list)) {
3514                 err = -ENODATA;
3515                 goto unwind_ops;
3516         }
3517
3518         err = vm_bind_ioctl_ops_execute(vm, &ops_list, async);
3519         up_write(&vm->lock);
3520
3521         for (i = 0; i < args->num_binds; ++i)
3522                 xe_bo_put(bos[i]);
3523
3524         kfree(bos);
3525         kfree(ops);
3526         if (args->num_binds > 1)
3527                 kfree(bind_ops);
3528
3529         return err;
3530
3531 unwind_ops:
3532         vm_bind_ioctl_ops_unwind(vm, ops, args->num_binds);
3533 free_syncs:
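        /* A no-op bind (-ENODATA) still signals the user syncs with a stub fence */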
3534         for (i = 0; err == -ENODATA && i < num_syncs; i++)
3535                 xe_sync_entry_signal(&syncs[i], NULL, dma_fence_get_stub());
3536         while (num_syncs--)
3537                 xe_sync_entry_cleanup(&syncs[num_syncs]);
3538
3539         kfree(syncs);
3540 put_obj:
3541         for (i = 0; i < args->num_binds; ++i)
3542                 xe_bo_put(bos[i]);
3543 release_vm_lock:
3544         up_write(&vm->lock);
3545 put_vm:
3546         xe_vm_put(vm);
3547 put_exec_queue:
3548         if (q)
3549                 xe_exec_queue_put(q);
3550 free_objs:
3551         kfree(bos);
3552         kfree(ops);
3553         if (args->num_binds > 1)
3554                 kfree(bind_ops);
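        /* An empty op list is not an error as far as user space is concerned */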
3555         return err == -ENODATA ? 0 : err;
3556 }
3557
3558 /**
3559  * xe_vm_lock() - Lock the vm's dma_resv object
3560  * @vm: The struct xe_vm whose lock is to be locked
3561  * @intr: Whether to perform any wait interruptible
3562  *
3563  * Return: 0 on success, -EINTR if @intr is true and the wait for a
3564  * contended lock was interrupted. If @intr is false, the function
3565  * always returns 0.
3566  */
3567 int xe_vm_lock(struct xe_vm *vm, bool intr)
3568 {
3569         if (intr)
3570                 return dma_resv_lock_interruptible(xe_vm_resv(vm), NULL);
3571
3572         return dma_resv_lock(xe_vm_resv(vm), NULL);
3573 }
3574
3575 /**
3576  * xe_vm_unlock() - Unlock the vm's dma_resv object
3577  * @vm: The struct xe_vm whose lock is to be released.
3578  *
3579  * Unlock a buffer object lock that was locked by xe_vm_lock().
3580  */
3581 void xe_vm_unlock(struct xe_vm *vm)
3582 {
3583         dma_resv_unlock(xe_vm_resv(vm));
3584 }
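
/*
 * Illustrative usage sketch (not code from this driver): callers pair
 * xe_vm_lock()/xe_vm_unlock() around work that needs the VM's dma_resv held,
 * e.g.:
 *
 *	err = xe_vm_lock(vm, true);
 *	if (err)
 *		return err;
 *	... access state protected by the vm's dma_resv ...
 *	xe_vm_unlock(vm);
 */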
3585
3586 /**
3587  * xe_vm_invalidate_vma() - invalidate GPU mappings for a VMA without a lock
3588  * @vma: VMA to invalidate
3589  *
3590  * Walks the list of page-table leaves, zeroing the entries owned by this
3591  * VMA, invalidates the TLBs and blocks until the TLB invalidation is
3592  * complete.
3593  *
3594  * Return: 0 for success, negative error code otherwise.
3595  */
3596 int xe_vm_invalidate_vma(struct xe_vma *vma)
3597 {
3598         struct xe_device *xe = xe_vma_vm(vma)->xe;
3599         struct xe_tile *tile;
3600         u32 tile_needs_invalidate = 0;
3601         int seqno[XE_MAX_TILES_PER_DEVICE];
3602         u8 id;
3603         int ret;
3604
3605         xe_assert(xe, xe_vm_in_fault_mode(xe_vma_vm(vma)));
3606         xe_assert(xe, !xe_vma_is_null(vma));
3607         trace_xe_vma_usm_invalidate(vma);
3608
3609         /* Check that we don't race with page-table updates */
3610         if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
3611                 if (xe_vma_is_userptr(vma)) {
3612                         WARN_ON_ONCE(!mmu_interval_check_retry
3613                                      (&vma->userptr.notifier,
3614                                       vma->userptr.notifier_seq));
3615                         WARN_ON_ONCE(!dma_resv_test_signaled(xe_vm_resv(xe_vma_vm(vma)),
3616                                                              DMA_RESV_USAGE_BOOKKEEP));
3617
3618                 } else {
3619                         xe_bo_assert_held(xe_vma_bo(vma));
3620                 }
3621         }
3622
3623         for_each_tile(tile, xe, id) {
3624                 if (xe_pt_zap_ptes(tile, vma)) {
3625                         tile_needs_invalidate |= BIT(id);
3626                         xe_device_wmb(xe);
3627                         /*
3628                          * FIXME: We potentially need to invalidate multiple
3629                          * GTs within the tile
3630                          */
3631                         seqno[id] = xe_gt_tlb_invalidation_vma(tile->primary_gt, NULL, vma);
3632                         if (seqno[id] < 0)
3633                                 return seqno[id];
3634                 }
3635         }
3636
3637         for_each_tile(tile, xe, id) {
3638                 if (tile_needs_invalidate & BIT(id)) {
3639                         ret = xe_gt_tlb_invalidation_wait(tile->primary_gt, seqno[id]);
3640                         if (ret < 0)
3641                                 return ret;
3642                 }
3643         }
3644
3645         vma->usm.tile_invalidated = vma->tile_mask;
3646
3647         return 0;
3648 }
3649
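/*
 * Debug dump helper: prints the root page-table address for @gt_id plus every
 * GPUVA mapping in the VM, tagging each as NULL, USR (userptr) or by backing
 * placement (VRAM/SYS).
 */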
3650 int xe_analyze_vm(struct drm_printer *p, struct xe_vm *vm, int gt_id)
3651 {
3652         struct drm_gpuva *gpuva;
3653         bool is_vram;
3654         u64 addr;
3655
3656         if (!down_read_trylock(&vm->lock)) {
3657                 drm_printf(p, " Failed to acquire VM lock to dump capture");
3658                 return 0;
3659         }
3660         if (vm->pt_root[gt_id]) {
3661                 addr = xe_bo_addr(vm->pt_root[gt_id]->bo, 0, XE_PAGE_SIZE);
3662                 is_vram = xe_bo_is_vram(vm->pt_root[gt_id]->bo);
3663                 drm_printf(p, " VM root: A:0x%llx %s\n", addr,
3664                            is_vram ? "VRAM" : "SYS");
3665         }
3666
3667         drm_gpuvm_for_each_va(gpuva, &vm->gpuvm) {
3668                 struct xe_vma *vma = gpuva_to_vma(gpuva);
3669                 bool is_userptr = xe_vma_is_userptr(vma);
3670                 bool is_null = xe_vma_is_null(vma);
3671
3672                 if (is_null) {
3673                         addr = 0;
3674                 } else if (is_userptr) {
3675                         struct xe_res_cursor cur;
3676
3677                         if (vma->userptr.sg) {
3678                                 xe_res_first_sg(vma->userptr.sg, 0, XE_PAGE_SIZE,
3679                                                 &cur);
3680                                 addr = xe_res_dma(&cur);
3681                         } else {
3682                                 addr = 0;
3683                         }
3684                 } else {
3685                         addr = __xe_bo_addr(xe_vma_bo(vma), 0, XE_PAGE_SIZE);
3686                         is_vram = xe_bo_is_vram(xe_vma_bo(vma));
3687                 }
3688                 drm_printf(p, " [%016llx-%016llx] S:0x%016llx A:%016llx %s\n",
3689                            xe_vma_start(vma), xe_vma_end(vma) - 1,
3690                            xe_vma_size(vma),
3691                            addr, is_null ? "NULL" : is_userptr ? "USR" :
3692                            is_vram ? "VRAM" : "SYS");
3693         }
3694         up_read(&vm->lock);
3695
3696         return 0;
3697 }