drm/xe: Fixup unwind on VM ops errors
drivers/gpu/drm/xe/xe_vm.c
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2021 Intel Corporation
4  */
5
6 #include "xe_vm.h"
7
8 #include <linux/dma-fence-array.h>
9
10 #include <drm/drm_print.h>
11 #include <drm/ttm/ttm_execbuf_util.h>
12 #include <drm/ttm/ttm_tt.h>
13 #include <drm/xe_drm.h>
14 #include <linux/delay.h>
15 #include <linux/kthread.h>
16 #include <linux/mm.h>
17 #include <linux/swap.h>
18
19 #include "xe_bo.h"
20 #include "xe_device.h"
21 #include "xe_exec_queue.h"
22 #include "xe_gt.h"
23 #include "xe_gt_pagefault.h"
24 #include "xe_gt_tlb_invalidation.h"
25 #include "xe_migrate.h"
26 #include "xe_pm.h"
27 #include "xe_preempt_fence.h"
28 #include "xe_pt.h"
29 #include "xe_res_cursor.h"
30 #include "xe_sync.h"
31 #include "xe_trace.h"
32 #include "generated/xe_wa_oob.h"
33 #include "xe_wa.h"
34
35 #define TEST_VM_ASYNC_OPS_ERROR
36
37 /**
38  * xe_vma_userptr_check_repin() - Advisory check for repin needed
39  * @vma: The userptr vma
40  *
41  * Check if the userptr vma has been invalidated since last successful
42  * repin. The check is advisory only and the function can be called
43  * without the vm->userptr.notifier_lock held. There is no guarantee that the
44  * vma userptr will remain valid after a lockless check, so typically
45  * the call needs to be followed by a proper check under the notifier_lock.
46  *
47  * Return: 0 if userptr vma is valid, -EAGAIN otherwise; repin recommended.
48  */
49 int xe_vma_userptr_check_repin(struct xe_vma *vma)
50 {
51         return mmu_interval_check_retry(&vma->userptr.notifier,
52                                         vma->userptr.notifier_seq) ?
53                 -EAGAIN : 0;
54 }
55
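/*
 * Pin the pages backing a userptr vma and rebuild its DMA mapping.
 *
 * Called with vm->lock held. If the MMU interval notifier shows the range
 * has been invalidated since the last successful pin, the old sg table is
 * unmapped and freed, the pages are re-pinned with get_user_pages_fast()
 * (borrowing the notifier's mm when running from a kthread), and a new sg
 * table is built and DMA-mapped. The page references are dropped again once
 * the sg table is set up, and the whole sequence is retried if another
 * invalidation raced with the pin.
 */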
56 int xe_vma_userptr_pin_pages(struct xe_vma *vma)
57 {
58         struct xe_vm *vm = xe_vma_vm(vma);
59         struct xe_device *xe = vm->xe;
60         const unsigned long num_pages = xe_vma_size(vma) >> PAGE_SHIFT;
61         struct page **pages;
62         bool in_kthread = !current->mm;
63         unsigned long notifier_seq;
64         int pinned, ret, i;
65         bool read_only = xe_vma_read_only(vma);
66
67         lockdep_assert_held(&vm->lock);
68         XE_WARN_ON(!xe_vma_is_userptr(vma));
69 retry:
70         if (vma->gpuva.flags & XE_VMA_DESTROYED)
71                 return 0;
72
73         notifier_seq = mmu_interval_read_begin(&vma->userptr.notifier);
74         if (notifier_seq == vma->userptr.notifier_seq)
75                 return 0;
76
77         pages = kvmalloc_array(num_pages, sizeof(*pages), GFP_KERNEL);
78         if (!pages)
79                 return -ENOMEM;
80
81         if (vma->userptr.sg) {
82                 dma_unmap_sgtable(xe->drm.dev,
83                                   vma->userptr.sg,
84                                   read_only ? DMA_TO_DEVICE :
85                                   DMA_BIDIRECTIONAL, 0);
86                 sg_free_table(vma->userptr.sg);
87                 vma->userptr.sg = NULL;
88         }
89
90         pinned = ret = 0;
91         if (in_kthread) {
92                 if (!mmget_not_zero(vma->userptr.notifier.mm)) {
93                         ret = -EFAULT;
94                         goto mm_closed;
95                 }
96                 kthread_use_mm(vma->userptr.notifier.mm);
97         }
98
99         while (pinned < num_pages) {
100                 ret = get_user_pages_fast(xe_vma_userptr(vma) +
101                                           pinned * PAGE_SIZE,
102                                           num_pages - pinned,
103                                           read_only ? 0 : FOLL_WRITE,
104                                           &pages[pinned]);
105                 if (ret < 0) {
106                         if (in_kthread)
107                                 ret = 0;
108                         break;
109                 }
110
111                 pinned += ret;
112                 ret = 0;
113         }
114
115         if (in_kthread) {
116                 kthread_unuse_mm(vma->userptr.notifier.mm);
117                 mmput(vma->userptr.notifier.mm);
118         }
119 mm_closed:
120         if (ret)
121                 goto out;
122
123         ret = sg_alloc_table_from_pages_segment(&vma->userptr.sgt, pages,
124                                                 pinned, 0,
125                                                 (u64)pinned << PAGE_SHIFT,
126                                                 xe_sg_segment_size(xe->drm.dev),
127                                                 GFP_KERNEL);
128         if (ret) {
129                 vma->userptr.sg = NULL;
130                 goto out;
131         }
132         vma->userptr.sg = &vma->userptr.sgt;
133
134         ret = dma_map_sgtable(xe->drm.dev, vma->userptr.sg,
135                               read_only ? DMA_TO_DEVICE :
136                               DMA_BIDIRECTIONAL,
137                               DMA_ATTR_SKIP_CPU_SYNC |
138                               DMA_ATTR_NO_KERNEL_MAPPING);
139         if (ret) {
140                 sg_free_table(vma->userptr.sg);
141                 vma->userptr.sg = NULL;
142                 goto out;
143         }
144
145         for (i = 0; i < pinned; ++i) {
146                 if (!read_only) {
147                         lock_page(pages[i]);
148                         set_page_dirty(pages[i]);
149                         unlock_page(pages[i]);
150                 }
151
152                 mark_page_accessed(pages[i]);
153         }
154
155 out:
156         release_pages(pages, pinned);
157         kvfree(pages);
158
159         if (!(ret < 0)) {
160                 vma->userptr.notifier_seq = notifier_seq;
161                 if (xe_vma_userptr_check_repin(vma) == -EAGAIN)
162                         goto retry;
163         }
164
165         return ret < 0 ? ret : 0;
166 }
167
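/*
 * Return true if any compute exec queue on the vm either has no preempt
 * fence installed or already has signaling enabled on it, i.e. a
 * preemption is pending or in flight.
 */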
168 static bool preempt_fences_waiting(struct xe_vm *vm)
169 {
170         struct xe_exec_queue *q;
171
172         lockdep_assert_held(&vm->lock);
173         xe_vm_assert_held(vm);
174
175         list_for_each_entry(q, &vm->preempt.exec_queues, compute.link) {
176                 if (!q->compute.pfence ||
177                     (q->compute.pfence && test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
178                                                    &q->compute.pfence->flags))) {
179                         return true;
180                 }
181         }
182
183         return false;
184 }
185
186 static void free_preempt_fences(struct list_head *list)
187 {
188         struct list_head *link, *next;
189
190         list_for_each_safe(link, next, list)
191                 xe_preempt_fence_free(to_preempt_fence_from_link(link));
192 }
193
194 static int alloc_preempt_fences(struct xe_vm *vm, struct list_head *list,
195                                 unsigned int *count)
196 {
197         lockdep_assert_held(&vm->lock);
198         xe_vm_assert_held(vm);
199
200         if (*count >= vm->preempt.num_exec_queues)
201                 return 0;
202
203         for (; *count < vm->preempt.num_exec_queues; ++(*count)) {
204                 struct xe_preempt_fence *pfence = xe_preempt_fence_alloc();
205
206                 if (IS_ERR(pfence))
207                         return PTR_ERR(pfence);
208
209                 list_move_tail(xe_preempt_fence_link(pfence), list);
210         }
211
212         return 0;
213 }
214
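/*
 * Wait for every currently installed preempt fence to signal and drop the
 * references to them, leaving all compute exec queues on the vm suspended.
 */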
215 static int wait_for_existing_preempt_fences(struct xe_vm *vm)
216 {
217         struct xe_exec_queue *q;
218
219         xe_vm_assert_held(vm);
220
221         list_for_each_entry(q, &vm->preempt.exec_queues, compute.link) {
222                 if (q->compute.pfence) {
223                         long timeout = dma_fence_wait(q->compute.pfence, false);
224
225                         if (timeout < 0)
226                                 return -ETIME;
227                         dma_fence_put(q->compute.pfence);
228                         q->compute.pfence = NULL;
229                 }
230         }
231
232         return 0;
233 }
234
235 static bool xe_vm_is_idle(struct xe_vm *vm)
236 {
237         struct xe_exec_queue *q;
238
239         xe_vm_assert_held(vm);
240         list_for_each_entry(q, &vm->preempt.exec_queues, compute.link) {
241                 if (!xe_exec_queue_is_idle(q))
242                         return false;
243         }
244
245         return true;
246 }
247
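/*
 * Arm one pre-allocated preempt fence per compute exec queue, consuming
 * fences from the front of @list and replacing each queue's previous
 * fence.
 */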
248 static void arm_preempt_fences(struct xe_vm *vm, struct list_head *list)
249 {
250         struct list_head *link;
251         struct xe_exec_queue *q;
252
253         list_for_each_entry(q, &vm->preempt.exec_queues, compute.link) {
254                 struct dma_fence *fence;
255
256                 link = list->next;
257                 XE_WARN_ON(link == list);
258
259                 fence = xe_preempt_fence_arm(to_preempt_fence_from_link(link),
260                                              q, q->compute.context,
261                                              ++q->compute.seqno);
262                 dma_fence_put(q->compute.pfence);
263                 q->compute.pfence = fence;
264         }
265 }
266
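/*
 * Lock @bo and add the preempt fence of every compute exec queue on the
 * vm to its reservation object with BOOKKEEP usage.
 */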
267 static int add_preempt_fences(struct xe_vm *vm, struct xe_bo *bo)
268 {
269         struct xe_exec_queue *q;
270         struct ww_acquire_ctx ww;
271         int err;
272
273         err = xe_bo_lock(bo, &ww, vm->preempt.num_exec_queues, true);
274         if (err)
275                 return err;
276
277         list_for_each_entry(q, &vm->preempt.exec_queues, compute.link)
278                 if (q->compute.pfence) {
279                         dma_resv_add_fence(bo->ttm.base.resv,
280                                            q->compute.pfence,
281                                            DMA_RESV_USAGE_BOOKKEEP);
282                 }
283
284         xe_bo_unlock(bo, &ww);
285         return 0;
286 }
287
288 /**
289  * xe_vm_fence_all_extobjs() - Add a fence to vm's external objects' resv
290  * @vm: The vm.
291  * @fence: The fence to add.
292  * @usage: The resv usage for the fence.
293  *
294  * Loops over all of the vm's external object bindings and adds a @fence
295  * with the given @usage to all of the external object's reservation
296  * objects.
297  */
298 void xe_vm_fence_all_extobjs(struct xe_vm *vm, struct dma_fence *fence,
299                              enum dma_resv_usage usage)
300 {
301         struct xe_vma *vma;
302
303         list_for_each_entry(vma, &vm->extobj.list, extobj.link)
304                 dma_resv_add_fence(xe_vma_bo(vma)->ttm.base.resv, fence, usage);
305 }
306
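/*
 * Resume all compute exec queues on the vm and re-add their preempt
 * fences to the vm's reservation object and to those of all external
 * objects bound in the vm.
 */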
307 static void resume_and_reinstall_preempt_fences(struct xe_vm *vm)
308 {
309         struct xe_exec_queue *q;
310
311         lockdep_assert_held(&vm->lock);
312         xe_vm_assert_held(vm);
313
314         list_for_each_entry(q, &vm->preempt.exec_queues, compute.link) {
315                 q->ops->resume(q);
316
317                 dma_resv_add_fence(xe_vm_resv(vm), q->compute.pfence,
318                                    DMA_RESV_USAGE_BOOKKEEP);
319                 xe_vm_fence_all_extobjs(vm, q->compute.pfence,
320                                         DMA_RESV_USAGE_BOOKKEEP);
321         }
322 }
323
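/*
 * Register @q as a compute exec queue on @vm: create its preempt fence,
 * install that fence in the vm's and all external objects' reservation
 * objects, and enable signaling on it right away if a userptr repin or a
 * preemption is already pending.
 */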
324 int xe_vm_add_compute_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
325 {
326         struct ttm_validate_buffer tv_onstack[XE_ONSTACK_TV];
327         struct ttm_validate_buffer *tv;
328         struct ww_acquire_ctx ww;
329         struct list_head objs;
330         struct dma_fence *pfence;
331         int err;
332         bool wait;
333
334         XE_WARN_ON(!xe_vm_in_compute_mode(vm));
335
336         down_write(&vm->lock);
337
338         err = xe_vm_lock_dma_resv(vm, &ww, tv_onstack, &tv, &objs, true, 1);
339         if (err)
340                 goto out_unlock_outer;
341
342         pfence = xe_preempt_fence_create(q, q->compute.context,
343                                          ++q->compute.seqno);
344         if (!pfence) {
345                 err = -ENOMEM;
346                 goto out_unlock;
347         }
348
349         list_add(&q->compute.link, &vm->preempt.exec_queues);
350         ++vm->preempt.num_exec_queues;
351         q->compute.pfence = pfence;
352
353         down_read(&vm->userptr.notifier_lock);
354
355         dma_resv_add_fence(xe_vm_resv(vm), pfence,
356                            DMA_RESV_USAGE_BOOKKEEP);
357
358         xe_vm_fence_all_extobjs(vm, pfence, DMA_RESV_USAGE_BOOKKEEP);
359
360         /*
361          * Check to see if a preemption on the VM or a userptr invalidation
362          * is in flight; if so, trigger this preempt fence to sync state with
363          * the other preempt fences on the VM.
364          */
365         wait = __xe_vm_userptr_needs_repin(vm) || preempt_fences_waiting(vm);
366         if (wait)
367                 dma_fence_enable_sw_signaling(pfence);
368
369         up_read(&vm->userptr.notifier_lock);
370
371 out_unlock:
372         xe_vm_unlock_dma_resv(vm, tv_onstack, tv, &ww, &objs);
373 out_unlock_outer:
374         up_write(&vm->lock);
375
376         return err;
377 }
378
379 /**
380  * __xe_vm_userptr_needs_repin() - Check whether the VM does have userptrs
381  * that need repinning.
382  * @vm: The VM.
383  *
384  * This function checks whether the VM has userptrs that need repinning,
385  * and provides a release-type barrier on the userptr.notifier_lock after
386  * checking.
387  *
388  * Return: 0 if there are no userptrs needing repinning, -EAGAIN if there are.
389  */
390 int __xe_vm_userptr_needs_repin(struct xe_vm *vm)
391 {
392         lockdep_assert_held_read(&vm->userptr.notifier_lock);
393
394         return (list_empty(&vm->userptr.repin_list) &&
395                 list_empty(&vm->userptr.invalidated)) ? 0 : -EAGAIN;
396 }
397
398 /**
399  * xe_vm_lock_dma_resv() - Lock the vm dma_resv object and the dma_resv
400  * objects of the vm's external buffer objects.
401  * @vm: The vm.
402  * @ww: Pointer to a struct ww_acquire_ctx locking context.
403  * @tv_onstack: Array size XE_ONSTACK_TV of storage for the struct
404  * ttm_validate_buffers used for locking.
405  * @tv: Pointer to a pointer that on output contains the actual storage used.
406  * @objs: List head for the buffer objects locked.
407  * @intr: Whether to lock interruptible.
408  * @num_shared: Number of dma-fence slots to reserve in the locked objects.
409  *
410  * Locks the vm dma-resv objects and all the dma-resv objects of the
411  * buffer objects on the vm external object list. The TTM utilities require
412  * a list of struct ttm_validate_buffers pointing to the actual buffer
413  * objects to lock. Storage for those struct ttm_validate_buffers should
414  * be provided in @tv_onstack, and is typically reserved on the stack
415  * of the caller. If the size of @tv_onstack isn't sufficient, then
416  * storage will be allocated internally using kvmalloc().
417  *
418  * The function performs deadlock handling internally, and after a
419  * successful return the ww locking transaction should be considered
420  * sealed.
421  *
422  * Return: 0 on success, Negative error code on error. In particular if
423  * @intr is set to true, -EINTR or -ERESTARTSYS may be returned. In case
424  * of error, any locking performed has been reverted.
425  */
426 int xe_vm_lock_dma_resv(struct xe_vm *vm, struct ww_acquire_ctx *ww,
427                         struct ttm_validate_buffer *tv_onstack,
428                         struct ttm_validate_buffer **tv,
429                         struct list_head *objs,
430                         bool intr,
431                         unsigned int num_shared)
432 {
433         struct ttm_validate_buffer *tv_vm, *tv_bo;
434         struct xe_vma *vma, *next;
435         LIST_HEAD(dups);
436         int err;
437
438         lockdep_assert_held(&vm->lock);
439
440         if (vm->extobj.entries < XE_ONSTACK_TV) {
441                 tv_vm = tv_onstack;
442         } else {
443                 tv_vm = kvmalloc_array(vm->extobj.entries + 1, sizeof(*tv_vm),
444                                        GFP_KERNEL);
445                 if (!tv_vm)
446                         return -ENOMEM;
447         }
448         tv_bo = tv_vm + 1;
449
450         INIT_LIST_HEAD(objs);
451         list_for_each_entry(vma, &vm->extobj.list, extobj.link) {
452                 tv_bo->num_shared = num_shared;
453                 tv_bo->bo = &xe_vma_bo(vma)->ttm;
454
455                 list_add_tail(&tv_bo->head, objs);
456                 tv_bo++;
457         }
458         tv_vm->num_shared = num_shared;
459         tv_vm->bo = xe_vm_ttm_bo(vm);
460         list_add_tail(&tv_vm->head, objs);
461         err = ttm_eu_reserve_buffers(ww, objs, intr, &dups);
462         if (err)
463                 goto out_err;
464
465         spin_lock(&vm->notifier.list_lock);
466         list_for_each_entry_safe(vma, next, &vm->notifier.rebind_list,
467                                  notifier.rebind_link) {
468                 xe_bo_assert_held(xe_vma_bo(vma));
469
470                 list_del_init(&vma->notifier.rebind_link);
471                 if (vma->tile_present && !(vma->gpuva.flags & XE_VMA_DESTROYED))
472                         list_move_tail(&vma->combined_links.rebind,
473                                        &vm->rebind_list);
474         }
475         spin_unlock(&vm->notifier.list_lock);
476
477         *tv = tv_vm;
478         return 0;
479
480 out_err:
481         if (tv_vm != tv_onstack)
482                 kvfree(tv_vm);
483
484         return err;
485 }
486
487 /**
488  * xe_vm_unlock_dma_resv() - Unlock reservation objects locked by
489  * xe_vm_lock_dma_resv()
490  * @vm: The vm.
491  * @tv_onstack: The @tv_onstack array given to xe_vm_lock_dma_resv().
492  * @tv: The value of *@tv given by xe_vm_lock_dma_resv().
493  * @ww: The ww_acquire_context used for locking.
494  * @objs: The list returned from xe_vm_lock_dma_resv().
495  *
496  * Unlocks the reservation objects and frees any memory allocated by
497  * xe_vm_lock_dma_resv().
498  */
499 void xe_vm_unlock_dma_resv(struct xe_vm *vm,
500                            struct ttm_validate_buffer *tv_onstack,
501                            struct ttm_validate_buffer *tv,
502                            struct ww_acquire_ctx *ww,
503                            struct list_head *objs)
504 {
505         /*
506          * Nothing should've been able to enter the list while we were locked,
507          * since we've held the dma-resvs of all the vm's external objects,
508          * and holding the dma_resv of an object is required for list
509          * addition, and we shouldn't add ourselves.
510          */
511         XE_WARN_ON(!list_empty(&vm->notifier.rebind_list));
512
513         ttm_eu_backoff_reservation(ww, objs);
514         if (tv && tv != tv_onstack)
515                 kvfree(tv);
516 }
517
518 #define XE_VM_REBIND_RETRY_TIMEOUT_MS 1000
519
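/*
 * Mark the vm as banned under its dma-resv lock and kill all of its
 * compute exec queues. Called with vm->lock held, e.g. when the rebind
 * worker hits an unrecoverable error.
 */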
520 static void xe_vm_kill(struct xe_vm *vm)
521 {
522         struct ww_acquire_ctx ww;
523         struct xe_exec_queue *q;
524
525         lockdep_assert_held(&vm->lock);
526
527         xe_vm_lock(vm, &ww, 0, false);
528         vm->flags |= XE_VM_FLAG_BANNED;
529         trace_xe_vm_kill(vm);
530
531         list_for_each_entry(q, &vm->preempt.exec_queues, compute.link)
532                 q->ops->kill(q);
533         xe_vm_unlock(vm, &ww);
534
535         /* TODO: Inform user the VM is banned */
536 }
537
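/*
 * Rebind worker for compute-mode VMs. With vm->lock and all relevant
 * dma-resv locks held it repins invalidated userptrs, revalidates evicted
 * BOs, rebinds them and waits for the rebinds and pending unbinds to
 * complete, then arms fresh preempt fences and resumes the exec queues.
 * Retries on -EAGAIN and, for a limited time, on -ENOMEM; any other error
 * kills the vm.
 */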
538 static void preempt_rebind_work_func(struct work_struct *w)
539 {
540         struct xe_vm *vm = container_of(w, struct xe_vm, preempt.rebind_work);
541         struct xe_vma *vma;
542         struct ttm_validate_buffer tv_onstack[XE_ONSTACK_TV];
543         struct ttm_validate_buffer *tv;
544         struct ww_acquire_ctx ww;
545         struct list_head objs;
546         struct dma_fence *rebind_fence;
547         unsigned int fence_count = 0;
548         LIST_HEAD(preempt_fences);
549         ktime_t end = 0;
550         int err;
551         long wait;
552         int __maybe_unused tries = 0;
553
554         XE_WARN_ON(!xe_vm_in_compute_mode(vm));
555         trace_xe_vm_rebind_worker_enter(vm);
556
557         down_write(&vm->lock);
558
559         if (xe_vm_is_closed_or_banned(vm)) {
560                 up_write(&vm->lock);
561                 trace_xe_vm_rebind_worker_exit(vm);
562                 return;
563         }
564
565 retry:
566         if (vm->async_ops.error)
567                 goto out_unlock_outer;
568
569         /*
570          * Extreme corner where we exit a VM error state with a munmap style VM
571          * unbind inflight which requires a rebind. In this case the rebind
572          * needs to install some fences into the dma-resv slots. The worker to
573          * do this is queued; let that worker make progress by dropping vm->lock
574          * and trying this again.
575          */
576         if (vm->async_ops.munmap_rebind_inflight) {
577                 up_write(&vm->lock);
578                 flush_work(&vm->async_ops.work);
579                 goto retry;
580         }
581
582         if (xe_vm_userptr_check_repin(vm)) {
583                 err = xe_vm_userptr_pin(vm);
584                 if (err)
585                         goto out_unlock_outer;
586         }
587
588         err = xe_vm_lock_dma_resv(vm, &ww, tv_onstack, &tv, &objs,
589                                   false, vm->preempt.num_exec_queues);
590         if (err)
591                 goto out_unlock_outer;
592
593         if (xe_vm_is_idle(vm)) {
594                 vm->preempt.rebind_deactivated = true;
595                 goto out_unlock;
596         }
597
598         /* Fresh preempt fences already installed. Everything is running. */
599         if (!preempt_fences_waiting(vm))
600                 goto out_unlock;
601
602         /*
603          * This makes sure vm is completely suspended and also balances
604          * This makes sure the vm is completely suspended and also balances
605          * exec queue suspend and resume; we resume *all* vm exec queues below.
606         err = wait_for_existing_preempt_fences(vm);
607         if (err)
608                 goto out_unlock;
609
610         err = alloc_preempt_fences(vm, &preempt_fences, &fence_count);
611         if (err)
612                 goto out_unlock;
613
614         list_for_each_entry(vma, &vm->rebind_list, combined_links.rebind) {
615                 if (xe_vma_has_no_bo(vma) ||
616                     vma->gpuva.flags & XE_VMA_DESTROYED)
617                         continue;
618
619                 err = xe_bo_validate(xe_vma_bo(vma), vm, false);
620                 if (err)
621                         goto out_unlock;
622         }
623
624         rebind_fence = xe_vm_rebind(vm, true);
625         if (IS_ERR(rebind_fence)) {
626                 err = PTR_ERR(rebind_fence);
627                 goto out_unlock;
628         }
629
630         if (rebind_fence) {
631                 dma_fence_wait(rebind_fence, false);
632                 dma_fence_put(rebind_fence);
633         }
634
635         /* Wait on munmap style VM unbinds */
636         wait = dma_resv_wait_timeout(xe_vm_resv(vm),
637                                      DMA_RESV_USAGE_KERNEL,
638                                      false, MAX_SCHEDULE_TIMEOUT);
639         if (wait <= 0) {
640                 err = -ETIME;
641                 goto out_unlock;
642         }
643
644 #define retry_required(__tries, __vm) \
645         (IS_ENABLED(CONFIG_DRM_XE_USERPTR_INVAL_INJECT) ? \
646         (!(__tries)++ || __xe_vm_userptr_needs_repin(__vm)) : \
647         __xe_vm_userptr_needs_repin(__vm))
648
649         down_read(&vm->userptr.notifier_lock);
650         if (retry_required(tries, vm)) {
651                 up_read(&vm->userptr.notifier_lock);
652                 err = -EAGAIN;
653                 goto out_unlock;
654         }
655
656 #undef retry_required
657
658         spin_lock(&vm->xe->ttm.lru_lock);
659         ttm_lru_bulk_move_tail(&vm->lru_bulk_move);
660         spin_unlock(&vm->xe->ttm.lru_lock);
661
662         /* Point of no return. */
663         arm_preempt_fences(vm, &preempt_fences);
664         resume_and_reinstall_preempt_fences(vm);
665         up_read(&vm->userptr.notifier_lock);
666
667 out_unlock:
668         xe_vm_unlock_dma_resv(vm, tv_onstack, tv, &ww, &objs);
669 out_unlock_outer:
670         if (err == -EAGAIN) {
671                 trace_xe_vm_rebind_worker_retry(vm);
672                 goto retry;
673         }
674
675         /*
676          * With multiple active VMs, under memory pressure, it is possible that
677          * ttm_bo_validate() runs into -EDEADLK and in such a case returns -ENOMEM.
678          * Until ttm properly handles locking in such scenarios, the best thing the
679          * driver can do is retry with a timeout. Killing the VM or putting it
680          * in error state after timeout or other error scenarios is still TBD.
681          */
682         if (err == -ENOMEM) {
683                 ktime_t cur = ktime_get();
684
685                 end = end ? : ktime_add_ms(cur, XE_VM_REBIND_RETRY_TIMEOUT_MS);
686                 if (ktime_before(cur, end)) {
687                         msleep(20);
688                         trace_xe_vm_rebind_worker_retry(vm);
689                         goto retry;
690                 }
691         }
692         if (err) {
693                 drm_warn(&vm->xe->drm, "VM worker error: %d\n", err);
694                 xe_vm_kill(vm);
695         }
696         up_write(&vm->lock);
697
698         free_preempt_fences(&preempt_fences);
699
700         trace_xe_vm_rebind_worker_exit(vm);
701 }
702
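/*
 * MMU interval notifier callback for userptr vmas. Bumps the notifier
 * sequence number and, outside of fault mode, moves the vma onto the vm's
 * invalidated list so the exec and rebind paths repin it. It then waits
 * for GPU access tracked in the vm's reservation object to finish; in
 * fault mode the vma is also invalidated immediately.
 */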
703 static bool vma_userptr_invalidate(struct mmu_interval_notifier *mni,
704                                    const struct mmu_notifier_range *range,
705                                    unsigned long cur_seq)
706 {
707         struct xe_vma *vma = container_of(mni, struct xe_vma, userptr.notifier);
708         struct xe_vm *vm = xe_vma_vm(vma);
709         struct dma_resv_iter cursor;
710         struct dma_fence *fence;
711         long err;
712
713         XE_WARN_ON(!xe_vma_is_userptr(vma));
714         trace_xe_vma_userptr_invalidate(vma);
715
716         if (!mmu_notifier_range_blockable(range))
717                 return false;
718
719         down_write(&vm->userptr.notifier_lock);
720         mmu_interval_set_seq(mni, cur_seq);
721
722         /* No need to stop gpu access if the userptr is not yet bound. */
723         if (!vma->userptr.initial_bind) {
724                 up_write(&vm->userptr.notifier_lock);
725                 return true;
726         }
727
728         /*
729          * Tell exec and rebind worker they need to repin and rebind this
730          * userptr.
731          */
732         if (!xe_vm_in_fault_mode(vm) &&
733             !(vma->gpuva.flags & XE_VMA_DESTROYED) && vma->tile_present) {
734                 spin_lock(&vm->userptr.invalidated_lock);
735                 list_move_tail(&vma->userptr.invalidate_link,
736                                &vm->userptr.invalidated);
737                 spin_unlock(&vm->userptr.invalidated_lock);
738         }
739
740         up_write(&vm->userptr.notifier_lock);
741
742         /*
743          * Preempt fences turn into schedule disables, pipeline these.
744          * Note that even in fault mode, we need to wait for binds and
745          * unbinds to complete, and those are attached as BOOKKEEP fences
746          * to the vm.
747          */
748         dma_resv_iter_begin(&cursor, xe_vm_resv(vm),
749                             DMA_RESV_USAGE_BOOKKEEP);
750         dma_resv_for_each_fence_unlocked(&cursor, fence)
751                 dma_fence_enable_sw_signaling(fence);
752         dma_resv_iter_end(&cursor);
753
754         err = dma_resv_wait_timeout(xe_vm_resv(vm),
755                                     DMA_RESV_USAGE_BOOKKEEP,
756                                     false, MAX_SCHEDULE_TIMEOUT);
757         XE_WARN_ON(err <= 0);
758
759         if (xe_vm_in_fault_mode(vm)) {
760                 err = xe_vm_invalidate_vma(vma);
761                 XE_WARN_ON(err);
762         }
763
764         trace_xe_vma_userptr_invalidate_complete(vma);
765
766         return true;
767 }
768
769 static const struct mmu_interval_notifier_ops vma_userptr_notifier_ops = {
770         .invalidate = vma_userptr_invalidate,
771 };
772
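/*
 * Pin pages for all userptr vmas that have been invalidated since their
 * last successful pin and move them to the vm's rebind list. On error the
 * collected vmas are put back on the repin list so a later call can retry.
 */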
773 int xe_vm_userptr_pin(struct xe_vm *vm)
774 {
775         struct xe_vma *vma, *next;
776         int err = 0;
777         LIST_HEAD(tmp_evict);
778
779         lockdep_assert_held_write(&vm->lock);
780
781         /* Collect invalidated userptrs */
782         spin_lock(&vm->userptr.invalidated_lock);
783         list_for_each_entry_safe(vma, next, &vm->userptr.invalidated,
784                                  userptr.invalidate_link) {
785                 list_del_init(&vma->userptr.invalidate_link);
786                 if (list_empty(&vma->combined_links.userptr))
787                         list_move_tail(&vma->combined_links.userptr,
788                                        &vm->userptr.repin_list);
789         }
790         spin_unlock(&vm->userptr.invalidated_lock);
791
792         /* Pin and move to temporary list */
793         list_for_each_entry_safe(vma, next, &vm->userptr.repin_list,
794                                  combined_links.userptr) {
795                 err = xe_vma_userptr_pin_pages(vma);
796                 if (err < 0)
797                         goto out_err;
798
799                 list_move_tail(&vma->combined_links.userptr, &tmp_evict);
800         }
801
802         /* Take lock and move to rebind_list for rebinding. */
803         err = dma_resv_lock_interruptible(xe_vm_resv(vm), NULL);
804         if (err)
805                 goto out_err;
806
807         list_for_each_entry_safe(vma, next, &tmp_evict, combined_links.userptr)
808                 list_move_tail(&vma->combined_links.rebind, &vm->rebind_list);
809
810         dma_resv_unlock(xe_vm_resv(vm));
811
812         return 0;
813
814 out_err:
815         list_splice_tail(&tmp_evict, &vm->userptr.repin_list);
816
817         return err;
818 }
819
820 /**
821  * xe_vm_userptr_check_repin() - Check whether the VM might have userptrs
822  * that need repinning.
823  * @vm: The VM.
824  *
825  * This function does an advisory check for whether the VM has userptrs that
826  * need repinning.
827  *
828  * Return: 0 if there are no indications of userptrs needing repinning,
829  * -EAGAIN if there are.
830  */
831 int xe_vm_userptr_check_repin(struct xe_vm *vm)
832 {
833         return (list_empty_careful(&vm->userptr.repin_list) &&
834                 list_empty_careful(&vm->userptr.invalidated)) ? 0 : -EAGAIN;
835 }
836
837 static struct dma_fence *
838 xe_vm_bind_vma(struct xe_vma *vma, struct xe_exec_queue *q,
839                struct xe_sync_entry *syncs, u32 num_syncs,
840                bool first_op, bool last_op);
841
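/*
 * Rebind everything on the vm's rebind list, either from the exec path or
 * from the compute-mode rebind worker. Returns the fence of the last
 * rebind issued, NULL if there was nothing to do, or an ERR_PTR on
 * failure.
 */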
842 struct dma_fence *xe_vm_rebind(struct xe_vm *vm, bool rebind_worker)
843 {
844         struct dma_fence *fence = NULL;
845         struct xe_vma *vma, *next;
846
847         lockdep_assert_held(&vm->lock);
848         if (xe_vm_no_dma_fences(vm) && !rebind_worker)
849                 return NULL;
850
851         xe_vm_assert_held(vm);
852         list_for_each_entry_safe(vma, next, &vm->rebind_list,
853                                  combined_links.rebind) {
854                 XE_WARN_ON(!vma->tile_present);
855
856                 list_del_init(&vma->combined_links.rebind);
857                 dma_fence_put(fence);
858                 if (rebind_worker)
859                         trace_xe_vma_rebind_worker(vma);
860                 else
861                         trace_xe_vma_rebind_exec(vma);
862                 fence = xe_vm_bind_vma(vma, NULL, NULL, 0, false, false);
863                 if (IS_ERR(fence))
864                         return fence;
865         }
866
867         return fence;
868 }
869
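/*
 * Allocate and initialize a vma covering [start, end]. BO-backed vmas take
 * a reference on the object and are linked to its drm_gpuvm_bo; userptr
 * vmas instead register an MMU interval notifier on the current mm; NULL
 * (sparse) bindings carry neither.
 */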
870 static struct xe_vma *xe_vma_create(struct xe_vm *vm,
871                                     struct xe_bo *bo,
872                                     u64 bo_offset_or_userptr,
873                                     u64 start, u64 end,
874                                     bool read_only,
875                                     bool is_null,
876                                     u8 tile_mask)
877 {
878         struct xe_vma *vma;
879         struct xe_tile *tile;
880         u8 id;
881
882         XE_WARN_ON(start >= end);
883         XE_WARN_ON(end >= vm->size);
884
885         if (!bo && !is_null)    /* userptr */
886                 vma = kzalloc(sizeof(*vma), GFP_KERNEL);
887         else
888                 vma = kzalloc(sizeof(*vma) - sizeof(struct xe_userptr),
889                               GFP_KERNEL);
890         if (!vma) {
891                 vma = ERR_PTR(-ENOMEM);
892                 return vma;
893         }
894
895         INIT_LIST_HEAD(&vma->combined_links.rebind);
896         INIT_LIST_HEAD(&vma->notifier.rebind_link);
897         INIT_LIST_HEAD(&vma->extobj.link);
898
899         INIT_LIST_HEAD(&vma->gpuva.gem.entry);
900         vma->gpuva.vm = &vm->gpuvm;
901         vma->gpuva.va.addr = start;
902         vma->gpuva.va.range = end - start + 1;
903         if (read_only)
904                 vma->gpuva.flags |= XE_VMA_READ_ONLY;
905         if (is_null)
906                 vma->gpuva.flags |= DRM_GPUVA_SPARSE;
907
908         if (tile_mask) {
909                 vma->tile_mask = tile_mask;
910         } else {
911                 for_each_tile(tile, vm->xe, id)
912                         vma->tile_mask |= 0x1 << id;
913         }
914
915         if (vm->xe->info.platform == XE_PVC)
916                 vma->gpuva.flags |= XE_VMA_ATOMIC_PTE_BIT;
917
918         if (bo) {
919                 struct drm_gpuvm_bo *vm_bo;
920
921                 xe_bo_assert_held(bo);
922
923                 vm_bo = drm_gpuvm_bo_obtain(vma->gpuva.vm, &bo->ttm.base);
924                 if (IS_ERR(vm_bo)) {
925                         kfree(vma);
926                         return ERR_CAST(vm_bo);
927                 }
928
929                 drm_gem_object_get(&bo->ttm.base);
930                 vma->gpuva.gem.obj = &bo->ttm.base;
931                 vma->gpuva.gem.offset = bo_offset_or_userptr;
932                 drm_gpuva_link(&vma->gpuva, vm_bo);
933                 drm_gpuvm_bo_put(vm_bo);
934         } else /* userptr or null */ {
935                 if (!is_null) {
936                         u64 size = end - start + 1;
937                         int err;
938
939                         INIT_LIST_HEAD(&vma->userptr.invalidate_link);
940                         vma->gpuva.gem.offset = bo_offset_or_userptr;
941
942                         err = mmu_interval_notifier_insert(&vma->userptr.notifier,
943                                                            current->mm,
944                                                            xe_vma_userptr(vma), size,
945                                                            &vma_userptr_notifier_ops);
946                         if (err) {
947                                 kfree(vma);
948                                 vma = ERR_PTR(err);
949                                 return vma;
950                         }
951
952                         vma->userptr.notifier_seq = LONG_MAX;
953                 }
954
955                 xe_vm_get(vm);
956         }
957
958         return vma;
959 }
960
961 static bool vm_remove_extobj(struct xe_vma *vma)
962 {
963         if (!list_empty(&vma->extobj.link)) {
964                 xe_vma_vm(vma)->extobj.entries--;
965                 list_del_init(&vma->extobj.link);
966                 return true;
967         }
968         return false;
969 }
970
971 static void xe_vma_destroy_late(struct xe_vma *vma)
972 {
973         struct xe_vm *vm = xe_vma_vm(vma);
974         struct xe_device *xe = vm->xe;
975         bool read_only = xe_vma_read_only(vma);
976
977         if (xe_vma_is_userptr(vma)) {
978                 if (vma->userptr.sg) {
979                         dma_unmap_sgtable(xe->drm.dev,
980                                           vma->userptr.sg,
981                                           read_only ? DMA_TO_DEVICE :
982                                           DMA_BIDIRECTIONAL, 0);
983                         sg_free_table(vma->userptr.sg);
984                         vma->userptr.sg = NULL;
985                 }
986
987                 /*
988                  * Since userptr pages are not pinned, we can't remove
989                  * the notifier until we're sure the GPU is not accessing
990                  * them anymore.
991                  */
992                 mmu_interval_notifier_remove(&vma->userptr.notifier);
993                 xe_vm_put(vm);
994         } else if (xe_vma_is_null(vma)) {
995                 xe_vm_put(vm);
996         } else {
997                 xe_bo_put(xe_vma_bo(vma));
998         }
999
1000         kfree(vma);
1001 }
1002
1003 static void vma_destroy_work_func(struct work_struct *w)
1004 {
1005         struct xe_vma *vma =
1006                 container_of(w, struct xe_vma, destroy_work);
1007
1008         xe_vma_destroy_late(vma);
1009 }
1010
1011 static struct xe_vma *
1012 bo_has_vm_references_locked(struct xe_bo *bo, struct xe_vm *vm,
1013                             struct xe_vma *ignore)
1014 {
1015         struct drm_gpuvm_bo *vm_bo;
1016         struct drm_gpuva *va;
1017         struct drm_gem_object *obj = &bo->ttm.base;
1018
1019         xe_bo_assert_held(bo);
1020
1021         drm_gem_for_each_gpuvm_bo(vm_bo, obj) {
1022                 drm_gpuvm_bo_for_each_va(va, vm_bo) {
1023                         struct xe_vma *vma = gpuva_to_vma(va);
1024
1025                         if (vma != ignore && xe_vma_vm(vma) == vm)
1026                                 return vma;
1027                 }
1028         }
1029
1030         return NULL;
1031 }
1032
1033 static bool bo_has_vm_references(struct xe_bo *bo, struct xe_vm *vm,
1034                                  struct xe_vma *ignore)
1035 {
1036         struct ww_acquire_ctx ww;
1037         bool ret;
1038
1039         xe_bo_lock(bo, &ww, 0, false);
1040         ret = !!bo_has_vm_references_locked(bo, vm, ignore);
1041         xe_bo_unlock(bo, &ww);
1042
1043         return ret;
1044 }
1045
1046 static void __vm_insert_extobj(struct xe_vm *vm, struct xe_vma *vma)
1047 {
1048         lockdep_assert_held_write(&vm->lock);
1049
1050         list_add(&vma->extobj.link, &vm->extobj.list);
1051         vm->extobj.entries++;
1052 }
1053
1054 static void vm_insert_extobj(struct xe_vm *vm, struct xe_vma *vma)
1055 {
1056         struct xe_bo *bo = xe_vma_bo(vma);
1057
1058         lockdep_assert_held_write(&vm->lock);
1059
1060         if (bo_has_vm_references(bo, vm, vma))
1061                 return;
1062
1063         __vm_insert_extobj(vm, vma);
1064 }
1065
1066 static void vma_destroy_cb(struct dma_fence *fence,
1067                            struct dma_fence_cb *cb)
1068 {
1069         struct xe_vma *vma = container_of(cb, struct xe_vma, destroy_cb);
1070
1071         INIT_WORK(&vma->destroy_work, vma_destroy_work_func);
1072         queue_work(system_unbound_wq, &vma->destroy_work);
1073 }
1074
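/*
 * Tear down a vma's bookkeeping (userptr invalidation link, notifier
 * rebind link, gpuva link, external object tracking). The final freeing is
 * done immediately, or, if @fence is given, deferred to a worker that runs
 * once the fence has signaled.
 */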
1075 static void xe_vma_destroy(struct xe_vma *vma, struct dma_fence *fence)
1076 {
1077         struct xe_vm *vm = xe_vma_vm(vma);
1078
1079         lockdep_assert_held_write(&vm->lock);
1080         XE_WARN_ON(!list_empty(&vma->combined_links.destroy));
1081
1082         if (xe_vma_is_userptr(vma)) {
1083                 XE_WARN_ON(!(vma->gpuva.flags & XE_VMA_DESTROYED));
1084
1085                 spin_lock(&vm->userptr.invalidated_lock);
1086                 list_del(&vma->userptr.invalidate_link);
1087                 spin_unlock(&vm->userptr.invalidated_lock);
1088         } else if (!xe_vma_is_null(vma)) {
1089                 xe_bo_assert_held(xe_vma_bo(vma));
1090
1091                 spin_lock(&vm->notifier.list_lock);
1092                 list_del(&vma->notifier.rebind_link);
1093                 spin_unlock(&vm->notifier.list_lock);
1094
1095                 drm_gpuva_unlink(&vma->gpuva);
1096
1097                 if (!xe_vma_bo(vma)->vm && vm_remove_extobj(vma)) {
1098                         struct xe_vma *other;
1099
1100                         other = bo_has_vm_references_locked(xe_vma_bo(vma), vm, NULL);
1101
1102                         if (other)
1103                                 __vm_insert_extobj(vm, other);
1104                 }
1105         }
1106
1107         xe_vm_assert_held(vm);
1108         if (fence) {
1109                 int ret = dma_fence_add_callback(fence, &vma->destroy_cb,
1110                                                  vma_destroy_cb);
1111
1112                 if (ret) {
1113                         XE_WARN_ON(ret != -ENOENT);
1114                         xe_vma_destroy_late(vma);
1115                 }
1116         } else {
1117                 xe_vma_destroy_late(vma);
1118         }
1119 }
1120
1121 static void xe_vma_destroy_unlocked(struct xe_vma *vma)
1122 {
1123         struct ttm_validate_buffer tv[2];
1124         struct ww_acquire_ctx ww;
1125         struct xe_bo *bo = xe_vma_bo(vma);
1126         LIST_HEAD(objs);
1127         LIST_HEAD(dups);
1128         int err;
1129
1130         memset(tv, 0, sizeof(tv));
1131         tv[0].bo = xe_vm_ttm_bo(xe_vma_vm(vma));
1132         list_add(&tv[0].head, &objs);
1133
1134         if (bo) {
1135                 tv[1].bo = &xe_bo_get(bo)->ttm;
1136                 list_add(&tv[1].head, &objs);
1137         }
1138         err = ttm_eu_reserve_buffers(&ww, &objs, false, &dups);
1139         XE_WARN_ON(err);
1140
1141         xe_vma_destroy(vma, NULL);
1142
1143         ttm_eu_backoff_reservation(&ww, &objs);
1144         if (bo)
1145                 xe_bo_put(bo);
1146 }
1147
1148 struct xe_vma *
1149 xe_vm_find_overlapping_vma(struct xe_vm *vm, u64 start, u64 range)
1150 {
1151         struct drm_gpuva *gpuva;
1152
1153         lockdep_assert_held(&vm->lock);
1154
1155         if (xe_vm_is_closed_or_banned(vm))
1156                 return NULL;
1157
1158         XE_WARN_ON(start + range > vm->size);
1159
1160         gpuva = drm_gpuva_find_first(&vm->gpuvm, start, range);
1161
1162         return gpuva ? gpuva_to_vma(gpuva) : NULL;
1163 }
1164
1165 static int xe_vm_insert_vma(struct xe_vm *vm, struct xe_vma *vma)
1166 {
1167         int err;
1168
1169         XE_WARN_ON(xe_vma_vm(vma) != vm);
1170         lockdep_assert_held(&vm->lock);
1171
1172         err = drm_gpuva_insert(&vm->gpuvm, &vma->gpuva);
1173         XE_WARN_ON(err);        /* Shouldn't be possible */
1174
1175         return err;
1176 }
1177
1178 static void xe_vm_remove_vma(struct xe_vm *vm, struct xe_vma *vma)
1179 {
1180         XE_WARN_ON(xe_vma_vm(vma) != vm);
1181         lockdep_assert_held(&vm->lock);
1182
1183         drm_gpuva_remove(&vma->gpuva);
1184         if (vm->usm.last_fault_vma == vma)
1185                 vm->usm.last_fault_vma = NULL;
1186 }
1187
1188 static struct drm_gpuva_op *xe_vm_op_alloc(void)
1189 {
1190         struct xe_vma_op *op;
1191
1192         op = kzalloc(sizeof(*op), GFP_KERNEL);
1193
1194         if (unlikely(!op))
1195                 return NULL;
1196
1197         return &op->base;
1198 }
1199
1200 static void xe_vm_free(struct drm_gpuvm *gpuvm);
1201
1202 static struct drm_gpuvm_ops gpuvm_ops = {
1203         .op_alloc = xe_vm_op_alloc,
1204         .vm_free = xe_vm_free,
1205 };
1206
1207 static void xe_vma_op_work_func(struct work_struct *w);
1208 static void vm_destroy_work_func(struct work_struct *w);
1209
1210 struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags)
1211 {
1212         struct drm_gem_object *vm_resv_obj;
1213         struct xe_vm *vm;
1214         int err, number_tiles = 0;
1215         struct xe_tile *tile;
1216         u8 id;
1217
1218         vm = kzalloc(sizeof(*vm), GFP_KERNEL);
1219         if (!vm)
1220                 return ERR_PTR(-ENOMEM);
1221
1222         vm->xe = xe;
1223
1224         vm->size = 1ull << xe->info.va_bits;
1225
1226         vm->flags = flags;
1227
1228         init_rwsem(&vm->lock);
1229
1230         INIT_LIST_HEAD(&vm->rebind_list);
1231
1232         INIT_LIST_HEAD(&vm->userptr.repin_list);
1233         INIT_LIST_HEAD(&vm->userptr.invalidated);
1234         init_rwsem(&vm->userptr.notifier_lock);
1235         spin_lock_init(&vm->userptr.invalidated_lock);
1236
1237         INIT_LIST_HEAD(&vm->notifier.rebind_list);
1238         spin_lock_init(&vm->notifier.list_lock);
1239
1240         INIT_LIST_HEAD(&vm->async_ops.pending);
1241         INIT_WORK(&vm->async_ops.work, xe_vma_op_work_func);
1242         spin_lock_init(&vm->async_ops.lock);
1243
1244         INIT_WORK(&vm->destroy_work, vm_destroy_work_func);
1245
1246         INIT_LIST_HEAD(&vm->preempt.exec_queues);
1247         vm->preempt.min_run_period_ms = 10;     /* FIXME: Wire up to uAPI */
1248
1249         for_each_tile(tile, xe, id)
1250                 xe_range_fence_tree_init(&vm->rftree[id]);
1251
1252         INIT_LIST_HEAD(&vm->extobj.list);
1253
1254         if (!(flags & XE_VM_FLAG_MIGRATION))
1255                 xe_device_mem_access_get(xe);
1256
1257         vm_resv_obj = drm_gpuvm_resv_object_alloc(&xe->drm);
1258         if (!vm_resv_obj) {
1259                 err = -ENOMEM;
1260                 goto err_no_resv;
1261         }
1262
1263         drm_gpuvm_init(&vm->gpuvm, "Xe VM", 0, &xe->drm, vm_resv_obj,
1264                        0, vm->size, 0, 0, &gpuvm_ops);
1265
1266         drm_gem_object_put(vm_resv_obj);
1267
1268         err = dma_resv_lock_interruptible(xe_vm_resv(vm), NULL);
1269         if (err)
1270                 goto err_close;
1271
1272         if (IS_DGFX(xe) && xe->info.vram_flags & XE_VRAM_FLAGS_NEED64K)
1273                 vm->flags |= XE_VM_FLAG_64K;
1274
1275         for_each_tile(tile, xe, id) {
1276                 if (flags & XE_VM_FLAG_MIGRATION &&
1277                     tile->id != XE_VM_FLAG_TILE_ID(flags))
1278                         continue;
1279
1280                 vm->pt_root[id] = xe_pt_create(vm, tile, xe->info.vm_max_level);
1281                 if (IS_ERR(vm->pt_root[id])) {
1282                         err = PTR_ERR(vm->pt_root[id]);
1283                         vm->pt_root[id] = NULL;
1284                         goto err_unlock_close;
1285                 }
1286         }
1287
1288         if (flags & XE_VM_FLAG_SCRATCH_PAGE) {
1289                 for_each_tile(tile, xe, id) {
1290                         if (!vm->pt_root[id])
1291                                 continue;
1292
1293                         err = xe_pt_create_scratch(xe, tile, vm);
1294                         if (err)
1295                                 goto err_unlock_close;
1296                 }
1297                 vm->batch_invalidate_tlb = true;
1298         }
1299
1300         if (flags & XE_VM_FLAG_COMPUTE_MODE) {
1301                 INIT_WORK(&vm->preempt.rebind_work, preempt_rebind_work_func);
1302                 vm->flags |= XE_VM_FLAG_COMPUTE_MODE;
1303                 vm->batch_invalidate_tlb = false;
1304         }
1305
1306         if (flags & XE_VM_FLAG_ASYNC_BIND_OPS) {
1307                 vm->async_ops.fence.context = dma_fence_context_alloc(1);
1308                 vm->flags |= XE_VM_FLAG_ASYNC_BIND_OPS;
1309         }
1310
1311         /* Fill pt_root after allocating scratch tables */
1312         for_each_tile(tile, xe, id) {
1313                 if (!vm->pt_root[id])
1314                         continue;
1315
1316                 xe_pt_populate_empty(tile, vm, vm->pt_root[id]);
1317         }
1318         dma_resv_unlock(xe_vm_resv(vm));
1319
1320         /* Kernel migration VM shouldn't have a circular loop. */
1321         if (!(flags & XE_VM_FLAG_MIGRATION)) {
1322                 for_each_tile(tile, xe, id) {
1323                         struct xe_gt *gt = tile->primary_gt;
1324                         struct xe_vm *migrate_vm;
1325                         struct xe_exec_queue *q;
1326
1327                         if (!vm->pt_root[id])
1328                                 continue;
1329
1330                         migrate_vm = xe_migrate_get_vm(tile->migrate);
1331                         q = xe_exec_queue_create_class(xe, gt, migrate_vm,
1332                                                        XE_ENGINE_CLASS_COPY,
1333                                                        EXEC_QUEUE_FLAG_VM);
1334                         xe_vm_put(migrate_vm);
1335                         if (IS_ERR(q)) {
1336                                 err = PTR_ERR(q);
1337                                 goto err_close;
1338                         }
1339                         vm->q[id] = q;
1340                         number_tiles++;
1341                 }
1342         }
1343
1344         if (number_tiles > 1)
1345                 vm->composite_fence_ctx = dma_fence_context_alloc(1);
1346
1347         mutex_lock(&xe->usm.lock);
1348         if (flags & XE_VM_FLAG_FAULT_MODE)
1349                 xe->usm.num_vm_in_fault_mode++;
1350         else if (!(flags & XE_VM_FLAG_MIGRATION))
1351                 xe->usm.num_vm_in_non_fault_mode++;
1352         mutex_unlock(&xe->usm.lock);
1353
1354         trace_xe_vm_create(vm);
1355
1356         return vm;
1357
1358 err_unlock_close:
1359         dma_resv_unlock(xe_vm_resv(vm));
1360 err_close:
1361         xe_vm_close_and_put(vm);
1362         return ERR_PTR(err);
1363
1364 err_no_resv:
1365         for_each_tile(tile, xe, id)
1366                 xe_range_fence_tree_fini(&vm->rftree[id]);
1367         kfree(vm);
1368         if (!(flags & XE_VM_FLAG_MIGRATION))
1369                 xe_device_mem_access_put(xe);
1370         return ERR_PTR(err);
1371 }
1372
1373 static void flush_async_ops(struct xe_vm *vm)
1374 {
1375         queue_work(system_unbound_wq, &vm->async_ops.work);
1376         flush_work(&vm->async_ops.work);
1377 }
1378
1379 static void vm_error_capture(struct xe_vm *vm, int err,
1380                              u32 op, u64 addr, u64 size)
1381 {
1382         struct drm_xe_vm_bind_op_error_capture capture;
1383         u64 __user *address =
1384                 u64_to_user_ptr(vm->async_ops.error_capture.addr);
1385         bool in_kthread = !current->mm;
1386
1387         capture.error = err;
1388         capture.op = op;
1389         capture.addr = addr;
1390         capture.size = size;
1391
1392         if (in_kthread) {
1393                 if (!mmget_not_zero(vm->async_ops.error_capture.mm))
1394                         goto mm_closed;
1395                 kthread_use_mm(vm->async_ops.error_capture.mm);
1396         }
1397
1398         if (copy_to_user(address, &capture, sizeof(capture)))
1399                 XE_WARN_ON("Copy to user failed");
1400
1401         if (in_kthread) {
1402                 kthread_unuse_mm(vm->async_ops.error_capture.mm);
1403                 mmput(vm->async_ops.error_capture.mm);
1404         }
1405
1406 mm_closed:
1407         wake_up_all(&vm->async_ops.error_capture.wq);
1408 }
1409
1410 static void xe_vm_close(struct xe_vm *vm)
1411 {
1412         down_write(&vm->lock);
1413         vm->size = 0;
1414         up_write(&vm->lock);
1415 }
1416
1417 void xe_vm_close_and_put(struct xe_vm *vm)
1418 {
1419         LIST_HEAD(contested);
1420         struct ww_acquire_ctx ww;
1421         struct xe_device *xe = vm->xe;
1422         struct xe_tile *tile;
1423         struct xe_vma *vma, *next_vma;
1424         struct drm_gpuva *gpuva, *next;
1425         u8 id;
1426
1427         XE_WARN_ON(vm->preempt.num_exec_queues);
1428
1429         xe_vm_close(vm);
1430         flush_async_ops(vm);
1431         if (xe_vm_in_compute_mode(vm))
1432                 flush_work(&vm->preempt.rebind_work);
1433
1434         for_each_tile(tile, xe, id) {
1435                 if (vm->q[id]) {
1436                         xe_exec_queue_kill(vm->q[id]);
1437                         xe_exec_queue_put(vm->q[id]);
1438                         vm->q[id] = NULL;
1439                 }
1440         }
1441
1442         down_write(&vm->lock);
1443         xe_vm_lock(vm, &ww, 0, false);
1444         drm_gpuvm_for_each_va_safe(gpuva, next, &vm->gpuvm) {
1445                 vma = gpuva_to_vma(gpuva);
1446
1447                 if (xe_vma_has_no_bo(vma)) {
1448                         down_read(&vm->userptr.notifier_lock);
1449                         vma->gpuva.flags |= XE_VMA_DESTROYED;
1450                         up_read(&vm->userptr.notifier_lock);
1451                 }
1452
1453                 xe_vm_remove_vma(vm, vma);
1454
1455                 /* easy case, remove from VMA? */
1456                 if (xe_vma_has_no_bo(vma) || xe_vma_bo(vma)->vm) {
1457                         list_del_init(&vma->combined_links.rebind);
1458                         xe_vma_destroy(vma, NULL);
1459                         continue;
1460                 }
1461
1462                 list_move_tail(&vma->combined_links.destroy, &contested);
1463                 vma->gpuva.flags |= XE_VMA_DESTROYED;
1464         }
1465
1466         /*
1467          * All vm operations will add shared fences to resv.
1468          * The only exception is eviction for a shared object,
1469          * but even so, the unbind when evicted would still
1470          * install a fence to resv. Hence it's safe to
1471          * destroy the pagetables immediately.
1472          */
1473         for_each_tile(tile, xe, id) {
1474                 if (vm->scratch_bo[id]) {
1475                         u32 i;
1476
1477                         xe_bo_unpin(vm->scratch_bo[id]);
1478                         xe_bo_put(vm->scratch_bo[id]);
1479                         for (i = 0; i < vm->pt_root[id]->level; i++)
1480                                 xe_pt_destroy(vm->scratch_pt[id][i], vm->flags,
1481                                               NULL);
1482                 }
1483                 if (vm->pt_root[id]) {
1484                         xe_pt_destroy(vm->pt_root[id], vm->flags, NULL);
1485                         vm->pt_root[id] = NULL;
1486                 }
1487         }
1488         xe_vm_unlock(vm, &ww);
1489
1490         /*
1491          * VM is now dead, cannot re-add nodes to vm->vmas if it's NULL
1492          * Since we hold a refcount to the bo, we can remove and free
1493          * the members safely without locking.
1494          */
1495         list_for_each_entry_safe(vma, next_vma, &contested,
1496                                  combined_links.destroy) {
1497                 list_del_init(&vma->combined_links.destroy);
1498                 xe_vma_destroy_unlocked(vma);
1499         }
1500
1501         if (vm->async_ops.error_capture.addr)
1502                 wake_up_all(&vm->async_ops.error_capture.wq);
1503
1504         XE_WARN_ON(!list_empty(&vm->extobj.list));
1505         up_write(&vm->lock);
1506
1507         mutex_lock(&xe->usm.lock);
1508         if (vm->flags & XE_VM_FLAG_FAULT_MODE)
1509                 xe->usm.num_vm_in_fault_mode--;
1510         else if (!(vm->flags & XE_VM_FLAG_MIGRATION))
1511                 xe->usm.num_vm_in_non_fault_mode--;
1512         mutex_unlock(&xe->usm.lock);
1513
1514         for_each_tile(tile, xe, id)
1515                 xe_range_fence_tree_fini(&vm->rftree[id]);
1516
1517         xe_vm_put(vm);
1518 }
1519
1520 static void vm_destroy_work_func(struct work_struct *w)
1521 {
1522         struct xe_vm *vm =
1523                 container_of(w, struct xe_vm, destroy_work);
1524         struct xe_device *xe = vm->xe;
1525         struct xe_tile *tile;
1526         u8 id;
1527         void *lookup;
1528
1529         /* xe_vm_close_and_put was not called? */
1530         XE_WARN_ON(vm->size);
1531
1532         if (!(vm->flags & XE_VM_FLAG_MIGRATION)) {
1533                 xe_device_mem_access_put(xe);
1534
1535                 if (xe->info.has_asid) {
1536                         mutex_lock(&xe->usm.lock);
1537                         lookup = xa_erase(&xe->usm.asid_to_vm, vm->usm.asid);
1538                         XE_WARN_ON(lookup != vm);
1539                         mutex_unlock(&xe->usm.lock);
1540                 }
1541         }
1542
1543         for_each_tile(tile, xe, id)
1544                 XE_WARN_ON(vm->pt_root[id]);
1545
1546         trace_xe_vm_free(vm);
1547         dma_fence_put(vm->rebind_fence);
1548         kfree(vm);
1549 }
1550
1551 static void xe_vm_free(struct drm_gpuvm *gpuvm)
1552 {
1553         struct xe_vm *vm = container_of(gpuvm, struct xe_vm, gpuvm);
1554
1555         /* To destroy the VM we need to be able to sleep */
1556         queue_work(system_unbound_wq, &vm->destroy_work);
1557 }
1558
1559 struct xe_vm *xe_vm_lookup(struct xe_file *xef, u32 id)
1560 {
1561         struct xe_vm *vm;
1562
1563         mutex_lock(&xef->vm.lock);
1564         vm = xa_load(&xef->vm.xa, id);
1565         if (vm)
1566                 xe_vm_get(vm);
1567         mutex_unlock(&xef->vm.lock);
1568
1569         return vm;
1570 }
1571
1572 u64 xe_vm_pdp4_descriptor(struct xe_vm *vm, struct xe_tile *tile)
1573 {
1574         return xe_pde_encode(vm->pt_root[tile->id]->bo, 0,
1575                              XE_CACHE_WB);
1576 }
1577
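     /*
      * Unbind a VMA from every tile it is currently bound to. When more than
      * one tile is involved, the per-tile unbind fences are combined into a
      * single dma_fence_array, and on the last op in a sequence the syncs are
      * signalled with the resulting fence.
      */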
1578 static struct dma_fence *
1579 xe_vm_unbind_vma(struct xe_vma *vma, struct xe_exec_queue *q,
1580                  struct xe_sync_entry *syncs, u32 num_syncs,
1581                  bool first_op, bool last_op)
1582 {
1583         struct xe_tile *tile;
1584         struct dma_fence *fence = NULL;
1585         struct dma_fence **fences = NULL;
1586         struct dma_fence_array *cf = NULL;
1587         struct xe_vm *vm = xe_vma_vm(vma);
1588         int cur_fence = 0, i;
1589         int number_tiles = hweight8(vma->tile_present);
1590         int err;
1591         u8 id;
1592
1593         trace_xe_vma_unbind(vma);
1594
1595         if (number_tiles > 1) {
1596                 fences = kmalloc_array(number_tiles, sizeof(*fences),
1597                                        GFP_KERNEL);
1598                 if (!fences)
1599                         return ERR_PTR(-ENOMEM);
1600         }
1601
1602         for_each_tile(tile, vm->xe, id) {
1603                 if (!(vma->tile_present & BIT(id)))
1604                         goto next;
1605
1606                 fence = __xe_pt_unbind_vma(tile, vma, q, first_op ? syncs : NULL,
1607                                            first_op ? num_syncs : 0);
1608                 if (IS_ERR(fence)) {
1609                         err = PTR_ERR(fence);
1610                         goto err_fences;
1611                 }
1612
1613                 if (fences)
1614                         fences[cur_fence++] = fence;
1615
1616 next:
1617                 if (q && vm->pt_root[id] && !list_empty(&q->multi_gt_list))
1618                         q = list_next_entry(q, multi_gt_list);
1619         }
1620
1621         if (fences) {
1622                 cf = dma_fence_array_create(number_tiles, fences,
1623                                             vm->composite_fence_ctx,
1624                                             vm->composite_fence_seqno++,
1625                                             false);
1626                 if (!cf) {
1627                         --vm->composite_fence_seqno;
1628                         err = -ENOMEM;
1629                         goto err_fences;
1630                 }
1631         }
1632
1633         if (last_op) {
1634                 for (i = 0; i < num_syncs; i++)
1635                         xe_sync_entry_signal(&syncs[i], NULL,
1636                                              cf ? &cf->base : fence);
1637         }
1638
1639         return cf ? &cf->base : !fence ? dma_fence_get_stub() : fence;
1640
1641 err_fences:
1642         if (fences) {
1643                 while (cur_fence) {
1644                         /* FIXME: Rewind the previous binds? */
1645                         dma_fence_put(fences[--cur_fence]);
1646                 }
1647                 kfree(fences);
1648         }
1649
1650         return ERR_PTR(err);
1651 }
1652
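     /*
      * Bind a VMA on every tile in its tile_mask, falling back to the VM's
      * default bind queue for a tile when no exec queue is supplied. As with
      * unbind, multiple per-tile fences are wrapped in a dma_fence_array and
      * the syncs are signalled on the last op.
      */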
1653 static struct dma_fence *
1654 xe_vm_bind_vma(struct xe_vma *vma, struct xe_exec_queue *q,
1655                struct xe_sync_entry *syncs, u32 num_syncs,
1656                bool first_op, bool last_op)
1657 {
1658         struct xe_tile *tile;
1659         struct dma_fence *fence;
1660         struct dma_fence **fences = NULL;
1661         struct dma_fence_array *cf = NULL;
1662         struct xe_vm *vm = xe_vma_vm(vma);
1663         int cur_fence = 0, i;
1664         int number_tiles = hweight8(vma->tile_mask);
1665         int err;
1666         u8 id;
1667
1668         trace_xe_vma_bind(vma);
1669
1670         if (number_tiles > 1) {
1671                 fences = kmalloc_array(number_tiles, sizeof(*fences),
1672                                        GFP_KERNEL);
1673                 if (!fences)
1674                         return ERR_PTR(-ENOMEM);
1675         }
1676
1677         for_each_tile(tile, vm->xe, id) {
1678                 if (!(vma->tile_mask & BIT(id)))
1679                         goto next;
1680
1681                 fence = __xe_pt_bind_vma(tile, vma, q ? q : vm->q[id],
1682                                          first_op ? syncs : NULL,
1683                                          first_op ? num_syncs : 0,
1684                                          vma->tile_present & BIT(id));
1685                 if (IS_ERR(fence)) {
1686                         err = PTR_ERR(fence);
1687                         goto err_fences;
1688                 }
1689
1690                 if (fences)
1691                         fences[cur_fence++] = fence;
1692
1693 next:
1694                 if (q && vm->pt_root[id] && !list_empty(&q->multi_gt_list))
1695                         q = list_next_entry(q, multi_gt_list);
1696         }
1697
1698         if (fences) {
1699                 cf = dma_fence_array_create(number_tiles, fences,
1700                                             vm->composite_fence_ctx,
1701                                             vm->composite_fence_seqno++,
1702                                             false);
1703                 if (!cf) {
1704                         --vm->composite_fence_seqno;
1705                         err = -ENOMEM;
1706                         goto err_fences;
1707                 }
1708         }
1709
1710         if (last_op) {
1711                 for (i = 0; i < num_syncs; i++)
1712                         xe_sync_entry_signal(&syncs[i], NULL,
1713                                              cf ? &cf->base : fence);
1714         }
1715
1716         return cf ? &cf->base : fence;
1717
1718 err_fences:
1719         if (fences) {
1720                 while (cur_fence) {
1721                         /* FIXME: Rewind the previous binds? */
1722                         dma_fence_put(fences[--cur_fence]);
1723                 }
1724                 kfree(fences);
1725         }
1726
1727         return ERR_PTR(err);
1728 }
1729
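     /*
      * Fence handed back through the user syncs for asynchronous bind
      * operations. It is signalled from a callback on the underlying
      * bind/unbind fence (wait_fence) and propagates that fence's error
      * status.
      */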
1730 struct async_op_fence {
1731         struct dma_fence fence;
1732         struct dma_fence *wait_fence;
1733         struct dma_fence_cb cb;
1734         struct xe_vm *vm;
1735         wait_queue_head_t wq;
1736         bool started;
1737 };
1738
1739 static const char *async_op_fence_get_driver_name(struct dma_fence *dma_fence)
1740 {
1741         return "xe";
1742 }
1743
1744 static const char *
1745 async_op_fence_get_timeline_name(struct dma_fence *dma_fence)
1746 {
1747         return "async_op_fence";
1748 }
1749
1750 static const struct dma_fence_ops async_op_fence_ops = {
1751         .get_driver_name = async_op_fence_get_driver_name,
1752         .get_timeline_name = async_op_fence_get_timeline_name,
1753 };
1754
1755 static void async_op_fence_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
1756 {
1757         struct async_op_fence *afence =
1758                 container_of(cb, struct async_op_fence, cb);
1759
1760         afence->fence.error = afence->wait_fence->error;
1761         dma_fence_signal(&afence->fence);
1762         xe_vm_put(afence->vm);
1763         dma_fence_put(afence->wait_fence);
1764         dma_fence_put(&afence->fence);
1765 }
1766
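     /*
      * Attach the async op fence to the real bind/unbind fence. If that fence
      * has already signalled (dma_fence_add_callback() returns -ENOENT), the
      * error is propagated and the async fence is signalled immediately, with
      * the references taken for the callback dropped again.
      */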
1767 static void add_async_op_fence_cb(struct xe_vm *vm,
1768                                   struct dma_fence *fence,
1769                                   struct async_op_fence *afence)
1770 {
1771         int ret;
1772
1773         if (!xe_vm_no_dma_fences(vm)) {
1774                 afence->started = true;
1775                 smp_wmb();
1776                 wake_up_all(&afence->wq);
1777         }
1778
1779         afence->wait_fence = dma_fence_get(fence);
1780         afence->vm = xe_vm_get(vm);
1781         dma_fence_get(&afence->fence);
1782         ret = dma_fence_add_callback(fence, &afence->cb, async_op_fence_cb);
1783         if (ret == -ENOENT) {
1784                 afence->fence.error = afence->wait_fence->error;
1785                 dma_fence_signal(&afence->fence);
1786         }
1787         if (ret) {
1788                 xe_vm_put(vm);
1789                 dma_fence_put(afence->wait_fence);
1790                 dma_fence_put(&afence->fence);
1791         }
1792         XE_WARN_ON(ret && ret != -ENOENT);
1793 }
1794
1795 int xe_vm_async_fence_wait_start(struct dma_fence *fence)
1796 {
1797         if (fence->ops == &async_op_fence_ops) {
1798                 struct async_op_fence *afence =
1799                         container_of(fence, struct async_op_fence, fence);
1800
1801                 XE_WARN_ON(xe_vm_no_dma_fences(afence->vm));
1802
1803                 smp_rmb();
1804                 return wait_event_interruptible(afence->wq, afence->started);
1805         }
1806
1807         return 0;
1808 }
1809
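     /*
      * Common bind helper: either perform the bind now via xe_vm_bind_vma(),
      * or, for deferred binds in fault mode, simply signal the syncs with a
      * stub fence. The optional async fence is attached to whichever fence
      * results.
      */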
1810 static int __xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma,
1811                         struct xe_exec_queue *q, struct xe_sync_entry *syncs,
1812                         u32 num_syncs, struct async_op_fence *afence,
1813                         bool immediate, bool first_op, bool last_op)
1814 {
1815         struct dma_fence *fence;
1816
1817         xe_vm_assert_held(vm);
1818
1819         if (immediate) {
1820                 fence = xe_vm_bind_vma(vma, q, syncs, num_syncs, first_op,
1821                                        last_op);
1822                 if (IS_ERR(fence))
1823                         return PTR_ERR(fence);
1824         } else {
1825                 int i;
1826
1827                 XE_WARN_ON(!xe_vm_in_fault_mode(vm));
1828
1829                 fence = dma_fence_get_stub();
1830                 if (last_op) {
1831                         for (i = 0; i < num_syncs; i++)
1832                                 xe_sync_entry_signal(&syncs[i], NULL, fence);
1833                 }
1834         }
1835         if (afence)
1836                 add_async_op_fence_cb(vm, fence, afence);
1837
1838         dma_fence_put(fence);
1839         return 0;
1840 }
1841
1842 static int xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma, struct xe_exec_queue *q,
1843                       struct xe_bo *bo, struct xe_sync_entry *syncs,
1844                       u32 num_syncs, struct async_op_fence *afence,
1845                       bool immediate, bool first_op, bool last_op)
1846 {
1847         int err;
1848
1849         xe_vm_assert_held(vm);
1850         xe_bo_assert_held(bo);
1851
1852         if (bo && immediate) {
1853                 err = xe_bo_validate(bo, vm, true);
1854                 if (err)
1855                         return err;
1856         }
1857
1858         return __xe_vm_bind(vm, vma, q, syncs, num_syncs, afence, immediate,
1859                             first_op, last_op);
1860 }
1861
1862 static int xe_vm_unbind(struct xe_vm *vm, struct xe_vma *vma,
1863                         struct xe_exec_queue *q, struct xe_sync_entry *syncs,
1864                         u32 num_syncs, struct async_op_fence *afence,
1865                         bool first_op, bool last_op)
1866 {
1867         struct dma_fence *fence;
1868
1869         xe_vm_assert_held(vm);
1870         xe_bo_assert_held(xe_vma_bo(vma));
1871
1872         fence = xe_vm_unbind_vma(vma, q, syncs, num_syncs, first_op, last_op);
1873         if (IS_ERR(fence))
1874                 return PTR_ERR(fence);
1875         if (afence)
1876                 add_async_op_fence_cb(vm, fence, afence);
1877
1878         xe_vma_destroy(vma, fence);
1879         dma_fence_put(fence);
1880
1881         return 0;
1882 }
1883
1884 static int vm_set_error_capture_address(struct xe_device *xe, struct xe_vm *vm,
1885                                         u64 value)
1886 {
1887         if (XE_IOCTL_DBG(xe, !value))
1888                 return -EINVAL;
1889
1890         if (XE_IOCTL_DBG(xe, !(vm->flags & XE_VM_FLAG_ASYNC_BIND_OPS)))
1891                 return -EOPNOTSUPP;
1892
1893         if (XE_IOCTL_DBG(xe, vm->async_ops.error_capture.addr))
1894                 return -EOPNOTSUPP;
1895
1896         vm->async_ops.error_capture.mm = current->mm;
1897         vm->async_ops.error_capture.addr = value;
1898         init_waitqueue_head(&vm->async_ops.error_capture.wq);
1899
1900         return 0;
1901 }
1902
1903 typedef int (*xe_vm_set_property_fn)(struct xe_device *xe, struct xe_vm *vm,
1904                                      u64 value);
1905
1906 static const xe_vm_set_property_fn vm_set_property_funcs[] = {
1907         [XE_VM_PROPERTY_BIND_OP_ERROR_CAPTURE_ADDRESS] =
1908                 vm_set_error_capture_address,
1909 };
1910
1911 static int vm_user_ext_set_property(struct xe_device *xe, struct xe_vm *vm,
1912                                     u64 extension)
1913 {
1914         u64 __user *address = u64_to_user_ptr(extension);
1915         struct drm_xe_ext_vm_set_property ext;
1916         int err;
1917
1918         err = __copy_from_user(&ext, address, sizeof(ext));
1919         if (XE_IOCTL_DBG(xe, err))
1920                 return -EFAULT;
1921
1922         if (XE_IOCTL_DBG(xe, ext.property >=
1923                          ARRAY_SIZE(vm_set_property_funcs)) ||
1924             XE_IOCTL_DBG(xe, ext.pad) ||
1925             XE_IOCTL_DBG(xe, ext.reserved[0] || ext.reserved[1]))
1926                 return -EINVAL;
1927
1928         return vm_set_property_funcs[ext.property](xe, vm, ext.value);
1929 }
1930
1931 typedef int (*xe_vm_user_extension_fn)(struct xe_device *xe, struct xe_vm *vm,
1932                                        u64 extension);
1933
1934 static const xe_vm_user_extension_fn vm_user_extension_funcs[] = {
1935         [XE_VM_EXTENSION_SET_PROPERTY] = vm_user_ext_set_property,
1936 };
1937
1938 #define MAX_USER_EXTENSIONS     16
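     /*
      * Walk the chain of user extensions, dispatching each extension to its
      * handler and capping the chain length at MAX_USER_EXTENSIONS.
      */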
1939 static int vm_user_extensions(struct xe_device *xe, struct xe_vm *vm,
1940                               u64 extensions, int ext_number)
1941 {
1942         u64 __user *address = u64_to_user_ptr(extensions);
1943         struct xe_user_extension ext;
1944         int err;
1945
1946         if (XE_IOCTL_DBG(xe, ext_number >= MAX_USER_EXTENSIONS))
1947                 return -E2BIG;
1948
1949         err = __copy_from_user(&ext, address, sizeof(ext));
1950         if (XE_IOCTL_DBG(xe, err))
1951                 return -EFAULT;
1952
1953         if (XE_IOCTL_DBG(xe, ext.pad) ||
1954             XE_IOCTL_DBG(xe, ext.name >=
1955                          ARRAY_SIZE(vm_user_extension_funcs)))
1956                 return -EINVAL;
1957
1958         err = vm_user_extension_funcs[ext.name](xe, vm, extensions);
1959         if (XE_IOCTL_DBG(xe, err))
1960                 return err;
1961
1962         if (ext.next_extension)
1963                 return vm_user_extensions(xe, vm, ext.next_extension,
1964                                           ++ext_number);
1965
1966         return 0;
1967 }
1968
1969 #define ALL_DRM_XE_VM_CREATE_FLAGS (DRM_XE_VM_CREATE_SCRATCH_PAGE | \
1970                                     DRM_XE_VM_CREATE_COMPUTE_MODE | \
1971                                     DRM_XE_VM_CREATE_ASYNC_BIND_OPS | \
1972                                     DRM_XE_VM_CREATE_FAULT_MODE)
1973
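     /*
      * VM_CREATE IOCTL: validate the creation flags and their mutual
      * constraints, create the VM, process any user extensions, allocate the
      * per-file VM id and, on devices with ASID support, an ASID for the VM.
      */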
1974 int xe_vm_create_ioctl(struct drm_device *dev, void *data,
1975                        struct drm_file *file)
1976 {
1977         struct xe_device *xe = to_xe_device(dev);
1978         struct xe_file *xef = to_xe_file(file);
1979         struct drm_xe_vm_create *args = data;
1980         struct xe_vm *vm;
1981         u32 id, asid;
1982         int err;
1983         u32 flags = 0;
1984
1985         if (XE_WA(xe_root_mmio_gt(xe), 14016763929))
1986                 args->flags |= DRM_XE_VM_CREATE_SCRATCH_PAGE;
1987
1988         if (XE_IOCTL_DBG(xe, args->flags & DRM_XE_VM_CREATE_FAULT_MODE &&
1989                          !xe->info.supports_usm))
1990                 return -EINVAL;
1991
1992         if (XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
1993                 return -EINVAL;
1994
1995         if (XE_IOCTL_DBG(xe, args->flags & ~ALL_DRM_XE_VM_CREATE_FLAGS))
1996                 return -EINVAL;
1997
1998         if (XE_IOCTL_DBG(xe, args->flags & DRM_XE_VM_CREATE_SCRATCH_PAGE &&
1999                          args->flags & DRM_XE_VM_CREATE_FAULT_MODE))
2000                 return -EINVAL;
2001
2002         if (XE_IOCTL_DBG(xe, args->flags & DRM_XE_VM_CREATE_COMPUTE_MODE &&
2003                          args->flags & DRM_XE_VM_CREATE_FAULT_MODE))
2004                 return -EINVAL;
2005
2006         if (XE_IOCTL_DBG(xe, args->flags & DRM_XE_VM_CREATE_FAULT_MODE &&
2007                          xe_device_in_non_fault_mode(xe)))
2008                 return -EINVAL;
2009
2010         if (XE_IOCTL_DBG(xe, !(args->flags & DRM_XE_VM_CREATE_FAULT_MODE) &&
2011                          xe_device_in_fault_mode(xe)))
2012                 return -EINVAL;
2013
2014         if (args->flags & DRM_XE_VM_CREATE_SCRATCH_PAGE)
2015                 flags |= XE_VM_FLAG_SCRATCH_PAGE;
2016         if (args->flags & DRM_XE_VM_CREATE_COMPUTE_MODE)
2017                 flags |= XE_VM_FLAG_COMPUTE_MODE;
2018         if (args->flags & DRM_XE_VM_CREATE_ASYNC_BIND_OPS)
2019                 flags |= XE_VM_FLAG_ASYNC_BIND_OPS;
2020         if (args->flags & DRM_XE_VM_CREATE_FAULT_MODE)
2021                 flags |= XE_VM_FLAG_FAULT_MODE;
2022
2023         vm = xe_vm_create(xe, flags);
2024         if (IS_ERR(vm))
2025                 return PTR_ERR(vm);
2026
2027         if (args->extensions) {
2028                 err = vm_user_extensions(xe, vm, args->extensions, 0);
2029                 if (XE_IOCTL_DBG(xe, err)) {
2030                         xe_vm_close_and_put(vm);
2031                         return err;
2032                 }
2033         }
2034
2035         mutex_lock(&xef->vm.lock);
2036         err = xa_alloc(&xef->vm.xa, &id, vm, xa_limit_32b, GFP_KERNEL);
2037         mutex_unlock(&xef->vm.lock);
2038         if (err) {
2039                 xe_vm_close_and_put(vm);
2040                 return err;
2041         }
2042
2043         if (xe->info.has_asid) {
2044                 mutex_lock(&xe->usm.lock);
2045                 err = xa_alloc_cyclic(&xe->usm.asid_to_vm, &asid, vm,
2046                                       XA_LIMIT(0, XE_MAX_ASID - 1),
2047                                       &xe->usm.next_asid, GFP_KERNEL);
2048                 mutex_unlock(&xe->usm.lock);
2049                 if (err) {
2050                         xe_vm_close_and_put(vm);
2051                         return err;
2052                 }
2053                 vm->usm.asid = asid;
2054         }
2055
2056         args->vm_id = id;
2057
2058 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG_MEM)
2059         /* Warning: Security issue - never enable by default */
2060         args->reserved[0] = xe_bo_main_addr(vm->pt_root[0]->bo, XE_PAGE_SIZE);
2061 #endif
2062
2063         return 0;
2064 }
2065
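     /*
      * VM_DESTROY IOCTL: remove the VM from the file's xarray, refusing while
      * compute exec queues are still attached, then close and put the VM.
      */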
2066 int xe_vm_destroy_ioctl(struct drm_device *dev, void *data,
2067                         struct drm_file *file)
2068 {
2069         struct xe_device *xe = to_xe_device(dev);
2070         struct xe_file *xef = to_xe_file(file);
2071         struct drm_xe_vm_destroy *args = data;
2072         struct xe_vm *vm;
2073         int err = 0;
2074
2075         if (XE_IOCTL_DBG(xe, args->pad) ||
2076             XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
2077                 return -EINVAL;
2078
2079         mutex_lock(&xef->vm.lock);
2080         vm = xa_load(&xef->vm.xa, args->vm_id);
2081         if (XE_IOCTL_DBG(xe, !vm))
2082                 err = -ENOENT;
2083         else if (XE_IOCTL_DBG(xe, vm->preempt.num_exec_queues))
2084                 err = -EBUSY;
2085         else
2086                 xa_erase(&xef->vm.xa, args->vm_id);
2087         mutex_unlock(&xef->vm.lock);
2088
2089         if (!err)
2090                 xe_vm_close_and_put(vm);
2091
2092         return err;
2093 }
2094
2095 static const u32 region_to_mem_type[] = {
2096         XE_PL_TT,
2097         XE_PL_VRAM0,
2098         XE_PL_VRAM1,
2099 };
2100
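     /*
      * Prefetch: migrate the backing BO (if any) to the requested memory
      * region, then rebind the VMA only if a tile mapping is missing or has
      * been invalidated; otherwise there is nothing to do and the fences are
      * signalled directly.
      */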
2101 static int xe_vm_prefetch(struct xe_vm *vm, struct xe_vma *vma,
2102                           struct xe_exec_queue *q, u32 region,
2103                           struct xe_sync_entry *syncs, u32 num_syncs,
2104                           struct async_op_fence *afence, bool first_op,
2105                           bool last_op)
2106 {
2107         int err;
2108
2109         XE_WARN_ON(region >= ARRAY_SIZE(region_to_mem_type));
2110
2111         if (!xe_vma_has_no_bo(vma)) {
2112                 err = xe_bo_migrate(xe_vma_bo(vma), region_to_mem_type[region]);
2113                 if (err)
2114                         return err;
2115         }
2116
2117         if (vma->tile_mask != (vma->tile_present & ~vma->usm.tile_invalidated)) {
2118                 return xe_vm_bind(vm, vma, q, xe_vma_bo(vma), syncs, num_syncs,
2119                                   afence, true, first_op, last_op);
2120         } else {
2121                 int i;
2122
2123                 /* Nothing to do, signal fences now */
2124                 if (last_op) {
2125                         for (i = 0; i < num_syncs; i++)
2126                                 xe_sync_entry_signal(&syncs[i], NULL,
2127                                                      dma_fence_get_stub());
2128                 }
2129                 if (afence)
2130                         dma_fence_signal(&afence->fence);
2131                 return 0;
2132         }
2133 }
2134
2135 #define VM_BIND_OP(op)  (op & 0xffff)
2136
2137 struct ttm_buffer_object *xe_vm_ttm_bo(struct xe_vm *vm)
2138 {
2139         int idx = vm->flags & XE_VM_FLAG_MIGRATION ?
2140                 XE_VM_FLAG_TILE_ID(vm->flags) : 0;
2141
2142         /* Safe to use index 0 as all BOs in the VM share a single dma-resv lock */
2143         return &vm->pt_root[idx]->bo->ttm;
2144 }
2145
2146 static void xe_vm_tv_populate(struct xe_vm *vm, struct ttm_validate_buffer *tv)
2147 {
2148         tv->num_shared = 1;
2149         tv->bo = xe_vm_ttm_bo(vm);
2150 }
2151
2152 static void vm_set_async_error(struct xe_vm *vm, int err)
2153 {
2154         lockdep_assert_held(&vm->lock);
2155         vm->async_ops.error = err;
2156 }
2157
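     /*
      * Sanity check a bind IOCTL operation against the existing VMAs before
      * any GPUVA ops are created: synchronous maps may not overlap an existing
      * VMA, and unmap/prefetch must hit an existing mapping (an exact range
      * match when synchronous).
      */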
2158 static int vm_bind_ioctl_lookup_vma(struct xe_vm *vm, struct xe_bo *bo,
2159                                     u64 addr, u64 range, u32 op)
2160 {
2161         struct xe_device *xe = vm->xe;
2162         struct xe_vma *vma;
2163         bool async = !!(op & XE_VM_BIND_FLAG_ASYNC);
2164
2165         lockdep_assert_held(&vm->lock);
2166
2167         switch (VM_BIND_OP(op)) {
2168         case XE_VM_BIND_OP_MAP:
2169         case XE_VM_BIND_OP_MAP_USERPTR:
2170                 vma = xe_vm_find_overlapping_vma(vm, addr, range);
2171                 if (XE_IOCTL_DBG(xe, vma && !async))
2172                         return -EBUSY;
2173                 break;
2174         case XE_VM_BIND_OP_UNMAP:
2175         case XE_VM_BIND_OP_PREFETCH:
2176                 vma = xe_vm_find_overlapping_vma(vm, addr, range);
2177                 if (XE_IOCTL_DBG(xe, !vma))
2178                         /* Not an actual error; the IOCTL cleans up and returns 0 */
2179                         return -ENODATA;
2180                 if (XE_IOCTL_DBG(xe, (xe_vma_start(vma) != addr ||
2181                                       xe_vma_end(vma) != addr + range) && !async))
2182                         return -EINVAL;
2183                 break;
2184         case XE_VM_BIND_OP_UNMAP_ALL:
2185                 if (XE_IOCTL_DBG(xe, list_empty(&bo->ttm.base.gpuva.list)))
2186                         /* Not an actual error; the IOCTL cleans up and returns 0 */
2187                         return -ENODATA;
2188                 break;
2189         default:
2190                 XE_WARN_ON("NOT POSSIBLE");
2191                 return -EINVAL;
2192         }
2193
2194         return 0;
2195 }
2196
2197 static void prep_vma_destroy(struct xe_vm *vm, struct xe_vma *vma,
2198                              bool post_commit)
2199 {
2200         down_read(&vm->userptr.notifier_lock);
2201         vma->gpuva.flags |= XE_VMA_DESTROYED;
2202         up_read(&vm->userptr.notifier_lock);
2203         if (post_commit)
2204                 xe_vm_remove_vma(vm, vma);
2205 }
2206
2207 #undef ULL
2208 #define ULL     unsigned long long
2209
2210 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG_VM)
2211 static void print_op(struct xe_device *xe, struct drm_gpuva_op *op)
2212 {
2213         struct xe_vma *vma;
2214
2215         switch (op->op) {
2216         case DRM_GPUVA_OP_MAP:
2217                 vm_dbg(&xe->drm, "MAP: addr=0x%016llx, range=0x%016llx",
2218                        (ULL)op->map.va.addr, (ULL)op->map.va.range);
2219                 break;
2220         case DRM_GPUVA_OP_REMAP:
2221                 vma = gpuva_to_vma(op->remap.unmap->va);
2222                 vm_dbg(&xe->drm, "REMAP:UNMAP: addr=0x%016llx, range=0x%016llx, keep=%d",
2223                        (ULL)xe_vma_start(vma), (ULL)xe_vma_size(vma),
2224                        op->unmap.keep ? 1 : 0);
2225                 if (op->remap.prev)
2226                         vm_dbg(&xe->drm,
2227                                "REMAP:PREV: addr=0x%016llx, range=0x%016llx",
2228                                (ULL)op->remap.prev->va.addr,
2229                                (ULL)op->remap.prev->va.range);
2230                 if (op->remap.next)
2231                         vm_dbg(&xe->drm,
2232                                "REMAP:NEXT: addr=0x%016llx, range=0x%016llx",
2233                                (ULL)op->remap.next->va.addr,
2234                                (ULL)op->remap.next->va.range);
2235                 break;
2236         case DRM_GPUVA_OP_UNMAP:
2237                 vma = gpuva_to_vma(op->unmap.va);
2238                 vm_dbg(&xe->drm, "UNMAP: addr=0x%016llx, range=0x%016llx, keep=%d",
2239                        (ULL)xe_vma_start(vma), (ULL)xe_vma_size(vma),
2240                        op->unmap.keep ? 1 : 0);
2241                 break;
2242         case DRM_GPUVA_OP_PREFETCH:
2243                 vma = gpuva_to_vma(op->prefetch.va);
2244                 vm_dbg(&xe->drm, "PREFETCH: addr=0x%016llx, range=0x%016llx",
2245                        (ULL)xe_vma_start(vma), (ULL)xe_vma_size(vma));
2246                 break;
2247         default:
2248                 XE_WARN_ON("NOT POSSIBLE");
2249         }
2250 }
2251 #else
2252 static void print_op(struct xe_device *xe, struct drm_gpuva_op *op)
2253 {
2254 }
2255 #endif
2256
2257 /*
2258  * Create the operations list from the IOCTL arguments, and set up operation
2259  * fields so the parse and commit steps are decoupled from them. This can fail.
2260  */
2261 static struct drm_gpuva_ops *
2262 vm_bind_ioctl_ops_create(struct xe_vm *vm, struct xe_bo *bo,
2263                          u64 bo_offset_or_userptr, u64 addr, u64 range,
2264                          u32 operation, u8 tile_mask, u32 region)
2265 {
2266         struct drm_gem_object *obj = bo ? &bo->ttm.base : NULL;
2267         struct ww_acquire_ctx ww;
2268         struct drm_gpuva_ops *ops;
2269         struct drm_gpuva_op *__op;
2270         struct xe_vma_op *op;
2271         struct drm_gpuvm_bo *vm_bo;
2272         int err;
2273
2274         lockdep_assert_held_write(&vm->lock);
2275
2276         vm_dbg(&vm->xe->drm,
2277                "op=%d, addr=0x%016llx, range=0x%016llx, bo_offset_or_userptr=0x%016llx",
2278                VM_BIND_OP(operation), (ULL)addr, (ULL)range,
2279                (ULL)bo_offset_or_userptr);
2280
2281         switch (VM_BIND_OP(operation)) {
2282         case XE_VM_BIND_OP_MAP:
2283         case XE_VM_BIND_OP_MAP_USERPTR:
2284                 ops = drm_gpuvm_sm_map_ops_create(&vm->gpuvm, addr, range,
2285                                                   obj, bo_offset_or_userptr);
2286                 if (IS_ERR(ops))
2287                         return ops;
2288
2289                 drm_gpuva_for_each_op(__op, ops) {
2290                         struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2291
2292                         op->tile_mask = tile_mask;
2293                         op->map.immediate =
2294                                 operation & XE_VM_BIND_FLAG_IMMEDIATE;
2295                         op->map.read_only =
2296                                 operation & XE_VM_BIND_FLAG_READONLY;
2297                         op->map.is_null = operation & XE_VM_BIND_FLAG_NULL;
2298                 }
2299                 break;
2300         case XE_VM_BIND_OP_UNMAP:
2301                 ops = drm_gpuvm_sm_unmap_ops_create(&vm->gpuvm, addr, range);
2302                 if (IS_ERR(ops))
2303                         return ops;
2304
2305                 drm_gpuva_for_each_op(__op, ops) {
2306                         struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2307
2308                         op->tile_mask = tile_mask;
2309                 }
2310                 break;
2311         case XE_VM_BIND_OP_PREFETCH:
2312                 ops = drm_gpuvm_prefetch_ops_create(&vm->gpuvm, addr, range);
2313                 if (IS_ERR(ops))
2314                         return ops;
2315
2316                 drm_gpuva_for_each_op(__op, ops) {
2317                         struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2318
2319                         op->tile_mask = tile_mask;
2320                         op->prefetch.region = region;
2321                 }
2322                 break;
2323         case XE_VM_BIND_OP_UNMAP_ALL:
2324                 XE_WARN_ON(!bo);
2325
2326                 err = xe_bo_lock(bo, &ww, 0, true);
2327                 if (err)
2328                         return ERR_PTR(err);
2329
2330                 vm_bo = drm_gpuvm_bo_find(&vm->gpuvm, obj);
2331                 if (!vm_bo) {
2332                         /* No mappings for this BO; nothing to unmap */
                             xe_bo_unlock(bo, &ww);
                             ops = ERR_PTR(-ENODATA);
                             break;
                     }
2333
2334                 ops = drm_gpuvm_bo_unmap_ops_create(vm_bo);
2335                 drm_gpuvm_bo_put(vm_bo);
2336                 xe_bo_unlock(bo, &ww);
2337                 if (IS_ERR(ops))
2338                         return ops;
2339
2340                 drm_gpuva_for_each_op(__op, ops) {
2341                         struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2342
2343                         op->tile_mask = tile_mask;
2344                 }
2345                 break;
2346         default:
2347                 XE_WARN_ON("NOT POSSIBLE");
2348                 ops = ERR_PTR(-EINVAL);
2349         }
2350
2351 #ifdef TEST_VM_ASYNC_OPS_ERROR
2352         if (operation & FORCE_ASYNC_OP_ERROR) {
2353                 op = list_first_entry_or_null(&ops->list, struct xe_vma_op,
2354                                               base.entry);
2355                 if (op)
2356                         op->inject_error = true;
2357         }
2358 #endif
2359
2360         if (!IS_ERR(ops))
2361                 drm_gpuva_for_each_op(__op, ops)
2362                         print_op(vm->xe, __op);
2363
2364         return ops;
2365 }
2366
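     /*
      * Create the xe_vma backing a GPUVA map op. Userptr VMAs have their pages
      * pinned up front; VMAs backed by external (non-VM-private) BOs are added
      * to the VM's extobj list and get preempt fences installed. On failure
      * the partially constructed VMA is destroyed again.
      */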
2367 static struct xe_vma *new_vma(struct xe_vm *vm, struct drm_gpuva_op_map *op,
2368                               u8 tile_mask, bool read_only, bool is_null)
2369 {
2370         struct xe_bo *bo = op->gem.obj ? gem_to_xe_bo(op->gem.obj) : NULL;
2371         struct xe_vma *vma;
2372         struct ww_acquire_ctx ww;
2373         int err;
2374
2375         lockdep_assert_held_write(&vm->lock);
2376
2377         if (bo) {
2378                 err = xe_bo_lock(bo, &ww, 0, true);
2379                 if (err)
2380                         return ERR_PTR(err);
2381         }
2382         vma = xe_vma_create(vm, bo, op->gem.offset,
2383                             op->va.addr, op->va.addr +
2384                             op->va.range - 1, read_only, is_null,
2385                             tile_mask);
2386         if (bo)
2387                 xe_bo_unlock(bo, &ww);
2388
2389         if (xe_vma_is_userptr(vma)) {
2390                 err = xe_vma_userptr_pin_pages(vma);
2391                 if (err) {
2392                         prep_vma_destroy(vm, vma, false);
2393                         xe_vma_destroy_unlocked(vma);
2394                         return ERR_PTR(err);
2395                 }
2396         } else if (!xe_vma_has_no_bo(vma) && !bo->vm) {
2397                 vm_insert_extobj(vm, vma);
2398                 err = add_preempt_fences(vm, bo);
2399                 if (err) {
2400                         prep_vma_destroy(vm, vma, false);
2401                         xe_vma_destroy_unlocked(vma);
2402                         return ERR_PTR(err);
2403                 }
2404         }
2405
2406         return vma;
2407 }
2408
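     /*
      * Helpers for the VMA's recorded maximum PTE size (tracked in the gpuva
      * flags). On remap this is used to decide whether the untouched prev/next
      * portion is aligned to that size and can therefore skip the rebind
      * (skip_prev / skip_next).
      */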
2409 static u64 xe_vma_max_pte_size(struct xe_vma *vma)
2410 {
2411         if (vma->gpuva.flags & XE_VMA_PTE_1G)
2412                 return SZ_1G;
2413         else if (vma->gpuva.flags & XE_VMA_PTE_2M)
2414                 return SZ_2M;
2415
2416         return SZ_4K;
2417 }
2418
2419 static u64 xe_vma_set_pte_size(struct xe_vma *vma, u64 size)
2420 {
2421         switch (size) {
2422         case SZ_1G:
2423                 vma->gpuva.flags |= XE_VMA_PTE_1G;
2424                 break;
2425         case SZ_2M:
2426                 vma->gpuva.flags |= XE_VMA_PTE_2M;
2427                 break;
2428         }
2429
2430         return SZ_4K;
2431 }
2432
2433 /*
2434  * Parse operations list and create any resources needed for the operations
2435  * prior to fully committing to the operations. This setup can fail.
2436  */
2437 static int vm_bind_ioctl_ops_parse(struct xe_vm *vm, struct xe_exec_queue *q,
2438                                    struct drm_gpuva_ops **ops, int num_ops_list,
2439                                    struct xe_sync_entry *syncs, u32 num_syncs,
2440                                    struct list_head *ops_list, bool async)
2441 {
2442         struct xe_vma_op *last_op = NULL;
2443         struct list_head *async_list = NULL;
2444         struct async_op_fence *fence = NULL;
2445         int err, i;
2446
2447         lockdep_assert_held_write(&vm->lock);
2448         XE_WARN_ON(num_ops_list > 1 && !async);
2449
2450         if (num_syncs && async) {
2451                 u64 seqno;
2452
2453                 fence = kmalloc(sizeof(*fence), GFP_KERNEL);
2454                 if (!fence)
2455                         return -ENOMEM;
2456
2457                 seqno = q ? ++q->bind.fence_seqno : ++vm->async_ops.fence.seqno;
2458                 dma_fence_init(&fence->fence, &async_op_fence_ops,
2459                                &vm->async_ops.lock, q ? q->bind.fence_ctx :
2460                                vm->async_ops.fence.context, seqno);
2461
2462                 if (!xe_vm_no_dma_fences(vm)) {
2463                         fence->vm = vm;
2464                         fence->started = false;
2465                         init_waitqueue_head(&fence->wq);
2466                 }
2467         }
2468
2469         for (i = 0; i < num_ops_list; ++i) {
2470                 struct drm_gpuva_ops *__ops = ops[i];
2471                 struct drm_gpuva_op *__op;
2472
2473                 drm_gpuva_for_each_op(__op, __ops) {
2474                         struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2475                         bool first = !async_list;
2476
2477                         XE_WARN_ON(!first && !async);
2478
2479                         INIT_LIST_HEAD(&op->link);
2480                         if (first)
2481                                 async_list = ops_list;
2482                         list_add_tail(&op->link, async_list);
2483
2484                         if (first) {
2485                                 op->flags |= XE_VMA_OP_FIRST;
2486                                 op->num_syncs = num_syncs;
2487                                 op->syncs = syncs;
2488                         }
2489
2490                         op->q = q;
2491
2492                         switch (op->base.op) {
2493                         case DRM_GPUVA_OP_MAP:
2494                         {
2495                                 struct xe_vma *vma;
2496
2497                                 vma = new_vma(vm, &op->base.map,
2498                                               op->tile_mask, op->map.read_only,
2499                                               op->map.is_null);
2500                                 if (IS_ERR(vma)) {
2501                                         err = PTR_ERR(vma);
2502                                         goto free_fence;
2503                                 }
2504
2505                                 op->map.vma = vma;
2506                                 break;
2507                         }
2508                         case DRM_GPUVA_OP_REMAP:
2509                         {
2510                                 struct xe_vma *old =
2511                                         gpuva_to_vma(op->base.remap.unmap->va);
2512
2513                                 op->remap.start = xe_vma_start(old);
2514                                 op->remap.range = xe_vma_size(old);
2515
2516                                 if (op->base.remap.prev) {
2517                                         struct xe_vma *vma;
2518                                         bool read_only =
2519                                                 op->base.remap.unmap->va->flags &
2520                                                 XE_VMA_READ_ONLY;
2521                                         bool is_null =
2522                                                 op->base.remap.unmap->va->flags &
2523                                                 DRM_GPUVA_SPARSE;
2524
2525                                         vma = new_vma(vm, op->base.remap.prev,
2526                                                       op->tile_mask, read_only,
2527                                                       is_null);
2528                                         if (IS_ERR(vma)) {
2529                                                 err = PTR_ERR(vma);
2530                                                 goto free_fence;
2531                                         }
2532
2533                                         op->remap.prev = vma;
2534
2535                                         /*
2536                                          * Userptr creates a new SG mapping so
2537                                          * we must also rebind.
2538                                          */
2539                                         op->remap.skip_prev = !xe_vma_is_userptr(old) &&
2540                                                 IS_ALIGNED(xe_vma_end(vma),
2541                                                            xe_vma_max_pte_size(old));
2542                                         if (op->remap.skip_prev) {
2543                                                 xe_vma_set_pte_size(vma, xe_vma_max_pte_size(old));
2544                                                 op->remap.range -=
2545                                                         xe_vma_end(vma) -
2546                                                         xe_vma_start(old);
2547                                                 op->remap.start = xe_vma_end(vma);
2548                                         }
2549                                 }
2550
2551                                 if (op->base.remap.next) {
2552                                         struct xe_vma *vma;
2553                                         bool read_only =
2554                                                 op->base.remap.unmap->va->flags &
2555                                                 XE_VMA_READ_ONLY;
2556
2557                                         bool is_null =
2558                                                 op->base.remap.unmap->va->flags &
2559                                                 DRM_GPUVA_SPARSE;
2560
2561                                         vma = new_vma(vm, op->base.remap.next,
2562                                                       op->tile_mask, read_only,
2563                                                       is_null);
2564                                         if (IS_ERR(vma)) {
2565                                                 err = PTR_ERR(vma);
2566                                                 goto free_fence;
2567                                         }
2568
2569                                         op->remap.next = vma;
2570
2571                                         /*
2572                                          * Userptr creates a new SG mapping so
2573                                          * we must also rebind.
2574                                          */
2575                                         op->remap.skip_next = !xe_vma_is_userptr(old) &&
2576                                                 IS_ALIGNED(xe_vma_start(vma),
2577                                                            xe_vma_max_pte_size(old));
2578                                         if (op->remap.skip_next) {
2579                                                 xe_vma_set_pte_size(vma, xe_vma_max_pte_size(old));
2580                                                 op->remap.range -=
2581                                                         xe_vma_end(old) -
2582                                                         xe_vma_start(vma);
2583                                         }
2584                                 }
2585                                 break;
2586                         }
2587                         case DRM_GPUVA_OP_UNMAP:
2588                         case DRM_GPUVA_OP_PREFETCH:
2589                                 /* Nothing to do */
2590                                 break;
2591                         default:
2592                                 XE_WARN_ON("NOT POSSIBLE");
2593                         }
2594
2595                         last_op = op;
2596                 }
2597
2598                 last_op->ops = __ops;
2599         }
2600
2601         if (!last_op)
2602                 return -ENODATA;
2603
2604         last_op->flags |= XE_VMA_OP_LAST;
2605         last_op->num_syncs = num_syncs;
2606         last_op->syncs = syncs;
2607         last_op->fence = fence;
2608
2609         return 0;
2610
2611 free_fence:
2612         kfree(fence);
2613         return err;
2614 }
2615
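     /*
      * Commit a parsed operation into the VM's VMA tree. The *_COMMITTED flags
      * record how far the commit got so that xe_vma_op_unwind() knows exactly
      * what to undo on error.
      */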
2616 static int xe_vma_op_commit(struct xe_vm *vm, struct xe_vma_op *op)
2617 {
2618         int err = 0;
2619
2620         lockdep_assert_held_write(&vm->lock);
2621
2622         switch (op->base.op) {
2623         case DRM_GPUVA_OP_MAP:
2624                 err |= xe_vm_insert_vma(vm, op->map.vma);
2625                 if (!err)
2626                         op->flags |= XE_VMA_OP_COMMITTED;
2627                 break;
2628         case DRM_GPUVA_OP_REMAP:
2629                 prep_vma_destroy(vm, gpuva_to_vma(op->base.remap.unmap->va),
2630                                  true);
2631                 op->flags |= XE_VMA_OP_COMMITTED;
2632
2633                 if (op->remap.prev) {
2634                         err |= xe_vm_insert_vma(vm, op->remap.prev);
2635                         if (!err)
2636                                 op->flags |= XE_VMA_OP_PREV_COMMITTED;
2637                         if (!err && op->remap.skip_prev)
2638                                 op->remap.prev = NULL;
2639                 }
2640                 if (op->remap.next) {
2641                         err |= xe_vm_insert_vma(vm, op->remap.next);
2642                         if (!err)
2643                                 op->flags |= XE_VMA_OP_NEXT_COMMITTED;
2644                         if (!err && op->remap.skip_next)
2645                                 op->remap.next = NULL;
2646                 }
2647
2648                 /* Adjust for partial unbind after removing VMA from VM */
2649                 if (!err) {
2650                         op->base.remap.unmap->va->va.addr = op->remap.start;
2651                         op->base.remap.unmap->va->va.range = op->remap.range;
2652                 }
2653                 break;
2654         case DRM_GPUVA_OP_UNMAP:
2655                 prep_vma_destroy(vm, gpuva_to_vma(op->base.unmap.va), true);
2656                 op->flags |= XE_VMA_OP_COMMITTED;
2657                 break;
2658         case DRM_GPUVA_OP_PREFETCH:
2659                 op->flags |= XE_VMA_OP_COMMITTED;
2660                 break;
2661         default:
2662                 XE_WARN_ON("NOT POSSIBLE");
2663         }
2664
2665         return err;
2666 }
2667
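     /*
      * Execute a single committed operation: reserve the VM (and, for external
      * BOs, the BO itself) with ttm_eu_reserve_buffers(), then issue the
      * bind/unbind/prefetch. A userptr -EAGAIN triggers a repin and a retry of
      * the reservation.
      */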
2668 static int __xe_vma_op_execute(struct xe_vm *vm, struct xe_vma *vma,
2669                                struct xe_vma_op *op)
2670 {
2671         LIST_HEAD(objs);
2672         LIST_HEAD(dups);
2673         struct ttm_validate_buffer tv_bo, tv_vm;
2674         struct ww_acquire_ctx ww;
2675         struct xe_bo *vbo;
2676         int err;
2677
2678         lockdep_assert_held_write(&vm->lock);
2679
2680         xe_vm_tv_populate(vm, &tv_vm);
2681         list_add_tail(&tv_vm.head, &objs);
2682         vbo = xe_vma_bo(vma);
2683         if (vbo) {
2684                 /*
2685                  * An unbind can drop the last reference to the BO, and the
2686                  * BO is needed for ttm_eu_backoff_reservation(), so take a
2687                  * reference here.
2688                  */
2689                 xe_bo_get(vbo);
2690
2691                 if (!vbo->vm) {
2692                         tv_bo.bo = &vbo->ttm;
2693                         tv_bo.num_shared = 1;
2694                         list_add(&tv_bo.head, &objs);
2695                 }
2696         }
2697
2698 again:
2699         err = ttm_eu_reserve_buffers(&ww, &objs, true, &dups);
2700         if (err) {
2701                 xe_bo_put(vbo);
2702                 return err;
2703         }
2704
2705         xe_vm_assert_held(vm);
2706         xe_bo_assert_held(xe_vma_bo(vma));
2707
2708         switch (op->base.op) {
2709         case DRM_GPUVA_OP_MAP:
2710                 err = xe_vm_bind(vm, vma, op->q, xe_vma_bo(vma),
2711                                  op->syncs, op->num_syncs, op->fence,
2712                                  op->map.immediate || !xe_vm_in_fault_mode(vm),
2713                                  op->flags & XE_VMA_OP_FIRST,
2714                                  op->flags & XE_VMA_OP_LAST);
2715                 break;
2716         case DRM_GPUVA_OP_REMAP:
2717         {
2718                 bool prev = !!op->remap.prev;
2719                 bool next = !!op->remap.next;
2720
2721                 if (!op->remap.unmap_done) {
2722                         if (prev || next) {
2723                                 vm->async_ops.munmap_rebind_inflight = true;
2724                                 vma->gpuva.flags |= XE_VMA_FIRST_REBIND;
2725                         }
2726                         err = xe_vm_unbind(vm, vma, op->q, op->syncs,
2727                                            op->num_syncs,
2728                                            !prev && !next ? op->fence : NULL,
2729                                            op->flags & XE_VMA_OP_FIRST,
2730                                            op->flags & XE_VMA_OP_LAST && !prev &&
2731                                            !next);
2732                         if (err)
2733                                 break;
2734                         op->remap.unmap_done = true;
2735                 }
2736
2737                 if (prev) {
2738                         op->remap.prev->gpuva.flags |= XE_VMA_LAST_REBIND;
2739                         err = xe_vm_bind(vm, op->remap.prev, op->q,
2740                                          xe_vma_bo(op->remap.prev), op->syncs,
2741                                          op->num_syncs,
2742                                          !next ? op->fence : NULL, true, false,
2743                                          op->flags & XE_VMA_OP_LAST && !next);
2744                         op->remap.prev->gpuva.flags &= ~XE_VMA_LAST_REBIND;
2745                         if (err)
2746                                 break;
2747                         op->remap.prev = NULL;
2748                 }
2749
2750                 if (next) {
2751                         op->remap.next->gpuva.flags |= XE_VMA_LAST_REBIND;
2752                         err = xe_vm_bind(vm, op->remap.next, op->q,
2753                                          xe_vma_bo(op->remap.next),
2754                                          op->syncs, op->num_syncs,
2755                                          op->fence, true, false,
2756                                          op->flags & XE_VMA_OP_LAST);
2757                         op->remap.next->gpuva.flags &= ~XE_VMA_LAST_REBIND;
2758                         if (err)
2759                                 break;
2760                         op->remap.next = NULL;
2761                 }
2762                 vm->async_ops.munmap_rebind_inflight = false;
2763
2764                 break;
2765         }
2766         case DRM_GPUVA_OP_UNMAP:
2767                 err = xe_vm_unbind(vm, vma, op->q, op->syncs,
2768                                    op->num_syncs, op->fence,
2769                                    op->flags & XE_VMA_OP_FIRST,
2770                                    op->flags & XE_VMA_OP_LAST);
2771                 break;
2772         case DRM_GPUVA_OP_PREFETCH:
2773                 err = xe_vm_prefetch(vm, vma, op->q, op->prefetch.region,
2774                                      op->syncs, op->num_syncs, op->fence,
2775                                      op->flags & XE_VMA_OP_FIRST,
2776                                      op->flags & XE_VMA_OP_LAST);
2777                 break;
2778         default:
2779                 XE_WARN_ON("NOT POSSIBLE");
2780         }
2781
2782         ttm_eu_backoff_reservation(&ww, &objs);
2783         if (err == -EAGAIN && xe_vma_is_userptr(vma)) {
2784                 lockdep_assert_held_write(&vm->lock);
2785                 err = xe_vma_userptr_pin_pages(vma);
2786                 if (!err)
2787                         goto again;
2788         }
2789         xe_bo_put(vbo);
2790
2791         if (err)
2792                 trace_xe_vma_fail(vma);
2793
2794         return err;
2795 }
2796
2797 static int xe_vma_op_execute(struct xe_vm *vm, struct xe_vma_op *op)
2798 {
2799         int ret = 0;
2800
2801         lockdep_assert_held_write(&vm->lock);
2802
2803 #ifdef TEST_VM_ASYNC_OPS_ERROR
2804         if (op->inject_error) {
2805                 op->inject_error = false;
2806                 return -ENOMEM;
2807         }
2808 #endif
2809
2810         switch (op->base.op) {
2811         case DRM_GPUVA_OP_MAP:
2812                 ret = __xe_vma_op_execute(vm, op->map.vma, op);
2813                 break;
2814         case DRM_GPUVA_OP_REMAP:
2815         {
2816                 struct xe_vma *vma;
2817
2818                 if (!op->remap.unmap_done)
2819                         vma = gpuva_to_vma(op->base.remap.unmap->va);
2820                 else if (op->remap.prev)
2821                         vma = op->remap.prev;
2822                 else
2823                         vma = op->remap.next;
2824
2825                 ret = __xe_vma_op_execute(vm, vma, op);
2826                 break;
2827         }
2828         case DRM_GPUVA_OP_UNMAP:
2829                 ret = __xe_vma_op_execute(vm, gpuva_to_vma(op->base.unmap.va),
2830                                           op);
2831                 break;
2832         case DRM_GPUVA_OP_PREFETCH:
2833                 ret = __xe_vma_op_execute(vm,
2834                                           gpuva_to_vma(op->base.prefetch.va),
2835                                           op);
2836                 break;
2837         default:
2838                 XE_WARN_ON("NOT POSSIBLE");
2839         }
2840
2841         return ret;
2842 }
2843
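     /*
      * Release the resources attached to an operation once it has run or been
      * flushed: on the last op the syncs, exec queue and async fence
      * references; plus removal from the pending list and freeing of the GPUVA
      * ops the operation belongs to.
      */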
2844 static void xe_vma_op_cleanup(struct xe_vm *vm, struct xe_vma_op *op)
2845 {
2846         bool last = op->flags & XE_VMA_OP_LAST;
2847
2848         if (last) {
2849                 while (op->num_syncs--)
2850                         xe_sync_entry_cleanup(&op->syncs[op->num_syncs]);
2851                 kfree(op->syncs);
2852                 if (op->q)
2853                         xe_exec_queue_put(op->q);
2854                 if (op->fence)
2855                         dma_fence_put(&op->fence->fence);
2856         }
2857         if (!list_empty(&op->link)) {
2858                 spin_lock_irq(&vm->async_ops.lock);
2859                 list_del(&op->link);
2860                 spin_unlock_irq(&vm->async_ops.lock);
2861         }
2862         if (op->ops)
2863                 drm_gpuva_ops_free(&vm->gpuvm, op->ops);
2864         if (last)
2865                 xe_vm_put(vm);
2866 }
2867
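     /*
      * Reverse what xe_vma_op_commit() did for an operation when the bind must
      * be unwound: destroy VMAs created for map/remap, clear the DESTROYED
      * flag on VMAs that were about to be unmapped and reinsert them in the VM
      * if they had already been removed.
      */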
2868 static void xe_vma_op_unwind(struct xe_vm *vm, struct xe_vma_op *op,
2869                              bool post_commit, bool prev_post_commit,
2870                              bool next_post_commit)
2871 {
2872         lockdep_assert_held_write(&vm->lock);
2873
2874         switch (op->base.op) {
2875         case DRM_GPUVA_OP_MAP:
2876                 if (op->map.vma) {
2877                         prep_vma_destroy(vm, op->map.vma, post_commit);
2878                         xe_vma_destroy_unlocked(op->map.vma);
2879                 }
2880                 break;
2881         case DRM_GPUVA_OP_UNMAP:
2882         {
2883                 struct xe_vma *vma = gpuva_to_vma(op->base.unmap.va);
2884
2885                 down_read(&vm->userptr.notifier_lock);
2886                 vma->gpuva.flags &= ~XE_VMA_DESTROYED;
2887                 up_read(&vm->userptr.notifier_lock);
2888                 if (post_commit)
2889                         xe_vm_insert_vma(vm, vma);
2890                 break;
2891         }
2892         case DRM_GPUVA_OP_REMAP:
2893         {
2894                 struct xe_vma *vma = gpuva_to_vma(op->base.remap.unmap->va);
2895
2896                 if (op->remap.prev) {
2897                         prep_vma_destroy(vm, op->remap.prev, prev_post_commit);
2898                         xe_vma_destroy_unlocked(op->remap.prev);
2899                 }
2900                 if (op->remap.next) {
2901                         prep_vma_destroy(vm, op->remap.next, next_post_commit);
2902                         xe_vma_destroy_unlocked(op->remap.next);
2903                 }
2904                 down_read(&vm->userptr.notifier_lock);
2905                 vma->gpuva.flags &= ~XE_VMA_DESTROYED;
2906                 up_read(&vm->userptr.notifier_lock);
2907                 if (post_commit)
2908                         xe_vm_insert_vma(vm, vma);
2909                 break;
2910         }
2911         case DRM_GPUVA_OP_PREFETCH:
2912                 /* Nothing to do */
2913                 break;
2914         default:
2915                 XE_WARN_ON("NOT POSSIBLE");
2916         }
2917 }
2918
2919 static struct xe_vma_op *next_vma_op(struct xe_vm *vm)
2920 {
2921         return list_first_entry_or_null(&vm->async_ops.pending,
2922                                         struct xe_vma_op, link);
2923 }
2924
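     /*
      * Worker that drains the VM's async op queue. Execution stops at the
      * first error while the VM is still open; once the VM is closed the
      * remaining ops are flushed by destroying their VMAs and signalling their
      * fences.
      */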
2925 static void xe_vma_op_work_func(struct work_struct *w)
2926 {
2927         struct xe_vm *vm = container_of(w, struct xe_vm, async_ops.work);
2928
2929         for (;;) {
2930                 struct xe_vma_op *op;
2931                 int err;
2932
2933                 if (vm->async_ops.error && !xe_vm_is_closed(vm))
2934                         break;
2935
2936                 spin_lock_irq(&vm->async_ops.lock);
2937                 op = next_vma_op(vm);
2938                 spin_unlock_irq(&vm->async_ops.lock);
2939
2940                 if (!op)
2941                         break;
2942
2943                 if (!xe_vm_is_closed(vm)) {
2944                         down_write(&vm->lock);
2945                         err = xe_vma_op_execute(vm, op);
2946                         if (err) {
2947                                 drm_warn(&vm->xe->drm,
2948                                          "Async VM op(%d) failed with %d",
2949                                          op->base.op, err);
2950                                 vm_set_async_error(vm, err);
2951                                 up_write(&vm->lock);
2952
2953                                 if (vm->async_ops.error_capture.addr)
2954                                         vm_error_capture(vm, err, 0, 0, 0);
2955                                 break;
2956                         }
2957                         up_write(&vm->lock);
2958                 } else {
2959                         struct xe_vma *vma;
2960
2961                         switch (op->base.op) {
2962                         case DRM_GPUVA_OP_REMAP:
2963                                 vma = gpuva_to_vma(op->base.remap.unmap->va);
2964                                 trace_xe_vma_flush(vma);
2965
2966                                 down_write(&vm->lock);
2967                                 xe_vma_destroy_unlocked(vma);
2968                                 up_write(&vm->lock);
2969                                 break;
2970                         case DRM_GPUVA_OP_UNMAP:
2971                                 vma = gpuva_to_vma(op->base.unmap.va);
2972                                 trace_xe_vma_flush(vma);
2973
2974                                 down_write(&vm->lock);
2975                                 xe_vma_destroy_unlocked(vma);
2976                                 up_write(&vm->lock);
2977                                 break;
2978                         default:
2979                                 /* Nothing to do */
2980                                 break;
2981                         }
2982
2983                         if (op->fence && !test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
2984                                                    &op->fence->fence.flags)) {
2985                                 if (!xe_vm_no_dma_fences(vm)) {
2986                                         op->fence->started = true;
2987                                         wake_up_all(&op->fence->wq);
2988                                 }
2989                                 dma_fence_signal(&op->fence->fence);
2990                         }
2991                 }
2992
2993                 xe_vma_op_cleanup(vm, op);
2994         }
2995 }
2996
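/*
 * Commit all ops to the VM state, then either execute the final op
 * synchronously or splice the list onto the async worker's pending queue,
 * installing the final op's fence in the out-syncs (or signalling it
 * immediately if no sync consumes it). A commit failure unwinds the
 * already-committed ops in reverse order.
 */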
2997 static int vm_bind_ioctl_ops_commit(struct xe_vm *vm,
2998                                     struct list_head *ops_list, bool async)
2999 {
3000         struct xe_vma_op *op, *last_op, *next;
3001         int err;
3002
3003         lockdep_assert_held_write(&vm->lock);
3004
3005         list_for_each_entry(op, ops_list, link) {
3006                 last_op = op;
3007                 err = xe_vma_op_commit(vm, op);
3008                 if (err)
3009                         goto unwind;
3010         }
3011
3012         if (!async) {
3013                 err = xe_vma_op_execute(vm, last_op);
3014                 if (err)
3015                         goto unwind;
3016                 xe_vma_op_cleanup(vm, last_op);
3017         } else {
3018                 int i;
3019                 bool installed = false;
3020
3021                 for (i = 0; i < last_op->num_syncs; i++)
3022                         installed |= xe_sync_entry_signal(&last_op->syncs[i],
3023                                                           NULL,
3024                                                           &last_op->fence->fence);
3025                 if (!installed && last_op->fence)
3026                         dma_fence_signal(&last_op->fence->fence);
3027
3028                 spin_lock_irq(&vm->async_ops.lock);
3029                 list_splice_tail(ops_list, &vm->async_ops.pending);
3030                 spin_unlock_irq(&vm->async_ops.lock);
3031
3032                 if (!vm->async_ops.error)
3033                         queue_work(system_unbound_wq, &vm->async_ops.work);
3034         }
3035
3036         return 0;
3037
3038 unwind:
3039         list_for_each_entry_reverse(op, ops_list, link)
3040                 xe_vma_op_unwind(vm, op, op->flags & XE_VMA_OP_COMMITTED,
3041                                  op->flags & XE_VMA_OP_PREV_COMMITTED,
3042                                  op->flags & XE_VMA_OP_NEXT_COMMITTED);
3043         list_for_each_entry_safe(op, next, ops_list, link)
3044                 xe_vma_op_cleanup(vm, op);
3045
3046         return err;
3047 }
3048
3049 /*
3050  * Unwind the GPUVA operations lists; called after a failure of
3051  * vm_bind_ioctl_ops_create or vm_bind_ioctl_ops_parse.
3052  */
3053 static void vm_bind_ioctl_ops_unwind(struct xe_vm *vm,
3054                                      struct drm_gpuva_ops **ops,
3055                                      int num_ops_list)
3056 {
3057         int i;
3058
3059         for (i = 0; i < num_ops_list; ++i) {
3060                 struct drm_gpuva_ops *__ops = ops[i];
3061                 struct drm_gpuva_op *__op;
3062
3063                 if (!__ops)
3064                         continue;
3065
3066                 drm_gpuva_for_each_op(__op, __ops) {
3067                         struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
3068
3069                         xe_vma_op_unwind(vm, op, false, false, false);
3070                 }
3071         }
3072 }
3073
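/*
 * Bind op flags accepted from userspace; the low 16 bits are left unmasked
 * since they carry the opcode extracted via VM_BIND_OP(). With
 * TEST_VM_ASYNC_OPS_ERROR defined, FORCE_ASYNC_OP_ERROR is also accepted for
 * error injection.
 */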
3074 #ifdef TEST_VM_ASYNC_OPS_ERROR
3075 #define SUPPORTED_FLAGS \
3076         (FORCE_ASYNC_OP_ERROR | XE_VM_BIND_FLAG_ASYNC | \
3077          XE_VM_BIND_FLAG_READONLY | XE_VM_BIND_FLAG_IMMEDIATE | \
3078          XE_VM_BIND_FLAG_NULL | 0xffff)
3079 #else
3080 #define SUPPORTED_FLAGS \
3081         (XE_VM_BIND_FLAG_ASYNC | XE_VM_BIND_FLAG_READONLY | \
3082          XE_VM_BIND_FLAG_IMMEDIATE | XE_VM_BIND_FLAG_NULL | 0xffff)
3083 #endif
3084 #define XE_64K_PAGE_MASK 0xffffull
3085
3086 #define MAX_BINDS       512     /* FIXME: Picking an arbitrary upper limit */
3087
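/*
 * Validate the array of bind ops. With more than one bind the array is
 * copied from userspace into a kernel allocation that the caller must free;
 * with a single bind, args->bind is used in place.
 */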
3088 static int vm_bind_ioctl_check_args(struct xe_device *xe,
3089                                     struct drm_xe_vm_bind *args,
3090                                     struct drm_xe_vm_bind_op **bind_ops,
3091                                     bool *async)
3092 {
3093         int err;
3094         int i;
3095
3096         if (XE_IOCTL_DBG(xe, args->extensions) ||
3097             XE_IOCTL_DBG(xe, !args->num_binds) ||
3098             XE_IOCTL_DBG(xe, args->num_binds > MAX_BINDS))
3099                 return -EINVAL;
3100
3101         if (args->num_binds > 1) {
3102                 u64 __user *bind_user =
3103                         u64_to_user_ptr(args->vector_of_binds);
3104
3105                 *bind_ops = kmalloc_array(args->num_binds,
3106                                           sizeof(struct drm_xe_vm_bind_op), GFP_KERNEL);
3107                 if (!*bind_ops)
3108                         return -ENOMEM;
3109
3110                 err = copy_from_user(*bind_ops, bind_user,
3111                                      sizeof(struct drm_xe_vm_bind_op) *
3112                                      args->num_binds);
3113                 if (XE_IOCTL_DBG(xe, err)) {
3114                         err = -EFAULT;
3115                         goto free_bind_ops;
3116                 }
3117         } else {
3118                 *bind_ops = &args->bind;
3119         }
3120
3121         for (i = 0; i < args->num_binds; ++i) {
3122                 u64 range = (*bind_ops)[i].range;
3123                 u64 addr = (*bind_ops)[i].addr;
3124                 u32 op = (*bind_ops)[i].op;
3125                 u32 obj = (*bind_ops)[i].obj;
3126                 u64 obj_offset = (*bind_ops)[i].obj_offset;
3127                 u32 region = (*bind_ops)[i].region;
3128                 bool is_null = op & XE_VM_BIND_FLAG_NULL;
3129
3130                 if (i == 0) {
3131                         *async = !!(op & XE_VM_BIND_FLAG_ASYNC);
3132                 } else if (XE_IOCTL_DBG(xe, !*async) ||
3133                            XE_IOCTL_DBG(xe, !(op & XE_VM_BIND_FLAG_ASYNC)) ||
3134                            XE_IOCTL_DBG(xe, VM_BIND_OP(op) ==
3135                                         XE_VM_BIND_OP_RESTART)) {
3136                         err = -EINVAL;
3137                         goto free_bind_ops;
3138                 }
3139
3140                 if (XE_IOCTL_DBG(xe, !*async &&
3141                                  VM_BIND_OP(op) == XE_VM_BIND_OP_UNMAP_ALL)) {
3142                         err = -EINVAL;
3143                         goto free_bind_ops;
3144                 }
3145
3146                 if (XE_IOCTL_DBG(xe, !*async &&
3147                                  VM_BIND_OP(op) == XE_VM_BIND_OP_PREFETCH)) {
3148                         err = -EINVAL;
3149                         goto free_bind_ops;
3150                 }
3151
3152                 if (XE_IOCTL_DBG(xe, VM_BIND_OP(op) >
3153                                  XE_VM_BIND_OP_PREFETCH) ||
3154                     XE_IOCTL_DBG(xe, op & ~SUPPORTED_FLAGS) ||
3155                     XE_IOCTL_DBG(xe, obj && is_null) ||
3156                     XE_IOCTL_DBG(xe, obj_offset && is_null) ||
3157                     XE_IOCTL_DBG(xe, VM_BIND_OP(op) != XE_VM_BIND_OP_MAP &&
3158                                  is_null) ||
3159                     XE_IOCTL_DBG(xe, !obj &&
3160                                  VM_BIND_OP(op) == XE_VM_BIND_OP_MAP &&
3161                                  !is_null) ||
3162                     XE_IOCTL_DBG(xe, !obj &&
3163                                  VM_BIND_OP(op) == XE_VM_BIND_OP_UNMAP_ALL) ||
3164                     XE_IOCTL_DBG(xe, addr &&
3165                                  VM_BIND_OP(op) == XE_VM_BIND_OP_UNMAP_ALL) ||
3166                     XE_IOCTL_DBG(xe, range &&
3167                                  VM_BIND_OP(op) == XE_VM_BIND_OP_UNMAP_ALL) ||
3168                     XE_IOCTL_DBG(xe, obj &&
3169                                  VM_BIND_OP(op) == XE_VM_BIND_OP_MAP_USERPTR) ||
3170                     XE_IOCTL_DBG(xe, obj &&
3171                                  VM_BIND_OP(op) == XE_VM_BIND_OP_PREFETCH) ||
3172                     XE_IOCTL_DBG(xe, region &&
3173                                  VM_BIND_OP(op) != XE_VM_BIND_OP_PREFETCH) ||
3174                     XE_IOCTL_DBG(xe, !(BIT(region) &
3175                                        xe->info.mem_region_mask)) ||
3176                     XE_IOCTL_DBG(xe, obj &&
3177                                  VM_BIND_OP(op) == XE_VM_BIND_OP_UNMAP)) {
3178                         err = -EINVAL;
3179                         goto free_bind_ops;
3180                 }
3181
3182                 if (XE_IOCTL_DBG(xe, obj_offset & ~PAGE_MASK) ||
3183                     XE_IOCTL_DBG(xe, addr & ~PAGE_MASK) ||
3184                     XE_IOCTL_DBG(xe, range & ~PAGE_MASK) ||
3185                     XE_IOCTL_DBG(xe, !range && VM_BIND_OP(op) !=
3186                                  XE_VM_BIND_OP_RESTART &&
3187                                  VM_BIND_OP(op) != XE_VM_BIND_OP_UNMAP_ALL)) {
3188                         err = -EINVAL;
3189                         goto free_bind_ops;
3190                 }
3191         }
3192
3193         return 0;
3194
3195 free_bind_ops:
3196         if (args->num_binds > 1)
3197                 kfree(*bind_ops);
3198         return err;
3199 }
3200
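/*
 * VM_BIND ioctl entry point: validate the bind ops, look up the exec queue
 * and VM, range-check every bind against the VM and its BO, parse the syncs,
 * then build, parse and commit the GPUVA operation lists either synchronously
 * or through the async worker.
 */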
3201 int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
3202 {
3203         struct xe_device *xe = to_xe_device(dev);
3204         struct xe_file *xef = to_xe_file(file);
3205         struct drm_xe_vm_bind *args = data;
3206         struct drm_xe_sync __user *syncs_user;
3207         struct xe_bo **bos = NULL;
3208         struct drm_gpuva_ops **ops = NULL;
3209         struct xe_vm *vm;
3210         struct xe_exec_queue *q = NULL;
3211         u32 num_syncs;
3212         struct xe_sync_entry *syncs = NULL;
3213         struct drm_xe_vm_bind_op *bind_ops;
3214         LIST_HEAD(ops_list);
3215         bool async;
3216         int err;
3217         int i;
3218
3219         err = vm_bind_ioctl_check_args(xe, args, &bind_ops, &async);
3220         if (err)
3221                 return err;
3222
3223         if (args->exec_queue_id) {
3224                 q = xe_exec_queue_lookup(xef, args->exec_queue_id);
3225                 if (XE_IOCTL_DBG(xe, !q)) {
3226                         err = -ENOENT;
3227                         goto free_objs;
3228                 }
3229
3230                 if (XE_IOCTL_DBG(xe, !(q->flags & EXEC_QUEUE_FLAG_VM))) {
3231                         err = -EINVAL;
3232                         goto put_exec_queue;
3233                 }
3234         }
3235
3236         vm = xe_vm_lookup(xef, args->vm_id);
3237         if (XE_IOCTL_DBG(xe, !vm)) {
3238                 err = -EINVAL;
3239                 goto put_exec_queue;
3240         }
3241
3242         err = down_write_killable(&vm->lock);
3243         if (err)
3244                 goto put_vm;
3245
3246         if (XE_IOCTL_DBG(xe, xe_vm_is_closed_or_banned(vm))) {
3247                 err = -ENOENT;
3248                 goto release_vm_lock;
3249         }
3250
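        /*
         * A RESTART op resumes async bind processing after a reported error:
         * only valid on an async-bind VM, with no syncs and a pending error.
         * It clears the error and re-queues the worker (plus the rebind
         * worker for compute-mode VMs).
         */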
3251         if (VM_BIND_OP(bind_ops[0].op) == XE_VM_BIND_OP_RESTART) {
3252                 if (XE_IOCTL_DBG(xe, !(vm->flags & XE_VM_FLAG_ASYNC_BIND_OPS)))
3253                         err = -EOPNOTSUPP;
3254                 if (XE_IOCTL_DBG(xe, !err && args->num_syncs))
3255                         err = -EINVAL;
3256                 if (XE_IOCTL_DBG(xe, !err && !vm->async_ops.error))
3257                         err = -EPROTO;
3258
3259                 if (!err) {
3260                         trace_xe_vm_restart(vm);
3261                         vm_set_async_error(vm, 0);
3262
3263                         queue_work(system_unbound_wq, &vm->async_ops.work);
3264
3265                         /* Rebinds may have been blocked, give worker a kick */
3266                         if (xe_vm_in_compute_mode(vm))
3267                                 xe_vm_queue_rebind_worker(vm);
3268                 }
3269
3270                 goto release_vm_lock;
3271         }
3272
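        /*
         * Unless an async error is pending, the bind's sync/async mode must
         * match the mode the VM was created with.
         */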
3273         if (XE_IOCTL_DBG(xe, !vm->async_ops.error &&
3274                          async != !!(vm->flags & XE_VM_FLAG_ASYNC_BIND_OPS))) {
3275                 err = -EOPNOTSUPP;
3276                 goto release_vm_lock;
3277         }
3278
3279         for (i = 0; i < args->num_binds; ++i) {
3280                 u64 range = bind_ops[i].range;
3281                 u64 addr = bind_ops[i].addr;
3282
3283                 if (XE_IOCTL_DBG(xe, range > vm->size) ||
3284                     XE_IOCTL_DBG(xe, addr > vm->size - range)) {
3285                         err = -EINVAL;
3286                         goto release_vm_lock;
3287                 }
3288
3289                 if (bind_ops[i].tile_mask) {
3290                         u64 valid_tiles = BIT(xe->info.tile_count) - 1;
3291
3292                         if (XE_IOCTL_DBG(xe, bind_ops[i].tile_mask &
3293                                          ~valid_tiles)) {
3294                                 err = -EINVAL;
3295                                 goto release_vm_lock;
3296                         }
3297                 }
3298         }
3299
3300         bos = kcalloc(args->num_binds, sizeof(*bos), GFP_KERNEL);
3301         if (!bos) {
3302                 err = -ENOMEM;
3303                 goto release_vm_lock;
3304         }
3305
3306         ops = kcalloc(args->num_binds, sizeof(*ops), GFP_KERNEL);
3307         if (!ops) {
3308                 err = -ENOMEM;
3309                 goto release_vm_lock;
3310         }
3311
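        /* Resolve the GEM object backing each bind and range-check it */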
3312         for (i = 0; i < args->num_binds; ++i) {
3313                 struct drm_gem_object *gem_obj;
3314                 u64 range = bind_ops[i].range;
3315                 u64 addr = bind_ops[i].addr;
3316                 u32 obj = bind_ops[i].obj;
3317                 u64 obj_offset = bind_ops[i].obj_offset;
3318
3319                 if (!obj)
3320                         continue;
3321
3322                 gem_obj = drm_gem_object_lookup(file, obj);
3323                 if (XE_IOCTL_DBG(xe, !gem_obj)) {
3324                         err = -ENOENT;
3325                         goto put_obj;
3326                 }
3327                 bos[i] = gem_to_xe_bo(gem_obj);
3328
3329                 if (XE_IOCTL_DBG(xe, range > bos[i]->size) ||
3330                     XE_IOCTL_DBG(xe, obj_offset >
3331                                  bos[i]->size - range)) {
3332                         err = -EINVAL;
3333                         goto put_obj;
3334                 }
3335
3336                 if (bos[i]->flags & XE_BO_INTERNAL_64K) {
3337                         if (XE_IOCTL_DBG(xe, obj_offset &
3338                                          XE_64K_PAGE_MASK) ||
3339                             XE_IOCTL_DBG(xe, addr & XE_64K_PAGE_MASK) ||
3340                             XE_IOCTL_DBG(xe, range & XE_64K_PAGE_MASK)) {
3341                                 err = -EINVAL;
3342                                 goto put_obj;
3343                         }
3344                 }
3345         }
3346
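        /* Parse the user-supplied in/out syncs */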
3347         if (args->num_syncs) {
3348                 syncs = kcalloc(args->num_syncs, sizeof(*syncs), GFP_KERNEL);
3349                 if (!syncs) {
3350                         err = -ENOMEM;
3351                         goto put_obj;
3352                 }
3353         }
3354
3355         syncs_user = u64_to_user_ptr(args->syncs);
3356         for (num_syncs = 0; num_syncs < args->num_syncs; num_syncs++) {
3357                 err = xe_sync_entry_parse(xe, xef, &syncs[num_syncs],
3358                                           &syncs_user[num_syncs], false,
3359                                           xe_vm_no_dma_fences(vm));
3360                 if (err)
3361                         goto free_syncs;
3362         }
3363
3364         /* Do some error checking first to make the unwind easier */
3365         for (i = 0; i < args->num_binds; ++i) {
3366                 u64 range = bind_ops[i].range;
3367                 u64 addr = bind_ops[i].addr;
3368                 u32 op = bind_ops[i].op;
3369
3370                 err = vm_bind_ioctl_lookup_vma(vm, bos[i], addr, range, op);
3371                 if (err)
3372                         goto free_syncs;
3373         }
3374
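        /* Build a GPUVA operations list for each bind */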
3375         for (i = 0; i < args->num_binds; ++i) {
3376                 u64 range = bind_ops[i].range;
3377                 u64 addr = bind_ops[i].addr;
3378                 u32 op = bind_ops[i].op;
3379                 u64 obj_offset = bind_ops[i].obj_offset;
3380                 u8 tile_mask = bind_ops[i].tile_mask;
3381                 u32 region = bind_ops[i].region;
3382
3383                 ops[i] = vm_bind_ioctl_ops_create(vm, bos[i], obj_offset,
3384                                                   addr, range, op, tile_mask,
3385                                                   region);
3386                 if (IS_ERR(ops[i])) {
3387                         err = PTR_ERR(ops[i]);
3388                         ops[i] = NULL;
3389                         goto unwind_ops;
3390                 }
3391         }
3392
3393         err = vm_bind_ioctl_ops_parse(vm, q, ops, args->num_binds,
3394                                       syncs, num_syncs, &ops_list, async);
3395         if (err)
3396                 goto unwind_ops;
3397
3398         err = vm_bind_ioctl_ops_commit(vm, &ops_list, async);
3399         up_write(&vm->lock);
3400
3401         for (i = 0; i < args->num_binds; ++i)
3402                 xe_bo_put(bos[i]);
3403
3404         kfree(bos);
3405         kfree(ops);
3406         if (args->num_binds > 1)
3407                 kfree(bind_ops);
3408
3409         return err;
3410
3411 unwind_ops:
3412         vm_bind_ioctl_ops_unwind(vm, ops, args->num_binds);
3413 free_syncs:
3414         for (i = 0; err == -ENODATA && i < num_syncs; i++)
3415                 xe_sync_entry_signal(&syncs[i], NULL, dma_fence_get_stub());
3416         while (num_syncs--)
3417                 xe_sync_entry_cleanup(&syncs[num_syncs]);
3418
3419         kfree(syncs);
3420 put_obj:
3421         for (i = 0; i < args->num_binds; ++i)
3422                 xe_bo_put(bos[i]);
3423 release_vm_lock:
3424         up_write(&vm->lock);
3425 put_vm:
3426         xe_vm_put(vm);
3427 put_exec_queue:
3428         if (q)
3429                 xe_exec_queue_put(q);
3430 free_objs:
3431         kfree(bos);
3432         kfree(ops);
3433         if (args->num_binds > 1)
3434                 kfree(bind_ops);
3435         return err == -ENODATA ? 0 : err;
3436 }
3437
3438 /*
3439  * XXX: Using the TTM wrappers for now, likely can call into dma-resv code
3440  * directly to optimize. Also this likely should be an inline function.
3441  */
3442 int xe_vm_lock(struct xe_vm *vm, struct ww_acquire_ctx *ww,
3443                int num_resv, bool intr)
3444 {
3445         struct ttm_validate_buffer tv_vm;
3446         LIST_HEAD(objs);
3447         LIST_HEAD(dups);
3448
3449         XE_WARN_ON(!ww);
3450
3451         tv_vm.num_shared = num_resv;
3452         tv_vm.bo = xe_vm_ttm_bo(vm);
3453         list_add_tail(&tv_vm.head, &objs);
3454
3455         return ttm_eu_reserve_buffers(ww, &objs, intr, &dups);
3456 }
3457
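/* Drop the VM's dma-resv lock and finish the ww acquire context. */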
3458 void xe_vm_unlock(struct xe_vm *vm, struct ww_acquire_ctx *ww)
3459 {
3460         dma_resv_unlock(xe_vm_resv(vm));
3461         ww_acquire_fini(ww);
3462 }
3463
3464 /**
3465  * xe_vm_invalidate_vma - invalidate GPU mappings for VMA without a lock
3466  * @vma: VMA to invalidate
3467  *
3468  * Walks the list of page table leaves, zeroing the entries owned by this
3469  * VMA, invalidates the TLBs and blocks until the TLB invalidation has
3470  * completed.
3471  *
3472  * Return: 0 on success, negative error code otherwise.
3473  */
3474 int xe_vm_invalidate_vma(struct xe_vma *vma)
3475 {
3476         struct xe_device *xe = xe_vma_vm(vma)->xe;
3477         struct xe_tile *tile;
3478         u32 tile_needs_invalidate = 0;
3479         int seqno[XE_MAX_TILES_PER_DEVICE];
3480         u8 id;
3481         int ret;
3482
3483         XE_WARN_ON(!xe_vm_in_fault_mode(xe_vma_vm(vma)));
3484         XE_WARN_ON(xe_vma_is_null(vma));
3485         trace_xe_vma_usm_invalidate(vma);
3486
3487         /* Check that we don't race with page-table updates */
3488         if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
3489                 if (xe_vma_is_userptr(vma)) {
3490                         WARN_ON_ONCE(!mmu_interval_check_retry
3491                                      (&vma->userptr.notifier,
3492                                       vma->userptr.notifier_seq));
3493                         WARN_ON_ONCE(!dma_resv_test_signaled(xe_vm_resv(xe_vma_vm(vma)),
3494                                                              DMA_RESV_USAGE_BOOKKEEP));
3495
3496                 } else {
3497                         xe_bo_assert_held(xe_vma_bo(vma));
3498                 }
3499         }
3500
3501         for_each_tile(tile, xe, id) {
3502                 if (xe_pt_zap_ptes(tile, vma)) {
3503                         tile_needs_invalidate |= BIT(id);
3504                         xe_device_wmb(xe);
3505                         /*
3506                          * FIXME: We potentially need to invalidate multiple
3507                          * GTs within the tile
3508                          */
3509                         seqno[id] = xe_gt_tlb_invalidation_vma(tile->primary_gt, NULL, vma);
3510                         if (seqno[id] < 0)
3511                                 return seqno[id];
3512                 }
3513         }
3514
3515         for_each_tile(tile, xe, id) {
3516                 if (tile_needs_invalidate & BIT(id)) {
3517                         ret = xe_gt_tlb_invalidation_wait(tile->primary_gt, seqno[id]);
3518                         if (ret < 0)
3519                                 return ret;
3520                 }
3521         }
3522
3523         vma->usm.tile_invalidated = vma->tile_mask;
3524
3525         return 0;
3526 }
3527
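/*
 * Debug helper: print the VM's page-table root and, for each VMA, its range,
 * size and backing address, tagged as NULL, userptr (USR), VRAM or SYS.
 */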
3528 int xe_analyze_vm(struct drm_printer *p, struct xe_vm *vm, int gt_id)
3529 {
3530         struct drm_gpuva *gpuva;
3531         bool is_vram;
3532         u64 addr;
3533
3534         if (!down_read_trylock(&vm->lock)) {
3535                 drm_printf(p, " Failed to acquire VM lock to dump capture\n");
3536                 return 0;
3537         }
3538         if (vm->pt_root[gt_id]) {
3539                 addr = xe_bo_addr(vm->pt_root[gt_id]->bo, 0, XE_PAGE_SIZE);
3540                 is_vram = xe_bo_is_vram(vm->pt_root[gt_id]->bo);
3541                 drm_printf(p, " VM root: A:0x%llx %s\n", addr,
3542                            is_vram ? "VRAM" : "SYS");
3543         }
3544
3545         drm_gpuvm_for_each_va(gpuva, &vm->gpuvm) {
3546                 struct xe_vma *vma = gpuva_to_vma(gpuva);
3547                 bool is_userptr = xe_vma_is_userptr(vma);
3548                 bool is_null = xe_vma_is_null(vma);
3549
3550                 if (is_null) {
3551                         addr = 0;
3552                 } else if (is_userptr) {
3553                         struct xe_res_cursor cur;
3554
3555                         if (vma->userptr.sg) {
3556                                 xe_res_first_sg(vma->userptr.sg, 0, XE_PAGE_SIZE,
3557                                                 &cur);
3558                                 addr = xe_res_dma(&cur);
3559                         } else {
3560                                 addr = 0;
3561                         }
3562                 } else {
3563                         addr = __xe_bo_addr(xe_vma_bo(vma), 0, XE_PAGE_SIZE);
3564                         is_vram = xe_bo_is_vram(xe_vma_bo(vma));
3565                 }
3566                 drm_printf(p, " [%016llx-%016llx] S:0x%016llx A:%016llx %s\n",
3567                            xe_vma_start(vma), xe_vma_end(vma) - 1,
3568                            xe_vma_size(vma),
3569                            addr, is_null ? "NULL" : is_userptr ? "USR" :
3570                            is_vram ? "VRAM" : "SYS");
3571         }
3572         up_read(&vm->lock);
3573
3574         return 0;
3575 }