Merge tag 'amd-drm-next-6.7-2023-10-13' of https://gitlab.freedesktop.org/agd5f/linux...
drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index bb16b79..f4038b3 100644
@@ -158,12 +158,13 @@ svm_is_valid_dma_mapping_addr(struct device *dev, dma_addr_t dma_addr)
 static int
 svm_range_dma_map_dev(struct amdgpu_device *adev, struct svm_range *prange,
                      unsigned long offset, unsigned long npages,
-                     unsigned long *hmm_pfns, uint32_t gpuidx)
+                     unsigned long *hmm_pfns, uint32_t gpuidx, uint64_t *vram_pages)
 {
        enum dma_data_direction dir = DMA_BIDIRECTIONAL;
        dma_addr_t *addr = prange->dma_addr[gpuidx];
        struct device *dev = adev->dev;
        struct page *page;
+       uint64_t vram_pages_dev;
        int i, r;
 
        if (!addr) {
@@ -173,6 +174,7 @@ svm_range_dma_map_dev(struct amdgpu_device *adev, struct svm_range *prange,
                prange->dma_addr[gpuidx] = addr;
        }
 
+       vram_pages_dev = 0;
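+       /* vram_pages_dev counts zone-device (vram) pages mapped below */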
        addr += offset;
        for (i = 0; i < npages; i++) {
                if (svm_is_valid_dma_mapping_addr(dev, addr[i]))
@@ -182,6 +184,7 @@ svm_range_dma_map_dev(struct amdgpu_device *adev, struct svm_range *prange,
                if (is_zone_device_page(page)) {
                        struct amdgpu_device *bo_adev = prange->svm_bo->node->adev;
 
+                       vram_pages_dev++;
                        addr[i] = (hmm_pfns[i] << PAGE_SHIFT) +
                                   bo_adev->vm_manager.vram_base_offset -
                                   bo_adev->kfd.pgmap.range.start;
@@ -198,13 +201,14 @@ svm_range_dma_map_dev(struct amdgpu_device *adev, struct svm_range *prange,
                pr_debug_ratelimited("dma mapping 0x%llx for page addr 0x%lx\n",
                                     addr[i] >> PAGE_SHIFT, page_to_pfn(page));
        }
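+       /* report the number of vram pages mapped for this device */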
+       *vram_pages = vram_pages_dev;
        return 0;
 }
 
 static int
 svm_range_dma_map(struct svm_range *prange, unsigned long *bitmap,
                  unsigned long offset, unsigned long npages,
-                 unsigned long *hmm_pfns)
+                 unsigned long *hmm_pfns, uint64_t *vram_pages)
 {
        struct kfd_process *p;
        uint32_t gpuidx;
@@ -223,7 +227,7 @@ svm_range_dma_map(struct svm_range *prange, unsigned long *bitmap,
                }
 
                r = svm_range_dma_map_dev(pdd->dev->adev, prange, offset, npages,
-                                         hmm_pfns, gpuidx);
+                                         hmm_pfns, gpuidx, vram_pages);
                if (r)
                        break;
        }
@@ -231,7 +235,7 @@ svm_range_dma_map(struct svm_range *prange, unsigned long *bitmap,
        return r;
 }
 
-void svm_range_dma_unmap(struct device *dev, dma_addr_t *dma_addr,
+void svm_range_dma_unmap_dev(struct device *dev, dma_addr_t *dma_addr,
                         unsigned long offset, unsigned long npages)
 {
        enum dma_data_direction dir = DMA_BIDIRECTIONAL;
@@ -249,7 +253,7 @@ void svm_range_dma_unmap(struct device *dev, dma_addr_t *dma_addr,
        }
 }
 
-void svm_range_free_dma_mappings(struct svm_range *prange, bool unmap_dma)
+void svm_range_dma_unmap(struct svm_range *prange)
 {
        struct kfd_process_device *pdd;
        dma_addr_t *dma_addr;
@@ -270,10 +274,8 @@ void svm_range_free_dma_mappings(struct svm_range *prange, bool unmap_dma)
                        continue;
                }
                dev = &pdd->dev->adev->pdev->dev;
-               if (unmap_dma)
-                       svm_range_dma_unmap(dev, dma_addr, 0, prange->npages);
-               kvfree(dma_addr);
-               prange->dma_addr[gpuidx] = NULL;
+
+               svm_range_dma_unmap_dev(dev, dma_addr, 0, prange->npages);
        }
 }
 
@@ -281,18 +283,29 @@ static void svm_range_free(struct svm_range *prange, bool do_unmap)
 {
        uint64_t size = (prange->last - prange->start + 1) << PAGE_SHIFT;
        struct kfd_process *p = container_of(prange->svms, struct kfd_process, svms);
+       uint32_t gpuidx;
 
        pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms, prange,
                 prange->start, prange->last);
 
        svm_range_vram_node_free(prange);
-       svm_range_free_dma_mappings(prange, do_unmap);
+       if (do_unmap)
+               svm_range_dma_unmap(prange);
 
        if (do_unmap && !p->xnack_enabled) {
                pr_debug("unreserve prange 0x%p size: 0x%llx\n", prange, size);
                amdgpu_amdkfd_unreserve_mem_limit(NULL, size,
                                        KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, 0);
        }
+
+       /* free dma_addr array for each gpu */
+       for (gpuidx = 0; gpuidx < MAX_GPU_INSTANCE; gpuidx++) {
+               if (prange->dma_addr[gpuidx]) {
+                       kvfree(prange->dma_addr[gpuidx]);
+                       prange->dma_addr[gpuidx] = NULL;
+               }
+       }
+
        mutex_destroy(&prange->lock);
        mutex_destroy(&prange->migrate_mutex);
        kfree(prange);
@@ -340,6 +353,7 @@ svm_range *svm_range_new(struct svm_range_list *svms, uint64_t start,
        INIT_LIST_HEAD(&prange->child_list);
        atomic_set(&prange->invalid, 0);
        prange->validate_timestamp = 0;
+       prange->vram_pages = 0;
        mutex_init(&prange->migrate_mutex);
        mutex_init(&prange->lock);
 
@@ -386,6 +400,8 @@ static void svm_range_bo_release(struct kref *kref)
                         prange->start, prange->last);
                mutex_lock(&prange->lock);
                prange->svm_bo = NULL;
+               /* prange should not hold any vram pages now */
+               WARN_ON(prange->actual_loc);
                mutex_unlock(&prange->lock);
 
                spin_lock(&svm_bo->list_lock);
@@ -495,11 +511,11 @@ svm_range_validate_svm_bo(struct kfd_node *node, struct svm_range *prange)
 
        /* We need a new svm_bo. Spin-loop to wait for concurrent
         * svm_range_bo_release to finish removing this range from
-        * its range list. After this, it is safe to reuse the
-        * svm_bo pointer and svm_bo_list head.
+        * its range list and set prange->svm_bo to null. After this,
+        * it is safe to reuse the svm_bo pointer and svm_bo_list head.
         */
-       while (!list_empty_careful(&prange->svm_bo_list))
-               ;
+       while (!list_empty_careful(&prange->svm_bo_list) || prange->svm_bo)
+               cond_resched();
 
        return false;
 }
@@ -628,8 +644,15 @@ create_bo_failed:
 
 void svm_range_vram_node_free(struct svm_range *prange)
 {
-       svm_range_bo_unref(prange->svm_bo);
-       prange->ttm_res = NULL;
+       /* serialize prange->svm_bo unref */
+       mutex_lock(&prange->lock);
+       /* prange->svm_bo has not been unreferenced yet */
+       if (prange->ttm_res) {
+               prange->ttm_res = NULL;
+               mutex_unlock(&prange->lock);
+               svm_range_bo_unref(prange->svm_bo);
+       } else
+               mutex_unlock(&prange->lock);
 }
 
 struct kfd_node *
@@ -820,7 +843,7 @@ svm_range_is_same_attrs(struct kfd_process *p, struct svm_range *prange,
                }
        }
 
-       return !prange->is_error_flag;
+       return true;
 }
 
 /**
@@ -959,6 +982,11 @@ svm_range_split_nodes(struct svm_range *new, struct svm_range *old,
        new->svm_bo = svm_range_bo_ref(old->svm_bo);
        new->ttm_res = old->ttm_res;
 
+       /* set new's vram_pages to the old range's for now; the accurate
+        * vram_pages will be updated during mapping
+        */
+       new->vram_pages = min(old->vram_pages, new->npages);
+
        spin_lock(&new->svm_bo->list_lock);
        list_add(&new->svm_bo_list, &new->svm_bo->range_list);
        spin_unlock(&new->svm_bo->list_lock);
@@ -1189,14 +1217,15 @@ svm_range_get_pte_flags(struct kfd_node *node,
        uint32_t mapping_flags = 0;
        uint64_t pte_flags;
        bool snoop = (domain != SVM_RANGE_VRAM_DOMAIN);
-       bool coherent = flags & KFD_IOCTL_SVM_FLAG_COHERENT;
+       bool coherent = flags & (KFD_IOCTL_SVM_FLAG_COHERENT | KFD_IOCTL_SVM_FLAG_EXT_COHERENT);
+       bool ext_coherent = flags & KFD_IOCTL_SVM_FLAG_EXT_COHERENT;
        bool uncached = false; /*flags & KFD_IOCTL_SVM_FLAG_UNCACHED;*/
        unsigned int mtype_local;
 
        if (domain == SVM_RANGE_VRAM_DOMAIN)
                bo_node = prange->svm_bo->node;
 
-       switch (node->adev->ip_versions[GC_HWIP][0]) {
+       switch (amdgpu_ip_version(node->adev, GC_HWIP, 0)) {
        case IP_VERSION(9, 4, 1):
                if (domain == SVM_RANGE_VRAM_DOMAIN) {
                        if (bo_node == node) {
@@ -1233,7 +1262,8 @@ svm_range_get_pte_flags(struct kfd_node *node,
                break;
        case IP_VERSION(9, 4, 3):
                mtype_local = amdgpu_mtype_local == 1 ? AMDGPU_VM_MTYPE_NC :
-                            (amdgpu_mtype_local == 2 ? AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_RW);
+                             (amdgpu_mtype_local == 2 || ext_coherent ?
+                                       AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_RW);
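+               /* ext_coherent uses CC for local memory unless NC is forced */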
                snoop = true;
                if (uncached) {
                        mapping_flags |= AMDGPU_VM_MTYPE_UC;
@@ -1242,10 +1272,12 @@ svm_range_get_pte_flags(struct kfd_node *node,
                        if (bo_node->adev == node->adev &&
                            (!bo_node->xcp || !node->xcp || bo_node->xcp->mem_id == node->xcp->mem_id))
                                mapping_flags |= mtype_local;
-                       /* local HBM region far from partition or remote XGMI GPU */
-                       else if (svm_nodes_in_same_hive(bo_node, node))
+                       /* local HBM region far from partition or remote XGMI GPU
+                        * with regular system scope coherence
+                        */
+                       else if (svm_nodes_in_same_hive(bo_node, node) && !ext_coherent)
                                mapping_flags |= AMDGPU_VM_MTYPE_NC;
-                       /* PCIe P2P */
+                       /* PCIe P2P or extended system scope coherence */
                        else
                                mapping_flags |= AMDGPU_VM_MTYPE_UC;
                /* system memory accessed by the APU */
@@ -1592,6 +1624,7 @@ static int svm_range_validate_and_map(struct mm_struct *mm,
        struct svm_validate_context *ctx;
        unsigned long start, end, addr;
        struct kfd_process *p;
+       uint64_t vram_pages;
        void *owner;
        int32_t idx;
        int r = 0;
@@ -1660,75 +1693,85 @@ static int svm_range_validate_and_map(struct mm_struct *mm,
                }
        }
 
+       vram_pages = 0;
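+       /* vram_pages accumulates the per-vma counts gathered below */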
        start = prange->start << PAGE_SHIFT;
        end = (prange->last + 1) << PAGE_SHIFT;
-       for (addr = start; addr < end && !r; ) {
+       for (addr = start; !r && addr < end; ) {
                struct hmm_range *hmm_range;
                struct vm_area_struct *vma;
-               unsigned long next;
+               uint64_t vram_pages_vma;
+               unsigned long next = 0;
                unsigned long offset;
                unsigned long npages;
                bool readonly;
 
                vma = vma_lookup(mm, addr);
-               if (!vma) {
+               if (vma) {
+                       readonly = !(vma->vm_flags & VM_WRITE);
+
+                       next = min(vma->vm_end, end);
+                       npages = (next - addr) >> PAGE_SHIFT;
+                       WRITE_ONCE(p->svms.faulting_task, current);
+                       r = amdgpu_hmm_range_get_pages(&prange->notifier, addr, npages,
+                                                      readonly, owner, NULL,
+                                                      &hmm_range);
+                       WRITE_ONCE(p->svms.faulting_task, NULL);
+                       if (r) {
+                               pr_debug("failed %d to get svm range pages\n", r);
+                               if (r == -EBUSY)
+                                       r = -EAGAIN;
+                       }
+               } else {
                        r = -EFAULT;
-                       goto unreserve_out;
-               }
-               readonly = !(vma->vm_flags & VM_WRITE);
-
-               next = min(vma->vm_end, end);
-               npages = (next - addr) >> PAGE_SHIFT;
-               WRITE_ONCE(p->svms.faulting_task, current);
-               r = amdgpu_hmm_range_get_pages(&prange->notifier, addr, npages,
-                                              readonly, owner, NULL,
-                                              &hmm_range);
-               WRITE_ONCE(p->svms.faulting_task, NULL);
-               if (r) {
-                       pr_debug("failed %d to get svm range pages\n", r);
-                       if (r == -EBUSY)
-                               r = -EAGAIN;
-                       goto unreserve_out;
                }
 
-               offset = (addr - start) >> PAGE_SHIFT;
-               r = svm_range_dma_map(prange, ctx->bitmap, offset, npages,
-                                     hmm_range->hmm_pfns);
-               if (r) {
-                       pr_debug("failed %d to dma map range\n", r);
-                       goto unreserve_out;
+               if (!r) {
+                       offset = (addr - start) >> PAGE_SHIFT;
+                       r = svm_range_dma_map(prange, ctx->bitmap, offset, npages,
+                                             hmm_range->hmm_pfns, &vram_pages_vma);
+                       if (r)
+                               pr_debug("failed %d to dma map range\n", r);
+                       else
+                               vram_pages += vram_pages_vma;
                }
 
                svm_range_lock(prange);
-               if (amdgpu_hmm_range_get_pages_done(hmm_range)) {
+               if (!r && amdgpu_hmm_range_get_pages_done(hmm_range)) {
                        pr_debug("hmm update the range, need validate again\n");
                        r = -EAGAIN;
-                       goto unlock_out;
                }
-               if (!list_empty(&prange->child_list)) {
+
+               if (!r && !list_empty(&prange->child_list)) {
                        pr_debug("range split by unmap in parallel, validate again\n");
                        r = -EAGAIN;
-                       goto unlock_out;
                }
 
-               r = svm_range_map_to_gpus(prange, offset, npages, readonly,
-                                         ctx->bitmap, wait, flush_tlb);
+               if (!r)
+                       r = svm_range_map_to_gpus(prange, offset, npages, readonly,
+                                                 ctx->bitmap, wait, flush_tlb);
+
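+               /* mark the range mapped only after the last vma chunk succeeds */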
+               if (!r && next == end)
+                       prange->mapped_to_gpu = true;
 
-unlock_out:
                svm_range_unlock(prange);
 
                addr = next;
        }
 
        if (addr == end) {
-               prange->validated_once = true;
-               prange->mapped_to_gpu = true;
+               prange->vram_pages = vram_pages;
+
+               /* if prange does not include any vram page and it
+                * has not released its svm_bo yet, drop the svm_bo
+                * reference and set its actual_loc to system ram
+                */
+               if (!vram_pages && prange->ttm_res) {
+                       prange->actual_loc = 0;
+                       svm_range_vram_node_free(prange);
+               }
        }
 
-unreserve_out:
        svm_range_unreserve_bos(ctx);
-
-       prange->is_error_flag = !!r;
        if (!r)
                prange->validate_timestamp = ktime_get_boottime();
 
@@ -1980,6 +2023,7 @@ static struct svm_range *svm_range_clone(struct svm_range *old)
        new->actual_loc = old->actual_loc;
        new->granularity = old->granularity;
        new->mapped_to_gpu = old->mapped_to_gpu;
+       new->vram_pages = old->vram_pages;
        bitmap_copy(new->bitmap_access, old->bitmap_access, MAX_GPU_INSTANCE);
        bitmap_copy(new->bitmap_aip, old->bitmap_aip, MAX_GPU_INSTANCE);
 
@@ -2097,7 +2141,8 @@ svm_range_add(struct kfd_process *p, uint64_t start, uint64_t size,
                next = interval_tree_iter_next(node, start, last);
                next_start = min(node->last, last) + 1;
 
-               if (svm_range_is_same_attrs(p, prange, nattr, attrs)) {
+               if (svm_range_is_same_attrs(p, prange, nattr, attrs) &&
+                   prange->mapped_to_gpu) {
                        /* nothing to do */
                } else if (node->start < start || node->last > last) {
                        /* node intersects the update range and its attributes
@@ -2884,6 +2929,7 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
                        uint32_t vmid, uint32_t node_id,
                        uint64_t addr, bool write_fault)
 {
+       unsigned long start, last, size;
        struct mm_struct *mm = NULL;
        struct svm_range_list *svms;
        struct svm_range *prange;
@@ -3019,32 +3065,35 @@ retry_write_locked:
        kfd_smi_event_page_fault_start(node, p->lead_thread->pid, addr,
                                       write_fault, timestamp);
 
-       if (prange->actual_loc != best_loc) {
+       if (prange->actual_loc != 0 || best_loc != 0) {
                migration = true;
+               /* Align migration range start and size to granularity size */
+               size = 1UL << prange->granularity;
+               start = max_t(unsigned long, ALIGN_DOWN(addr, size), prange->start);
+               last = min_t(unsigned long, ALIGN(addr + 1, size) - 1, prange->last);
+
                if (best_loc) {
-                       r = svm_migrate_to_vram(prange, best_loc, mm,
-                                       KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU);
+                       r = svm_migrate_to_vram(prange, best_loc, start, last,
+                                       mm, KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU);
                        if (r) {
                                pr_debug("svm_migrate_to_vram failed (%d) at %llx, falling back to system memory\n",
                                         r, addr);
                                /* Fallback to system memory if migration to
                                 * VRAM failed
                                 */
-                               if (prange->actual_loc)
-                                       r = svm_migrate_vram_to_ram(prange, mm,
-                                          KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU,
-                                          NULL);
+                               if (prange->actual_loc && prange->actual_loc != best_loc)
+                                       r = svm_migrate_vram_to_ram(prange, mm, start, last,
+                                               KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU, NULL);
                                else
                                        r = 0;
                        }
                } else {
-                       r = svm_migrate_vram_to_ram(prange, mm,
-                                       KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU,
-                                       NULL);
+                       r = svm_migrate_vram_to_ram(prange, mm, start, last,
+                                       KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU, NULL);
                }
                if (r) {
                        pr_debug("failed %d to migrate svms %p [0x%lx 0x%lx]\n",
-                                r, svms, prange->start, prange->last);
+                                r, svms, start, last);
                        goto out_unlock_range;
                }
        }
@@ -3398,18 +3447,24 @@ svm_range_trigger_migration(struct mm_struct *mm, struct svm_range *prange,
        *migrated = false;
        best_loc = svm_range_best_prefetch_location(prange);
 
-       if (best_loc == KFD_IOCTL_SVM_LOCATION_UNDEFINED ||
-           best_loc == prange->actual_loc)
+       /* when best_loc is a gpu node and the same as prange->actual_loc,
+        * we still need to do the migration, as prange->actual_loc != 0
+        * does not mean all pages in prange are in vram. hmm migrate will
+        * pick up the right pages during migration.
+        */
+       if ((best_loc == KFD_IOCTL_SVM_LOCATION_UNDEFINED) ||
+           (best_loc == 0 && prange->actual_loc == 0))
                return 0;
 
        if (!best_loc) {
-               r = svm_migrate_vram_to_ram(prange, mm,
+               r = svm_migrate_vram_to_ram(prange, mm, prange->start, prange->last,
                                        KFD_MIGRATE_TRIGGER_PREFETCH, NULL);
                *migrated = !r;
                return r;
        }
 
-       r = svm_migrate_to_vram(prange, best_loc, mm, KFD_MIGRATE_TRIGGER_PREFETCH);
+       r = svm_migrate_to_vram(prange, best_loc, prange->start, prange->last,
+                               mm, KFD_MIGRATE_TRIGGER_PREFETCH);
        *migrated = !r;
 
        return r;
@@ -3464,7 +3519,11 @@ static void svm_range_evict_svm_bo_worker(struct work_struct *work)
 
                mutex_lock(&prange->migrate_mutex);
                do {
+                       /* migrate all vram pages in this prange to system
+                        * ram; after that, prange->actual_loc should be zero
+                        */
                        r = svm_migrate_vram_to_ram(prange, mm,
+                                       prange->start, prange->last,
                                        KFD_MIGRATE_TRIGGER_TTM_EVICTION, NULL);
                } while (!r && prange->actual_loc && --retries);
 
@@ -3507,7 +3566,7 @@ svm_range_set_attr(struct kfd_process *p, struct mm_struct *mm,
        struct svm_range *next;
        bool update_mapping = false;
        bool flush_tlb;
-       int r = 0;
+       int r, ret = 0;
 
        pr_debug("pasid 0x%x svms 0x%p [0x%llx 0x%llx] pages 0x%llx\n",
                 p->pasid, &p->svms, start, start + size - 1, size);
@@ -3595,7 +3654,7 @@ svm_range_set_attr(struct kfd_process *p, struct mm_struct *mm,
 out_unlock_range:
                mutex_unlock(&prange->migrate_mutex);
                if (r)
-                       break;
+                       ret = r;
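+                       /* note the error but keep processing remaining pranges */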
        }
 
        dynamic_svm_range_dump(svms);
@@ -3608,7 +3667,7 @@ out:
        pr_debug("pasid 0x%x svms 0x%p [0x%llx 0x%llx] done, r=%d\n", p->pasid,
                 &p->svms, start, start + size - 1, r);
 
-       return r;
+       return ret ? ret : r;
 }
 
 static int