drm/xe/uapi: support pat_index selection with vm_bind

author Matthew Auld <matthew.auld@intel.com>

Mon, 25 Sep 2023 11:42:18 +0000 (12:42 +0100)

committer Rodrigo Vivi <rodrigo.vivi@intel.com>

Thu, 21 Dec 2023 16:45:07 +0000 (11:45 -0500)
author Matthew Auld <matthew.auld@intel.com>
Mon, 25 Sep 2023 11:42:18 +0000 (12:42 +0100)
committer Rodrigo Vivi <rodrigo.vivi@intel.com>
Thu, 21 Dec 2023 16:45:07 +0000 (11:45 -0500)
diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c

index c6c9b72..3b48531 100644 (file)
--- a/drivers/gpu/drm/xe/xe_pt.c
+++ b/drivers/gpu/drm/xe/xe_pt.c
@@ -290,8 +290,6 @@ struct xe_pt_stage_bind_walk {
         struct xe_vm *vm;
         /** @tile: The tile we're building for. */
         struct xe_tile *tile;
-       /** @cache: Desired cache level for the ptes */
-       enum xe_cache_level cache;
         /** @default_pte: PTE flag only template. No address is associated */
         u64 default_pte;
         /** @dma_offset: DMA offset to add to the PTE. */
@@ -511,7 +509,7 @@ xe_pt_stage_bind_entry(struct xe_ptw *parent, pgoff_t offset,
  {
         struct xe_pt_stage_bind_walk *xe_walk =
                 container_of(walk, typeof(*xe_walk), base);
-       u16 pat_index = tile_to_xe(xe_walk->tile)->pat.idx[xe_walk->cache];
+       u16 pat_index = xe_walk->vma->pat_index;
         struct xe_pt *xe_parent = container_of(parent, typeof(*xe_parent), base);
         struct xe_vm *vm = xe_walk->vm;
         struct xe_pt *xe_child;
@@ -657,13 +655,8 @@ xe_pt_stage_bind(struct xe_tile *tile, struct xe_vma *vma,
         if (is_devmem) {
                 xe_walk.default_pte |= XE_PPGTT_PTE_DM;
                 xe_walk.dma_offset = vram_region_gpu_offset(bo->ttm.resource);
-               xe_walk.cache = XE_CACHE_WB;
-       } else {
-               if (!xe_vma_has_no_bo(vma) && bo->flags & XE_BO_SCANOUT_BIT)
-                       xe_walk.cache = XE_CACHE_WT;
-               else
-                       xe_walk.cache = XE_CACHE_WB;
         }
+
         if (!xe_vma_has_no_bo(vma) && xe_bo_is_stolen(bo))
                 xe_walk.dma_offset = xe_ttm_stolen_gpu_offset(xe_bo_device(bo));
  
diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c

index c33ae4d..a97a310 100644 (file)
--- a/drivers/gpu/drm/xe/xe_vm.c
+++ b/drivers/gpu/drm/xe/xe_vm.c
@@ -6,6 +6,7 @@
  #include "xe_vm.h"
  
  #include <linux/dma-fence-array.h>
+#include <linux/nospec.h>
  
  #include <drm/drm_exec.h>
  #include <drm/drm_print.h>
@@ -26,6 +27,7 @@
  #include "xe_gt_pagefault.h"
  #include "xe_gt_tlb_invalidation.h"
  #include "xe_migrate.h"
+#include "xe_pat.h"
  #include "xe_pm.h"
  #include "xe_preempt_fence.h"
  #include "xe_pt.h"
@@ -868,7 +870,8 @@ static struct xe_vma *xe_vma_create(struct xe_vm *vm,
                                     u64 start, u64 end,
                                     bool read_only,
                                     bool is_null,
-                                   u8 tile_mask)
+                                   u8 tile_mask,
+                                   u16 pat_index)
  {
         struct xe_vma *vma;
         struct xe_tile *tile;
@@ -910,6 +913,8 @@ static struct xe_vma *xe_vma_create(struct xe_vm *vm,
         if (GRAPHICS_VER(vm->xe) >= 20 || vm->xe->info.platform == XE_PVC)
                 vma->gpuva.flags |= XE_VMA_ATOMIC_PTE_BIT;
  
+       vma->pat_index = pat_index;
+
         if (bo) {
                 struct drm_gpuvm_bo *vm_bo;
  
@@ -2162,7 +2167,7 @@ static struct drm_gpuva_ops *
  vm_bind_ioctl_ops_create(struct xe_vm *vm, struct xe_bo *bo,
                          u64 bo_offset_or_userptr, u64 addr, u64 range,
                          u32 operation, u32 flags, u8 tile_mask,
-                        u32 prefetch_region)
+                        u32 prefetch_region, u16 pat_index)
  {
         struct drm_gem_object *obj = bo ? &bo->ttm.base : NULL;
         struct drm_gpuva_ops *ops;
@@ -2231,6 +2236,7 @@ vm_bind_ioctl_ops_create(struct xe_vm *vm, struct xe_bo *bo,
                         op->map.read_only =
                                 flags & DRM_XE_VM_BIND_FLAG_READONLY;
                         op->map.is_null = flags & DRM_XE_VM_BIND_FLAG_NULL;
+                       op->map.pat_index = pat_index;
                 } else if (__op->op == DRM_GPUVA_OP_PREFETCH) {
                         op->prefetch.region = prefetch_region;
                 }
@@ -2242,7 +2248,8 @@ vm_bind_ioctl_ops_create(struct xe_vm *vm, struct xe_bo *bo,
  }
  
  static struct xe_vma *new_vma(struct xe_vm *vm, struct drm_gpuva_op_map *op,
-                             u8 tile_mask, bool read_only, bool is_null)
+                             u8 tile_mask, bool read_only, bool is_null,
+                             u16 pat_index)
  {
         struct xe_bo *bo = op->gem.obj ? gem_to_xe_bo(op->gem.obj) : NULL;
         struct xe_vma *vma;
@@ -2258,7 +2265,7 @@ static struct xe_vma *new_vma(struct xe_vm *vm, struct drm_gpuva_op_map *op,
         vma = xe_vma_create(vm, bo, op->gem.offset,
                             op->va.addr, op->va.addr +
                             op->va.range - 1, read_only, is_null,
-                           tile_mask);
+                           tile_mask, pat_index);
         if (bo)
                 xe_bo_unlock(bo);
  
@@ -2404,7 +2411,7 @@ static int vm_bind_ioctl_ops_parse(struct xe_vm *vm, struct xe_exec_queue *q,
  
                         vma = new_vma(vm, &op->base.map,
                                       op->tile_mask, op->map.read_only,
-                                     op->map.is_null);
+                                     op->map.is_null, op->map.pat_index);
                         if (IS_ERR(vma))
                                 return PTR_ERR(vma);
  
@@ -2430,7 +2437,7 @@ static int vm_bind_ioctl_ops_parse(struct xe_vm *vm, struct xe_exec_queue *q,
  
                                 vma = new_vma(vm, op->base.remap.prev,
                                               op->tile_mask, read_only,
-                                             is_null);
+                                             is_null, old->pat_index);
                                 if (IS_ERR(vma))
                                         return PTR_ERR(vma);
  
@@ -2464,7 +2471,7 @@ static int vm_bind_ioctl_ops_parse(struct xe_vm *vm, struct xe_exec_queue *q,
  
                                 vma = new_vma(vm, op->base.remap.next,
                                               op->tile_mask, read_only,
-                                             is_null);
+                                             is_null, old->pat_index);
                                 if (IS_ERR(vma))
                                         return PTR_ERR(vma);
  
@@ -2862,6 +2869,26 @@ static int vm_bind_ioctl_check_args(struct xe_device *xe,
                 u64 obj_offset = (*bind_ops)[i].obj_offset;
                 u32 prefetch_region = (*bind_ops)[i].prefetch_mem_region_instance;
                 bool is_null = flags & DRM_XE_VM_BIND_FLAG_NULL;
+               u16 pat_index = (*bind_ops)[i].pat_index;
+               u16 coh_mode;
+
+               if (XE_IOCTL_DBG(xe, pat_index >= xe->pat.n_entries)) {
+                       err = -EINVAL;
+                       goto free_bind_ops;
+               }
+
+               pat_index = array_index_nospec(pat_index, xe->pat.n_entries);
+               (*bind_ops)[i].pat_index = pat_index;
+               coh_mode = xe_pat_index_get_coh_mode(xe, pat_index);
+               if (XE_IOCTL_DBG(xe, !coh_mode)) { /* hw reserved */
+                       err = -EINVAL;
+                       goto free_bind_ops;
+               }
+
+               if (XE_WARN_ON(coh_mode > XE_COH_AT_LEAST_1WAY)) {
+                       err = -EINVAL;
+                       goto free_bind_ops;
+               }
  
                 if (i == 0) {
                         *async = !!(flags & DRM_XE_VM_BIND_FLAG_ASYNC);
@@ -2892,6 +2919,8 @@ static int vm_bind_ioctl_check_args(struct xe_device *xe,
                                  op == DRM_XE_VM_BIND_OP_UNMAP_ALL) ||
                     XE_IOCTL_DBG(xe, obj &&
                                  op == DRM_XE_VM_BIND_OP_MAP_USERPTR) ||
+                   XE_IOCTL_DBG(xe, coh_mode == XE_COH_NONE &&
+                                op == DRM_XE_VM_BIND_OP_MAP_USERPTR) ||
                     XE_IOCTL_DBG(xe, obj &&
                                  op == DRM_XE_VM_BIND_OP_PREFETCH) ||
                     XE_IOCTL_DBG(xe, prefetch_region &&
@@ -3025,6 +3054,8 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
                 u64 addr = bind_ops[i].addr;
                 u32 obj = bind_ops[i].obj;
                 u64 obj_offset = bind_ops[i].obj_offset;
+               u16 pat_index = bind_ops[i].pat_index;
+               u16 coh_mode;
  
                 if (!obj)
                         continue;
@@ -3052,6 +3083,24 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
                                 goto put_obj;
                         }
                 }
+
+               coh_mode = xe_pat_index_get_coh_mode(xe, pat_index);
+               if (bos[i]->cpu_caching) {
+                       if (XE_IOCTL_DBG(xe, coh_mode == XE_COH_NONE &&
+                                        bos[i]->cpu_caching == DRM_XE_GEM_CPU_CACHING_WB)) {
+                               err = -EINVAL;
+                               goto put_obj;
+                       }
+               } else if (XE_IOCTL_DBG(xe, coh_mode == XE_COH_NONE)) {
+                       /*
+                        * Imported dma-buf from a different device should
+                        * require 1way or 2way coherency since we don't know
+                        * how it was mapped on the CPU. Just assume it is
+                        * potentially cached on CPU side.
+                        */
+                       err = -EINVAL;
+                       goto put_obj;
+               }
         }
  
         if (args->num_syncs) {
@@ -3079,10 +3128,12 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
                 u64 obj_offset = bind_ops[i].obj_offset;
                 u8 tile_mask = bind_ops[i].tile_mask;
                 u32 prefetch_region = bind_ops[i].prefetch_mem_region_instance;
+               u16 pat_index = bind_ops[i].pat_index;
  
                 ops[i] = vm_bind_ioctl_ops_create(vm, bos[i], obj_offset,
                                                   addr, range, op, flags,
-                                                 tile_mask, prefetch_region);
+                                                 tile_mask, prefetch_region,
+                                                 pat_index);
                 if (IS_ERR(ops[i])) {
                         err = PTR_ERR(ops[i]);
                         ops[i] = NULL;
diff --git a/drivers/gpu/drm/xe/xe_vm_types.h b/drivers/gpu/drm/xe/xe_vm_types.h

index fc2645e..74cdf16 100644 (file)
--- a/drivers/gpu/drm/xe/xe_vm_types.h
+++ b/drivers/gpu/drm/xe/xe_vm_types.h
@@ -110,6 +110,11 @@ struct xe_vma {
          */
         u8 tile_present;
  
+       /**
+        * @pat_index: The pat index to use when encoding the PTEs for this vma.
+        */
+       u16 pat_index;
+
         struct {
                 struct list_head rebind_link;
         } notifier;
@@ -333,6 +338,8 @@ struct xe_vma_op_map {
         bool read_only;
         /** @is_null: is NULL binding */
         bool is_null;
+       /** @pat_index: The pat index to use for this operation. */
+       u16 pat_index;
  };
  
  /** struct xe_vma_op_remap - VMA remap operation */
diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h

index ab7d1b2..1a844fa 100644 (file)
--- a/include/uapi/drm/xe_drm.h
+++ b/include/uapi/drm/xe_drm.h
@@ -636,8 +636,54 @@ struct drm_xe_vm_bind_op {
          */
         __u32 obj;
  
+       /**
+        * @pat_index: The platform defined @pat_index to use for this mapping.
+        * The index basically maps to some predefined memory attributes,
+        * including things like caching, coherency, compression etc.  The exact
+        * meaning of the pat_index is platform specific and defined in the
+        * Bspec and PRMs.  When the KMD sets up the binding the index here is
+        * encoded into the ppGTT PTE.
+        *
+        * For coherency the @pat_index needs to be at least 1way coherent when
+        * drm_xe_gem_create.cpu_caching is DRM_XE_GEM_CPU_CACHING_WB. The KMD
+        * will extract the coherency mode from the @pat_index and reject if
+        * there is a mismatch (see note below for pre-MTL platforms).
+        *
+        * Note: On pre-MTL platforms there is only a caching mode and no
+        * explicit coherency mode, but on such hardware there is always a
+        * shared-LLC (or is dgpu) so all GT memory accesses are coherent with
+        * CPU caches even with the caching mode set as uncached.  It's only the
+        * display engine that is incoherent (on dgpu it must be in VRAM which
+        * is always mapped as WC on the CPU). However to keep the uapi somewhat
+        * consistent with newer platforms the KMD groups the different cache
+        * levels into the following coherency buckets on all pre-MTL platforms:
+        *
+        *      ppGTT UC -> COH_NONE
+        *      ppGTT WC -> COH_NONE
+        *      ppGTT WT -> COH_NONE
+        *      ppGTT WB -> COH_AT_LEAST_1WAY
+        *
+        * In practice UC/WC/WT should only ever be used for scanout surfaces on
+        * such platforms (or perhaps in general for dma-buf if shared with
+        * another device) since it is only the display engine that is actually
+        * incoherent.  Everything else should typically use WB given that we
+        * have a shared-LLC.  On MTL+ this completely changes and the HW
+        * defines the coherency mode as part of the @pat_index, where
+        * incoherent GT access is possible.
+        *
+        * Note: For userptr and externally imported dma-buf the kernel expects
+        * either 1WAY or 2WAY for the @pat_index.
+        *
+        * For DRM_XE_VM_BIND_FLAG_NULL bindings there are no KMD restrictions
+        * on the @pat_index. For such mappings there is no actual memory being
+        * mapped (the address in the PTE is invalid), so the various PAT memory
+        * attributes likely do not apply.  Simply leaving as zero is one
+        * option (still a valid pat_index).
+        */
+       __u16 pat_index;
+
         /** @pad: MBZ */
-       __u32 pad;
+       __u16 pad;
  
         union {
                 /**
author	Matthew Auld <matthew.auld@intel.com>
	Mon, 25 Sep 2023 11:42:18 +0000 (12:42 +0100)
committer	Rodrigo Vivi <rodrigo.vivi@intel.com>
	Thu, 21 Dec 2023 16:45:07 +0000 (11:45 -0500)
drivers/gpu/drm/xe/xe_pt.c		patch \| blob \| history
drivers/gpu/drm/xe/xe_vm.c		patch \| blob \| history
drivers/gpu/drm/xe/xe_vm_types.h		patch \| blob \| history
include/uapi/drm/xe_drm.h		patch \| blob \| history