drm/amdkfd: add xnack enabled flag to kfd_process
author: Alex Sierra <alex.sierra@amd.com>
Thu, 28 May 2020 23:03:15 +0000 (18:03 -0500)
committer: Alex Deucher <alexander.deucher@amd.com>
Wed, 21 Apr 2021 01:47:41 +0000 (21:47 -0400)
XNACK mode controls the SQ RETRY_DISABLE setting, which determines
whether recoverable page faults can be supported on GFXv9 hardware.
Only on Aldebaran can we support different processes running with
different XNACK modes. On older chips all processes must use the same
RETRY_DISABLE setting. However, processes not relying on recoverable
page faults can work with RETRY enabled. This means XNACK off is always
available as a fallback so we can use the same mode on all GPUs in a
process.

Signed-off-by: Alex Sierra <alex.sierra@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c
drivers/gpu/drm/amd/amdkfd/kfd_priv.h
drivers/gpu/drm/amd/amdkfd/kfd_process.c

index eca6331..b5c3d13 100644 (file)
@@ -61,10 +61,19 @@ static int update_qpd_v9(struct device_queue_manager *dqm,
                qpd->sh_mem_config =
                                SH_MEM_ALIGNMENT_MODE_UNALIGNED <<
                                        SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT;
-               if (dqm->dev->noretry &&
-                   !dqm->dev->use_iommu_v2)
+
+               if (dqm->dev->device_info->asic_family == CHIP_ALDEBARAN) {
+                       /* Aldebaran can safely support different XNACK modes
+                        * per process
+                        */
+                       if (!pdd->process->xnack_enabled)
+                               qpd->sh_mem_config |=
+                                       1 << SH_MEM_CONFIG__RETRY_DISABLE__SHIFT;
+               } else if (dqm->dev->noretry &&
+                          !dqm->dev->use_iommu_v2) {
                        qpd->sh_mem_config |=
                                1 << SH_MEM_CONFIG__RETRY_DISABLE__SHIFT;
+               }
 
                qpd->sh_mem_ape1_limit = 0;
                qpd->sh_mem_ape1_base = 0;
index 81f71c4..59423c8 100644 (file)
@@ -824,6 +824,8 @@ struct kfd_process {
        /* shared virtual memory registered by this process */
        struct svm_range_list svms;
        bool svm_disabled;
+
+       bool xnack_enabled;
 };
 
 #define KFD_PROCESS_TABLE_SIZE 5 /* bits: 32 entries */
@@ -883,6 +885,8 @@ struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev,
 struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
                                                        struct kfd_process *p);
 
+bool kfd_process_xnack_mode(struct kfd_process *p, bool supported);
+
 int kfd_reserved_mem_mmap(struct kfd_dev *dev, struct kfd_process *process,
                          struct vm_area_struct *vma);
 
index 3c72e9d..b8db509 100644 (file)
@@ -1193,6 +1193,56 @@ void kfd_process_set_trap_handler(struct qcm_process_device *qpd,
        }
 }
 
+bool kfd_process_xnack_mode(struct kfd_process *p, bool supported)
+{
+       int i;
+
+       /* Decide whether process @p can run with XNACK enabled by
+        * checking every device in p->pdds[0 .. p->n_pdds - 1].
+        *
+        * @supported: when true, Aldebaran devices are skipped and can
+        * therefore never veto XNACK; when false, they go through the
+        * same CHIP_NAVI10 and dev->noretry checks as every other
+        * GFXv9+ GPU.
+        *
+        * Returns false as soon as a considered device has an
+        * asic_family >= CHIP_NAVI10 or has dev->noretry set. Returns
+        * true otherwise, including when no device is considered at
+        * all (no PDDs, or only GPUs older than CHIP_VEGA10).
+        *
+        * On most GFXv9 GPUs, the retry mode in the SQ must match the
+        * boot time retry setting. Mixing processes with different
+        * XNACK/retry settings can hang the GPU.
+        *
+        * Different GPUs can have different noretry settings depending
+        * on HW bugs or limitations. We need to find at least one
+        * XNACK mode for this process that's compatible with all GPUs.
+        * Fortunately GPUs with retry enabled (noretry=0) can run code
+        * built for XNACK-off. On GFXv9 it may perform slower.
+        *
+        * Therefore applications built for XNACK-off can always be
+        * supported and will be our fallback if any GPU does not
+        * support retry.
+        */
+       for (i = 0; i < p->n_pdds; i++) {
+               struct kfd_dev *dev = p->pdds[i]->dev;
+
+               /* Only consider GFXv9 and higher GPUs. Older GPUs don't
+                * support the SVM APIs and don't need to be considered
+                * for the XNACK mode selection.
+                */
+               if (dev->device_info->asic_family < CHIP_VEGA10)
+                       continue;
+               /* Aldebaran can always support XNACK because it can support
+                * per-process XNACK mode selection. But let the dev->noretry
+                * setting still influence the default XNACK mode.
+                */
+               if (supported &&
+                   dev->device_info->asic_family == CHIP_ALDEBARAN)
+                       continue;
+
+               /* GFXv10 and later GPUs do not support shader preemption
+                * during page faults. This can lead to poor QoS for queue
+                * management and memory-manager-related preemptions or
+                * even deadlocks.
+                */
+               if (dev->device_info->asic_family >= CHIP_NAVI10)
+                       return false;
+
+               if (dev->noretry)
+                       return false;
+       }
+
+       return true;
+}
+
 /*
  * On return the kfd_process is fully operational and will be freed when the
  * mm is released
@@ -1232,6 +1282,9 @@ static struct kfd_process *create_process(const struct task_struct *thread)
        if (err != 0)
                goto err_init_apertures;
 
+       /* Check XNACK support after PDDs are created in kfd_init_apertures */
+       process->xnack_enabled = kfd_process_xnack_mode(process, false);
+
        err = svm_range_list_init(process);
        if (err)
                goto err_init_svm_range_list;