drm/amdgpu: address remove from fault filter

author Philip Yang <Philip.Yang@amd.com>

Tue, 20 Apr 2021 14:05:44 +0000 (10:05 -0400)

committer Alex Deucher <alexander.deucher@amd.com>

Thu, 29 Apr 2021 03:36:05 +0000 (23:36 -0400)
author Philip Yang <Philip.Yang@amd.com>
Tue, 20 Apr 2021 14:05:44 +0000 (10:05 -0400)
committer Alex Deucher <alexander.deucher@amd.com>
Thu, 29 Apr 2021 03:36:05 +0000 (23:36 -0400)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c

index c39ed9e..dfa67c2 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -332,6 +332,17 @@ void amdgpu_gmc_agp_location(struct amdgpu_device *adev, struct amdgpu_gmc *mc)
                         mc->agp_size >> 20, mc->agp_start, mc->agp_end);
  }
  
+/**
+ * amdgpu_gmc_fault_key - get hash key from vm fault address and pasid
+ *
+ * @addr: 48 bit physical address, page aligned (36 significant bits)
+ * @pasid: 16 bit process address space identifier
+ */
+static inline uint64_t amdgpu_gmc_fault_key(uint64_t addr, uint16_t pasid)
+{
+       return addr << 4 | pasid;
+}
+
  /**
   * amdgpu_gmc_filter_faults - filter VM faults
   *
@@ -348,8 +359,7 @@ bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr,
                               uint16_t pasid, uint64_t timestamp)
  {
         struct amdgpu_gmc *gmc = &adev->gmc;
-
-       uint64_t stamp, key = addr << 4 | pasid;
+       uint64_t stamp, key = amdgpu_gmc_fault_key(addr, pasid);
         struct amdgpu_gmc_fault *fault;
         uint32_t hash;
  
@@ -365,7 +375,7 @@ bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr,
         while (fault->timestamp >= stamp) {
                 uint64_t tmp;
  
-               if (fault->key == key)
+               if (atomic64_read(&fault->key) == key)
                         return true;
  
                 tmp = fault->timestamp;
@@ -378,7 +388,7 @@ bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr,
  
         /* Add the fault to the ring */
         fault = &gmc->fault_ring[gmc->last_fault];
-       fault->key = key;
+       atomic64_set(&fault->key, key);
         fault->timestamp = timestamp;
  
         /* And update the hash */
@@ -387,6 +397,36 @@ bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr,
         return false;
  }
  
+/**
+ * amdgpu_gmc_filter_faults_remove - remove address from VM faults filter
+ *
+ * @adev: amdgpu device structure
+ * @addr: address of the VM fault
+ * @pasid: PASID of the process causing the fault
+ *
+ * Remove the address from the fault filter so that future VM faults on this
+ * address are passed on to the retry fault handler for recovery.
+ */
+void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev, uint64_t addr,
+                                    uint16_t pasid)
+{
+       struct amdgpu_gmc *gmc = &adev->gmc;
+       uint64_t key = amdgpu_gmc_fault_key(addr, pasid);
+       struct amdgpu_gmc_fault *fault;
+       uint32_t hash;
+       uint64_t tmp;
+
+       hash = hash_64(key, AMDGPU_GMC_FAULT_HASH_ORDER);
+       fault = &gmc->fault_ring[gmc->fault_hash[hash].idx];
+       do {
+               if (atomic64_cmpxchg(&fault->key, key, 0) == key)
+                       break;
+
+               tmp = fault->timestamp;
+               fault = &gmc->fault_ring[fault->next];
+       } while (fault->timestamp < tmp);
+}
+
  int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev)
  {
         int r;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h

index 9d11c02..6aa1d52 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
@@ -66,9 +66,9 @@ struct firmware;
   * GMC page fault information
   */
  struct amdgpu_gmc_fault {
-       uint64_t        timestamp;
+       uint64_t        timestamp:48;
         uint64_t        next:AMDGPU_GMC_FAULT_RING_ORDER;
-       uint64_t        key:52;
+       atomic64_t      key;
  };
  
  /*
@@ -318,6 +318,8 @@ void amdgpu_gmc_agp_location(struct amdgpu_device *adev,
                              struct amdgpu_gmc *mc);
  bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr,
                               uint16_t pasid, uint64_t timestamp);
+void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev, uint64_t addr,
+                                    uint16_t pasid);
  int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev);
  void amdgpu_gmc_ras_fini(struct amdgpu_device *adev);
  int amdgpu_gmc_allocate_vm_inv_eng(struct amdgpu_device *adev);
author	Philip Yang <Philip.Yang@amd.com>
	Tue, 20 Apr 2021 14:05:44 +0000 (10:05 -0400)
committer	Alex Deucher <alexander.deucher@amd.com>
	Thu, 29 Apr 2021 03:36:05 +0000 (23:36 -0400)
drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c		patch \| blob \| history
drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h		patch \| blob \| history