drm/amdgpu: skip printing vram_lost if needed
author Trigger Huang <Trigger.Huang@amd.com>
Mon, 19 Aug 2024 07:53:22 +0000 (15:53 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Thu, 29 Aug 2024 17:38:53 +0000 (13:38 -0400)
The VRAM lost status can only be obtained after a GPU reset occurs, but
sometimes a dev core dump can happen before the GPU reset. So a new
argument is added to tell the dev core dump implementation whether to
skip printing the vram_lost status in the dump.
This patch also decouples the core dump function from the GPU reset
function by replacing the amdgpu_reset_context argument with an
amdgpu_job argument that specifies the context for the core dump.

V2: Inform user if VRAM lost check is skipped so users don't assume
VRAM wasn't lost (Alex)

Signed-off-by: Trigger Huang <Trigger.Huang@amd.com>
Suggested-by: Alex Deucher <alexander.deucher@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c
drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

index cf2b4dd..5ac59b6 100644 (file)
@@ -28,8 +28,8 @@
 #include "atom.h"
 
 #ifndef CONFIG_DEV_COREDUMP
-void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
-                    struct amdgpu_reset_context *reset_context)
+void amdgpu_coredump(struct amdgpu_device *adev, bool skip_vram_check,
+                    bool vram_lost, struct amdgpu_job *job)
 {
 }
 #else
@@ -315,7 +315,9 @@ amdgpu_devcoredump_read(char *buffer, loff_t offset, size_t count,
                }
        }
 
-       if (coredump->reset_vram_lost)
+       if (coredump->skip_vram_check)
+               drm_printf(&p, "VRAM lost check is skipped!\n");
+       else if (coredump->reset_vram_lost)
                drm_printf(&p, "VRAM is lost due to GPU reset!\n");
 
        return count - iter.remain;
@@ -326,12 +328,11 @@ static void amdgpu_devcoredump_free(void *data)
        kfree(data);
 }
 
-void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
-                    struct amdgpu_reset_context *reset_context)
+void amdgpu_coredump(struct amdgpu_device *adev, bool skip_vram_check,
+                    bool vram_lost, struct amdgpu_job *job)
 {
-       struct amdgpu_coredump_info *coredump;
        struct drm_device *dev = adev_to_drm(adev);
-       struct amdgpu_job *job = reset_context->job;
+       struct amdgpu_coredump_info *coredump;
        struct drm_sched_job *s_job;
 
        coredump = kzalloc(sizeof(*coredump), GFP_NOWAIT);
@@ -341,11 +342,12 @@ void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
                return;
        }
 
+       coredump->skip_vram_check = skip_vram_check;
        coredump->reset_vram_lost = vram_lost;
 
-       if (reset_context->job && reset_context->job->vm) {
+       if (job && job->vm) {
+               struct amdgpu_vm *vm = job->vm;
                struct amdgpu_task_info *ti;
-               struct amdgpu_vm *vm = reset_context->job->vm;
 
                ti = amdgpu_vm_get_task_info_vm(vm);
                if (ti) {
index 5245951..ef9772c 100644 (file)
@@ -26,7 +26,6 @@
 #define __AMDGPU_DEV_COREDUMP_H__
 
 #include "amdgpu.h"
-#include "amdgpu_reset.h"
 
 #ifdef CONFIG_DEV_COREDUMP
 
@@ -36,12 +35,12 @@ struct amdgpu_coredump_info {
        struct amdgpu_device            *adev;
        struct amdgpu_task_info         reset_task_info;
        struct timespec64               reset_time;
+       bool                            skip_vram_check;
        bool                            reset_vram_lost;
        struct amdgpu_ring              *ring;
 };
 #endif
 
-void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
-                    struct amdgpu_reset_context *reset_context);
-
+void amdgpu_coredump(struct amdgpu_device *adev, bool skip_vram_check,
+                    bool vram_lost, struct amdgpu_job *job);
 #endif
index 49ef22d..45edf99 100644 (file)
@@ -5489,7 +5489,7 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
                                vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
 
                                if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags))
-                                       amdgpu_coredump(tmp_adev, vram_lost, reset_context);
+                                       amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job);
 
                                if (vram_lost) {
                                        DRM_INFO("VRAM is lost due to GPU reset!\n");