drm/amdgpu: Fix repeated FLR issue
author jqdeng <Emily.Deng@amd.com>
Fri, 7 Aug 2020 09:31:19 +0000 (17:31 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Tue, 18 Aug 2020 22:22:02 +0000 (18:22 -0400)
Only the case with no job running needs to do the recovery in the
FLR notification. When there is a job in the mirror list, let the
guest driver hit the job timeout and do the recovery from there
instead.

Signed-off-by: jqdeng <Emily.Deng@amd.com>
Acked-by: Nirmoy Das <nirmoy.das@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
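
Simplified, the gating logic that the hunks below add to the FLR work handlers amounts to the following (a sketch distilled from the mxgpu_nv.c change; the surrounding flr_work body is elided):

    /* Recover from the FLR notification only when the guest TDR path
     * cannot do it: either no job is pending in any scheduler mirror
     * list, or all engine timeouts are disabled (MAX_SCHEDULE_TIMEOUT)
     * so the guest job-timeout handler would never fire.  Otherwise the
     * guest driver's own job timeout triggers the recovery, avoiding a
     * second, redundant FLR.
     */
    if (amdgpu_device_should_recover_gpu(adev) &&
        (!amdgpu_device_has_job_running(adev) ||
         adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT ||
         adev->gfx_timeout == MAX_SCHEDULE_TIMEOUT ||
         adev->compute_timeout == MAX_SCHEDULE_TIMEOUT ||
         adev->video_timeout == MAX_SCHEDULE_TIMEOUT))
        amdgpu_device_gpu_recover(adev, NULL);
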
drivers/gpu/drm/amd/amdgpu/amdgpu.h
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c

index 08f80ca3b296cb92b0fd3b82ba79bfa50c78ef76..54666eea186398f94552b55cec1fbc526d0c02a7 100644 (file)
@@ -1133,6 +1133,7 @@ int emu_soc_asic_init(struct amdgpu_device *adev);
 #define amdgpu_inc_vram_lost(adev) atomic_inc(&((adev)->vram_lost_counter));
 
 /* Common functions */
+bool amdgpu_device_has_job_running(struct amdgpu_device *adev);
 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev);
 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
                              struct amdgpu_job* job);
index 415e1a32b98c2e86443447810acb06c6e8f9dc34..6573e111246245a5c179bdd9e27349e1d9020da2 100644 (file)
@@ -3922,6 +3922,34 @@ error:
        return r;
 }
 
+/**
+ * amdgpu_device_has_job_running - check if there is any job in mirror list
+ *
+ * @adev: amdgpu device pointer
+ *
+ * check if there is any job in mirror list
+ */
+bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
+{
+       int i;
+       struct drm_sched_job *job;
+
+       for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
+               struct amdgpu_ring *ring = adev->rings[i];
+
+               if (!ring || !ring->sched.thread)
+                       continue;
+
+               spin_lock(&ring->sched.job_list_lock);
+               job = list_first_entry_or_null(&ring->sched.ring_mirror_list,
+                               struct drm_sched_job, node);
+               spin_unlock(&ring->sched.job_list_lock);
+               if (job)
+                       return true;
+       }
+       return false;
+}
+
 /**
  * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
  *
index 5fd67e1cc2a0465ef469434d29e2585ffcd042f0..475ff5df8c873454f8a286ded577d220adea5f5e 100644 (file)
@@ -268,7 +268,7 @@ flr_done:
 
        /* Trigger recovery for world switch failure if no TDR */
        if (amdgpu_device_should_recover_gpu(adev)
-               && adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT)
+               && (!amdgpu_device_has_job_running(adev) || adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT))
                amdgpu_device_gpu_recover(adev, NULL);
 }
 
index ce2bf1fb79ed12a2c7a11cb30eb5c8c9f49eebe0..9cf695c05db390dc2ebc0ea0e81558563f6f9e50 100644 (file)
@@ -289,7 +289,8 @@ flr_done:
 
        /* Trigger recovery for world switch failure if no TDR */
        if (amdgpu_device_should_recover_gpu(adev)
-               && (adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT ||
+               && (!amdgpu_device_has_job_running(adev) ||
+               adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT ||
                adev->gfx_timeout == MAX_SCHEDULE_TIMEOUT ||
                adev->compute_timeout == MAX_SCHEDULE_TIMEOUT ||
                adev->video_timeout == MAX_SCHEDULE_TIMEOUT))