drm/amdgpu: Fix ecc irq enable/disable unpaired
author Stanley.Yang <Stanley.Yang@amd.com>
Fri, 15 Dec 2023 08:13:23 +0000 (16:13 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Wed, 3 Jan 2024 15:30:49 +0000 (10:30 -0500)
The ecc_irq is disabled during the GPU mode2 reset suspend process,
but is not re-enabled during the GPU mode2 reset resume process.

Changed from V1:
only do sdma/gfx ras_late_init in aldebaran_mode2_restore_ip
delete amdgpu_ras_late_resume function

Changed from V2:
check umc ras supported before put ecc_irq

Signed-off-by: Stanley.Yang <Stanley.Yang@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/aldebaran.c
drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c

index 02f4c6f9d4f6876e3edc0fa7de1f38a8a969986c..576067d66bb9af69fd9c3f3c80994f7aa00a319e 100644 (file)
@@ -330,6 +330,7 @@ aldebaran_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl,
 {
        struct list_head *reset_device_list = reset_context->reset_device_list;
        struct amdgpu_device *tmp_adev = NULL;
+       struct amdgpu_ras *con;
        int r;
 
        if (reset_device_list == NULL)
@@ -355,7 +356,30 @@ aldebaran_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl,
                 */
                amdgpu_register_gpu_instance(tmp_adev);
 
-               /* Resume RAS */
+               /* Resume RAS, ecc_irq */
+               con = amdgpu_ras_get_context(tmp_adev);
+               if (!amdgpu_sriov_vf(tmp_adev) && con) {
+                       if (tmp_adev->sdma.ras &&
+                               tmp_adev->sdma.ras->ras_block.ras_late_init) {
+                               r = tmp_adev->sdma.ras->ras_block.ras_late_init(tmp_adev,
+                                               &tmp_adev->sdma.ras->ras_block.ras_comm);
+                               if (r) {
+                                       dev_err(tmp_adev->dev, "SDMA failed to execute ras_late_init! ret:%d\n", r);
+                                       goto end;
+                               }
+                       }
+
+                       if (tmp_adev->gfx.ras &&
+                               tmp_adev->gfx.ras->ras_block.ras_late_init) {
+                               r = tmp_adev->gfx.ras->ras_block.ras_late_init(tmp_adev,
+                                               &tmp_adev->gfx.ras->ras_block.ras_comm);
+                               if (r) {
+                                       dev_err(tmp_adev->dev, "GFX failed to execute ras_late_init! ret:%d\n", r);
+                                       goto end;
+                               }
+                       }
+               }
+
                amdgpu_ras_resume(tmp_adev);
 
                /* Update PSP FW topology after reset */
index a5a05c16c10d7be2ea1b86fbdcf76699551a8fd8..6c51856088546faed3c2e3d9376f8c23d54ba554 100644 (file)
@@ -1041,6 +1041,10 @@ static int gmc_v10_0_hw_fini(void *handle)
 
        amdgpu_irq_put(adev, &adev->gmc.vm_fault, 0);
 
+       if (adev->gmc.ecc_irq.funcs &&
+               amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC))
+               amdgpu_irq_put(adev, &adev->gmc.ecc_irq, 0);
+
        return 0;
 }
 
index 23d7b548d13f446766c0adc6f051f9b492111efb..c9c653cfc765b8b88e5ab1f77cefcbbce38ff79c 100644 (file)
@@ -941,6 +941,11 @@ static int gmc_v11_0_hw_fini(void *handle)
        }
 
        amdgpu_irq_put(adev, &adev->gmc.vm_fault, 0);
+
+       if (adev->gmc.ecc_irq.funcs &&
+               amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC))
+               amdgpu_irq_put(adev, &adev->gmc.ecc_irq, 0);
+
        gmc_v11_0_gart_disable(adev);
 
        return 0;
index 473a774294cee76356717f7eb5e3ddaabbf76c11..f9039d64ff2d72804556daa16b8ed9632b08b307 100644 (file)
@@ -2380,6 +2380,10 @@ static int gmc_v9_0_hw_fini(void *handle)
 
        amdgpu_irq_put(adev, &adev->gmc.vm_fault, 0);
 
+       if (adev->gmc.ecc_irq.funcs &&
+               amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC))
+               amdgpu_irq_put(adev, &adev->gmc.ecc_irq, 0);
+
        return 0;
 }