drm/amdgpu: Fix crash on device remove/driver unload
authorAndrey Grodzovsky <andrey.grodzovsky@amd.com>
Wed, 15 Sep 2021 19:27:18 +0000 (15:27 -0400)
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Thu, 18 Nov 2021 18:16:24 +0000 (19:16 +0100)
[ Upstream commit d82e2c249c8ffaec20fa618611ea2ab4dcfd4d01 ]

Crash:
BUG: unable to handle page fault for address: 00000000000010e1
RIP: 0010:vega10_power_gate_vce+0x26/0x50 [amdgpu]
Call Trace:
pp_set_powergating_by_smu+0x16a/0x2b0 [amdgpu]
amdgpu_dpm_set_powergating_by_smu+0x92/0xf0 [amdgpu]
amdgpu_dpm_enable_vce+0x2e/0xc0 [amdgpu]
vce_v4_0_hw_fini+0x95/0xa0 [amdgpu]
amdgpu_device_fini_hw+0x232/0x30d [amdgpu]
amdgpu_driver_unload_kms+0x5c/0x80 [amdgpu]
amdgpu_pci_remove+0x27/0x40 [amdgpu]
pci_device_remove+0x3e/0xb0
device_release_driver_internal+0x103/0x1d0
device_release_driver+0x12/0x20
pci_stop_bus_device+0x79/0xa0
pci_stop_and_remove_bus_device_locked+0x1b/0x30
remove_store+0x7b/0x90
dev_attr_store+0x17/0x30
sysfs_kf_write+0x4b/0x60
kernfs_fop_write_iter+0x151/0x1e0

Why:
VCE/UVD had dependency on SMC block for their suspend but
SMC block is the first to do HW fini due to some constraints

How:
Since the original patch was dealing with suspend issues,
move the SMC block dependency back into the suspend hooks as
was done in V1 of the original patches.
Keep flushing idle work in both the suspend and HW fini sequences
since it's essential in both cases.

Fixes: 859e4659273f1d ("drm/amdgpu: add missing cleanups for more ASICs on UVD/VCE suspend")
Fixes: bf756fb833cbe8 ("drm/amdgpu: add missing cleanups for Polaris12 UVD/VCE on suspend")
Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c
drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c
drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c
drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c
drivers/gpu/drm/amd/amdgpu/vce_v2_0.c
drivers/gpu/drm/amd/amdgpu/vce_v3_0.c
drivers/gpu/drm/amd/amdgpu/vce_v4_0.c

index 7232241e3bfb26b137c6111e936794e0dbf84a1f..0fef925b66024a7c705bdf251121a14d86546225 100644 (file)
@@ -698,6 +698,19 @@ static int uvd_v3_1_hw_fini(void *handle)
 {
        struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
+       cancel_delayed_work_sync(&adev->uvd.idle_work);
+
+       if (RREG32(mmUVD_STATUS) != 0)
+               uvd_v3_1_stop(adev);
+
+       return 0;
+}
+
+static int uvd_v3_1_suspend(void *handle)
+{
+       int r;
+       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
+
        /*
         * Proper cleanups before halting the HW engine:
         *   - cancel the delayed idle work
@@ -722,17 +735,6 @@ static int uvd_v3_1_hw_fini(void *handle)
                                                       AMD_CG_STATE_GATE);
        }
 
-       if (RREG32(mmUVD_STATUS) != 0)
-               uvd_v3_1_stop(adev);
-
-       return 0;
-}
-
-static int uvd_v3_1_suspend(void *handle)
-{
-       int r;
-       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
-
        r = uvd_v3_1_hw_fini(adev);
        if (r)
                return r;
index 52d6de969f4626b5169c0159bb413d9ad84a5edd..c108b83817951b7e3b5be55cec9939b72eb3e636 100644 (file)
@@ -212,6 +212,19 @@ static int uvd_v4_2_hw_fini(void *handle)
 {
        struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
+       cancel_delayed_work_sync(&adev->uvd.idle_work);
+
+       if (RREG32(mmUVD_STATUS) != 0)
+               uvd_v4_2_stop(adev);
+
+       return 0;
+}
+
+static int uvd_v4_2_suspend(void *handle)
+{
+       int r;
+       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
+
        /*
         * Proper cleanups before halting the HW engine:
         *   - cancel the delayed idle work
@@ -236,17 +249,6 @@ static int uvd_v4_2_hw_fini(void *handle)
                                                       AMD_CG_STATE_GATE);
        }
 
-       if (RREG32(mmUVD_STATUS) != 0)
-               uvd_v4_2_stop(adev);
-
-       return 0;
-}
-
-static int uvd_v4_2_suspend(void *handle)
-{
-       int r;
-       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
-
        r = uvd_v4_2_hw_fini(adev);
        if (r)
                return r;
index db6d06758e4d473b203f0a7c08b3bba310d47ecd..563493d1f8306b41b16b0b86a20aab88a1723f30 100644 (file)
@@ -210,6 +210,19 @@ static int uvd_v5_0_hw_fini(void *handle)
 {
        struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
+       cancel_delayed_work_sync(&adev->uvd.idle_work);
+
+       if (RREG32(mmUVD_STATUS) != 0)
+               uvd_v5_0_stop(adev);
+
+       return 0;
+}
+
+static int uvd_v5_0_suspend(void *handle)
+{
+       int r;
+       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
+
        /*
         * Proper cleanups before halting the HW engine:
         *   - cancel the delayed idle work
@@ -234,17 +247,6 @@ static int uvd_v5_0_hw_fini(void *handle)
                                                       AMD_CG_STATE_GATE);
        }
 
-       if (RREG32(mmUVD_STATUS) != 0)
-               uvd_v5_0_stop(adev);
-
-       return 0;
-}
-
-static int uvd_v5_0_suspend(void *handle)
-{
-       int r;
-       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
-
        r = uvd_v5_0_hw_fini(adev);
        if (r)
                return r;
index b6e82d75561f697dee697b6d793e9aaffe261cd4..1fd9ca21a091bac07ff851386c83e24795f83757 100644 (file)
@@ -606,6 +606,23 @@ static int uvd_v7_0_hw_fini(void *handle)
 {
        struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
+       cancel_delayed_work_sync(&adev->uvd.idle_work);
+
+       if (!amdgpu_sriov_vf(adev))
+               uvd_v7_0_stop(adev);
+       else {
+               /* full access mode, so don't touch any UVD register */
+               DRM_DEBUG("For SRIOV client, shouldn't do anything.\n");
+       }
+
+       return 0;
+}
+
+static int uvd_v7_0_suspend(void *handle)
+{
+       int r;
+       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
+
        /*
         * Proper cleanups before halting the HW engine:
         *   - cancel the delayed idle work
@@ -630,21 +647,6 @@ static int uvd_v7_0_hw_fini(void *handle)
                                                       AMD_CG_STATE_GATE);
        }
 
-       if (!amdgpu_sriov_vf(adev))
-               uvd_v7_0_stop(adev);
-       else {
-               /* full access mode, so don't touch any UVD register */
-               DRM_DEBUG("For SRIOV client, shouldn't do anything.\n");
-       }
-
-       return 0;
-}
-
-static int uvd_v7_0_suspend(void *handle)
-{
-       int r;
-       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
-
        r = uvd_v7_0_hw_fini(adev);
        if (r)
                return r;
index b70c17f0c52e8e0b2def09dc3455a3c9b8c8b789..98952fd387e73dbcda19a632e295368549bcd44b 100644 (file)
@@ -479,6 +479,17 @@ static int vce_v2_0_hw_fini(void *handle)
 {
        struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
+       cancel_delayed_work_sync(&adev->vce.idle_work);
+
+       return 0;
+}
+
+static int vce_v2_0_suspend(void *handle)
+{
+       int r;
+       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
+
+
        /*
         * Proper cleanups before halting the HW engine:
         *   - cancel the delayed idle work
@@ -502,14 +513,6 @@ static int vce_v2_0_hw_fini(void *handle)
                                                       AMD_CG_STATE_GATE);
        }
 
-       return 0;
-}
-
-static int vce_v2_0_suspend(void *handle)
-{
-       int r;
-       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
-
        r = vce_v2_0_hw_fini(adev);
        if (r)
                return r;
index 9de66893ccd6dca4b629084da6b409fc898792e4..8fb5df7181e091c7385f8637b018f3f461c3c6af 100644 (file)
@@ -490,6 +490,21 @@ static int vce_v3_0_hw_fini(void *handle)
        int r;
        struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
+       cancel_delayed_work_sync(&adev->vce.idle_work);
+
+       r = vce_v3_0_wait_for_idle(handle);
+       if (r)
+               return r;
+
+       vce_v3_0_stop(adev);
+       return vce_v3_0_set_clockgating_state(adev, AMD_CG_STATE_GATE);
+}
+
+static int vce_v3_0_suspend(void *handle)
+{
+       int r;
+       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
+
        /*
         * Proper cleanups before halting the HW engine:
         *   - cancel the delayed idle work
@@ -513,19 +528,6 @@ static int vce_v3_0_hw_fini(void *handle)
                                                       AMD_CG_STATE_GATE);
        }
 
-       r = vce_v3_0_wait_for_idle(handle);
-       if (r)
-               return r;
-
-       vce_v3_0_stop(adev);
-       return vce_v3_0_set_clockgating_state(adev, AMD_CG_STATE_GATE);
-}
-
-static int vce_v3_0_suspend(void *handle)
-{
-       int r;
-       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
-
        r = vce_v3_0_hw_fini(adev);
        if (r)
                return r;
index fec902b800c28b79e7e7057f1ab0187c2738dba1..70b8c88d30513d016806ad73024d702822af026c 100644 (file)
@@ -542,29 +542,8 @@ static int vce_v4_0_hw_fini(void *handle)
 {
        struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
-       /*
-        * Proper cleanups before halting the HW engine:
-        *   - cancel the delayed idle work
-        *   - enable powergating
-        *   - enable clockgating
-        *   - disable dpm
-        *
-        * TODO: to align with the VCN implementation, move the
-        * jobs for clockgating/powergating/dpm setting to
-        * ->set_powergating_state().
-        */
        cancel_delayed_work_sync(&adev->vce.idle_work);
 
-       if (adev->pm.dpm_enabled) {
-               amdgpu_dpm_enable_vce(adev, false);
-       } else {
-               amdgpu_asic_set_vce_clocks(adev, 0, 0);
-               amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VCE,
-                                                      AMD_PG_STATE_GATE);
-               amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_VCE,
-                                                      AMD_CG_STATE_GATE);
-       }
-
        if (!amdgpu_sriov_vf(adev)) {
                /* vce_v4_0_wait_for_idle(handle); */
                vce_v4_0_stop(adev);
@@ -594,6 +573,29 @@ static int vce_v4_0_suspend(void *handle)
                drm_dev_exit(idx);
        }
 
+       /*
+        * Proper cleanups before halting the HW engine:
+        *   - cancel the delayed idle work
+        *   - enable powergating
+        *   - enable clockgating
+        *   - disable dpm
+        *
+        * TODO: to align with the VCN implementation, move the
+        * jobs for clockgating/powergating/dpm setting to
+        * ->set_powergating_state().
+        */
+       cancel_delayed_work_sync(&adev->vce.idle_work);
+
+       if (adev->pm.dpm_enabled) {
+               amdgpu_dpm_enable_vce(adev, false);
+       } else {
+               amdgpu_asic_set_vce_clocks(adev, 0, 0);
+               amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VCE,
+                                                      AMD_PG_STATE_GATE);
+               amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_VCE,
+                                                      AMD_CG_STATE_GATE);
+       }
+
        r = vce_v4_0_hw_fini(adev);
        if (r)
                return r;