drm/amdgpu: block ring buffer access during GPU recovery
author     Dennis Li <Dennis.Li@amd.com>
           Tue, 1 Sep 2020 01:03:53 +0000 (09:03 +0800)
committer  Alex Deucher <alexander.deucher@amd.com>
           Thu, 3 Sep 2020 18:46:55 +0000 (14:46 -0400)
When the GPU is in reset, its state is not stable and the ring buffers
also need to be reset on resume. The driver therefore has to protect the
GPU recovery thread from concurrent ring buffer access by other threads;
otherwise the GPU will randomly hang during recovery. (The locking
pattern is sketched after the list of changed files below.)

v2: correct indent

Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Dennis Li <Dennis.Li@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
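
A minimal sketch of the reader-side pattern the hunks below apply, assuming
the recovery thread holds adev->reset_sem for write while the reset is in
progress (that write-side change is not part of this diff); the function name
example_kiq_rreg is hypothetical and mirrors the amdgpu_mm_rreg hunk:

	static uint32_t example_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
	{
		uint32_t ret;

		/*
		 * Take the KIQ path only when no reset is in flight; the
		 * trylock fails while recovery holds reset_sem for write.
		 */
		if (down_read_trylock(&adev->reset_sem)) {
			ret = amdgpu_kiq_rreg(adev, reg);
			up_read(&adev->reset_sem);
			return ret;
		}

		/* Fall back to direct MMIO while the GPU is being reset. */
		return readl(((void __iomem *)adev->rmmio) + (reg * 4));
	}

Using down_read_trylock() rather than down_read() keeps register access
non-blocking: threads that race with recovery simply skip the KIQ and touch
MMIO directly instead of sleeping on the semaphore.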

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 6518e444beada2b4df1fa5e9ac13c77b2f8f0bc1..c4900471beb00ce9c927f0d54e6dc52e4eed7cbf 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -319,8 +319,12 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
 {
        uint32_t ret;
 
-       if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
-               return amdgpu_kiq_rreg(adev, reg);
+       if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev) &&
+           down_read_trylock(&adev->reset_sem)) {
+               ret = amdgpu_kiq_rreg(adev, reg);
+               up_read(&adev->reset_sem);
+               return ret;
+       }
 
        if ((reg * 4) < adev->rmmio_size)
                ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
@@ -332,6 +336,7 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
                ret = readl(((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
                spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
        }
+
        trace_amdgpu_mm_rreg(adev->pdev->device, reg, ret);
        return ret;
 }
@@ -409,8 +414,12 @@ static inline void amdgpu_mm_wreg_mmio(struct amdgpu_device *adev,
 void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
                    uint32_t acc_flags)
 {
-       if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
-               return amdgpu_kiq_wreg(adev, reg, v);
+       if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev) &&
+           down_read_trylock(&adev->reset_sem)) {
+               amdgpu_kiq_wreg(adev, reg, v);
+               up_read(&adev->reset_sem);
+               return;
+       }
 
        amdgpu_mm_wreg_mmio(adev, reg, v, acc_flags);
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index ad9ad622ccce721be8e47657f3c9c24bd6d2834b..31359e519d69d8889395dda8e8745cc69d6cbf31 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -287,8 +287,7 @@ static void gmc_v10_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
         */
        if (adev->gfx.kiq.ring.sched.ready &&
            (amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) &&
-           !amdgpu_in_reset(adev)) {
-
+           down_read_trylock(&adev->reset_sem)) {
                struct amdgpu_vmhub *hub = &adev->vmhub[vmhub];
                const unsigned eng = 17;
                u32 inv_req = hub->vmhub_funcs->get_invalidate_req(vmid, flush_type);
@@ -297,6 +296,8 @@ static void gmc_v10_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
 
                amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, inv_req,
                                1 << vmid);
+
+               up_read(&adev->reset_sem);
                return;
        }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 1ca79030e95eb8c701067ebb6ef4d6f58b75e5b8..93ee77b14cc90ed4fdecdad8d2ab676c1e6001e1 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -503,13 +503,14 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
         * as GFXOFF under bare metal
         */
        if (adev->gfx.kiq.ring.sched.ready &&
-                       (amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) &&
-                       !amdgpu_in_reset(adev)) {
+           (amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) &&
+           down_read_trylock(&adev->reset_sem)) {
                uint32_t req = hub->vm_inv_eng0_req + hub->eng_distance * eng;
                uint32_t ack = hub->vm_inv_eng0_ack + hub->eng_distance * eng;
 
                amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, inv_req,
                                                   1 << vmid);
+               up_read(&adev->reset_sem);
                return;
        }
 
@@ -602,7 +603,7 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
        if (amdgpu_in_reset(adev))
                return -EIO;
 
-       if (ring->sched.ready) {
+       if (ring->sched.ready && down_read_trylock(&adev->reset_sem)) {
                /* Vega20+XGMI caches PTEs in TC and TLB. Add a
                 * heavy-weight TLB flush (type 2), which flushes
                 * both. Due to a race condition with concurrent
@@ -629,6 +630,7 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
                if (r) {
                        amdgpu_ring_undo(ring);
                        spin_unlock(&adev->gfx.kiq.ring_lock);
+                       up_read(&adev->reset_sem);
                        return -ETIME;
                }
 
@@ -637,9 +639,10 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
                r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
                if (r < 1) {
                        dev_err(adev->dev, "wait for kiq fence error: %ld.\n", r);
+                       up_read(&adev->reset_sem);
                        return -ETIME;
                }
-
+               up_read(&adev->reset_sem);
                return 0;
        }
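
For context, the writer side that this pattern relies on is not part of this
diff; under the assumption that the recovery thread takes adev->reset_sem for
write around the hardware reset and ring re-initialization, it might look like
the hypothetical sketch below (example_gpu_recover is an illustrative name
only):

	static void example_gpu_recover(struct amdgpu_device *adev)
	{
		/* Exclusive hold: every down_read_trylock() above now fails. */
		down_write(&adev->reset_sem);

		/* ... hardware reset and ring buffer re-initialization ... */

		/* Rings are valid again; readers may use the KIQ path. */
		up_write(&adev->reset_sem);
	}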