From 56b53c0b5aa5de49747351b2ad323fd36089eb52 Mon Sep 17 00:00:00 2001 From: Dennis Li Date: Wed, 10 Mar 2021 17:20:45 +0800 Subject: [PATCH] drm/amdgpu: add codes to capture invalid hardware access when recovery MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit When recovery thread has begun GPU reset, there should be not other threads to access hardware, otherwise system randomly hang. v2 (chk): rewritten from scratch, use trylock and lockdep instead of hand wiring the logic. v3: add in_irq check v4: change to check in_task Signed-off-by: Dennis Li Signed-off-by: Christian König Reviewed-by: Christian König Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 + drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 47 +++++++++++++++++----- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 4 +- 3 files changed, 42 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index a037c223c251b..4c2fabe93d59e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1390,6 +1390,8 @@ void amdgpu_pci_resume(struct pci_dev *pdev); bool amdgpu_device_cache_pci_state(struct pci_dev *pdev); bool amdgpu_device_load_pci_state(struct pci_dev *pdev); +bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev); + #include "amdgpu_object.h" static inline bool amdgpu_is_tmz(struct amdgpu_device *adev) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 0f82c5d212372..2d080622eb23e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -326,6 +326,35 @@ void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos, /* * register access helper functions. */ + +/* Check if hw access should be skipped because of hotplug or device error */ +bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev) +{ + if (adev->in_pci_err_recovery) + return true; + +#ifdef CONFIG_LOCKDEP + /* + * This is a bit complicated to understand, so worth a comment. What we assert + * here is that the GPU reset is not running on another thread in parallel. + * + * For this we trylock the read side of the reset semaphore, if that succeeds + * we know that the reset is not running in paralell. + * + * If the trylock fails we assert that we are either already holding the read + * side of the lock or are the reset thread itself and hold the write side of + * the lock. + */ + if (in_task()) { + if (down_read_trylock(&adev->reset_sem)) + up_read(&adev->reset_sem); + else + lockdep_assert_held(&adev->reset_sem); + } +#endif + return false; +} + /** * amdgpu_device_rreg - read a memory mapped IO or indirect register * @@ -340,7 +369,7 @@ uint32_t amdgpu_device_rreg(struct amdgpu_device *adev, { uint32_t ret; - if (adev->in_pci_err_recovery) + if (amdgpu_device_skip_hw_access(adev)) return 0; if ((reg * 4) < adev->rmmio_size) { @@ -377,7 +406,7 @@ uint32_t amdgpu_device_rreg(struct amdgpu_device *adev, */ uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) { - if (adev->in_pci_err_recovery) + if (amdgpu_device_skip_hw_access(adev)) return 0; if (offset < adev->rmmio_size) @@ -402,7 +431,7 @@ uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) */ void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) { - if (adev->in_pci_err_recovery) + if (amdgpu_device_skip_hw_access(adev)) return; if (offset < adev->rmmio_size) @@ -425,7 +454,7 @@ void amdgpu_device_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v, uint32_t acc_flags) { - if (adev->in_pci_err_recovery) + if (amdgpu_device_skip_hw_access(adev)) return; if ((reg * 4) < adev->rmmio_size) { @@ -452,7 +481,7 @@ void amdgpu_device_wreg(struct amdgpu_device *adev, void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, uint32_t reg, uint32_t v) { - if (adev->in_pci_err_recovery) + if (amdgpu_device_skip_hw_access(adev)) return; if (amdgpu_sriov_fullaccess(adev) && @@ -476,7 +505,7 @@ void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, */ u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index) { - if (adev->in_pci_err_recovery) + if (amdgpu_device_skip_hw_access(adev)) return 0; if (index < adev->doorbell.num_doorbells) { @@ -499,7 +528,7 @@ u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index) */ void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v) { - if (adev->in_pci_err_recovery) + if (amdgpu_device_skip_hw_access(adev)) return; if (index < adev->doorbell.num_doorbells) { @@ -520,7 +549,7 @@ void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v) */ u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index) { - if (adev->in_pci_err_recovery) + if (amdgpu_device_skip_hw_access(adev)) return 0; if (index < adev->doorbell.num_doorbells) { @@ -543,7 +572,7 @@ u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index) */ void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v) { - if (adev->in_pci_err_recovery) + if (amdgpu_device_skip_hw_access(adev)) return; if (index < adev->doorbell.num_doorbells) { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index 689addb1520d2..f63f66a0c63f8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -705,7 +705,7 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg) struct amdgpu_kiq *kiq = &adev->gfx.kiq; struct amdgpu_ring *ring = &kiq->ring; - if (adev->in_pci_err_recovery) + if (amdgpu_device_skip_hw_access(adev)) return 0; BUG_ON(!ring->funcs->emit_rreg); @@ -772,7 +772,7 @@ void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) BUG_ON(!ring->funcs->emit_wreg); - if (adev->in_pci_err_recovery) + if (amdgpu_device_skip_hw_access(adev)) return; spin_lock_irqsave(&kiq->ring_lock, flags); -- 2.30.2