drm/amdgpu: add UTCL2 RAS poison query for Aldebaran (v2)
author Tao Zhou <tao.zhou1@amd.com>
Tue, 15 Mar 2022 09:48:18 +0000 (17:48 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Fri, 25 Mar 2022 16:40:26 +0000 (12:40 -0400)
Add helper functions to query and reset the RAS UTCL2 poison status.

v2: implement it on the amdgpu side; kfd only calls it.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c

index 6ca1db3c243f9c5291bcbc55fb78106eff0e51d5..c18c4be1e4acd3cab75e1c0cb8e302819ba5000d 100644 (file)
@@ -724,3 +724,11 @@ void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, bo
        else if (reset)
                amdgpu_amdkfd_gpu_reset(adev);
 }
+
+/* Query UTCL2 poison via the GFX RAS hook; false if gfx.ras or the hook is NULL. */
+bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev)
+{
+       if (adev->gfx.ras && adev->gfx.ras->query_utcl2_poison_status)
+               return adev->gfx.ras->query_utcl2_poison_status(adev);
+       return false;
+}
index 4cb14c2fe53fbe3fda50094f1c668279c232ec72..0838926a8ef06cd8fa23ebf02464f26444732ad2 100644 (file)
@@ -301,6 +301,7 @@ void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
 bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem *mem);
 void amdgpu_amdkfd_block_mmu_notifications(void *p);
 int amdgpu_amdkfd_criu_resume(void *p);
+bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev);
 
 #if IS_ENABLED(CONFIG_HSA_AMD)
 void amdgpu_amdkfd_gpuvm_init_mem_limits(void);
index dcb3c7871c73472e77a0711a36612cfb5c6d6f48..5ed9b8a4c571d49f6a98ef61f761fbcc57cf3e5d 100644 (file)
@@ -202,6 +202,7 @@ struct amdgpu_cu_info {
 struct amdgpu_gfx_ras {
        struct amdgpu_ras_block_object  ras_block;
        void (*enable_watchdog_timer)(struct amdgpu_device *adev);
+       bool (*query_utcl2_poison_status)(struct amdgpu_device *adev); /* optional; NULL-checked before use */
 };
 
 struct amdgpu_gfx_funcs {
index 7653ebd0e67bd8513e167f18e95cfe11992bc428..3a797424579c5955ffe0f701cbec2344d7f2ef23 100644 (file)
@@ -1930,6 +1930,19 @@ static void gfx_v9_4_2_reset_sq_timeout_status(struct amdgpu_device *adev)
        mutex_unlock(&adev->grbm_idx_mutex);
 }
 
+/* Sample the GFXHUB_0 L2 protection-fault status, reset it, and report its FED field. */
+static bool gfx_v9_4_2_query_uctl2_poison_status(struct amdgpu_device *adev)
+{
+       struct amdgpu_vmhub *gfxhub = &adev->vmhub[AMDGPU_GFXHUB_0];
+       u32 fault_status;
+
+       /* the status is sampled before the reset write below */
+       fault_status = RREG32(gfxhub->vm_l2_pro_fault_status);
+       /* reset page fault status */
+       WREG32_P(gfxhub->vm_l2_pro_fault_cntl, 1, ~1);
+       return REG_GET_FIELD(fault_status, VM_L2_PROTECTION_FAULT_STATUS, FED);
+}
+
 struct amdgpu_ras_block_hw_ops  gfx_v9_4_2_ras_ops = {
                .ras_error_inject = &gfx_v9_4_2_ras_error_inject,
                .query_ras_error_count = &gfx_v9_4_2_query_ras_error_count,
@@ -1943,4 +1956,5 @@ struct amdgpu_gfx_ras gfx_v9_4_2_ras = {
                .hw_ops = &gfx_v9_4_2_ras_ops,
        },
        .enable_watchdog_timer = &gfx_v9_4_2_enable_watchdog_timer,
+       .query_utcl2_poison_status = gfx_v9_4_2_query_uctl2_poison_status,
 };