return 0;
 }
 
-static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
+int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 {
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        struct ras_err_handler_data **data = &con->eh_data;
        int ret;
 
-       *data = kmalloc(sizeof(**data),
-                       GFP_KERNEL|__GFP_ZERO);
-       if (!*data)
-               return -ENOMEM;
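+       /* eh_data tracks bad pages (bps) and the vram BOs reserved
+        * for them (bps_bo)
+        */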
+       *data = kmalloc(sizeof(**data), GFP_KERNEL | __GFP_ZERO);
+       if (!*data) {
+               ret = -ENOMEM;
+               goto out;
+       }
 
        mutex_init(&con->recovery_lock);
        INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
 
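+       /* the eeprom persists bad page records across reboots */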
        ret = amdgpu_ras_eeprom_init(&adev->psp.ras.ras->eeprom_control);
        if (ret)
-               return ret;
+               goto free;
 
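+       /* the eeprom already holds bad page records: load them and
+        * reserve those pages in vram
+        */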
        if (adev->psp.ras.ras->eeprom_control.num_recs) {
                ret = amdgpu_ras_load_bad_pages(adev);
                if (ret)
-                       return ret;
+                       goto free;
                ret = amdgpu_ras_reserve_bad_pages(adev);
                if (ret)
-                       return ret;
+                       goto release;
        }
 
        return 0;
+
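+       /* unwind in reverse order of setup; *data aliases con->eh_data,
+        * so clear the pointer only after the frees below
+        */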
+release:
+       amdgpu_ras_release_bad_pages(adev);
+free:
+       kfree((*data)->bps);
+       kfree((*data)->bps_bo);
+       kfree(*data);
+       con->eh_data = NULL;
+out:
+       DRM_WARN("Failed to initialize ras recovery!\n");
+
+       return ret;
 }
 
 static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
 {
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        struct ras_err_handler_data *data = con->eh_data;
 
+       /* recovery_init failed to init it, so fini has nothing to do */
+       if (!data)
+               return 0;
+
        cancel_work_sync(&con->recovery_work);
        amdgpu_ras_release_bad_pages(adev);
 
        mutex_lock(&con->recovery_lock);
        con->eh_data = NULL;
        kfree(data->bps);
+       kfree(data->bps_bo);
        kfree(data);
        mutex_unlock(&con->recovery_lock);
 
                        return r;
        }
 
-       if (amdgpu_ras_recovery_init(adev))
-               goto recovery_out;
-
        amdgpu_ras_mask &= AMDGPU_RAS_BLOCK_MASK;
 
        if (amdgpu_ras_fs_init(adev))
                        con->hw_supported, con->supported);
        return 0;
 fs_out:
-       amdgpu_ras_recovery_fini(adev);
-recovery_out:
        amdgpu_ras_set_context(adev, NULL);
        kfree(con);
 
 
        return ras && (ras->supported & (1 << block));
 }
 
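+/* load retired pages from the eeprom and reserve them in vram,
+ * called at the ttm init stage
+ */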
+int amdgpu_ras_recovery_init(struct amdgpu_device *adev);
 int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
                unsigned int block);
 
 {
        struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
 
+       /* save bad pages to eeprom before gpu reset,
+        * i2c may be unstable during gpu reset
+        */
+       amdgpu_ras_reserve_bad_pages(adev);
        if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)
                schedule_work(&ras->recovery_work);
        return 0;
 
 #include "amdgpu_trace.h"
 #include "amdgpu_amdkfd.h"
 #include "amdgpu_sdma.h"
+#include "amdgpu_ras.h"
 #include "bif/bif_4_1_d.h"
 
 static int amdgpu_map_buffer(struct ttm_buffer_object *bo,
                                                adev->gmc.visible_vram_size);
 #endif
 
+       /*
+        * Retired pages are loaded from eeprom and reserved here; this must
+        * be called after ttm init since new BOs may be created.
+        * recovery_init may fail, but it frees all resources it allocated
+        * itself, so its failure should not stop the amdgpu init process.
+        *
+        * Note: theoretically, this should be called before all vram
+        * allocations to protect retired pages from abuse.
+        */
+       amdgpu_ras_recovery_init(adev);
+
        /*
         *The reserved vram for firmware must be pinned to the specified
         *place on the VRAM, so reserve it early.