drm/amdkfd: Introduce KFD module parameter halt_if_hws_hang
author Yong Zhao <yong.zhao@amd.com>
Thu, 12 Jul 2018 02:33:05 +0000 (22:33 -0400)
committer Oded Gabbay <oded.gabbay@gmail.com>
Thu, 12 Jul 2018 02:33:05 +0000 (22:33 -0400)
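When halt_if_hws_hang is set and the HWS fence wait times out, this
avoids triggering a GPU reset or otherwise changing the HW state.
Instead, the KFD driver thread halts in place (it keeps yielding in a
loop rather than returning the error), which allows HW debugging tools
to analyze the problem.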

Signed-off-by: Yong Zhao <yong.zhao@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
Acked-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
drivers/gpu/drm/amd/amdkfd/kfd_module.c
drivers/gpu/drm/amd/amdkfd/kfd_priv.h

index 32e93b53e7e8706fe4994fbed0347bad99992eb3..5d05d125c3c1444ccfd99a07078570330694ede5 100644 (file)
@@ -1217,6 +1217,13 @@ int amdkfd_fence_wait_timeout(unsigned int *fence_addr,
        while (*fence_addr != fence_value) {
                if (time_after(jiffies, end_jiffies)) {
                        pr_err("qcm fence wait loop timeout expired\n");
+                       /* In HWS case, this is used to halt the driver thread
+                        * in order not to mess up CP states before doing
+                        * scandumps for FW debugging.
+                        */
+                       while (halt_if_hws_hang)
+                               schedule();
+
                        return -ETIME;
                }
                schedule();
index ee7bf07db4723d4f84f39b085cfaf844c3118325..3a8c15ad0c64d3c1f0ad94033aa9b08cdac0bef0 100644 (file)
@@ -92,6 +92,10 @@ MODULE_PARM_DESC(noretry,
 
 static int amdkfd_init_completed;
 
+int halt_if_hws_hang;
+module_param(halt_if_hws_hang, int, 0644);
+MODULE_PARM_DESC(halt_if_hws_hang, "Halt if HWS hang is detected (0 = off (default), 1 = on)");
+
 int kgd2kfd_init(unsigned int interface_version,
                const struct kgd2kfd_calls **g2f)
 {
index d9bf70b52857008165b1d1731fcb69badfb6cc2a..8473e7b3dcc234f9ea3ddfbb71d59efcd1394297 100644 (file)
@@ -144,6 +144,11 @@ extern int ignore_crat;
  */
 extern int vega10_noretry;
 
+/*
+ * Halt if HWS hang is detected
+ */
+extern int halt_if_hws_hang;
+
 /**
  * enum kfd_sched_policy
  *