This avoids triggering a GPU reset or otherwise changing the HW
state. Instead KFD will hang, which allows HW debugging tools to
analyze the problem.
Signed-off-by: Yong Zhao <yong.zhao@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
Acked-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
while (*fence_addr != fence_value) {
if (time_after(jiffies, end_jiffies)) {
pr_err("qcm fence wait loop timeout expired\n");
+ /* In HWS case, this is used to halt the driver thread
+ * in order not to mess up CP states before doing
+ * scandumps for FW debugging.
+ */
+ while (halt_if_hws_hang)
+ schedule();
+
return -ETIME;
}
schedule();
static int amdkfd_init_completed;
+int halt_if_hws_hang;
+module_param(halt_if_hws_hang, int, 0644);
+MODULE_PARM_DESC(halt_if_hws_hang, "Halt if HWS hang is detected (0 = off (default), 1 = on)");
+
int kgd2kfd_init(unsigned int interface_version,
const struct kgd2kfd_calls **g2f)
{
*/
extern int vega10_noretry;
+/*
+ * Halt if HWS hang is detected
+ */
+extern int halt_if_hws_hang;
+
/**
* enum kfd_sched_policy
*