drm/amdkfd: Add debugfs interface to trigger HWS hang
author Shaoyun Liu <Shaoyun.Liu@amd.com>
Thu, 12 Jul 2018 02:33:04 +0000 (22:33 -0400)
committer Oded Gabbay <oded.gabbay@gmail.com>
Thu, 12 Jul 2018 02:33:04 +0000 (22:33 -0400)
Signed-off-by: Shaoyun Liu <Shaoyun.Liu@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
Acked-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c
drivers/gpu/drm/amd/amdkfd/kfd_device.c
drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
drivers/gpu/drm/amd/amdkfd/kfd_priv.h

index 4bd6ebfaf425bcc88947bf41cc6b2ad832bc84b9..ab37d36d9cd69f305b5c81bc5b94fb2b1628d497 100644 (file)
@@ -21,6 +21,8 @@
  */
 
 #include <linux/debugfs.h>
+#include <linux/uaccess.h>
+
 #include "kfd_priv.h"
 
 static struct dentry *debugfs_root;
@@ -32,6 +34,38 @@ static int kfd_debugfs_open(struct inode *inode, struct file *file)
        return single_open(file, show, NULL);
 }
 
+/* debugfs write handler for hang_hws: parse a decimal gpu id from user_buf
+ * and hang the HWS of that device. Returns size on success, -EFAULT on a
+ * failed user copy, -EINVAL on bad input/unknown id, else the hang error.
+ */
+static ssize_t kfd_debugfs_hang_hws_write(struct file *file,
+       const char __user *user_buf, size_t size, loff_t *ppos)
+{
+       struct kfd_dev *dev;
+       char tmp[16] = {0};
+       uint32_t gpu_id;
+       int ret;
+
+       /* size < sizeof(tmp) keeps a trailing NUL for the string parser */
+       if (size >= sizeof(tmp)) {
+               pr_err("Invalid input for gpu id.\n");
+               return -EINVAL;
+       }
+       if (copy_from_user(tmp, user_buf, size))
+               return -EFAULT;
+       if (kstrtouint(tmp, 10, &gpu_id)) {
+               pr_err("Invalid input for gpu id.\n");
+               return -EINVAL;
+       }
+       dev = kfd_device_by_id(gpu_id);
+       if (!dev) {
+               pr_err("Cannot find device %u.\n", gpu_id);
+               return -EINVAL;
+       }
+       ret = kfd_debugfs_hang_hws(dev);
+       return ret ? ret : (ssize_t)size;
+}
+
 static const struct file_operations kfd_debugfs_fops = {
        .owner = THIS_MODULE,
        .open = kfd_debugfs_open,
@@ -40,6 +74,15 @@ static const struct file_operations kfd_debugfs_fops = {
        .release = single_release,
 };
 
+/* Write-only trigger created with NULL data: use simple_open instead of the
+ * seq_file helpers, which presumably need a show callback (TODO confirm). */
+static const struct file_operations kfd_debugfs_hang_hws_fops = {
+       .owner = THIS_MODULE,
+       .open = simple_open,
+       .write = kfd_debugfs_hang_hws_write,
+       .llseek = default_llseek,
+};
+
 void kfd_debugfs_init(void)
 {
        struct dentry *ent;
@@ -65,6 +108,11 @@ void kfd_debugfs_init(void)
        ent = debugfs_create_file("rls", S_IFREG | 0444, debugfs_root,
                                  kfd_debugfs_rls_by_device,
                                  &kfd_debugfs_fops);
+
+       ent = debugfs_create_file("hang_hws", S_IFREG | 0644, debugfs_root,
+                                 NULL,
+                                 &kfd_debugfs_hang_hws_fops);
+
        if (!ent)
                pr_warn("Failed to create rls in kfd debugfs\n");
 }
index 9f63ac366284b30ee022f29761f75869528386a3..8faa8db3eba52febf8785a68d1ecef07336231be 100644 (file)
@@ -914,3 +914,26 @@ int kfd_gtt_sa_free(struct kfd_dev *kfd, struct kfd_mem_obj *mem_obj)
        kfree(mem_obj);
        return 0;
 }
+
+#if defined(CONFIG_DEBUG_FS)
+
+/* Submit a packet to the HIQ that hangs the HWS, which is expected to
+ * trigger a GPU reset and bring the HWS back to normal state.
+ * Returns 0 on success, -EINVAL if the device is not using the HWS
+ * scheduling policy, or the error reported by the pm/dqm helpers.
+ */
+int kfd_debugfs_hang_hws(struct kfd_dev *dev)
+{
+       int r;
+
+       if (dev->dqm->sched_policy != KFD_SCHED_POLICY_HWS) {
+               pr_err("HWS is not enabled\n");
+               return -EINVAL;
+       }
+       r = pm_debugfs_hang_hws(&dev->dqm->packets);
+       if (r)
+               return r;
+       return dqm_debugfs_execute_queues(dev->dqm);
+}
+
+#endif
index 6b59eab39fbe118b9870ed45f77f8b7aaa0a85c6..32e93b53e7e8706fe4994fbed0347bad99992eb3 100644 (file)
@@ -1801,4 +1801,16 @@ int dqm_debugfs_hqds(struct seq_file *m, void *data)
        return r;
 }
 
+/* Mark the runlist active and run execute_queues_cpsch() under the DQM lock. */
+int dqm_debugfs_execute_queues(struct device_queue_manager *dqm)
+{
+       int ret;
+
+       dqm_lock(dqm);
+       dqm->active_runlist = true;
+       ret = execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
+       dqm_unlock(dqm);
+       return ret;
+}
+
 #endif
index c317feb43f69d8ce76cd1ccae55e79f361db87f7..1092631765cb5b09ef198be0caa2007407db0c13 100644 (file)
@@ -418,4 +418,30 @@ out:
        return 0;
 }
 
+/* Submit a query-status-sized run of 0x55 bytes on the HIQ, under pm->lock,
+ * to hang the HWS. Returns 0, or -ENOMEM if no packet buffer was acquired.
+ */
+int pm_debugfs_hang_hws(struct packet_manager *pm)
+{
+       uint32_t size = pm->pmf->query_status_size;
+       uint32_t *buffer = NULL;
+
+       mutex_lock(&pm->lock);
+       pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue,
+                       size / sizeof(uint32_t), (unsigned int **)&buffer);
+       if (!buffer) {
+               pr_err("Failed to allocate buffer on kernel queue\n");
+               mutex_unlock(&pm->lock);
+               return -ENOMEM;
+       }
+       memset(buffer, 0x55, size);
+       pm->priv_queue->ops.submit_packet(pm->priv_queue);
+       pr_info("Submitting %x %x %x %x %x %x %x to HIQ to hang the HWS.\n",
+               buffer[0], buffer[1], buffer[2], buffer[3],
+               buffer[4], buffer[5], buffer[6]);
+       mutex_unlock(&pm->lock);
+       return 0;
+}
+
+
 #endif
index 2e03d6c80aa0d45dae7e2d698721a5c36c19cd8b..d9bf70b52857008165b1d1731fcb69badfb6cc2a 100644 (file)
@@ -995,6 +995,10 @@ int dqm_debugfs_hqds(struct seq_file *m, void *data);
 int kfd_debugfs_rls_by_device(struct seq_file *m, void *data);
 int pm_debugfs_runlist(struct seq_file *m, void *data);
 
+int kfd_debugfs_hang_hws(struct kfd_dev *dev);
+int pm_debugfs_hang_hws(struct packet_manager *pm);
+int dqm_debugfs_execute_queues(struct device_queue_manager *dqm);
+
 #else
 
 static inline void kfd_debugfs_init(void) {}