drm/xe: Introduce the dev_coredump infrastructure.

author Rodrigo Vivi <rodrigo.vivi@intel.com>

Thu, 18 May 2023 21:12:39 +0000 (17:12 -0400)

committer Rodrigo Vivi <rodrigo.vivi@intel.com>

Tue, 19 Dec 2023 23:33:51 +0000 (18:33 -0500)
author Rodrigo Vivi <rodrigo.vivi@intel.com>
Thu, 18 May 2023 21:12:39 +0000 (17:12 -0400)
committer Rodrigo Vivi <rodrigo.vivi@intel.com>
Tue, 19 Dec 2023 23:33:51 +0000 (18:33 -0500)
diff --git a/drivers/gpu/drm/xe/Kconfig b/drivers/gpu/drm/xe/Kconfig

index 62f54e6d62d93a8621b4989ea24064e3576936c5..0a4854a59c90fcf55acc87132655039ef03810a5 100644 (file)
--- a/drivers/gpu/drm/xe/Kconfig
+++ b/drivers/gpu/drm/xe/Kconfig
@@ -23,6 +23,7 @@ config DRM_XE
         select DRM_TTM_HELPER
         select DRM_SCHED
         select MMU_NOTIFIER
+       select WANT_DEV_COREDUMP
         help
           Experimental driver for Intel Xe series GPUs
  
diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile

index 71c604ecff531f4d4649230ea0d56ca00dbdf8b9..5d277d060eba91c7946ff939d4d0f61b1cb6a33b 100644 (file)
--- a/drivers/gpu/drm/xe/Makefile
+++ b/drivers/gpu/drm/xe/Makefile
@@ -37,6 +37,7 @@ xe-y += xe_bb.o \
         xe_bo.o \
         xe_bo_evict.o \
         xe_debugfs.o \
+       xe_devcoredump.o \
         xe_device.o \
         xe_dma_buf.o \
         xe_engine.o \
diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c

new file mode 100644 (file)

index 0000000..561db73
--- /dev/null
+++ b/drivers/gpu/drm/xe/xe_devcoredump.c
@@ -0,0 +1,126 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright © 2023 Intel Corporation
+ */
+
+#include "xe_devcoredump.h"
+#include "xe_devcoredump_types.h"
+
+#include <linux/devcoredump.h>
+#include <generated/utsrelease.h>
+
+#include "xe_engine.h"
+#include "xe_gt.h"
+
+/**
+ * DOC: Xe device coredump
+ *
+ * Devices overview:
+ * Xe uses dev_coredump infrastructure for exposing the crash errors in a
+ * standardized way.
+ * devcoredump exposes a temporary device under /sys/class/devcoredump/
+ * which is linked with our card device directly.
+ * The core dump can be accessed either from
+ * /sys/class/drm/card<n>/device/devcoredump/ or from
+ * /sys/class/devcoredump/devcd<m> where
+ * /sys/class/devcoredump/devcd<m>/failing_device is a link to
+ * /sys/class/drm/card<n>/device/.
+ *
+ * Snapshot at hang:
+ * The 'data' file is printed with a drm_printer pointer at devcoredump read
+ * time. For this reason, we need to take the snapshots at the time the hang
+ * happens, and not only when the user is reading the file. Otherwise the
+ * information would be outdated, since resets might have happened in between.
+ *
+ * 'First' failure snapshot:
+ * In general, the first hang is the most critical one since the following hangs
+ * can be a consequence of the initial hang. For this reason we only take the
+ * snapshot of the 'first' failure and ignore subsequent calls of this function,
+ * at least while the coredump device is alive. Dev_coredump has a delayed work
+ * queue that will eventually delete the device and free all the dump
+ * information.
+ */
+
+#ifdef CONFIG_DEV_COREDUMP
+
+/*
+ * Map a struct xe_devcoredump back to the struct xe_device that embeds it as
+ * its 'devcoredump' member.
+ */
+static struct xe_device *coredump_to_xe(const struct xe_devcoredump *coredump)
+{
+       return container_of(coredump, struct xe_device, devcoredump);
+}
+
+/*
+ * Read callback handed to dev_coredumpm() by xe_devcoredump().
+ *
+ * Prints the coredump header and the timestamps stored in the snapshot
+ * through a drm_coredump_printer() set up over the caller's buffer.
+ *
+ * @buffer:  destination buffer, used as the print iterator's data pointer.
+ * @offset:  used as the print iterator's start position.
+ * @count:   used as the print iterator's initial 'remain' value.
+ * @data:    the struct xe_devcoredump pointer registered with dev_coredumpm().
+ * @datalen: not used by this function.
+ *
+ * Returns count minus what the iterator reports as remaining after printing
+ * (presumably the number of bytes placed in @buffer -- the iterator
+ * semantics live in the DRM print helpers, not in this file).
+ */
+static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
+                                  size_t count, void *data, size_t datalen)
+{
+       struct xe_devcoredump *coredump = data;
+       struct xe_devcoredump_snapshot *ss;
+       struct drm_printer p;
+       struct drm_print_iterator iter;
+       struct timespec64 ts;
+
+       iter.data = buffer;
+       iter.offset = 0;
+       iter.start = offset;
+       iter.remain = count;
+
+       /* Only the values stored by devcoredump_snapshot() are printed below. */
+       ss = &coredump->snapshot;
+       p = drm_coredump_printer(&iter);
+
+       drm_printf(&p, "**** Xe Device Coredump ****\n");
+       drm_printf(&p, "kernel: " UTS_RELEASE "\n");
+       drm_printf(&p, "module: " KBUILD_MODNAME "\n");
+
+       ts = ktime_to_timespec64(ss->snapshot_time);
+       drm_printf(&p, "Snapshot time: %lld.%09ld\n", ts.tv_sec, ts.tv_nsec);
+       ts = ktime_to_timespec64(ss->boot_time);
+       drm_printf(&p, "Uptime: %lld.%09ld\n", ts.tv_sec, ts.tv_nsec);
+
+       return count - iter.remain;
+}
+
+/*
+ * Free callback handed to dev_coredumpm() by xe_devcoredump().
+ *
+ * @data is the struct xe_devcoredump pointer registered with dev_coredumpm().
+ * Nothing is freed here: the structure is a member of struct xe_device.
+ * The callback only clears 'captured', so that a later xe_devcoredump() call
+ * takes a new snapshot, and logs the deletion.
+ */
+static void xe_devcoredump_free(void *data)
+{
+       struct xe_devcoredump *coredump = data;
+
+       coredump->captured = false;
+       drm_info(&coredump_to_xe(coredump)->drm,
+                "Xe device coredump has been deleted.\n");
+}
+
+/*
+ * Fill coredump->snapshot with the data captured at crash time.
+ *
+ * Currently this records only two timestamps: ktime_get_real() as the
+ * snapshot time and ktime_get_boottime() as the boot-relative time.
+ * @e is accepted but not used by this function yet.
+ */
+static void devcoredump_snapshot(struct xe_devcoredump *coredump,
+                                struct xe_engine *e)
+{
+       struct xe_devcoredump_snapshot *ss = &coredump->snapshot;
+
+       ss->snapshot_time = ktime_get_real();
+       ss->boot_time = ktime_get_boottime();
+}
+
+/**
+ * xe_devcoredump - Take the required snapshots and initialize coredump device.
+ * @e: The faulty xe_engine, where the issue was detected.
+ *
+ * This function should be called at the crash time within the serialized
+ * gt_reset. It is skipped if we still have the core dump device available
+ * with the information of the 'first' snapshot.
+ *
+ * The 'captured' flag is set here and cleared again by xe_devcoredump_free(),
+ * the free callback registered with dev_coredumpm() below.
+ *
+ * NOTE(review): the check and the update of 'captured' are done without any
+ * locking or atomic operation in this function, so it relies on the caller
+ * for serialization. The call site added by this patch is
+ * guc_engine_timedout_job() -- confirm that this path is serialized as
+ * described above.
+ */
+void xe_devcoredump(struct xe_engine *e)
+{
+       struct xe_device *xe = gt_to_xe(e->gt);
+       struct xe_devcoredump *coredump = &xe->devcoredump;
+
+       if (coredump->captured) {
+               drm_dbg(&xe->drm, "Multiple hangs are occurring, but only the first snapshot was taken\n");
+               return;
+       }
+
+       coredump->captured = true;
+       devcoredump_snapshot(coredump, e);
+
+       drm_info(&xe->drm, "Xe device coredump has been created\n");
+       drm_info(&xe->drm, "Check your /sys/class/drm/card%d/device/devcoredump/data\n",
+                xe->drm.primary->index);
+
+       dev_coredumpm(xe->drm.dev, THIS_MODULE, coredump, 0, GFP_KERNEL,
+                     xe_devcoredump_read, xe_devcoredump_free);
+}
+#endif
diff --git a/drivers/gpu/drm/xe/xe_devcoredump.h b/drivers/gpu/drm/xe/xe_devcoredump.h

new file mode 100644 (file)

index 0000000..8548821
--- /dev/null
+++ b/drivers/gpu/drm/xe/xe_devcoredump.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright © 2023 Intel Corporation
+ */
+
+#ifndef _XE_DEVCOREDUMP_H_
+#define _XE_DEVCOREDUMP_H_
+
+struct xe_device;
+struct xe_engine;
+
+#ifdef CONFIG_DEV_COREDUMP
+/* Implemented in xe_devcoredump.c; @e is the engine the issue was detected on. */
+void xe_devcoredump(struct xe_engine *e);
+#else
+/* No-op stub so callers need no #ifdef when CONFIG_DEV_COREDUMP is not set. */
+static inline void xe_devcoredump(struct xe_engine *e)
+{
+}
+#endif
+
+#endif
diff --git a/drivers/gpu/drm/xe/xe_devcoredump_types.h b/drivers/gpu/drm/xe/xe_devcoredump_types.h

new file mode 100644 (file)

index 0000000..52bd27c
--- /dev/null
+++ b/drivers/gpu/drm/xe/xe_devcoredump_types.h
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright © 2023 Intel Corporation
+ */
+
+#ifndef _XE_DEVCOREDUMP_TYPES_H_
+#define _XE_DEVCOREDUMP_TYPES_H_
+
+#include <linux/ktime.h>
+#include <linux/mutex.h>
+
+struct xe_device;
+
+/**
+ * struct xe_devcoredump_snapshot - Crash snapshot
+ *
+ * This struct contains all the useful information quickly captured at the time
+ * of the crash, so that any subsequent read of the coredump shows the state
+ * of the GPU at the time the issue happened.
+ */
+struct xe_devcoredump_snapshot {
+       /** @snapshot_time: Time of this capture, taken with ktime_get_real(). */
+       ktime_t snapshot_time;
+       /**
+        * @boot_time: ktime_get_boottime() value at capture time, so the
+        * uptime can be calculated.
+        */
+       ktime_t boot_time;
+};
+
+/**
+ * struct xe_devcoredump - Xe devcoredump main structure
+ *
+ * This struct represents the live and active dev_coredump node.
+ * It is created/populated at the time of a crash/error. Then it
+ * is read later, when the user accesses the device coredump data file
+ * to read the information.
+ */
+struct xe_devcoredump {
+       /**
+        * @xe: Xe device.
+        *
+        * NOTE(review): not assigned anywhere in the code visible in this
+        * patch; coredump_to_xe() uses container_of() instead.
+        */
+       struct xe_device *xe;
+       /**
+        * @captured: The snapshot of the first hang has already been taken.
+        * Set by xe_devcoredump() and cleared by the devcoredump free
+        * callback.
+        */
+       bool captured;
+       /** @snapshot: Snapshot is captured at time of the first crash */
+       struct xe_devcoredump_snapshot snapshot;
+};
+
+#endif
diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h

index f3cf5a4e5ab23a27e54fb6b3f3e549611d2bb998..91edbe4a3730e58dd8559d1998cb441837badd7c 100644 (file)
--- a/drivers/gpu/drm/xe/xe_device_types.h
+++ b/drivers/gpu/drm/xe/xe_device_types.h
@@ -12,6 +12,7 @@
  #include <drm/drm_file.h>
  #include <drm/ttm/ttm_device.h>
  
+#include "xe_devcoredump_types.h"
  #include "xe_gt_types.h"
  #include "xe_platform_types.h"
  #include "xe_step_types.h"
@@ -49,6 +50,9 @@ struct xe_device {
         /** @drm: drm device */
         struct drm_device drm;
  
+       /** @devcoredump: device coredump */
+       struct xe_devcoredump devcoredump;
+
         /** @info: device info */
         struct intel_device_info {
                 /** @graphics_name: graphics IP name */
diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c

index d0b48c885fda13634528f2abb912bb94ba924822..55b51ff791b8562e676d7eb99bbec198323cef92 100644 (file)
--- a/drivers/gpu/drm/xe/xe_guc_submit.c
+++ b/drivers/gpu/drm/xe/xe_guc_submit.c
@@ -14,6 +14,7 @@
  #include <drm/drm_managed.h>
  
  #include "regs/xe_lrc_layout.h"
+#include "xe_devcoredump.h"
  #include "xe_device.h"
  #include "xe_engine.h"
  #include "xe_force_wake.h"
@@ -800,6 +801,7 @@ guc_engine_timedout_job(struct drm_sched_job *drm_job)
                 drm_notice(&xe->drm, "Timedout job: seqno=%u, guc_id=%d, flags=0x%lx",
                            xe_sched_job_seqno(job), e->guc->id, e->flags);
                 simple_error_capture(e);
+               xe_devcoredump(e);
         } else {
                 drm_dbg(&xe->drm, "Timedout signaled job: seqno=%u, guc_id=%d, flags=0x%lx",
                          xe_sched_job_seqno(job), e->guc->id, e->flags);
author	Rodrigo Vivi <rodrigo.vivi@intel.com>
	Thu, 18 May 2023 21:12:39 +0000 (17:12 -0400)
committer	Rodrigo Vivi <rodrigo.vivi@intel.com>
	Tue, 19 Dec 2023 23:33:51 +0000 (18:33 -0500)
drivers/gpu/drm/xe/Kconfig		patch \| blob \| history
drivers/gpu/drm/xe/Makefile		patch \| blob \| history
drivers/gpu/drm/xe/xe_devcoredump.c	[new file with mode: 0644]	patch \| blob
drivers/gpu/drm/xe/xe_devcoredump.h	[new file with mode: 0644]	patch \| blob
drivers/gpu/drm/xe/xe_devcoredump_types.h	[new file with mode: 0644]	patch \| blob
drivers/gpu/drm/xe/xe_device_types.h		patch \| blob \| history
drivers/gpu/drm/xe/xe_guc_submit.c		patch \| blob \| history