drm/xe: Change devcoredump functions parameters to xe_sched_job
author: José Roberto de Souza <jose.souza@intel.com>
Tue, 23 Jan 2024 20:44:47 +0000 (12:44 -0800)
committer: José Roberto de Souza <jose.souza@intel.com>
Wed, 24 Jan 2024 18:53:38 +0000 (10:53 -0800)
When devcoredump starts dumping the VM contents, it will be necessary
to know the starting addresses of the batch buffers of the job that hung.

This information is set in xe_sched_job, and xe_sched_job is not easily
accessible from xe_exec_queue, so change the parameter here. The next
patch will append the batch buffer addresses to the devcoredump snapshot
capture.

v3:
- update the function documentation to refer to xe_sched_job

Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Maarten Lankhorst <dev@lankhorst.se>
Reviewed-by: Stuart Summers <stuart.summers@intel.com>
Signed-off-by: José Roberto de Souza <jose.souza@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240123204454.246788-2-jose.souza@intel.com
drivers/gpu/drm/xe/xe_devcoredump.c
drivers/gpu/drm/xe/xe_devcoredump.h
drivers/gpu/drm/xe/xe_guc_submit.c
drivers/gpu/drm/xe/xe_guc_submit.h

index 68abc0b195beb897af366cc171a01acc47ec836e..fd74dae292433971091cc79be77730993e0d689c 100644 (file)
@@ -16,6 +16,7 @@
 #include "xe_guc_ct.h"
 #include "xe_guc_submit.h"
 #include "xe_hw_engine.h"
+#include "xe_sched_job.h"
 
 /**
  * DOC: Xe device coredump
@@ -123,9 +124,10 @@ static void xe_devcoredump_free(void *data)
 }
 
 static void devcoredump_snapshot(struct xe_devcoredump *coredump,
-                                struct xe_exec_queue *q)
+                                struct xe_sched_job *job)
 {
        struct xe_devcoredump_snapshot *ss = &coredump->snapshot;
+       struct xe_exec_queue *q = job->q;
        struct xe_guc *guc = exec_queue_to_guc(q);
        struct xe_hw_engine *hwe;
        enum xe_hw_engine_id id;
@@ -150,7 +152,7 @@ static void devcoredump_snapshot(struct xe_devcoredump *coredump,
        xe_force_wake_get(gt_to_fw(q->gt), XE_FORCEWAKE_ALL);
 
        coredump->snapshot.ct = xe_guc_ct_snapshot_capture(&guc->ct, true);
-       coredump->snapshot.ge = xe_guc_exec_queue_snapshot_capture(q);
+       coredump->snapshot.ge = xe_guc_exec_queue_snapshot_capture(job);
 
        for_each_hw_engine(hwe, q->gt, id) {
                if (hwe->class != q->hwe->class ||
@@ -167,15 +169,15 @@ static void devcoredump_snapshot(struct xe_devcoredump *coredump,
 
 /**
  * xe_devcoredump - Take the required snapshots and initialize coredump device.
- * @q: The faulty xe_exec_queue, where the issue was detected.
+ * @job: The faulty xe_sched_job, where the issue was detected.
  *
  * This function should be called at the crash time within the serialized
  * gt_reset. It is skipped if we still have the core dump device available
  * with the information of the 'first' snapshot.
  */
-void xe_devcoredump(struct xe_exec_queue *q)
+void xe_devcoredump(struct xe_sched_job *job)
 {
-       struct xe_device *xe = gt_to_xe(q->gt);
+       struct xe_device *xe = gt_to_xe(job->q->gt);
        struct xe_devcoredump *coredump = &xe->devcoredump;
 
        if (coredump->captured) {
@@ -184,7 +186,7 @@ void xe_devcoredump(struct xe_exec_queue *q)
        }
 
        coredump->captured = true;
-       devcoredump_snapshot(coredump, q);
+       devcoredump_snapshot(coredump, job);
 
        drm_info(&xe->drm, "Xe device coredump has been created\n");
        drm_info(&xe->drm, "Check your /sys/class/drm/card%d/device/devcoredump/data\n",
index 6ac218a5c1945886d1d5d8335c412160ad9f6f6e..df8671f0b5eb2fcffe7145cc28d89056c6980817 100644 (file)
@@ -7,12 +7,12 @@
 #define _XE_DEVCOREDUMP_H_
 
 struct xe_device;
-struct xe_exec_queue;
+struct xe_sched_job;
 
 #ifdef CONFIG_DEV_COREDUMP
-void xe_devcoredump(struct xe_exec_queue *q);
+void xe_devcoredump(struct xe_sched_job *job);
 #else
-static inline void xe_devcoredump(struct xe_exec_queue *q)
+static inline void xe_devcoredump(struct xe_sched_job *job)
 {
 }
 #endif
index 7c29b8333c71925406a0edf50e049572c6b7f46c..2b008ec1b6de59c805551973b482fbd88cd66b76 100644 (file)
@@ -934,7 +934,7 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
                drm_notice(&xe->drm, "Timedout job: seqno=%u, guc_id=%d, flags=0x%lx",
                           xe_sched_job_seqno(job), q->guc->id, q->flags);
                simple_error_capture(q);
-               xe_devcoredump(q);
+               xe_devcoredump(job);
        } else {
                drm_dbg(&xe->drm, "Timedout signaled job: seqno=%u, guc_id=%d, flags=0x%lx",
                         xe_sched_job_seqno(job), q->guc->id, q->flags);
@@ -1780,7 +1780,7 @@ guc_exec_queue_wq_snapshot_print(struct xe_guc_submit_exec_queue_snapshot *snaps
 
 /**
  * xe_guc_exec_queue_snapshot_capture - Take a quick snapshot of the GuC Engine.
- * @q: Xe exec queue.
+ * @job: faulty Xe scheduled job.
  *
  * This can be printed out in a later stage like during dev_coredump
  * analysis.
@@ -1789,12 +1789,12 @@ guc_exec_queue_wq_snapshot_print(struct xe_guc_submit_exec_queue_snapshot *snaps
  * caller, using `xe_guc_exec_queue_snapshot_free`.
  */
 struct xe_guc_submit_exec_queue_snapshot *
-xe_guc_exec_queue_snapshot_capture(struct xe_exec_queue *q)
+xe_guc_exec_queue_snapshot_capture(struct xe_sched_job *job)
 {
+       struct xe_exec_queue *q = job->q;
        struct xe_guc *guc = exec_queue_to_guc(q);
        struct xe_device *xe = guc_to_xe(guc);
        struct xe_gpu_scheduler *sched = &q->guc->sched;
-       struct xe_sched_job *job;
        struct xe_guc_submit_exec_queue_snapshot *snapshot;
        int i;
 
@@ -1852,14 +1852,16 @@ xe_guc_exec_queue_snapshot_capture(struct xe_exec_queue *q)
        if (!snapshot->pending_list) {
                drm_err(&xe->drm, "Skipping GuC Engine pending_list snapshot.\n");
        } else {
+               struct xe_sched_job *job_iter;
+
                i = 0;
-               list_for_each_entry(job, &sched->base.pending_list, drm.list) {
+               list_for_each_entry(job_iter, &sched->base.pending_list, drm.list) {
                        snapshot->pending_list[i].seqno =
-                               xe_sched_job_seqno(job);
+                               xe_sched_job_seqno(job_iter);
                        snapshot->pending_list[i].fence =
-                               dma_fence_is_signaled(job->fence) ? 1 : 0;
+                               dma_fence_is_signaled(job_iter->fence) ? 1 : 0;
                        snapshot->pending_list[i].finished =
-                               dma_fence_is_signaled(&job->drm.s_fence->finished)
+                               dma_fence_is_signaled(&job_iter->drm.s_fence->finished)
                                ? 1 : 0;
                        i++;
                }
@@ -1945,10 +1947,28 @@ void xe_guc_exec_queue_snapshot_free(struct xe_guc_submit_exec_queue_snapshot *s
 static void guc_exec_queue_print(struct xe_exec_queue *q, struct drm_printer *p)
 {
        struct xe_guc_submit_exec_queue_snapshot *snapshot;
+       struct xe_gpu_scheduler *sched = &q->guc->sched;
+       struct xe_sched_job *job;
+       bool found = false;
+
+       spin_lock(&sched->base.job_list_lock);
+       list_for_each_entry(job, &sched->base.pending_list, drm.list) {
+               if (job->q == q) {
+                       xe_sched_job_get(job);
+                       found = true;
+                       break;
+               }
+       }
+       spin_unlock(&sched->base.job_list_lock);
 
-       snapshot = xe_guc_exec_queue_snapshot_capture(q);
+       if (!found)
+               return;
+
+       snapshot = xe_guc_exec_queue_snapshot_capture(job);
        xe_guc_exec_queue_snapshot_print(snapshot, p);
        xe_guc_exec_queue_snapshot_free(snapshot);
+
+       xe_sched_job_put(job);
 }
 
 /**
index fc97869c5b865d59d998f0adc55d004655fa269f..723dc2bd8df91260b0cdbc35201e6737d993483c 100644 (file)
@@ -9,8 +9,8 @@
 #include <linux/types.h>
 
 struct drm_printer;
-struct xe_exec_queue;
 struct xe_guc;
+struct xe_sched_job;
 
 int xe_guc_submit_init(struct xe_guc *guc);
 
@@ -27,7 +27,7 @@ int xe_guc_exec_queue_memory_cat_error_handler(struct xe_guc *guc, u32 *msg,
 int xe_guc_exec_queue_reset_failure_handler(struct xe_guc *guc, u32 *msg, u32 len);
 
 struct xe_guc_submit_exec_queue_snapshot *
-xe_guc_exec_queue_snapshot_capture(struct xe_exec_queue *q);
+xe_guc_exec_queue_snapshot_capture(struct xe_sched_job *job);
 void
 xe_guc_exec_queue_snapshot_print(struct xe_guc_submit_exec_queue_snapshot *snapshot,
                                 struct drm_printer *p);