From e275d61c5f3ffc250b2a9601d36fbd11b4db774b Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Fri, 23 Feb 2024 12:46:59 -0800 Subject: [PATCH] drm/xe/guc: Handle timing out of signaled jobs gracefully MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Timing out of signaled jobs can happen during regular operations (e.g. an exec queue is closed immediately after its last fence signaled). The TDR can run before the worker which frees jobs. Rather than running through the TDR if a signaled job is found, simply free it without any debug messages. Cc: Thomas Hellström Reported-by: José Roberto de Souza Closes: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/1271 Signed-off-by: Matthew Brost Reviewed-by: Thomas Hellström Tested-by: José Roberto de Souza Reviewed-by: José Roberto de Souza Link: https://patchwork.freedesktop.org/patch/msgid/20240223204659.40750-1-matthew.brost@intel.com --- drivers/gpu/drm/xe/xe_guc_submit.c | 32 ++++++++++++++++++------------ 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c index ff77bc8da1b27..29748e40555fc 100644 --- a/drivers/gpu/drm/xe/xe_guc_submit.c +++ b/drivers/gpu/drm/xe/xe_guc_submit.c @@ -929,20 +929,26 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job) int err = -ETIME; int i = 0; - if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &job->fence->flags)) { - drm_notice(&xe->drm, "Timedout job: seqno=%u, guc_id=%d, flags=0x%lx", - xe_sched_job_seqno(job), q->guc->id, q->flags); - xe_gt_WARN(q->gt, q->flags & EXEC_QUEUE_FLAG_KERNEL, - "Kernel-submitted job timed out\n"); - xe_gt_WARN(q->gt, q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q), - "VM job timed out on non-killed execqueue\n"); - - simple_error_capture(q); - xe_devcoredump(job); - } else { - drm_dbg(&xe->drm, "Timedout signaled job: seqno=%u, guc_id=%d, flags=0x%lx", - xe_sched_job_seqno(job), q->guc->id, q->flags); + /* + * TDR has fired before 
free job worker. Common if exec queue + * immediately closed after last fence signaled. + */ + if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &job->fence->flags)) { + guc_exec_queue_free_job(drm_job); + + return DRM_GPU_SCHED_STAT_NOMINAL; } + + drm_notice(&xe->drm, "Timedout job: seqno=%u, guc_id=%d, flags=0x%lx", + xe_sched_job_seqno(job), q->guc->id, q->flags); + xe_gt_WARN(q->gt, q->flags & EXEC_QUEUE_FLAG_KERNEL, + "Kernel-submitted job timed out\n"); + xe_gt_WARN(q->gt, q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q), + "VM job timed out on non-killed execqueue\n"); + + simple_error_capture(q); + xe_devcoredump(job); + trace_xe_sched_job_timedout(job); /* Kill the run_job entry point */ -- 2.30.2