drm/xe/multi_queue: Teardown group upon job timeout

Upon a job timeout, teardown the multi-queue group by
triggering TDR on all queues of the multi-queue group
and by skipping timeout checks in them.

v5: Ban the group while triggering TDR for the guc
    reported errors
    Add FIXME in TDR to take multi-queue group off HW
    (Matt Brost)
v6: Trigger cleanup of group only for multi-queue case

Signed-off-by: Niranjana Vishwanathapura <niranjana.vishwanathapura@intel.com>
Reviewed-by: Matthew Brost <matthew.brost@intel.com>
Link: https://patch.msgid.link/20251211010249.1647839-32-niranjana.vishwanathapura@intel.com
This commit is contained in:
Niranjana Vishwanathapura 2025-12-10 17:03:01 -08:00
parent bb9343f122
commit 8b81c76885
2 changed files with 24 additions and 1 deletions

View File

@ -64,6 +64,8 @@ struct xe_exec_queue_group {
struct mutex list_lock;
/** @sync_pending: CGP_SYNC_DONE g2h response pending */
bool sync_pending;
/** @banned: Group banned */
bool banned;
};
/**

View File

@ -602,6 +602,8 @@ static void xe_guc_exec_queue_group_trigger_cleanup(struct xe_exec_queue *q)
xe_gt_assert(guc_to_gt(exec_queue_to_guc(q)),
xe_exec_queue_is_multi_queue(q));
/* Group banned, skip timeout check in TDR */
WRITE_ONCE(group->banned, true);
xe_guc_exec_queue_trigger_cleanup(primary);
mutex_lock(&group->list_lock);
@ -617,6 +619,9 @@ static void xe_guc_exec_queue_reset_trigger_cleanup(struct xe_exec_queue *q)
struct xe_exec_queue_group *group = q->multi_queue.group;
struct xe_exec_queue *eq;
/* Group banned, skip timeout check in TDR */
WRITE_ONCE(group->banned, true);
set_exec_queue_reset(primary);
if (!exec_queue_banned(primary) && !exec_queue_check_timeout(primary))
xe_guc_exec_queue_trigger_cleanup(primary);
@ -1487,6 +1492,19 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
exec_queue_killed_or_banned_or_wedged(q) ||
exec_queue_destroyed(q);
/* Skip timeout check if multi-queue group is banned */
if (xe_exec_queue_is_multi_queue(q) &&
READ_ONCE(q->multi_queue.group->banned))
skip_timeout_check = true;
/*
* FIXME: In multi-queue scenario, the TDR must ensure that the whole
* multi-queue group is off the HW before signaling the fences to avoid
* possible memory corruptions. This means disabling scheduling on the
* primary queue before or during the secondary queue's TDR. Need to
* implement this in least obtrusive way.
*/
/*
* If devcoredump not captured and GuC capture for the job is not ready
* do manual capture first and decide later if we need to use it
@ -1639,7 +1657,10 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
xe_sched_add_pending_job(sched, job);
xe_sched_submission_start(sched);
xe_guc_exec_queue_trigger_cleanup(q);
if (xe_exec_queue_is_multi_queue(q))
xe_guc_exec_queue_group_trigger_cleanup(q);
else
xe_guc_exec_queue_trigger_cleanup(q);
/* Mark all outstanding jobs as bad, thus completing them */
spin_lock(&sched->base.job_list_lock);