drm/xe/guc: Track pending-enable source in submission state

Add explicit tracking in the GuC submission state to record the source
of a pending enable (TDR vs. queue resume path vs. submission).
Disambiguating the origin lets the GuC submission state machine apply
the correct recovery/replay behavior.

This helps VF restore: when the device comes back, the state machine knows
whether the pending enable stems from timeout recovery, from a queue resume
sequence, or submission and can gate sequencing and fixups accordingly.

v4:
 - Clarify commit message (Tomasz)

Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Reviewed-by: Tomasz Lis <tomasz.lis@intel.com>
Link: https://lore.kernel.org/r/20251008214532.3442967-4-matthew.brost@intel.com
This commit is contained in:
Matthew Brost 2025-10-08 14:45:01 -07:00
parent 26cd498e00
commit 7e1fe102c8

View File

@ -69,6 +69,8 @@ exec_queue_to_guc(struct xe_exec_queue *q)
#define EXEC_QUEUE_STATE_BANNED (1 << 9)
#define EXEC_QUEUE_STATE_CHECK_TIMEOUT (1 << 10)
#define EXEC_QUEUE_STATE_EXTRA_REF (1 << 11)
#define EXEC_QUEUE_STATE_PENDING_RESUME (1 << 12)
#define EXEC_QUEUE_STATE_PENDING_TDR_EXIT (1 << 13)
static bool exec_queue_registered(struct xe_exec_queue *q)
{
@ -220,6 +222,36 @@ static void set_exec_queue_extra_ref(struct xe_exec_queue *q)
atomic_or(EXEC_QUEUE_STATE_EXTRA_REF, &q->guc->state);
}
static bool __maybe_unused exec_queue_pending_resume(struct xe_exec_queue *q)
{
return atomic_read(&q->guc->state) & EXEC_QUEUE_STATE_PENDING_RESUME;
}
static void set_exec_queue_pending_resume(struct xe_exec_queue *q)
{
atomic_or(EXEC_QUEUE_STATE_PENDING_RESUME, &q->guc->state);
}
static void clear_exec_queue_pending_resume(struct xe_exec_queue *q)
{
atomic_and(~EXEC_QUEUE_STATE_PENDING_RESUME, &q->guc->state);
}
static bool __maybe_unused exec_queue_pending_tdr_exit(struct xe_exec_queue *q)
{
return atomic_read(&q->guc->state) & EXEC_QUEUE_STATE_PENDING_TDR_EXIT;
}
static void set_exec_queue_pending_tdr_exit(struct xe_exec_queue *q)
{
atomic_or(EXEC_QUEUE_STATE_PENDING_TDR_EXIT, &q->guc->state);
}
static void clear_exec_queue_pending_tdr_exit(struct xe_exec_queue *q)
{
atomic_and(~EXEC_QUEUE_STATE_PENDING_TDR_EXIT, &q->guc->state);
}
static bool exec_queue_killed_or_banned_or_wedged(struct xe_exec_queue *q)
{
return (atomic_read(&q->guc->state) &
@ -1334,6 +1366,7 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
return DRM_GPU_SCHED_STAT_RESET;
sched_enable:
set_exec_queue_pending_tdr_exit(q);
enable_scheduling(q);
rearm:
/*
@ -1493,6 +1526,7 @@ static void __guc_exec_queue_process_msg_resume(struct xe_sched_msg *msg)
clear_exec_queue_suspended(q);
if (!exec_queue_enabled(q)) {
q->guc->resume_time = RESUME_PENDING;
set_exec_queue_pending_resume(q);
enable_scheduling(q);
}
} else {
@ -2065,6 +2099,8 @@ static void handle_sched_done(struct xe_guc *guc, struct xe_exec_queue *q,
xe_gt_assert(guc_to_gt(guc), exec_queue_pending_enable(q));
q->guc->resume_time = ktime_get();
clear_exec_queue_pending_resume(q);
clear_exec_queue_pending_tdr_exit(q);
clear_exec_queue_pending_enable(q);
smp_wmb();
wake_up_all(&guc->ct.wq);