mirror of
https://github.com/torvalds/linux.git
synced 2026-06-04 20:46:48 +02:00
drm/xe/guc_submit: fix race around pending_disable
Currently in some testcases we can trigger: [drm] *ERROR* GT0: SCHED_DONE: Unexpected engine state 0x02b1, guc_id=8, runnable_state=0 [drm] *ERROR* GT0: G2H action 0x1002 failed (-EPROTO) len 3 msg 02 10 00 90 08 00 00 00 00 00 00 00 Looking at a snippet of corresponding ftrace for this GuC id we can see: 498.852891: xe_sched_msg_add: dev=0000:03:00.0, gt=0 guc_id=8, opcode=3 498.854083: xe_sched_msg_recv: dev=0000:03:00.0, gt=0 guc_id=8, opcode=3 498.855389: xe_exec_queue_kill: dev=0000:03:00.0, 5:0x1, gt=0, width=1, guc_id=8, guc_state=0x3, flags=0x0 498.855436: xe_exec_queue_lr_cleanup: dev=0000:03:00.0, 5:0x1, gt=0, width=1, guc_id=8, guc_state=0x83, flags=0x0 498.856767: xe_exec_queue_close: dev=0000:03:00.0, 5:0x1, gt=0, width=1, guc_id=8, guc_state=0x83, flags=0x0 498.862889: xe_exec_queue_scheduling_disable: dev=0000:03:00.0, 5:0x1, gt=0, width=1, guc_id=8, guc_state=0xa9, flags=0x0 498.863032: xe_exec_queue_scheduling_disable: dev=0000:03:00.0, 5:0x1, gt=0, width=1, guc_id=8, guc_state=0x2b9, flags=0x0 498.875596: xe_exec_queue_scheduling_done: dev=0000:03:00.0, 5:0x1, gt=0, width=1, guc_id=8, guc_state=0x2b9, flags=0x0 498.875604: xe_exec_queue_deregister: dev=0000:03:00.0, 5:0x1, gt=0, width=1, guc_id=8, guc_state=0x2b1, flags=0x0 499.074483: xe_exec_queue_deregister_done: dev=0000:03:00.0, 5:0x1, gt=0, width=1, guc_id=8, guc_state=0x2b1, flags=0x0 This looks to be the two scheduling_disable racing with each other, one from the suspend (opcode=3) and then again during lr cleanup. While those two operations are serialized, the G2H portion is not, therefore when marking the queue as pending_disabled and then firing off the first request, we proceed do the same again, however the first disable response only fires after this which then clears the pending_disabled. At this point the second comes back and is processed, however the pending_disabled is no longer set, hence triggering the warning. To fix this wait for pending_disabled when doing the lr cleanup and calling disable_scheduling_deregister. Also do the same for all other disable_scheduling callers. Fixes:dd08ebf6c3("drm/xe: Introduce a new DRM driver for Intel GPUs") Closes: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/3515 Signed-off-by: Matthew Auld <matthew.auld@intel.com> Cc: Matthew Brost <matthew.brost@intel.com> Cc: <stable@vger.kernel.org> # v6.8+ Reviewed-by: Matthew Brost <mattheq.brost@intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20241122161914.321263-5-matthew.auld@intel.com (cherry picked from commitddb106d212) Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
This commit is contained in:
parent
ece45026b0
commit
6965f91a00
|
|
@ -769,17 +769,19 @@ static void disable_scheduling_deregister(struct xe_guc *guc,
|
|||
struct xe_exec_queue *q)
|
||||
{
|
||||
MAKE_SCHED_CONTEXT_ACTION(q, DISABLE);
|
||||
struct xe_device *xe = guc_to_xe(guc);
|
||||
int ret;
|
||||
|
||||
set_min_preemption_timeout(guc, q);
|
||||
smp_rmb();
|
||||
ret = wait_event_timeout(guc->ct.wq, !exec_queue_pending_enable(q) ||
|
||||
xe_guc_read_stopped(guc), HZ * 5);
|
||||
ret = wait_event_timeout(guc->ct.wq,
|
||||
(!exec_queue_pending_enable(q) &&
|
||||
!exec_queue_pending_disable(q)) ||
|
||||
xe_guc_read_stopped(guc),
|
||||
HZ * 5);
|
||||
if (!ret) {
|
||||
struct xe_gpu_scheduler *sched = &q->guc->sched;
|
||||
|
||||
drm_warn(&xe->drm, "Pending enable failed to respond");
|
||||
xe_gt_warn(q->gt, "Pending enable/disable failed to respond\n");
|
||||
xe_sched_submission_start(sched);
|
||||
xe_gt_reset_async(q->gt);
|
||||
xe_sched_tdr_queue_imm(sched);
|
||||
|
|
@ -1101,7 +1103,8 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
|
|||
* modifying state
|
||||
*/
|
||||
ret = wait_event_timeout(guc->ct.wq,
|
||||
!exec_queue_pending_enable(q) ||
|
||||
(!exec_queue_pending_enable(q) &&
|
||||
!exec_queue_pending_disable(q)) ||
|
||||
xe_guc_read_stopped(guc), HZ * 5);
|
||||
if (!ret || xe_guc_read_stopped(guc))
|
||||
goto trigger_reset;
|
||||
|
|
@ -1330,8 +1333,8 @@ static void __guc_exec_queue_process_msg_suspend(struct xe_sched_msg *msg)
|
|||
|
||||
if (guc_exec_queue_allowed_to_change_state(q) && !exec_queue_suspended(q) &&
|
||||
exec_queue_enabled(q)) {
|
||||
wait_event(guc->ct.wq, q->guc->resume_time != RESUME_PENDING ||
|
||||
xe_guc_read_stopped(guc));
|
||||
wait_event(guc->ct.wq, (q->guc->resume_time != RESUME_PENDING ||
|
||||
xe_guc_read_stopped(guc)) && !exec_queue_pending_disable(q));
|
||||
|
||||
if (!xe_guc_read_stopped(guc)) {
|
||||
s64 since_resume_ms =
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user