From ece45026b057edb91bc2a38f0be05309b2b13ba6 Mon Sep 17 00:00:00 2001 From: Matt Roper Date: Mon, 25 Nov 2024 11:48:39 -0800 Subject: [PATCH 1/6] drm/xe: Update xe2_graphics name string MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since both Xe2 and Xe3 platforms currently use the same set of graphics IP feature flags, we associate the "graphics_xe2" structure with both IPs. Update the name string on that IP structure to clarify this and avoid confusion as Xe3 platforms start going into public CI. Fixes: 800d75bf20ae ("drm/xe/xe3: Define Xe3 feature flags") Signed-off-by: Matt Roper Reviewed-by: Vinay Belgaumkar Link: https://patchwork.freedesktop.org/patch/msgid/20241125194838.1190599-2-matthew.d.roper@intel.com (cherry picked from commit 4fe70f664a105391321c85b2af241001e8118d24) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c index e6640283893f..6b7f77425c7f 100644 --- a/drivers/gpu/drm/xe/xe_pci.c +++ b/drivers/gpu/drm/xe/xe_pci.c @@ -174,7 +174,7 @@ static const struct xe_graphics_desc graphics_xelpg = { GENMASK(XE_HW_ENGINE_CCS3, XE_HW_ENGINE_CCS0) static const struct xe_graphics_desc graphics_xe2 = { - .name = "Xe2_LPG / Xe2_HPG", + .name = "Xe2_LPG / Xe2_HPG / Xe3_LPG", XE2_GFX_FEATURES, }; From 6965f91a000a24b2c25480a92696a007545d97ec Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Fri, 22 Nov 2024 16:19:16 +0000 Subject: [PATCH 2/6] drm/xe/guc_submit: fix race around pending_disable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently in some testcases we can trigger: [drm] *ERROR* GT0: SCHED_DONE: Unexpected engine state 0x02b1, guc_id=8, runnable_state=0 [drm] *ERROR* GT0: G2H action 0x1002 failed (-EPROTO) len 3 msg 02 10 00 90 08 00 00 00 00 00 00 00 Looking at a snippet of corresponding ftrace for this GuC id we can see: 498.852891: xe_sched_msg_add: dev=0000:03:00.0, gt=0 guc_id=8, opcode=3 498.854083: xe_sched_msg_recv: dev=0000:03:00.0, gt=0 guc_id=8, opcode=3 498.855389: xe_exec_queue_kill: dev=0000:03:00.0, 5:0x1, gt=0, width=1, guc_id=8, guc_state=0x3, flags=0x0 498.855436: xe_exec_queue_lr_cleanup: dev=0000:03:00.0, 5:0x1, gt=0, width=1, guc_id=8, guc_state=0x83, flags=0x0 498.856767: xe_exec_queue_close: dev=0000:03:00.0, 5:0x1, gt=0, width=1, guc_id=8, guc_state=0x83, flags=0x0 498.862889: xe_exec_queue_scheduling_disable: dev=0000:03:00.0, 5:0x1, gt=0, width=1, guc_id=8, guc_state=0xa9, flags=0x0 498.863032: xe_exec_queue_scheduling_disable: dev=0000:03:00.0, 5:0x1, gt=0, width=1, guc_id=8, guc_state=0x2b9, flags=0x0 498.875596: xe_exec_queue_scheduling_done: dev=0000:03:00.0, 5:0x1, gt=0, width=1, guc_id=8, guc_state=0x2b9, flags=0x0 498.875604: xe_exec_queue_deregister: dev=0000:03:00.0, 5:0x1, gt=0, width=1, guc_id=8, guc_state=0x2b1, flags=0x0 499.074483: xe_exec_queue_deregister_done: dev=0000:03:00.0, 5:0x1, gt=0, width=1, guc_id=8, guc_state=0x2b1, flags=0x0 This looks to be the two scheduling_disable racing with each other, one from the suspend (opcode=3) and then again during lr cleanup. While those two operations are serialized, the G2H portion is not, therefore when marking the queue as pending_disabled and then firing off the first request, we proceed do the same again, however the first disable response only fires after this which then clears the pending_disabled. At this point the second comes back and is processed, however the pending_disabled is no longer set, hence triggering the warning. To fix this wait for pending_disabled when doing the lr cleanup and calling disable_scheduling_deregister. Also do the same for all other disable_scheduling callers. Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs") Closes: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/3515 Signed-off-by: Matthew Auld Cc: Matthew Brost Cc: # v6.8+ Reviewed-by: Matthew Brost Link: https://patchwork.freedesktop.org/patch/msgid/20241122161914.321263-5-matthew.auld@intel.com (cherry picked from commit ddb106d2120a0bf1c5ff87c71d059d193814da41) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_guc_submit.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c index 7afcc243037c..ebc85d98b025 100644 --- a/drivers/gpu/drm/xe/xe_guc_submit.c +++ b/drivers/gpu/drm/xe/xe_guc_submit.c @@ -769,17 +769,19 @@ static void disable_scheduling_deregister(struct xe_guc *guc, struct xe_exec_queue *q) { MAKE_SCHED_CONTEXT_ACTION(q, DISABLE); - struct xe_device *xe = guc_to_xe(guc); int ret; set_min_preemption_timeout(guc, q); smp_rmb(); - ret = wait_event_timeout(guc->ct.wq, !exec_queue_pending_enable(q) || - xe_guc_read_stopped(guc), HZ * 5); + ret = wait_event_timeout(guc->ct.wq, + (!exec_queue_pending_enable(q) && + !exec_queue_pending_disable(q)) || + xe_guc_read_stopped(guc), + HZ * 5); if (!ret) { struct xe_gpu_scheduler *sched = &q->guc->sched; - drm_warn(&xe->drm, "Pending enable failed to respond"); + xe_gt_warn(q->gt, "Pending enable/disable failed to respond\n"); xe_sched_submission_start(sched); xe_gt_reset_async(q->gt); xe_sched_tdr_queue_imm(sched); @@ -1101,7 +1103,8 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job) * modifying state */ ret = wait_event_timeout(guc->ct.wq, - !exec_queue_pending_enable(q) || + (!exec_queue_pending_enable(q) && + !exec_queue_pending_disable(q)) || xe_guc_read_stopped(guc), HZ * 5); if (!ret || xe_guc_read_stopped(guc)) goto trigger_reset; @@ -1330,8 +1333,8 @@ static void __guc_exec_queue_process_msg_suspend(struct xe_sched_msg *msg) if (guc_exec_queue_allowed_to_change_state(q) && !exec_queue_suspended(q) && exec_queue_enabled(q)) { - wait_event(guc->ct.wq, q->guc->resume_time != RESUME_PENDING || - xe_guc_read_stopped(guc)); + wait_event(guc->ct.wq, (q->guc->resume_time != RESUME_PENDING || + xe_guc_read_stopped(guc)) && !exec_queue_pending_disable(q)); if (!xe_guc_read_stopped(guc)) { s64 since_resume_ms = From 87651f31ae4e6e6e7e6c7270b9b469405e747407 Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Fri, 22 Nov 2024 16:19:17 +0000 Subject: [PATCH 3/6] drm/xe/guc_submit: fix race around suspend_pending MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently in some testcases we can trigger: xe 0000:03:00.0: [drm] Assertion `exec_queue_destroyed(q)` failed! .... WARNING: CPU: 18 PID: 2640 at drivers/gpu/drm/xe/xe_guc_submit.c:1826 xe_guc_sched_done_handler+0xa54/0xef0 [xe] xe 0000:03:00.0: [drm] *ERROR* GT1: DEREGISTER_DONE: Unexpected engine state 0x00a1, guc_id=57 Looking at a snippet of corresponding ftrace for this GuC id we can see: 162.673311: xe_sched_msg_add: dev=0000:03:00.0, gt=1 guc_id=57, opcode=3 162.673317: xe_sched_msg_recv: dev=0000:03:00.0, gt=1 guc_id=57, opcode=3 162.673319: xe_exec_queue_scheduling_disable: dev=0000:03:00.0, 1:0x2, gt=1, width=1, guc_id=57, guc_state=0x29, flags=0x0 162.674089: xe_exec_queue_kill: dev=0000:03:00.0, 1:0x2, gt=1, width=1, guc_id=57, guc_state=0x29, flags=0x0 162.674108: xe_exec_queue_close: dev=0000:03:00.0, 1:0x2, gt=1, width=1, guc_id=57, guc_state=0xa9, flags=0x0 162.674488: xe_exec_queue_scheduling_done: dev=0000:03:00.0, 1:0x2, gt=1, width=1, guc_id=57, guc_state=0xa9, flags=0x0 162.678452: xe_exec_queue_deregister: dev=0000:03:00.0, 1:0x2, gt=1, width=1, guc_id=57, guc_state=0xa1, flags=0x0 It looks like we try to suspend the queue (opcode=3), setting suspend_pending and triggering a disable_scheduling. The user then closes the queue. However the close will also forcefully signal the suspend fence after killing the queue, later when the G2H response for disable_scheduling comes back we have now cleared suspend_pending when signalling the suspend fence, so the disable_scheduling now incorrectly tries to also deregister the queue. This leads to warnings since the queue has yet to even be marked for destruction. We also seem to trigger errors later with trying to double unregister the same queue. To fix this tweak the ordering when handling the response to ensure we don't race with a disable_scheduling that didn't actually intend to perform an unregister. The destruction path should now also correctly wait for any pending_disable before marking as destroyed. Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs") Closes: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/3371 Signed-off-by: Matthew Auld Cc: Matthew Brost Cc: # v6.8+ Reviewed-by: Matthew Brost Link: https://patchwork.freedesktop.org/patch/msgid/20241122161914.321263-6-matthew.auld@intel.com (cherry picked from commit f161809b362f027b6d72bd998e47f8f0bad60a2e) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_guc_submit.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c index ebc85d98b025..5f3ef8b1f194 100644 --- a/drivers/gpu/drm/xe/xe_guc_submit.c +++ b/drivers/gpu/drm/xe/xe_guc_submit.c @@ -1871,16 +1871,29 @@ static void handle_sched_done(struct xe_guc *guc, struct xe_exec_queue *q, xe_gt_assert(guc_to_gt(guc), runnable_state == 0); xe_gt_assert(guc_to_gt(guc), exec_queue_pending_disable(q)); - clear_exec_queue_pending_disable(q); if (q->guc->suspend_pending) { suspend_fence_signal(q); + clear_exec_queue_pending_disable(q); } else { if (exec_queue_banned(q) || check_timeout) { smp_wmb(); wake_up_all(&guc->ct.wq); } - if (!check_timeout) + if (!check_timeout && exec_queue_destroyed(q)) { + /* + * Make sure to clear the pending_disable only + * after sampling the destroyed state. We want + * to ensure we don't trigger the unregister too + * early with something intending to only + * disable scheduling. The caller doing the + * destroy must wait for an ongoing + * pending_disable before marking as destroyed. + */ + clear_exec_queue_pending_disable(q); deregister_exec_queue(guc, q); + } else { + clear_exec_queue_pending_disable(q); + } } } } From 23346f85163de83aca6dc30dde3944131cf54706 Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Tue, 26 Nov 2024 18:13:00 +0000 Subject: [PATCH 4/6] drm/xe/migrate: fix pat index usage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit XE_CACHE_WB must be converted into the per-platform pat index for that particular caching mode, otherwise we are just encoding whatever happens to be the value of that enum. Fixes: e8babb280b5e ("drm/xe: Convert multiple bind ops into single job") Signed-off-by: Matthew Auld Cc: Matthew Brost Cc: Nirmoy Das Cc: # v6.12+ Reviewed-by: Nirmoy Das Reviewed-by: Matthew Brost Link: https://patchwork.freedesktop.org/patch/msgid/20241126181259.159713-3-matthew.auld@intel.com (cherry picked from commit f3dc9246f9c3cd5a7d8fd70cfd805bfc52214e2e) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_migrate.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c index cfd31ae49cc1..48e205a40fd2 100644 --- a/drivers/gpu/drm/xe/xe_migrate.c +++ b/drivers/gpu/drm/xe/xe_migrate.c @@ -1350,6 +1350,7 @@ __xe_migrate_update_pgtables(struct xe_migrate *m, /* For sysmem PTE's, need to map them in our hole.. */ if (!IS_DGFX(xe)) { + u16 pat_index = xe->pat.idx[XE_CACHE_WB]; u32 ptes, ofs; ppgtt_ofs = NUM_KERNEL_PDE - 1; @@ -1409,7 +1410,7 @@ __xe_migrate_update_pgtables(struct xe_migrate *m, pt_bo->update_index = current_update; addr = vm->pt_ops->pte_encode_bo(pt_bo, 0, - XE_CACHE_WB, 0); + pat_index, 0); bb->cs[bb->len++] = lower_32_bits(addr); bb->cs[bb->len++] = upper_32_bits(addr); } From c78f4399188369a55eed69cbf19a8aad2a65ac75 Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Tue, 26 Nov 2024 18:13:01 +0000 Subject: [PATCH 5/6] drm/xe/migrate: use XE_BO_FLAG_PAGETABLE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On some HW we want to avoid the host caching PTEs, since access from GPU side can be incoherent. However here the special migrate object is mapping PTEs which are written from the host and potentially cached. Use XE_BO_FLAG_PAGETABLE to ensure that non-cached mapping is used, on platforms where this matters. Fixes: 7a060d786cc1 ("drm/xe/mtl: Map PPGTT as CPU:WC") Signed-off-by: Matthew Auld Cc: Matthew Brost Cc: Nirmoy Das Cc: # v6.8+ Reviewed-by: Nirmoy Das Reviewed-by: Matthew Brost Link: https://patchwork.freedesktop.org/patch/msgid/20241126181259.159713-4-matthew.auld@intel.com (cherry picked from commit febc689b27d28973cd02f667548a5dca383d859a) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_migrate.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c index 48e205a40fd2..1b97d90aadda 100644 --- a/drivers/gpu/drm/xe/xe_migrate.c +++ b/drivers/gpu/drm/xe/xe_migrate.c @@ -209,7 +209,8 @@ static int xe_migrate_prepare_vm(struct xe_tile *tile, struct xe_migrate *m, num_entries * XE_PAGE_SIZE, ttm_bo_type_kernel, XE_BO_FLAG_VRAM_IF_DGFX(tile) | - XE_BO_FLAG_PINNED); + XE_BO_FLAG_PINNED | + XE_BO_FLAG_PAGETABLE); if (IS_ERR(bo)) return PTR_ERR(bo); From aef0b4a07277f715bfc2a0d76a16da2bc4e89205 Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Tue, 26 Nov 2024 09:46:11 -0800 Subject: [PATCH 6/6] drm/xe: Take PM ref in delayed snapshot capture worker MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The delayed snapshot capture worker can access the GPU or VRAM both of which require a PM reference. Take a reference in this worker. Cc: Rodrigo Vivi Cc: Maarten Lankhorst Fixes: 4f04d07c0a94 ("drm/xe: Faster devcoredump") Signed-off-by: Matthew Brost Reviewed-by: Matthew Auld Link: https://patchwork.freedesktop.org/patch/msgid/20241126174615.2665852-5-matthew.brost@intel.com (cherry picked from commit 1c6878af115a4586a40d6c14d530fa9f93e0bd83) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_devcoredump.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c index d2679c5d976b..0b0cd6aa1d9f 100644 --- a/drivers/gpu/drm/xe/xe_devcoredump.c +++ b/drivers/gpu/drm/xe/xe_devcoredump.c @@ -23,6 +23,7 @@ #include "xe_guc_submit.h" #include "xe_hw_engine.h" #include "xe_module.h" +#include "xe_pm.h" #include "xe_sched_job.h" #include "xe_vm.h" @@ -158,8 +159,11 @@ static void xe_devcoredump_deferred_snap_work(struct work_struct *work) { struct xe_devcoredump_snapshot *ss = container_of(work, typeof(*ss), work); struct xe_devcoredump *coredump = container_of(ss, typeof(*coredump), snapshot); + struct xe_device *xe = coredump_to_xe(coredump); unsigned int fw_ref; + xe_pm_runtime_get(xe); + /* keep going if fw fails as we still want to save the memory and SW data */ fw_ref = xe_force_wake_get(gt_to_fw(ss->gt), XE_FORCEWAKE_ALL); if (!xe_force_wake_ref_has_domain(fw_ref, XE_FORCEWAKE_ALL)) @@ -168,6 +172,8 @@ static void xe_devcoredump_deferred_snap_work(struct work_struct *work) xe_guc_exec_queue_snapshot_capture_delayed(ss->ge); xe_force_wake_put(gt_to_fw(ss->gt), fw_ref); + xe_pm_runtime_put(xe); + /* Calculate devcoredump size */ ss->read.size = __xe_devcoredump_read(NULL, INT_MAX, coredump);