From fd25fa90edcfd4db5bf69c11621021a7cfd11d53 Mon Sep 17 00:00:00 2001 From: Tejas Upadhyay Date: Thu, 10 Jul 2025 10:29:45 +0530 Subject: [PATCH 1/7] drm/xe: Dont skip TLB invalidations on VF Skipping TLB invalidations on VF causing unrecoverable faults. Probable reason for skipping TLB invalidations on SRIOV could be lack of support for instruction MI_FLUSH_DW_STORE_INDEX. Add back TLB flush with some additional handling. Helps in resolving, [ 704.913454] xe 0000:00:02.1: [drm:pf_queue_work_func [xe]] ASID: 0 VFID: 0 PDATA: 0x0d92 Faulted Address: 0x0000000002fa0000 FaultType: 0 AccessType: 1 FaultLevel: 0 EngineClass: 3 bcs EngineInstance: 8 [ 704.913551] xe 0000:00:02.1: [drm:pf_queue_work_func [xe]] Fault response: Unsuccessful -22 V2: - Use Xmas tree (MichalW) Suggested-by: Matthew Brost Fixes: 97515d0b3ed92 ("drm/xe/vf: Don't emit access to Global HWSP if VF") Reviewed-by: Matthew Brost Link: https://patchwork.freedesktop.org/patch/msgid/20250710045945.1023840-1-tejas.upadhyay@intel.com Signed-off-by: Tejas Upadhyay (cherry picked from commit b528e896fa570844d654b5a4617a97fa770a1030) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_ring_ops.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_ring_ops.c b/drivers/gpu/drm/xe/xe_ring_ops.c index bc1689db4cd7..7b50c7c1ee21 100644 --- a/drivers/gpu/drm/xe/xe_ring_ops.c +++ b/drivers/gpu/drm/xe/xe_ring_ops.c @@ -110,13 +110,14 @@ static int emit_bb_start(u64 batch_addr, u32 ppgtt_flag, u32 *dw, int i) return i; } -static int emit_flush_invalidate(u32 *dw, int i) +static int emit_flush_invalidate(u32 addr, u32 val, u32 *dw, int i) { dw[i++] = MI_FLUSH_DW | MI_INVALIDATE_TLB | MI_FLUSH_DW_OP_STOREDW | - MI_FLUSH_IMM_DW | MI_FLUSH_DW_STORE_INDEX; - dw[i++] = LRC_PPHWSP_FLUSH_INVAL_SCRATCH_ADDR; - dw[i++] = 0; + MI_FLUSH_IMM_DW; + + dw[i++] = addr | MI_FLUSH_DW_USE_GTT; dw[i++] = 0; + dw[i++] = val; return i; } @@ -397,23 +398,20 @@ static void __emit_job_gen12_render_compute(struct xe_sched_job *job, static void emit_migration_job_gen12(struct xe_sched_job *job, struct xe_lrc *lrc, u32 seqno) { + u32 saddr = xe_lrc_start_seqno_ggtt_addr(lrc); u32 dw[MAX_JOB_SIZE_DW], i = 0; i = emit_copy_timestamp(lrc, dw, i); - i = emit_store_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc), - seqno, dw, i); + i = emit_store_imm_ggtt(saddr, seqno, dw, i); dw[i++] = MI_ARB_ON_OFF | MI_ARB_DISABLE; /* Enabled again below */ i = emit_bb_start(job->ptrs[0].batch_addr, BIT(8), dw, i); - if (!IS_SRIOV_VF(gt_to_xe(job->q->gt))) { - /* XXX: Do we need this? Leaving for now. */ - dw[i++] = preparser_disable(true); - i = emit_flush_invalidate(dw, i); - dw[i++] = preparser_disable(false); - } + dw[i++] = preparser_disable(true); + i = emit_flush_invalidate(saddr, seqno, dw, i); + dw[i++] = preparser_disable(false); i = emit_bb_start(job->ptrs[1].batch_addr, BIT(8), dw, i); From 1381c231c1267480f721b06528c3a230f43f2ac5 Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Thu, 10 Jul 2025 14:41:29 +0100 Subject: [PATCH 2/7] drm/xe/migrate: fix copy direction in access_memory After we do the modification on the host side, ensure we write the result back to VRAM and not the other way around, otherwise the modification will be lost if treated like a read. Fixes: 270172f64b11 ("drm/xe: Update xe_ttm_access_memory to use GPU for non-visible access") Signed-off-by: Matthew Auld Cc: Matthew Brost Reviewed-by: Matthew Brost Link: https://lore.kernel.org/r/20250710134128.800756-2-matthew.auld@intel.com (cherry picked from commit c12fe703cab93f9d8bfe0ff32b58e7b1fd52be1f) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_migrate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c index 66bc02302c55..73286b815f39 100644 --- a/drivers/gpu/drm/xe/xe_migrate.c +++ b/drivers/gpu/drm/xe/xe_migrate.c @@ -1848,7 +1848,7 @@ int xe_migrate_access_memory(struct xe_migrate *m, struct xe_bo *bo, err = xe_migrate_access_memory(m, bo, offset & ~XE_CACHELINE_MASK, (void *)ptr, - sizeof(bounce), 0); + sizeof(bounce), write); if (err) return err; } else { From 2a58b21adee3df10ca6f4491af965c4890d2d8e3 Mon Sep 17 00:00:00 2001 From: Balasubramani Vivekanandan Date: Tue, 20 May 2025 19:54:45 +0530 Subject: [PATCH 3/7] drm/xe/mocs: Initialize MOCS index early MOCS uc_index is used even before it is initialized in the following callstack guc_prepare_xfer() __xe_guc_upload() xe_guc_min_load_for_hwconfig() xe_uc_init_hwconfig() xe_gt_init_hwconfig() Do MOCS index initialization earlier in the device probe. Signed-off-by: Balasubramani Vivekanandan Reviewed-by: Ravi Kumar Vodapalli Link: https://lore.kernel.org/r/20250520142445.2792824-1-balasubramani.vivekanandan@intel.com Signed-off-by: Matt Roper (cherry picked from commit 241cc827c0987d7173714fc5a95a7c8fc9bf15c0) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_gt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c index 6c4cb9576fb6..9752a38c0162 100644 --- a/drivers/gpu/drm/xe/xe_gt.c +++ b/drivers/gpu/drm/xe/xe_gt.c @@ -417,6 +417,8 @@ int xe_gt_init_early(struct xe_gt *gt) if (err) return err; + xe_mocs_init_early(gt); + return 0; } @@ -634,8 +636,6 @@ int xe_gt_init(struct xe_gt *gt) if (err) return err; - xe_mocs_init_early(gt); - err = xe_gt_sysfs_init(gt); if (err) return err; From 3155ac89251dcb5e35a3ec2f60a74a6ed22c56fd Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Thu, 10 Jul 2025 12:12:08 -0700 Subject: [PATCH 4/7] drm/xe: Move page fault init after topology init We need the topology to determine GT page fault queue size, move page fault init after topology init. Cc: stable@vger.kernel.org Fixes: 3338e4f90c14 ("drm/xe: Use topology to determine page fault queue size") Signed-off-by: Matthew Brost Reviewed-by: Jonathan Cavitt Reviewed-by: Stuart Summers Link: https://lore.kernel.org/r/20250710191208.1040215-1-matthew.brost@intel.com (cherry picked from commit beb72acb5b38dbe670d8eb752d1ad7a32f9c4119) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_gt.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c index 9752a38c0162..d554a8cc565c 100644 --- a/drivers/gpu/drm/xe/xe_gt.c +++ b/drivers/gpu/drm/xe/xe_gt.c @@ -632,10 +632,6 @@ int xe_gt_init(struct xe_gt *gt) if (err) return err; - err = xe_gt_pagefault_init(gt); - if (err) - return err; - err = xe_gt_sysfs_init(gt); if (err) return err; @@ -644,6 +640,10 @@ int xe_gt_init(struct xe_gt *gt) if (err) return err; + err = xe_gt_pagefault_init(gt); + if (err) + return err; + err = xe_gt_idle_init(>->gtidle); if (err) return err; From 057a7d66f98eda9257e46fe44ee5750e0a6eec66 Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Thu, 10 Jul 2025 14:34:41 -0700 Subject: [PATCH 5/7] drm/xe/migrate: Fix alignment check The check would fail if the address is unaligned, but not when accounting the offset. Instead of `buf | offset` it should have been `buf + offset`. To make it more readable and also drop the uintptr_t, just use the IS_ALIGNED() macro. Fixes: 270172f64b11 ("drm/xe: Update xe_ttm_access_memory to use GPU for non-visible access") Reviewed-by: Matthew Brost Reviewed-by: Matthew Auld Link: https://lore.kernel.org/r/20250710-migrate-aligned-v1-1-44003ef3c078@intel.com Signed-off-by: Lucas De Marchi (cherry picked from commit 81e139db6900503a2e68009764054fad128fbf95) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_migrate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c index 73286b815f39..07a5161c7d5b 100644 --- a/drivers/gpu/drm/xe/xe_migrate.c +++ b/drivers/gpu/drm/xe/xe_migrate.c @@ -1817,8 +1817,8 @@ int xe_migrate_access_memory(struct xe_migrate *m, struct xe_bo *bo, xe_bo_assert_held(bo); /* Use bounce buffer for small access and unaligned access */ - if (len & XE_CACHELINE_MASK || - ((uintptr_t)buf | offset) & XE_CACHELINE_MASK) { + if (!IS_ALIGNED(len, XE_CACHELINE_BYTES) || + !IS_ALIGNED((unsigned long)buf + offset, XE_CACHELINE_BYTES)) { int buf_offset = 0; /* From 81dccec448d204e448ae83e1fe60e8aaeaadadb8 Mon Sep 17 00:00:00 2001 From: Michal Wajdeczko Date: Fri, 11 Jul 2025 21:33:11 +0200 Subject: [PATCH 6/7] drm/xe/pf: Prepare to stop SR-IOV support prior GT reset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As part of the resume or GT reset, the PF driver schedules work which is then used to complete restarting of the SR-IOV support, including resending to the GuC configurations of provisioned VFs. However, in case of short delay between those two actions, which could be seen by triggering a GT reset on the suspened device: $ echo 1 > /sys/kernel/debug/dri/0000:00:02.0/gt0/force_reset this PF worker might be still busy, which lead to errors due to just stopped or disabled GuC CTB communication: [ ] xe 0000:00:02.0: [drm:xe_gt_resume [xe]] GT0: resumed [ ] xe 0000:00:02.0: [drm] GT0: trying reset from force_reset_show [xe] [ ] xe 0000:00:02.0: [drm] GT0: reset queued [ ] xe 0000:00:02.0: [drm] GT0: reset started [ ] xe 0000:00:02.0: [drm:guc_ct_change_state [xe]] GT0: GuC CT communication channel stopped [ ] xe 0000:00:02.0: [drm:guc_ct_send_recv [xe]] GT0: H2G request 0x5503 canceled! [ ] xe 0000:00:02.0: [drm] GT0: PF: Failed to push VF1 12 config KLVs (-ECANCELED) [ ] xe 0000:00:02.0: [drm] GT0: PF: Failed to push VF1 configuration (-ECANCELED) [ ] xe 0000:00:02.0: [drm:guc_ct_change_state [xe]] GT0: GuC CT communication channel disabled [ ] xe 0000:00:02.0: [drm] GT0: PF: Failed to push VF2 12 config KLVs (-ENODEV) [ ] xe 0000:00:02.0: [drm] GT0: PF: Failed to push VF2 configuration (-ENODEV) [ ] xe 0000:00:02.0: [drm] GT0: PF: Failed to push 2 of 2 VFs configurations [ ] xe 0000:00:02.0: [drm:pf_worker_restart_func [xe]] GT0: PF: restart completed While this VFs reprovisioning will be successful during next spin of the worker, to avoid those errors, make sure to cancel restart worker if we are about to trigger next reset. Fixes: 411220808cee ("drm/xe/pf: Restart VFs provisioning after GT reset") Signed-off-by: Michal Wajdeczko Reviewed-by: Piotr Piórkowski Link: https://lore.kernel.org/r/20250711193316.1920-2-michal.wajdeczko@intel.com (cherry picked from commit 9f50b729dd61dfb9f4d7c66900d22a7c7353a8c0) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_gt.c | 3 +++ drivers/gpu/drm/xe/xe_gt_sriov_pf.c | 19 +++++++++++++++++++ drivers/gpu/drm/xe/xe_gt_sriov_pf.h | 5 +++++ 3 files changed, 27 insertions(+) diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c index d554a8cc565c..e3517ce2e18c 100644 --- a/drivers/gpu/drm/xe/xe_gt.c +++ b/drivers/gpu/drm/xe/xe_gt.c @@ -839,6 +839,9 @@ static int gt_reset(struct xe_gt *gt) goto err_out; } + if (IS_SRIOV_PF(gt_to_xe(gt))) + xe_gt_sriov_pf_stop_prepare(gt); + xe_uc_gucrc_disable(>->uc); xe_uc_stop_prepare(>->uc); xe_gt_pagefault_reset(gt); diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf.c b/drivers/gpu/drm/xe/xe_gt_sriov_pf.c index c08efca6420e..35489fa81825 100644 --- a/drivers/gpu/drm/xe/xe_gt_sriov_pf.c +++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf.c @@ -172,6 +172,25 @@ void xe_gt_sriov_pf_sanitize_hw(struct xe_gt *gt, unsigned int vfid) pf_clear_vf_scratch_regs(gt, vfid); } +static void pf_cancel_restart(struct xe_gt *gt) +{ + xe_gt_assert(gt, IS_SRIOV_PF(gt_to_xe(gt))); + + if (cancel_work_sync(>->sriov.pf.workers.restart)) + xe_gt_sriov_dbg_verbose(gt, "pending restart canceled!\n"); +} + +/** + * xe_gt_sriov_pf_stop_prepare() - Prepare to stop SR-IOV support. + * @gt: the &xe_gt + * + * This function can only be called on the PF. + */ +void xe_gt_sriov_pf_stop_prepare(struct xe_gt *gt) +{ + pf_cancel_restart(gt); +} + static void pf_restart(struct xe_gt *gt) { struct xe_device *xe = gt_to_xe(gt); diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf.h b/drivers/gpu/drm/xe/xe_gt_sriov_pf.h index f474509411c0..e2b2ff8132dc 100644 --- a/drivers/gpu/drm/xe/xe_gt_sriov_pf.h +++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf.h @@ -13,6 +13,7 @@ int xe_gt_sriov_pf_init_early(struct xe_gt *gt); int xe_gt_sriov_pf_init(struct xe_gt *gt); void xe_gt_sriov_pf_init_hw(struct xe_gt *gt); void xe_gt_sriov_pf_sanitize_hw(struct xe_gt *gt, unsigned int vfid); +void xe_gt_sriov_pf_stop_prepare(struct xe_gt *gt); void xe_gt_sriov_pf_restart(struct xe_gt *gt); #else static inline int xe_gt_sriov_pf_init_early(struct xe_gt *gt) @@ -29,6 +30,10 @@ static inline void xe_gt_sriov_pf_init_hw(struct xe_gt *gt) { } +static inline void xe_gt_sriov_pf_stop_prepare(struct xe_gt *gt) +{ +} + static inline void xe_gt_sriov_pf_restart(struct xe_gt *gt) { } From 5c244eeca57ff4e47e1f60310d059346d1b86b9b Mon Sep 17 00:00:00 2001 From: Michal Wajdeczko Date: Fri, 11 Jul 2025 21:33:12 +0200 Subject: [PATCH 7/7] drm/xe/pf: Resend PF provisioning after GT reset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If we reload the GuC due to suspend/resume or GT reset then we have to resend not only any VFs provisioning data, but also PF configuration, like scheduling parameters (EQ, PT), as otherwise GuC will continue to use default values. Fixes: 411220808cee ("drm/xe/pf: Restart VFs provisioning after GT reset") Signed-off-by: Michal Wajdeczko Reviewed-by: Piotr Piórkowski Link: https://lore.kernel.org/r/20250711193316.1920-3-michal.wajdeczko@intel.com (cherry picked from commit 1c38dd6afa4a8ecce28e94da794fd1d205c30f51) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c | 27 ++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c b/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c index 2420a548cacc..53a44702c04a 100644 --- a/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c +++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c @@ -2364,6 +2364,21 @@ int xe_gt_sriov_pf_config_restore(struct xe_gt *gt, unsigned int vfid, return err; } +static int pf_push_self_config(struct xe_gt *gt) +{ + int err; + + err = pf_push_full_vf_config(gt, PFID); + if (err) { + xe_gt_sriov_err(gt, "Failed to push self configuration (%pe)\n", + ERR_PTR(err)); + return err; + } + + xe_gt_sriov_dbg_verbose(gt, "self configuration completed\n"); + return 0; +} + static void fini_config(void *arg) { struct xe_gt *gt = arg; @@ -2387,9 +2402,17 @@ static void fini_config(void *arg) int xe_gt_sriov_pf_config_init(struct xe_gt *gt) { struct xe_device *xe = gt_to_xe(gt); + int err; xe_gt_assert(gt, IS_SRIOV_PF(xe)); + mutex_lock(xe_gt_sriov_pf_master_mutex(gt)); + err = pf_push_self_config(gt); + mutex_unlock(xe_gt_sriov_pf_master_mutex(gt)); + + if (err) + return err; + return devm_add_action_or_reset(xe->drm.dev, fini_config, gt); } @@ -2407,6 +2430,10 @@ void xe_gt_sriov_pf_config_restart(struct xe_gt *gt) unsigned int n, total_vfs = xe_sriov_pf_get_totalvfs(gt_to_xe(gt)); unsigned int fail = 0, skip = 0; + mutex_lock(xe_gt_sriov_pf_master_mutex(gt)); + pf_push_self_config(gt); + mutex_unlock(xe_gt_sriov_pf_master_mutex(gt)); + for (n = 1; n <= total_vfs; n++) { if (xe_gt_sriov_pf_config_is_empty(gt, n)) skip++;