From 10c7988418d8f759ba70c4a558961e0bfa74647f Mon Sep 17 00:00:00 2001 From: Tejas Upadhyay Date: Thu, 6 Mar 2025 18:42:11 +0530 Subject: [PATCH 1/5] drm/xe: Release guc ids before cancelling work A GT reset can occur in parallel while work is being cancelled in the async call, and that reset can requeue these workers. To avoid that, let's first release the guc ids and then cancel the work so the workers don't get requeued. Fixes: 8ae8a2e8dd21 ("drm/xe: Long running job update") Fixes: 12c2f962fe71 ("drm/xe: cancel pending job timer before freeing scheduler") Signed-off-by: Tejas Upadhyay Suggested-by: Matthew Brost Reviewed-by: Matthew Brost Link: https://patchwork.freedesktop.org/patch/msgid/20250306131211.975503-1-tejas.upadhyay@intel.com Signed-off-by: Lucas De Marchi (cherry picked from commit 8e8d76f62329127b31c64a034b052fb9e30e92af) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_guc_submit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c index b6a2dd742ebd..1a5fe4822a62 100644 --- a/drivers/gpu/drm/xe/xe_guc_submit.c +++ b/drivers/gpu/drm/xe/xe_guc_submit.c @@ -1246,11 +1246,11 @@ static void __guc_exec_queue_fini_async(struct work_struct *w) xe_pm_runtime_get(guc_to_xe(guc)); trace_xe_exec_queue_destroy(q); + release_guc_id(guc, q); if (xe_exec_queue_is_lr(q)) cancel_work_sync(&ge->lr_tdr); /* Confirm no work left behind accessing device structures */ cancel_delayed_work_sync(&ge->sched.base.work_tdr); - release_guc_id(guc, q); xe_sched_entity_fini(&ge->entity); xe_sched_fini(&ge->sched); From 9106713bd2ab0cacd380cda0d3f0219f2e488086 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Hellstr=C3=B6m?= Date: Fri, 7 Mar 2025 11:01:09 +0100 Subject: [PATCH 2/5] drm/xe/userptr: Fix an incorrect assert MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The assert incorrectly checks the total length processed which can in fact be greater than the number of pages.
Fix. Fixes: 0a98219bcc96 ("drm/xe/hmm: Don't dereference struct page pointers without notifier lock") Cc: Matthew Auld Cc: Matthew Brost Signed-off-by: Thomas Hellström Reviewed-by: Matthew Auld Link: https://patchwork.freedesktop.org/patch/msgid/20250307100109.21397-1-thomas.hellstrom@linux.intel.com (cherry picked from commit 70e5043ba85eae199b232e39921abd706b5c1fa4) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_hmm.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_hmm.c b/drivers/gpu/drm/xe/xe_hmm.c index 392102515f3d..c3cc0fa105e8 100644 --- a/drivers/gpu/drm/xe/xe_hmm.c +++ b/drivers/gpu/drm/xe/xe_hmm.c @@ -138,13 +138,17 @@ static int xe_build_sg(struct xe_device *xe, struct hmm_range *range, i += size; if (unlikely(j == st->nents - 1)) { + xe_assert(xe, i >= npages); if (i > npages) size -= (i - npages); + sg_mark_end(sgl); + } else { + xe_assert(xe, i < npages); } + sg_set_page(sgl, page, size << PAGE_SHIFT, 0); } - xe_assert(xe, i == npages); return dma_map_sgtable(dev, st, write ? DMA_BIDIRECTIONAL : DMA_TO_DEVICE, DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_NO_KERNEL_MAPPING); From 3e331a6715ee26f2fabc59dad6bb36d810707028 Mon Sep 17 00:00:00 2001 From: Rodrigo Vivi Date: Fri, 7 Mar 2025 19:56:35 -0500 Subject: [PATCH 3/5] drm/xe/pm: Temporarily disable D3Cold on BMG Currently, many instability cases related to D3Cold -> D0 transition on BMG are under investigation. Among them some bad cases where the device is lost after 1 to 3 transitions from D3Cold to D0 on the runtime pm, with pcieport upstream bridge port link retrain failure. In other cases, it works fine, but with some sudden random memory corruptions after D3cold, that could be 0xffff missed ack on GT forcewake or GuC reload related failures. In some other cases though, D3Cold -> D0 works pretty reliably. It looks like it is a combination of GPU cards and Host boards at this point. So, there is no possible/available quirk at this time. 
This patch disables D3Cold by default on BMG by reducing the vram_d3cold_threshold to 0. Users and developers who want to enable it are still able to do so via $ echo 300 > /sys/bus/pci/devices//vram_d3cold_threshold Fixes: 3adcf970dc7e ("drm/xe/bmg: Drop force_probe requirement") Link: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/4037 Link: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/4395 Link: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/4396 Cc: Karthik Poosa Reviewed-by: Lucas De Marchi Link: https://patchwork.freedesktop.org/patch/msgid/20250308005636.1475420-1-rodrigo.vivi@intel.com Signed-off-by: Rodrigo Vivi (cherry picked from commit d945cc876277851053c0cf37927c8d7bd9d0e880) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_pm.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_pm.c b/drivers/gpu/drm/xe/xe_pm.c index c9cc0c091dfd..89fd2c043136 100644 --- a/drivers/gpu/drm/xe/xe_pm.c +++ b/drivers/gpu/drm/xe/xe_pm.c @@ -267,6 +267,15 @@ int xe_pm_init_early(struct xe_device *xe) } ALLOW_ERROR_INJECTION(xe_pm_init_early, ERRNO); /* See xe_pci_probe() */ +static u32 vram_threshold_value(struct xe_device *xe) +{ + /* FIXME: D3Cold temporarily disabled by default on BMG */ + if (xe->info.platform == XE_BATTLEMAGE) + return 0; + + return DEFAULT_VRAM_THRESHOLD; +} + /** * xe_pm_init - Initialize Xe Power Management * @xe: xe device instance @@ -277,6 +286,7 @@ ALLOW_ERROR_INJECTION(xe_pm_init_early, ERRNO); /* See xe_pci_probe() */ */ int xe_pm_init(struct xe_device *xe) { + u32 vram_threshold; int err; /* For now suspend/resume is only allowed with GuC */ @@ -290,7 +300,8 @@ int xe_pm_init(struct xe_device *xe) if (err) return err; - err = xe_pm_set_vram_threshold(xe, DEFAULT_VRAM_THRESHOLD); + vram_threshold = vram_threshold_value(xe); + err = xe_pm_set_vram_threshold(xe, vram_threshold); if (err) return err; } From c605acb53f449f6289f042790307d7dc9e62d03d Mon Sep 17 00:00:00
2001 From: Rodrigo Vivi Date: Fri, 7 Mar 2025 11:03:07 -0500 Subject: [PATCH 4/5] drm/xe/guc_pc: Retry and wait longer for GuC PC start In a rare situation of thermal limit during resume, GuC can be slow and run into delays like this: xe 0000:00:02.0: [drm] GT1: excessive init time: 667ms! \ [status = 0x8002F034, timeouts = 0] xe 0000:00:02.0: [drm] GT1: excessive init time: \ [freq = 100MHz (req = 800MHz), before = 100MHz, \ perf_limit_reasons = 0x1C001000] xe 0000:00:02.0: [drm] *ERROR* GT1: GuC PC Start failed ------------[ cut here ]------------ xe 0000:00:02.0: [drm] GT1: Failed to start GuC PC: -EIO When this happens, the GPU is blocked entirely from being used. So, let's retry with a much longer timeout in the hope that it comes back. Also, let's collect some information on how long it usually takes in situations like this, so that the timeout can perhaps be tuned later. Cc: Vinay Belgaumkar Cc: Jonathan Cavitt Cc: John Harrison Reviewed-by: Jonathan Cavitt Link: https://patchwork.freedesktop.org/patch/msgid/20250307160307.1093391-1-rodrigo.vivi@intel.com Signed-off-by: Rodrigo Vivi (cherry picked from commit b4b05e53b550a886b4754b87fd0dd2b304579e85) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_guc_pc.c | 53 +++++++++++++++++++++++++--------- 1 file changed, 40 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_guc_pc.c b/drivers/gpu/drm/xe/xe_guc_pc.c index df7f130fb663..b995d1d51aed 100644 --- a/drivers/gpu/drm/xe/xe_guc_pc.c +++ b/drivers/gpu/drm/xe/xe_guc_pc.c @@ -6,6 +6,7 @@ #include "xe_guc_pc.h" #include +#include #include #include @@ -19,6 +20,7 @@ #include "xe_gt.h" #include "xe_gt_idle.h" #include "xe_gt_printk.h" +#include "xe_gt_throttle.h" #include "xe_gt_types.h" #include "xe_guc.h" #include "xe_guc_ct.h" @@ -49,6 +51,9 @@ #define LNL_MERT_FREQ_CAP 800 #define BMG_MERT_FREQ_CAP 2133 +#define SLPC_RESET_TIMEOUT_MS 5 /* roughly 5ms, but no need for precision */ +#define SLPC_RESET_EXTENDED_TIMEOUT_MS 1000 /* To be used only at pc_start */
+ /** * DOC: GuC Power Conservation (PC) * @@ -113,9 +118,10 @@ static struct iosys_map *pc_to_maps(struct xe_guc_pc *pc) FIELD_PREP(HOST2GUC_PC_SLPC_REQUEST_MSG_1_EVENT_ARGC, count)) static int wait_for_pc_state(struct xe_guc_pc *pc, - enum slpc_global_state state) + enum slpc_global_state state, + int timeout_ms) { - int timeout_us = 5000; /* rought 5ms, but no need for precision */ + int timeout_us = 1000 * timeout_ms; int slept, wait = 10; xe_device_assert_mem_access(pc_to_xe(pc)); @@ -164,7 +170,8 @@ static int pc_action_query_task_state(struct xe_guc_pc *pc) }; int ret; - if (wait_for_pc_state(pc, SLPC_GLOBAL_STATE_RUNNING)) + if (wait_for_pc_state(pc, SLPC_GLOBAL_STATE_RUNNING, + SLPC_RESET_TIMEOUT_MS)) return -EAGAIN; /* Blocking here to ensure the results are ready before reading them */ @@ -187,7 +194,8 @@ static int pc_action_set_param(struct xe_guc_pc *pc, u8 id, u32 value) }; int ret; - if (wait_for_pc_state(pc, SLPC_GLOBAL_STATE_RUNNING)) + if (wait_for_pc_state(pc, SLPC_GLOBAL_STATE_RUNNING, + SLPC_RESET_TIMEOUT_MS)) return -EAGAIN; ret = xe_guc_ct_send(ct, action, ARRAY_SIZE(action), 0, 0); @@ -208,7 +216,8 @@ static int pc_action_unset_param(struct xe_guc_pc *pc, u8 id) struct xe_guc_ct *ct = &pc_to_guc(pc)->ct; int ret; - if (wait_for_pc_state(pc, SLPC_GLOBAL_STATE_RUNNING)) + if (wait_for_pc_state(pc, SLPC_GLOBAL_STATE_RUNNING, + SLPC_RESET_TIMEOUT_MS)) return -EAGAIN; ret = xe_guc_ct_send(ct, action, ARRAY_SIZE(action), 0, 0); @@ -440,6 +449,15 @@ u32 xe_guc_pc_get_act_freq(struct xe_guc_pc *pc) return freq; } +static u32 get_cur_freq(struct xe_gt *gt) +{ + u32 freq; + + freq = xe_mmio_read32(&gt->mmio, RPNSWREQ); + freq = REG_FIELD_GET(REQ_RATIO_MASK, freq); + return decode_freq(freq); +} + /** * xe_guc_pc_get_cur_freq - Get Current requested frequency * @pc: The GuC PC @@ -463,10 +481,7 @@ int xe_guc_pc_get_cur_freq(struct xe_guc_pc *pc, u32 *freq) return -ETIMEDOUT; } - *freq = xe_mmio_read32(&gt->mmio, RPNSWREQ); - - *freq =
REG_FIELD_GET(REQ_RATIO_MASK, *freq); - *freq = decode_freq(*freq); + *freq = get_cur_freq(gt); xe_force_wake_put(gt_to_fw(gt), fw_ref); return 0; @@ -1002,6 +1017,7 @@ int xe_guc_pc_start(struct xe_guc_pc *pc) struct xe_gt *gt = pc_to_gt(pc); u32 size = PAGE_ALIGN(sizeof(struct slpc_shared_data)); unsigned int fw_ref; + ktime_t earlier; int ret; xe_gt_assert(gt, xe_device_uc_enabled(xe)); @@ -1026,14 +1042,25 @@ int xe_guc_pc_start(struct xe_guc_pc *pc) memset(pc->bo->vmap.vaddr, 0, size); slpc_shared_data_write(pc, header.size, size); + earlier = ktime_get(); ret = pc_action_reset(pc); if (ret) goto out; - if (wait_for_pc_state(pc, SLPC_GLOBAL_STATE_RUNNING)) { - xe_gt_err(gt, "GuC PC Start failed\n"); - ret = -EIO; - goto out; + if (wait_for_pc_state(pc, SLPC_GLOBAL_STATE_RUNNING, + SLPC_RESET_TIMEOUT_MS)) { + xe_gt_warn(gt, "GuC PC start taking longer than normal [freq = %dMHz (req = %dMHz), perf_limit_reasons = 0x%08X]\n", + xe_guc_pc_get_act_freq(pc), get_cur_freq(gt), + xe_gt_throttle_get_limit_reasons(gt)); + + if (wait_for_pc_state(pc, SLPC_GLOBAL_STATE_RUNNING, + SLPC_RESET_EXTENDED_TIMEOUT_MS)) { + xe_gt_err(gt, "GuC PC Start failed: Dynamic GT frequency control and GT sleep states are now disabled.\n"); + goto out; + } + + xe_gt_warn(gt, "GuC PC excessive start time: %lldms", + ktime_ms_delta(ktime_get(), earlier)); } ret = pc_init_freqs(pc); From f5d4e81774c42d9c2ea3980e570f3330ff2ed5d2 Mon Sep 17 00:00:00 2001 From: Xin Wang Date: Mon, 3 Mar 2025 08:49:41 +0800 Subject: [PATCH 5/5] drm/xe: remove redundant check in xe_vm_create_ioctl() The check for args->extensions is repeated twice in xe_vm_create_ioctl(). This commit removes the redundant check to streamline the code. 
Fixes: 7224788f6756 ("drm/xe: Kill XE_VM_PROPERTY_BIND_OP_ERROR_CAPTURE_ADDRESS extension") Cc: Rodrigo Vivi Signed-off-by: Xin Wang Reviewed-by: Tejas Upadhyay Reviewed-by: Matthew Auld Link: https://patchwork.freedesktop.org/patch/msgid/20250303004942.951699-1-x.wang@intel.com Signed-off-by: Rodrigo Vivi (cherry picked from commit 8da8aecf1f2d89c2b8188bcf7aa252ec146ddd12) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_vm.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c index ec6ec18ab3fa..5956631c0d40 100644 --- a/drivers/gpu/drm/xe/xe_vm.c +++ b/drivers/gpu/drm/xe/xe_vm.c @@ -1809,9 +1809,6 @@ int xe_vm_create_ioctl(struct drm_device *dev, void *data, args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE)) return -EINVAL; - if (XE_IOCTL_DBG(xe, args->extensions)) - return -EINVAL; - if (args->flags & DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE) flags |= XE_VM_FLAG_SCRATCH_PAGE; if (args->flags & DRM_XE_VM_CREATE_FLAG_LR_MODE)