From c677f31c857632ca678a4a145b74855bacb72d17 Mon Sep 17 00:00:00 2001 From: Andi Shyti Date: Fri, 7 Jun 2024 16:51:31 +0200 Subject: [PATCH 1/8] drm/i915/gt: debugfs: Evaluate forcewake usage within locks The forcewake count and domains listing is multi process critical and the uncore provides a spinlock for such cases. Lock the forcewake evaluation section in the fw_domains_show() debugfs interface. Signed-off-by: Andi Shyti Reviewed-by: Rodrigo Vivi Reviewed-by: Nirmoy Das Link: https://patchwork.freedesktop.org/patch/msgid/20240607145131.217251-1-andi.shyti@linux.intel.com --- drivers/gpu/drm/i915/gt/intel_gt_pm_debugfs.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm_debugfs.c b/drivers/gpu/drm/i915/gt/intel_gt_pm_debugfs.c index 4fcba42cfe34..0437fd8217e0 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt_pm_debugfs.c +++ b/drivers/gpu/drm/i915/gt/intel_gt_pm_debugfs.c @@ -71,6 +71,8 @@ static int fw_domains_show(struct seq_file *m, void *data) struct intel_uncore_forcewake_domain *fw_domain; unsigned int tmp; + spin_lock_irq(&uncore->lock); + seq_printf(m, "user.bypass_count = %u\n", uncore->user_forcewake_count); @@ -79,6 +81,8 @@ static int fw_domains_show(struct seq_file *m, void *data) intel_uncore_forcewake_domain_to_str(fw_domain->id), READ_ONCE(fw_domain->wake_count)); + spin_unlock_irq(&uncore->lock); + return 0; } DEFINE_INTEL_GT_DEBUGFS_ATTRIBUTE(fw_domains); From 05da7d9f717bcb03c457379fa8a61c1689dab86c Mon Sep 17 00:00:00 2001 From: Jonathan Cavitt Date: Mon, 22 Apr 2024 06:59:59 -0700 Subject: [PATCH 2/8] drm/i915/gem: Downgrade stolen lmem setup warning In the case where lmem_size < dsm_base, hardware is reporting that stolen lmem is unusable. In this case, instead of throwing a warning, we can continue execution as normal by disabling stolen LMEM support. For example, this change will allow the following error report from ATS-M to no longer apply: <6> [144.859887] pcieport 0000:4b:00.0: bridge window [mem 0xb1000000-0xb11fffff] <6> [144.859900] pcieport 0000:4b:00.0: bridge window [mem 0x3bbc00000000-0x3bbc17ffffff 64bit pref] <6> [144.859917] pcieport 0000:4c:01.0: PCI bridge to [bus 4d-4e] <6> [144.859932] pcieport 0000:4c:01.0: bridge window [mem 0xb1000000-0xb11fffff] <6> [144.859945] pcieport 0000:4c:01.0: bridge window [mem 0x3bbc00000000-0x3bbc17ffffff 64bit pref] <6> [144.859984] i915 0000:4d:00.0: [drm] BAR2 resized to 256M <6> [144.860640] i915 0000:4d:00.0: [drm] Using a reduced BAR size of 256MiB. Consider enabling 'Resizable BAR' or similar, if available in the BIOS. <4> [144.860719] -----------[ cut here ]----------- <4> [144.860727] WARNING: CPU: 17 PID: 1815 at drivers/gpu/drm/i915/gem/i915_gem_stolen.c:939 i915_gem_stolen_lmem_setup+0x38c/0x430 [i915] <4> [144.861430] Modules linked in: i915 snd_intel_dspcfg snd_hda_codec snd_hwdep snd_hda_core snd_pcm vgem drm_shmem_helper prime_numbers i2c_algo_bit ttm video drm_display_helper drm_buddy fuse x86_pkg_temp_thermal coretemp kvm_intel kvm ixgbe mdio irqbypass ptp crct10dif_pclmul crc32_pclmul ghash_clmulni_intel pps_core i2c_i801 mei_me i2c_smbus mei wmi acpi_power_meter [last unloaded: i915] <4> [144.861611] CPU: 17 PID: 1815 Comm: i915_module_loa Tainted: G U 6.8.0-rc5-drmtip_1515-g78f49af27723+ #1 <4> [144.861624] Hardware name: Intel Corporation WHITLEY/WHITLEY, BIOS SE5C6200.86B.0020.P41.2109300305 09/30/2021 <4> [144.861632] RIP: 0010:i915_gem_stolen_lmem_setup+0x38c/0x430 [i915] <4> [144.862287] Code: ff 41 c1 e4 05 e9 ac fe ff ff 4d 63 e4 48 89 ef 48 85 ed 74 04 48 8b 7d 08 48 c7 c6 10 a3 7b a0 e8 e9 90 43 e1 e9 ee fd ff ff <0f> 0b 49 c7 c4 ed ff ff ff e9 e0 fd ff ff 0f b7 d2 48 c7 c6 00 d9 <4> [144.862299] RSP: 0018:ffffc90005607980 EFLAGS: 00010207 <4> [144.862315] RAX: fffffffffff00000 RBX: 0000000000000003 RCX: 0000000000000000 Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/10833 Suggested-by: Chris Wilson Signed-off-by: Jonathan Cavitt Reviewed-by: Andi Shyti Signed-off-by: Andi Shyti Link: https://patchwork.freedesktop.org/patch/msgid/20240422135959.4127003-1-jonathan.cavitt@intel.com --- drivers/gpu/drm/i915/gem/i915_gem_stolen.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_stolen.c b/drivers/gpu/drm/i915/gem/i915_gem_stolen.c index ad6dd7f3259b..4d60a5b37505 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_stolen.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_stolen.c @@ -936,8 +936,12 @@ i915_gem_stolen_lmem_setup(struct drm_i915_private *i915, u16 type, } else { /* Use DSM base address instead for stolen memory */ dsm_base = intel_uncore_read64(uncore, GEN6_DSMBASE) & GEN11_BDSM_MASK; - if (WARN_ON(lmem_size < dsm_base)) - return ERR_PTR(-ENODEV); + if (lmem_size < dsm_base) { + drm_dbg(&i915->drm, + "Disabling stolen memory support due to OOB placement: lmem_size = %lli vs dsm_base = %lli\n", + lmem_size, dsm_base); + return 0; + } dsm_size = ALIGN_DOWN(lmem_size - dsm_base, SZ_1M); } From 45ebbbbeaa33da4a6dbc532ebc57d20de4b60a82 Mon Sep 17 00:00:00 2001 From: Andi Shyti Date: Fri, 14 Jun 2024 00:28:37 +0200 Subject: [PATCH 3/8] drm/i915/gt/uc: Fix typo in comment Replace "dynmically" with "dynamically". Signed-off-by: Andi Shyti Cc: John Harrison Reviewed-by: Jonathan Cavitt Link: https://patchwork.freedesktop.org/patch/msgid/20240613222837.552277-1-andi.shyti@linux.intel.com --- drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h index 14797e80bc92..263c9c3f6a03 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h @@ -295,7 +295,7 @@ struct guc_update_scheduling_policy_header { } __packed; /* - * Can't dynmically allocate memory for the scheduling policy KLV because + * Can't dynamically allocate memory for the scheduling policy KLV because * it will be sent from within the reset path. Need a fixed size lump on * the stack instead :(. * From ae45f07cade1a5853ff6fd745bbd86a64cc82643 Mon Sep 17 00:00:00 2001 From: Andi Shyti Date: Fri, 14 Jun 2024 00:24:02 +0200 Subject: [PATCH 4/8] drm/i915/gt/uc: Evaluate GuC priority within locks The ce->guc_state.lock was made to protect guc_prio, which indicates the GuC priority level. But at the begnning of the function we perform some sanity check of guc_prio outside its protected section. Move them within the locked region. Use this occasion to expand the if statement to make it clearer. Signed-off-by: Andi Shyti Cc: Matthew Brost Reviewed-by: Matthew Brost Link: https://patchwork.freedesktop.org/patch/msgid/20240613222402.551625-2-andi.shyti@linux.intel.com --- .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 27 +++++++++++-------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c index 0eaa1064242c..9400d0eb682b 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c @@ -4267,20 +4267,25 @@ static void guc_bump_inflight_request_prio(struct i915_request *rq, u8 new_guc_prio = map_i915_prio_to_guc_prio(prio); /* Short circuit function */ - if (prio < I915_PRIORITY_NORMAL || - rq->guc_prio == GUC_PRIO_FINI || - (rq->guc_prio != GUC_PRIO_INIT && - !new_guc_prio_higher(rq->guc_prio, new_guc_prio))) + if (prio < I915_PRIORITY_NORMAL) return; spin_lock(&ce->guc_state.lock); - if (rq->guc_prio != GUC_PRIO_FINI) { - if (rq->guc_prio != GUC_PRIO_INIT) - sub_context_inflight_prio(ce, rq->guc_prio); - rq->guc_prio = new_guc_prio; - add_context_inflight_prio(ce, rq->guc_prio); - update_context_prio(ce); - } + + if (rq->guc_prio == GUC_PRIO_FINI) + goto exit; + + if (!new_guc_prio_higher(rq->guc_prio, new_guc_prio)) + goto exit; + + if (rq->guc_prio != GUC_PRIO_INIT) + sub_context_inflight_prio(ce, rq->guc_prio); + + rq->guc_prio = new_guc_prio; + add_context_inflight_prio(ce, rq->guc_prio); + update_context_prio(ce); + +exit: spin_unlock(&ce->guc_state.lock); } From 24bb052d3dd499c5956abad5f7d8e4fd07da7fb1 Mon Sep 17 00:00:00 2001 From: Janusz Krzysztofik Date: Mon, 3 Jun 2024 21:54:45 +0200 Subject: [PATCH 5/8] drm/i915/gt: Fix potential UAF by revoke of fence registers CI has been sporadically reporting the following issue triggered by igt@i915_selftest@live@hangcheck on ADL-P and similar machines: <6> [414.049203] i915: Running intel_hangcheck_live_selftests/igt_reset_evict_fence ... <6> [414.068804] i915 0000:00:02.0: [drm] GT0: GUC: submission enabled <6> [414.068812] i915 0000:00:02.0: [drm] GT0: GUC: SLPC enabled <3> [414.070354] Unable to pin Y-tiled fence; err:-4 <3> [414.071282] i915_vma_revoke_fence:301 GEM_BUG_ON(!i915_active_is_idle(&fence->active)) ... <4>[ 609.603992] ------------[ cut here ]------------ <2>[ 609.603995] kernel BUG at drivers/gpu/drm/i915/gt/intel_ggtt_fencing.c:301! <4>[ 609.604003] invalid opcode: 0000 [#1] PREEMPT SMP NOPTI <4>[ 609.604006] CPU: 0 PID: 268 Comm: kworker/u64:3 Tainted: G U W 6.9.0-CI_DRM_14785-g1ba62f8cea9c+ #1 <4>[ 609.604008] Hardware name: Intel Corporation Alder Lake Client Platform/AlderLake-P DDR4 RVP, BIOS RPLPFWI1.R00.4035.A00.2301200723 01/20/2023 <4>[ 609.604010] Workqueue: i915 __i915_gem_free_work [i915] <4>[ 609.604149] RIP: 0010:i915_vma_revoke_fence+0x187/0x1f0 [i915] ... <4>[ 609.604271] Call Trace: <4>[ 609.604273] ... <4>[ 609.604716] __i915_vma_evict+0x2e9/0x550 [i915] <4>[ 609.604852] __i915_vma_unbind+0x7c/0x160 [i915] <4>[ 609.604977] force_unbind+0x24/0xa0 [i915] <4>[ 609.605098] i915_vma_destroy+0x2f/0xa0 [i915] <4>[ 609.605210] __i915_gem_object_pages_fini+0x51/0x2f0 [i915] <4>[ 609.605330] __i915_gem_free_objects.isra.0+0x6a/0xc0 [i915] <4>[ 609.605440] process_scheduled_works+0x351/0x690 ... In the past, there were similar failures reported by CI from other IGT tests, observed on other platforms. Before commit 63baf4f3d587 ("drm/i915/gt: Only wait for GPU activity before unbinding a GGTT fence"), i915_vma_revoke_fence() was waiting for idleness of vma->active via fence_update(). That commit introduced vma->fence->active in order for the fence_update() to be able to wait selectively on that one instead of vma->active since only idleness of fence registers was needed. But then, another commit 0d86ee35097a ("drm/i915/gt: Make fence revocation unequivocal") replaced the call to fence_update() in i915_vma_revoke_fence() with only fence_write(), and also added that GEM_BUG_ON(!i915_active_is_idle(&fence->active)) in front. No justification was provided on why we might then expect idleness of vma->fence->active without first waiting on it. The issue can be potentially caused by a race among revocation of fence registers on one side and sequential execution of signal callbacks invoked on completion of a request that was using them on the other, still processed in parallel to revocation of those fence registers. Fix it by waiting for idleness of vma->fence->active in i915_vma_revoke_fence(). Fixes: 0d86ee35097a ("drm/i915/gt: Make fence revocation unequivocal") Closes: https://gitlab.freedesktop.org/drm/intel/issues/10021 Signed-off-by: Janusz Krzysztofik Cc: stable@vger.kernel.org # v5.8+ Reviewed-by: Andi Shyti Signed-off-by: Andi Shyti Link: https://patchwork.freedesktop.org/patch/msgid/20240603195446.297690-2-janusz.krzysztofik@linux.intel.com --- drivers/gpu/drm/i915/gt/intel_ggtt_fencing.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/i915/gt/intel_ggtt_fencing.c b/drivers/gpu/drm/i915/gt/intel_ggtt_fencing.c index 40371b8a9bbb..93bc1cc1ee7e 100644 --- a/drivers/gpu/drm/i915/gt/intel_ggtt_fencing.c +++ b/drivers/gpu/drm/i915/gt/intel_ggtt_fencing.c @@ -298,6 +298,7 @@ void i915_vma_revoke_fence(struct i915_vma *vma) return; GEM_BUG_ON(fence->vma != vma); + i915_active_wait(&fence->active); GEM_BUG_ON(!i915_active_is_idle(&fence->active)); GEM_BUG_ON(atomic_read(&fence->pin_count)); From 3bece98b9eb6941b4708237a4557a5082df25589 Mon Sep 17 00:00:00 2001 From: Andi Shyti Date: Mon, 17 Jun 2024 20:42:42 +0200 Subject: [PATCH 6/8] drm/i915/gem: Return NULL instead of '0' Commit 05da7d9f717b ("drm/i915/gem: Downgrade stolen lmem setup warning") returns '0' from i915_gem_stolen_lmem_setup(), but it's supposed to return a pointer to the intel_memory_region structure. Sparse complains with the following message: >> drivers/gpu/drm/i915/gem/i915_gem_stolen.c:943:32: sparse: sparse: Using plain integer as NULL pointer Return NULL. Signed-off-by: Andi Shyti Cc: Jonathan Cavitt Reviewed-by: Jonathan Cavitt Link: https://patchwork.freedesktop.org/patch/msgid/20240617184243.330231-2-andi.shyti@linux.intel.com --- drivers/gpu/drm/i915/gem/i915_gem_stolen.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_stolen.c b/drivers/gpu/drm/i915/gem/i915_gem_stolen.c index 4d60a5b37505..2f22306e9c0a 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_stolen.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_stolen.c @@ -940,7 +940,7 @@ i915_gem_stolen_lmem_setup(struct drm_i915_private *i915, u16 type, drm_dbg(&i915->drm, "Disabling stolen memory support due to OOB placement: lmem_size = %lli vs dsm_base = %lli\n", lmem_size, dsm_base); - return 0; + return NULL; } dsm_size = ALIGN_DOWN(lmem_size - dsm_base, SZ_1M); } From e22103b9b6026cc0a7846dc6369f0399b863039f Mon Sep 17 00:00:00 2001 From: Andi Shyti Date: Mon, 17 Jun 2024 20:42:43 +0200 Subject: [PATCH 7/8] drm/i915/gem: Use the correct format specifier for resource_size_t Commit 05da7d9f717b ("drm/i915/gem: Downgrade stolen lmem setup warning") adds a debug message where the "lmem_size" and "dsm_base" variables are printed using the %lli identifier. However, these variables are defined as resource_size_t, which are unsigned long for 32-bit machines and unsigned long long for 64-bit machines. The documentation (core-api/printk-formats.rst) recommends using the %pa specifier for printing addresses and sizes of resources. Replace %lli with %pa. This patch also mutes the following sparse warning when compiling with: make W=1 ARCH=i386 drivers/gpu/drm/i915 >> drivers/gpu/drm/i915/gem/i915_gem_stolen.c:941:5: error: format '%lli' expects argument of type 'long long int', but argument 5 has type 'resource_size_t' {aka 'unsigned int'} [-Werror=format=] Signed-off-by: Andi Shyti Cc: Jonathan Cavitt Reviewed-by: Jonathan Cavitt Link: https://patchwork.freedesktop.org/patch/msgid/20240617184243.330231-3-andi.shyti@linux.intel.com --- drivers/gpu/drm/i915/gem/i915_gem_stolen.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_stolen.c b/drivers/gpu/drm/i915/gem/i915_gem_stolen.c index 2f22306e9c0a..903f1e466cf8 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_stolen.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_stolen.c @@ -938,8 +938,8 @@ i915_gem_stolen_lmem_setup(struct drm_i915_private *i915, u16 type, dsm_base = intel_uncore_read64(uncore, GEN6_DSMBASE) & GEN11_BDSM_MASK; if (lmem_size < dsm_base) { drm_dbg(&i915->drm, - "Disabling stolen memory support due to OOB placement: lmem_size = %lli vs dsm_base = %lli\n", - lmem_size, dsm_base); + "Disabling stolen memory support due to OOB placement: lmem_size = %pa vs dsm_base = %pa\n", + &lmem_size, &dsm_base); return NULL; } dsm_size = ALIGN_DOWN(lmem_size - dsm_base, SZ_1M); From 3b85152cb167bd24fe84ceb91b719b5904ca354f Mon Sep 17 00:00:00 2001 From: Nirmoy Das Date: Wed, 26 Jun 2024 16:33:18 +0200 Subject: [PATCH 8/8] drm/i915/gem: Suppress oom warning in favour of ENOMEM to userspace We report object allocation failures to userspace with ENOMEM so add __GFP_NOWARN to remove superfluous oom warnings. Closes: https://gitlab.freedesktop.org/drm/i915/kernel/-/issues/4936 Cc: Andi Shyti Signed-off-by: Nirmoy Das Reviewed-by: Rodrigo Vivi Reviewed-by: Andi Shyti Signed-off-by: Andi Shyti Link: https://patchwork.freedesktop.org/patch/msgid/20240626143318.11600-1-nirmoy.das@intel.com --- drivers/gpu/drm/i915/i915_scatterlist.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_scatterlist.c b/drivers/gpu/drm/i915/i915_scatterlist.c index e93d2538f298..4d830740946d 100644 --- a/drivers/gpu/drm/i915/i915_scatterlist.c +++ b/drivers/gpu/drm/i915/i915_scatterlist.c @@ -90,7 +90,7 @@ struct i915_refct_sgt *i915_rsgt_from_mm_node(const struct drm_mm_node *node, GEM_BUG_ON(!max_segment); - rsgt = kmalloc(sizeof(*rsgt), GFP_KERNEL); + rsgt = kmalloc(sizeof(*rsgt), GFP_KERNEL | __GFP_NOWARN); if (!rsgt) return ERR_PTR(-ENOMEM); @@ -104,7 +104,7 @@ struct i915_refct_sgt *i915_rsgt_from_mm_node(const struct drm_mm_node *node, } if (sg_alloc_table(st, DIV_ROUND_UP_ULL(node->size, segment_pages), - GFP_KERNEL)) { + GFP_KERNEL | __GFP_NOWARN)) { i915_refct_sgt_put(rsgt); return ERR_PTR(-ENOMEM); } @@ -178,7 +178,7 @@ struct i915_refct_sgt *i915_rsgt_from_buddy_resource(struct ttm_resource *res, GEM_BUG_ON(list_empty(blocks)); GEM_BUG_ON(!max_segment); - rsgt = kmalloc(sizeof(*rsgt), GFP_KERNEL); + rsgt = kmalloc(sizeof(*rsgt), GFP_KERNEL | __GFP_NOWARN); if (!rsgt) return ERR_PTR(-ENOMEM); @@ -190,7 +190,7 @@ struct i915_refct_sgt *i915_rsgt_from_buddy_resource(struct ttm_resource *res, return ERR_PTR(-E2BIG); } - if (sg_alloc_table(st, PFN_UP(res->size), GFP_KERNEL)) { + if (sg_alloc_table(st, PFN_UP(res->size), GFP_KERNEL | __GFP_NOWARN)) { i915_refct_sgt_put(rsgt); return ERR_PTR(-ENOMEM); }