From c0de552910bbd9e49568c1052e00ed118712152a Mon Sep 17 00:00:00 2001 From: Sunil Khatri Date: Tue, 28 Oct 2025 13:49:24 +0530 Subject: [PATCH 01/83] drm/amdkfd: clean up the code to free hmm_range MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit a. hmm_range is either NULL or a valid pointer so we do not need to set range to NULL ever. b. keep the hmm_range_free in the end irrespective of the other conditions to avoid some additional checks and also avoid double free issue. Signed-off-by: Sunil Khatri Reviewed-by: Felix Kuehling Acked-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index ffb7b36e577c..c30dfb8ec236 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -1744,11 +1744,8 @@ static int svm_range_validate_and_map(struct mm_struct *mm, else r = -ENOMEM; WRITE_ONCE(p->svms.faulting_task, NULL); - if (r) { - amdgpu_hmm_range_free(range); - range = NULL; + if (r) pr_debug("failed %d to get svm range pages\n", r); - } } else { r = -EFAULT; } @@ -1771,10 +1768,9 @@ static int svm_range_validate_and_map(struct mm_struct *mm, pr_debug("hmm update the range, need validate again\n"); r = -EAGAIN; } - /* Free the hmm range */ - if (range) - amdgpu_hmm_range_free(range); + /* Free the hmm range */ + amdgpu_hmm_range_free(range); if (!r && !list_empty(&prange->child_list)) { pr_debug("range split by unmap in parallel, validate again\n"); From 1972763505d728c604b537180727ec8132e619df Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 14 Oct 2025 16:45:17 -0400 Subject: [PATCH 02/83] drm/amdgpu: set default gfx reset masks for gfx6-8 These were not set so soft recovery was inadvertently disabled. 
Fixes: 6ac55eab4fc4 ("drm/amdgpu: move reset support type checks into the caller") Reviewed-by: Jesse Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c | 5 +++++ drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c | 5 +++++ drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 5 +++++ 3 files changed, 15 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c index 7693b7953426..80565392313f 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c @@ -3102,6 +3102,11 @@ static int gfx_v6_0_sw_init(struct amdgpu_ip_block *ip_block) return r; } + adev->gfx.gfx_supported_reset = + amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]); + adev->gfx.compute_supported_reset = + amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]); + return r; } diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c index 5976ed55d9db..2b7aba22ecc1 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c @@ -4399,6 +4399,11 @@ static int gfx_v7_0_sw_init(struct amdgpu_ip_block *ip_block) gfx_v7_0_gpu_early_init(adev); + adev->gfx.gfx_supported_reset = + amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]); + adev->gfx.compute_supported_reset = + amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]); + return r; } diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c index d3d0a4b0380c..1c87375e1dd5 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c @@ -2023,6 +2023,11 @@ static int gfx_v8_0_sw_init(struct amdgpu_ip_block *ip_block) if (r) return r; + adev->gfx.gfx_supported_reset = + amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]); + adev->gfx.compute_supported_reset = + amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]); + return 0; } From c6526cc6f887c710bab21ee934035a2c9ffb8c4b Mon Sep 17 00:00:00 2001 From: Sunil 
Khatri Date: Tue, 28 Oct 2025 17:39:27 +0530 Subject: [PATCH 03/83] drm/amdgpu: caller should make sure not to double free MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the NULL check from amdgpu_hmm_range_free for hmm_pfns as caller is responsible not to call amdgpu_hmm_range_free more than once. Signed-off-by: Sunil Khatri Reviewed-by: Christian König Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c index 518ca3f4db2b..90d26d820bac 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c @@ -287,9 +287,7 @@ void amdgpu_hmm_range_free(struct amdgpu_hmm_range *range) if (!range) return; - if (range->hmm_range.hmm_pfns) - kvfree(range->hmm_range.hmm_pfns); - + kvfree(range->hmm_range.hmm_pfns); amdgpu_bo_unref(&range->bo); kfree(range); } From b5e333e634bfcb740ae157653a9788f6c75226c2 Mon Sep 17 00:00:00 2001 From: Asad Kamal Date: Mon, 27 Oct 2025 18:11:52 +0800 Subject: [PATCH 04/83] drm/amdgpu: Update invalidate and flush hdp function Update asic_invalidate_hdp and asic_flush_hdp function to check if ip function exist, if not return void v2: Use else/if (Kevin) Update function name (Lijo) Signed-off-by: Asad Kamal Suggested-by: Lijo Lazar Reviewed-by: Yang Wang Reviewed-by: Lijo Lazar Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 7 ++----- drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.c | 16 ++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.h | 4 ++++ 3 files changed, 22 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 50079209c472..d1137d8beca7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1539,11 +1539,8 @@ int 
emu_soc_asic_init(struct amdgpu_device *adev); #define amdgpu_asic_read_bios_from_rom(adev, b, l) (adev)->asic_funcs->read_bios_from_rom((adev), (b), (l)) #define amdgpu_asic_read_register(adev, se, sh, offset, v)((adev)->asic_funcs->read_register((adev), (se), (sh), (offset), (v))) #define amdgpu_asic_get_config_memsize(adev) (adev)->asic_funcs->get_config_memsize((adev)) -#define amdgpu_asic_flush_hdp(adev, r) \ - ((adev)->asic_funcs->flush_hdp ? (adev)->asic_funcs->flush_hdp((adev), (r)) : (adev)->hdp.funcs->flush_hdp((adev), (r))) -#define amdgpu_asic_invalidate_hdp(adev, r) \ - ((adev)->asic_funcs->invalidate_hdp ? (adev)->asic_funcs->invalidate_hdp((adev), (r)) : \ - ((adev)->hdp.funcs->invalidate_hdp ? (adev)->hdp.funcs->invalidate_hdp((adev), (r)) : (void)0)) +#define amdgpu_asic_flush_hdp(adev, r) amdgpu_hdp_flush(adev, r) +#define amdgpu_asic_invalidate_hdp(adev, r) amdgpu_hdp_invalidate(adev, r) #define amdgpu_asic_need_full_reset(adev) (adev)->asic_funcs->need_full_reset((adev)) #define amdgpu_asic_init_doorbell_index(adev) (adev)->asic_funcs->init_doorbell_index((adev)) #define amdgpu_asic_get_pcie_usage(adev, cnt0, cnt1) ((adev)->asic_funcs->get_pcie_usage((adev), (cnt0), (cnt1))) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.c index 6e02fb9ac2f6..5a60d69a3e1f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.c @@ -66,3 +66,19 @@ void amdgpu_hdp_generic_flush(struct amdgpu_device *adev, 0); } } + +void amdgpu_hdp_invalidate(struct amdgpu_device *adev, struct amdgpu_ring *ring) +{ + if (adev->asic_funcs && adev->asic_funcs->invalidate_hdp) + adev->asic_funcs->invalidate_hdp(adev, ring); + else if (adev->hdp.funcs && adev->hdp.funcs->invalidate_hdp) + adev->hdp.funcs->invalidate_hdp(adev, ring); +} + +void amdgpu_hdp_flush(struct amdgpu_device *adev, struct amdgpu_ring *ring) +{ + if (adev->asic_funcs && adev->asic_funcs->flush_hdp) + 
adev->asic_funcs->flush_hdp(adev, ring); + else if (adev->hdp.funcs && adev->hdp.funcs->flush_hdp) + adev->hdp.funcs->flush_hdp(adev, ring); +} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.h index 4cfd932b7e91..d9f488fa76b9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.h @@ -46,4 +46,8 @@ struct amdgpu_hdp { int amdgpu_hdp_ras_sw_init(struct amdgpu_device *adev); void amdgpu_hdp_generic_flush(struct amdgpu_device *adev, struct amdgpu_ring *ring); +void amdgpu_hdp_invalidate(struct amdgpu_device *adev, + struct amdgpu_ring *ring); +void amdgpu_hdp_flush(struct amdgpu_device *adev, + struct amdgpu_ring *ring); #endif /* __AMDGPU_HDP_H__ */ From ad0a48e531a3137cec16bb5f8f60c8cc8de06b01 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 14 Oct 2025 17:01:05 -0400 Subject: [PATCH 05/83] drm/amdgpu: move reset debug disable handling Move everything to the supported reset masks rather than having explicit misc checks for this. 
Reviewed-by: Jesse Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 8 +++----- drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 3 --- drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 3 ++- drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 6 ++++-- drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c | 3 ++- drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 2 +- drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 6 ++++-- drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 8 ++++++-- drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 3 ++- drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c | 6 ++++-- drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c | 3 ++- drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c | 3 ++- 12 files changed, 32 insertions(+), 22 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index e08d837668f1..9e0cd1e0afc3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -130,11 +130,9 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) } /* attempt a per ring reset */ - if (unlikely(adev->debug_disable_gpu_ring_reset)) { - dev_err(adev->dev, "Ring reset disabled by debug mask\n"); - } else if (amdgpu_gpu_recovery && - amdgpu_ring_is_reset_type_supported(ring, AMDGPU_RESET_TYPE_PER_QUEUE) && - ring->funcs->reset) { + if (amdgpu_gpu_recovery && + amdgpu_ring_is_reset_type_supported(ring, AMDGPU_RESET_TYPE_PER_QUEUE) && + ring->funcs->reset) { dev_err(adev->dev, "Starting %s ring reset\n", s_job->sched->name); r = amdgpu_ring_reset(ring, job->vmid, job->hw_fence); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c index 43f769fed810..bf1b90a341d8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c @@ -468,9 +468,6 @@ bool amdgpu_ring_soft_recovery(struct amdgpu_ring *ring, unsigned int vmid, ktime_t deadline; bool ret; - if (unlikely(ring->adev->debug_disable_soft_recovery)) - return false; - deadline 
= ktime_add_us(ktime_get(), 10000); if (amdgpu_sriov_vf(ring->adev) || !ring->funcs->soft_recovery || !fence) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c index 751732f3e883..d75b9940f248 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c @@ -4956,7 +4956,8 @@ static int gfx_v10_0_sw_init(struct amdgpu_ip_block *ip_block) amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]); adev->gfx.compute_supported_reset = amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]); - if (!amdgpu_sriov_vf(adev)) { + if (!amdgpu_sriov_vf(adev) && + !adev->debug_disable_gpu_ring_reset) { adev->gfx.compute_supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE; adev->gfx.gfx_supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE; } diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c index 252517ce5d5a..02d7cfae22bd 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c @@ -1821,13 +1821,15 @@ static int gfx_v11_0_sw_init(struct amdgpu_ip_block *ip_block) case IP_VERSION(11, 0, 3): if ((adev->gfx.me_fw_version >= 2280) && (adev->gfx.mec_fw_version >= 2410) && - !amdgpu_sriov_vf(adev)) { + !amdgpu_sriov_vf(adev) && + !adev->debug_disable_gpu_ring_reset) { adev->gfx.compute_supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE; adev->gfx.gfx_supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE; } break; default: - if (!amdgpu_sriov_vf(adev)) { + if (!amdgpu_sriov_vf(adev) && + !adev->debug_disable_gpu_ring_reset) { adev->gfx.compute_supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE; adev->gfx.gfx_supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE; } diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c index 35d5a7e99a7c..d01d2712cf57 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c @@ -1548,7 +1548,8 @@ static int gfx_v12_0_sw_init(struct amdgpu_ip_block 
*ip_block) case IP_VERSION(12, 0, 1): if ((adev->gfx.me_fw_version >= 2660) && (adev->gfx.mec_fw_version >= 2920) && - !amdgpu_sriov_vf(adev)) { + !amdgpu_sriov_vf(adev) && + !adev->debug_disable_gpu_ring_reset) { adev->gfx.compute_supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE; adev->gfx.gfx_supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE; } diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index f1a2efc2a8d0..0148d7ff34d9 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -2409,7 +2409,7 @@ static int gfx_v9_0_sw_init(struct amdgpu_ip_block *ip_block) amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]); adev->gfx.compute_supported_reset = amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]); - if (!amdgpu_sriov_vf(adev)) + if (!amdgpu_sriov_vf(adev) && !adev->debug_disable_gpu_ring_reset) adev->gfx.compute_supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE; r = amdgpu_gfx_kiq_init(adev, GFX9_MEC_HPD_SIZE, 0); diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c index e0b50c690f8c..c4c551ef6b87 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c @@ -1149,14 +1149,16 @@ static int gfx_v9_4_3_sw_init(struct amdgpu_ip_block *ip_block) case IP_VERSION(9, 4, 3): case IP_VERSION(9, 4, 4): if ((adev->gfx.mec_fw_version >= 155) && - !amdgpu_sriov_vf(adev)) { + !amdgpu_sriov_vf(adev) && + !adev->debug_disable_gpu_ring_reset) { adev->gfx.compute_supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE; adev->gfx.compute_supported_reset |= AMDGPU_RESET_TYPE_PER_PIPE; } break; case IP_VERSION(9, 5, 0): if ((adev->gfx.mec_fw_version >= 21) && - !amdgpu_sriov_vf(adev)) { + !amdgpu_sriov_vf(adev) && + !adev->debug_disable_gpu_ring_reset) { adev->gfx.compute_supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE; adev->gfx.compute_supported_reset |= AMDGPU_RESET_TYPE_PER_PIPE; } diff --git 
a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c index 36b1ca73c2ed..a1443990d5c6 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c @@ -2361,11 +2361,15 @@ static void sdma_v4_4_2_update_reset_mask(struct amdgpu_device *adev) switch (amdgpu_ip_version(adev, GC_HWIP, 0)) { case IP_VERSION(9, 4, 3): case IP_VERSION(9, 4, 4): - if ((adev->gfx.mec_fw_version >= 0xb0) && amdgpu_dpm_reset_sdma_is_supported(adev)) + if ((adev->gfx.mec_fw_version >= 0xb0) && + amdgpu_dpm_reset_sdma_is_supported(adev) && + !adev->debug_disable_gpu_ring_reset) adev->sdma.supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE; break; case IP_VERSION(9, 5, 0): - if ((adev->gfx.mec_fw_version >= 0xf) && amdgpu_dpm_reset_sdma_is_supported(adev)) + if ((adev->gfx.mec_fw_version >= 0xf) && + amdgpu_dpm_reset_sdma_is_supported(adev) && + !adev->debug_disable_gpu_ring_reset) adev->sdma.supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE; break; default: diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c index 7dc67a22a7a0..8ddc4df06a1f 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c @@ -1429,7 +1429,8 @@ static int sdma_v5_0_sw_init(struct amdgpu_ip_block *ip_block) case IP_VERSION(5, 0, 2): case IP_VERSION(5, 0, 5): if ((adev->sdma.instance[0].fw_version >= 35) && - !amdgpu_sriov_vf(adev)) + !amdgpu_sriov_vf(adev) && + !adev->debug_disable_gpu_ring_reset) adev->sdma.supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE; break; default: diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c index 3bd44c24f692..c6a619514a8a 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c @@ -1348,12 +1348,14 @@ static int sdma_v5_2_sw_init(struct amdgpu_ip_block *ip_block) case IP_VERSION(5, 2, 3): case IP_VERSION(5, 2, 4): if ((adev->sdma.instance[0].fw_version >= 76) && - 
!amdgpu_sriov_vf(adev)) + !amdgpu_sriov_vf(adev) && + !adev->debug_disable_gpu_ring_reset) adev->sdma.supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE; break; case IP_VERSION(5, 2, 5): if ((adev->sdma.instance[0].fw_version >= 34) && - !amdgpu_sriov_vf(adev)) + !amdgpu_sriov_vf(adev) && + !adev->debug_disable_gpu_ring_reset) adev->sdma.supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE; break; default: diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c index db6e41967f12..0ceeb19df2e5 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c @@ -1356,7 +1356,8 @@ static int sdma_v6_0_sw_init(struct amdgpu_ip_block *ip_block) case IP_VERSION(6, 0, 2): case IP_VERSION(6, 0, 3): if ((adev->sdma.instance[0].fw_version >= 21) && - !amdgpu_sriov_vf(adev)) + !amdgpu_sriov_vf(adev) && + !adev->debug_disable_gpu_ring_reset) adev->sdma.supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE; break; default: diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c index 326ecc8d37d2..2b81344dcd66 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c @@ -1337,7 +1337,8 @@ static int sdma_v7_0_sw_init(struct amdgpu_ip_block *ip_block) adev->sdma.supported_reset = amdgpu_get_soft_full_reset_mask(&adev->sdma.instance[0].ring); - if (!amdgpu_sriov_vf(adev)) + if (!amdgpu_sriov_vf(adev) && + !adev->debug_disable_gpu_ring_reset) adev->sdma.supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE; r = amdgpu_sdma_sysfs_reset_mask_init(adev); From 527e3d40339b228f1c5a81ef17b4d883ead18530 Mon Sep 17 00:00:00 2001 From: Xiang Liu Date: Wed, 22 Oct 2025 15:11:42 +0800 Subject: [PATCH 06/83] drm/amd/ras: Add CPER ring read for uniras Read CPER raw data from debugfs node "/sys/kernel/debug/dri/*/amdgpu_ring_cper". 
Signed-off-by: Xiang Liu Reviewed-by: Tao Zhou Reviewed-by: Yang Wang Signed-off-by: Alex Deucher --- .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 3 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 59 +++++++++++++++++++ 2 files changed, 61 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index 5a7bf0661dbf..011fa4748084 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -770,7 +770,8 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control) "Saved bad pages %d reaches threshold value %d\n", control->ras_num_bad_pages, ras->bad_page_cnt_threshold); - if (adev->cper.enabled && amdgpu_cper_generate_bp_threshold_record(adev)) + if (adev->cper.enabled && !amdgpu_uniras_enabled(adev) && + amdgpu_cper_generate_bp_threshold_record(adev)) dev_warn(adev->dev, "fail to generate bad page threshold cper records\n"); if ((amdgpu_bad_page_threshold != -1) && diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c index bf1b90a341d8..cd8873c6931a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c @@ -33,6 +33,7 @@ #include #include "amdgpu.h" +#include "amdgpu_ras_mgr.h" #include "atom.h" /* @@ -495,6 +496,61 @@ bool amdgpu_ring_soft_recovery(struct amdgpu_ring *ring, unsigned int vmid, */ #if defined(CONFIG_DEBUG_FS) +static ssize_t amdgpu_ras_cper_debugfs_read(struct file *f, char __user *buf, + size_t size, loff_t *offset) +{ + const uint8_t ring_header_size = 12; + struct amdgpu_ring *ring = file_inode(f)->i_private; + struct ras_cmd_cper_snapshot_req *snapshot_req __free(kfree) = + kzalloc(sizeof(struct ras_cmd_cper_snapshot_req), GFP_KERNEL); + struct ras_cmd_cper_snapshot_rsp *snapshot_rsp __free(kfree) = + kzalloc(sizeof(struct ras_cmd_cper_snapshot_rsp), GFP_KERNEL); + struct ras_cmd_cper_record_req *record_req 
__free(kfree) = + kzalloc(sizeof(struct ras_cmd_cper_record_req), GFP_KERNEL); + struct ras_cmd_cper_record_rsp *record_rsp __free(kfree) = + kzalloc(sizeof(struct ras_cmd_cper_record_rsp), GFP_KERNEL); + uint8_t *ring_header __free(kfree) = + kzalloc(ring_header_size, GFP_KERNEL); + uint32_t total_cper_num; + uint64_t start_cper_id; + int r; + + if (!snapshot_req || !snapshot_rsp || !record_req || !record_rsp || + !ring_header) + return -ENOMEM; + + if (!(*offset)) { + if (copy_to_user(buf, ring_header, ring_header_size)) + return -EFAULT; + buf += ring_header_size; + } + + r = amdgpu_ras_mgr_handle_ras_cmd(ring->adev, + RAS_CMD__GET_CPER_SNAPSHOT, + snapshot_req, sizeof(struct ras_cmd_cper_snapshot_req), + snapshot_rsp, sizeof(struct ras_cmd_cper_snapshot_rsp)); + if (r || !snapshot_rsp->total_cper_num) + return r; + + start_cper_id = snapshot_rsp->start_cper_id; + total_cper_num = snapshot_rsp->total_cper_num; + + record_req->buf_ptr = (uint64_t)(uintptr_t)buf; + record_req->buf_size = size; + record_req->cper_start_id = start_cper_id + *offset; + record_req->cper_num = total_cper_num; + r = amdgpu_ras_mgr_handle_ras_cmd(ring->adev, RAS_CMD__GET_CPER_RECORD, + record_req, sizeof(struct ras_cmd_cper_record_req), + record_rsp, sizeof(struct ras_cmd_cper_record_rsp)); + if (r) + return r; + + r = *offset ? 
record_rsp->real_data_size : record_rsp->real_data_size + ring_header_size; + (*offset) += record_rsp->real_cper_num; + + return r; +} + /* Layout of file is 12 bytes consisting of * - rptr * - wptr @@ -511,6 +567,9 @@ static ssize_t amdgpu_debugfs_ring_read(struct file *f, char __user *buf, loff_t i; int r; + if (ring->funcs->type == AMDGPU_RING_TYPE_CPER && amdgpu_uniras_enabled(ring->adev)) + return amdgpu_ras_cper_debugfs_read(f, buf, size, pos); + if (*pos & 3 || size & 3) return -EINVAL; From d80391dd037af032a38e36a9f9d366b7afd195b4 Mon Sep 17 00:00:00 2001 From: Asad Kamal Date: Tue, 28 Oct 2025 22:08:28 +0800 Subject: [PATCH 07/83] drm/amdgpu: Remove invalidate and flush hdp macros MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove amdgpu_asic_flush_hdp & amdgpu_asic_invalidate_hdp functions and directly use the mapped ones Signed-off-by: Asad Kamal Reviewed-by: Lijo Lazar Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 -- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 ++-- drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c | 2 +- drivers/gpu/drm/amd/pm/powerplay/smumgr/smu10_smumgr.c | 4 ++-- drivers/gpu/drm/amd/pm/powerplay/smumgr/vega10_smumgr.c | 4 ++-- drivers/gpu/drm/amd/pm/powerplay/smumgr/vega12_smumgr.c | 4 ++-- drivers/gpu/drm/amd/pm/powerplay/smumgr/vega20_smumgr.c | 8 ++++---- drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c | 2 +- drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_12_ppt.c | 2 +- drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 6 +++--- drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c | 4 ++-- 11 files changed, 20 insertions(+), 22 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index d1137d8beca7..bcfed46eedaf 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1539,8 +1539,6 @@ int emu_soc_asic_init(struct amdgpu_device *adev); #define 
amdgpu_asic_read_bios_from_rom(adev, b, l) (adev)->asic_funcs->read_bios_from_rom((adev), (b), (l)) #define amdgpu_asic_read_register(adev, se, sh, offset, v)((adev)->asic_funcs->read_register((adev), (se), (sh), (offset), (v))) #define amdgpu_asic_get_config_memsize(adev) (adev)->asic_funcs->get_config_memsize((adev)) -#define amdgpu_asic_flush_hdp(adev, r) amdgpu_hdp_flush(adev, r) -#define amdgpu_asic_invalidate_hdp(adev, r) amdgpu_hdp_invalidate(adev, r) #define amdgpu_asic_need_full_reset(adev) (adev)->asic_funcs->need_full_reset((adev)) #define amdgpu_asic_init_doorbell_index(adev) (adev)->asic_funcs->init_doorbell_index((adev)) #define amdgpu_asic_get_pcie_usage(adev, cnt0, cnt1) ((adev)->asic_funcs->get_pcie_usage((adev), (cnt0), (cnt1))) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 654f4844b7ad..b385e086e6c2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -7327,7 +7327,7 @@ void amdgpu_device_flush_hdp(struct amdgpu_device *adev, return; } - amdgpu_asic_flush_hdp(adev, ring); + amdgpu_hdp_flush(adev, ring); } void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, @@ -7340,7 +7340,7 @@ void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, if (adev->gmc.xgmi.connected_to_cpu) return; - amdgpu_asic_invalidate_hdp(adev, ring); + amdgpu_hdp_invalidate(adev, ring); } int amdgpu_in_reset(struct amdgpu_device *adev) diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c index c6a619514a8a..51101b0aa2fa 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c @@ -342,7 +342,7 @@ static void sdma_v5_2_ring_emit_hdp_flush(struct amdgpu_ring *ring) const struct nbio_hdp_flush_reg *nbio_hf_reg = adev->nbio.hdp_flush_reg; if (ring->me > 1) { - amdgpu_asic_flush_hdp(adev, ring); + amdgpu_hdp_flush(adev, ring); } else { ref_and_mask = 
nbio_hf_reg->ref_and_mask_sdma0 << ring->me; diff --git a/drivers/gpu/drm/amd/pm/powerplay/smumgr/smu10_smumgr.c b/drivers/gpu/drm/amd/pm/powerplay/smumgr/smu10_smumgr.c index ac9ec8257f82..38e19e5cad4d 100644 --- a/drivers/gpu/drm/amd/pm/powerplay/smumgr/smu10_smumgr.c +++ b/drivers/gpu/drm/amd/pm/powerplay/smumgr/smu10_smumgr.c @@ -139,7 +139,7 @@ static int smu10_copy_table_from_smc(struct pp_hwmgr *hwmgr, priv->smu_tables.entry[table_id].table_id, NULL); - amdgpu_asic_invalidate_hdp(adev, NULL); + amdgpu_hdp_invalidate(adev, NULL); memcpy(table, (uint8_t *)priv->smu_tables.entry[table_id].table, priv->smu_tables.entry[table_id].size); @@ -164,7 +164,7 @@ static int smu10_copy_table_to_smc(struct pp_hwmgr *hwmgr, memcpy(priv->smu_tables.entry[table_id].table, table, priv->smu_tables.entry[table_id].size); - amdgpu_asic_flush_hdp(adev, NULL); + amdgpu_hdp_flush(adev, NULL); smum_send_msg_to_smc_with_parameter(hwmgr, PPSMC_MSG_SetDriverDramAddrHigh, diff --git a/drivers/gpu/drm/amd/pm/powerplay/smumgr/vega10_smumgr.c b/drivers/gpu/drm/amd/pm/powerplay/smumgr/vega10_smumgr.c index f9c0f117725d..0bf1bf5528c2 100644 --- a/drivers/gpu/drm/amd/pm/powerplay/smumgr/vega10_smumgr.c +++ b/drivers/gpu/drm/amd/pm/powerplay/smumgr/vega10_smumgr.c @@ -60,7 +60,7 @@ static int vega10_copy_table_from_smc(struct pp_hwmgr *hwmgr, priv->smu_tables.entry[table_id].table_id, NULL); - amdgpu_asic_invalidate_hdp(adev, NULL); + amdgpu_hdp_invalidate(adev, NULL); memcpy(table, priv->smu_tables.entry[table_id].table, priv->smu_tables.entry[table_id].size); @@ -90,7 +90,7 @@ static int vega10_copy_table_to_smc(struct pp_hwmgr *hwmgr, memcpy(priv->smu_tables.entry[table_id].table, table, priv->smu_tables.entry[table_id].size); - amdgpu_asic_flush_hdp(adev, NULL); + amdgpu_hdp_flush(adev, NULL); smum_send_msg_to_smc_with_parameter(hwmgr, PPSMC_MSG_SetDriverDramAddrHigh, diff --git a/drivers/gpu/drm/amd/pm/powerplay/smumgr/vega12_smumgr.c 
b/drivers/gpu/drm/amd/pm/powerplay/smumgr/vega12_smumgr.c index d3ff6a831ed5..e2ba593faa5d 100644 --- a/drivers/gpu/drm/amd/pm/powerplay/smumgr/vega12_smumgr.c +++ b/drivers/gpu/drm/amd/pm/powerplay/smumgr/vega12_smumgr.c @@ -68,7 +68,7 @@ static int vega12_copy_table_from_smc(struct pp_hwmgr *hwmgr, "[CopyTableFromSMC] Attempt to Transfer Table From SMU Failed!", return -EINVAL); - amdgpu_asic_invalidate_hdp(adev, NULL); + amdgpu_hdp_invalidate(adev, NULL); memcpy(table, priv->smu_tables.entry[table_id].table, priv->smu_tables.entry[table_id].size); @@ -98,7 +98,7 @@ static int vega12_copy_table_to_smc(struct pp_hwmgr *hwmgr, memcpy(priv->smu_tables.entry[table_id].table, table, priv->smu_tables.entry[table_id].size); - amdgpu_asic_flush_hdp(adev, NULL); + amdgpu_hdp_flush(adev, NULL); PP_ASSERT_WITH_CODE(smum_send_msg_to_smc_with_parameter(hwmgr, PPSMC_MSG_SetDriverDramAddrHigh, diff --git a/drivers/gpu/drm/amd/pm/powerplay/smumgr/vega20_smumgr.c b/drivers/gpu/drm/amd/pm/powerplay/smumgr/vega20_smumgr.c index a5c95b180672..e3515156d26f 100644 --- a/drivers/gpu/drm/amd/pm/powerplay/smumgr/vega20_smumgr.c +++ b/drivers/gpu/drm/amd/pm/powerplay/smumgr/vega20_smumgr.c @@ -192,7 +192,7 @@ static int vega20_copy_table_from_smc(struct pp_hwmgr *hwmgr, "[CopyTableFromSMC] Attempt to Transfer Table From SMU Failed!", return ret); - amdgpu_asic_invalidate_hdp(adev, NULL); + amdgpu_hdp_invalidate(adev, NULL); memcpy(table, priv->smu_tables.entry[table_id].table, priv->smu_tables.entry[table_id].size); @@ -223,7 +223,7 @@ static int vega20_copy_table_to_smc(struct pp_hwmgr *hwmgr, memcpy(priv->smu_tables.entry[table_id].table, table, priv->smu_tables.entry[table_id].size); - amdgpu_asic_flush_hdp(adev, NULL); + amdgpu_hdp_flush(adev, NULL); PP_ASSERT_WITH_CODE((ret = smum_send_msg_to_smc_with_parameter(hwmgr, PPSMC_MSG_SetDriverDramAddrHigh, @@ -256,7 +256,7 @@ int vega20_set_activity_monitor_coeff(struct pp_hwmgr *hwmgr, 
memcpy(priv->smu_tables.entry[TABLE_ACTIVITY_MONITOR_COEFF].table, table, priv->smu_tables.entry[TABLE_ACTIVITY_MONITOR_COEFF].size); - amdgpu_asic_flush_hdp(adev, NULL); + amdgpu_hdp_flush(adev, NULL); PP_ASSERT_WITH_CODE((ret = smum_send_msg_to_smc_with_parameter(hwmgr, PPSMC_MSG_SetDriverDramAddrHigh, @@ -306,7 +306,7 @@ int vega20_get_activity_monitor_coeff(struct pp_hwmgr *hwmgr, "[GetActivityMonitor] Attempt to Transfer Table From SMU Failed!", return ret); - amdgpu_asic_invalidate_hdp(adev, NULL); + amdgpu_hdp_invalidate(adev, NULL); memcpy(table, priv->smu_tables.entry[TABLE_ACTIVITY_MONITOR_COEFF].table, priv->smu_tables.entry[TABLE_ACTIVITY_MONITOR_COEFF].size); diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c index bbf09aec9152..7c9f77124ab2 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c @@ -2889,7 +2889,7 @@ static int navi10_set_dummy_pstates_table_location(struct smu_context *smu) dummy_table += 0x1000; } - amdgpu_asic_flush_hdp(smu->adev, NULL); + amdgpu_hdp_flush(smu->adev, NULL); ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_SET_DRIVER_DUMMY_TABLE_DRAM_ADDR_HIGH, diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_12_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_12_ppt.c index cb3fea9e8cf3..a0c844bf852c 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_12_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_12_ppt.c @@ -449,7 +449,7 @@ static int smu_v13_0_12_get_system_metrics_table(struct smu_context *smu) return ret; } - amdgpu_asic_invalidate_hdp(smu->adev, NULL); + amdgpu_hdp_invalidate(smu->adev, NULL); smu_table_cache_update_time(sys_table, jiffies); memcpy(sys_table->cache.buffer, table->cpu_addr, smu_v13_0_12_get_system_metrics_size()); diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c index 
0a7d2cea7dc6..e1f9aa5d6c20 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c @@ -766,7 +766,7 @@ int smu_v13_0_6_get_metrics_table(struct smu_context *smu, void *metrics_table, return ret; } - amdgpu_asic_invalidate_hdp(smu->adev, NULL); + amdgpu_hdp_invalidate(smu->adev, NULL); memcpy(smu_table->metrics_table, table->cpu_addr, table_size); smu_table->metrics_time = jiffies; @@ -845,7 +845,7 @@ int smu_v13_0_6_get_static_metrics_table(struct smu_context *smu) return ret; } - amdgpu_asic_invalidate_hdp(smu->adev, NULL); + amdgpu_hdp_invalidate(smu->adev, NULL); memcpy(smu_table->metrics_table, table->cpu_addr, table_size); return 0; @@ -2385,7 +2385,7 @@ static int smu_v13_0_6_request_i2c_xfer(struct smu_context *smu, memcpy(table->cpu_addr, table_data, table_size); /* Flush hdp cache */ - amdgpu_asic_flush_hdp(adev, NULL); + amdgpu_hdp_flush(adev, NULL); ret = smu_cmn_send_smc_msg(smu, SMU_MSG_RequestI2cTransaction, NULL); diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c index a8961a8f5c42..3b98065dac1d 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c @@ -980,7 +980,7 @@ int smu_cmn_update_table(struct smu_context *smu, * Flush hdp cache: to guard the content seen by * GPU is consitent with CPU. */ - amdgpu_asic_flush_hdp(adev, NULL); + amdgpu_hdp_flush(adev, NULL); } ret = smu_cmn_send_smc_msg_with_param(smu, drv2smu ? 
@@ -992,7 +992,7 @@ int smu_cmn_update_table(struct smu_context *smu, return ret; if (!drv2smu) { - amdgpu_asic_invalidate_hdp(adev, NULL); + amdgpu_hdp_invalidate(adev, NULL); memcpy(table_data, table->cpu_addr, table_size); } From fd0e35bdd3a5ca43cfe4dad48ad7df510ee18407 Mon Sep 17 00:00:00 2001 From: YiPeng Chai Date: Tue, 28 Oct 2025 16:34:34 +0800 Subject: [PATCH 08/83] drm/amd/ras: Fix the error of undefined reference to `__udivdi3' Fix the error: drivers/gpu/drm/amd/amdgpu/../ras/ras_mgr/amdgpu_ras_mgr.c:132:undefined reference to `__udivdi3' Fixes: fa0b203cd902 ("drm/amd/ras: Add amdgpu ras management function.") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202510272144.6SUHUoWx-lkp@intel.com/ Signed-off-by: YiPeng Chai Reviewed-by: Tao Zhou Reviewed-by: Yang Wang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.c b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.c index 8007e49951d8..dc2a4c6c1907 100644 --- a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.c +++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.c @@ -37,7 +37,7 @@ #define MAX_XCD_NUM_PER_AID 2 /* typical ECC bad page rate is 1 bad page per 100MB VRAM */ -#define ESTIMATE_BAD_PAGE_THRESHOLD(size) ((size)/(100 * 1024 * 1024ULL)) +#define TYPICAL_ECC_BAD_PAGE_RATE (100ULL * SZ_1M) #define COUNT_BAD_PAGE_THRESHOLD(size) (((size) >> 21) << 4) @@ -129,7 +129,7 @@ static int amdgpu_ras_mgr_init_eeprom_config(struct amdgpu_device *adev, */ if (amdgpu_bad_page_threshold == NONSTOP_OVER_THRESHOLD) eeprom_cfg->eeprom_record_threshold_count = - ESTIMATE_BAD_PAGE_THRESHOLD(adev->gmc.mc_vram_size); + div64_u64(adev->gmc.mc_vram_size, TYPICAL_ECC_BAD_PAGE_RATE); else if (amdgpu_bad_page_threshold == WARN_NONSTOP_OVER_THRESHOLD) eeprom_cfg->eeprom_record_threshold_count = COUNT_BAD_PAGE_THRESHOLD(RAS_RESERVED_VRAM_SIZE_DEFAULT); 
From 812b727364f166e9e4529a56c33707a9cf15e651 Mon Sep 17 00:00:00 2001 From: YiPeng Chai Date: Tue, 28 Oct 2025 16:30:12 +0800 Subject: [PATCH 09/83] drm/amdgpu: Fix error injection parameter error Fix error injection parameter error: pass info->value in the request's method field instead of its value field. Signed-off-by: YiPeng Chai Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index c8b4dd3ea5c3..0984928db042 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1661,7 +1661,7 @@ static int amdgpu_uniras_error_inject(struct amdgpu_device *adev, inject_req.address = info->address; inject_req.error_type = info->head.type; inject_req.instance_mask = info->instance_mask; - inject_req.value = info->value; + inject_req.method = info->value; return amdgpu_ras_mgr_handle_ras_cmd(adev, RAS_CMD__INJECT_ERROR, &inject_req, sizeof(inject_req), &rsp, sizeof(rsp)); From 5e55fd9bc6bd5098ba963f6b3118549f2621bca5 Mon Sep 17 00:00:00 2001 From: Harry Wentland Date: Wed, 29 Oct 2025 10:28:14 -0400 Subject: [PATCH 10/83] drm/amd/display: Fix null pointer on analog detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Check if we have an amdgpu_dm_connector->dc_sink first before adding common modes for analog outputs. If we don't have a sink yet we can safely skip this.
Fixes: 70181ad96ec2 ("drm/amd/display: Add common modes to analog displays without EDID") Signed-off-by: Harry Wentland Reviewed-by: Timur Kristóf Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index bb0fe91a1601..b798154768c8 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -8690,7 +8690,7 @@ static int amdgpu_dm_connector_get_modes(struct drm_connector *connector) amdgpu_dm_connector->num_modes += drm_add_modes_noedid(connector, 1920, 1080); - if (amdgpu_dm_connector->dc_sink->edid_caps.analog) { + if (amdgpu_dm_connector->dc_sink && amdgpu_dm_connector->dc_sink->edid_caps.analog) { /* Analog monitor connected by DAC load detection. * Add common modes. It will be up to the user to select one that works. */ From 87208c10683930fbc066e6fe3b6111a0ab67bd04 Mon Sep 17 00:00:00 2001 From: Xiang Liu Date: Mon, 20 Oct 2025 15:45:23 +0800 Subject: [PATCH 11/83] drm/amd/ras: Update IPID value for bad page threshold CPER The IPID register value for bad page threshold CPER holds socket_id info now according to the latest definition. 
Signed-off-by: Xiang Liu Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/ras/rascore/ras_log_ring.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_log_ring.c b/drivers/gpu/drm/amd/ras/rascore/ras_log_ring.c index d0621464f1a7..0a838fdcb2f6 100644 --- a/drivers/gpu/drm/amd/ras/rascore/ras_log_ring.c +++ b/drivers/gpu/drm/amd/ras/rascore/ras_log_ring.c @@ -226,7 +226,9 @@ void ras_log_ring_add_log_event(struct ras_core_context *ras_core, enum ras_log_event event, void *data, struct ras_log_batch_tag *batch_tag) { struct ras_log_ring *log_ring = &ras_core->ras_log_ring; + struct device_system_info dev_info = {0}; struct ras_log_info *log; + uint64_t socket_id; void *obj; obj = mempool_alloc_preallocated(log_ring->ras_log_mempool); @@ -252,8 +254,13 @@ void ras_log_ring_add_log_event(struct ras_core_context *ras_core, if (data) memcpy(&log->aca_reg, data, sizeof(log->aca_reg)); - if (event == RAS_LOG_EVENT_RMA) + if (event == RAS_LOG_EVENT_RMA) { memcpy(&log->aca_reg, ras_rma_aca_reg, sizeof(log->aca_reg)); + ras_core_get_device_system_info(ras_core, &dev_info); + socket_id = dev_info.socket_id; + log->aca_reg.regs[ACA_REG_IDX__IPID] |= ((socket_id / 4) & 0x01); + log->aca_reg.regs[ACA_REG_IDX__IPID] |= (((socket_id % 4) & 0x3) << 44); + } ras_log_ring_add_data(ras_core, log, batch_tag); } From 8f94d5d0d7ec2af5c583ca6666acfc8f4413a850 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Mon, 27 Oct 2025 15:22:54 +0800 Subject: [PATCH 12/83] drm/amd/pm: fix the issue of size calculation error for smu 13.0.6 v1: the driver should handle return value of smu_v13_0_6_print_clk_levels() to return the correct size for sysfs reads.
v2: fix the issue of size calculation error in smu_v13_0_6_print_clks() Fixes: cdfdec6f1608 ("drm/amd/pm: Avoid writing nulls into `pp_od_clk_voltage`") Signed-off-by: Yang Wang Reviewed-by: Lijo Lazar Signed-off-by: Alex Deucher --- .../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 42 ++++++++++++++----- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c index e1f9aa5d6c20..197fd91e1fb4 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c @@ -1394,7 +1394,7 @@ static int smu_v13_0_6_print_clks(struct smu_context *smu, char *buf, int size, return -EINVAL; if (curr_clk < SMU_13_0_6_DSCLK_THRESHOLD) { - size = sysfs_emit_at(buf, size, "S: %uMhz *\n", curr_clk); + size += sysfs_emit_at(buf, size, "S: %uMhz *\n", curr_clk); for (i = 0; i < clocks.num_levels; i++) size += sysfs_emit_at(buf, size, "%d: %uMhz\n", i, clocks.data[i].clocks_in_khz / @@ -1514,9 +1514,13 @@ static int smu_v13_0_6_print_clk_levels(struct smu_context *smu, single_dpm_table = &(dpm_context->dpm_tables.uclk_table); - return smu_v13_0_6_print_clks(smu, buf, size, single_dpm_table, - now, "mclk"); + ret = smu_v13_0_6_print_clks(smu, buf, size, single_dpm_table, + now, "mclk"); + if (ret < 0) + return ret; + size += ret; + break; case SMU_SOCCLK: ret = smu_v13_0_6_get_current_clk_freq_by_table(smu, SMU_SOCCLK, &now); @@ -1528,9 +1532,13 @@ static int smu_v13_0_6_print_clk_levels(struct smu_context *smu, single_dpm_table = &(dpm_context->dpm_tables.soc_table); - return smu_v13_0_6_print_clks(smu, buf, size, single_dpm_table, - now, "socclk"); + ret = smu_v13_0_6_print_clks(smu, buf, size, single_dpm_table, + now, "socclk"); + if (ret < 0) + return ret; + size += ret; + break; case SMU_FCLK: ret = smu_v13_0_6_get_current_clk_freq_by_table(smu, SMU_FCLK, &now); @@ -1542,9 +1550,13 @@ static int 
smu_v13_0_6_print_clk_levels(struct smu_context *smu, single_dpm_table = &(dpm_context->dpm_tables.fclk_table); - return smu_v13_0_6_print_clks(smu, buf, size, single_dpm_table, - now, "fclk"); + ret = smu_v13_0_6_print_clks(smu, buf, size, single_dpm_table, + now, "fclk"); + if (ret < 0) + return ret; + size += ret; + break; case SMU_VCLK: ret = smu_v13_0_6_get_current_clk_freq_by_table(smu, SMU_VCLK, &now); @@ -1556,9 +1568,13 @@ static int smu_v13_0_6_print_clk_levels(struct smu_context *smu, single_dpm_table = &(dpm_context->dpm_tables.vclk_table); - return smu_v13_0_6_print_clks(smu, buf, size, single_dpm_table, - now, "vclk"); + ret = smu_v13_0_6_print_clks(smu, buf, size, single_dpm_table, + now, "vclk"); + if (ret < 0) + return ret; + size += ret; + break; case SMU_DCLK: ret = smu_v13_0_6_get_current_clk_freq_by_table(smu, SMU_DCLK, &now); @@ -1570,9 +1586,13 @@ static int smu_v13_0_6_print_clk_levels(struct smu_context *smu, single_dpm_table = &(dpm_context->dpm_tables.dclk_table); - return smu_v13_0_6_print_clks(smu, buf, size, single_dpm_table, - now, "dclk"); + ret = smu_v13_0_6_print_clks(smu, buf, size, single_dpm_table, + now, "dclk"); + if (ret < 0) + return ret; + size += ret; + break; default: break; } From 4c4c138a1c86775c4d47e24f26357a1f8b64d0a3 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Thu, 30 Oct 2025 13:06:24 +0800 Subject: [PATCH 13/83] drm/amd/pm: fix missing device_attr cleanup in amdgpu_pm_sysfs_init() Use the correct label to complete all cleanup work. 
Fixes: 4d154b1ca580 ("drm/amd/pm: Add support for DPM policies") Fixes: 25e82f2e2c59 ("drm/amd/pm: Add temperature metrics sysfs entry") Signed-off-by: Yang Wang Reviewed-by: Lijo Lazar Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/pm/amdgpu_pm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c b/drivers/gpu/drm/amd/pm/amdgpu_pm.c index c88a76cce401..40ffaced74fd 100644 --- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c +++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c @@ -4723,14 +4723,14 @@ int amdgpu_pm_sysfs_init(struct amdgpu_device *adev) ret = devm_device_add_group(adev->dev, &amdgpu_pm_policy_attr_group); if (ret) - goto err_out0; + goto err_out1; } if (amdgpu_dpm_is_temp_metrics_supported(adev, SMU_TEMP_METRIC_GPUBOARD)) { ret = devm_device_add_group(adev->dev, &amdgpu_board_attr_group); if (ret) - goto err_out0; + goto err_out1; if (amdgpu_pm_get_sensor_generic(adev, AMDGPU_PP_SENSOR_MAXNODEPOWERLIMIT, (void *)&tmp) != -EOPNOTSUPP) { sysfs_add_file_to_group(&adev->dev->kobj, From c3cd00fea6d6980faf32421495d0f48727ace89a Mon Sep 17 00:00:00 2001 From: Lijo Lazar Date: Mon, 11 Aug 2025 19:07:05 +0530 Subject: [PATCH 14/83] drm/amd/pm: Add helper functions for gpu metrics Add helper macros to define metrics struct definitions. It will define structs with field type followed by actual field. A helper macro is also added to initialize the field encoding for all fields and to initialize the field members to 0xFFs. 
Signed-off-by: Lijo Lazar Reviewed-by: Asad Kamal Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h | 67 ++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h index 0ae91c8b6d72..8d7c4814c68f 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h @@ -202,5 +202,72 @@ void smu_cmn_get_backend_workload_mask(struct smu_context *smu, u32 workload_mask, u32 *backend_workload_mask); +/*SMU gpu metrics */ + +/* Attribute ID mapping */ +#define SMU_MATTR(X) AMDGPU_METRICS_ATTR_ID_##X +/* Type ID mapping */ +#define SMU_MTYPE(X) AMDGPU_METRICS_TYPE_##X +/* Unit ID mapping */ +#define SMU_MUNIT(X) AMDGPU_METRICS_UNIT_##X + +/* Map TYPEID to C type */ +#define SMU_CTYPE(TYPEID) SMU_CTYPE_##TYPEID + +#define SMU_CTYPE_AMDGPU_METRICS_TYPE_U8 u8 +#define SMU_CTYPE_AMDGPU_METRICS_TYPE_S8 s8 +#define SMU_CTYPE_AMDGPU_METRICS_TYPE_U16 u16 +#define SMU_CTYPE_AMDGPU_METRICS_TYPE_S16 s16 +#define SMU_CTYPE_AMDGPU_METRICS_TYPE_U32 u32 +#define SMU_CTYPE_AMDGPU_METRICS_TYPE_S32 s32 +#define SMU_CTYPE_AMDGPU_METRICS_TYPE_U64 u64 +#define SMU_CTYPE_AMDGPU_METRICS_TYPE_S64 s64 + +/* struct members */ +#define SMU_METRICS_SCALAR(ID, UNIT, TYPEID, NAME) \ + u64 NAME##_ftype; \ + SMU_CTYPE(TYPEID) NAME + +#define SMU_METRICS_ARRAY(ID, UNIT, TYPEID, NAME, SIZE) \ + u64 NAME##_ftype; \ + SMU_CTYPE(TYPEID) NAME[SIZE] + +/* Init functions for scalar/array fields - init to 0xFFs */ +#define SMU_METRICS_INIT_SCALAR(ID, UNIT, TYPEID, NAME) \ + do { \ + obj->NAME##_ftype = \ + AMDGPU_METRICS_ENC_ATTR(UNIT, TYPEID, ID, 1); \ + obj->NAME = (SMU_CTYPE(TYPEID)) ~0; \ + count++; \ + } while (0) + +#define SMU_METRICS_INIT_ARRAY(ID, UNIT, TYPEID, NAME, SIZE) \ + do { \ + obj->NAME##_ftype = \ + AMDGPU_METRICS_ENC_ATTR(UNIT, TYPEID, ID, SIZE); \ + memset(obj->NAME, 0xFF, sizeof(obj->NAME)); \ + count++; \ + } while (0) + +/* Declare Metrics Class 
and Template object */ +#define DECLARE_SMU_METRICS_CLASS(CLASSNAME, SMU_METRICS_FIELD_LIST) \ + struct __packed CLASSNAME { \ + struct metrics_table_header header; \ + int attr_count; \ + SMU_METRICS_FIELD_LIST(SMU_METRICS_SCALAR, SMU_METRICS_ARRAY); \ + }; \ + static inline void CLASSNAME##_init(struct CLASSNAME *obj, \ + uint8_t frev, uint8_t crev) \ + { \ + int count = 0; \ + memset(obj, 0xFF, sizeof(*obj)); \ + obj->header.format_revision = frev; \ + obj->header.content_revision = crev; \ + obj->header.structure_size = sizeof(*obj); \ + SMU_METRICS_FIELD_LIST(SMU_METRICS_INIT_SCALAR, \ + SMU_METRICS_INIT_ARRAY) \ + obj->attr_count = count; \ + } + #endif #endif From b4f748f22d315ae622ebfe068b9559b988702df8 Mon Sep 17 00:00:00 2001 From: Lijo Lazar Date: Tue, 12 Aug 2025 18:26:59 +0530 Subject: [PATCH 15/83] drm/amd/pm: Use gpu metrics 1.9 for SMUv13.0.6 Fill and publish GPU metrics in v1.9 format for SMUv13.0.6 SOCs Signed-off-by: Lijo Lazar Reviewed-by: Asad Kamal Signed-off-by: Alex Deucher --- .../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 113 +++++++++-------- .../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.h | 120 ++++++++++++++++++ 2 files changed, 179 insertions(+), 54 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c index 197fd91e1fb4..6d39b02a3257 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c @@ -550,6 +550,7 @@ static int smu_v13_0_6_tables_init(struct smu_context *smu) struct smu_table_context *smu_table = &smu->smu_table; struct smu_table *tables = smu_table->tables; void *gpu_metrics_table __free(kfree) = NULL; + struct smu_v13_0_6_gpu_metrics *gpu_metrics; void *driver_pptable __free(kfree) = NULL; void *metrics_table __free(kfree) = NULL; struct amdgpu_device *adev = smu->adev; @@ -589,11 +590,22 @@ static int smu_v13_0_6_tables_init(struct smu_context *smu) if (!driver_pptable) return 
-ENOMEM; + ret = smu_table_cache_init(smu, SMU_TABLE_SMU_METRICS, + sizeof(struct smu_v13_0_6_gpu_metrics), 1); + if (ret) + return ret; + + gpu_metrics = (struct smu_v13_0_6_gpu_metrics + *)(tables[SMU_TABLE_SMU_METRICS].cache.buffer); + + smu_v13_0_6_gpu_metrics_init(gpu_metrics, 1, 9); if (amdgpu_ip_version(smu->adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 12)) { ret = smu_v13_0_12_tables_init(smu); - if (ret) + if (ret) { + smu_table_cache_fini(smu, SMU_TABLE_SMU_METRICS); return ret; + } } smu_table->gpu_metrics_table = no_free_ptr(gpu_metrics_table); @@ -732,6 +744,7 @@ static int smu_v13_0_6_fini_smc_tables(struct smu_context *smu) { if (amdgpu_ip_version(smu->adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 12)) smu_v13_0_12_tables_fini(smu); + smu_table_cache_fini(smu, SMU_TABLE_SMU_METRICS); return smu_v13_0_fini_smc_tables(smu); } @@ -2762,18 +2775,16 @@ static ssize_t smu_v13_0_6_get_xcp_metrics(struct smu_context *smu, int xcp_id, static ssize_t smu_v13_0_6_get_gpu_metrics(struct smu_context *smu, void **table) { struct smu_table_context *smu_table = &smu->smu_table; - struct gpu_metrics_v1_8 *gpu_metrics = - (struct gpu_metrics_v1_8 *)smu_table->gpu_metrics_table; + struct smu_table *tables = smu_table->tables; + struct smu_v13_0_6_gpu_metrics *gpu_metrics; int version = smu_v13_0_6_get_metrics_version(smu); MetricsTableV0_t *metrics_v0 __free(kfree) = NULL; - int ret = 0, xcc_id, inst, i, j, k, idx; struct amdgpu_device *adev = smu->adev; + int ret = 0, xcc_id, inst, i, j; MetricsTableV1_t *metrics_v1; MetricsTableV2_t *metrics_v2; - struct amdgpu_xcp *xcp; u16 link_width_level; u8 num_jpeg_rings; - u32 inst_mask; bool per_inst; metrics_v0 = kzalloc(METRICS_TABLE_SIZE, GFP_KERNEL); @@ -2788,8 +2799,8 @@ static ssize_t smu_v13_0_6_get_gpu_metrics(struct smu_context *smu, void **table metrics_v1 = (MetricsTableV1_t *)metrics_v0; metrics_v2 = (MetricsTableV2_t *)metrics_v0; - - smu_cmn_init_soft_gpu_metrics(gpu_metrics, 1, 8); + gpu_metrics = (struct 
smu_v13_0_6_gpu_metrics + *)(tables[SMU_TABLE_SMU_METRICS].cache.buffer); gpu_metrics->temperature_hotspot = SMUQ10_ROUND(GET_METRIC_FIELD(MaxSocketTemperature, version)); @@ -2911,55 +2922,49 @@ static ssize_t smu_v13_0_6_get_gpu_metrics(struct smu_context *smu, void **table gpu_metrics->xgmi_link_status[j] = ret; } - gpu_metrics->num_partition = adev->xcp_mgr->num_xcps; - per_inst = smu_v13_0_6_cap_supported(smu, SMU_CAP(PER_INST_METRICS)); num_jpeg_rings = AMDGPU_MAX_JPEG_RINGS_4_0_3; - for_each_xcp(adev->xcp_mgr, xcp, i) { - amdgpu_xcp_get_inst_details(xcp, AMDGPU_XCP_VCN, &inst_mask); - idx = 0; - for_each_inst(k, inst_mask) { - /* Both JPEG and VCN has same instances */ - inst = GET_INST(VCN, k); + for (i = 0; i < adev->jpeg.num_jpeg_inst; ++i) { + inst = GET_INST(JPEG, i); + for (j = 0; j < num_jpeg_rings; ++j) + gpu_metrics->jpeg_busy[(i * num_jpeg_rings) + j] = + SMUQ10_ROUND(GET_METRIC_FIELD( + JpegBusy, + version)[(inst * num_jpeg_rings) + j]); + } + for (i = 0; i < adev->vcn.num_vcn_inst; ++i) { + inst = GET_INST(VCN, i); + gpu_metrics->vcn_busy[i] = + SMUQ10_ROUND(GET_METRIC_FIELD(VcnBusy, version)[inst]); + } - for (j = 0; j < num_jpeg_rings; ++j) { - gpu_metrics->xcp_stats[i].jpeg_busy - [(idx * num_jpeg_rings) + j] = - SMUQ10_ROUND(GET_METRIC_FIELD(JpegBusy, version) - [(inst * num_jpeg_rings) + j]); - } - gpu_metrics->xcp_stats[i].vcn_busy[idx] = - SMUQ10_ROUND(GET_METRIC_FIELD(VcnBusy, version)[inst]); - idx++; - - } - - if (per_inst) { - amdgpu_xcp_get_inst_details(xcp, AMDGPU_XCP_GFX, &inst_mask); - idx = 0; - for_each_inst(k, inst_mask) { - inst = GET_INST(GC, k); - gpu_metrics->xcp_stats[i].gfx_busy_inst[idx] = - SMUQ10_ROUND(GET_GPU_METRIC_FIELD(GfxBusy, version)[inst]); - gpu_metrics->xcp_stats[i].gfx_busy_acc[idx] = - SMUQ10_ROUND(GET_GPU_METRIC_FIELD(GfxBusyAcc, - version)[inst]); - if (smu_v13_0_6_cap_supported(smu, SMU_CAP(HST_LIMIT_METRICS))) { - gpu_metrics->xcp_stats[i].gfx_below_host_limit_ppt_acc[idx] = - SMUQ10_ROUND - 
(metrics_v0->GfxclkBelowHostLimitPptAcc[inst]); - gpu_metrics->xcp_stats[i].gfx_below_host_limit_thm_acc[idx] = - SMUQ10_ROUND - (metrics_v0->GfxclkBelowHostLimitThmAcc[inst]); - gpu_metrics->xcp_stats[i].gfx_low_utilization_acc[idx] = - SMUQ10_ROUND - (metrics_v0->GfxclkLowUtilizationAcc[inst]); - gpu_metrics->xcp_stats[i].gfx_below_host_limit_total_acc[idx] = - SMUQ10_ROUND - (metrics_v0->GfxclkBelowHostLimitTotalAcc[inst]); - } - idx++; + if (per_inst) { + for (i = 0; i < NUM_XCC(adev->gfx.xcc_mask); ++i) { + inst = GET_INST(GC, i); + gpu_metrics->gfx_busy_inst[i] = SMUQ10_ROUND( + GET_GPU_METRIC_FIELD(GfxBusy, version)[inst]); + gpu_metrics->gfx_busy_acc[i] = SMUQ10_ROUND( + GET_GPU_METRIC_FIELD(GfxBusyAcc, + version)[inst]); + if (smu_v13_0_6_cap_supported( + smu, SMU_CAP(HST_LIMIT_METRICS))) { + gpu_metrics->gfx_below_host_limit_ppt_acc + [i] = SMUQ10_ROUND( + metrics_v0->GfxclkBelowHostLimitPptAcc + [inst]); + gpu_metrics->gfx_below_host_limit_thm_acc + [i] = SMUQ10_ROUND( + metrics_v0->GfxclkBelowHostLimitThmAcc + [inst]); + gpu_metrics->gfx_low_utilization_acc + [i] = SMUQ10_ROUND( + metrics_v0 + ->GfxclkLowUtilizationAcc[inst]); + gpu_metrics->gfx_below_host_limit_total_acc + [i] = SMUQ10_ROUND( + metrics_v0->GfxclkBelowHostLimitTotalAcc + [inst]); } } } @@ -2969,7 +2974,7 @@ static ssize_t smu_v13_0_6_get_gpu_metrics(struct smu_context *smu, void **table gpu_metrics->firmware_timestamp = GET_METRIC_FIELD(Timestamp, version); - *table = (void *)gpu_metrics; + *table = tables[SMU_TABLE_SMU_METRICS].cache.buffer; return sizeof(*gpu_metrics); } diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.h b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.h index 7ef5f3e66c27..3f57e2a33fb4 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.h +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.h @@ -75,6 +75,13 @@ enum smu_v13_0_6_caps { SMU_CAP(ALL), }; +#define SMU_13_0_6_NUM_XGMI_LINKS 8 +#define SMU_13_0_6_MAX_GFX_CLKS 8 +#define 
SMU_13_0_6_MAX_CLKS 4 +#define SMU_13_0_6_MAX_XCC 8 +#define SMU_13_0_6_MAX_VCN 4 +#define SMU_13_0_6_MAX_JPEG 40 + extern void smu_v13_0_6_set_ppt_funcs(struct smu_context *smu); bool smu_v13_0_6_cap_supported(struct smu_context *smu, enum smu_v13_0_6_caps cap); int smu_v13_0_6_get_static_metrics_table(struct smu_context *smu); @@ -99,4 +106,117 @@ int smu_v13_0_12_get_npm_data(struct smu_context *smu, extern const struct cmn2asic_mapping smu_v13_0_12_feature_mask_map[]; extern const struct cmn2asic_msg_mapping smu_v13_0_12_message_map[]; extern const struct smu_temp_funcs smu_v13_0_12_temp_funcs; + +#if defined(SWSMU_CODE_LAYER_L2) +#include "smu_cmn.h" + +/* SMUv 13.0.6 GPU metrics*/ +#define SMU_13_0_6_METRICS_FIELDS(SMU_SCALAR, SMU_ARRAY) \ + SMU_SCALAR(SMU_MATTR(TEMPERATURE_HOTSPOT), SMU_MUNIT(TEMP_1), \ + SMU_MTYPE(U16), temperature_hotspot); \ + SMU_SCALAR(SMU_MATTR(TEMPERATURE_MEM), SMU_MUNIT(TEMP_1), \ + SMU_MTYPE(U16), temperature_mem); \ + SMU_SCALAR(SMU_MATTR(TEMPERATURE_VRSOC), SMU_MUNIT(TEMP_1), \ + SMU_MTYPE(U16), temperature_vrsoc); \ + SMU_SCALAR(SMU_MATTR(CURR_SOCKET_POWER), SMU_MUNIT(POWER_1), \ + SMU_MTYPE(U16), curr_socket_power); \ + SMU_SCALAR(SMU_MATTR(AVERAGE_GFX_ACTIVITY), SMU_MUNIT(PERCENT), \ + SMU_MTYPE(U16), average_gfx_activity); \ + SMU_SCALAR(SMU_MATTR(AVERAGE_UMC_ACTIVITY), SMU_MUNIT(PERCENT), \ + SMU_MTYPE(U16), average_umc_activity); \ + SMU_SCALAR(SMU_MATTR(MEM_MAX_BANDWIDTH), SMU_MUNIT(BW_1), \ + SMU_MTYPE(U64), mem_max_bandwidth); \ + SMU_SCALAR(SMU_MATTR(ENERGY_ACCUMULATOR), SMU_MUNIT(NONE), \ + SMU_MTYPE(U64), energy_accumulator); \ + SMU_SCALAR(SMU_MATTR(SYSTEM_CLOCK_COUNTER), SMU_MUNIT(TIME_1), \ + SMU_MTYPE(U64), system_clock_counter); \ + SMU_SCALAR(SMU_MATTR(ACCUMULATION_COUNTER), SMU_MUNIT(NONE), \ + SMU_MTYPE(U32), accumulation_counter); \ + SMU_SCALAR(SMU_MATTR(PROCHOT_RESIDENCY_ACC), SMU_MUNIT(NONE), \ + SMU_MTYPE(U32), prochot_residency_acc); \ + SMU_SCALAR(SMU_MATTR(PPT_RESIDENCY_ACC), SMU_MUNIT(NONE), \ + 
SMU_MTYPE(U32), ppt_residency_acc); \ + SMU_SCALAR(SMU_MATTR(SOCKET_THM_RESIDENCY_ACC), SMU_MUNIT(NONE), \ + SMU_MTYPE(U32), socket_thm_residency_acc); \ + SMU_SCALAR(SMU_MATTR(VR_THM_RESIDENCY_ACC), SMU_MUNIT(NONE), \ + SMU_MTYPE(U32), vr_thm_residency_acc); \ + SMU_SCALAR(SMU_MATTR(HBM_THM_RESIDENCY_ACC), SMU_MUNIT(NONE), \ + SMU_MTYPE(U32), hbm_thm_residency_acc); \ + SMU_SCALAR(SMU_MATTR(GFXCLK_LOCK_STATUS), SMU_MUNIT(NONE), \ + SMU_MTYPE(U32), gfxclk_lock_status); \ + SMU_SCALAR(SMU_MATTR(PCIE_LINK_WIDTH), SMU_MUNIT(NONE), \ + SMU_MTYPE(U16), pcie_link_width); \ + SMU_SCALAR(SMU_MATTR(PCIE_LINK_SPEED), SMU_MUNIT(SPEED_2), \ + SMU_MTYPE(U16), pcie_link_speed); \ + SMU_SCALAR(SMU_MATTR(XGMI_LINK_WIDTH), SMU_MUNIT(NONE), \ + SMU_MTYPE(U16), xgmi_link_width); \ + SMU_SCALAR(SMU_MATTR(XGMI_LINK_SPEED), SMU_MUNIT(SPEED_1), \ + SMU_MTYPE(U16), xgmi_link_speed); \ + SMU_SCALAR(SMU_MATTR(GFX_ACTIVITY_ACC), SMU_MUNIT(PERCENT), \ + SMU_MTYPE(U32), gfx_activity_acc); \ + SMU_SCALAR(SMU_MATTR(MEM_ACTIVITY_ACC), SMU_MUNIT(PERCENT), \ + SMU_MTYPE(U32), mem_activity_acc); \ + SMU_SCALAR(SMU_MATTR(PCIE_BANDWIDTH_ACC), SMU_MUNIT(PERCENT), \ + SMU_MTYPE(U64), pcie_bandwidth_acc); \ + SMU_SCALAR(SMU_MATTR(PCIE_BANDWIDTH_INST), SMU_MUNIT(BW_1), \ + SMU_MTYPE(U64), pcie_bandwidth_inst); \ + SMU_SCALAR(SMU_MATTR(PCIE_L0_TO_RECOV_COUNT_ACC), SMU_MUNIT(NONE), \ + SMU_MTYPE(U64), pcie_l0_to_recov_count_acc); \ + SMU_SCALAR(SMU_MATTR(PCIE_REPLAY_COUNT_ACC), SMU_MUNIT(NONE), \ + SMU_MTYPE(U64), pcie_replay_count_acc); \ + SMU_SCALAR(SMU_MATTR(PCIE_REPLAY_ROVER_COUNT_ACC), SMU_MUNIT(NONE), \ + SMU_MTYPE(U64), pcie_replay_rover_count_acc); \ + SMU_SCALAR(SMU_MATTR(PCIE_NAK_SENT_COUNT_ACC), SMU_MUNIT(NONE), \ + SMU_MTYPE(U32), pcie_nak_sent_count_acc); \ + SMU_SCALAR(SMU_MATTR(PCIE_NAK_RCVD_COUNT_ACC), SMU_MUNIT(NONE), \ + SMU_MTYPE(U32), pcie_nak_rcvd_count_acc); \ + SMU_ARRAY(SMU_MATTR(XGMI_READ_DATA_ACC), SMU_MUNIT(DATA_1), \ + SMU_MTYPE(U64), xgmi_read_data_acc, \ + 
SMU_13_0_6_NUM_XGMI_LINKS); \ + SMU_ARRAY(SMU_MATTR(XGMI_WRITE_DATA_ACC), SMU_MUNIT(DATA_1), \ + SMU_MTYPE(U64), xgmi_write_data_acc, \ + SMU_13_0_6_NUM_XGMI_LINKS); \ + SMU_ARRAY(SMU_MATTR(XGMI_LINK_STATUS), SMU_MUNIT(NONE), \ + SMU_MTYPE(U16), xgmi_link_status, \ + SMU_13_0_6_NUM_XGMI_LINKS); \ + SMU_SCALAR(SMU_MATTR(FIRMWARE_TIMESTAMP), SMU_MUNIT(TIME_2), \ + SMU_MTYPE(U64), firmware_timestamp); \ + SMU_ARRAY(SMU_MATTR(CURRENT_GFXCLK), SMU_MUNIT(CLOCK_1), \ + SMU_MTYPE(U16), current_gfxclk, SMU_13_0_6_MAX_GFX_CLKS); \ + SMU_ARRAY(SMU_MATTR(CURRENT_SOCCLK), SMU_MUNIT(CLOCK_1), \ + SMU_MTYPE(U16), current_socclk, SMU_13_0_6_MAX_CLKS); \ + SMU_ARRAY(SMU_MATTR(CURRENT_VCLK0), SMU_MUNIT(CLOCK_1), \ + SMU_MTYPE(U16), current_vclk0, SMU_13_0_6_MAX_CLKS); \ + SMU_ARRAY(SMU_MATTR(CURRENT_DCLK0), SMU_MUNIT(CLOCK_1), \ + SMU_MTYPE(U16), current_dclk0, SMU_13_0_6_MAX_CLKS); \ + SMU_SCALAR(SMU_MATTR(CURRENT_UCLK), SMU_MUNIT(CLOCK_1), \ + SMU_MTYPE(U16), current_uclk); \ + SMU_SCALAR(SMU_MATTR(PCIE_LC_PERF_OTHER_END_RECOVERY), \ + SMU_MUNIT(NONE), SMU_MTYPE(U32), \ + pcie_lc_perf_other_end_recovery); \ + SMU_ARRAY(SMU_MATTR(GFX_BUSY_INST), SMU_MUNIT(PERCENT), \ + SMU_MTYPE(U32), gfx_busy_inst, SMU_13_0_6_MAX_XCC); \ + SMU_ARRAY(SMU_MATTR(JPEG_BUSY), SMU_MUNIT(PERCENT), SMU_MTYPE(U16), \ + jpeg_busy, SMU_13_0_6_MAX_JPEG); \ + SMU_ARRAY(SMU_MATTR(VCN_BUSY), SMU_MUNIT(PERCENT), SMU_MTYPE(U16), \ + vcn_busy, SMU_13_0_6_MAX_VCN); \ + SMU_ARRAY(SMU_MATTR(GFX_BUSY_ACC), SMU_MUNIT(PERCENT), SMU_MTYPE(U64), \ + gfx_busy_acc, SMU_13_0_6_MAX_XCC); \ + SMU_ARRAY(SMU_MATTR(GFX_BELOW_HOST_LIMIT_PPT_ACC), SMU_MUNIT(NONE), \ + SMU_MTYPE(U64), gfx_below_host_limit_ppt_acc, \ + SMU_13_0_6_MAX_XCC); \ + SMU_ARRAY(SMU_MATTR(GFX_BELOW_HOST_LIMIT_THM_ACC), SMU_MUNIT(NONE), \ + SMU_MTYPE(U64), gfx_below_host_limit_thm_acc, \ + SMU_13_0_6_MAX_XCC); \ + SMU_ARRAY(SMU_MATTR(GFX_LOW_UTILIZATION_ACC), SMU_MUNIT(NONE), \ + SMU_MTYPE(U64), gfx_low_utilization_acc, \ + SMU_13_0_6_MAX_XCC); \ + 
SMU_ARRAY(SMU_MATTR(GFX_BELOW_HOST_LIMIT_TOTAL_ACC), SMU_MUNIT(NONE), \ + SMU_MTYPE(U64), gfx_below_host_limit_total_acc, \ + SMU_13_0_6_MAX_XCC); + +DECLARE_SMU_METRICS_CLASS(smu_v13_0_6_gpu_metrics, SMU_13_0_6_METRICS_FIELDS); + +#endif /* SWSMU_CODE_LAYER_L2 */ + #endif From 9f1cb2c3fa26d5529976c8bf795c14b7b677ce94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timur=20Krist=C3=B3f?= Date: Thu, 30 Oct 2025 10:15:56 +0100 Subject: [PATCH 16/83] drm/amd/pm/si: Delete unused structs and fields MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The contents of si_dpm.h seem to have been copied from the old radeon driver, including a lot of structs and fields which were only relevant to GPU generations even older than SI. A lot of these can be deleted without causing much churn to the actual SI DPM code. Let's delete them to make the code easier to understand. Signed-off-by: Timur Kristóf Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c | 10 +- drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.h | 557 --------------------- 2 files changed, 1 insertion(+), 566 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c b/drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c index 3a9522c17fee..020e05c137e4 100644 --- a/drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c +++ b/drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c @@ -2558,18 +2558,13 @@ static int si_enable_power_containment(struct amdgpu_device *adev, if (enable) { if (!si_should_disable_uvd_powertune(adev, amdgpu_new_state)) { smc_result = amdgpu_si_send_msg_to_smc(adev, PPSMC_TDPClampingActive); - if (smc_result != PPSMC_Result_OK) { + if (smc_result != PPSMC_Result_OK) ret = -EINVAL; - ni_pi->pc_enabled = false; - } else { - ni_pi->pc_enabled = true; - } } } else { smc_result = amdgpu_si_send_msg_to_smc(adev, PPSMC_TDPClampingInactive); if (smc_result != PPSMC_Result_OK) ret = -EINVAL; - ni_pi->pc_enabled = false; } } @@ -7509,8 +7504,6 @@ static int si_dpm_init(struct 
amdgpu_device *adev) pi->pasi = CYPRESS_HASI_DFLT; pi->vrc = SISLANDS_VRC_DFLT; - pi->gfx_clock_gating = true; - eg_pi->sclk_deep_sleep = true; si_pi->sclk_deep_sleep_above_low = false; @@ -7521,7 +7514,6 @@ static int si_dpm_init(struct amdgpu_device *adev) eg_pi->dynamic_ac_timing = true; - eg_pi->light_sleep = true; #if defined(CONFIG_ACPI) eg_pi->pcie_performance_request = amdgpu_acpi_is_pcie_performance_request_supported(adev); diff --git a/drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.h b/drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.h index 11cb7874a6bb..3aed75fbf913 100644 --- a/drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.h +++ b/drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.h @@ -38,11 +38,7 @@ #define MC_ARB_DRAM_TIMING2_2 0xa00 #define MC_ARB_DRAM_TIMING2_3 0xa01 -#define MAX_NO_OF_MVDD_VALUES 2 -#define MAX_NO_VREG_STEPS 32 #define NISLANDS_MAX_SMC_PERFORMANCE_LEVELS_PER_SWSTATE 16 -#define SMC_NISLANDS_MC_REGISTER_ARRAY_SIZE 32 -#define SMC_NISLANDS_MC_REGISTER_ARRAY_SET_COUNT 20 #define RV770_ASI_DFLT 1000 #define CYPRESS_HASI_DFLT 400000 #define PCIE_PERF_REQ_PECI_GEN1 2 @@ -51,11 +47,6 @@ #define RV770_DEFAULT_VCLK_FREQ 53300 /* 10 khz */ #define RV770_DEFAULT_DCLK_FREQ 40000 /* 10 khz */ -#define SMC_EVERGREEN_MC_REGISTER_ARRAY_SIZE 16 - -#define RV770_SMC_TABLE_ADDRESS 0xB000 -#define RV770_SMC_PERFORMANCE_LEVELS_PER_SWSTATE 3 - #define SMC_STROBE_RATIO 0x0F #define SMC_STROBE_ENABLE 0x10 @@ -64,27 +55,6 @@ #define SMC_MC_RTT_ENABLE 0x04 #define SMC_MC_STUTTER_EN 0x08 -#define RV770_SMC_VOLTAGEMASK_VDDC 0 -#define RV770_SMC_VOLTAGEMASK_MVDD 1 -#define RV770_SMC_VOLTAGEMASK_VDDCI 2 -#define RV770_SMC_VOLTAGEMASK_MAX 4 - -#define NISLANDS_MAX_SMC_PERFORMANCE_LEVELS_PER_SWSTATE 16 -#define NISLANDS_SMC_STROBE_RATIO 0x0F -#define NISLANDS_SMC_STROBE_ENABLE 0x10 - -#define NISLANDS_SMC_MC_EDC_RD_FLAG 0x01 -#define NISLANDS_SMC_MC_EDC_WR_FLAG 0x02 -#define NISLANDS_SMC_MC_RTT_ENABLE 0x04 -#define NISLANDS_SMC_MC_STUTTER_EN 0x08 - -#define MAX_NO_VREG_STEPS 32 - -#define 
NISLANDS_SMC_VOLTAGEMASK_VDDC 0 -#define NISLANDS_SMC_VOLTAGEMASK_MVDD 1 -#define NISLANDS_SMC_VOLTAGEMASK_VDDCI 2 -#define NISLANDS_SMC_VOLTAGEMASK_MAX 4 - #define SISLANDS_MCREGISTERTABLE_INITIAL_SLOT 0 #define SISLANDS_MCREGISTERTABLE_ACPI_SLOT 1 #define SISLANDS_MCREGISTERTABLE_ULV_SLOT 2 @@ -219,32 +189,6 @@ enum si_cac_config_reg_type SISLANDS_CACCONFIG_MAX }; -enum si_power_level { - SI_POWER_LEVEL_LOW = 0, - SI_POWER_LEVEL_MEDIUM = 1, - SI_POWER_LEVEL_HIGH = 2, - SI_POWER_LEVEL_CTXSW = 3, -}; - -enum si_td { - SI_TD_AUTO, - SI_TD_UP, - SI_TD_DOWN, -}; - -enum si_display_watermark { - SI_DISPLAY_WATERMARK_LOW = 0, - SI_DISPLAY_WATERMARK_HIGH = 1, -}; - -enum si_display_gap -{ - SI_PM_DISPLAY_GAP_VBLANK_OR_WM = 0, - SI_PM_DISPLAY_GAP_VBLANK = 1, - SI_PM_DISPLAY_GAP_WATERMARK = 2, - SI_PM_DISPLAY_GAP_IGNORE = 3, -}; - extern const struct amdgpu_ip_block_version si_smu_ip_block; struct ni_leakage_coeffients @@ -258,56 +202,6 @@ struct ni_leakage_coeffients u32 t_ref; }; -struct SMC_Evergreen_MCRegisterAddress -{ - uint16_t s0; - uint16_t s1; -}; - -typedef struct SMC_Evergreen_MCRegisterAddress SMC_Evergreen_MCRegisterAddress; - -struct evergreen_mc_reg_entry { - u32 mclk_max; - u32 mc_data[SMC_EVERGREEN_MC_REGISTER_ARRAY_SIZE]; -}; - -struct evergreen_mc_reg_table { - u8 last; - u8 num_entries; - u16 valid_flag; - struct evergreen_mc_reg_entry mc_reg_table_entry[MAX_AC_TIMING_ENTRIES]; - SMC_Evergreen_MCRegisterAddress mc_reg_address[SMC_EVERGREEN_MC_REGISTER_ARRAY_SIZE]; -}; - -struct SMC_Evergreen_MCRegisterSet -{ - uint32_t value[SMC_EVERGREEN_MC_REGISTER_ARRAY_SIZE]; -}; - -typedef struct SMC_Evergreen_MCRegisterSet SMC_Evergreen_MCRegisterSet; - -struct SMC_Evergreen_MCRegisters -{ - uint8_t last; - uint8_t reserved[3]; - SMC_Evergreen_MCRegisterAddress address[SMC_EVERGREEN_MC_REGISTER_ARRAY_SIZE]; - SMC_Evergreen_MCRegisterSet data[5]; -}; - -typedef struct SMC_Evergreen_MCRegisters SMC_Evergreen_MCRegisters; - -struct SMC_NIslands_MCRegisterSet -{ - 
uint32_t value[SMC_NISLANDS_MC_REGISTER_ARRAY_SIZE]; -}; - -typedef struct SMC_NIslands_MCRegisterSet SMC_NIslands_MCRegisterSet; - -struct ni_mc_reg_entry { - u32 mclk_max; - u32 mc_data[SMC_NISLANDS_MC_REGISTER_ARRAY_SIZE]; -}; - struct SMC_NIslands_MCRegisterAddress { uint16_t s0; @@ -316,257 +210,20 @@ struct SMC_NIslands_MCRegisterAddress typedef struct SMC_NIslands_MCRegisterAddress SMC_NIslands_MCRegisterAddress; -struct SMC_NIslands_MCRegisters -{ - uint8_t last; - uint8_t reserved[3]; - SMC_NIslands_MCRegisterAddress address[SMC_NISLANDS_MC_REGISTER_ARRAY_SIZE]; - SMC_NIslands_MCRegisterSet data[SMC_NISLANDS_MC_REGISTER_ARRAY_SET_COUNT]; -}; - -typedef struct SMC_NIslands_MCRegisters SMC_NIslands_MCRegisters; - -struct evergreen_ulv_param { - bool supported; - struct rv7xx_pl *pl; -}; - -struct evergreen_arb_registers { - u32 mc_arb_dram_timing; - u32 mc_arb_dram_timing2; - u32 mc_arb_rfsh_rate; - u32 mc_arb_burst_time; -}; - -struct at { - u32 rlp; - u32 rmp; - u32 lhp; - u32 lmp; -}; - -struct ni_clock_registers { - u32 cg_spll_func_cntl; - u32 cg_spll_func_cntl_2; - u32 cg_spll_func_cntl_3; - u32 cg_spll_func_cntl_4; - u32 cg_spll_spread_spectrum; - u32 cg_spll_spread_spectrum_2; - u32 mclk_pwrmgt_cntl; - u32 dll_cntl; - u32 mpll_ad_func_cntl; - u32 mpll_ad_func_cntl_2; - u32 mpll_dq_func_cntl; - u32 mpll_dq_func_cntl_2; - u32 mpll_ss1; - u32 mpll_ss2; -}; - -struct RV770_SMC_SCLK_VALUE -{ - uint32_t vCG_SPLL_FUNC_CNTL; - uint32_t vCG_SPLL_FUNC_CNTL_2; - uint32_t vCG_SPLL_FUNC_CNTL_3; - uint32_t vCG_SPLL_SPREAD_SPECTRUM; - uint32_t vCG_SPLL_SPREAD_SPECTRUM_2; - uint32_t sclk_value; -}; - -typedef struct RV770_SMC_SCLK_VALUE RV770_SMC_SCLK_VALUE; - -struct RV770_SMC_MCLK_VALUE -{ - uint32_t vMPLL_AD_FUNC_CNTL; - uint32_t vMPLL_AD_FUNC_CNTL_2; - uint32_t vMPLL_DQ_FUNC_CNTL; - uint32_t vMPLL_DQ_FUNC_CNTL_2; - uint32_t vMCLK_PWRMGT_CNTL; - uint32_t vDLL_CNTL; - uint32_t vMPLL_SS; - uint32_t vMPLL_SS2; - uint32_t mclk_value; -}; - -typedef struct 
RV770_SMC_MCLK_VALUE RV770_SMC_MCLK_VALUE; - - -struct RV730_SMC_MCLK_VALUE -{ - uint32_t vMCLK_PWRMGT_CNTL; - uint32_t vDLL_CNTL; - uint32_t vMPLL_FUNC_CNTL; - uint32_t vMPLL_FUNC_CNTL2; - uint32_t vMPLL_FUNC_CNTL3; - uint32_t vMPLL_SS; - uint32_t vMPLL_SS2; - uint32_t mclk_value; -}; - -typedef struct RV730_SMC_MCLK_VALUE RV730_SMC_MCLK_VALUE; - -struct RV770_SMC_VOLTAGE_VALUE -{ - uint16_t value; - uint8_t index; - uint8_t padding; -}; - -typedef struct RV770_SMC_VOLTAGE_VALUE RV770_SMC_VOLTAGE_VALUE; - -union RV7XX_SMC_MCLK_VALUE -{ - RV770_SMC_MCLK_VALUE mclk770; - RV730_SMC_MCLK_VALUE mclk730; -}; - -typedef union RV7XX_SMC_MCLK_VALUE RV7XX_SMC_MCLK_VALUE, *LPRV7XX_SMC_MCLK_VALUE; - -struct RV770_SMC_HW_PERFORMANCE_LEVEL -{ - uint8_t arbValue; - union{ - uint8_t seqValue; - uint8_t ACIndex; - }; - uint8_t displayWatermark; - uint8_t gen2PCIE; - uint8_t gen2XSP; - uint8_t backbias; - uint8_t strobeMode; - uint8_t mcFlags; - uint32_t aT; - uint32_t bSP; - RV770_SMC_SCLK_VALUE sclk; - RV7XX_SMC_MCLK_VALUE mclk; - RV770_SMC_VOLTAGE_VALUE vddc; - RV770_SMC_VOLTAGE_VALUE mvdd; - RV770_SMC_VOLTAGE_VALUE vddci; - uint8_t reserved1; - uint8_t reserved2; - uint8_t stateFlags; - uint8_t padding; -}; - -typedef struct RV770_SMC_HW_PERFORMANCE_LEVEL RV770_SMC_HW_PERFORMANCE_LEVEL; - -struct RV770_SMC_SWSTATE -{ - uint8_t flags; - uint8_t padding1; - uint8_t padding2; - uint8_t padding3; - RV770_SMC_HW_PERFORMANCE_LEVEL levels[RV770_SMC_PERFORMANCE_LEVELS_PER_SWSTATE]; -}; - -typedef struct RV770_SMC_SWSTATE RV770_SMC_SWSTATE; - -struct RV770_SMC_VOLTAGEMASKTABLE -{ - uint8_t highMask[RV770_SMC_VOLTAGEMASK_MAX]; - uint32_t lowMask[RV770_SMC_VOLTAGEMASK_MAX]; -}; - -typedef struct RV770_SMC_VOLTAGEMASKTABLE RV770_SMC_VOLTAGEMASKTABLE; - -struct RV770_SMC_STATETABLE -{ - uint8_t thermalProtectType; - uint8_t systemFlags; - uint8_t maxVDDCIndexInPPTable; - uint8_t extraFlags; - uint8_t highSMIO[MAX_NO_VREG_STEPS]; - uint32_t lowSMIO[MAX_NO_VREG_STEPS]; - 
RV770_SMC_VOLTAGEMASKTABLE voltageMaskTable; - RV770_SMC_SWSTATE initialState; - RV770_SMC_SWSTATE ACPIState; - RV770_SMC_SWSTATE driverState; - RV770_SMC_SWSTATE ULVState; -}; - -typedef struct RV770_SMC_STATETABLE RV770_SMC_STATETABLE; - -struct vddc_table_entry { - u16 vddc; - u8 vddc_index; - u8 high_smio; - u32 low_smio; -}; - -struct rv770_clock_registers { - u32 cg_spll_func_cntl; - u32 cg_spll_func_cntl_2; - u32 cg_spll_func_cntl_3; - u32 cg_spll_spread_spectrum; - u32 cg_spll_spread_spectrum_2; - u32 mpll_ad_func_cntl; - u32 mpll_ad_func_cntl_2; - u32 mpll_dq_func_cntl; - u32 mpll_dq_func_cntl_2; - u32 mclk_pwrmgt_cntl; - u32 dll_cntl; - u32 mpll_ss1; - u32 mpll_ss2; -}; - -struct rv730_clock_registers { - u32 cg_spll_func_cntl; - u32 cg_spll_func_cntl_2; - u32 cg_spll_func_cntl_3; - u32 cg_spll_spread_spectrum; - u32 cg_spll_spread_spectrum_2; - u32 mclk_pwrmgt_cntl; - u32 dll_cntl; - u32 mpll_func_cntl; - u32 mpll_func_cntl2; - u32 mpll_func_cntl3; - u32 mpll_ss; - u32 mpll_ss2; -}; - -union r7xx_clock_registers { - struct rv770_clock_registers rv770; - struct rv730_clock_registers rv730; -}; - struct rv7xx_power_info { /* flags */ - bool mem_gddr5; - bool pcie_gen2; - bool dynamic_pcie_gen2; - bool acpi_pcie_gen2; - bool boot_in_gen2; bool voltage_control; /* vddc */ bool mvdd_control; bool sclk_ss; bool mclk_ss; bool dynamic_ss; - bool gfx_clock_gating; - bool mg_clock_gating; - bool mgcgtssm; - bool power_gating; bool thermal_protection; - bool display_gap; - bool dcodt; - bool ulps; - /* registers */ - union r7xx_clock_registers clk_regs; - u32 s0_vid_lower_smio_cntl; /* voltage */ - u32 vddc_mask_low; - u32 mvdd_mask_low; u32 mvdd_split_frequency; - u32 mvdd_low_smio[MAX_NO_OF_MVDD_VALUES]; u16 max_vddc; u16 max_vddc_in_table; u16 min_vddc_in_table; - struct vddc_table_entry vddc_table[MAX_NO_VREG_STEPS]; - u8 valid_vddc_entries; - /* dc odt */ - u32 mclk_odt_threshold; - u8 odt_value_0[2]; - u8 odt_value_1[2]; /* stored values */ - u32 boot_sclk; 
u16 acpi_vddc; u32 ref_div; u32 active_auto_throttle_sources; @@ -582,17 +239,6 @@ struct rv7xx_power_info { u32 asi; u32 pasi; u32 vrc; - u32 restricted_levels; - u32 rlp; - u32 rmp; - u32 lhp; - u32 lmp; - /* smc offsets */ - u16 state_table_start; - u16 soft_regs_start; - u16 sram_end; - /* scratch structs */ - RV770_SMC_STATETABLE smc_statetable; }; enum si_pcie_gen { @@ -611,44 +257,12 @@ struct rv7xx_pl { enum si_pcie_gen pcie_gen; /* si+ only */ }; -struct rv7xx_ps { - struct rv7xx_pl high; - struct rv7xx_pl medium; - struct rv7xx_pl low; - bool dc_compatible; -}; - struct si_ps { u16 performance_level_count; bool dc_compatible; struct rv7xx_pl performance_levels[NISLANDS_MAX_SMC_PERFORMANCE_LEVELS_PER_SWSTATE]; }; -struct ni_mc_reg_table { - u8 last; - u8 num_entries; - u16 valid_flag; - struct ni_mc_reg_entry mc_reg_table_entry[MAX_AC_TIMING_ENTRIES]; - SMC_NIslands_MCRegisterAddress mc_reg_address[SMC_NISLANDS_MC_REGISTER_ARRAY_SIZE]; -}; - -struct ni_cac_data -{ - struct ni_leakage_coeffients leakage_coefficients; - u32 i_leakage; - s32 leakage_minimum_temperature; - u32 pwr_const; - u32 dc_cac_value; - u32 bif_cac_value; - u32 lkge_pwr; - u8 mc_wr_weight; - u8 mc_rd_weight; - u8 allow_ovrflw; - u8 num_win_tdp; - u8 l2num_win_tdp; - u8 lts_truncate_n; -}; - struct evergreen_power_info { /* must be first! 
*/ struct rv7xx_power_info rv7xx; @@ -657,203 +271,33 @@ struct evergreen_power_info { bool dynamic_ac_timing; bool abm; bool mcls; - bool light_sleep; - bool memory_transition; bool pcie_performance_request; - bool pcie_performance_request_registered; bool sclk_deep_sleep; - bool dll_default_on; - bool ls_clock_gating; bool smu_uvd_hs; bool uvd_enabled; /* stored values */ u16 acpi_vddci; - u8 mvdd_high_index; - u8 mvdd_low_index; u32 mclk_edc_wr_enable_threshold; - struct evergreen_mc_reg_table mc_reg_table; struct atom_voltage_table vddc_voltage_table; struct atom_voltage_table vddci_voltage_table; - struct evergreen_arb_registers bootup_arb_registers; - struct evergreen_ulv_param ulv; - struct at ats[2]; - /* smc offsets */ - u16 mc_reg_table_start; struct amdgpu_ps current_rps; - struct rv7xx_ps current_ps; struct amdgpu_ps requested_rps; - struct rv7xx_ps requested_ps; }; -struct PP_NIslands_Dpm2PerfLevel -{ - uint8_t MaxPS; - uint8_t TgtAct; - uint8_t MaxPS_StepInc; - uint8_t MaxPS_StepDec; - uint8_t PSST; - uint8_t NearTDPDec; - uint8_t AboveSafeInc; - uint8_t BelowSafeInc; - uint8_t PSDeltaLimit; - uint8_t PSDeltaWin; - uint8_t Reserved[6]; -}; - -typedef struct PP_NIslands_Dpm2PerfLevel PP_NIslands_Dpm2PerfLevel; - -struct PP_NIslands_DPM2Parameters -{ - uint32_t TDPLimit; - uint32_t NearTDPLimit; - uint32_t SafePowerLimit; - uint32_t PowerBoostLimit; -}; -typedef struct PP_NIslands_DPM2Parameters PP_NIslands_DPM2Parameters; - -struct NISLANDS_SMC_SCLK_VALUE -{ - uint32_t vCG_SPLL_FUNC_CNTL; - uint32_t vCG_SPLL_FUNC_CNTL_2; - uint32_t vCG_SPLL_FUNC_CNTL_3; - uint32_t vCG_SPLL_FUNC_CNTL_4; - uint32_t vCG_SPLL_SPREAD_SPECTRUM; - uint32_t vCG_SPLL_SPREAD_SPECTRUM_2; - uint32_t sclk_value; -}; - -typedef struct NISLANDS_SMC_SCLK_VALUE NISLANDS_SMC_SCLK_VALUE; - -struct NISLANDS_SMC_MCLK_VALUE -{ - uint32_t vMPLL_FUNC_CNTL; - uint32_t vMPLL_FUNC_CNTL_1; - uint32_t vMPLL_FUNC_CNTL_2; - uint32_t vMPLL_AD_FUNC_CNTL; - uint32_t vMPLL_AD_FUNC_CNTL_2; - uint32_t 
vMPLL_DQ_FUNC_CNTL; - uint32_t vMPLL_DQ_FUNC_CNTL_2; - uint32_t vMCLK_PWRMGT_CNTL; - uint32_t vDLL_CNTL; - uint32_t vMPLL_SS; - uint32_t vMPLL_SS2; - uint32_t mclk_value; -}; - -typedef struct NISLANDS_SMC_MCLK_VALUE NISLANDS_SMC_MCLK_VALUE; - -struct NISLANDS_SMC_VOLTAGE_VALUE -{ - uint16_t value; - uint8_t index; - uint8_t padding; -}; - -typedef struct NISLANDS_SMC_VOLTAGE_VALUE NISLANDS_SMC_VOLTAGE_VALUE; - -struct NISLANDS_SMC_HW_PERFORMANCE_LEVEL -{ - uint8_t arbValue; - uint8_t ACIndex; - uint8_t displayWatermark; - uint8_t gen2PCIE; - uint8_t reserved1; - uint8_t reserved2; - uint8_t strobeMode; - uint8_t mcFlags; - uint32_t aT; - uint32_t bSP; - NISLANDS_SMC_SCLK_VALUE sclk; - NISLANDS_SMC_MCLK_VALUE mclk; - NISLANDS_SMC_VOLTAGE_VALUE vddc; - NISLANDS_SMC_VOLTAGE_VALUE mvdd; - NISLANDS_SMC_VOLTAGE_VALUE vddci; - NISLANDS_SMC_VOLTAGE_VALUE std_vddc; - uint32_t powergate_en; - uint8_t hUp; - uint8_t hDown; - uint8_t stateFlags; - uint8_t arbRefreshState; - uint32_t SQPowerThrottle; - uint32_t SQPowerThrottle_2; - uint32_t reserved[2]; - PP_NIslands_Dpm2PerfLevel dpm2; -}; - -typedef struct NISLANDS_SMC_HW_PERFORMANCE_LEVEL NISLANDS_SMC_HW_PERFORMANCE_LEVEL; - -struct NISLANDS_SMC_SWSTATE -{ - uint8_t flags; - uint8_t levelCount; - uint8_t padding2; - uint8_t padding3; - NISLANDS_SMC_HW_PERFORMANCE_LEVEL levels[]; -}; - -typedef struct NISLANDS_SMC_SWSTATE NISLANDS_SMC_SWSTATE; - -struct NISLANDS_SMC_VOLTAGEMASKTABLE -{ - uint8_t highMask[NISLANDS_SMC_VOLTAGEMASK_MAX]; - uint32_t lowMask[NISLANDS_SMC_VOLTAGEMASK_MAX]; -}; - -typedef struct NISLANDS_SMC_VOLTAGEMASKTABLE NISLANDS_SMC_VOLTAGEMASKTABLE; - -#define NISLANDS_MAX_NO_VREG_STEPS 32 - -struct NISLANDS_SMC_STATETABLE -{ - uint8_t thermalProtectType; - uint8_t systemFlags; - uint8_t maxVDDCIndexInPPTable; - uint8_t extraFlags; - uint8_t highSMIO[NISLANDS_MAX_NO_VREG_STEPS]; - uint32_t lowSMIO[NISLANDS_MAX_NO_VREG_STEPS]; - NISLANDS_SMC_VOLTAGEMASKTABLE voltageMaskTable; - PP_NIslands_DPM2Parameters 
dpm2Params; - NISLANDS_SMC_SWSTATE initialState; - NISLANDS_SMC_SWSTATE ACPIState; - NISLANDS_SMC_SWSTATE ULVState; - NISLANDS_SMC_SWSTATE driverState; - NISLANDS_SMC_HW_PERFORMANCE_LEVEL dpmLevels[NISLANDS_MAX_SMC_PERFORMANCE_LEVELS_PER_SWSTATE - 1]; -}; - -typedef struct NISLANDS_SMC_STATETABLE NISLANDS_SMC_STATETABLE; - struct ni_power_info { /* must be first! */ struct evergreen_power_info eg; - struct ni_clock_registers clock_registers; - struct ni_mc_reg_table mc_reg_table; u32 mclk_rtt_mode_threshold; /* flags */ - bool use_power_boost_limit; bool support_cac_long_term_average; bool cac_enabled; bool cac_configuration_required; bool driver_calculate_cac_leakage; - bool pc_enabled; bool enable_power_containment; bool enable_cac; bool enable_sq_ramping; - /* smc offsets */ - u16 arb_table_start; - u16 fan_table_start; - u16 cac_table_start; - u16 spll_table_start; - /* CAC stuff */ - struct ni_cac_data cac_data; - u32 dc_cac_table[NISLANDS_DCCAC_MAX_LEVELS]; - const struct ni_cac_weights *cac_weights; - u8 lta_window_size; - u8 lts_truncate; struct si_ps current_ps; struct si_ps requested_ps; - /* scratch structs */ - SMC_NIslands_MCRegisters smc_mc_reg_table; - NISLANDS_SMC_STATETABLE smc_statetable; }; struct si_cac_config_reg @@ -952,7 +396,6 @@ struct si_leakage_voltage struct si_leakage_voltage_entry entries[SISLANDS_MAX_LEAKAGE_COUNT]; }; - struct si_ulv_param { bool supported; u32 cg_ulv_control; From 5d7ccf080cc43b1fe96c2f38bcb61dafce398e5b Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Thu, 30 Oct 2025 09:56:31 +0800 Subject: [PATCH 17/83] drm/amd/display: remove unneeded semicolon No functional modification involved. ./drivers/gpu/drm/amd/display/dc/resource/dcn401/dcn401_resource.c:1674:3-4: Unneeded semicolon. 
Reported-by: Abaci Robot Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=26821 Signed-off-by: Jiapeng Chong Signed-off-by: Alex Deucher --- .../gpu/drm/amd/display/dc/resource/dcn401/dcn401_resource.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/resource/dcn401/dcn401_resource.c b/drivers/gpu/drm/amd/display/dc/resource/dcn401/dcn401_resource.c index 130058d7a70c..875ae97489d3 100644 --- a/drivers/gpu/drm/amd/display/dc/resource/dcn401/dcn401_resource.c +++ b/drivers/gpu/drm/amd/display/dc/resource/dcn401/dcn401_resource.c @@ -1671,7 +1671,7 @@ enum dc_status dcn401_validate_bandwidth(struct dc *dc, dc_state_set_stream_cursor_subvp_limit(stream, context, true); status = DC_FAIL_HW_CURSOR_SUPPORT; } - }; + } } if (validate_mode == DC_VALIDATE_MODE_AND_PROGRAMMING && status == DC_FAIL_HW_CURSOR_SUPPORT) { From 26c288c1709aa638fc8c8d57f03f1739a0c3317f Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Thu, 30 Oct 2025 09:56:30 +0800 Subject: [PATCH 18/83] drm/amd/display: remove unneeded semicolon No functional modification involved. ./drivers/gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.c:1850:3-4: Unneeded semicolon. 
Reported-by: Abaci Robot Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=26821 Signed-off-by: Jiapeng Chong Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.c b/drivers/gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.c index 81e64e17d0cb..df0b664c0cd2 100644 --- a/drivers/gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.c +++ b/drivers/gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.c @@ -1847,7 +1847,7 @@ enum dc_status dcn32_validate_bandwidth(struct dc *dc, dc_state_set_stream_cursor_subvp_limit(stream, context, true); status = DC_FAIL_HW_CURSOR_SUPPORT; } - }; + } } if (validate_mode == DC_VALIDATE_MODE_AND_PROGRAMMING && status == DC_FAIL_HW_CURSOR_SUPPORT) { From 2a30ff5bc4954fb1bc882d73a48b9b9cfabf55ce Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Thu, 30 Oct 2025 09:56:29 +0800 Subject: [PATCH 19/83] drm/amd/display: remove unneeded semicolon No functional modification involved. ./drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c:7392:3-4: Unneeded semicolon. 
Reported-by: Abaci Robot Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=26821 Signed-off-by: Jiapeng Chong Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index b798154768c8..8be4c5a36fd3 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -7389,7 +7389,7 @@ int amdgpu_dm_connector_atomic_set_property(struct drm_connector *connector, default: dm_new_state->abm_sysfs_forbidden = true; dm_new_state->abm_level = val; - }; + } ret = 0; } From 399299d81f3d16e50d641a0ac454e7d4737cf08c Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Thu, 30 Oct 2025 15:37:37 +0100 Subject: [PATCH 20/83] MAINTAINERS: adjust file entry in AMD DISPLAY CORE - DML Commit e6a8a000cfe6 ("drm/amd/display: Rename dml2 to dml2_0 folder") renames the directory dml2 to dml2_0 in ./drivers/gpu/drm/amd/display/dc, but misses to adjust the file entry in AMD DISPLAY CORE - DML. Adjust the file entry after this directory renaming. 
Signed-off-by: Lukas Bulwahn Signed-off-by: Alex Deucher --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 1083598bb2b6..9bccfb4d3bca 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1080,7 +1080,7 @@ M: Austin Zheng M: Jun Lei S: Supported F: drivers/gpu/drm/amd/display/dc/dml/ -F: drivers/gpu/drm/amd/display/dc/dml2/ +F: drivers/gpu/drm/amd/display/dc/dml2_0/ AMD FAM15H PROCESSOR POWER MONITORING DRIVER M: Huang Rui From bfdffc29956eea59520b32308d8db0dbc7f6deb9 Mon Sep 17 00:00:00 2001 From: Xiang Liu Date: Mon, 20 Oct 2025 16:44:29 +0800 Subject: [PATCH 21/83] drm/amd/ras: Correct info field of bad page threshold exceed CPER Correct valid_bits and ms_chk_bits of section info field for bad page threshold exceed CPER to match OOB's behavior. Signed-off-by: Xiang Liu Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/ras/rascore/ras_cper.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_cper.c b/drivers/gpu/drm/amd/ras/rascore/ras_cper.c index 2343991adccf..c2e9fa7c3e6c 100644 --- a/drivers/gpu/drm/amd/ras/rascore/ras_cper.c +++ b/drivers/gpu/drm/amd/ras/rascore/ras_cper.c @@ -65,7 +65,6 @@ static void fill_section_hdr(struct ras_core_context *ras_core, hdr->error_severity = sev; hdr->valid_bits.platform_id = 1; - hdr->valid_bits.partition_id = 1; hdr->valid_bits.timestamp = 1; ras_core_get_device_system_info(ras_core, &dev_info); @@ -147,13 +146,19 @@ static int fill_section_fatal(struct ras_core_context *ras_core, } static int fill_section_runtime(struct ras_core_context *ras_core, - struct cper_section_runtime *runtime, struct ras_log_info *trace) + struct cper_section_runtime *runtime, struct ras_log_info *trace, + enum ras_cper_severity sev) { runtime->hdr.valid_bits.err_info_cnt = 1; runtime->hdr.valid_bits.err_context_cnt = 1; runtime->descriptor.error_type = RUNTIME; 
runtime->descriptor.ms_chk_bits.err_type_valid = 1; + if (sev == RAS_CPER_SEV_RMA) { + runtime->descriptor.valid_bits.ms_chk = 1; + runtime->descriptor.ms_chk_bits.err_type = 1; + runtime->descriptor.ms_chk_bits.pcc = 1; + } runtime->reg.reg_ctx_type = CPER_CTX_TYPE__CRASH; runtime->reg.reg_arr_size = sizeof(runtime->reg.reg_dump); @@ -189,7 +194,7 @@ static int cper_generate_runtime_record(struct ras_core_context *ras_core, fill_section_descriptor(ras_core, descriptor, sev, RUNTIME, RAS_NONSTD_SEC_OFFSET(hdr->sec_cnt, i), sizeof(struct cper_section_runtime)); - fill_section_runtime(ras_core, runtime, trace_arr[i]); + fill_section_runtime(ras_core, runtime, trace_arr[i], sev); } return 0; From 988fd51e45fce61ae8283ee47488fc9df31b57cf Mon Sep 17 00:00:00 2001 From: Xiang Liu Date: Thu, 23 Oct 2025 10:10:02 +0800 Subject: [PATCH 22/83] drm/amd/ras: Use correct severity for BP threshold exceed event The severity of CPER for BP threshold exceed event should be set as FATAL to match the OOB implementation. Signed-off-by: Xiang Liu Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/ras/rascore/ras_cper.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_cper.c b/drivers/gpu/drm/amd/ras/rascore/ras_cper.c index c2e9fa7c3e6c..3c5bfa1c93f6 100644 --- a/drivers/gpu/drm/amd/ras/rascore/ras_cper.c +++ b/drivers/gpu/drm/amd/ras/rascore/ras_cper.c @@ -62,7 +62,7 @@ static void fill_section_hdr(struct ras_core_context *ras_core, hdr->signature[3] = 'R'; hdr->revision = CPER_HDR__REV_1; hdr->signature_end = 0xFFFFFFFF; - hdr->error_severity = sev; + hdr->error_severity = (sev == RAS_CPER_SEV_RMA ? 
RAS_CPER_SEV_FATAL_UE : sev); hdr->valid_bits.platform_id = 1; hdr->valid_bits.timestamp = 1; @@ -115,7 +115,7 @@ static int fill_section_descriptor(struct ras_core_context *ras_core, descriptor->sec_length = section_length; descriptor->valid_bits.fru_text = 1; descriptor->flag_bits.primary = 1; - descriptor->severity = sev; + descriptor->severity = (sev == RAS_CPER_SEV_RMA ? RAS_CPER_SEV_FATAL_UE : sev); descriptor->sec_type = sec_type; ras_core_get_device_system_info(ras_core, &dev_info); From 960e30a61e1a7ca5341a6cf9481e770e1cda24aa Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Sat, 25 Oct 2025 23:29:36 -0500 Subject: [PATCH 23/83] drm/amdgpu: Drop PMFW RLC notifier from amdgpu_device_suspend() For S3 on vangogh, PMFW needs to be notified before the driver powers down RLC. This already happens in smu_disable_dpms() so drop the superfluous call in amdgpu_device_suspend(). Co-developed-by: Mario Limonciello (AMD) Reviewed-by: Alex Deucher Signed-off-by: Mario Limonciello (AMD) Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 ---- drivers/gpu/drm/amd/pm/amdgpu_dpm.c | 18 ------------------ drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h | 2 -- 3 files changed, 24 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index b385e086e6c2..9f67c53633aa 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -5279,10 +5279,6 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients) if (amdgpu_sriov_vf(adev)) amdgpu_virt_release_full_gpu(adev, false); - r = amdgpu_dpm_notify_rlc_state(adev, false); - if (r) - return r; - return 0; } diff --git a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c index 5d08dc3b7110..5c4d0eb198c4 100644 --- a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c +++ b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c @@ -195,24 +195,6 @@ int amdgpu_dpm_set_mp1_state(struct amdgpu_device *adev, 
return ret; } -int amdgpu_dpm_notify_rlc_state(struct amdgpu_device *adev, bool en) -{ - int ret = 0; - const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs; - - if (pp_funcs && pp_funcs->notify_rlc_state) { - mutex_lock(&adev->pm.mutex); - - ret = pp_funcs->notify_rlc_state( - adev->powerplay.pp_handle, - en); - - mutex_unlock(&adev->pm.mutex); - } - - return ret; -} - int amdgpu_dpm_is_baco_supported(struct amdgpu_device *adev) { const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs; diff --git a/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h b/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h index 3bce74f8bb0a..c7ea29385682 100644 --- a/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h +++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h @@ -424,8 +424,6 @@ int amdgpu_dpm_mode1_reset(struct amdgpu_device *adev); int amdgpu_dpm_set_mp1_state(struct amdgpu_device *adev, enum pp_mp1_state mp1_state); -int amdgpu_dpm_notify_rlc_state(struct amdgpu_device *adev, bool en); - int amdgpu_dpm_set_gfx_power_up_by_imu(struct amdgpu_device *adev); int amdgpu_dpm_baco_exit(struct amdgpu_device *adev); From 6f4208f9d992104fed0854f0b975a31ffb9c0d46 Mon Sep 17 00:00:00 2001 From: "Mario Limonciello (AMD)" Date: Sat, 25 Oct 2025 23:29:37 -0500 Subject: [PATCH 24/83] drm/amd: Add an unwind for failures in amdgpu_device_ip_suspend_phase1() If any hardware IPs involved with the first phase of suspend fail, unwind all steps to restore back to original state. 
Signed-off-by: Mario Limonciello (AMD) Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 9f67c53633aa..227ac9370175 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -178,6 +178,7 @@ struct amdgpu_init_level amdgpu_init_minimal_xgmi = { BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) | BIT(AMD_IP_BLOCK_TYPE_PSP) }; +static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev); static void amdgpu_device_load_switch_state(struct amdgpu_device *adev); @@ -3784,7 +3785,7 @@ static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) */ static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) { - int i, r; + int i, r, rec; amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); @@ -3807,10 +3808,23 @@ static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); if (r) - return r; + goto unwind; } return 0; +unwind: + rec = amdgpu_device_ip_resume_phase3(adev); + if (rec) + dev_err(adev->dev, + "amdgpu_device_ip_resume_phase3 failed during unwind: %d\n", + rec); + + amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW); + + amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); + amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); + + return r; } /** From 1d611218729dd7ab7bbc05e57ace5d4fa224b9bb Mon Sep 17 00:00:00 2001 From: "Mario Limonciello (AMD)" Date: Sat, 25 Oct 2025 23:29:38 -0500 Subject: [PATCH 25/83] drm/amd: Add an unwind for failures in amdgpu_device_ip_suspend_phase2() If any hardware IPs involved with the second phase of suspend fail, unwind all steps to restore back to original state. 
Signed-off-by: Mario Limonciello (AMD) Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 36 ++++++++++++++++++++-- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 227ac9370175..6a8dfe15617a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -178,6 +178,9 @@ struct amdgpu_init_level amdgpu_init_minimal_xgmi = { BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) | BIT(AMD_IP_BLOCK_TYPE_PSP) }; + +static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev); +static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev); static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev); static void amdgpu_device_load_switch_state(struct amdgpu_device *adev); @@ -3840,7 +3843,7 @@ static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) */ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) { - int i, r; + int i, r, rec; if (adev->in_s0ix) amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); @@ -3903,7 +3906,7 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); if (r) - return r; + goto unwind; /* handle putting the SMC in the appropriate state */ if (!amdgpu_sriov_vf(adev)) { @@ -3913,13 +3916,40 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) dev_err(adev->dev, "SMC failed to set mp1 state %d, %d\n", adev->mp1_state, r); - return r; + goto unwind; } } } } return 0; +unwind: + /* suspend phase 2 = resume phase 1 + resume phase 2 */ + rec = amdgpu_device_ip_resume_phase1(adev); + if (rec) { + dev_err(adev->dev, + "amdgpu_device_ip_resume_phase1 failed during unwind: %d\n", + rec); + return r; + } + + rec = amdgpu_device_fw_loading(adev); + if (rec) { + dev_err(adev->dev, + 
"amdgpu_device_fw_loading failed during unwind: %d\n", + rec); + return r; + } + + rec = amdgpu_device_ip_resume_phase2(adev); + if (rec) { + dev_err(adev->dev, + "amdgpu_device_ip_resume_phase2 failed during unwind: %d\n", + rec); + return r; + } + + return r; } /** From 72b0b75d608b54f7bb94ee12737c4a62d95bce4a Mon Sep 17 00:00:00 2001 From: "Mario Limonciello (AMD)" Date: Sat, 25 Oct 2025 23:29:39 -0500 Subject: [PATCH 26/83] drm/amd: Unwind for failed device suspend If device suspend has failed, add a recovery flow that will attempt to unwind the suspend and get things back up and running. Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/4627 Signed-off-by: Mario Limonciello (AMD) Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 65 +++++++++++++++++++--- 1 file changed, 58 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 6a8dfe15617a..6f574f674f49 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -5273,7 +5273,7 @@ void amdgpu_device_complete(struct drm_device *dev) int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients) { struct amdgpu_device *adev = drm_to_adev(dev); - int r = 0; + int r, rec; if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) return 0; @@ -5289,8 +5289,9 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients) return r; } - if (amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D3)) - dev_warn(adev->dev, "smart shift update failed\n"); + r = amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D3); + if (r) + goto unwind_sriov; if (notify_clients) drm_client_dev_suspend(adev_to_drm(adev)); @@ -5301,16 +5302,16 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients) r = amdgpu_device_ip_suspend_phase1(adev); if (r) - return r; + goto unwind_smartshift; amdgpu_amdkfd_suspend(adev, 
!amdgpu_sriov_vf(adev) && !adev->in_runpm); r = amdgpu_userq_suspend(adev); if (r) - return r; + goto unwind_ip_phase1; r = amdgpu_device_evict_resources(adev); if (r) - return r; + goto unwind_userq; amdgpu_ttm_set_buffer_funcs_status(adev, false); @@ -5318,12 +5319,62 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients) r = amdgpu_device_ip_suspend_phase2(adev); if (r) - return r; + goto unwind_evict; if (amdgpu_sriov_vf(adev)) amdgpu_virt_release_full_gpu(adev, false); return 0; + +unwind_evict: + if (adev->mman.buffer_funcs_ring->sched.ready) + amdgpu_ttm_set_buffer_funcs_status(adev, true); + amdgpu_fence_driver_hw_init(adev); + +unwind_userq: + rec = amdgpu_userq_resume(adev); + if (rec) { + dev_warn(adev->dev, "failed to re-initialize user queues: %d\n", rec); + return r; + } + rec = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm); + if (rec) { + dev_warn(adev->dev, "failed to re-initialize kfd: %d\n", rec); + return r; + } + +unwind_ip_phase1: + /* suspend phase 1 = resume phase 3 */ + rec = amdgpu_device_ip_resume_phase3(adev); + if (rec) { + dev_warn(adev->dev, "failed to re-initialize IPs phase1: %d\n", rec); + return r; + } + +unwind_smartshift: + rec = amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D0); + if (rec) { + dev_warn(adev->dev, "failed to re-update smart shift: %d\n", rec); + return r; + } + + if (notify_clients) + drm_client_dev_resume(adev_to_drm(adev)); + + amdgpu_ras_resume(adev); + +unwind_sriov: + if (amdgpu_sriov_vf(adev)) { + rec = amdgpu_virt_request_full_gpu(adev, true); + if (rec) { + dev_warn(adev->dev, "failed to reinitialize sriov: %d\n", rec); + return r; + } + } + + adev->in_suspend = adev->in_s0ix = adev->in_s3 = false; + + return r; } static inline int amdgpu_virt_resume(struct amdgpu_device *adev) From ff7644faf3529d186861b02957bf9c6b2fceb4c0 Mon Sep 17 00:00:00 2001 From: Sunday Clement Date: Mon, 27 Oct 2025 14:00:59 -0400 Subject: [PATCH 27/83] drm/amdkfd: Fix Unchecked 
Return Values Properly Check for return values from calls to debug functions in runtime_disable(). v2: storing the last non zero returned value from the loop. Signed-off-by: Sunday Clement Reviewed-by: Jonathan Kim Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index 0f0719528bcc..22925df6a791 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -2826,7 +2826,7 @@ static int runtime_enable(struct kfd_process *p, uint64_t r_debug, static int runtime_disable(struct kfd_process *p) { - int i = 0, ret; + int i = 0, ret = 0; bool was_enabled = p->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED; p->runtime_info.runtime_state = DEBUG_RUNTIME_STATE_DISABLED; @@ -2863,6 +2863,7 @@ static int runtime_disable(struct kfd_process *p) /* disable ttmp setup */ for (i = 0; i < p->n_pdds; i++) { struct kfd_process_device *pdd = p->pdds[i]; + int last_err = 0; if (kfd_dbg_is_per_vmid_supported(pdd->dev)) { pdd->spi_dbg_override = @@ -2872,14 +2873,17 @@ static int runtime_disable(struct kfd_process *p) pdd->dev->vm_info.last_vmid_kfd); if (!pdd->dev->kfd->shared_resources.enable_mes) - debug_refresh_runlist(pdd->dev->dqm); + last_err = debug_refresh_runlist(pdd->dev->dqm); else - kfd_dbg_set_mes_debug_mode(pdd, + last_err = kfd_dbg_set_mes_debug_mode(pdd, !kfd_dbg_has_cwsr_workaround(pdd->dev)); + + if (last_err) + ret = last_err; } } - return 0; + return ret; } static int kfd_ioctl_runtime_enable(struct file *filep, struct kfd_process *p, void *data) From 825df7ff4bb1a383ad4827545e09aec60d230770 Mon Sep 17 00:00:00 2001 From: "Mario Limonciello (AMD)" Date: Thu, 30 Oct 2025 14:39:43 -0500 Subject: [PATCH 28/83] drm/amd/display: Don't stretch non-native images by default in eDP commit 978fa2f6d0b12 ("drm/amd/display: Use scaling for 
non-native resolutions on eDP") started using the GPU scaler hardware to scale when a non-native resolution was picked on eDP. This scaling was done to fill the screen instead of maintain aspect ratio. The idea was supposed to be that if a different scaling behavior is preferred then the compositor would request it. The not following aspect ratio behavior however isn't desirable, so adjust it to follow aspect ratio and still try to fill screen. Note: This will lead to black bars in some cases for non-native resolutions. Compositors can request the previous behavior if desired. Fixes: 978fa2f6d0b1 ("drm/amd/display: Use scaling for non-native resolutions on eDP") Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/4538 Signed-off-by: Mario Limonciello (AMD) Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 8be4c5a36fd3..5f090c13f224 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -8216,7 +8216,7 @@ static int dm_encoder_helper_atomic_check(struct drm_encoder *encoder, "mode %dx%d@%dHz is not native, enabling scaling\n", adjusted_mode->hdisplay, adjusted_mode->vdisplay, drm_mode_vrefresh(adjusted_mode)); - dm_new_connector_state->scaling = RMX_FULL; + dm_new_connector_state->scaling = RMX_ASPECT; } return 0; } From 290f46cf5726509f55fac4c7abe9b9d311aa5a3a Mon Sep 17 00:00:00 2001 From: "Jesse.Zhang" Date: Fri, 24 Oct 2025 10:51:52 +0800 Subject: [PATCH 29/83] drm/amdgpu: Implement user queue reset functionality This patch adds robust reset handling for user queues (userq) to improve recovery from queue failures. The key components include: 1. 
Queue detection and reset logic: - amdgpu_userq_detect_and_reset_queues() identifies failed queues - Per-IP detect_and_reset callbacks for targeted recovery - Falls back to full GPU reset when needed 2. Reset infrastructure: - Adds userq_reset_work work item for async reset handling - Implements pre/post reset handlers for queue state management - Integrates with existing GPU reset framework 3. Error handling improvements: - Enhanced state tracking with HUNG state - Automatic reset triggering on critical failures - VRAM loss handling during recovery 4. Integration points: - Added to device init/reset paths - Called during queue destroy, suspend, and isolation events - Handles both individual queue and full GPU resets The reset functionality works with both gfx/compute and sdma queues, providing better resilience against queue failures while minimizing disruption to unaffected queues. v2: add detection and reset calls when preemption/unmap fails. add a per device userq counter for each user queue type. (Alex) v3: make sure we hold the adev->userq_mutex when we call amdgpu_userq_detect_and_reset_queues. (Alex) warn if the adev->userq_mutex is not held. v4: make sure we have all of the uqm->userq_mutex held. warn if the uqm->userq_mutex is not held. v5: Use array for user queue type counters. (Alex) all of the uqm->userq_mutex need to be held when calling detect and reset. (Alex) v6: fix lockdep warning in amdgpu_userq_fence_driver_process v7: add the queue types in an array and use a loop in amdgpu_userq_detect_and_reset_queues (Lijo) v8: remove atomic_set(&userq_mgr->userq_count[i], 0).
it should already be 0 since we kzalloc the structure (Alex) v9: For consistency with kernel queues, We may want something like: amdgpu_userq_is_reset_type_supported (Alex) Signed-off-by: Jesse Zhang Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 8 + drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 214 +++++++++++++++++++-- drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h | 5 + 5 files changed, 217 insertions(+), 12 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index bcfed46eedaf..9f9774f58ce1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1316,6 +1316,7 @@ struct amdgpu_device { bool apu_prefer_gtt; bool userq_halt_for_enforce_isolation; + struct work_struct userq_reset_work; struct amdgpu_uid *uid_info; /* KFD diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 6f574f674f49..c65bb0bcd32b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4651,6 +4651,7 @@ int amdgpu_device_init(struct amdgpu_device *adev, } INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); + INIT_WORK(&adev->userq_reset_work, amdgpu_userq_reset_work); adev->gfx.gfx_off_req_count = 1; adev->gfx.gfx_off_residency = 0; @@ -6080,6 +6081,10 @@ int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context) if (r) goto out; + r = amdgpu_userq_post_reset(tmp_adev, vram_lost); + if (r) + goto out; + drm_client_dev_resume(adev_to_drm(tmp_adev)); /* @@ -6302,6 +6307,7 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) if (!amdgpu_sriov_vf(adev)) cancel_work(&adev->reset_work); #endif + cancel_work(&adev->userq_reset_work); if (adev->kfd.dev) cancel_work(&adev->kfd.reset_work); @@ -6422,6 +6428,8 @@ 
static void amdgpu_device_halt_activities(struct amdgpu_device *adev, amdgpu_device_ip_need_full_reset(tmp_adev)) amdgpu_ras_suspend(tmp_adev); + amdgpu_userq_pre_reset(tmp_adev); + for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { struct amdgpu_ring *ring = tmp_adev->rings[i]; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h index 87b962df5460..7a27c6c4bb44 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h @@ -83,6 +83,7 @@ enum amdgpu_ring_type { AMDGPU_RING_TYPE_MES, AMDGPU_RING_TYPE_UMSCH_MM, AMDGPU_RING_TYPE_CPER, + AMDGPU_RING_TYPE_MAX, }; enum amdgpu_ib_pool_type { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c index 13cc5a686dfd..fd54423c3587 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c @@ -25,8 +25,10 @@ #include #include #include +#include #include "amdgpu.h" +#include "amdgpu_reset.h" #include "amdgpu_vm.h" #include "amdgpu_userq.h" #include "amdgpu_hmm.h" @@ -46,6 +48,107 @@ u32 amdgpu_userq_get_supported_ip_mask(struct amdgpu_device *adev) return userq_ip_mask; } +static bool amdgpu_userq_is_reset_type_supported(struct amdgpu_device *adev, + enum amdgpu_ring_type ring_type, int reset_type) +{ + + if (ring_type < 0 || ring_type >= AMDGPU_RING_TYPE_MAX) + return false; + + switch (ring_type) { + case AMDGPU_RING_TYPE_GFX: + if (adev->gfx.gfx_supported_reset & reset_type) + return true; + break; + case AMDGPU_RING_TYPE_COMPUTE: + if (adev->gfx.compute_supported_reset & reset_type) + return true; + break; + case AMDGPU_RING_TYPE_SDMA: + if (adev->sdma.supported_reset & reset_type) + return true; + break; + case AMDGPU_RING_TYPE_VCN_DEC: + case AMDGPU_RING_TYPE_VCN_ENC: + if (adev->vcn.supported_reset & reset_type) + return true; + break; + case AMDGPU_RING_TYPE_VCN_JPEG: + if (adev->jpeg.supported_reset & reset_type) + return true; + break; + default: + break; 
+ } + return false; +} + +static void amdgpu_userq_gpu_reset(struct amdgpu_device *adev) +{ + if (amdgpu_device_should_recover_gpu(adev)) { + amdgpu_reset_domain_schedule(adev->reset_domain, + &adev->userq_reset_work); + /* Wait for the reset job to complete */ + flush_work(&adev->userq_reset_work); + } +} + +static int +amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr) +{ + struct amdgpu_device *adev = uq_mgr->adev; + const int queue_types[] = { + AMDGPU_RING_TYPE_COMPUTE, + AMDGPU_RING_TYPE_GFX, + AMDGPU_RING_TYPE_SDMA + }; + const int num_queue_types = ARRAY_SIZE(queue_types); + bool gpu_reset = false; + int r = 0; + int i; + + /* Warning if current process mutex is not held */ + WARN_ON(!mutex_is_locked(&uq_mgr->userq_mutex)); + + if (unlikely(adev->debug_disable_gpu_ring_reset)) { + dev_err(adev->dev, "userq reset disabled by debug mask\n"); + return 0; + } + + /* + * If GPU recovery feature is disabled system-wide, + * skip all reset detection logic + */ + if (!amdgpu_gpu_recovery) + return 0; + + /* + * Iterate through all queue types to detect and reset problematic queues + * Process each queue type in the defined order + */ + for (i = 0; i < num_queue_types; i++) { + int ring_type = queue_types[i]; + const struct amdgpu_userq_funcs *funcs = adev->userq_funcs[ring_type]; + + if (!amdgpu_userq_is_reset_type_supported(adev, ring_type, AMDGPU_RESET_TYPE_PER_QUEUE)) + continue; + + if (atomic_read(&uq_mgr->userq_count[ring_type]) > 0 && + funcs && funcs->detect_and_reset) { + r = funcs->detect_and_reset(adev, ring_type); + if (r) { + gpu_reset = true; + break; + } + } + } + + if (gpu_reset) + amdgpu_userq_gpu_reset(adev); + + return r; +} + static int amdgpu_userq_buffer_va_list_add(struct amdgpu_usermode_queue *queue, struct amdgpu_bo_va_mapping *va_map, u64 addr) { @@ -176,17 +279,22 @@ amdgpu_userq_preempt_helper(struct amdgpu_userq_mgr *uq_mgr, struct amdgpu_device *adev = uq_mgr->adev; const struct amdgpu_userq_funcs *userq_funcs = 
adev->userq_funcs[queue->queue_type]; + bool found_hung_queue = false; int r = 0; if (queue->state == AMDGPU_USERQ_STATE_MAPPED) { r = userq_funcs->preempt(uq_mgr, queue); if (r) { queue->state = AMDGPU_USERQ_STATE_HUNG; + found_hung_queue = true; } else { queue->state = AMDGPU_USERQ_STATE_PREEMPTED; } } + if (found_hung_queue) + amdgpu_userq_detect_and_reset_queues(uq_mgr); + return r; } @@ -218,16 +326,23 @@ amdgpu_userq_unmap_helper(struct amdgpu_userq_mgr *uq_mgr, struct amdgpu_device *adev = uq_mgr->adev; const struct amdgpu_userq_funcs *userq_funcs = adev->userq_funcs[queue->queue_type]; + bool found_hung_queue = false; int r = 0; if ((queue->state == AMDGPU_USERQ_STATE_MAPPED) || (queue->state == AMDGPU_USERQ_STATE_PREEMPTED)) { r = userq_funcs->unmap(uq_mgr, queue); - if (r) + if (r) { queue->state = AMDGPU_USERQ_STATE_HUNG; - else + found_hung_queue = true; + } else { queue->state = AMDGPU_USERQ_STATE_UNMAPPED; + } } + + if (found_hung_queue) + amdgpu_userq_detect_and_reset_queues(uq_mgr); + return r; } @@ -244,10 +359,12 @@ amdgpu_userq_map_helper(struct amdgpu_userq_mgr *uq_mgr, r = userq_funcs->map(uq_mgr, queue); if (r) { queue->state = AMDGPU_USERQ_STATE_HUNG; + amdgpu_userq_detect_and_reset_queues(uq_mgr); } else { queue->state = AMDGPU_USERQ_STATE_MAPPED; } } + return r; } @@ -475,10 +592,11 @@ amdgpu_userq_destroy(struct drm_file *filp, int queue_id) amdgpu_bo_unreserve(queue->db_obj.obj); } amdgpu_bo_unref(&queue->db_obj.obj); - + atomic_dec(&uq_mgr->userq_count[queue->queue_type]); #if defined(CONFIG_DEBUG_FS) debugfs_remove_recursive(queue->debugfs_queue); #endif + amdgpu_userq_detect_and_reset_queues(uq_mgr); r = amdgpu_userq_unmap_helper(uq_mgr, queue); /*TODO: It requires a reset for userq hw unmap error*/ if (unlikely(r != AMDGPU_USERQ_STATE_UNMAPPED)) { @@ -701,6 +819,7 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args) kfree(queue_name); args->out.queue_id = qid; + 
atomic_inc(&uq_mgr->userq_count[queue->queue_type]); unlock: mutex_unlock(&uq_mgr->userq_mutex); @@ -1045,6 +1164,7 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr) unsigned long queue_id; int ret = 0, r; + amdgpu_userq_detect_and_reset_queues(uq_mgr); /* Try to unmap all the queues in this process ctx */ xa_for_each(&uq_mgr->userq_mgr_xa, queue_id, queue) { r = amdgpu_userq_preempt_helper(uq_mgr, queue); @@ -1057,6 +1177,23 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr) return ret; } +void amdgpu_userq_reset_work(struct work_struct *work) +{ + struct amdgpu_device *adev = container_of(work, struct amdgpu_device, + userq_reset_work); + struct amdgpu_reset_context reset_context; + + memset(&reset_context, 0, sizeof(reset_context)); + + reset_context.method = AMD_RESET_METHOD_NONE; + reset_context.reset_req_dev = adev; + reset_context.src = AMDGPU_RESET_SRC_USERQ; + set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); + /*set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);*/ + + amdgpu_device_gpu_recover(adev, NULL, &reset_context); +} + static int amdgpu_userq_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr) { @@ -1084,22 +1221,19 @@ void amdgpu_userq_evict(struct amdgpu_userq_mgr *uq_mgr, struct amdgpu_eviction_fence *ev_fence) { - int ret; struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr); struct amdgpu_eviction_fence_mgr *evf_mgr = &fpriv->evf_mgr; + struct amdgpu_device *adev = uq_mgr->adev; + int ret; /* Wait for any pending userqueue fence work to finish */ ret = amdgpu_userq_wait_for_signal(uq_mgr); - if (ret) { - drm_file_err(uq_mgr->file, "Not evicting userqueue, timeout waiting for work\n"); - return; - } + if (ret) + dev_err(adev->dev, "Not evicting userqueue, timeout waiting for work\n"); ret = amdgpu_userq_evict_all(uq_mgr); - if (ret) { - drm_file_err(uq_mgr->file, "Failed to evict userqueue\n"); - return; - } + if (ret) + dev_err(adev->dev, "Failed to evict userqueue\n"); /* Signal current eviction fence */ 
amdgpu_eviction_fence_signal(evf_mgr, ev_fence); @@ -1133,6 +1267,7 @@ void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr *userq_mgr) cancel_delayed_work_sync(&userq_mgr->resume_work); mutex_lock(&userq_mgr->userq_mutex); + amdgpu_userq_detect_and_reset_queues(userq_mgr); xa_for_each(&userq_mgr->userq_mgr_xa, queue_id, queue) { amdgpu_userq_wait_for_last_fence(userq_mgr, queue); amdgpu_userq_unmap_helper(userq_mgr, queue); @@ -1159,6 +1294,7 @@ int amdgpu_userq_suspend(struct amdgpu_device *adev) uqm = queue->userq_mgr; cancel_delayed_work_sync(&uqm->resume_work); guard(mutex)(&uqm->userq_mutex); + amdgpu_userq_detect_and_reset_queues(uqm); if (adev->in_s0ix) r = amdgpu_userq_preempt_helper(uqm, queue); else @@ -1217,6 +1353,7 @@ int amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev, if (((queue->queue_type == AMDGPU_HW_IP_GFX) || (queue->queue_type == AMDGPU_HW_IP_COMPUTE)) && (queue->xcp_id == idx)) { + amdgpu_userq_detect_and_reset_queues(uqm); r = amdgpu_userq_preempt_helper(uqm, queue); if (r) ret = r; @@ -1289,3 +1426,56 @@ int amdgpu_userq_gem_va_unmap_validate(struct amdgpu_device *adev, return 0; } + +void amdgpu_userq_pre_reset(struct amdgpu_device *adev) +{ + const struct amdgpu_userq_funcs *userq_funcs; + struct amdgpu_usermode_queue *queue; + struct amdgpu_userq_mgr *uqm; + unsigned long queue_id; + + xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) { + uqm = queue->userq_mgr; + cancel_delayed_work_sync(&uqm->resume_work); + if (queue->state == AMDGPU_USERQ_STATE_MAPPED) { + amdgpu_userq_wait_for_last_fence(uqm, queue); + userq_funcs = adev->userq_funcs[queue->queue_type]; + userq_funcs->unmap(uqm, queue); + /* just mark all queues as hung at this point. 
+ * if unmap succeeds, we could map again + * in amdgpu_userq_post_reset() if vram is not lost + */ + queue->state = AMDGPU_USERQ_STATE_HUNG; + amdgpu_userq_fence_driver_force_completion(queue); + } + } +} + +int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost) +{ + /* if any queue state is AMDGPU_USERQ_STATE_UNMAPPED + * at this point, we should be able to map it again + * and continue if vram is not lost. + */ + struct amdgpu_userq_mgr *uqm; + struct amdgpu_usermode_queue *queue; + const struct amdgpu_userq_funcs *userq_funcs; + unsigned long queue_id; + int r = 0; + + xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) { + uqm = queue->userq_mgr; + if (queue->state == AMDGPU_USERQ_STATE_HUNG && !vram_lost) { + userq_funcs = adev->userq_funcs[queue->queue_type]; + /* Re-map queue */ + r = userq_funcs->map(uqm, queue); + if (r) { + dev_err(adev->dev, "Failed to remap queue %ld\n", queue_id); + continue; + } + queue->state = AMDGPU_USERQ_STATE_MAPPED; + } + } + + return r; +} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h index 09da0617bfa2..c37444427a14 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h @@ -106,6 +106,7 @@ struct amdgpu_userq_mgr { struct amdgpu_device *adev; struct delayed_work resume_work; struct drm_file *file; + atomic_t userq_count[AMDGPU_RING_TYPE_MAX]; }; struct amdgpu_db_info { @@ -148,6 +149,10 @@ int amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev, u32 idx); int amdgpu_userq_start_sched_for_enforce_isolation(struct amdgpu_device *adev, u32 idx); +void amdgpu_userq_reset_work(struct work_struct *work); +void amdgpu_userq_pre_reset(struct amdgpu_device *adev); +int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost); + int amdgpu_userq_input_va_validate(struct amdgpu_usermode_queue *queue, u64 addr, u64 expected_size); int amdgpu_userq_gem_va_unmap_validate(struct 
amdgpu_device *adev, From face6a3615a649456eb4549f6d474221d877d604 Mon Sep 17 00:00:00 2001 From: Wenjing Liu Date: Fri, 3 Oct 2025 11:59:39 -0400 Subject: [PATCH 30/83] drm/amd/display: fw locality check refactors [why] There are some new changes for HDCP2 firmware locality check. The implementation doesn't perfectly fit the intended design and clarity. 1. Clarify and consolidate variable responsibilities. The previous implementation introduced the following variables: - config.ddc.funcs.atomic_write_poll_read_i2c (optional pointer) - hdcp->config.ddc.funcs.atomic_write_poll_read_aux (optional pointer) - hdcp->connection.link.adjust.hdcp2.force_sw_locality_check (bool) - hdcp->config.debug.lc_enable_sw_fallback (bool) - use_fw (bool) They will be used together to determine two operations: - Whether to use FW locality check - Whether to use SW fallback on FW locality check failure The refactor streamlines this by introducing two variables in the hdcp2 link adjustment, while ensuring function pointers are always assigned and remain independent from policy decisions: - use_fw_locality_check (bool) -> true if fw locality should be used. - use_sw_locality_fallback (bool) -> true to reset use_fw_locality_check back to false and retry on fw locality check failure. 2. Mixed meanings of l_prime_read transition input l_prime_read originally means if l_prime is read when sw locality check is used. When FW locality check is used, l_prime_read means if lc init write, l prime poll and l_prime read combo operation is successful. The mix of meanings is confusing. The refactor introduces a new variable l_prime_combo_read to isolate the second meaning into its own variable. 3. Missing specific error code on firmware locality error. The original change reuses the generic DDC failure error code when firmware fails to return locality check result. This is not ideal as DDC failure indicates an error occurred during an I2C/AUX transaction. 
FW locality failure could be caused by polling timeout in firmware or failure to acquire firmware access. Which sits at a higher level of abstraction above DDC hardware. An incorrect error code could mislead the debug into a wrong direction. 4. Correcting misplaced comments. The previous implementation of the firmware locality check resulted in some comments in hdcp2_transition being incorrectly positioned. This refactor relocates those comments to their appropriate locations for better clarity. Reviewed-by: Aric Cyr Signed-off-by: Wenjing Liu Signed-off-by: Ray Wu Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- .../amd/display/amdgpu_dm/amdgpu_dm_hdcp.c | 18 ++--- .../gpu/drm/amd/display/modules/hdcp/hdcp.h | 1 + .../display/modules/hdcp/hdcp2_execution.c | 68 ++++++------------- .../display/modules/hdcp/hdcp2_transition.c | 61 +++++++++-------- .../drm/amd/display/modules/hdcp/hdcp_ddc.c | 2 +- .../drm/amd/display/modules/hdcp/hdcp_log.c | 2 + .../drm/amd/display/modules/inc/mod_hdcp.h | 10 ++- 7 files changed, 71 insertions(+), 91 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_hdcp.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_hdcp.c index 19038f336155..85ce558cefc5 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_hdcp.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_hdcp.c @@ -201,6 +201,7 @@ void hdcp_update_display(struct hdcp_workqueue *hdcp_work, struct mod_hdcp_link_adjustment link_adjust; struct mod_hdcp_display_adjustment display_adjust; unsigned int conn_index = aconnector->base.index; + const struct dc *dc = aconnector->dc_link->dc; guard(mutex)(&hdcp_w->mutex); drm_connector_get(&aconnector->base); @@ -231,6 +232,9 @@ void hdcp_update_display(struct hdcp_workqueue *hdcp_work, link_adjust.hdcp1.disable = 1; link_adjust.hdcp2.force_type = MOD_HDCP_FORCE_TYPE_1; } + link_adjust.hdcp2.use_fw_locality_check = + (dc->caps.fused_io_supported || dc->debug.hdcp_lc_force_fw_enable); + 
link_adjust.hdcp2.use_sw_locality_fallback = dc->debug.hdcp_lc_enable_sw_fallback; schedule_delayed_work(&hdcp_w->property_validate_dwork, msecs_to_jiffies(DRM_HDCP_CHECK_PERIOD_MS)); @@ -534,6 +538,7 @@ static void update_config(void *handle, struct cp_psp_stream_config *config) struct hdcp_workqueue *hdcp_w = &hdcp_work[link_index]; struct dc_sink *sink = NULL; bool link_is_hdcp14 = false; + const struct dc *dc = aconnector->dc_link->dc; if (config->dpms_off) { hdcp_remove_display(hdcp_work, link_index, aconnector); @@ -575,6 +580,8 @@ static void update_config(void *handle, struct cp_psp_stream_config *config) link->adjust.auth_delay = 2; link->adjust.retry_limit = MAX_NUM_OF_ATTEMPTS; link->adjust.hdcp1.disable = 0; + link->adjust.hdcp2.use_fw_locality_check = (dc->caps.fused_io_supported || dc->debug.hdcp_lc_force_fw_enable); + link->adjust.hdcp2.use_sw_locality_fallback = dc->debug.hdcp_lc_enable_sw_fallback; hdcp_w->encryption_status[display->index] = MOD_HDCP_ENCRYPTION_STATUS_HDCP_OFF; DRM_DEBUG_DRIVER("[HDCP_DM] display %d, CP %d, type %d\n", aconnector->base.index, @@ -786,15 +793,8 @@ struct hdcp_workqueue *hdcp_create_workqueue(struct amdgpu_device *adev, ddc_funcs->read_i2c = lp_read_i2c; ddc_funcs->write_dpcd = lp_write_dpcd; ddc_funcs->read_dpcd = lp_read_dpcd; - - config->debug.lc_enable_sw_fallback = dc->debug.hdcp_lc_enable_sw_fallback; - if (dc->caps.fused_io_supported || dc->debug.hdcp_lc_force_fw_enable) { - ddc_funcs->atomic_write_poll_read_i2c = lp_atomic_write_poll_read_i2c; - ddc_funcs->atomic_write_poll_read_aux = lp_atomic_write_poll_read_aux; - } else { - ddc_funcs->atomic_write_poll_read_i2c = NULL; - ddc_funcs->atomic_write_poll_read_aux = NULL; - } + ddc_funcs->atomic_write_poll_read_i2c = lp_atomic_write_poll_read_i2c; + ddc_funcs->atomic_write_poll_read_aux = lp_atomic_write_poll_read_aux; memset(hdcp_work[i].aconnector, 0, sizeof(struct amdgpu_dm_connector *) * diff --git a/drivers/gpu/drm/amd/display/modules/hdcp/hdcp.h 
b/drivers/gpu/drm/amd/display/modules/hdcp/hdcp.h index b883d626f1c3..26a351a184f3 100644 --- a/drivers/gpu/drm/amd/display/modules/hdcp/hdcp.h +++ b/drivers/gpu/drm/amd/display/modules/hdcp/hdcp.h @@ -88,6 +88,7 @@ struct mod_hdcp_transition_input_hdcp2 { uint8_t lc_init_write; uint8_t l_prime_available_poll; uint8_t l_prime_read; + uint8_t l_prime_combo_read; uint8_t l_prime_validation; uint8_t eks_prepare; uint8_t eks_write; diff --git a/drivers/gpu/drm/amd/display/modules/hdcp/hdcp2_execution.c b/drivers/gpu/drm/amd/display/modules/hdcp/hdcp2_execution.c index 5628f0ef73fd..27500abf9fee 100644 --- a/drivers/gpu/drm/amd/display/modules/hdcp/hdcp2_execution.c +++ b/drivers/gpu/drm/amd/display/modules/hdcp/hdcp2_execution.c @@ -465,54 +465,11 @@ static enum mod_hdcp_status validate_h_prime(struct mod_hdcp *hdcp, return status; } -static enum mod_hdcp_status locality_check_sw(struct mod_hdcp *hdcp, - struct mod_hdcp_event_context *event_ctx, - struct mod_hdcp_transition_input_hdcp2 *input) -{ - enum mod_hdcp_status status = MOD_HDCP_STATUS_SUCCESS; - - if (!mod_hdcp_execute_and_set(mod_hdcp_write_lc_init, - &input->lc_init_write, &status, - hdcp, "lc_init_write")) - goto out; - if (is_dp_hdcp(hdcp)) - msleep(16); - else - if (!mod_hdcp_execute_and_set(poll_l_prime_available, - &input->l_prime_available_poll, &status, - hdcp, "l_prime_available_poll")) - goto out; - if (!mod_hdcp_execute_and_set(mod_hdcp_read_l_prime, - &input->l_prime_read, &status, - hdcp, "l_prime_read")) - goto out; -out: - return status; -} - -static enum mod_hdcp_status locality_check_fw(struct mod_hdcp *hdcp, - struct mod_hdcp_event_context *event_ctx, - struct mod_hdcp_transition_input_hdcp2 *input) -{ - enum mod_hdcp_status status = MOD_HDCP_STATUS_SUCCESS; - - if (!mod_hdcp_execute_and_set(mod_hdcp_write_poll_read_lc_fw, - &input->l_prime_read, &status, - hdcp, "l_prime_read")) - goto out; - -out: - return status; -} - static enum mod_hdcp_status locality_check(struct mod_hdcp *hdcp, 
struct mod_hdcp_event_context *event_ctx, struct mod_hdcp_transition_input_hdcp2 *input) { enum mod_hdcp_status status = MOD_HDCP_STATUS_SUCCESS; - const bool use_fw = hdcp->config.ddc.funcs.atomic_write_poll_read_i2c - && hdcp->config.ddc.funcs.atomic_write_poll_read_aux - && !hdcp->connection.link.adjust.hdcp2.force_sw_locality_check; if (event_ctx->event != MOD_HDCP_EVENT_CALLBACK) { event_ctx->unexpected_event = 1; @@ -524,9 +481,28 @@ static enum mod_hdcp_status locality_check(struct mod_hdcp *hdcp, hdcp, "lc_init_prepare")) goto out; - status = (use_fw ? locality_check_fw : locality_check_sw)(hdcp, event_ctx, input); - if (status != MOD_HDCP_STATUS_SUCCESS) - goto out; + if (hdcp->connection.link.adjust.hdcp2.use_fw_locality_check) { + if (!mod_hdcp_execute_and_set(mod_hdcp_write_poll_read_lc_fw, + &input->l_prime_combo_read, &status, + hdcp, "l_prime_combo_read")) + goto out; + } else { + if (!mod_hdcp_execute_and_set(mod_hdcp_write_lc_init, + &input->lc_init_write, &status, + hdcp, "lc_init_write")) + goto out; + if (is_dp_hdcp(hdcp)) + msleep(16); + else + if (!mod_hdcp_execute_and_set(poll_l_prime_available, + &input->l_prime_available_poll, &status, + hdcp, "l_prime_available_poll")) + goto out; + if (!mod_hdcp_execute_and_set(mod_hdcp_read_l_prime, + &input->l_prime_read, &status, + hdcp, "l_prime_read")) + goto out; + } if (!mod_hdcp_execute_and_set(mod_hdcp_hdcp2_validate_l_prime, &input->l_prime_validation, &status, diff --git a/drivers/gpu/drm/amd/display/modules/hdcp/hdcp2_transition.c b/drivers/gpu/drm/amd/display/modules/hdcp/hdcp2_transition.c index 89ffb89e1932..9316312a4df5 100644 --- a/drivers/gpu/drm/amd/display/modules/hdcp/hdcp2_transition.c +++ b/drivers/gpu/drm/amd/display/modules/hdcp/hdcp2_transition.c @@ -184,31 +184,33 @@ enum mod_hdcp_status mod_hdcp_hdcp2_transition(struct mod_hdcp *hdcp, callback_in_ms(0, output); set_state_id(hdcp, output, H2_A2_LOCALITY_CHECK); break; - case H2_A2_LOCALITY_CHECK: { - const bool use_fw = 
hdcp->config.ddc.funcs.atomic_write_poll_read_i2c - && !adjust->hdcp2.force_sw_locality_check; - - /* - * 1A-05: consider disconnection after LC init a failure - * 1A-13-1: consider invalid l' a failure - * 1A-13-2: consider l' timeout a failure - */ + case H2_A2_LOCALITY_CHECK: + /* 1A-05: consider disconnection after LC init a failure */ if (hdcp->state.stay_count > 10 || - input->lc_init_prepare != PASS || - (!use_fw && input->lc_init_write != PASS) || - (!use_fw && input->l_prime_available_poll != PASS)) { + input->lc_init_prepare != PASS) { fail_and_restart_in_ms(0, &status, output); break; - } else if (input->l_prime_read != PASS) { - if (use_fw && hdcp->config.debug.lc_enable_sw_fallback) { - adjust->hdcp2.force_sw_locality_check = true; + } else if (adjust->hdcp2.use_fw_locality_check && + input->l_prime_combo_read != PASS) { + /* 1A-13-2: consider l' timeout a failure */ + if (adjust->hdcp2.use_sw_locality_fallback) { + /* switch to software locality check */ + adjust->hdcp2.use_fw_locality_check = 0; callback_in_ms(0, output); + increment_stay_counter(hdcp); break; } - + fail_and_restart_in_ms(0, &status, output); + break; + } else if (!adjust->hdcp2.use_fw_locality_check && + (input->lc_init_write != PASS || + input->l_prime_available_poll != PASS || + input->l_prime_read != PASS)) { + /* 1A-13-2: consider l' timeout a failure */ fail_and_restart_in_ms(0, &status, output); break; } else if (input->l_prime_validation != PASS) { + /* 1A-13-1: consider invalid l' a failure */ callback_in_ms(0, output); increment_stay_counter(hdcp); break; @@ -216,7 +218,6 @@ enum mod_hdcp_status mod_hdcp_hdcp2_transition(struct mod_hdcp *hdcp, callback_in_ms(0, output); set_state_id(hdcp, output, H2_A3_EXCHANGE_KS_AND_TEST_FOR_REPEATER); break; - } case H2_A3_EXCHANGE_KS_AND_TEST_FOR_REPEATER: if (input->eks_prepare != PASS || input->eks_write != PASS) { @@ -510,26 +511,29 @@ enum mod_hdcp_status mod_hdcp_hdcp2_dp_transition(struct mod_hdcp *hdcp, callback_in_ms(0, output); 
set_state_id(hdcp, output, D2_A2_LOCALITY_CHECK); break; - case D2_A2_LOCALITY_CHECK: { - const bool use_fw = hdcp->config.ddc.funcs.atomic_write_poll_read_aux - && !adjust->hdcp2.force_sw_locality_check; - + case D2_A2_LOCALITY_CHECK: if (hdcp->state.stay_count > 10 || - input->lc_init_prepare != PASS || - (!use_fw && input->lc_init_write != PASS)) { - /* 1A-12: consider invalid l' a failure */ + input->lc_init_prepare != PASS) { fail_and_restart_in_ms(0, &status, output); break; - } else if (input->l_prime_read != PASS) { - if (use_fw && hdcp->config.debug.lc_enable_sw_fallback) { - adjust->hdcp2.force_sw_locality_check = true; + } else if (adjust->hdcp2.use_fw_locality_check && + input->l_prime_combo_read != PASS) { + if (adjust->hdcp2.use_sw_locality_fallback) { + /* switch to software locality check */ + adjust->hdcp2.use_fw_locality_check = 0; callback_in_ms(0, output); + increment_stay_counter(hdcp); break; } - + fail_and_restart_in_ms(0, &status, output); + break; + } else if (!adjust->hdcp2.use_fw_locality_check && + (input->lc_init_write != PASS || + input->l_prime_read != PASS)) { fail_and_restart_in_ms(0, &status, output); break; } else if (input->l_prime_validation != PASS) { + /* 1A-12: consider invalid l' a failure */ callback_in_ms(0, output); increment_stay_counter(hdcp); break; @@ -537,7 +541,6 @@ enum mod_hdcp_status mod_hdcp_hdcp2_dp_transition(struct mod_hdcp *hdcp, callback_in_ms(0, output); set_state_id(hdcp, output, D2_A34_EXCHANGE_KS_AND_TEST_FOR_REPEATER); break; - } case D2_A34_EXCHANGE_KS_AND_TEST_FOR_REPEATER: if (input->eks_prepare != PASS || input->eks_write != PASS) { diff --git a/drivers/gpu/drm/amd/display/modules/hdcp/hdcp_ddc.c b/drivers/gpu/drm/amd/display/modules/hdcp/hdcp_ddc.c index 2e6408579194..0ca39873f807 100644 --- a/drivers/gpu/drm/amd/display/modules/hdcp/hdcp_ddc.c +++ b/drivers/gpu/drm/amd/display/modules/hdcp/hdcp_ddc.c @@ -758,6 +758,6 @@ enum mod_hdcp_status mod_hdcp_write_poll_read_lc_fw(struct mod_hdcp *hdcp) { 
const bool success = (is_dp_hdcp(hdcp) ? write_stall_read_lc_fw_aux : write_poll_read_lc_fw_i2c)(hdcp); - return success ? MOD_HDCP_STATUS_SUCCESS : MOD_HDCP_STATUS_DDC_FAILURE; + return success ? MOD_HDCP_STATUS_SUCCESS : MOD_HDCP_STATUS_HDCP2_LOCALITY_COMBO_READ_FAILURE; } diff --git a/drivers/gpu/drm/amd/display/modules/hdcp/hdcp_log.c b/drivers/gpu/drm/amd/display/modules/hdcp/hdcp_log.c index 6b3b5f610907..ac44ee1532fd 100644 --- a/drivers/gpu/drm/amd/display/modules/hdcp/hdcp_log.c +++ b/drivers/gpu/drm/amd/display/modules/hdcp/hdcp_log.c @@ -248,6 +248,8 @@ char *mod_hdcp_status_to_str(int32_t status) return "MOD_HDCP_STATUS_HDCP2_DEVICE_COUNT_MISMATCH_FAILURE"; case MOD_HDCP_STATUS_UNSUPPORTED_PSP_VER_FAILURE: return "MOD_HDCP_STATUS_UNSUPPORTED_PSP_VER_FAILURE"; + case MOD_HDCP_STATUS_HDCP2_LOCALITY_COMBO_READ_FAILURE: + return "MOD_HDCP_STATUS_HDCP2_LOCALITY_COMBO_READ_FAILURE"; default: return "MOD_HDCP_STATUS_UNKNOWN"; } diff --git a/drivers/gpu/drm/amd/display/modules/inc/mod_hdcp.h b/drivers/gpu/drm/amd/display/modules/inc/mod_hdcp.h index 46e52fb3a118..264348989e9b 100644 --- a/drivers/gpu/drm/amd/display/modules/inc/mod_hdcp.h +++ b/drivers/gpu/drm/amd/display/modules/inc/mod_hdcp.h @@ -98,6 +98,7 @@ enum mod_hdcp_status { MOD_HDCP_STATUS_HDCP2_REAUTH_LINK_INTEGRITY_FAILURE, MOD_HDCP_STATUS_HDCP2_DEVICE_COUNT_MISMATCH_FAILURE, MOD_HDCP_STATUS_UNSUPPORTED_PSP_VER_FAILURE, + MOD_HDCP_STATUS_HDCP2_LOCALITY_COMBO_READ_FAILURE, }; struct mod_hdcp_displayport { @@ -214,8 +215,9 @@ struct mod_hdcp_link_adjustment_hdcp2 { uint8_t force_type : 2; uint8_t force_no_stored_km : 1; uint8_t increase_h_prime_timeout: 1; - uint8_t force_sw_locality_check : 1; - uint8_t reserved : 2; + uint8_t use_fw_locality_check : 1; + uint8_t use_sw_locality_fallback: 1; + uint8_t reserved : 1; }; struct mod_hdcp_link_adjustment { @@ -317,10 +319,6 @@ struct mod_hdcp_display_query { struct mod_hdcp_config { struct mod_hdcp_psp psp; struct mod_hdcp_ddc ddc; - struct { - uint8_t 
lc_enable_sw_fallback : 1; - uint8_t reserved : 7; - } debug; uint8_t index; }; From 7d041982fe11fff29b32a09228c4d52f159b56ad Mon Sep 17 00:00:00 2001 From: Andrew Mazour Date: Wed, 15 Oct 2025 12:19:49 -0400 Subject: [PATCH 31/83] drm/amd/display: Extend inbox0 lock to run Replay/PSR [Why] The inbox1 infrastructure is deprecated, so to support display power features requiring a DMUB interlock moving forward extend the inbox0 locking conditions to also include Replay or PSR. [How] Implemented a series of changes to improve HW lock handling: - Deprecated should_use_dmub_inbox1_lock() and guarded it with DCN401 flag. - Migrated lock checks into inbox0 helpers and added PSR/Replay enablement checks to ensure correct behavior. - Updated HWSS fast update path to acquire HW lock as needed using the new helpers. Reviewed-by: Nicholas Kazlauskas Signed-off-by: Andrew Mazour Signed-off-by: Ray Wu Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- .../drm/amd/display/dc/core/dc_hw_sequencer.c | 5 +- .../drm/amd/display/dc/dce/dmub_hw_lock_mgr.c | 58 ++++++++++++------- .../drm/amd/display/dc/dce/dmub_hw_lock_mgr.h | 2 + 3 files changed, 44 insertions(+), 21 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_hw_sequencer.c b/drivers/gpu/drm/amd/display/dc/core/dc_hw_sequencer.c index f95cb0cf4b8a..a7ec633b26c0 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc_hw_sequencer.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc_hw_sequencer.c @@ -38,6 +38,7 @@ #include "dccg.h" #include "abm.h" #include "dcn10/dcn10_hubbub.h" +#include "dce/dmub_hw_lock_mgr.h" #define NUM_ELEMENTS(a) (sizeof(a) / sizeof((a)[0])) #define MAX_NUM_MCACHE 8 @@ -764,7 +765,9 @@ void hwss_build_fast_sequence(struct dc *dc, if (dc->hwss.dmub_hw_control_lock_fast) { block_sequence[*num_steps].params.dmub_hw_control_lock_fast_params.dc = dc; block_sequence[*num_steps].params.dmub_hw_control_lock_fast_params.lock = true; - 
block_sequence[*num_steps].params.dmub_hw_control_lock_fast_params.is_required = dc_state_is_fams2_in_use(dc, context); + block_sequence[*num_steps].params.dmub_hw_control_lock_fast_params.is_required = + dc_state_is_fams2_in_use(dc, context) || + dmub_hw_lock_mgr_does_link_require_lock(dc, stream->link); block_sequence[*num_steps].func = DMUB_HW_CONTROL_LOCK_FAST; (*num_steps)++; } diff --git a/drivers/gpu/drm/amd/display/dc/dce/dmub_hw_lock_mgr.c b/drivers/gpu/drm/amd/display/dc/dce/dmub_hw_lock_mgr.c index 39f5fa73c43e..5bfa2b0d2afd 100644 --- a/drivers/gpu/drm/amd/display/dc/dce/dmub_hw_lock_mgr.c +++ b/drivers/gpu/drm/amd/display/dc/dce/dmub_hw_lock_mgr.c @@ -61,31 +61,49 @@ void dmub_hw_lock_mgr_inbox0_cmd(struct dc_dmub_srv *dmub_srv, dc_dmub_srv_wait_for_inbox0_ack(dmub_srv); } +bool dmub_hw_lock_mgr_does_link_require_lock(const struct dc *dc, const struct dc_link *link) +{ + if (!link) + return false; + + if (link->psr_settings.psr_version == DC_PSR_VERSION_SU_1) + return true; + + if (link->replay_settings.replay_feature_enabled) + return true; + + if (link->psr_settings.psr_version == DC_PSR_VERSION_1) { + struct dc_link *edp_links[MAX_NUM_EDP]; + int edp_num; + + dc_get_edp_links(dc, edp_links, &edp_num); + if (edp_num == 1) + return true; + } + return false; +} + +bool dmub_hw_lock_mgr_does_context_require_lock(const struct dc *dc, const struct dc_state *context) +{ + if (!context) + return false; + for (int i = 0; i < context->stream_count; i++) { + const struct dc_link *link = context->streams[i]->link; + + if (dmub_hw_lock_mgr_does_link_require_lock(dc, link)) + return true; + } + return false; +} + bool should_use_dmub_inbox1_lock(const struct dc *dc, const struct dc_link *link) { /* ASIC doesn't support DMUB */ if (!dc->ctx->dmub_srv) return false; - if (link) { + if (dc->ctx->dce_version >= DCN_VERSION_4_01) + return false; - if (link->psr_settings.psr_version == DC_PSR_VERSION_SU_1) - return true; - - if 
(link->replay_settings.replay_feature_enabled) - return true; - - /* only use HW lock for PSR1 on single eDP */ - if (link->psr_settings.psr_version == DC_PSR_VERSION_1) { - struct dc_link *edp_links[MAX_NUM_EDP]; - int edp_num; - - dc_get_edp_links(dc, edp_links, &edp_num); - - if (edp_num == 1) - return true; - } - } - - return false; + return dmub_hw_lock_mgr_does_link_require_lock(dc, link); } diff --git a/drivers/gpu/drm/amd/display/dc/dce/dmub_hw_lock_mgr.h b/drivers/gpu/drm/amd/display/dc/dce/dmub_hw_lock_mgr.h index 9f53d2ea5fa5..4c80ca8484ad 100644 --- a/drivers/gpu/drm/amd/display/dc/dce/dmub_hw_lock_mgr.h +++ b/drivers/gpu/drm/amd/display/dc/dce/dmub_hw_lock_mgr.h @@ -46,5 +46,7 @@ void dmub_hw_lock_mgr_inbox0_cmd(struct dc_dmub_srv *dmub_srv, * Return: true if the inbox1 lock should be used, false otherwise */ bool should_use_dmub_inbox1_lock(const struct dc *dc, const struct dc_link *link); +bool dmub_hw_lock_mgr_does_link_require_lock(const struct dc *dc, const struct dc_link *link); +bool dmub_hw_lock_mgr_does_context_require_lock(const struct dc *dc, const struct dc_state *context); #endif /*_DMUB_HW_LOCK_MGR_H_ */ From e19cb97ea80865d4a5b33e94450213f25d470541 Mon Sep 17 00:00:00 2001 From: Austin Zheng Date: Wed, 17 Sep 2025 10:38:37 -0500 Subject: [PATCH 32/83] drm/amd/display: Add pte_buffer_mode and force_one_row_for_frame in dchub reg [Why & How] Update structs for rq regs Reviewed-by: Dillon Varone Signed-off-by: Austin Zheng Signed-off-by: Ray Wu Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- .../amd/display/dc/dml2_0/dml21/inc/dml_top_dchub_registers.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/inc/dml_top_dchub_registers.h b/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/inc/dml_top_dchub_registers.h index 8e5a30287220..bf57df42d1d9 100644 --- a/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/inc/dml_top_dchub_registers.h +++ 
b/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/inc/dml_top_dchub_registers.h @@ -121,6 +121,8 @@ struct dml2_display_rq_regs { uint32_t crq_expansion_mode; uint32_t plane1_base_address; uint32_t unbounded_request_enabled; + bool pte_buffer_mode; + bool force_one_row_for_frame; // MRQ uint32_t mrq_expansion_mode; From 9c83768e578f991031d6943815842ede2ba94fcf Mon Sep 17 00:00:00 2001 From: Austin Zheng Date: Wed, 17 Sep 2025 12:56:00 -0400 Subject: [PATCH 33/83] drm/amd/display: Remove old PMO options [Why & How] Removes deprecated or unused PMO options. Reviewed-by: Dillon Varone Signed-off-by: Austin Zheng Signed-off-by: Ray Wu Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- .../display/dc/dml2_0/dml21/inc/dml_top_display_cfg_types.h | 2 -- .../dc/dml2_0/dml21/inc/dml_top_soc_parameter_types.h | 4 ++-- .../dc/dml2_0/dml21/src/dml2_pmo/dml2_pmo_dcn4_fams2.c | 6 +----- 3 files changed, 3 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/inc/dml_top_display_cfg_types.h b/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/inc/dml_top_display_cfg_types.h index 13749c9fcf18..da8e5c8b2244 100644 --- a/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/inc/dml_top_display_cfg_types.h +++ b/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/inc/dml_top_display_cfg_types.h @@ -423,7 +423,6 @@ struct dml2_stream_parameters { bool disable_dynamic_odm; bool disable_subvp; int minimum_vblank_idle_requirement_us; - bool minimize_active_latency_hiding; struct { struct { @@ -489,7 +488,6 @@ struct dml2_display_cfg { bool synchronize_ddr_displays_for_uclk_pstate_change; bool max_outstanding_when_urgent_expected_disable; bool enable_subvp_implicit_pmo; //enables PMO to switch pipe uclk strategy to subvp, and generate phantom programming - unsigned int best_effort_min_active_latency_hiding_us; bool all_streams_blanked; } overrides; }; diff --git a/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/inc/dml_top_soc_parameter_types.h 
b/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/inc/dml_top_soc_parameter_types.h index 4a9a0d5a09b7..e87d04a734b5 100644 --- a/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/inc/dml_top_soc_parameter_types.h +++ b/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/inc/dml_top_soc_parameter_types.h @@ -89,8 +89,8 @@ struct dml2_soc_qos_parameters { struct dml2_soc_power_management_parameters { double dram_clk_change_blackout_us; - double dram_clk_change_read_only_us; - double dram_clk_change_write_only_us; + double dram_clk_change_read_only_us; // deprecated + double dram_clk_change_write_only_us; // deprecated double fclk_change_blackout_us; double g7_ppt_blackout_us; double g7_temperature_read_blackout_us; diff --git a/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/src/dml2_pmo/dml2_pmo_dcn4_fams2.c b/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/src/dml2_pmo/dml2_pmo_dcn4_fams2.c index 5769c2638f9a..abd210401fe2 100644 --- a/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/src/dml2_pmo/dml2_pmo_dcn4_fams2.c +++ b/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/src/dml2_pmo/dml2_pmo_dcn4_fams2.c @@ -1962,9 +1962,6 @@ static void reset_display_configuration(struct display_configuation_with_meta *d for (stream_index = 0; stream_index < display_config->display_config.num_streams; stream_index++) { display_config->stage3.stream_svp_meta[stream_index].valid = false; - - display_config->display_config.stream_descriptors[stream_index].overrides.minimize_active_latency_hiding = false; - display_config->display_config.overrides.best_effort_min_active_latency_hiding_us = 0; } for (plane_index = 0; plane_index < display_config->display_config.num_planes; plane_index++) { @@ -1997,7 +1994,6 @@ static void setup_planes_for_drr_by_mask(struct display_configuation_with_meta * plane->overrides.uclk_pstate_change_strategy = dml2_uclk_pstate_change_strategy_force_drr; display_config->stage3.pstate_switch_modes[plane_index] = dml2_pstate_method_fw_drr; - } } } @@ -2063,7 +2059,6 @@ static void 
setup_planes_for_vblank_by_mask(struct display_configuation_with_met plane->overrides.reserved_vblank_time_ns); display_config->stage3.pstate_switch_modes[plane_index] = dml2_pstate_method_vblank; - } } } @@ -2078,6 +2073,7 @@ static void setup_planes_for_vblank_drr_by_mask(struct display_configuation_with for (plane_index = 0; plane_index < display_config->display_config.num_planes; plane_index++) { if (is_bit_set_in_bitfield(plane_mask, plane_index)) { plane = &display_config->display_config.plane_descriptors[plane_index]; + plane->overrides.reserved_vblank_time_ns = (long)(pmo->soc_bb->power_management_parameters.dram_clk_change_blackout_us * 1000); display_config->stage3.pstate_switch_modes[plane_index] = dml2_pstate_method_fw_vblank_drr; From e44ee152a5ea8ee1b5f5440a754a9fd17639f230 Mon Sep 17 00:00:00 2001 From: Austin Zheng Date: Wed, 24 Sep 2025 10:23:24 -0400 Subject: [PATCH 34/83] drm/amd/display: Update P-state naming for clarity. [Why & How] P-state can refer to different things like UCLK P-state, PPT, or temp read Update naming for clarity Reviewed-by: Dillon Varone Signed-off-by: Austin Zheng Signed-off-by: Ray Wu Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- .../dml21/src/dml2_core/dml2_core_dcn4_calcs.c | 2 +- .../src/dml2_core/dml2_core_shared_types.h | 18 ++++++++++++------ .../dml21/src/dml2_core/dml2_core_utils.c | 2 ++ 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/src/dml2_core/dml2_core_dcn4_calcs.c b/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/src/dml2_core/dml2_core_dcn4_calcs.c index f809c4073b43..e7a0f46e1289 100644 --- a/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/src/dml2_core/dml2_core_dcn4_calcs.c +++ b/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/src/dml2_core/dml2_core_dcn4_calcs.c @@ -13024,7 +13024,7 @@ void dml2_core_calcs_get_informative(const struct dml2_core_internal_display_mod out->informative.mode_support_info.InvalidCombinationOfMALLUseForPState = 
mode_lib->ms.support.InvalidCombinationOfMALLUseForPState; out->informative.mode_support_info.ExceededMALLSize = mode_lib->ms.support.ExceededMALLSize; out->informative.mode_support_info.EnoughWritebackUnits = mode_lib->ms.support.EnoughWritebackUnits; - out->informative.mode_support_info.temp_read_or_ppt_support = mode_lib->ms.support.temp_read_or_ppt_support; + out->informative.mode_support_info.temp_read_or_ppt_support = mode_lib->ms.support.global_temp_read_or_ppt_supported; out->informative.mode_support_info.g6_temp_read_support = mode_lib->ms.support.g6_temp_read_support; out->informative.mode_support_info.ExceededMultistreamSlots = mode_lib->ms.support.ExceededMultistreamSlots; diff --git a/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/src/dml2_core/dml2_core_shared_types.h b/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/src/dml2_core/dml2_core_shared_types.h index 051c31ec2f0e..6d13d4c9b69a 100644 --- a/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/src/dml2_core/dml2_core_shared_types.h +++ b/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/src/dml2_core/dml2_core_shared_types.h @@ -264,8 +264,11 @@ struct dml2_core_internal_mode_support_info { bool DCCMetaBufferSizeNotExceeded; enum dml2_pstate_change_support DRAMClockChangeSupport[DML2_MAX_PLANES]; enum dml2_pstate_change_support FCLKChangeSupport[DML2_MAX_PLANES]; + enum dml2_pstate_change_support temp_read_or_ppt_support[DML2_MAX_PLANES]; + bool global_dram_clock_change_support_required; bool global_dram_clock_change_supported; bool global_fclk_change_supported; + bool global_temp_read_or_ppt_supported; bool USRRetrainingSupport; bool AvgBandwidthSupport; bool UrgVactiveBandwidthSupport; @@ -336,7 +339,6 @@ struct dml2_core_internal_mode_support_info { bool incorrect_imall_usage; bool g6_temp_read_support; - bool temp_read_or_ppt_support; struct dml2_core_internal_watermarks watermarks; bool dcfclk_support; @@ -646,7 +648,7 @@ struct dml2_core_internal_mode_support { unsigned int DSTYAfterScaler[DML2_MAX_PLANES]; 
unsigned int DSTXAfterScaler[DML2_MAX_PLANES]; - enum dml2_pstate_method pstate_switch_modes[DML2_MAX_PLANES]; + enum dml2_pstate_method uclk_pstate_switch_modes[DML2_MAX_PLANES]; }; /// @brief A mega structure that houses various info for model programming step. @@ -837,6 +839,7 @@ struct dml2_core_internal_mode_program { double max_urgent_latency_us; double df_response_time_us; + enum dml2_pstate_method uclk_pstate_switch_modes[DML2_MAX_PLANES]; // ------------------- // Output // ------------------- @@ -963,11 +966,12 @@ struct dml2_core_internal_mode_program { double MaxActiveFCLKChangeLatencySupported; bool USRRetrainingSupport; bool g6_temp_read_support; - bool temp_read_or_ppt_support; enum dml2_pstate_change_support FCLKChangeSupport[DML2_MAX_PLANES]; enum dml2_pstate_change_support DRAMClockChangeSupport[DML2_MAX_PLANES]; + enum dml2_pstate_change_support temp_read_or_ppt_support[DML2_MAX_PLANES]; bool global_dram_clock_change_supported; bool global_fclk_change_supported; + bool global_temp_read_or_ppt_supported; double MaxActiveDRAMClockChangeLatencySupported[DML2_MAX_PLANES]; double WritebackAllowFCLKChangeEndPosition[DML2_MAX_PLANES]; double WritebackAllowDRAMClockChangeEndPosition[DML2_MAX_PLANES]; @@ -1313,7 +1317,7 @@ struct dml2_core_calcs_CalculateVMRowAndSwath_params { unsigned int HostVMMinPageSize; unsigned int DCCMetaBufferSizeBytes; bool mrq_present; - enum dml2_pstate_method *pstate_switch_modes; + enum dml2_pstate_method *uclk_pstate_switch_modes; // Output bool *PTEBufferSizeNotExceeded; @@ -1740,10 +1744,12 @@ struct dml2_core_calcs_CalculateWatermarksMALLUseAndDRAMSpeedChangeSupport_param unsigned int max_request_size_bytes; unsigned int *meta_row_height_l; unsigned int *meta_row_height_c; + enum dml2_pstate_method *uclk_pstate_switch_modes; // Output struct dml2_core_internal_watermarks *Watermark; enum dml2_pstate_change_support *DRAMClockChangeSupport; + bool *global_dram_clock_change_support_required; bool 
*global_dram_clock_change_supported; double *MaxActiveDRAMClockChangeLatencySupported; unsigned int *SubViewportLinesNeededInMALL; @@ -1754,10 +1760,10 @@ struct dml2_core_calcs_CalculateWatermarksMALLUseAndDRAMSpeedChangeSupport_param double *VActiveLatencyHidingMargin; double *VActiveLatencyHidingUs; bool *g6_temp_read_support; - bool *temp_read_or_ppt_support; + enum dml2_pstate_change_support *temp_read_or_ppt_support; + bool *global_temp_read_or_ppt_supported; }; - struct dml2_core_calcs_CalculateSwathAndDETConfiguration_params { const struct dml2_display_cfg *display_cfg; unsigned int ConfigReturnBufferSizeInKByte; diff --git a/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/src/dml2_core/dml2_core_utils.c b/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/src/dml2_core/dml2_core_utils.c index 5f301befed16..b57d0f6ea6a1 100644 --- a/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/src/dml2_core/dml2_core_utils.c +++ b/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/src/dml2_core/dml2_core_utils.c @@ -306,6 +306,8 @@ void dml2_core_utils_print_mode_support_info(const struct dml2_core_internal_mod DML_LOG_VERBOSE("DML: support: ExceededMALLSize = %d\n", support->ExceededMALLSize); if (!fail_only || support->g6_temp_read_support == 0) DML_LOG_VERBOSE("DML: support: g6_temp_read_support = %d\n", support->g6_temp_read_support); + if (!fail_only || (support->global_dram_clock_change_supported == 0 && support->global_dram_clock_change_support_required)) + DML_LOG_VERBOSE("DML: support: dram_clock_change_support = %d\n", support->global_dram_clock_change_supported); if (!fail_only || support->ImmediateFlipSupport == 0) DML_LOG_VERBOSE("DML: support: ImmediateFlipSupport = %d\n", support->ImmediateFlipSupport); if (!fail_only || support->LinkCapacitySupport == 0) From 1b3246352af8761f00c98f4ee9502e91634c33ed Mon Sep 17 00:00:00 2001 From: Austin Zheng Date: Fri, 3 Oct 2025 10:39:49 -0400 Subject: [PATCH 35/83] drm/amd/display: Refactor VActive implementation [Why & How] Refactors 
VActive accounting in PMO, and breaks down fill time requirement by P-State type as it can result in drastically different bandwidth requirements depending on the blackout length. Reviewed-by: Dillon Varone Signed-off-by: Austin Zheng Signed-off-by: Ray Wu Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- .../dml21/inc/dml_top_display_cfg_types.h | 12 +++++- .../dml21/inc/dml_top_soc_parameter_types.h | 2 +- .../dc/dml2_0/dml21/inc/dml_top_types.h | 8 ---- .../src/dml2_core/dml2_core_dcn4_calcs.c | 37 ++++++++-------- .../src/dml2_core/dml2_core_shared_types.h | 16 +++---- .../dml21/src/dml2_pmo/dml2_pmo_dcn4_fams2.c | 14 +++---- .../src/inc/dml2_internal_shared_types.h | 42 +++++++++++-------- 7 files changed, 69 insertions(+), 62 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/inc/dml_top_display_cfg_types.h b/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/inc/dml_top_display_cfg_types.h index da8e5c8b2244..35aa954248cd 100644 --- a/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/inc/dml_top_display_cfg_types.h +++ b/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/inc/dml_top_display_cfg_types.h @@ -87,6 +87,15 @@ enum dml2_output_link_dp_rate { dml2_dp_rate_uhbr20 = 6 }; +enum dml2_pstate_type { + dml2_pstate_type_uclk = 0, + dml2_pstate_type_fclk = 1, + dml2_pstate_type_ppt = 2, + dml2_pstate_type_temp_read = 3, + dml2_pstate_type_dummy_pstate = 4, + dml2_pstate_type_count = 5 +}; + enum dml2_uclk_pstate_change_strategy { dml2_uclk_pstate_change_strategy_auto = 0, dml2_uclk_pstate_change_strategy_force_vactive = 1, @@ -393,8 +402,7 @@ struct dml2_plane_parameters { // reserved_vblank_time_ns is the minimum time to reserve in vblank for Twait // The actual reserved vblank time used for the corresponding stream in mode_programming would be at least as much as this per-plane override. 
long reserved_vblank_time_ns; - unsigned int max_vactive_det_fill_delay_us; // 0 = no reserved time, +ve = explicit max delay - unsigned int vactive_latency_to_hide_for_pstate_admissibility_us; + unsigned int max_vactive_det_fill_delay_us[dml2_pstate_type_count]; // 0 = no reserved time, +ve = explicit max delay unsigned int gpuvm_min_page_size_kbytes; unsigned int hostvm_min_page_size_kbytes; diff --git a/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/inc/dml_top_soc_parameter_types.h b/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/inc/dml_top_soc_parameter_types.h index e87d04a734b5..1fbc520c2540 100644 --- a/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/inc/dml_top_soc_parameter_types.h +++ b/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/inc/dml_top_soc_parameter_types.h @@ -191,7 +191,7 @@ struct dml2_ip_capabilities { unsigned int subvp_prefetch_end_to_mall_start_us; unsigned int subvp_fw_processing_delay; unsigned int max_vactive_det_fill_delay_us; - unsigned int ppt_max_allow_delay_ns; + unsigned int ppt_max_allow_delay_us; unsigned int temp_read_max_allow_delay_us; unsigned int dummy_pstate_max_allow_delay_us; /* FAMS2 delays */ diff --git a/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/inc/dml_top_types.h b/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/inc/dml_top_types.h index 8646ce5f1c01..d2584b00a19c 100644 --- a/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/inc/dml_top_types.h +++ b/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/inc/dml_top_types.h @@ -195,14 +195,6 @@ struct dml2_mcache_surface_allocation { } informative; }; -enum dml2_pstate_type { - dml2_pstate_type_uclk, - dml2_pstate_type_ppt, - dml2_pstate_type_temp_read, - dml2_pstate_type_dummy_pstate, - dml2_pstate_type_count -}; - enum dml2_pstate_method { dml2_pstate_method_na = 0, /* hw exclusive modes */ diff --git a/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/src/dml2_core/dml2_core_dcn4_calcs.c b/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/src/dml2_core/dml2_core_dcn4_calcs.c index 
e7a0f46e1289..df81bd963bb8 100644 --- a/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/src/dml2_core/dml2_core_dcn4_calcs.c +++ b/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/src/dml2_core/dml2_core_dcn4_calcs.c @@ -6972,7 +6972,7 @@ static void calculate_bytes_to_fetch_required_to_hide_latency( stream_index = p->display_cfg->plane_descriptors[plane_index].stream_index; - dst_lines_to_hide = (unsigned int)math_ceil(p->latency_to_hide_us / + dst_lines_to_hide = (unsigned int)math_ceil(p->latency_to_hide_us[0] / ((double)p->display_cfg->stream_descriptors[stream_index].timing.h_total / (double)p->display_cfg->stream_descriptors[stream_index].timing.pixel_clock_khz * 1000.0)); @@ -7069,9 +7069,9 @@ static void calculate_excess_vactive_bandwidth_required( excess_vactive_fill_bw_l[plane_index] = 0.0; excess_vactive_fill_bw_c[plane_index] = 0.0; - if (display_cfg->plane_descriptors[plane_index].overrides.max_vactive_det_fill_delay_us > 0) { - excess_vactive_fill_bw_l[plane_index] = (double)bytes_required_l[plane_index] / (double)display_cfg->plane_descriptors[plane_index].overrides.max_vactive_det_fill_delay_us; - excess_vactive_fill_bw_c[plane_index] = (double)bytes_required_c[plane_index] / (double)display_cfg->plane_descriptors[plane_index].overrides.max_vactive_det_fill_delay_us; + if (display_cfg->plane_descriptors[plane_index].overrides.max_vactive_det_fill_delay_us[dml2_pstate_type_uclk] > 0) { + excess_vactive_fill_bw_l[plane_index] = (double)bytes_required_l[plane_index] / (double)display_cfg->plane_descriptors[plane_index].overrides.max_vactive_det_fill_delay_us[dml2_pstate_type_uclk]; + excess_vactive_fill_bw_c[plane_index] = (double)bytes_required_c[plane_index] / (double)display_cfg->plane_descriptors[plane_index].overrides.max_vactive_det_fill_delay_us[dml2_pstate_type_uclk]; } } } @@ -9051,11 +9051,11 @@ static bool dml_core_mode_support(struct dml2_core_calcs_mode_support_ex *in_out calculate_bytes_to_fetch_required_to_hide_latency_params->swath_width_c = 
mode_lib->ms.SwathWidthC; calculate_bytes_to_fetch_required_to_hide_latency_params->swath_height_l = mode_lib->ms.SwathHeightY; calculate_bytes_to_fetch_required_to_hide_latency_params->swath_height_c = mode_lib->ms.SwathHeightC; - calculate_bytes_to_fetch_required_to_hide_latency_params->latency_to_hide_us = mode_lib->soc.power_management_parameters.dram_clk_change_blackout_us; + calculate_bytes_to_fetch_required_to_hide_latency_params->latency_to_hide_us[0] = mode_lib->soc.power_management_parameters.dram_clk_change_blackout_us; /* outputs */ - calculate_bytes_to_fetch_required_to_hide_latency_params->bytes_required_l = s->pstate_bytes_required_l; - calculate_bytes_to_fetch_required_to_hide_latency_params->bytes_required_c = s->pstate_bytes_required_c; + calculate_bytes_to_fetch_required_to_hide_latency_params->bytes_required_l = s->pstate_bytes_required_l[dml2_pstate_type_uclk]; + calculate_bytes_to_fetch_required_to_hide_latency_params->bytes_required_c = s->pstate_bytes_required_c[dml2_pstate_type_uclk]; calculate_bytes_to_fetch_required_to_hide_latency(calculate_bytes_to_fetch_required_to_hide_latency_params); @@ -9063,8 +9063,8 @@ static bool dml_core_mode_support(struct dml2_core_calcs_mode_support_ex *in_out calculate_excess_vactive_bandwidth_required( display_cfg, mode_lib->ms.num_active_planes, - s->pstate_bytes_required_l, - s->pstate_bytes_required_c, + s->pstate_bytes_required_l[dml2_pstate_type_uclk], + s->pstate_bytes_required_c[dml2_pstate_type_uclk], /* outputs */ mode_lib->ms.excess_vactive_fill_bw_l, mode_lib->ms.excess_vactive_fill_bw_c); @@ -9506,8 +9506,8 @@ static bool dml_core_mode_support(struct dml2_core_calcs_mode_support_ex *in_out calculate_vactive_det_fill_latency( display_cfg, mode_lib->ms.num_active_planes, - s->pstate_bytes_required_l, - s->pstate_bytes_required_c, + s->pstate_bytes_required_l[dml2_pstate_type_uclk], + s->pstate_bytes_required_c[dml2_pstate_type_uclk], mode_lib->ms.dcc_dram_bw_nom_overhead_factor_p0, 
mode_lib->ms.dcc_dram_bw_nom_overhead_factor_p1, mode_lib->ms.vactive_sw_bw_l, @@ -9515,7 +9515,7 @@ static bool dml_core_mode_support(struct dml2_core_calcs_mode_support_ex *in_out mode_lib->ms.surface_avg_vactive_required_bw, mode_lib->ms.surface_peak_required_bw, /* outputs */ - mode_lib->ms.dram_change_vactive_det_fill_delay_us); + mode_lib->ms.pstate_vactive_det_fill_delay_us[dml2_pstate_type_uclk]); #ifdef __DML_VBA_DEBUG__ DML_LOG_VERBOSE("DML::%s: max_urgent_latency_us = %f\n", __func__, s->mSOCParameters.max_urgent_latency_us); @@ -11009,11 +11009,11 @@ static bool dml_core_mode_programming(struct dml2_core_calcs_mode_programming_ex calculate_bytes_to_fetch_required_to_hide_latency_params->swath_width_c = mode_lib->mp.SwathWidthC; calculate_bytes_to_fetch_required_to_hide_latency_params->swath_height_l = mode_lib->mp.SwathHeightY; calculate_bytes_to_fetch_required_to_hide_latency_params->swath_height_c = mode_lib->mp.SwathHeightC; - calculate_bytes_to_fetch_required_to_hide_latency_params->latency_to_hide_us = mode_lib->soc.power_management_parameters.dram_clk_change_blackout_us; + calculate_bytes_to_fetch_required_to_hide_latency_params->latency_to_hide_us[0] = mode_lib->soc.power_management_parameters.dram_clk_change_blackout_us; /* outputs */ - calculate_bytes_to_fetch_required_to_hide_latency_params->bytes_required_l = s->pstate_bytes_required_l; - calculate_bytes_to_fetch_required_to_hide_latency_params->bytes_required_c = s->pstate_bytes_required_c; + calculate_bytes_to_fetch_required_to_hide_latency_params->bytes_required_l = s->pstate_bytes_required_l[dml2_pstate_type_uclk]; + calculate_bytes_to_fetch_required_to_hide_latency_params->bytes_required_c = s->pstate_bytes_required_c[dml2_pstate_type_uclk]; calculate_bytes_to_fetch_required_to_hide_latency(calculate_bytes_to_fetch_required_to_hide_latency_params); @@ -11021,8 +11021,8 @@ static bool dml_core_mode_programming(struct dml2_core_calcs_mode_programming_ex 
calculate_excess_vactive_bandwidth_required( display_cfg, s->num_active_planes, - s->pstate_bytes_required_l, - s->pstate_bytes_required_c, + s->pstate_bytes_required_l[dml2_pstate_type_uclk], + s->pstate_bytes_required_c[dml2_pstate_type_uclk], /* outputs */ mode_lib->mp.excess_vactive_fill_bw_l, mode_lib->mp.excess_vactive_fill_bw_c); @@ -12943,7 +12943,8 @@ void dml2_core_calcs_get_plane_support_info(const struct dml2_display_cfg *displ out->active_latency_hiding_us = (int)mode_lib->ms.VActiveLatencyHidingUs[plane_idx]; - out->dram_change_vactive_det_fill_delay_us = (unsigned int)math_ceil(mode_lib->ms.dram_change_vactive_det_fill_delay_us[plane_idx]); + out->vactive_det_fill_delay_us[dml2_pstate_type_uclk] = + (unsigned int)math_ceil(mode_lib->ms.pstate_vactive_det_fill_delay_us[plane_idx][dml2_pstate_type_uclk]); } void dml2_core_calcs_get_stream_support_info(const struct dml2_display_cfg *display_cfg, const struct dml2_core_internal_display_mode_lib *mode_lib, struct core_stream_support_info *out, int plane_index) diff --git a/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/src/dml2_core/dml2_core_shared_types.h b/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/src/dml2_core/dml2_core_shared_types.h index 6d13d4c9b69a..1087a8c926ff 100644 --- a/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/src/dml2_core/dml2_core_shared_types.h +++ b/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/src/dml2_core/dml2_core_shared_types.h @@ -593,7 +593,7 @@ struct dml2_core_internal_mode_support { double VActiveLatencyHidingMargin[DML2_MAX_PLANES]; double VActiveLatencyHidingUs[DML2_MAX_PLANES]; unsigned int MaxVStartupLines[DML2_MAX_PLANES]; - double dram_change_vactive_det_fill_delay_us[DML2_MAX_PLANES]; + double pstate_vactive_det_fill_delay_us[dml2_pstate_type_count][DML2_MAX_PLANES]; unsigned int num_mcaches_l[DML2_MAX_PLANES]; unsigned int mcache_row_bytes_l[DML2_MAX_PLANES]; @@ -623,8 +623,8 @@ struct dml2_core_internal_mode_support { unsigned int 
dpte_row_bytes_per_row_l[DML2_MAX_PLANES]; unsigned int dpte_row_bytes_per_row_c[DML2_MAX_PLANES]; - unsigned int pstate_bytes_required_l[DML2_MAX_PLANES]; - unsigned int pstate_bytes_required_c[DML2_MAX_PLANES]; + unsigned int pstate_bytes_required_l[dml2_pstate_type_count][DML2_MAX_PLANES]; + unsigned int pstate_bytes_required_c[dml2_pstate_type_count][DML2_MAX_PLANES]; unsigned int cursor_bytes_per_chunk[DML2_MAX_PLANES]; unsigned int cursor_bytes_per_line[DML2_MAX_PLANES]; @@ -1138,8 +1138,8 @@ struct dml2_core_calcs_mode_support_locals { unsigned int cursor_bytes[DML2_MAX_PLANES]; bool stream_visited[DML2_MAX_PLANES]; - unsigned int pstate_bytes_required_l[DML2_MAX_PLANES]; - unsigned int pstate_bytes_required_c[DML2_MAX_PLANES]; + unsigned int pstate_bytes_required_l[dml2_pstate_type_count][DML2_MAX_PLANES]; + unsigned int pstate_bytes_required_c[dml2_pstate_type_count][DML2_MAX_PLANES]; double prefetch_sw_bytes[DML2_MAX_PLANES]; double Tpre_rounded[DML2_MAX_PLANES]; @@ -1230,8 +1230,8 @@ struct dml2_core_calcs_mode_programming_locals { double Tr0_trips_flip_rounded[DML2_MAX_PLANES]; unsigned int per_pipe_flip_bytes[DML2_MAX_PLANES]; - unsigned int pstate_bytes_required_l[DML2_MAX_PLANES]; - unsigned int pstate_bytes_required_c[DML2_MAX_PLANES]; + unsigned int pstate_bytes_required_l[dml2_pstate_type_count][DML2_MAX_PLANES]; + unsigned int pstate_bytes_required_c[dml2_pstate_type_count][DML2_MAX_PLANES]; double prefetch_sw_bytes[DML2_MAX_PLANES]; double Tpre_rounded[DML2_MAX_PLANES]; @@ -2253,7 +2253,7 @@ struct dml2_core_calcs_calculate_bytes_to_fetch_required_to_hide_latency_params unsigned int *swath_width_c; unsigned int *swath_height_l; unsigned int *swath_height_c; - double latency_to_hide_us; + double latency_to_hide_us[DML2_MAX_PLANES]; /* outputs */ unsigned int *bytes_required_l; diff --git a/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/src/dml2_pmo/dml2_pmo_dcn4_fams2.c b/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/src/dml2_pmo/dml2_pmo_dcn4_fams2.c 
index abd210401fe2..c26e100fcaf2 100644 --- a/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/src/dml2_pmo/dml2_pmo_dcn4_fams2.c +++ b/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/src/dml2_pmo/dml2_pmo_dcn4_fams2.c @@ -1087,7 +1087,7 @@ static bool all_timings_support_drr(const struct dml2_pmo_instance *pmo, /* check required stretch is allowed */ if (stream_descriptor->timing.drr_config.max_instant_vtotal_delta > 0 && - stream_pstate_meta->method_drr.stretched_vtotal - stream_pstate_meta->nom_vtotal > stream_descriptor->timing.drr_config.max_instant_vtotal_delta) { + stream_pstate_meta->method_drr.stretched_vtotal - stream_pstate_meta->nom_vtotal > (int)stream_descriptor->timing.drr_config.max_instant_vtotal_delta) { return false; } } @@ -1669,15 +1669,15 @@ static int get_vactive_pstate_margin(const struct display_configuation_with_meta return min_vactive_margin_us; } -static unsigned int get_vactive_det_fill_latency_delay_us(const struct display_configuation_with_meta *display_cfg, int plane_mask) +static int get_vactive_det_fill_latency_delay_us(const struct display_configuation_with_meta *display_cfg, int plane_mask) { unsigned char i; - unsigned int max_vactive_fill_us = 0; + int max_vactive_fill_us = 0; for (i = 0; i < DML2_MAX_PLANES; i++) { if (is_bit_set_in_bitfield(plane_mask, i)) { - if (display_cfg->mode_support_result.cfg_support_info.plane_support_info[i].dram_change_vactive_det_fill_delay_us > max_vactive_fill_us) - max_vactive_fill_us = display_cfg->mode_support_result.cfg_support_info.plane_support_info[i].dram_change_vactive_det_fill_delay_us; + if (display_cfg->mode_support_result.cfg_support_info.plane_support_info[i].vactive_det_fill_delay_us[dml2_pstate_type_uclk] > max_vactive_fill_us) + max_vactive_fill_us = display_cfg->mode_support_result.cfg_support_info.plane_support_info[i].vactive_det_fill_delay_us[dml2_pstate_type_uclk]; } } @@ -2095,7 +2095,7 @@ static void setup_planes_for_vactive_by_mask(struct display_configuation_with_me 
display_config->stage3.pstate_switch_modes[plane_index] = dml2_pstate_method_vactive; if (!pmo->options->disable_vactive_det_fill_bw_pad) { - display_config->display_config.plane_descriptors[plane_index].overrides.max_vactive_det_fill_delay_us = + display_config->display_config.plane_descriptors[plane_index].overrides.max_vactive_det_fill_delay_us[dml2_pstate_type_uclk] = (unsigned int)math_floor(pmo->scratch.pmo_dcn4.stream_pstate_meta[stream_index].method_vactive.max_vactive_det_fill_delay_us); } } @@ -2116,7 +2116,7 @@ static void setup_planes_for_vactive_drr_by_mask(struct display_configuation_wit display_config->stage3.pstate_switch_modes[plane_index] = dml2_pstate_method_fw_vactive_drr; if (!pmo->options->disable_vactive_det_fill_bw_pad) { - display_config->display_config.plane_descriptors[plane_index].overrides.max_vactive_det_fill_delay_us = + display_config->display_config.plane_descriptors[plane_index].overrides.max_vactive_det_fill_delay_us[dml2_pstate_type_uclk] = (unsigned int)math_floor(pmo->scratch.pmo_dcn4.stream_pstate_meta[stream_index].method_vactive.max_vactive_det_fill_delay_us); } } diff --git a/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/src/inc/dml2_internal_shared_types.h b/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/src/inc/dml2_internal_shared_types.h index 9f562f0c4797..1a6c0727cd2a 100644 --- a/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/src/inc/dml2_internal_shared_types.h +++ b/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/src/inc/dml2_internal_shared_types.h @@ -152,7 +152,7 @@ struct core_plane_support_info { int active_latency_hiding_us; int mall_svp_size_requirement_ways; int nominal_vblank_pstate_latency_hiding_us; - unsigned int dram_change_vactive_det_fill_delay_us; + int vactive_det_fill_delay_us[dml2_pstate_type_count]; }; struct core_stream_support_info { @@ -209,6 +209,7 @@ struct dml2_core_mode_support_result { unsigned int uclk_pstate_supported; unsigned int fclk_pstate_supported; + struct dml2_core_internal_watermarks 
watermarks; } global; struct { @@ -257,8 +258,8 @@ struct dml2_implicit_svp_meta { struct dml2_pstate_per_method_common_meta { /* generic params */ - unsigned int allow_start_otg_vline; - unsigned int allow_end_otg_vline; + int allow_start_otg_vline; + int allow_end_otg_vline; /* scheduling params */ double allow_time_us; double disallow_time_us; @@ -268,39 +269,44 @@ struct dml2_pstate_per_method_common_meta { struct dml2_pstate_meta { bool valid; double otg_vline_time_us; - unsigned int scheduling_delay_otg_vlines; - unsigned int vertical_interrupt_ack_delay_otg_vlines; - unsigned int allow_to_target_delay_otg_vlines; - unsigned int contention_delay_otg_vlines; - unsigned int min_allow_width_otg_vlines; - unsigned int nom_vtotal; - unsigned int vblank_start; + int scheduling_delay_otg_vlines; + int vertical_interrupt_ack_delay_otg_vlines; + int allow_to_target_delay_otg_vlines; + int contention_delay_otg_vlines; + int min_allow_width_otg_vlines; + int nom_vtotal; + int vblank_start; double nom_refresh_rate_hz; double nom_frame_time_us; - unsigned int max_vtotal; + int max_vtotal; double min_refresh_rate_hz; double max_frame_time_us; - unsigned int blackout_otg_vlines; + int blackout_otg_vlines; + int max_allow_delay_otg_vlines; + double nom_vblank_time_us; struct { double max_vactive_det_fill_delay_us; - unsigned int max_vactive_det_fill_delay_otg_vlines; + double vactive_latency_hiding_us; + double reserved_vblank_required_us; + int max_vactive_det_fill_delay_otg_vlines; + int reserved_blank_required_vlines; struct dml2_pstate_per_method_common_meta common; } method_vactive; struct { struct dml2_pstate_per_method_common_meta common; } method_vblank; struct { - unsigned int programming_delay_otg_vlines; - unsigned int df_throttle_delay_otg_vlines; - unsigned int prefetch_to_mall_delay_otg_vlines; + int programming_delay_otg_vlines; + int df_throttle_delay_otg_vlines; + int prefetch_to_mall_delay_otg_vlines; unsigned long phantom_vactive; unsigned long 
phantom_vfp; unsigned long phantom_vtotal; struct dml2_pstate_per_method_common_meta common; } method_subvp; struct { - unsigned int programming_delay_otg_vlines; - unsigned int stretched_vtotal; + int programming_delay_otg_vlines; + int stretched_vtotal; struct dml2_pstate_per_method_common_meta common; } method_drr; }; From 717b836c27d9a4d5bcaae74fa83f7b3312d45620 Mon Sep 17 00:00:00 2001 From: Austin Zheng Date: Tue, 30 Sep 2025 14:32:03 -0500 Subject: [PATCH 36/83] drm/amd/display: Add Pstate viewport reduction [Why/How] Add struct to hold calculated reduced viewport pstate recout reduction lines per plane Reviewed-by: Dillon Varone Signed-off-by: Austin Zheng Signed-off-by: Ray Wu Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/dml2_0/dml21/inc/dml_top_types.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/inc/dml_top_types.h b/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/inc/dml_top_types.h index d2584b00a19c..452e4a2e72c0 100644 --- a/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/inc/dml_top_types.h +++ b/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/inc/dml_top_types.h @@ -676,6 +676,8 @@ struct dml2_display_cfg_programming { unsigned int PrefetchMode[DML2_MAX_PLANES]; // LEGACY_ONLY bool ROBUrgencyAvoidance; double LowestPrefetchMargin; + + unsigned int pstate_recout_reduction_lines[DML2_MAX_PLANES]; } misc; struct dml2_mode_support_info mode_support_info; From b0ba3108e3f8b20d8631ca4475a1a6d171973651 Mon Sep 17 00:00:00 2001 From: Joshua Aberback Date: Thu, 23 Oct 2025 16:43:56 -0400 Subject: [PATCH 37/83] drm/amd/display: Persist stream refcount through restore [Why & How] Overwriting the refcount on stream restore can lead to double-free errors or memory leaks if an unbalanced number of retains and releases occurs between a backup and restore. 
Reviewed-by: Dillon Varone Signed-off-by: Joshua Aberback Signed-off-by: Ray Wu Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/core/dc.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc.c b/drivers/gpu/drm/amd/display/dc/core/dc.c index b720e007c654..f519e5893a68 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc.c @@ -3389,7 +3389,11 @@ static void restore_planes_and_stream_state( for (i = 0; i < status->plane_count; i++) { dc_plane_copy_config(status->plane_states[i], &scratch->plane_states[i]); } + + // refcount is persistent + struct kref temp_refcount = stream->refcount; *stream = scratch->stream_state; + stream->refcount = temp_refcount; } /** From ac7f4fcc7b7d3b922a0001a101f4ce1bd8824a5e Mon Sep 17 00:00:00 2001 From: Dillon Varone Date: Thu, 23 Oct 2025 17:07:04 -0400 Subject: [PATCH 38/83] drm/amd/display: Revert DCN4 max buffered cursor size to 64 [Why & How] The buffered cursor cap is expressed assuming a square cursor, and usage of the cursor buffer is limited by the request size. For greater than 32 pixels, the request size is fixed at 256 bytes, so the maximum width must be floored to the nearest 256th byte. At 4bpp this means even with 24kB DCN4 can only hold a 64x64 cursor in the buffer as even 65 pixels would require 512 bytes per line instead of 256. 
Reviewed-by: Alvin Lee Signed-off-by: Dillon Varone Signed-off-by: Ray Wu Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.c b/drivers/gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.c index df0b664c0cd2..b276fec3e479 100644 --- a/drivers/gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.c +++ b/drivers/gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.c @@ -2200,7 +2200,8 @@ static bool dcn32_resource_construct( dc->caps.i2c_speed_in_khz_hdcp = 100; /*1.4 w/a applied by default*/ /* TODO: Bring max_cursor_size back to 256 after subvp cursor corruption is fixed*/ dc->caps.max_cursor_size = 64; - dc->caps.max_buffered_cursor_size = 64; // sqrt(16 * 1024 / 4) + /* floor(sqrt(buf_size_bytes / bpp ) * bpp, fixed_req_size) / bpp = max_width */ + dc->caps.max_buffered_cursor_size = 64; // floor(sqrt(16 * 1024 / 4) * 4, 256) / 4 = 64 dc->caps.min_horizontal_blanking_period = 80; dc->caps.dmdata_alloc_size = 2048; dc->caps.mall_size_per_mem_channel = 4; From 54963d18a8e99e3eb5fe39b73c83b66fe8caf43f Mon Sep 17 00:00:00 2001 From: Alvin Lee Date: Thu, 23 Oct 2025 13:56:32 -0400 Subject: [PATCH 39/83] drm/amd/display: Increase IB mem size [Why & How] Increase IB mem size to match size of largest structure that will use IB transfer between driver and DMU. 
Reviewed-by: Oleh Kuzhylnyi Signed-off-by: Alvin Lee Signed-off-by: Ray Wu Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dmub/src/dmub_srv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dmub/src/dmub_srv.c b/drivers/gpu/drm/amd/display/dmub/src/dmub_srv.c index a657efda89ce..a6ae1d2e9685 100644 --- a/drivers/gpu/drm/amd/display/dmub/src/dmub_srv.c +++ b/drivers/gpu/drm/amd/display/dmub/src/dmub_srv.c @@ -66,7 +66,7 @@ #define DMUB_SCRATCH_MEM_SIZE (1024) /* Default indirect buffer size. */ -#define DMUB_IB_MEM_SIZE (1280) +#define DMUB_IB_MEM_SIZE (2560) /* Default LSDMA ring buffer size. */ #define DMUB_LSDMA_RB_SIZE (64 * 1024) From 678c901443a6d2e909e3b51331a20f9d8f84ce82 Mon Sep 17 00:00:00 2001 From: Alex Hung Date: Wed, 22 Oct 2025 16:19:34 -0600 Subject: [PATCH 40/83] drm/amd/display: Fix black screen with HDMI outputs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [Why & How] This fixes the black screen issue on certain APUs with HDMI, accompanied by the following messages: amdgpu 0000:c4:00.0: amdgpu: [drm] Failed to setup vendor info frame on connector DP-1: -22 amdgpu 0000:c4:00.0: [drm] Cannot find any crtc or sizes [drm] Cannot find any crtc or sizes Fixes: 489f0f600ce2 ("drm/amd/display: Fix DVI-D/HDMI adapters") Suggested-by: Timur Kristóf Reviewed-by: Harry Wentland Signed-off-by: Alex Hung Signed-off-by: Ray Wu Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/link/link_detection.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/display/dc/link/link_detection.c b/drivers/gpu/drm/amd/display/dc/link/link_detection.c index c417780f37bc..5d287874c125 100644 --- a/drivers/gpu/drm/amd/display/dc/link/link_detection.c +++ b/drivers/gpu/drm/amd/display/dc/link/link_detection.c @@ -1257,6 +1257,7 @@ static bool detect_link_and_local_sink(struct dc_link *link, 
!sink->edid_caps.edid_hdmi) sink->sink_signal = SIGNAL_TYPE_DVI_SINGLE_LINK; else if (dc_is_dvi_signal(sink->sink_signal) && + dc_is_dvi_signal(link->connector_signal) && aud_support->hdmi_audio_native && sink->edid_caps.edid_hdmi) sink->sink_signal = SIGNAL_TYPE_HDMI_TYPE_A; From 3f0c27edd8fa0dd1e64d232687057a0be4ea836c Mon Sep 17 00:00:00 2001 From: Taimur Hassan Date: Fri, 24 Oct 2025 18:42:39 -0400 Subject: [PATCH 41/83] drm/amd/display: [FW Promotion] Release 0.1.34.0 Release highlights DCN35/36 * Dynamically clock gate before and after prefetch Acked-by: Wayne Lin Signed-off-by: Taimur Hassan Signed-off-by: Ray Wu Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- .../gpu/drm/amd/display/dmub/inc/dmub_cmd.h | 439 ++++++++++++++++-- 1 file changed, 407 insertions(+), 32 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dmub/inc/dmub_cmd.h b/drivers/gpu/drm/amd/display/dmub/inc/dmub_cmd.h index 772e07a1a959..9bc512a522e0 100644 --- a/drivers/gpu/drm/amd/display/dmub/inc/dmub_cmd.h +++ b/drivers/gpu/drm/amd/display/dmub/inc/dmub_cmd.h @@ -599,6 +599,104 @@ union replay_hw_flags { uint32_t u32All; }; +/** + * Flags that can be set by driver to change some Panel Replay behaviour. + */ +union pr_debug_flags { + struct { + /** + * 0x1 (bit 0) + * Enable visual confirm in FW. + */ + uint32_t visual_confirm : 1; + + /** + * 0x2 (bit 1) + * @skip_crc: Set if need to skip CRC.
+ */ + uint32_t skip_crc : 1; + + /** + * 0x4 (bit 2) + * @force_link_power_on: Force disable ALPM control + */ + uint32_t force_link_power_on : 1; + + /** + * 0x8 (bit 3) + * @force_phy_power_on: Force phy power on + */ + uint32_t force_phy_power_on : 1; + + /** + * 0x10 (bit 4) + * @skip_crtc_disabled: CRTC disable skipped + */ + uint32_t skip_crtc_disabled : 1; + + /* + * 0x20 (bit 5) + * @visual_confirm_rate_control: Enable Visual Confirm rate control detection + */ + uint32_t visual_confirm_rate_control : 1; + + uint32_t reserved : 26; + } bitfields; + + uint32_t u32All; +}; + +union pr_hw_flags { + struct { + /** + * @allow_alpm_fw_standby_mode: To indicate whether the + * ALPM FW standby mode is allowed + */ + uint32_t allow_alpm_fw_standby_mode : 1; + + /* + * @dsc_enable_status: DSC enable status in driver + */ + uint32_t dsc_enable_status : 1; + + /** + * @fec_enable_status: receive fec enable/disable status from driver + */ + uint32_t fec_enable_status : 1; + + /* + * @smu_optimizations_en: SMU power optimization. + * Only when active display is Replay capable and display enters Replay. + * Trigger interrupt to SMU to powerup/down. + */ + uint32_t smu_optimizations_en : 1; + + /** + * @phy_power_state: Indicates current phy power state + */ + uint32_t phy_power_state : 1; + + /** + * @link_power_state: Indicates current link power state + */ + uint32_t link_power_state : 1; + /** + * Use TPS3 signal when restore main link. + */ + uint32_t force_wakeup_by_tps3 : 1; + /** + * @is_alpm_initialized: Indicates whether ALPM is initialized + */ + uint32_t is_alpm_initialized : 1; + /** + * @alpm_mode: Indicates ALPM mode selected + */ + uint32_t alpm_mode : 2; + } bitfields; + + uint32_t u32All; +}; + union fw_assisted_mclk_switch_version { struct { uint8_t minor : 5; @@ -1732,9 +1830,15 @@ enum dmub_cmd_type { DMUB_CMD__CURSOR_OFFLOAD = 92, /** - * Command type used for all SMART_POWER_HDR commands. + * Command type used for all SMART_POWER_OLED commands. 
*/ - DMUB_CMD__SMART_POWER_HDR = 93, + DMUB_CMD__SMART_POWER_OLED = 93, + + /** + * Command type use for all Panel Replay commands. + */ + DMUB_CMD__PR = 94, + /** * Command type use for VBIOS shared commands. @@ -4147,6 +4251,33 @@ enum replay_state { REPLAY_STATE_INVALID = 0xFF, }; +/** + * Definition of a panel replay state + */ +enum pr_state { + PR_STATE_0 = 0x00, // State 0 steady state + // Pending SDP and Unlock before back to State 0 + PR_STATE_0_PENDING_SDP_AND_UNLOCK = 0x01, + PR_STATE_1 = 0x10, // State 1 + PR_STATE_2 = 0x20, // State 2 steady state + // Pending frame transmission before transition to State 2 + PR_STATE_2_PENDING_FRAME_TRANSMISSION = 0x30, + // Active and Powered Up + PR_STATE_2_POWERED = 0x31, + // Active and Powered Down, but need to blank HUBP after DPG_EN latch + PR_STATE_2_PENDING_HUBP_BLANK = 0x32, + // Active and Pending Power Up + PR_STATE_2_PENDING_POWER_UP = 0x33, + // Active and Powered Up, Pending DPG latch + PR_STATE_2_PENDING_LOCK_FOR_DPG_POWER_ON = 0x34, + // Active and Powered Up, Pending SDP and Unlock + PR_STATE_2_PENDING_SDP_AND_UNLOCK = 0x35, + // Pending transmission of AS SDP for timing sync, but no rfb update + PR_STATE_2_PENDING_AS_SDP = 0x36, + // Invalid + PR_STATE_INVALID = 0xFF, +}; + /** * Replay command sub-types. */ @@ -4197,6 +4328,25 @@ enum dmub_cmd_replay_type { DMUB_CMD__REPLAY_SET_GENERAL_CMD = 16, }; +/* + * Panel Replay sub-types + */ +enum dmub_cmd_panel_replay_type { + DMUB_CMD__PR_ENABLE = 0, + DMUB_CMD__PR_COPY_SETTINGS = 1, + DMUB_CMD__PR_UPDATE_STATE = 2, + DMUB_CMD__PR_GENERAL_CMD = 3, +}; + +enum dmub_cmd_panel_replay_state_update_subtype { + PR_STATE_UPDATE_COASTING_VTOTAL = 0x1, + PR_STATE_UPDATE_SYNC_MODE = 0x2, +}; + +enum dmub_cmd_panel_replay_general_subtype { + PR_GENERAL_CMD_DEBUG_OPTION = 0x1, +}; + /** * Replay general command sub-types. */ @@ -4349,17 +4499,13 @@ struct dmub_cmd_replay_set_version_data { */ uint8_t panel_inst; /** - * PSR version that FW should implement. 
+ * Replay version that FW should implement. */ enum replay_version version; - /** - * PSR control version. - */ - uint8_t cmd_version; /** * Explicit padding to 4 byte boundary. */ - uint8_t pad[2]; + uint8_t pad[3]; }; /** @@ -4405,11 +4551,11 @@ enum replay_enable { }; /** - * Data passed from driver to FW in a DMUB_CMD__SMART_POWER_HDR_ENABLE command. + * Data passed from driver to FW in a DMUB_CMD__SMART_POWER_OLED_ENABLE command. */ -struct dmub_rb_cmd_smart_power_hdr_enable_data { +struct dmub_rb_cmd_smart_power_oled_enable_data { /** - * SMART_POWER_HDR enable or disable. + * SMART_POWER_OLED enable or disable. */ uint8_t enable; /** @@ -4777,53 +4923,53 @@ union dmub_replay_cmd_set { }; /** - * SMART POWER HDR command sub-types. + * SMART POWER OLED command sub-types. */ -enum dmub_cmd_smart_power_hdr_type { +enum dmub_cmd_smart_power_oled_type { /** - * Enable/Disable SMART_POWER_HDR. + * Enable/Disable SMART_POWER_OLED. */ - DMUB_CMD__SMART_POWER_HDR_ENABLE = 1, + DMUB_CMD__SMART_POWER_OLED_ENABLE = 1, /** - * Get current MaxCLL value if SMART POWER HDR is enabled. + * Get current MaxCLL value if SMART POWER OLED is enabled. */ - DMUB_CMD__SMART_POWER_HDR_GETMAXCLL = 2, + DMUB_CMD__SMART_POWER_OLED_GETMAXCLL = 2, }; /** - * Definition of a DMUB_CMD__SMART_POWER_HDR command. + * Definition of a DMUB_CMD__SMART_POWER_OLED command. */ -struct dmub_rb_cmd_smart_power_hdr_enable { +struct dmub_rb_cmd_smart_power_oled_enable { /** * Command header. */ struct dmub_cmd_header header; - struct dmub_rb_cmd_smart_power_hdr_enable_data data; + struct dmub_rb_cmd_smart_power_oled_enable_data data; }; -struct dmub_cmd_smart_power_hdr_getmaxcll_input { +struct dmub_cmd_smart_power_oled_getmaxcll_input { uint8_t panel_inst; uint8_t pad[3]; }; -struct dmub_cmd_smart_power_hdr_getmaxcll_output { +struct dmub_cmd_smart_power_oled_getmaxcll_output { uint16_t current_max_cll; uint8_t pad[2]; }; /** - * Definition of a DMUB_CMD__SMART_POWER_HDR command. 
+ * Definition of a DMUB_CMD__SMART_POWER_OLED command. */ -struct dmub_rb_cmd_smart_power_hdr_getmaxcll { +struct dmub_rb_cmd_smart_power_oled_getmaxcll { struct dmub_cmd_header header; /**< Command header */ /** - * Data passed from driver to FW in a DMUB_CMD__SMART_POWER_HDR_GETMAXCLL command. + * Data passed from driver to FW in a DMUB_CMD__SMART_POWER_OLED_GETMAXCLL command. */ - union dmub_cmd_smart_power_hdr_getmaxcll_data { - struct dmub_cmd_smart_power_hdr_getmaxcll_input input; /**< Input */ - struct dmub_cmd_smart_power_hdr_getmaxcll_output output; /**< Output */ + union dmub_cmd_smart_power_oled_getmaxcll_data { + struct dmub_cmd_smart_power_oled_getmaxcll_input input; /**< Input */ + struct dmub_cmd_smart_power_oled_getmaxcll_output output; /**< Output */ uint32_t output_raw; /**< Raw data output */ } data; }; @@ -6356,6 +6502,223 @@ struct dmub_rb_cmd_cursor_offload_stream_cntl { struct dmub_cmd_cursor_offload_stream_data data; }; +/** + * Data passed from driver to FW in a DMUB_CMD__PR_ENABLE command. + */ +struct dmub_cmd_pr_enable_data { + /** + * Panel Replay enable or disable. + */ + uint8_t enable; + /** + * Panel Instance. + * Panel instance to identify which replay_state to use + * Currently the support is only for 0 or 1 + */ + uint8_t panel_inst; + /** + * Phy state to enter. + * Values to use are defined in dmub_phy_fsm_state + */ + uint8_t phy_fsm_state; + /** + * Phy rate for DP - RBR/HBR/HBR2/HBR3. + * Set this using enum phy_link_rate. + * This does not support HDMI/DP2 for now. + */ + uint8_t phy_rate; + /** + * @hpo_stream_enc_inst: HPO stream encoder instance + */ + uint8_t hpo_stream_enc_inst; + /** + * @hpo_link_enc_inst: HPO link encoder instance + */ + uint8_t hpo_link_enc_inst; + /** + * @pad: Align structure to 4 byte boundary. + */ + uint8_t pad[2]; +}; + +/** + * Definition of a DMUB_CMD__PR_ENABLE command. + * Panel Replay enable/disable is controlled using action in data.
+ */ +struct dmub_rb_cmd_pr_enable { + /** + * Command header. + */ + struct dmub_cmd_header header; + + struct dmub_cmd_pr_enable_data data; +}; + +/** + * Data passed from driver to FW in a DMUB_CMD__PR_COPY_SETTINGS command. + */ +struct dmub_cmd_pr_copy_settings_data { + /** + * Flags that can be set by driver to change some replay behaviour. + */ + union pr_debug_flags debug; + + /** + * @flags: Flags used to determine feature functionality. + */ + union pr_hw_flags flags; + + /** + * DPP HW instance. + */ + uint8_t dpp_inst; + /** + * OTG HW instance. + */ + uint8_t otg_inst; + /** + * DIG FE HW instance. + */ + uint8_t digfe_inst; + /** + * DIG BE HW instance. + */ + uint8_t digbe_inst; + /** + * AUX HW instance. + */ + uint8_t aux_inst; + /** + * Panel Instance. + * Panel instance to identify which psr_state to use + * Currently the support is only for 0 or 1 + */ + uint8_t panel_inst; + /** + * Length of each horizontal line in ns. + */ + uint32_t line_time_in_ns; + /** + * PHY instance. + */ + uint8_t dpphy_inst; + /** + * Determines if SMU optimizations are enabled/disabled. + */ + uint8_t smu_optimizations_en; + /* + * Use FSM state for Replay power up/down + */ + uint8_t use_phy_fsm; + /* + * Use FSFT afftet pixel clk + */ + uint32_t pix_clk_100hz; + /* + * Use Original pixel clock + */ + uint32_t sink_pix_clk_100hz; + /** + * Use for AUX-less ALPM LFPS wake operation + */ + struct dmub_alpm_auxless_data auxless_alpm_data; + /** + * @hpo_stream_enc_inst: HPO stream encoder instance + */ + uint8_t hpo_stream_enc_inst; + /** + * @hpo_link_enc_inst: HPO link encoder instance + */ + uint8_t hpo_link_enc_inst; + /** + * @pad: Align structure to 4 byte boundary. + */ + uint8_t pad[2]; +}; + +/** + * Definition of a DMUB_CMD__PR_COPY_SETTINGS command. + */ +struct dmub_rb_cmd_pr_copy_settings { + /** + * Command header. + */ + struct dmub_cmd_header header; + /** + * Data passed from driver to FW in a DMUB_CMD__PR_COPY_SETTINGS command.
+ */ + struct dmub_cmd_pr_copy_settings_data data; +}; + +struct dmub_cmd_pr_update_state_data { + /** + * Panel Instance. + * Panel instance to identify which psr_state to use + * Currently the support is only for 0 or 1 + */ + uint8_t panel_inst; + + uint8_t pad[3]; // align to 4-byte boundary + /* + * Update flags to control the update behavior. + */ + uint32_t update_flag; + /** + * state/data to set. + */ + uint32_t coasting_vtotal; + uint32_t sync_mode; +}; + +struct dmub_cmd_pr_general_cmd_data { + /** + * Panel Instance. + * Panel instance to identify which psr_state to use + * Currently the support is only for 0 or 1 + */ + uint8_t panel_inst; + /** + * subtype: PR general cmd sub type + */ + uint8_t subtype; + + uint8_t pad[2]; + /** + * config data by different subtypes + */ + union { + uint32_t u32All; + } data; +}; + +/** + * Definition of a DMUB_CMD__PR_UPDATE_STATE command. + */ +struct dmub_rb_cmd_pr_update_state { + /** + * Command header. + */ + struct dmub_cmd_header header; + /** + * Data passed from driver to FW in a DMUB_CMD__PR_UPDATE_STATE command. + */ + struct dmub_cmd_pr_update_state_data data; +}; + +/** + * Definition of a DMUB_CMD__PR_GENERAL_CMD command. + */ +struct dmub_rb_cmd_pr_general_cmd { + /** + * Command header. + */ + struct dmub_cmd_header header; + /** + * Data passed from driver to FW in a DMUB_CMD__PR_GENERAL_CMD command. + */ + struct dmub_cmd_pr_general_cmd_data data; +}; + /** * union dmub_rb_cmd - DMUB inbox command. */ @@ -6698,13 +7061,25 @@ union dmub_rb_cmd { */ struct dmub_rb_cmd_cursor_offload_stream_cntl cursor_offload_stream_ctnl; /** - * Definition of a DMUB_CMD__SMART_POWER_HDR_ENABLE command. + * Definition of a DMUB_CMD__SMART_POWER_OLED_ENABLE command. */ - struct dmub_rb_cmd_smart_power_hdr_enable smart_power_hdr_enable; + struct dmub_rb_cmd_smart_power_oled_enable smart_power_oled_enable; /** - * Definition of a DMUB_CMD__DMUB_CMD__SMART_POWER_HDR_GETMAXCLL command.
+ * Definition of a DMUB_CMD__SMART_POWER_OLED_GETMAXCLL command. */ - struct dmub_rb_cmd_smart_power_hdr_getmaxcll smart_power_hdr_getmaxcll; + struct dmub_rb_cmd_smart_power_oled_getmaxcll smart_power_oled_getmaxcll; + /* + * Definition of a DMUB_CMD__PR_COPY_SETTINGS command. + */ + struct dmub_rb_cmd_pr_copy_settings pr_copy_settings; + /** + * Definition of a DMUB_CMD__PR_ENABLE command. + */ + struct dmub_rb_cmd_pr_enable pr_enable; + + struct dmub_rb_cmd_pr_update_state pr_update_state; + + struct dmub_rb_cmd_pr_general_cmd pr_general_cmd; }; /** From 1da571bdb2b8b5b0a1b759cde1233c4cbf7ac00f Mon Sep 17 00:00:00 2001 From: Taimur Hassan Date: Fri, 24 Oct 2025 19:38:14 -0500 Subject: [PATCH 42/83] drm/amd/display: Promote DC to 3.2.357 This version brings along following update: - HDCP2 FW locality check refactors - Fix black screen issue with HDMI output - Increase IB mem size - Revert max buffered cursor size to 64 - Extend inbox0 lock to run Replay / PSR - Refactor VActive implementation - Add Pstate viewport reduction - Persist stream refcount through restore Acked-by: Wayne Lin Signed-off-by: Taimur Hassan Signed-off-by: Ray Wu Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/dc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/dc.h b/drivers/gpu/drm/amd/display/dc/dc.h index 010d9315b96b..75b25b2506a8 100644 --- a/drivers/gpu/drm/amd/display/dc/dc.h +++ b/drivers/gpu/drm/amd/display/dc/dc.h @@ -63,7 +63,7 @@ struct dcn_dsc_reg_state; struct dcn_optc_reg_state; struct dcn_dccg_reg_state; -#define DC_VER "3.2.356" +#define DC_VER "3.2.357" /** * MAX_SURFACES - representative of the upper bound of surfaces that can be piped to a single CRTC From 7cf422ed3386460ee13a3219f8192896f9759d77 Mon Sep 17 00:00:00 2001 From: Xiang Liu Date: Thu, 30 Oct 2025 22:38:49 +0800 Subject: [PATCH 43/83] drm/amd/ras: Fix format truncation MIME-Version: 1.0 Content-Type:
text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ../ras/rascore/ras_cper.c: In function ‘cper_generate_fatal_record.isra’: ../ras/rascore/ras_cper.c:75:36: error: ‘%llX’ directive output may be truncated writing between 1 and 14 bytes into a region of size between 0 and 7 [-Werror=format-truncation=] 75 | snprintf(record_id, 9, "%d:%llX", dev_info.socket_id, | ^~~~ ../ras/rascore/ras_cper.c:75:32: note: directive argument in the range [0, 72057594037927935] 75 | snprintf(record_id, 9, "%d:%llX", dev_info.socket_id, | ^~~~~~~~~ ../ras/rascore/ras_cper.c:75:9: note: ‘snprintf’ output between 4 and 27 bytes into a destination of size 9 75 | snprintf(record_id, 9, "%d:%llX", dev_info.socket_id, | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 76 | RAS_LOG_SEQNO_TO_BATCH_IDX(trace->seqno)); | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ../ras/rascore/ras_cper.c: In function ‘cper_generate_runtime_record.isra’: ../ras/rascore/ras_cper.c:75:36: error: ‘%llX’ directive output may be truncated writing between 1 and 14 bytes into a region of size between 0 and 7 [-Werror=format-truncation=] 75 | snprintf(record_id, 9, "%d:%llX", dev_info.socket_id, | ^~~~ ../ras/rascore/ras_cper.c:75:32: note: directive argument in the range [0, 72057594037927935] 75 | snprintf(record_id, 9, "%d:%llX", dev_info.socket_id, | ^~~~~~~~~ ../ras/rascore/ras_cper.c:75:9: note: ‘snprintf’ output between 4 and 27 bytes into a destination of size 9 75 | snprintf(record_id, 9, "%d:%llX", dev_info.socket_id, | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 76 | RAS_LOG_SEQNO_TO_BATCH_IDX(trace->seqno)); | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ cc1: all warnings being treated as errors Signed-off-by: Xiang Liu Reviewed-by: Lijo Lazar Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/ras/rascore/ras_cper.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_cper.c b/drivers/gpu/drm/amd/ras/rascore/ras_cper.c index 
3c5bfa1c93f6..0fc7522b7ab6 100644 --- a/drivers/gpu/drm/amd/ras/rascore/ras_cper.c +++ b/drivers/gpu/drm/amd/ras/rascore/ras_cper.c @@ -54,7 +54,7 @@ static void fill_section_hdr(struct ras_core_context *ras_core, enum ras_cper_severity sev, struct ras_log_info *trace) { struct device_system_info dev_info = {0}; - char record_id[16]; + char record_id[32]; hdr->signature[0] = 'C'; hdr->signature[1] = 'P'; @@ -71,7 +71,7 @@ static void fill_section_hdr(struct ras_core_context *ras_core, cper_get_timestamp(ras_core, &hdr->timestamp, trace->timestamp); - snprintf(record_id, 9, "%d:%llX", dev_info.socket_id, + snprintf(record_id, sizeof(record_id), "%d:%llX", dev_info.socket_id, RAS_LOG_SEQNO_TO_BATCH_IDX(trace->seqno)); memcpy(hdr->record_id, record_id, 8); From c72d41a8f3090c2ecc12afdde1bcf9ed726157ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Fri, 31 Oct 2025 09:21:36 +0100 Subject: [PATCH 44/83] drm/amdgpu: grab a BO reference in vm_lock_done_list. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Otherwise it is possible that between dropping the status lock and locking the BO that the BO is freed up. 
Signed-off-by: Christian König Reviewed-by: Sunil Khatri Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 9309830821b7..453d3b576456 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -484,15 +484,19 @@ int amdgpu_vm_lock_done_list(struct amdgpu_vm *vm, struct drm_exec *exec, spin_lock(&vm->status_lock); while (!list_is_head(prev->next, &vm->done)) { bo_va = list_entry(prev->next, typeof(*bo_va), base.vm_status); - spin_unlock(&vm->status_lock); bo = bo_va->base.bo; if (bo) { + amdgpu_bo_ref(bo); + spin_unlock(&vm->status_lock); + ret = drm_exec_prepare_obj(exec, &bo->tbo.base, 1); + amdgpu_bo_unref(&bo); if (unlikely(ret)) return ret; + + spin_lock(&vm->status_lock); } - spin_lock(&vm->status_lock); prev = prev->next; } spin_unlock(&vm->status_lock); From 36ffc58b8a8704e690a0ce679db26baa5759256f Mon Sep 17 00:00:00 2001 From: Pierre-Eric Pelloux-Prayer Date: Tue, 28 Oct 2025 14:09:05 +0100 Subject: [PATCH 45/83] drm/amdgpu: lock bo before calling amdgpu_vm_bo_update_shared MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BO's reservation object must be locked before using amdgpu_vm_bo_update_shared otherwise dma_resv_assert_held will complain in amdgpu_vm_update_shared. 
Signed-off-by: Pierre-Eric Pelloux-Prayer Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c index 8561ad7f6180..268d69d862e0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c @@ -81,13 +81,20 @@ static int amdgpu_dma_buf_attach(struct dma_buf *dmabuf, struct drm_gem_object *obj = dmabuf->priv; struct amdgpu_bo *bo = gem_to_amdgpu_bo(obj); struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev); + int r; if (!amdgpu_dmabuf_is_xgmi_accessible(attach_adev, bo) && pci_p2pdma_distance(adev->pdev, attach->dev, false) < 0) attach->peer2peer = false; + r = dma_resv_lock(bo->tbo.base.resv, NULL); + if (r) + return r; + amdgpu_vm_bo_update_shared(bo); + dma_resv_unlock(bo->tbo.base.resv); + return 0; } From cd6250f3aeb5ee81bf3724891f274f8a43737198 Mon Sep 17 00:00:00 2001 From: Sunil Khatri Date: Fri, 31 Oct 2025 14:10:13 +0530 Subject: [PATCH 46/83] drm/amdgpu: validate the bo from done list for NULL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make sure the bo is valid before using it. 
Signed-off-by: Sunil Khatri Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c index fd54423c3587..836a14ef0052 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c @@ -1062,6 +1062,8 @@ amdgpu_userq_vm_validate(struct amdgpu_userq_mgr *uq_mgr) /* Validate User Ptr BOs */ list_for_each_entry(bo_va, &vm->done, base.vm_status) { bo = bo_va->base.bo; + if (!bo) + continue; if (!amdgpu_ttm_tt_is_userptr(bo->tbo.ttm)) continue; From b480f573a8abd8f98474496fcf1eec86bf954002 Mon Sep 17 00:00:00 2001 From: Lijo Lazar Date: Thu, 14 Aug 2025 16:35:39 +0530 Subject: [PATCH 47/83] drm/amd/pm: Use gpu metrics 1.9 for SMUv13.0.12 Fill and publish GPU metrics in v1.9 format for SMUv13.0.12 SOCs Signed-off-by: Lijo Lazar Reviewed-by: Asad Kamal Signed-off-by: Alex Deucher --- .../drm/amd/pm/swsmu/smu13/smu_v13_0_12_ppt.c | 87 ++++++++----------- .../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 25 +++--- .../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.h | 4 +- 3 files changed, 50 insertions(+), 66 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_12_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_12_ppt.c index a0c844bf852c..f4bf0b558d86 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_12_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_12_ppt.c @@ -58,7 +58,7 @@ #define NUM_JPEG_RINGS_FW 10 #define NUM_JPEG_RINGS_GPU_METRICS(gpu_metrics) \ - (ARRAY_SIZE(gpu_metrics->xcp_stats[0].jpeg_busy) / 4) + (ARRAY_SIZE(gpu_metrics->jpeg_busy) / 4) const struct cmn2asic_mapping smu_v13_0_12_feature_mask_map[SMU_FEATURE_COUNT] = { SMU_13_0_12_FEA_MAP(SMU_FEATURE_DATA_CALCULATIONS_BIT, FEATURE_DATA_CALCULATION), @@ -772,22 +772,17 @@ ssize_t smu_v13_0_12_get_xcp_metrics(struct smu_context *smu, struct amdgpu_xcp return 
sizeof(*xcp_metrics); } -ssize_t smu_v13_0_12_get_gpu_metrics(struct smu_context *smu, void **table, void *smu_metrics) +void smu_v13_0_12_get_gpu_metrics(struct smu_context *smu, void **table, + void *smu_metrics, + struct smu_v13_0_6_gpu_metrics *gpu_metrics) { - struct smu_table_context *smu_table = &smu->smu_table; - struct gpu_metrics_v1_8 *gpu_metrics = - (struct gpu_metrics_v1_8 *)smu_table->gpu_metrics_table; - int ret = 0, xcc_id, inst, i, j, k, idx; struct amdgpu_device *adev = smu->adev; + int ret = 0, xcc_id, inst, i, j; u8 num_jpeg_rings_gpu_metrics; MetricsTable_t *metrics; - struct amdgpu_xcp *xcp; - u32 inst_mask; metrics = (MetricsTable_t *)smu_metrics; - smu_cmn_init_soft_gpu_metrics(gpu_metrics, 1, 8); - gpu_metrics->temperature_hotspot = SMUQ10_ROUND(metrics->MaxSocketTemperature); /* Individual HBM stack temperature is not reported */ @@ -877,57 +872,47 @@ ssize_t smu_v13_0_12_get_gpu_metrics(struct smu_context *smu, void **table, void gpu_metrics->xgmi_link_status[j] = ret; } - gpu_metrics->num_partition = adev->xcp_mgr->num_xcps; - num_jpeg_rings_gpu_metrics = NUM_JPEG_RINGS_GPU_METRICS(gpu_metrics); - for_each_xcp(adev->xcp_mgr, xcp, i) { - amdgpu_xcp_get_inst_details(xcp, AMDGPU_XCP_VCN, &inst_mask); - idx = 0; - for_each_inst(k, inst_mask) { - /* Both JPEG and VCN has same instances */ - inst = GET_INST(VCN, k); + for (i = 0; i < adev->vcn.num_vcn_inst; ++i) { + inst = GET_INST(VCN, i); - for (j = 0; j < num_jpeg_rings_gpu_metrics; ++j) { - gpu_metrics->xcp_stats[i].jpeg_busy - [(idx * num_jpeg_rings_gpu_metrics) + j] = - SMUQ10_ROUND(metrics->JpegBusy - [(inst * NUM_JPEG_RINGS_FW) + j]); - } - gpu_metrics->xcp_stats[i].vcn_busy[idx] = - SMUQ10_ROUND(metrics->VcnBusy[inst]); - idx++; + for (j = 0; j < num_jpeg_rings_gpu_metrics; ++j) { + gpu_metrics->jpeg_busy[(i * num_jpeg_rings_gpu_metrics) + + j] = + SMUQ10_ROUND( + metrics->JpegBusy[(inst * + NUM_JPEG_RINGS_FW) + + j]); } + gpu_metrics->vcn_busy[i] = 
SMUQ10_ROUND(metrics->VcnBusy[inst]); + } - amdgpu_xcp_get_inst_details(xcp, AMDGPU_XCP_GFX, &inst_mask); - idx = 0; - for_each_inst(k, inst_mask) { - inst = GET_INST(GC, k); - gpu_metrics->xcp_stats[i].gfx_busy_inst[idx] = - SMUQ10_ROUND(metrics->GfxBusy[inst]); - gpu_metrics->xcp_stats[i].gfx_busy_acc[idx] = - SMUQ10_ROUND(metrics->GfxBusyAcc[inst]); - if (smu_v13_0_6_cap_supported(smu, SMU_CAP(HST_LIMIT_METRICS))) { - gpu_metrics->xcp_stats[i].gfx_below_host_limit_ppt_acc[idx] = - SMUQ10_ROUND(metrics->GfxclkBelowHostLimitPptAcc[inst]); - gpu_metrics->xcp_stats[i].gfx_below_host_limit_thm_acc[idx] = - SMUQ10_ROUND(metrics->GfxclkBelowHostLimitThmAcc[inst]); - gpu_metrics->xcp_stats[i].gfx_low_utilization_acc[idx] = - SMUQ10_ROUND(metrics->GfxclkLowUtilizationAcc[inst]); - gpu_metrics->xcp_stats[i].gfx_below_host_limit_total_acc[idx] = - SMUQ10_ROUND(metrics->GfxclkBelowHostLimitTotalAcc[inst]); - } - idx++; - } + for (i = 0; i < NUM_XCC(adev->gfx.xcc_mask); ++i) { + inst = GET_INST(GC, i); + gpu_metrics->gfx_busy_inst[i] = + SMUQ10_ROUND(metrics->GfxBusy[inst]); + gpu_metrics->gfx_busy_acc[i] = + SMUQ10_ROUND(metrics->GfxBusyAcc[inst]); + if (smu_v13_0_6_cap_supported(smu, + SMU_CAP(HST_LIMIT_METRICS))) { + gpu_metrics + ->gfx_below_host_limit_ppt_acc[i] = SMUQ10_ROUND( + metrics->GfxclkBelowHostLimitPptAcc[inst]); + gpu_metrics + ->gfx_below_host_limit_thm_acc[i] = SMUQ10_ROUND( + metrics->GfxclkBelowHostLimitThmAcc[inst]); + gpu_metrics->gfx_low_utilization_acc[i] = SMUQ10_ROUND( + metrics->GfxclkLowUtilizationAcc[inst]); + gpu_metrics->gfx_below_host_limit_total_acc + [i] = SMUQ10_ROUND( + metrics->GfxclkBelowHostLimitTotalAcc[inst]); + }; } gpu_metrics->xgmi_link_width = metrics->XgmiWidth; gpu_metrics->xgmi_link_speed = metrics->XgmiBitrate; gpu_metrics->firmware_timestamp = metrics->Timestamp; - - *table = (void *)gpu_metrics; - - return sizeof(*gpu_metrics); } const struct smu_temp_funcs smu_v13_0_12_temp_funcs = { diff --git 
a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c index 6d39b02a3257..556116712171 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c @@ -549,7 +549,6 @@ static int smu_v13_0_6_tables_init(struct smu_context *smu) { struct smu_table_context *smu_table = &smu->smu_table; struct smu_table *tables = smu_table->tables; - void *gpu_metrics_table __free(kfree) = NULL; struct smu_v13_0_6_gpu_metrics *gpu_metrics; void *driver_pptable __free(kfree) = NULL; void *metrics_table __free(kfree) = NULL; @@ -580,12 +579,6 @@ static int smu_v13_0_6_tables_init(struct smu_context *smu) return -ENOMEM; smu_table->metrics_time = 0; - smu_table->gpu_metrics_table_size = sizeof(struct gpu_metrics_v1_8); - gpu_metrics_table = - kzalloc(smu_table->gpu_metrics_table_size, GFP_KERNEL); - if (!gpu_metrics_table) - return -ENOMEM; - driver_pptable = kzalloc(sizeof(struct PPTable_t), GFP_KERNEL); if (!driver_pptable) return -ENOMEM; @@ -608,7 +601,6 @@ static int smu_v13_0_6_tables_init(struct smu_context *smu) } } - smu_table->gpu_metrics_table = no_free_ptr(gpu_metrics_table); smu_table->metrics_table = no_free_ptr(metrics_table); smu_table->driver_pptable = no_free_ptr(driver_pptable); @@ -2792,16 +2784,20 @@ static ssize_t smu_v13_0_6_get_gpu_metrics(struct smu_context *smu, void **table if (ret) return ret; - if (amdgpu_ip_version(smu->adev, MP1_HWIP, 0) == - IP_VERSION(13, 0, 12) && - smu_v13_0_6_cap_supported(smu, SMU_CAP(STATIC_METRICS))) - return smu_v13_0_12_get_gpu_metrics(smu, table, metrics_v0); - - metrics_v1 = (MetricsTableV1_t *)metrics_v0; metrics_v2 = (MetricsTableV2_t *)metrics_v0; gpu_metrics = (struct smu_v13_0_6_gpu_metrics *)(tables[SMU_TABLE_SMU_METRICS].cache.buffer); + if (amdgpu_ip_version(smu->adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 12) && + smu_v13_0_6_cap_supported(smu, SMU_CAP(STATIC_METRICS))) { + 
smu_v13_0_12_get_gpu_metrics(smu, table, metrics_v0, + gpu_metrics); + goto fill; + } + + metrics_v1 = (MetricsTableV1_t *)metrics_v0; + metrics_v2 = (MetricsTableV2_t *)metrics_v0; + gpu_metrics->temperature_hotspot = SMUQ10_ROUND(GET_METRIC_FIELD(MaxSocketTemperature, version)); /* Individual HBM stack temperature is not reported */ @@ -2974,6 +2970,7 @@ static ssize_t smu_v13_0_6_get_gpu_metrics(struct smu_context *smu, void **table gpu_metrics->firmware_timestamp = GET_METRIC_FIELD(Timestamp, version); +fill: *table = tables[SMU_TABLE_SMU_METRICS].cache.buffer; return sizeof(*gpu_metrics); diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.h b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.h index 3f57e2a33fb4..86d82044a255 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.h +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.h @@ -94,7 +94,6 @@ size_t smu_v13_0_12_get_system_metrics_size(void); int smu_v13_0_12_setup_driver_pptable(struct smu_context *smu); int smu_v13_0_12_get_smu_metrics_data(struct smu_context *smu, MetricsMember_t member, uint32_t *value); -ssize_t smu_v13_0_12_get_gpu_metrics(struct smu_context *smu, void **table, void *smu_metrics); ssize_t smu_v13_0_12_get_xcp_metrics(struct smu_context *smu, struct amdgpu_xcp *xcp, void *table, void *smu_metrics); @@ -216,6 +215,9 @@ extern const struct smu_temp_funcs smu_v13_0_12_temp_funcs; SMU_13_0_6_MAX_XCC); DECLARE_SMU_METRICS_CLASS(smu_v13_0_6_gpu_metrics, SMU_13_0_6_METRICS_FIELDS); +void smu_v13_0_12_get_gpu_metrics(struct smu_context *smu, void **table, + void *smu_metrics, + struct smu_v13_0_6_gpu_metrics *gpu_metrics); #endif /* SWSMU_CODE_LAYER_L2 */ From 4f993e2309ba7f51b970044c563e5124b1cc03b1 Mon Sep 17 00:00:00 2001 From: Lijo Lazar Date: Mon, 18 Aug 2025 12:03:41 +0530 Subject: [PATCH 48/83] drm/amd/pm: Add schema v1.1 for partition metrics Use a schema similar to gpu metrics v1.9 for partition metrics also.
It will have field type encoded followed by the field value(s). The attribute ids used will be shared with gpu metrics. The structure definition is only to distinguish between gpu metrics and partition metrics though both gpu metrics v1.9 and partition metrics v1.1 follow the same definition. Signed-off-by: Lijo Lazar Reviewed-by: Asad Kamal Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/include/kgd_pp_interface.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/gpu/drm/amd/include/kgd_pp_interface.h b/drivers/gpu/drm/amd/include/kgd_pp_interface.h index f92f78d5d330..2366e68262e6 100644 --- a/drivers/gpu/drm/amd/include/kgd_pp_interface.h +++ b/drivers/gpu/drm/amd/include/kgd_pp_interface.h @@ -1820,4 +1820,10 @@ struct amdgpu_partition_metrics_v1_0 { uint64_t gfx_below_host_limit_total_acc[MAX_XCC]; }; +struct amdgpu_partition_metrics_v1_1 { + struct metrics_table_header common_header; + int attr_count; + struct gpu_metrics_attr metrics_attrs[]; +}; + #endif From 56aeca499ae6216cf7717997d744eda67d3d5500 Mon Sep 17 00:00:00 2001 From: Lijo Lazar Date: Mon, 18 Aug 2025 12:21:23 +0530 Subject: [PATCH 49/83] drm/amd/pm: Update SMUv13.0.6 partition metrics For SMU v13.0.6 SOCs, move to partition metrics v1.1 schema Signed-off-by: Lijo Lazar Reviewed-by: Asad Kamal Signed-off-by: Alex Deucher --- .../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 6 ++-- .../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.h | 35 +++++++++++++++++++ 2 files changed, 38 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c index 556116712171..de4c944885f6 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c @@ -2654,7 +2654,7 @@ static ssize_t smu_v13_0_6_get_xcp_metrics(struct smu_context *smu, int xcp_id, { const u8 num_jpeg_rings = AMDGPU_MAX_JPEG_RINGS_4_0_3; int version = smu_v13_0_6_get_metrics_version(smu); - 
struct amdgpu_partition_metrics_v1_0 *xcp_metrics; + struct smu_v13_0_6_partition_metrics *xcp_metrics; MetricsTableV0_t *metrics_v0 __free(kfree) = NULL; struct amdgpu_device *adev = smu->adev; int ret, inst, i, j, k, idx; @@ -2674,8 +2674,8 @@ static ssize_t smu_v13_0_6_get_xcp_metrics(struct smu_context *smu, int xcp_id, if (i == adev->xcp_mgr->num_xcps) return -EINVAL; - xcp_metrics = (struct amdgpu_partition_metrics_v1_0 *)table; - smu_cmn_init_partition_metrics(xcp_metrics, 1, 0); + xcp_metrics = (struct smu_v13_0_6_partition_metrics *)table; + smu_v13_0_6_partition_metrics_init(xcp_metrics, 1, 1); metrics_v0 = kzalloc(METRICS_TABLE_SIZE, GFP_KERNEL); if (!metrics_v0) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.h b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.h index 86d82044a255..ba865ae7eca2 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.h +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.h @@ -219,6 +219,41 @@ void smu_v13_0_12_get_gpu_metrics(struct smu_context *smu, void **table, void *smu_metrics, struct smu_v13_0_6_gpu_metrics *gpu_metrics); +#define SMU_13_0_6_PARTITION_METRICS_FIELDS(SMU_SCALAR, SMU_ARRAY) \ + SMU_ARRAY(SMU_MATTR(CURRENT_GFXCLK), SMU_MUNIT(CLOCK_1), \ + SMU_MTYPE(U16), current_gfxclk, SMU_13_0_6_MAX_XCC); \ + SMU_ARRAY(SMU_MATTR(CURRENT_SOCCLK), SMU_MUNIT(CLOCK_1), \ + SMU_MTYPE(U16), current_socclk, SMU_13_0_6_MAX_CLKS); \ + SMU_ARRAY(SMU_MATTR(CURRENT_VCLK0), SMU_MUNIT(CLOCK_1), \ + SMU_MTYPE(U16), current_vclk0, SMU_13_0_6_MAX_CLKS); \ + SMU_ARRAY(SMU_MATTR(CURRENT_DCLK0), SMU_MUNIT(CLOCK_1), \ + SMU_MTYPE(U16), current_dclk0, SMU_13_0_6_MAX_CLKS); \ + SMU_SCALAR(SMU_MATTR(CURRENT_UCLK), SMU_MUNIT(CLOCK_1), \ + SMU_MTYPE(U16), current_uclk); \ + SMU_ARRAY(SMU_MATTR(GFX_BUSY_INST), SMU_MUNIT(PERCENT), \ + SMU_MTYPE(U32), gfx_busy_inst, SMU_13_0_6_MAX_XCC); \ + SMU_ARRAY(SMU_MATTR(JPEG_BUSY), SMU_MUNIT(PERCENT), SMU_MTYPE(U16), \ + jpeg_busy, SMU_13_0_6_MAX_JPEG); \ + 
SMU_ARRAY(SMU_MATTR(VCN_BUSY), SMU_MUNIT(PERCENT), SMU_MTYPE(U16), \ + vcn_busy, SMU_13_0_6_MAX_VCN); \ + SMU_ARRAY(SMU_MATTR(GFX_BUSY_ACC), SMU_MUNIT(PERCENT), SMU_MTYPE(U64), \ + gfx_busy_acc, SMU_13_0_6_MAX_XCC); \ + SMU_ARRAY(SMU_MATTR(GFX_BELOW_HOST_LIMIT_PPT_ACC), SMU_MUNIT(NONE), \ + SMU_MTYPE(U64), gfx_below_host_limit_ppt_acc, \ + SMU_13_0_6_MAX_XCC); \ + SMU_ARRAY(SMU_MATTR(GFX_BELOW_HOST_LIMIT_THM_ACC), SMU_MUNIT(NONE), \ + SMU_MTYPE(U64), gfx_below_host_limit_thm_acc, \ + SMU_13_0_6_MAX_XCC); \ + SMU_ARRAY(SMU_MATTR(GFX_LOW_UTILIZATION_ACC), SMU_MUNIT(NONE), \ + SMU_MTYPE(U64), gfx_low_utilization_acc, \ + SMU_13_0_6_MAX_XCC); \ + SMU_ARRAY(SMU_MATTR(GFX_BELOW_HOST_LIMIT_TOTAL_ACC), SMU_MUNIT(NONE), \ + SMU_MTYPE(U64), gfx_below_host_limit_total_acc, \ + SMU_13_0_6_MAX_XCC); + +DECLARE_SMU_METRICS_CLASS(smu_v13_0_6_partition_metrics, + SMU_13_0_6_PARTITION_METRICS_FIELDS); + #endif /* SWSMU_CODE_LAYER_L2 */ #endif From c83fd2a6656ee47cba989146271c4b1e5f04966b Mon Sep 17 00:00:00 2001 From: Lijo Lazar Date: Mon, 18 Aug 2025 12:25:50 +0530 Subject: [PATCH 50/83] drm/amd/pm: Update SMUv13.0.12 partition metrics Update SMUv13.0.12 partition metrics to partition metrics v1.1 schema. 
Signed-off-by: Lijo Lazar Reviewed-by: Asad Kamal Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_12_ppt.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_12_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_12_ppt.c index f4bf0b558d86..c6cf0d0c4b82 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_12_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_12_ppt.c @@ -719,15 +719,14 @@ static ssize_t smu_v13_0_12_get_temp_metrics(struct smu_context *smu, ssize_t smu_v13_0_12_get_xcp_metrics(struct smu_context *smu, struct amdgpu_xcp *xcp, void *table, void *smu_metrics) { const u8 num_jpeg_rings = NUM_JPEG_RINGS_FW; - struct amdgpu_partition_metrics_v1_0 *xcp_metrics; + struct smu_v13_0_6_partition_metrics *xcp_metrics; struct amdgpu_device *adev = smu->adev; MetricsTable_t *metrics; int inst, j, k, idx; u32 inst_mask; metrics = (MetricsTable_t *)smu_metrics; - xcp_metrics = (struct amdgpu_partition_metrics_v1_0 *) table; - smu_cmn_init_partition_metrics(xcp_metrics, 1, 0); + xcp_metrics = (struct smu_v13_0_6_partition_metrics *)table; amdgpu_xcp_get_inst_details(xcp, AMDGPU_XCP_VCN, &inst_mask); idx = 0; for_each_inst(k, inst_mask) { From fd39b5a5830d8f2553e0c09d4d50bdff28b10080 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Fri, 24 Oct 2025 13:08:11 -0400 Subject: [PATCH 51/83] drm/amdgpu/smu: Handle S0ix for vangogh Fix the flows for S0ix. There is no need to stop rlc or reinitialize PMFW in S0ix.
Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/4659 Reviewed-by: Mario Limonciello Reported-by: Antheas Kapenekakis Tested-by: Antheas Kapenekakis Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 6 ++++++ drivers/gpu/drm/amd/pm/swsmu/smu11/vangogh_ppt.c | 3 +++ 2 files changed, 9 insertions(+) diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c index 4317da6f7c38..b3510345a32a 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c @@ -2054,6 +2054,12 @@ static int smu_disable_dpms(struct smu_context *smu) smu->is_apu && (amdgpu_in_reset(adev) || adev->in_s0ix)) return 0; + /* vangogh s0ix */ + if ((amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(11, 5, 0) || + amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(11, 5, 2)) && + adev->in_s0ix) + return 0; + /* * For gpu reset, runpm and hibernation through BACO, * BACO feature has to be kept enabled. diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/vangogh_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/vangogh_ppt.c index 53579208cffb..9626da2dba58 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/vangogh_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/vangogh_ppt.c @@ -2219,6 +2219,9 @@ static int vangogh_post_smu_init(struct smu_context *smu) uint32_t total_cu = adev->gfx.config.max_cu_per_sh * adev->gfx.config.max_sh_per_se * adev->gfx.config.max_shader_engines; + if (adev->in_s0ix) + return 0; + /* allow message will be sent after enable message on Vangogh*/ if (smu_cmn_feature_is_enabled(smu, SMU_FEATURE_DPM_GFXCLK_BIT) && (adev->pg_flags & AMD_PG_SUPPORT_GFX_PG)) { From 36265d2bcc9eef005e1b175c849f715b4dcd48df Mon Sep 17 00:00:00 2001 From: YiPeng Chai Date: Fri, 31 Oct 2025 13:37:30 +0800 Subject: [PATCH 52/83] drm/amd/ras: Increase ras switch control range Increase ras switch control range. 
Signed-off-by: YiPeng Chai Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 3 +++ .../gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.c | 25 ++++++++++++++----- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 556cf4d7b5ef..40c0bf85f1d3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -591,6 +591,9 @@ struct amdgpu_ras { /* Protect poison injection */ struct mutex poison_lock; + + /* Disable/Enable uniras switch */ + bool uniras_enabled; }; struct ras_fs_data { diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.c b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.c index dc2a4c6c1907..f8ec0f26a9e7 100644 --- a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.c +++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.c @@ -283,6 +283,18 @@ static int amdgpu_ras_mgr_sw_init(struct amdgpu_ip_block *ip_block) struct amdgpu_ras_mgr *ras_mgr; int ret = 0; + /* Disabled by default */ + con->uniras_enabled = false; + + /* Enabled only in debug mode */ + if (adev->debug_enable_ras_aca) { + con->uniras_enabled = true; + RAS_DEV_INFO(adev, "Debug amdgpu uniras!"); + } + + if (!con->uniras_enabled) + return 0; + ras_mgr = kzalloc(sizeof(*ras_mgr), GFP_KERNEL); if (!ras_mgr) return -EINVAL; @@ -315,6 +327,9 @@ static int amdgpu_ras_mgr_sw_fini(struct amdgpu_ip_block *ip_block) struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct amdgpu_ras_mgr *ras_mgr = (struct amdgpu_ras_mgr *)con->ras_mgr; + if (!con->uniras_enabled) + return 0; + if (!ras_mgr) return 0; @@ -332,12 +347,11 @@ static int amdgpu_ras_mgr_sw_fini(struct amdgpu_ip_block *ip_block) static int amdgpu_ras_mgr_hw_init(struct amdgpu_ip_block *ip_block) { struct amdgpu_device *adev = ip_block->adev; + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct amdgpu_ras_mgr *ras_mgr = 
amdgpu_ras_mgr_get_context(adev); int ret; - /* Currently only debug mode can enable the ras module - */ - if (!adev->debug_enable_ras_aca) + if (!con->uniras_enabled) return 0; if (!ras_mgr || !ras_mgr->ras_core) @@ -360,11 +374,10 @@ static int amdgpu_ras_mgr_hw_init(struct amdgpu_ip_block *ip_block) static int amdgpu_ras_mgr_hw_fini(struct amdgpu_ip_block *ip_block) { struct amdgpu_device *adev = ip_block->adev; + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); - /* Currently only debug mode can enable the ras module - */ - if (!adev->debug_enable_ras_aca) + if (!con->uniras_enabled) return 0; if (!ras_mgr || !ras_mgr->ras_core) From 2f46c547e4af870df9f6140af3a2068f6bc0b84d Mon Sep 17 00:00:00 2001 From: YiPeng Chai Date: Fri, 31 Oct 2025 14:33:18 +0800 Subject: [PATCH 53/83] drm/amdgpu: Add ras ip block name Add ras ip block name. Signed-off-by: YiPeng Chai Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index c65bb0bcd32b..95f7ae36e4f1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2498,6 +2498,7 @@ static const char *ip_block_names[] = { [AMD_IP_BLOCK_TYPE_VPE] = "vpe", [AMD_IP_BLOCK_TYPE_UMSCH_MM] = "umsch_mm", [AMD_IP_BLOCK_TYPE_ISP] = "isp", + [AMD_IP_BLOCK_TYPE_RAS] = "ras", }; static const char *ip_block_name(struct amdgpu_device *adev, enum amd_ip_block_type type) From d7f105a402191d560cbff7ffb930378dec25ecbb Mon Sep 17 00:00:00 2001 From: YiPeng Chai Date: Fri, 31 Oct 2025 15:41:26 +0800 Subject: [PATCH 54/83] drm/amd/ras: Add ras support for nbio v7_9_1 Add ras support for nbio v7_9_1. 
Signed-off-by: YiPeng Chai Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.c | 3 ++- drivers/gpu/drm/amd/ras/rascore/ras_nbio.c | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.c b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.c index f8ec0f26a9e7..e31ffebd32d9 100644 --- a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.c +++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.c @@ -172,12 +172,13 @@ static int amdgpu_ras_mgr_init_nbio_config(struct amdgpu_device *adev, switch (config->nbio_ip_version) { case IP_VERSION(7, 9, 0): + case IP_VERSION(7, 9, 1): nbio_cfg->nbio_sys_fn = &amdgpu_ras_nbio_sys_func_v7_9; break; default: RAS_DEV_ERR(adev, "The nbio(0x%x) ras config is not right!\n", - config->mp1_ip_version); + config->nbio_ip_version); ret = -EINVAL; break; } diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_nbio.c b/drivers/gpu/drm/amd/ras/rascore/ras_nbio.c index 8bf1f35d595e..bfddd104d548 100644 --- a/drivers/gpu/drm/amd/ras/rascore/ras_nbio.c +++ b/drivers/gpu/drm/amd/ras/rascore/ras_nbio.c @@ -31,6 +31,7 @@ static const struct ras_nbio_ip_func *ras_nbio_get_ip_funcs( { switch (ip_version) { case IP_VERSION(7, 9, 0): + case IP_VERSION(7, 9, 1): return &ras_nbio_v7_9; default: RAS_DEV_ERR(ras_core->dev, From 3f16007d8658357e0434f35f5ddb26cafd771df8 Mon Sep 17 00:00:00 2001 From: YiPeng Chai Date: Fri, 31 Oct 2025 15:39:20 +0800 Subject: [PATCH 55/83] drm/amd/ras: Add ras support for umc v12_5_0 Add ras support for umc v12_5_0. 
Signed-off-by: YiPeng Chai Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.c | 3 ++- drivers/gpu/drm/amd/ras/rascore/ras_umc.c | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.c b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.c index e31ffebd32d9..adb01bdee003 100644 --- a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.c +++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.c @@ -259,7 +259,8 @@ static struct ras_core_context *amdgpu_ras_mgr_create_ras_core(struct amdgpu_dev init_config.nbio_ip_version = amdgpu_ip_version(adev, NBIO_HWIP, 0); init_config.psp_ip_version = amdgpu_ip_version(adev, MP1_HWIP, 0); - if (init_config.umc_ip_version == IP_VERSION(12, 0, 0)) + if (init_config.umc_ip_version == IP_VERSION(12, 0, 0) || + init_config.umc_ip_version == IP_VERSION(12, 5, 0)) init_config.aca_ip_version = IP_VERSION(1, 0, 0); init_config.sys_fn = &amdgpu_ras_sys_fn; diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_umc.c b/drivers/gpu/drm/amd/ras/rascore/ras_umc.c index 4067359bb299..4dae64c424a2 100644 --- a/drivers/gpu/drm/amd/ras/rascore/ras_umc.c +++ b/drivers/gpu/drm/amd/ras/rascore/ras_umc.c @@ -71,6 +71,7 @@ static const struct ras_umc_ip_func *ras_umc_get_ip_func( { switch (ip_version) { case IP_VERSION(12, 0, 0): + case IP_VERSION(12, 5, 0): return &ras_umc_func_v12_0; default: RAS_DEV_ERR(ras_core->dev, From 10c382ec6c6d1e11975a11962bec21cba6360391 Mon Sep 17 00:00:00 2001 From: Philip Yang Date: Fri, 31 Oct 2025 10:50:02 -0400 Subject: [PATCH 56/83] drm/amdkfd: Don't clear PT after process killed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the process is killed, the VM entity is stopped; submitting a PT update job will then trigger the error message "*ERROR* Trying to push to a killed entity" and the job will not execute.
Suggested-by: Christian König Signed-off-by: Philip Yang Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 96ccd5ade031..b1c24c8fa686 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -1274,6 +1274,10 @@ static int unmap_bo_from_gpuvm(struct kgd_mem *mem, (void)amdgpu_vm_bo_unmap(adev, bo_va, entry->va); + /* VM entity stopped if process killed, don't clear freed pt bo */ + if (!amdgpu_vm_ready(vm)) + return 0; + (void)amdgpu_vm_clear_freed(adev, vm, &bo_va->last_pt_update); (void)amdgpu_sync_fence(sync, bo_va->last_pt_update, GFP_KERNEL); From f19bbecd34e3c15eed7e5e593db2ac0fc7a0e6d8 Mon Sep 17 00:00:00 2001 From: Rong Zhang Date: Tue, 14 Oct 2025 00:47:35 +0800 Subject: [PATCH 57/83] drm/amd/display: Fix NULL deref in debugfs odm_combine_segments When a connector is connected but inactive (e.g., disabled by desktop environments), pipe_ctx->stream_res.tg will be destroyed. Then, reading odm_combine_segments causes kernel NULL pointer dereference. 
BUG: kernel NULL pointer dereference, address: 0000000000000000 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: Oops: 0000 [#1] SMP NOPTI CPU: 16 UID: 0 PID: 26474 Comm: cat Not tainted 6.17.0+ #2 PREEMPT(lazy) e6a17af9ee6db7c63e9d90dbe5b28ccab67520c6 Hardware name: LENOVO 21Q4/LNVNB161216, BIOS PXCN25WW 03/27/2025 RIP: 0010:odm_combine_segments_show+0x93/0xf0 [amdgpu] Code: 41 83 b8 b0 00 00 00 01 75 6e 48 98 ba a1 ff ff ff 48 c1 e0 0c 48 8d 8c 07 d8 02 00 00 48 85 c9 74 2d 48 8b bc 07 f0 08 00 00 <48> 8b 07 48 8b 80 08 02 00> RSP: 0018:ffffd1bf4b953c58 EFLAGS: 00010286 RAX: 0000000000005000 RBX: ffff8e35976b02d0 RCX: ffff8e3aeed052d8 RDX: 00000000ffffffa1 RSI: ffff8e35a3120800 RDI: 0000000000000000 RBP: 0000000000000000 R08: ffff8e3580eb0000 R09: ffff8e35976b02d0 R10: ffffd1bf4b953c78 R11: 0000000000000000 R12: ffffd1bf4b953d08 R13: 0000000000040000 R14: 0000000000000001 R15: 0000000000000001 FS: 00007f44d3f9f740(0000) GS:ffff8e3caa47f000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 00000006485c2000 CR4: 0000000000f50ef0 PKRU: 55555554 Call Trace: seq_read_iter+0x125/0x490 ? __alloc_frozen_pages_noprof+0x18f/0x350 seq_read+0x12c/0x170 full_proxy_read+0x51/0x80 vfs_read+0xbc/0x390 ? __handle_mm_fault+0xa46/0xef0 ? do_syscall_64+0x71/0x900 ksys_read+0x73/0xf0 do_syscall_64+0x71/0x900 ? count_memcg_events+0xc2/0x190 ? handle_mm_fault+0x1d7/0x2d0 ? do_user_addr_fault+0x21a/0x690 ? 
exc_page_fault+0x7e/0x1a0 entry_SYSCALL_64_after_hwframe+0x6c/0x74 RIP: 0033:0x7f44d4031687 Code: 48 89 fa 4c 89 df e8 58 b3 00 00 8b 93 08 03 00 00 59 5e 48 83 f8 fc 74 1a 5b c3 0f 1f 84 00 00 00 00 00 48 8b 44 24 10 0f 05 <5b> c3 0f 1f 80 00 00 00 00> RSP: 002b:00007ffdb4b5f0b0 EFLAGS: 00000202 ORIG_RAX: 0000000000000000 RAX: ffffffffffffffda RBX: 00007f44d3f9f740 RCX: 00007f44d4031687 RDX: 0000000000040000 RSI: 00007f44d3f5e000 RDI: 0000000000000003 RBP: 0000000000040000 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000202 R12: 00007f44d3f5e000 R13: 0000000000000003 R14: 0000000000000000 R15: 0000000000040000 Modules linked in: tls tcp_diag inet_diag xt_mark ccm snd_hrtimer snd_seq_dummy snd_seq_midi snd_seq_oss snd_seq_midi_event snd_rawmidi snd_seq snd_seq_device x> snd_hda_codec_atihdmi snd_hda_codec_realtek_lib lenovo_wmi_helpers think_lmi snd_hda_codec_generic snd_hda_codec_hdmi snd_soc_core kvm snd_compress uvcvideo sn> platform_profile joydev amd_pmc mousedev mac_hid sch_fq_codel uinput i2c_dev parport_pc ppdev lp parport nvme_fabrics loop nfnetlink ip_tables x_tables dm_cryp> CR2: 0000000000000000 ---[ end trace 0000000000000000 ]--- RIP: 0010:odm_combine_segments_show+0x93/0xf0 [amdgpu] Code: 41 83 b8 b0 00 00 00 01 75 6e 48 98 ba a1 ff ff ff 48 c1 e0 0c 48 8d 8c 07 d8 02 00 00 48 85 c9 74 2d 48 8b bc 07 f0 08 00 00 <48> 8b 07 48 8b 80 08 02 00> RSP: 0018:ffffd1bf4b953c58 EFLAGS: 00010286 RAX: 0000000000005000 RBX: ffff8e35976b02d0 RCX: ffff8e3aeed052d8 RDX: 00000000ffffffa1 RSI: ffff8e35a3120800 RDI: 0000000000000000 RBP: 0000000000000000 R08: ffff8e3580eb0000 R09: ffff8e35976b02d0 R10: ffffd1bf4b953c78 R11: 0000000000000000 R12: ffffd1bf4b953d08 R13: 0000000000040000 R14: 0000000000000001 R15: 0000000000000001 FS: 00007f44d3f9f740(0000) GS:ffff8e3caa47f000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 00000006485c2000 CR4: 0000000000f50ef0 PKRU: 55555554 Fix 
this by checking pipe_ctx->stream_res.tg before dereferencing. Fixes: 07926ba8a44f ("drm/amd/display: Add debugfs interface for ODM combine info") Signed-off-by: Rong Zhang Reviewed-by: Mario Limonciello Signed-off-by: Mario Limonciello Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_debugfs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_debugfs.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_debugfs.c index cb4bb67289a4..a9839485f2a2 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_debugfs.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_debugfs.c @@ -1303,7 +1303,8 @@ static int odm_combine_segments_show(struct seq_file *m, void *unused) if (connector->status != connector_status_connected) return -ENODEV; - if (pipe_ctx != NULL && pipe_ctx->stream_res.tg->funcs->get_odm_combine_segments) + if (pipe_ctx && pipe_ctx->stream_res.tg && + pipe_ctx->stream_res.tg->funcs->get_odm_combine_segments) pipe_ctx->stream_res.tg->funcs->get_odm_combine_segments(pipe_ctx->stream_res.tg, &segments); seq_printf(m, "%d\n", segments); From 88ef4de35f46b76a19858f8ca5b93e5e23f244e2 Mon Sep 17 00:00:00 2001 From: Philip Yang Date: Wed, 29 Oct 2025 09:41:04 -0400 Subject: [PATCH 58/83] Revert "drm/amdkfd: Improve signal event slow path" This fixes a regression reported on gfx8, which requires the exhaustive search path for signaled events. The high CPU usage of the KFD interrupt wq is gone now that HIP/ROCr added an option to reduce HW event interrupts, so it is safe to revert this optimization patch now. This reverts commit de844846f72b152119faaef1b363448dc8ea368f.
Signed-off-by: Philip Yang Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_events.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c index 82905f3e54dd..5a190dd6be4e 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c @@ -748,16 +748,6 @@ void kfd_signal_event_interrupt(u32 pasid, uint32_t partial_id, uint64_t *slots = page_slots(p->signal_page); uint32_t id; - /* - * If id is valid but slot is not signaled, GPU may signal the same event twice - * before driver have chance to process the first interrupt, then signal slot is - * auto-reset after set_event wakeup the user space, just drop the second event as - * the application only need wakeup once. - */ - if ((valid_id_bits > 31 || (1U << valid_id_bits) >= KFD_SIGNAL_EVENT_LIMIT) && - partial_id < KFD_SIGNAL_EVENT_LIMIT && slots[partial_id] == UNSIGNALED_EVENT_SLOT) - goto out_unlock; - if (valid_id_bits) pr_debug_ratelimited("Partial ID invalid: %u (%u valid bits)\n", partial_id, valid_id_bits); @@ -786,7 +776,6 @@ void kfd_signal_event_interrupt(u32 pasid, uint32_t partial_id, } } -out_unlock: rcu_read_unlock(); kfd_unref_process(p); } From 0c6f09e65b86cec39a96ec60912d5da29aee0ff3 Mon Sep 17 00:00:00 2001 From: Gangliang Xie Date: Mon, 8 Sep 2025 17:08:29 +0800 Subject: [PATCH 59/83] drm/amd/pm: add new message definitions for pmfw eeprom interface Add new message definitions for pmfw eeprom interface Signed-off-by: Gangliang Xie Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- .../pm/swsmu/inc/pmfw_if/smu_v13_0_12_ppsmc.h | 16 ++++++---------- drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h | 9 ++++++++- .../drm/amd/pm/swsmu/smu13/smu_v13_0_12_ppt.c | 7 +++++++ 3 files changed, 21 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_12_ppsmc.h 
b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_12_ppsmc.h index 4b066c42e0ec..fe1b3ac50a75 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_12_ppsmc.h +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_12_ppsmc.h @@ -105,19 +105,15 @@ #define PPSMC_MSG_UpdatePccWaitDecMaxStr 0x4C #define PPSMC_MSG_ResetSDMA 0x4D #define PPSMC_MSG_GetRasTableVersion 0x4E -#define PPSMC_MSG_GetRmaStatus 0x4F -#define PPSMC_MSG_GetErrorCount 0x50 -#define PPSMC_MSG_GetBadPageCount 0x51 -#define PPSMC_MSG_GetBadPageInfo 0x52 -#define PPSMC_MSG_GetBadPagePaAddrLoHi 0x53 -#define PPSMC_MSG_SetTimestampLoHi 0x54 -#define PPSMC_MSG_GetTimestampLoHi 0x55 -#define PPSMC_MSG_GetRasPolicy 0x56 -#define PPSMC_MSG_DumpErrorRecord 0x57 +#define PPSMC_MSG_GetBadPageCount 0x50 +#define PPSMC_MSG_GetBadPageMcaAddress 0x51 +#define PPSMC_MSG_SetTimestamp 0x53 +#define PPSMC_MSG_SetTimestampHi 0x54 +#define PPSMC_MSG_GetTimestamp 0x55 +#define PPSMC_MSG_GetBadPageIpIdLoHi 0x57 #define PPSMC_MSG_EraseRasTable 0x58 #define PPSMC_MSG_GetStaticMetricsTable 0x59 #define PPSMC_MSG_ResetVfArbitersByIndex 0x5A -#define PPSMC_MSG_GetBadPageSeverity 0x5B #define PPSMC_MSG_GetSystemMetricsTable 0x5C #define PPSMC_MSG_GetSystemMetricsVersion 0x5D #define PPSMC_MSG_ResetVCN 0x5E diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h index 2256c77da636..9315ce49b396 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h @@ -279,7 +279,14 @@ __SMU_DUMMY_MAP(ResetSDMA), \ __SMU_DUMMY_MAP(ResetVCN), \ __SMU_DUMMY_MAP(GetStaticMetricsTable), \ - __SMU_DUMMY_MAP(GetSystemMetricsTable), + __SMU_DUMMY_MAP(GetSystemMetricsTable), \ + __SMU_DUMMY_MAP(GetRASTableVersion), \ + __SMU_DUMMY_MAP(GetBadPageCount), \ + __SMU_DUMMY_MAP(GetBadPageMcaAddr), \ + __SMU_DUMMY_MAP(SetTimestamp), \ + __SMU_DUMMY_MAP(GetTimestamp), \ + __SMU_DUMMY_MAP(GetBadPageIpid), \ + __SMU_DUMMY_MAP(EraseRasTable), #undef 
__SMU_DUMMY_MAP #define __SMU_DUMMY_MAP(type) SMU_MSG_##type diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_12_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_12_ppt.c index c6cf0d0c4b82..f2e3cae43fda 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_12_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_12_ppt.c @@ -139,6 +139,13 @@ const struct cmn2asic_msg_mapping smu_v13_0_12_message_map[SMU_MSG_MAX_COUNT] = MSG_MAP(ResetVCN, PPSMC_MSG_ResetVCN, 0), MSG_MAP(GetStaticMetricsTable, PPSMC_MSG_GetStaticMetricsTable, 1), MSG_MAP(GetSystemMetricsTable, PPSMC_MSG_GetSystemMetricsTable, 1), + MSG_MAP(GetRASTableVersion, PPSMC_MSG_GetRasTableVersion, 0), + MSG_MAP(GetBadPageCount, PPSMC_MSG_GetBadPageCount, 0), + MSG_MAP(GetBadPageMcaAddr, PPSMC_MSG_GetBadPageMcaAddress, 0), + MSG_MAP(SetTimestamp, PPSMC_MSG_SetTimestamp, 0), + MSG_MAP(GetTimestamp, PPSMC_MSG_GetTimestamp, 0), + MSG_MAP(GetBadPageIpid, PPSMC_MSG_GetBadPageIpIdLoHi, 0), + MSG_MAP(EraseRasTable, PPSMC_MSG_EraseRasTable, 0), }; int smu_v13_0_12_tables_init(struct smu_context *smu) From 77dbd7c0a2e5ead15564ce2e7ef66a7f38d60794 Mon Sep 17 00:00:00 2001 From: Gangliang Xie Date: Fri, 12 Sep 2025 12:43:35 +0800 Subject: [PATCH 60/83] drm/amd/pm: implement ras_smu_drv interface for smu v13.0.12 implement ras_smu_drv interface for smu v13.0.12 Signed-off-by: Gangliang Xie Signed-off-by: Yang Wang Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 26 ++++ .../drm/amd/pm/swsmu/smu13/smu_v13_0_12_ppt.c | 129 ++++++++++++++++++ .../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.h | 1 + 3 files changed, 156 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 40c0bf85f1d3..3c0b36dd37bf 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -503,6 +503,32 @@ struct ras_critical_region { uint64_t size; }; +struct ras_eeprom_table_version { + 
uint32_t minor : 16; + uint32_t major : 16; +}; + +struct ras_eeprom_smu_funcs { + int (*get_ras_table_version)(struct amdgpu_device *adev, + uint32_t *table_version); + int (*get_badpage_count)(struct amdgpu_device *adev, uint32_t *count, uint32_t timeout); + int (*get_badpage_mca_addr)(struct amdgpu_device *adev, uint16_t index, uint64_t *mca_addr); + int (*set_timestamp)(struct amdgpu_device *adev, uint64_t timestamp); + int (*get_timestamp)(struct amdgpu_device *adev, + uint16_t index, uint64_t *timestamp); + int (*get_badpage_ipid)(struct amdgpu_device *adev, uint16_t index, uint64_t *ipid); + int (*erase_ras_table)(struct amdgpu_device *adev, uint32_t *result); +}; + +enum ras_smu_feature_flags { + RAS_SMU_FEATURE_BIT__RAS_EEPROM = BIT_ULL(0), +}; + +struct ras_smu_drv { + const struct ras_eeprom_smu_funcs *smu_eeprom_funcs; + void (*ras_smu_feature_flags)(struct amdgpu_device *adev, uint64_t *flags); +}; + struct amdgpu_ras { void *ras_mgr; /* ras infrastructure */ diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_12_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_12_ppt.c index f2e3cae43fda..24aaef1494a4 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_12_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_12_ppt.c @@ -34,6 +34,7 @@ #include "amdgpu_fru_eeprom.h" #include #include "smu_cmn.h" +#include "amdgpu_ras.h" #undef MP1_Public #undef smnMP1_FIRMWARE_FLAGS @@ -925,3 +926,131 @@ const struct smu_temp_funcs smu_v13_0_12_temp_funcs = { .temp_metrics_is_supported = smu_v13_0_12_is_temp_metrics_supported, .get_temp_metrics = smu_v13_0_12_get_temp_metrics, }; + +static int smu_v13_0_12_get_ras_table_version(struct amdgpu_device *adev, + uint32_t *table_version) +{ + struct smu_context *smu = adev->powerplay.pp_handle; + + return smu_cmn_send_smc_msg_with_param(smu, + SMU_MSG_GetRASTableVersion, 0, table_version); +} + +static int smu_v13_0_12_get_badpage_count(struct amdgpu_device *adev, uint32_t *count, + uint32_t timeout) +{ + 
struct smu_context *smu = adev->powerplay.pp_handle; + uint64_t end, now; + int ret = 0; + + now = (uint64_t)ktime_to_ms(ktime_get()); + end = now + timeout; + do { + ret = smu_cmn_send_smc_msg_with_param(smu, + SMU_MSG_GetBadPageCount, 0, count); + /* eeprom is not ready */ + if (ret != -EBUSY) + return ret; + mdelay(10); + now = (uint64_t)ktime_to_ms(ktime_get()); + } while (now < end); + + return ret; +} + +static int smu_v13_0_12_set_timestamp(struct amdgpu_device *adev, uint64_t timestamp) +{ + struct smu_context *smu = adev->powerplay.pp_handle; + + return smu_cmn_send_smc_msg_with_param(smu, + SMU_MSG_SetTimestamp, (uint32_t)timestamp, 0); +} + +static int smu_v13_0_12_get_timestamp(struct amdgpu_device *adev, + uint16_t index, uint64_t *timestamp) +{ + struct smu_context *smu = adev->powerplay.pp_handle; + uint32_t temp; + int ret; + + ret = smu_cmn_send_smc_msg_with_param(smu, + SMU_MSG_GetTimestamp, index, &temp); + if (!ret) + *timestamp = temp; + + return ret; +} + +static int smu_v13_0_12_get_badpage_ipid(struct amdgpu_device *adev, + uint16_t index, uint64_t *ipid) +{ + struct smu_context *smu = adev->powerplay.pp_handle; + uint32_t temp_arg, temp_ipid_lo, temp_ipid_high; + int ret; + + temp_arg = index | (1 << 16); + ret = smu_cmn_send_smc_msg_with_param(smu, + SMU_MSG_GetBadPageIpid, temp_arg, &temp_ipid_lo); + if (ret) + return ret; + + temp_arg = index | (2 << 16); + ret = smu_cmn_send_smc_msg_with_param(smu, + SMU_MSG_GetBadPageIpid, temp_arg, &temp_ipid_high); + if (!ret) + *ipid = (uint64_t)temp_ipid_high << 32 | temp_ipid_lo; + return ret; +} + +static int smu_v13_0_12_erase_ras_table(struct amdgpu_device *adev, + uint32_t *result) +{ + struct smu_context *smu = adev->powerplay.pp_handle; + + return smu_cmn_send_smc_msg_with_param(smu, + SMU_MSG_EraseRasTable, 0, result); +} + +static int smu_v13_0_12_get_badpage_mca_addr(struct amdgpu_device *adev, + uint16_t index, uint64_t *mca_addr) +{ + struct smu_context *smu = adev->powerplay.pp_handle; 
+ uint32_t temp_arg, temp_addr_lo, temp_addr_high; + int ret; + + temp_arg = index | (1 << 16); + ret = smu_cmn_send_smc_msg_with_param(smu, + SMU_MSG_GetBadPageMcaAddr, temp_arg, &temp_addr_lo); + if (ret) + return ret; + + temp_arg = index | (2 << 16); + ret = smu_cmn_send_smc_msg_with_param(smu, + SMU_MSG_GetBadPageMcaAddr, temp_arg, &temp_addr_high); + if (!ret) + *mca_addr = (uint64_t)temp_addr_high << 32 | temp_addr_lo; + return ret; +} + +static const struct ras_eeprom_smu_funcs smu_v13_0_12_eeprom_smu_funcs = { + .get_ras_table_version = smu_v13_0_12_get_ras_table_version, + .get_badpage_count = smu_v13_0_12_get_badpage_count, + .get_badpage_mca_addr = smu_v13_0_12_get_badpage_mca_addr, + .set_timestamp = smu_v13_0_12_set_timestamp, + .get_timestamp = smu_v13_0_12_get_timestamp, + .get_badpage_ipid = smu_v13_0_12_get_badpage_ipid, + .erase_ras_table = smu_v13_0_12_erase_ras_table, +}; + +static void smu_v13_0_12_ras_smu_feature_flags(struct amdgpu_device *adev, uint64_t *flags) +{ + if (!flags) + return; + + *flags = 0ULL; +} + +const struct ras_smu_drv smu_v13_0_12_ras_smu_drv = { + .smu_eeprom_funcs = &smu_v13_0_12_eeprom_smu_funcs, + .ras_smu_feature_flags = smu_v13_0_12_ras_smu_feature_flags, +}; diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.h b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.h index ba865ae7eca2..ecec7af8a64f 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.h +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.h @@ -105,6 +105,7 @@ int smu_v13_0_12_get_npm_data(struct smu_context *smu, extern const struct cmn2asic_mapping smu_v13_0_12_feature_mask_map[]; extern const struct cmn2asic_msg_mapping smu_v13_0_12_message_map[]; extern const struct smu_temp_funcs smu_v13_0_12_temp_funcs; +extern const struct ras_smu_drv smu_v13_0_12_ras_smu_drv; #if defined(SWSMU_CODE_LAYER_L2) #include "smu_cmn.h" From f5346a176cb5e2860581ee91110d6f037bb87d85 Mon Sep 17 00:00:00 2001 From: Gangliang Xie Date: Mon, 15 
Sep 2025 12:52:35 +0800 Subject: [PATCH 61/83] drm/amd/pm: add smu ras driver framework add functions to get smu ras driver Signed-off-by: Gangliang Xie Signed-off-by: Yang Wang Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 1 + drivers/gpu/drm/amd/pm/amdgpu_dpm.c | 7 +++++++ drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h | 1 + drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 11 ++++++++++ drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h | 7 +++++++ .../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 21 +++++++++++++++++++ drivers/gpu/drm/amd/pm/swsmu/smu_internal.h | 1 + 7 files changed, 49 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 3c0b36dd37bf..674bcd3c814c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -620,6 +620,7 @@ struct amdgpu_ras { /* Disable/Enable uniras switch */ bool uniras_enabled; + const struct ras_smu_drv *ras_smu_drv; }; struct ras_fs_data { diff --git a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c index 5c4d0eb198c4..c6f55d3522cd 100644 --- a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c +++ b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c @@ -2122,3 +2122,10 @@ ssize_t amdgpu_dpm_get_xcp_metrics(struct amdgpu_device *adev, int xcp_id, return ret; } + +const struct ras_smu_drv *amdgpu_dpm_get_ras_smu_driver(struct amdgpu_device *adev) +{ + void *pp_handle = adev->powerplay.pp_handle; + + return smu_get_ras_smu_driver(pp_handle); +} diff --git a/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h b/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h index c7ea29385682..aa3f427819a0 100644 --- a/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h +++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h @@ -612,5 +612,6 @@ int amdgpu_dpm_reset_vcn(struct amdgpu_device *adev, uint32_t inst_mask); bool amdgpu_dpm_reset_vcn_is_supported(struct amdgpu_device *adev); bool amdgpu_dpm_is_temp_metrics_supported(struct amdgpu_device *adev, 
enum smu_temp_metric_type type); +const struct ras_smu_drv *amdgpu_dpm_get_ras_smu_driver(struct amdgpu_device *adev); #endif diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c index b3510345a32a..c0e7c45ac0e6 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c @@ -2803,6 +2803,17 @@ const struct amdgpu_ip_block_version smu_v14_0_ip_block = { .funcs = &smu_ip_funcs, }; +const struct ras_smu_drv *smu_get_ras_smu_driver(void *handle) +{ + struct smu_context *smu = (struct smu_context *)handle; + const struct ras_smu_drv *tmp = NULL; + int ret; + + ret = smu_get_ras_smu_drv(smu, &tmp); + + return ret ? NULL : tmp; +} + static int smu_load_microcode(void *handle) { struct smu_context *smu = handle; diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h index c48028abc8c4..8815fc70b63b 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h @@ -1531,6 +1531,12 @@ struct pptable_funcs { int (*ras_send_msg)(struct smu_context *smu, enum smu_message_type msg, uint32_t param, uint32_t *read_arg); + + /** + * @get_ras_smu_drv: Get RAS smu driver interface + * Return: ras_smu_drv * + */ + int (*get_ras_smu_drv)(struct smu_context *smu, const struct ras_smu_drv **ras_smu_drv); }; typedef enum { @@ -1795,6 +1801,7 @@ int smu_set_pm_policy(struct smu_context *smu, enum pp_pm_policy p_type, int level); ssize_t smu_get_pm_policy_info(struct smu_context *smu, enum pp_pm_policy p_type, char *sysbuf); +const struct ras_smu_drv *smu_get_ras_smu_driver(void *handle); int amdgpu_smu_ras_send_msg(struct amdgpu_device *adev, enum smu_message_type msg, uint32_t param, uint32_t *readarg); diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c index de4c944885f6..095f54b7e9e6 100644 --- 
a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c @@ -3905,6 +3905,26 @@ static void smu_v13_0_6_set_temp_funcs(struct smu_context *smu) == IP_VERSION(13, 0, 12)) ? &smu_v13_0_12_temp_funcs : NULL; } +static int smu_v13_0_6_get_ras_smu_drv(struct smu_context *smu, const struct ras_smu_drv **ras_smu_drv) +{ + if (!ras_smu_drv) + return -EINVAL; + + if (amdgpu_sriov_vf(smu->adev)) + return -EOPNOTSUPP; + + switch (amdgpu_ip_version(smu->adev, MP1_HWIP, 0)) { + case IP_VERSION(13, 0, 12): + *ras_smu_drv = &smu_v13_0_12_ras_smu_drv; + break; + default: + *ras_smu_drv = NULL; + break; + } + + return 0; +} + static const struct pptable_funcs smu_v13_0_6_ppt_funcs = { /* init dpm */ .get_allowed_feature_mask = smu_v13_0_6_get_allowed_feature_mask, @@ -3964,6 +3984,7 @@ static const struct pptable_funcs smu_v13_0_6_ppt_funcs = { .dpm_reset_vcn = smu_v13_0_6_reset_vcn, .post_init = smu_v13_0_6_post_init, .ras_send_msg = smu_v13_0_6_ras_send_msg, + .get_ras_smu_drv = smu_v13_0_6_get_ras_smu_drv, }; void smu_v13_0_6_set_ppt_funcs(struct smu_context *smu) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_internal.h b/drivers/gpu/drm/amd/pm/swsmu/smu_internal.h index c09ecf1a68a0..34f6b4b1c3ba 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu_internal.h +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_internal.h @@ -100,6 +100,7 @@ #define smu_is_asic_wbrf_supported(smu) smu_ppt_funcs(is_asic_wbrf_supported, false, smu) #define smu_enable_uclk_shadow(smu, enable) smu_ppt_funcs(enable_uclk_shadow, 0, smu, enable) #define smu_set_wbrf_exclusion_ranges(smu, freq_band_range) smu_ppt_funcs(set_wbrf_exclusion_ranges, -EOPNOTSUPP, smu, freq_band_range) +#define smu_get_ras_smu_drv(smu, ras_smu_drv) smu_ppt_funcs(get_ras_smu_drv, -EOPNOTSUPP, smu, ras_smu_drv) #endif #endif From f6cdcbd2c0f37896766623b928a4ce95c54fb3e6 Mon Sep 17 00:00:00 2001 From: Gangliang Xie Date: Mon, 15 Sep 2025 12:55:36 +0800 Subject: [PATCH 62/83] drm/amdgpu: add 
function to check if pmfw eeprom is supported add function to check if pmfw is supported, skip eeprom check and recover when pmfw eeprom is supported Signed-off-by: Gangliang Xie Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 2 + .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 46 ++++++++++++++++++- .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h | 2 + 3 files changed, 48 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 0984928db042..37999b367957 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -3767,6 +3767,8 @@ int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev) return 0; control = &con->eeprom_control; + con->ras_smu_drv = amdgpu_dpm_get_ras_smu_driver(adev); + ret = amdgpu_ras_eeprom_init(control); control->is_eeprom_valid = !ret; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index 011fa4748084..89d0def82797 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -1546,7 +1546,8 @@ void amdgpu_ras_eeprom_check_and_recover(struct amdgpu_device *adev) struct amdgpu_ras_eeprom_control *control; int res; - if (!__is_ras_eeprom_supported(adev) || !ras) + if (!__is_ras_eeprom_supported(adev) || !ras || + amdgpu_ras_smu_eeprom_supported(adev)) return; control = &ras->eeprom_control; if (!control->is_eeprom_valid) @@ -1566,4 +1567,45 @@ void amdgpu_ras_eeprom_check_and_recover(struct amdgpu_device *adev) control->is_eeprom_valid = false; } return; -} \ No newline at end of file +} + +static const struct ras_smu_drv *amdgpu_ras_get_smu_ras_drv(struct amdgpu_device *adev) +{ + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); + + if (!ras) + return NULL; + + return ras->ras_smu_drv; +} + +static uint64_t amdgpu_ras_smu_get_feature_flags(struct amdgpu_device *adev) 
+{ + const struct ras_smu_drv *ras_smu_drv = amdgpu_ras_get_smu_ras_drv(adev); + uint64_t flags = 0ULL; + + if (!ras_smu_drv) + goto out; + + if (ras_smu_drv->ras_smu_feature_flags) + ras_smu_drv->ras_smu_feature_flags(adev, &flags); + +out: + return flags; +} + +bool amdgpu_ras_smu_eeprom_supported(struct amdgpu_device *adev) +{ + const struct ras_smu_drv *smu_ras_drv = amdgpu_ras_get_smu_ras_drv(adev); + uint64_t flags = 0ULL; + + if (!__is_ras_eeprom_supported(adev) || !smu_ras_drv) + return false; + + if (!smu_ras_drv->smu_eeprom_funcs) + return false; + + flags = amdgpu_ras_smu_get_feature_flags(adev); + + return !!(flags & RAS_SMU_FEATURE_BIT__RAS_EEPROM); +} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h index ebfca4cb5688..feff46b22b6f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h @@ -163,6 +163,8 @@ int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control); void amdgpu_ras_eeprom_check_and_recover(struct amdgpu_device *adev); +bool amdgpu_ras_smu_eeprom_supported(struct amdgpu_device *adev); + extern const struct file_operations amdgpu_ras_debugfs_eeprom_size_ops; extern const struct file_operations amdgpu_ras_debugfs_eeprom_table_ops; From d4432f16d3393180e8f0b344b21839e553f7938b Mon Sep 17 00:00:00 2001 From: Gangliang Xie Date: Mon, 15 Sep 2025 17:13:25 +0800 Subject: [PATCH 63/83] drm/amdgpu: add wrapper functions for pmfw eeprom interface add wrapper functions for pmfw eeprom interface, for these interfaces to be easily and safely called Signed-off-by: Gangliang Xie Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 98 +++++++++++++++++++ .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h | 21 ++++ 2 files changed, 119 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index 89d0def82797..258ff0f121a2 
100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -1609,3 +1609,101 @@ bool amdgpu_ras_smu_eeprom_supported(struct amdgpu_device *adev) return !!(flags & RAS_SMU_FEATURE_BIT__RAS_EEPROM); } + +int amdgpu_ras_smu_get_table_version(struct amdgpu_device *adev, + uint32_t *table_version) +{ + const struct ras_smu_drv *smu_ras_drv = amdgpu_ras_get_smu_ras_drv(adev); + + if (!amdgpu_ras_smu_eeprom_supported(adev)) + return -EOPNOTSUPP; + + if (smu_ras_drv->smu_eeprom_funcs->get_ras_table_version) + return smu_ras_drv->smu_eeprom_funcs->get_ras_table_version(adev, + table_version); + return -EOPNOTSUPP; +} + +int amdgpu_ras_smu_get_badpage_count(struct amdgpu_device *adev, + uint32_t *count, uint32_t timeout) +{ + const struct ras_smu_drv *smu_ras_drv = amdgpu_ras_get_smu_ras_drv(adev); + + if (!amdgpu_ras_smu_eeprom_supported(adev)) + return -EOPNOTSUPP; + + if (smu_ras_drv->smu_eeprom_funcs->get_badpage_count) + return smu_ras_drv->smu_eeprom_funcs->get_badpage_count(adev, + count, timeout); + return -EOPNOTSUPP; +} + +int amdgpu_ras_smu_get_badpage_mca_addr(struct amdgpu_device *adev, + uint16_t index, uint64_t *mca_addr) +{ + const struct ras_smu_drv *smu_ras_drv = amdgpu_ras_get_smu_ras_drv(adev); + + if (!amdgpu_ras_smu_eeprom_supported(adev)) + return -EOPNOTSUPP; + + if (smu_ras_drv->smu_eeprom_funcs->get_badpage_mca_addr) + return smu_ras_drv->smu_eeprom_funcs->get_badpage_mca_addr(adev, + index, mca_addr); + return -EOPNOTSUPP; +} + +int amdgpu_ras_smu_set_timestamp(struct amdgpu_device *adev, + uint64_t timestamp) +{ + const struct ras_smu_drv *smu_ras_drv = amdgpu_ras_get_smu_ras_drv(adev); + + if (!amdgpu_ras_smu_eeprom_supported(adev)) + return -EOPNOTSUPP; + + if (smu_ras_drv->smu_eeprom_funcs->set_timestamp) + return smu_ras_drv->smu_eeprom_funcs->set_timestamp(adev, + timestamp); + return -EOPNOTSUPP; +} + +int amdgpu_ras_smu_get_timestamp(struct amdgpu_device *adev, + uint16_t index, 
uint64_t *timestamp) +{ + const struct ras_smu_drv *smu_ras_drv = amdgpu_ras_get_smu_ras_drv(adev); + + if (!amdgpu_ras_smu_eeprom_supported(adev)) + return -EOPNOTSUPP; + + if (smu_ras_drv->smu_eeprom_funcs->get_timestamp) + return smu_ras_drv->smu_eeprom_funcs->get_timestamp(adev, + index, timestamp); + return -EOPNOTSUPP; +} + +int amdgpu_ras_smu_get_badpage_ipid(struct amdgpu_device *adev, + uint16_t index, uint64_t *ipid) +{ + const struct ras_smu_drv *smu_ras_drv = amdgpu_ras_get_smu_ras_drv(adev); + + if (!amdgpu_ras_smu_eeprom_supported(adev)) + return -EOPNOTSUPP; + + if (smu_ras_drv->smu_eeprom_funcs->get_badpage_ipid) + return smu_ras_drv->smu_eeprom_funcs->get_badpage_ipid(adev, + index, ipid); + return -EOPNOTSUPP; +} + +int amdgpu_ras_smu_erase_ras_table(struct amdgpu_device *adev, + uint32_t *result) +{ + const struct ras_smu_drv *smu_ras_drv = amdgpu_ras_get_smu_ras_drv(adev); + + if (!amdgpu_ras_smu_eeprom_supported(adev)) + return -EOPNOTSUPP; + + if (smu_ras_drv->smu_eeprom_funcs->erase_ras_table) + return smu_ras_drv->smu_eeprom_funcs->erase_ras_table(adev, + result); + return -EOPNOTSUPP; +} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h index feff46b22b6f..cfbd402ddea2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h @@ -165,6 +165,27 @@ void amdgpu_ras_eeprom_check_and_recover(struct amdgpu_device *adev); bool amdgpu_ras_smu_eeprom_supported(struct amdgpu_device *adev); +int amdgpu_ras_smu_get_table_version(struct amdgpu_device *adev, + uint32_t *table_version); + +int amdgpu_ras_smu_get_badpage_count(struct amdgpu_device *adev, + uint32_t *count, uint32_t timeout); + +int amdgpu_ras_smu_get_badpage_mca_addr(struct amdgpu_device *adev, + uint16_t index, uint64_t *mca_addr); + +int amdgpu_ras_smu_set_timestamp(struct amdgpu_device *adev, + uint64_t timestamp); + +int amdgpu_ras_smu_get_timestamp(struct amdgpu_device 
*adev, + uint16_t index, uint64_t *timestamp); + +int amdgpu_ras_smu_get_badpage_ipid(struct amdgpu_device *adev, + uint16_t index, uint64_t *ipid); + +int amdgpu_ras_smu_erase_ras_table(struct amdgpu_device *adev, + uint32_t *result); + extern const struct file_operations amdgpu_ras_debugfs_eeprom_size_ops; extern const struct file_operations amdgpu_ras_debugfs_eeprom_table_ops; From d95ca7f515cfd2e721de07e86aa79adb17575a52 Mon Sep 17 00:00:00 2001 From: YiPeng Chai Date: Tue, 28 Oct 2025 16:18:31 +0800 Subject: [PATCH 64/83] drm/amdgpu: suspend ras module before gpu reset During gpu reset, all GPU-related resources are inaccessible. To avoid affecting ras functionality, suspend ras module before gpu reset and resume it after gpu reset is complete. V2: Rename functions to avoid misunderstanding. V3: Move flush_delayed_work to amdgpu_ras_process_pause, Move schedule_delayed_work to amdgpu_ras_process_unpause. V4: Rename functions. V5: Move the function to amdgpu_ras.c. Signed-off-by: YiPeng Chai Reviewed-by: Tao Zhou Reviewed-by: Hawking Zhang Acked-by: Lijo Lazar Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 5 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 30 ++++++++- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 5 ++ .../gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.c | 22 +++++++ .../gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.h | 5 ++ .../drm/amd/ras/ras_mgr/amdgpu_ras_process.c | 64 +++++++++++++++++++ .../drm/amd/ras/ras_mgr/amdgpu_ras_process.h | 4 ++ .../gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c | 6 ++ drivers/gpu/drm/amd/ras/rascore/ras.h | 2 + drivers/gpu/drm/amd/ras/rascore/ras_process.c | 7 ++ 10 files changed, 148 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 95f7ae36e4f1..dcf6fce1c5a2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -71,6 +71,7 @@ #include "amdgpu_xgmi.h" #include 
"amdgpu_ras.h" +#include "amdgpu_ras_mgr.h" #include "amdgpu_pmu.h" #include "amdgpu_fru_eeprom.h" #include "amdgpu_reset.h" @@ -6660,6 +6661,9 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, goto end_reset; } + /* Cannot be called after locking reset domain */ + amdgpu_ras_pre_reset(adev, &device_list); + /* We need to lock reset domain only once both for XGMI and single device */ amdgpu_device_recovery_get_reset_lock(adev, &device_list); @@ -6691,6 +6695,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, reset_unlock: amdgpu_device_recovery_put_reset_lock(adev, &device_list); end_reset: + amdgpu_ras_post_reset(adev, &device_list); if (hive) { mutex_unlock(&hive->hive_lock); amdgpu_put_xgmi_hive(hive); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 37999b367957..62d2f988d88f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2921,8 +2921,12 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) type = amdgpu_ras_get_fatal_error_event(adev); list_for_each_entry(remote_adev, device_list_handle, gmc.xgmi.head) { - amdgpu_ras_query_err_status(remote_adev); - amdgpu_ras_log_on_err_counter(remote_adev, type); + if (amdgpu_uniras_enabled(remote_adev)) { + amdgpu_ras_mgr_update_ras_ecc(remote_adev); + } else { + amdgpu_ras_query_err_status(remote_adev); + amdgpu_ras_log_on_err_counter(remote_adev, type); + } } } @@ -5673,3 +5677,25 @@ bool amdgpu_ras_check_critical_address(struct amdgpu_device *adev, uint64_t addr return ret; } + +void amdgpu_ras_pre_reset(struct amdgpu_device *adev, + struct list_head *device_list) +{ + struct amdgpu_device *tmp_adev = NULL; + + list_for_each_entry(tmp_adev, device_list, reset_list) { + if (amdgpu_uniras_enabled(tmp_adev)) + amdgpu_ras_mgr_pre_reset(tmp_adev); + } +} + +void amdgpu_ras_post_reset(struct amdgpu_device *adev, + struct list_head *device_list) +{ + struct amdgpu_device *tmp_adev = NULL; 
+ + list_for_each_entry(tmp_adev, device_list, reset_list) { + if (amdgpu_uniras_enabled(tmp_adev)) + amdgpu_ras_mgr_post_reset(tmp_adev); + } +} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 674bcd3c814c..ff44190d7d98 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -1039,4 +1039,9 @@ void amdgpu_ras_event_log_print(struct amdgpu_device *adev, u64 event_id, const char *fmt, ...); bool amdgpu_ras_is_rma(struct amdgpu_device *adev); + +void amdgpu_ras_pre_reset(struct amdgpu_device *adev, + struct list_head *device_list); +void amdgpu_ras_post_reset(struct amdgpu_device *adev, + struct list_head *device_list); #endif diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.c b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.c index adb01bdee003..afe8135b6258 100644 --- a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.c +++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.c @@ -624,3 +624,25 @@ int amdgpu_ras_mgr_handle_ras_cmd(struct amdgpu_device *adev, return ret; } + +int amdgpu_ras_mgr_pre_reset(struct amdgpu_device *adev) +{ + if (!amdgpu_ras_mgr_is_ready(adev)) { + RAS_DEV_ERR(adev, "Invalid ras suspend!\n"); + return -EPERM; + } + + amdgpu_ras_process_pre_reset(adev); + return 0; +} + +int amdgpu_ras_mgr_post_reset(struct amdgpu_device *adev) +{ + if (!amdgpu_ras_mgr_is_ready(adev)) { + RAS_DEV_ERR(adev, "Invalid ras resume!\n"); + return -EPERM; + } + + amdgpu_ras_process_post_reset(adev); + return 0; +} diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.h b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.h index 42f190a8feb9..8fb7eb4b8f13 100644 --- a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.h +++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.h @@ -52,6 +52,9 @@ struct amdgpu_ras_mgr { struct ras_event_manager ras_event_mgr; uint64_t last_poison_consumption_seqno; bool ras_is_ready; + + bool is_paused; + struct completion 
ras_event_done; }; extern const struct amdgpu_ip_block_version ras_v1_0_ip_block; @@ -75,4 +78,6 @@ bool amdgpu_ras_mgr_is_rma(struct amdgpu_device *adev); int amdgpu_ras_mgr_handle_ras_cmd(struct amdgpu_device *adev, uint32_t cmd_id, void *input, uint32_t input_size, void *output, uint32_t out_size); +int amdgpu_ras_mgr_pre_reset(struct amdgpu_device *adev); +int amdgpu_ras_mgr_post_reset(struct amdgpu_device *adev); #endif diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_process.c b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_process.c index 6727fc9a2b9b..5782c007de71 100644 --- a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_process.c +++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_process.c @@ -29,6 +29,7 @@ #include "amdgpu_ras_process.h" #define RAS_MGR_RETIRE_PAGE_INTERVAL 100 +#define RAS_EVENT_PROCESS_TIMEOUT 1200 static void ras_process_retire_page_dwork(struct work_struct *work) { @@ -57,6 +58,9 @@ int amdgpu_ras_process_init(struct amdgpu_device *adev) { struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); + ras_mgr->is_paused = false; + init_completion(&ras_mgr->ras_event_done); + INIT_DELAYED_WORK(&ras_mgr->retire_page_dwork, ras_process_retire_page_dwork); return 0; @@ -66,6 +70,7 @@ int amdgpu_ras_process_fini(struct amdgpu_device *adev) { struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); + ras_mgr->is_paused = false; /* Save all cached bad pages to eeprom */ flush_delayed_work(&ras_mgr->retire_page_dwork); cancel_delayed_work_sync(&ras_mgr->retire_page_dwork); @@ -124,3 +129,62 @@ int amdgpu_ras_process_handle_consumption_interrupt(struct amdgpu_device *adev, return ras_process_add_interrupt_req(ras_mgr->ras_core, &req, false); } + +int amdgpu_ras_process_begin(struct amdgpu_device *adev) +{ + struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); + + if (ras_mgr->is_paused) + return -EAGAIN; + + reinit_completion(&ras_mgr->ras_event_done); + return 0; +} + +int amdgpu_ras_process_end(struct 
amdgpu_device *adev) +{ + struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); + + complete(&ras_mgr->ras_event_done); + return 0; +} + +int amdgpu_ras_process_pre_reset(struct amdgpu_device *adev) +{ + struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); + long rc; + + if (!ras_mgr || !ras_mgr->ras_core) + return -EINVAL; + + if (!ras_mgr->ras_core->is_initialized) + return -EPERM; + + ras_mgr->is_paused = true; + + /* Wait for RAS event processing to complete */ + rc = wait_for_completion_interruptible_timeout(&ras_mgr->ras_event_done, + msecs_to_jiffies(RAS_EVENT_PROCESS_TIMEOUT)); + if (rc <= 0) + RAS_DEV_WARN(adev, "Waiting for ras process to complete %s\n", + rc ? "interrupted" : "timeout"); + + flush_delayed_work(&ras_mgr->retire_page_dwork); + return 0; +} + +int amdgpu_ras_process_post_reset(struct amdgpu_device *adev) +{ + struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); + + if (!ras_mgr || !ras_mgr->ras_core) + return -EINVAL; + + if (!ras_mgr->ras_core->is_initialized) + return -EPERM; + + ras_mgr->is_paused = false; + + schedule_delayed_work(&ras_mgr->retire_page_dwork, 0); + return 0; +} diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_process.h b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_process.h index b9502bd21beb..d55cdaeac441 100644 --- a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_process.h +++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_process.h @@ -34,4 +34,8 @@ int amdgpu_ras_process_handle_unexpected_interrupt(struct amdgpu_device *adev, void *data); int amdgpu_ras_process_handle_consumption_interrupt(struct amdgpu_device *adev, void *data); +int amdgpu_ras_process_begin(struct amdgpu_device *adev); +int amdgpu_ras_process_end(struct amdgpu_device *adev); +int amdgpu_ras_process_pre_reset(struct amdgpu_device *adev); +int amdgpu_ras_process_post_reset(struct amdgpu_device *adev); #endif diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c 
b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c index f21cd55a25be..45ed8c3b5563 100644 --- a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c +++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c @@ -142,6 +142,12 @@ static int amdgpu_ras_sys_event_notifier(struct ras_core_context *ras_core, case RAS_EVENT_ID__RESET_GPU: ret = amdgpu_ras_mgr_reset_gpu(ras_core->dev, *(uint32_t *)data); break; + case RAS_EVENT_ID__RAS_EVENT_PROC_BEGIN: + ret = amdgpu_ras_process_begin(ras_core->dev); + break; + case RAS_EVENT_ID__RAS_EVENT_PROC_END: + ret = amdgpu_ras_process_end(ras_core->dev); + break; default: RAS_DEV_WARN(ras_core->dev, "Invalid ras notify event:%d\n", event_id); break; diff --git a/drivers/gpu/drm/amd/ras/rascore/ras.h b/drivers/gpu/drm/amd/ras/rascore/ras.h index fa224b36e3f2..3396b2e0949d 100644 --- a/drivers/gpu/drm/amd/ras/rascore/ras.h +++ b/drivers/gpu/drm/amd/ras/rascore/ras.h @@ -115,6 +115,8 @@ enum ras_notify_event { RAS_EVENT_ID__FATAL_ERROR_DETECTED, RAS_EVENT_ID__RESET_GPU, RAS_EVENT_ID__RESET_VF, + RAS_EVENT_ID__RAS_EVENT_PROC_BEGIN, + RAS_EVENT_ID__RAS_EVENT_PROC_END, }; enum ras_gpu_status { diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_process.c b/drivers/gpu/drm/amd/ras/rascore/ras_process.c index 02f0657f78a3..3267dcdb169c 100644 --- a/drivers/gpu/drm/amd/ras/rascore/ras_process.c +++ b/drivers/gpu/drm/amd/ras/rascore/ras_process.c @@ -162,6 +162,11 @@ int ras_process_handle_ras_event(struct ras_core_context *ras_core) uint32_t umc_event_count; int ret; + ret = ras_core_event_notify(ras_core, + RAS_EVENT_ID__RAS_EVENT_PROC_BEGIN, NULL); + if (ret) + return ret; + ras_aca_clear_fatal_flag(ras_core); ras_umc_log_pending_bad_bank(ras_core); @@ -185,6 +190,8 @@ int ras_process_handle_ras_event(struct ras_core_context *ras_core) atomic_set(&ras_proc->umc_interrupt_count, 0); } + ras_core_event_notify(ras_core, + RAS_EVENT_ID__RAS_EVENT_PROC_END, NULL); return ret; } From f903b85ed0f14fc412d8a781d3fcc0c023dfcd7c Mon Sep 17 00:00:00 2001 
From: Alex Deucher Date: Wed, 22 Oct 2025 17:11:38 -0400 Subject: [PATCH 65/83] drm/amdgpu: fix possible fence leaks from job structure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If we don't end up initializing the fences, free them when we free the job. We can't set the hw_fence to NULL after emitting it because we need it in the cleanup path for the submit direct case. v2: take a reference to the fences if we emit them v3: handle non-job fence in error paths Fixes: db36632ea51e ("drm/amdgpu: clean up and unify hw fence handling") Reviewed-by: Jesse Zhang (v1) Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c | 19 +++++++++++++++---- drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 18 ++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 2 ++ 3 files changed, 35 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c index 39229ece83f8..586a58facca1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c @@ -176,18 +176,21 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned int num_ibs, if (!ring->sched.ready) { dev_err(adev->dev, "couldn't schedule ib on ring <%s>\n", ring->name); - return -EINVAL; + r = -EINVAL; + goto free_fence; } if (vm && !job->vmid) { dev_err(adev->dev, "VM IB without ID\n"); - return -EINVAL; + r = -EINVAL; + goto free_fence; } if ((ib->flags & AMDGPU_IB_FLAGS_SECURE) && (!ring->funcs->secure_submission_supported)) { dev_err(adev->dev, "secure submissions not supported on ring <%s>\n", ring->name); - return -EINVAL; + r = -EINVAL; + goto free_fence; } alloc_size = ring->funcs->emit_frame_size + num_ibs * @@ -196,7 +199,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned int num_ibs, r = amdgpu_ring_alloc(ring, alloc_size); if (r) { dev_err(adev->dev, "scheduling IB failed (%d).\n", r); - return r; + goto free_fence; } 
need_ctx_switch = ring->current_ctx != fence_ctx; @@ -302,6 +305,9 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned int num_ibs, return r; } *f = &af->base; + /* get a ref for the job */ + if (job) + dma_fence_get(*f); if (ring->funcs->insert_end) ring->funcs->insert_end(ring); @@ -328,6 +334,11 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned int num_ibs, amdgpu_ring_commit(ring); return 0; + +free_fence: + if (!job) + kfree(af); + return r; } /** diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index 9e0cd1e0afc3..7d8ef7ae10c2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -293,6 +293,15 @@ static void amdgpu_job_free_cb(struct drm_sched_job *s_job) amdgpu_sync_free(&job->explicit_sync); + if (job->hw_fence->base.ops) + dma_fence_put(&job->hw_fence->base); + else + kfree(job->hw_fence); + if (job->hw_vm_fence->base.ops) + dma_fence_put(&job->hw_vm_fence->base); + else + kfree(job->hw_vm_fence); + kfree(job); } @@ -322,6 +331,15 @@ void amdgpu_job_free(struct amdgpu_job *job) if (job->gang_submit != &job->base.s_fence->scheduled) dma_fence_put(job->gang_submit); + if (job->hw_fence->base.ops) + dma_fence_put(&job->hw_fence->base); + else + kfree(job->hw_fence); + if (job->hw_vm_fence->base.ops) + dma_fence_put(&job->hw_vm_fence->base); + else + kfree(job->hw_vm_fence); + kfree(job); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 453d3b576456..700b4a776532 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -849,6 +849,8 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job, if (r) return r; fence = &job->hw_vm_fence->base; + /* get a ref for the job */ + dma_fence_get(fence); } if (vm_flush_needed) { From 9ce015e5fd025a23e357bbbe6602c1e7cb4b89e0 Mon Sep 17 00:00:00 2001 From: Gangliang Xie Date: Thu, 4 Sep 2025 18:04:33 
+0800 Subject: [PATCH 66/83] drm/amdgpu: adapt reset function for pmfw eeprom adapt reset function for pmfw eeprom Signed-off-by: Gangliang Xie Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 61 +++++++++++-------- 1 file changed, 36 insertions(+), 25 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index 258ff0f121a2..e9c5781e4376 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -444,40 +444,51 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control) struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr; struct amdgpu_ras_eeprom_table_ras_info *rai = &control->tbl_rai; struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + u32 erase_res = 0; u8 csum; int res; mutex_lock(&control->ras_tbl_mutex); - hdr->header = RAS_TABLE_HDR_VAL; - amdgpu_ras_set_eeprom_table_version(control); + if (!amdgpu_ras_smu_eeprom_supported(adev)) { + hdr->header = RAS_TABLE_HDR_VAL; + amdgpu_ras_set_eeprom_table_version(control); - if (hdr->version >= RAS_TABLE_VER_V2_1) { - hdr->first_rec_offset = RAS_RECORD_START_V2_1; - hdr->tbl_size = RAS_TABLE_HEADER_SIZE + - RAS_TABLE_V2_1_INFO_SIZE; - rai->rma_status = GPU_HEALTH_USABLE; - /** - * GPU health represented as a percentage. - * 0 means worst health, 100 means fully health. - */ - rai->health_percent = 100; - /* ecc_page_threshold = 0 means disable bad page retirement */ - rai->ecc_page_threshold = con->bad_page_cnt_threshold; + if (hdr->version >= RAS_TABLE_VER_V2_1) { + hdr->first_rec_offset = RAS_RECORD_START_V2_1; + hdr->tbl_size = RAS_TABLE_HEADER_SIZE + + RAS_TABLE_V2_1_INFO_SIZE; + rai->rma_status = GPU_HEALTH_USABLE; + /** + * GPU health represented as a percentage. + * 0 means worst health, 100 means fully health. 
+ */ + rai->health_percent = 100; + /* ecc_page_threshold = 0 means disable bad page retirement */ + rai->ecc_page_threshold = con->bad_page_cnt_threshold; + } else { + hdr->first_rec_offset = RAS_RECORD_START; + hdr->tbl_size = RAS_TABLE_HEADER_SIZE; + } + + csum = __calc_hdr_byte_sum(control); + if (hdr->version >= RAS_TABLE_VER_V2_1) + csum += __calc_ras_info_byte_sum(control); + csum = -csum; + hdr->checksum = csum; + res = __write_table_header(control); + if (!res && hdr->version > RAS_TABLE_VER_V1) + res = __write_table_ras_info(control); } else { - hdr->first_rec_offset = RAS_RECORD_START; - hdr->tbl_size = RAS_TABLE_HEADER_SIZE; + res = amdgpu_ras_smu_erase_ras_table(adev, &erase_res); + if (res || erase_res) { + dev_warn(adev->dev, "RAS EEPROM reset failed, res:%d result:%d", + res, erase_res); + if (!res) + res = -EIO; + } } - csum = __calc_hdr_byte_sum(control); - if (hdr->version >= RAS_TABLE_VER_V2_1) - csum += __calc_ras_info_byte_sum(control); - csum = -csum; - hdr->checksum = csum; - res = __write_table_header(control); - if (!res && hdr->version > RAS_TABLE_VER_V1) - res = __write_table_ras_info(control); - control->ras_num_recs = 0; control->ras_num_bad_pages = 0; control->ras_num_mca_recs = 0; From 19c815d516745083994009518f63092a89f6edc1 Mon Sep 17 00:00:00 2001 From: Gangliang Xie Date: Thu, 4 Sep 2025 18:07:40 +0800 Subject: [PATCH 67/83] drm/amdgpu: add initialization function for pmfw eeprom add initialization function for pmfw eeprom Signed-off-by: Gangliang Xie Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 39 +++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index e9c5781e4376..6b51574530a4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -1386,6 +1386,42 @@ static int __read_table_ras_info(struct 
amdgpu_ras_eeprom_control *control) return res == RAS_TABLE_V2_1_INFO_SIZE ? 0 : res; } +static int amdgpu_ras_smu_eeprom_init(struct amdgpu_ras_eeprom_control *control) +{ + struct amdgpu_device *adev = to_amdgpu_device(control); + struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr; + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); + uint64_t local_time; + int res; + + ras->is_rma = false; + + if (!__is_ras_eeprom_supported(adev)) + return 0; + mutex_init(&control->ras_tbl_mutex); + + res = amdgpu_ras_smu_get_table_version(adev, &(hdr->version)); + if (res) + return res; + + res = amdgpu_ras_smu_get_badpage_count(adev, + &(control->ras_num_recs), 100); + if (res) + return res; + + local_time = (uint64_t)ktime_get_real_seconds(); + res = amdgpu_ras_smu_set_timestamp(adev, local_time); + if (res) + return res; + + control->ras_max_record_count = 4000; + + control->ras_num_mca_recs = 0; + control->ras_num_pa_recs = 0; + + return 0; +} + int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control) { struct amdgpu_device *adev = to_amdgpu_device(control); @@ -1394,6 +1430,9 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control) struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); int res; + if (amdgpu_ras_smu_eeprom_supported(adev)) + return amdgpu_ras_smu_eeprom_init(control); + ras->is_rma = false; if (!__is_ras_eeprom_supported(adev)) From cd5b28a040edc46dd251cd681d6f414953a988c6 Mon Sep 17 00:00:00 2001 From: Gangliang Xie Date: Thu, 4 Sep 2025 18:09:40 +0800 Subject: [PATCH 68/83] drm/amdgpu: add check function for pmfw eeprom add check function for pmfw eeprom Signed-off-by: Gangliang Xie Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 44 +++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index 6b51574530a4..3c646d9dad77 100644 --- 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -1499,6 +1499,47 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control) return 0; } +static int amdgpu_ras_smu_eeprom_check(struct amdgpu_ras_eeprom_control *control) +{ + struct amdgpu_device *adev = to_amdgpu_device(control); + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); + + if (!__is_ras_eeprom_supported(adev)) + return 0; + + control->ras_num_bad_pages = ras->bad_page_num; + + if ((ras->bad_page_cnt_threshold < control->ras_num_bad_pages) && + amdgpu_bad_page_threshold != 0) { + dev_warn(adev->dev, + "RAS records:%d exceed threshold:%d\n", + control->ras_num_bad_pages, ras->bad_page_cnt_threshold); + if ((amdgpu_bad_page_threshold == -1) || + (amdgpu_bad_page_threshold == -2)) { + dev_warn(adev->dev, + "Please consult AMD Service Action Guide (SAG) for appropriate service procedures\n"); + } else { + ras->is_rma = true; + dev_warn(adev->dev, + "User defined threshold is set, runtime service will be halt when threshold is reached\n"); + } + + return 0; + } + + dev_dbg(adev->dev, + "Found existing EEPROM table with %d records", + control->ras_num_bad_pages); + + /* Warn if we are at 90% of the threshold or above + */ + if (10 * control->ras_num_bad_pages >= 9 * ras->bad_page_cnt_threshold) + dev_warn(adev->dev, "RAS records:%u exceeds 90%% of threshold:%d", + control->ras_num_bad_pages, + ras->bad_page_cnt_threshold); + return 0; +} + int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control) { struct amdgpu_device *adev = to_amdgpu_device(control); @@ -1506,6 +1547,9 @@ int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control) struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); int res = 0; + if (amdgpu_ras_smu_eeprom_supported(adev)) + return amdgpu_ras_smu_eeprom_check(control); + if (!__is_ras_eeprom_supported(adev)) return 0; From a448c40ff275d9506c859e9874048454c9791860 Mon Sep 17 00:00:00 2001 
From: Gangliang Xie Date: Wed, 22 Oct 2025 10:36:40 +0800 Subject: [PATCH 69/83] drm/amd/pm: check pmfw eeprom feature bit get and check the pmfw eeprom feature bit to decide if pmfw eeprom is supported Signed-off-by: Gangliang Xie Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- .../gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_12_pmfw.h | 2 +- drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h | 3 ++- drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_12_ppt.c | 7 +++++++ drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 3 +++ drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.h | 1 + 5 files changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_12_pmfw.h b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_12_pmfw.h index bf6aa9620911..fa43d2e229a0 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_12_pmfw.h +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_12_pmfw.h @@ -87,7 +87,7 @@ typedef enum { /*37*/ FEATURE_DVO = 37, /*38*/ FEATURE_XVMINORPSM_CLKSTOP_DS = 38, /*39*/ FEATURE_GLOBAL_DPM = 39, -/*40*/ FEATURE_NODE_POWER_MANAGER = 40, +/*40*/ FEATURE_HROM_EN = 40, /*41*/ NUM_FEATURES = 41 } FEATURE_LIST_e; diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h index 9315ce49b396..3a3930ef7ed9 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h @@ -465,7 +465,8 @@ enum smu_clk_type { __SMU_DUMMY_MAP(GFX_EDC_XVMIN), \ __SMU_DUMMY_MAP(GFX_DIDT_XVMIN), \ __SMU_DUMMY_MAP(FAN_ABNORMAL), \ - __SMU_DUMMY_MAP(PIT), + __SMU_DUMMY_MAP(PIT), \ + __SMU_DUMMY_MAP(HROM_EN), #undef __SMU_DUMMY_MAP #define __SMU_DUMMY_MAP(feature) SMU_FEATURE_##feature##_BIT diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_12_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_12_ppt.c index 24aaef1494a4..0ce8cff27bf9 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_12_ppt.c +++ 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_12_ppt.c @@ -82,6 +82,7 @@ const struct cmn2asic_mapping smu_v13_0_12_feature_mask_map[SMU_FEATURE_COUNT] = SMU_13_0_12_FEA_MAP(SMU_FEATURE_DS_MPIOCLK_BIT, FEATURE_DS_MPIOCLK), SMU_13_0_12_FEA_MAP(SMU_FEATURE_DS_MP0CLK_BIT, FEATURE_DS_MP0CLK), SMU_13_0_12_FEA_MAP(SMU_FEATURE_PIT_BIT, FEATURE_PIT), + SMU_13_0_12_FEA_MAP(SMU_FEATURE_HROM_EN_BIT, FEATURE_HROM_EN), }; const struct cmn2asic_msg_mapping smu_v13_0_12_message_map[SMU_MSG_MAX_COUNT] = { @@ -1044,10 +1045,16 @@ static const struct ras_eeprom_smu_funcs smu_v13_0_12_eeprom_smu_funcs = { static void smu_v13_0_12_ras_smu_feature_flags(struct amdgpu_device *adev, uint64_t *flags) { + struct smu_context *smu = adev->powerplay.pp_handle; + if (!flags) return; *flags = 0ULL; + + if (smu_v13_0_6_cap_supported(smu, SMU_CAP(RAS_EEPROM))) + *flags |= RAS_SMU_FEATURE_BIT__RAS_EEPROM; + } const struct ras_smu_drv smu_v13_0_12_ras_smu_drv = { diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c index 095f54b7e9e6..31bdaabbd59c 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c @@ -3913,6 +3913,9 @@ static int smu_v13_0_6_get_ras_smu_drv(struct smu_context *smu, const struct ras if (amdgpu_sriov_vf(smu->adev)) return -EOPNOTSUPP; + if (smu_cmn_feature_is_enabled(smu, SMU_FEATURE_HROM_EN_BIT)) + smu_v13_0_6_cap_set(smu, SMU_CAP(RAS_EEPROM)); + switch (amdgpu_ip_version(smu->adev, MP1_HWIP, 0)) { case IP_VERSION(13, 0, 12): *ras_smu_drv = &smu_v13_0_12_ras_smu_drv; diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.h b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.h index ecec7af8a64f..367102cdbf09 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.h +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.h @@ -72,6 +72,7 @@ enum smu_v13_0_6_caps { SMU_CAP(PLDM_VERSION), SMU_CAP(TEMP_METRICS), 
SMU_CAP(NPM_METRICS), + SMU_CAP(RAS_EEPROM), SMU_CAP(ALL), }; From 1349b31313d5bf840e0ffe780d17acafc4ef81f8 Mon Sep 17 00:00:00 2001 From: Gangliang Xie Date: Fri, 31 Oct 2025 13:41:36 +0800 Subject: [PATCH 70/83] drm/amdgpu: initialize max record count after table reset initialize max record count and record offset after table reset Signed-off-by: Gangliang Xie Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index 3c646d9dad77..d7e2a81bc274 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -459,6 +459,9 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control) hdr->tbl_size = RAS_TABLE_HEADER_SIZE + RAS_TABLE_V2_1_INFO_SIZE; rai->rma_status = GPU_HEALTH_USABLE; + + control->ras_record_offset = RAS_RECORD_START_V2_1; + control->ras_max_record_count = RAS_MAX_RECORD_COUNT_V2_1; /** * GPU health represented as a percentage. * 0 means worst health, 100 means fully health. 
@@ -469,6 +472,9 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control) } else { hdr->first_rec_offset = RAS_RECORD_START; hdr->tbl_size = RAS_TABLE_HEADER_SIZE; + + control->ras_record_offset = RAS_RECORD_START; + control->ras_max_record_count = RAS_MAX_RECORD_COUNT; } csum = __calc_hdr_byte_sum(control); From b665f29a2f93a266183dc8cd5d732f203157613b Mon Sep 17 00:00:00 2001 From: "David (Ming Qiang) Wu" Date: Tue, 28 Oct 2025 18:42:05 -0400 Subject: [PATCH 71/83] drm/amdgpu/userq: need to unref bo unref bo after amdgpu_bo_reserve() failure as it has called amdgpu_bo_ref() already Reviewed-by: Alex Deucher Signed-off-by: David (Ming Qiang) Wu Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c index 2aeeaa954882..99ae1d19b751 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c @@ -386,6 +386,7 @@ static int amdgpu_userq_fence_read_wptr(struct amdgpu_usermode_queue *queue, amdgpu_bo_unreserve(queue->vm->root.bo); r = amdgpu_bo_reserve(bo, true); if (r) { + amdgpu_bo_unref(&bo); DRM_ERROR("Failed to reserve userqueue wptr bo"); return r; } From 07528f7d97f9f78a6e075662404ed4217ac6707b Mon Sep 17 00:00:00 2001 From: Ahmad Rehman Date: Tue, 4 Nov 2025 12:23:09 -0500 Subject: [PATCH 72/83] drm/amdkfd: Do not wait for queue op response during reset This patch adds the condition to not wait for the queue response for unmap, if the gpu is in reset. 
Signed-off-by: Ahmad Rehman Reviewed-by: Lijo Lazar Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index 4fbe865ff279..0904c36192c7 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -2094,7 +2094,8 @@ int amdkfd_fence_wait_timeout(struct device_queue_manager *dqm, while (*fence_addr != fence_value) { /* Fatal err detected, this response won't come */ - if (amdgpu_amdkfd_is_fed(dqm->dev->adev)) + if (amdgpu_amdkfd_is_fed(dqm->dev->adev) || + amdgpu_in_reset(dqm->dev->adev)) return -EIO; if (time_after(jiffies, end_jiffies)) { From 127cdd726f997d2aeadb43d3c4b299c3d101aa7a Mon Sep 17 00:00:00 2001 From: YiPeng Chai Date: Tue, 4 Nov 2025 20:07:58 +0800 Subject: [PATCH 73/83] drm/amd/ras: ras supports i2c eeprom for mp1 v13_0_12 ras supports i2c eeprom for mp1 v13_0_12. 
Signed-off-by: YiPeng Chai Acked-by: Alex Deucher Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_eeprom_i2c.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_eeprom_i2c.c b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_eeprom_i2c.c index 1bb7b7001ec7..3ed3ff42b7e1 100644 --- a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_eeprom_i2c.c +++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_eeprom_i2c.c @@ -85,6 +85,7 @@ static int ras_eeprom_i2c_config(struct ras_core_context *ras_core) case IP_VERSION(13, 0, 5): case IP_VERSION(13, 0, 6): case IP_VERSION(13, 0, 10): + case IP_VERSION(13, 0, 12): case IP_VERSION(13, 0, 14): control->i2c_address = EEPROM_I2C_MADDR_4; return 0; From 5d1b32cfe4a676fe552416cb5ae847b215463a1a Mon Sep 17 00:00:00 2001 From: Samuel Zhang Date: Wed, 5 Nov 2025 03:04:08 +0000 Subject: [PATCH 74/83] drm/amdgpu: fix gpu page fault after hibernation on PF passthrough On a PF passthrough environment, after hibernate and then resume, coralgemm will cause a gpu page fault. Mode1 reset happens during hibernate, but the partition mode is not restored on resume, so registers mmCP_HYP_XCP_CTL and mmCP_PSP_XCP_CTL are not right after resume. When CP accesses the MQD BO, the wrong stride size is used; this will cause out of bound access on the MQD BO, resulting in a page fault. The fix is to ensure gfx_v9_4_3_switch_compute_partition() is called when resuming from hibernation. KFD resume is called separately during a reset recovery or resume from suspend sequence. Hence it's not required to be called as part of partition switch.
Signed-off-by: Samuel Zhang Reviewed-by: Lijo Lazar Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c | 3 ++- drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c b/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c index 811124ff88a8..f9e2edf5260b 100644 --- a/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c +++ b/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c @@ -407,7 +407,8 @@ static int aqua_vanjaram_switch_partition_mode(struct amdgpu_xcp_mgr *xcp_mgr, return -EINVAL; } - if (adev->kfd.init_complete && !amdgpu_in_reset(adev)) + if (adev->kfd.init_complete && !amdgpu_in_reset(adev) && + !adev->in_suspend) flags |= AMDGPU_XCP_OPS_KFD; if (flags & AMDGPU_XCP_OPS_KFD) { diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c index c4c551ef6b87..cbb74ffc4792 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c @@ -2291,7 +2291,9 @@ static int gfx_v9_4_3_cp_resume(struct amdgpu_device *adev) r = amdgpu_xcp_init(adev->xcp_mgr, num_xcp, mode); } else { - if (amdgpu_xcp_query_partition_mode(adev->xcp_mgr, + if (adev->in_suspend) + amdgpu_xcp_restore_partition_mode(adev->xcp_mgr); + else if (amdgpu_xcp_query_partition_mode(adev->xcp_mgr, AMDGPU_XCP_FL_NONE) == AMDGPU_UNKNOWN_COMPUTE_PARTITION_MODE) r = amdgpu_xcp_switch_partition_mode( From be031770bfc1662e773e2dfc696f1a0c1401f300 Mon Sep 17 00:00:00 2001 From: YiPeng Chai Date: Wed, 5 Nov 2025 17:25:37 +0800 Subject: [PATCH 75/83] drm/amd/ras: Fix the issue of incorrect function call When amdgpu_device_health_check fails, amdgpu_ras_pre_reset will not be called and therefore amdgpu_ras_post_reset cannot be called either. 
Signed-off-by: YiPeng Chai Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index dcf6fce1c5a2..86255c13fbb7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -6694,8 +6694,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, amdgpu_device_gpu_resume(adev, &device_list, need_emergency_restart); reset_unlock: amdgpu_device_recovery_put_reset_lock(adev, &device_list); -end_reset: amdgpu_ras_post_reset(adev, &device_list); +end_reset: if (hive) { mutex_unlock(&hive->hive_lock); amdgpu_put_xgmi_hive(hive); From 4104c0a454f6a4d1e0d14895d03c0e7bdd0c8240 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Tue, 4 Nov 2025 13:38:02 -0600 Subject: [PATCH 76/83] drm/amd: Fix suspend failure with secure display TA commit c760bcda83571 ("drm/amd: Check whether secure display TA loaded successfully") attempted to fix extra messages, but failed to port the cleanup that was in commit 5c6d52ff4b61e ("drm/amd: Don't try to enable secure display TA multiple times") to prevent multiple tries. Add that to the failure handling path even on a quick failure. 
Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/4679 Fixes: c760bcda8357 ("drm/amd: Check whether secure display TA loaded successfully") Signed-off-by: Mario Limonciello Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index 8c0e5d03de50..aa7987d0806c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -2355,8 +2355,11 @@ static int psp_securedisplay_initialize(struct psp_context *psp) if (!ret && !psp->securedisplay_context.context.resp_status) { psp->securedisplay_context.context.initialized = true; mutex_init(&psp->securedisplay_context.mutex); - } else + } else { + /* don't try again */ + psp->securedisplay_context.context.bin_desc.size_bytes = 0; return ret; + } mutex_lock(&psp->securedisplay_context.mutex); From cd74132be8461429b5e98991aa15edeeb81a9f56 Mon Sep 17 00:00:00 2001 From: Tao Zhou Date: Tue, 30 Sep 2025 10:56:00 +0800 Subject: [PATCH 77/83] drm/amdgpu: make MCA IPID parse global So we can call it in other blocks. v2: add a new IPID parse interface for umc and we can implement it for each ASIC. 
Signed-off-by: Tao Zhou Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h | 2 ++ drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 14 ++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h index ec203f9e5ffa..28dff750c47e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h @@ -113,6 +113,8 @@ struct amdgpu_umc_ras { uint32_t (*get_die_id_from_pa)(struct amdgpu_device *adev, uint64_t mca_addr, uint64_t retired_page); void (*get_retire_flip_bits)(struct amdgpu_device *adev); + void (*mca_ipid_parse)(struct amdgpu_device *adev, uint64_t ipid, + uint32_t *did, uint32_t *ch, uint32_t *umc_inst, uint32_t *sid); }; struct amdgpu_umc_funcs { diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c index 8dc32787d625..0f5b1719fda5 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c @@ -711,6 +711,19 @@ static uint32_t umc_v12_0_get_die_id(struct amdgpu_device *adev, return die; } +static void umc_v12_0_mca_ipid_parse(struct amdgpu_device *adev, uint64_t ipid, + uint32_t *did, uint32_t *ch, uint32_t *umc_inst, uint32_t *sid) +{ + if (did) + *did = MCA_IPID_2_DIE_ID(ipid); + if (ch) + *ch = MCA_IPID_2_UMC_CH(ipid); + if (umc_inst) + *umc_inst = MCA_IPID_2_UMC_INST(ipid); + if (sid) + *sid = MCA_IPID_2_SOCKET_ID(ipid); +} + struct amdgpu_umc_ras umc_v12_0_ras = { .ras_block = { .hw_ops = &umc_v12_0_ras_hw_ops, @@ -724,5 +737,6 @@ struct amdgpu_umc_ras umc_v12_0_ras = { .convert_ras_err_addr = umc_v12_0_convert_error_address, .get_die_id_from_pa = umc_v12_0_get_die_id, .get_retire_flip_bits = umc_v12_0_get_retire_flip_bits, + .mca_ipid_parse = umc_v12_0_mca_ipid_parse, }; From 7f34ddf77d30959fcc24cd279074f7e0d4f732df Mon Sep 17 00:00:00 2001 From: Tao Zhou Date: Wed, 23 Jul 2025 19:04:17 +0800 Subject: [PATCH 78/83] 
drm/amdgpu: add ras_eeprom_read_idx interface PMFW will manage RAS eeprom data by itself, add new interface to read eeprom data via PMFW, we can read part of records by setting index. v2: use IPID parse interface. pa is not used and set it to a fixed value. v3: optimize the null pointer check for IPID parse interface. Signed-off-by: Tao Zhou Reviewed-by: Yang Wang Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 51 +++++++++++++++++++ .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h | 4 ++ 2 files changed, 55 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index d7e2a81bc274..9854238ce7bf 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -970,6 +970,50 @@ static int __amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control, return res; } +int amdgpu_ras_eeprom_read_idx(struct amdgpu_ras_eeprom_control *control, + struct eeprom_table_record *record, u32 rec_idx, + const u32 num) +{ + struct amdgpu_device *adev = to_amdgpu_device(control); + uint64_t ts, end_idx; + int i, ret; + u64 mca, ipid; + + if (!amdgpu_ras_smu_eeprom_supported(adev)) + return 0; + + if (!adev->umc.ras || !adev->umc.ras->mca_ipid_parse) + return -EOPNOTSUPP; + + end_idx = rec_idx + num; + for (i = rec_idx; i < end_idx; i++) { + ret = amdgpu_ras_smu_get_badpage_mca_addr(adev, i, &mca); + if (ret) + return ret; + + ret = amdgpu_ras_smu_get_badpage_ipid(adev, i, &ipid); + if (ret) + return ret; + + ret = amdgpu_ras_smu_get_timestamp(adev, i, &ts); + if (ret) + return ret; + + record[i - rec_idx].address = mca; + /* retired_page (pa) is unused now */ + record[i - rec_idx].retired_page = 0x1ULL; + record[i - rec_idx].ts = ts; + record[i - rec_idx].err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE; + record[i - rec_idx].cu = 0; + + adev->umc.ras->mca_ipid_parse(adev, ipid, NULL, + (uint32_t *)&(record[i - 
rec_idx].mem_channel), + (uint32_t *)&(record[i - rec_idx].mcumc_id), NULL); + } + + return 0; +} + /** * amdgpu_ras_eeprom_read -- read EEPROM * @control: pointer to control structure @@ -991,6 +1035,9 @@ int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control, u8 *buf, *pp; u32 g0, g1; + if (amdgpu_ras_smu_eeprom_supported(adev)) + return amdgpu_ras_eeprom_read_idx(control, record, 0, num); + if (!__is_ras_eeprom_supported(adev)) return 0; @@ -1162,6 +1209,10 @@ static ssize_t amdgpu_ras_debugfs_table_read(struct file *f, char __user *buf, int res = -EFAULT; size_t data_len; + /* pmfw manages eeprom data by itself */ + if (amdgpu_ras_smu_eeprom_supported(adev)) + return 0; + mutex_lock(&control->ras_tbl_mutex); /* We want *pos - data_len > 0, which means there's diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h index cfbd402ddea2..e881007f715b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h @@ -186,6 +186,10 @@ int amdgpu_ras_smu_get_badpage_ipid(struct amdgpu_device *adev, int amdgpu_ras_smu_erase_ras_table(struct amdgpu_device *adev, uint32_t *result); +int amdgpu_ras_eeprom_read_idx(struct amdgpu_ras_eeprom_control *control, + struct eeprom_table_record *record, u32 rec_idx, + const u32 num); + extern const struct file_operations amdgpu_ras_debugfs_eeprom_size_ops; extern const struct file_operations amdgpu_ras_debugfs_eeprom_table_ops; From 1ad25fd272753db14c5d1cc8c68e20ce01f3f888 Mon Sep 17 00:00:00 2001 From: Lijo Lazar Date: Mon, 3 Nov 2025 16:21:50 +0530 Subject: [PATCH 79/83] drm/amdgpu: Fix wait after reset sequence in S3 For a mode-1 reset done at the end of S3 on PSPv11 dGPUs, only check if TOS is unloaded. 
Fixes: 32f73741d6ee ("drm/amdgpu: Wait for bootloader after PSPv11 reset") Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/4649 Signed-off-by: Lijo Lazar Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 9 +++++++-- drivers/gpu/drm/amd/amdgpu/psp_v11_0.c | 26 ++++++++++++++++++++++++- 2 files changed, 32 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 3776901bbb1b..cb522d6272d6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -2625,9 +2625,14 @@ static int amdgpu_pmops_suspend_noirq(struct device *dev) { struct drm_device *drm_dev = dev_get_drvdata(dev); struct amdgpu_device *adev = drm_to_adev(drm_dev); + int r; - if (amdgpu_acpi_should_gpu_reset(adev)) - return amdgpu_asic_reset(adev); + if (amdgpu_acpi_should_gpu_reset(adev)) { + amdgpu_device_lock_reset_domain(adev->reset_domain); + r = amdgpu_asic_reset(adev); + amdgpu_device_unlock_reset_domain(adev->reset_domain); + return r; + } return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c b/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c index 64b240b51f1a..a9be7a505026 100644 --- a/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c @@ -142,13 +142,37 @@ static int psp_v11_0_init_microcode(struct psp_context *psp) return err; } +static int psp_v11_wait_for_tos_unload(struct psp_context *psp) +{ + struct amdgpu_device *adev = psp->adev; + uint32_t sol_reg1, sol_reg2; + int retry_loop; + + /* Wait for the TOS to be unloaded */ + for (retry_loop = 0; retry_loop < 20; retry_loop++) { + sol_reg1 = RREG32_SOC15(MP0, 0, mmMP0_SMN_C2PMSG_81); + usleep_range(1000, 2000); + sol_reg2 = RREG32_SOC15(MP0, 0, mmMP0_SMN_C2PMSG_81); + if (sol_reg1 == sol_reg2) + return 0; + } + dev_err(adev->dev, "TOS unload failed, C2PMSG_33: %x C2PMSG_81: %x", + RREG32_SOC15(MP0, 0, mmMP0_SMN_C2PMSG_33), + RREG32_SOC15(MP0, 
0, mmMP0_SMN_C2PMSG_81)); + + return -ETIME; +} + static int psp_v11_0_wait_for_bootloader(struct psp_context *psp) { struct amdgpu_device *adev = psp->adev; - int ret; int retry_loop; + /* For a reset done at the end of S3, only wait for TOS to be unloaded */ + if (adev->in_s3 && !(adev->flags & AMD_IS_APU) && amdgpu_in_reset(adev)) + return psp_v11_wait_for_tos_unload(psp); + for (retry_loop = 0; retry_loop < 20; retry_loop++) { /* Wait for bootloader to signify that is ready having bit 31 of C2PMSG_35 set to 1 */ From e1ca536e1772f952e1b08be47fe9006c54a711a8 Mon Sep 17 00:00:00 2001 From: Tao Zhou Date: Thu, 24 Jul 2025 15:01:03 +0800 Subject: [PATCH 80/83] drm/amdgpu: support to load RAS bad pages from PMFW PMFW manages eeprom bad page records, update bad page loading accordingly. Signed-off-by: Tao Zhou Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 62d2f988d88f..055a9bbabbdb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -3158,8 +3158,12 @@ static int __amdgpu_ras_convert_rec_from_rom(struct amdgpu_device *adev, int i = 0; enum amdgpu_memory_partition save_nps; - save_nps = (bps->retired_page >> UMC_NPS_SHIFT) & UMC_NPS_MASK; - bps->retired_page &= ~(UMC_NPS_MASK << UMC_NPS_SHIFT); + if (!amdgpu_ras_smu_eeprom_supported(adev)) { + save_nps = (bps->retired_page >> UMC_NPS_SHIFT) & UMC_NPS_MASK; + bps->retired_page &= ~(UMC_NPS_MASK << UMC_NPS_SHIFT); + } else { + save_nps = nps; + } if (save_nps == nps) { if (amdgpu_umc_pages_in_a_row(adev, err_data, @@ -3225,7 +3229,8 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, if (from_rom) { /* there is no pa recs in V3, so skip pa recs processing */ - if (control->tbl_hdr.version < RAS_TABLE_VER_V3) { + if ((control->tbl_hdr.version < 
RAS_TABLE_VER_V3) && + !amdgpu_ras_smu_eeprom_supported(adev)) { for (i = 0; i < pages; i++) { if (control->ras_num_recs - i >= adev->umc.retire_unit) { if ((bps[i].address == bps[i + 1].address) && @@ -3356,7 +3361,8 @@ static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev) /*In V3, there is no pa recs, and some cases(when address==0) may be parsed as pa recs, so add verion check to avoid it. */ - if (control->tbl_hdr.version < RAS_TABLE_VER_V3) { + if ((control->tbl_hdr.version < RAS_TABLE_VER_V3) && + !amdgpu_ras_smu_eeprom_supported(adev)) { for (i = 0; i < control->ras_num_recs; i++) { if ((control->ras_num_recs - i) >= adev->umc.retire_unit) { if ((bps[i].address == bps[i + 1].address) && From 62320fb8d91a0bddc44a228203cfa9bfbb5395bd Mon Sep 17 00:00:00 2001 From: Wayne Lin Date: Wed, 5 Nov 2025 10:36:31 +0800 Subject: [PATCH 81/83] drm/amd/display: Enable mst when it's detected but yet to be initialized [Why] drm_dp_mst_topology_queue_probe() is used under the assumption that mst is already initialized. If we connect system with SST first then switch to the mst branch during suspend, we will fail probing topology by calling the wrong API since the mst manager is yet to be initialized. [How] At dm_resume(), once it's detected as mst branch connected, check if the mst is initialized already. 
If not, call dm_helpers_dp_mst_start_top_mgr() instead to initialize mst V2: Adjust the commit msg a bit Fixes: bc068194f548 ("drm/amd/display: Don't write DP_MSTM_CTRL after LT") Cc: Fangzhi Zuo Cc: Mario Limonciello Cc: Alex Deucher Reviewed-by: Tom Chung Signed-off-by: Wayne Lin Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 5f090c13f224..18f1cf16ec18 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -3624,6 +3624,7 @@ static int dm_resume(struct amdgpu_ip_block *ip_block) /* Do mst topology probing after resuming cached state*/ drm_connector_list_iter_begin(ddev, &iter); drm_for_each_connector_iter(connector, &iter) { + bool init = false; if (connector->connector_type == DRM_MODE_CONNECTOR_WRITEBACK) continue; @@ -3633,7 +3634,14 @@ static int dm_resume(struct amdgpu_ip_block *ip_block) aconnector->mst_root) continue; - drm_dp_mst_topology_queue_probe(&aconnector->mst_mgr); + scoped_guard(mutex, &aconnector->mst_mgr.lock) { + init = !aconnector->mst_mgr.mst_primary; + } + if (init) + dm_helpers_dp_mst_start_top_mgr(aconnector->dc_link->ctx, + aconnector->dc_link, false); + else + drm_dp_mst_topology_queue_probe(&aconnector->mst_mgr); } drm_connector_list_iter_end(&iter); From 541414065c59db785000d7661d3d07184e104ec2 Mon Sep 17 00:00:00 2001 From: Tao Zhou Date: Mon, 8 Sep 2025 20:39:49 +0800 Subject: [PATCH 82/83] drm/amdgpu: skip writing eeprom when PMFW manages RAS data Only update bad page number in legacy eeprom write path. v2: add null pointer check for con. 
Signed-off-by: Tao Zhou Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 20 ++++++++++++++++++- .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h | 3 +++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index 9854238ce7bf..670c0dedf4e9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -871,6 +871,18 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control) return res; } +int amdgpu_ras_eeprom_update_record_num(struct amdgpu_ras_eeprom_control *control) +{ + struct amdgpu_device *adev = to_amdgpu_device(control); + + if (!amdgpu_ras_smu_eeprom_supported(adev)) + return 0; + + control->ras_num_recs_old = control->ras_num_recs; + return amdgpu_ras_smu_get_badpage_count(adev, + &(control->ras_num_recs), 12); +} + /** * amdgpu_ras_eeprom_append -- append records to the EEPROM RAS table * @control: pointer to control structure @@ -889,12 +901,18 @@ int amdgpu_ras_eeprom_append(struct amdgpu_ras_eeprom_control *control, const u32 num) { struct amdgpu_device *adev = to_amdgpu_device(control); + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); int res, i; uint64_t nps = AMDGPU_NPS1_PARTITION_MODE; - if (!__is_ras_eeprom_supported(adev)) + if (!__is_ras_eeprom_supported(adev) || !con) return 0; + if (amdgpu_ras_smu_eeprom_supported(adev)) { + control->ras_num_bad_pages = con->bad_page_num; + return 0; + } + if (num == 0) { dev_err(adev->dev, "will not append 0 records\n"); return -EINVAL; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h index e881007f715b..2e5d63957e71 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h @@ -82,6 +82,7 @@ struct amdgpu_ras_eeprom_control { /* Number of records in the table. 
*/ u32 ras_num_recs; + u32 ras_num_recs_old; /* the bad page number is ras_num_recs or * ras_num_recs * umc.retire_unit @@ -190,6 +191,8 @@ int amdgpu_ras_eeprom_read_idx(struct amdgpu_ras_eeprom_control *control, struct eeprom_table_record *record, u32 rec_idx, const u32 num); +int amdgpu_ras_eeprom_update_record_num(struct amdgpu_ras_eeprom_control *control); + extern const struct file_operations amdgpu_ras_debugfs_eeprom_size_ops; extern const struct file_operations amdgpu_ras_debugfs_eeprom_table_ops; From 2e640e8e7b9e9fc0f42c1e15ea0e02d00916ad57 Mon Sep 17 00:00:00 2001 From: Asad Kamal Date: Thu, 6 Nov 2025 14:11:45 +0800 Subject: [PATCH 83/83] drm/amd/pm: Update default power1_cap Update default power1_cap to max limit for smu_v13_0_6 and smu_v13_0_12 Signed-off-by: Asad Kamal Reviewed-by: Lijo Lazar Reviewed-by: Yang Wang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c index 31bdaabbd59c..dd8c7b98ce7e 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c @@ -1872,7 +1872,7 @@ static int smu_v13_0_6_get_power_limit(struct smu_context *smu, if (current_power_limit) *current_power_limit = power_limit; if (default_power_limit) - *default_power_limit = power_limit; + *default_power_limit = pptable->MaxSocketPowerLimit; if (max_power_limit) { *max_power_limit = pptable->MaxSocketPowerLimit;