drm/amd/pm: correct mem_busy_percent display due to calculation errors

PMFW may return invalid values due to internal calculation errors.
so, the kmd driver must validate and sanitize the returned values to
prevent issues caused by firmware calculation errors.

For example, values 0xfffe (-2) and 0xffff (-1) are treated
as invalid and clamped to 0.

this applies to devices with CAB (Cache As Buffer) functionality.

Closes: https://gitlab.freedesktop.org/drm/amd/-/work_items/4905
Signed-off-by: Yang Wang <kevinyang.wang@amd.com>
Reviewed-by: Kenneth Feng <kenneth.feng@amd.com>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
Yang Wang 2026-03-25 21:41:46 -04:00 committed by Alex Deucher
parent 95a599c8a2
commit 592713a896
4 changed files with 32 additions and 15 deletions

View File

@ -2164,4 +2164,21 @@ static inline void smu_feature_init(struct smu_context *smu, int feature_num)
smu_feature_list_clear_all(smu, SMU_FEATURE_LIST_ALLOWED);
}
/*
* smu_safe_u16_nn - Make u16 safe by filtering negative overflow errors
* @val: Input u16 value, may contain invalid negative overflows
*
* Convert u16 to non-negative value. Cast to s16 to detect negative values
* caused by calculation errors. Return 0 for negative errors, return
* original value if valid.
*
* Return: Valid u16 value or 0
*/
static inline u16 smu_safe_u16_nn(u16 val)
{
s16 tmp = (s16)val;
return tmp < 0 ? 0 : val;
}
#endif

View File

@ -773,13 +773,13 @@ static int smu_v13_0_0_get_smu_metrics_data(struct smu_context *smu,
*value = metrics->AverageGfxclkFrequencyPreDs;
break;
case METRICS_AVERAGE_FCLK:
if (metrics->AverageUclkActivity <= SMU_13_0_0_BUSY_THRESHOLD)
if (smu_safe_u16_nn(metrics->AverageUclkActivity) <= SMU_13_0_0_BUSY_THRESHOLD)
*value = metrics->AverageFclkFrequencyPostDs;
else
*value = metrics->AverageFclkFrequencyPreDs;
break;
case METRICS_AVERAGE_UCLK:
if (metrics->AverageUclkActivity <= SMU_13_0_0_BUSY_THRESHOLD)
if (smu_safe_u16_nn(metrics->AverageUclkActivity) <= SMU_13_0_0_BUSY_THRESHOLD)
*value = metrics->AverageMemclkFrequencyPostDs;
else
*value = metrics->AverageMemclkFrequencyPreDs;
@ -800,7 +800,7 @@ static int smu_v13_0_0_get_smu_metrics_data(struct smu_context *smu,
*value = metrics->AverageGfxActivity;
break;
case METRICS_AVERAGE_MEMACTIVITY:
*value = metrics->AverageUclkActivity;
*value = smu_safe_u16_nn(metrics->AverageUclkActivity);
break;
case METRICS_AVERAGE_VCNACTIVITY:
*value = max(metrics->Vcn0ActivityPercentage,
@ -2085,7 +2085,7 @@ static ssize_t smu_v13_0_0_get_gpu_metrics(struct smu_context *smu,
metrics->AvgTemperature[TEMP_VR_MEM1]);
gpu_metrics->average_gfx_activity = metrics->AverageGfxActivity;
gpu_metrics->average_umc_activity = metrics->AverageUclkActivity;
gpu_metrics->average_umc_activity = smu_safe_u16_nn(metrics->AverageUclkActivity);
gpu_metrics->average_mm_activity = max(metrics->Vcn0ActivityPercentage,
metrics->Vcn1ActivityPercentage);
@ -2102,7 +2102,7 @@ static ssize_t smu_v13_0_0_get_gpu_metrics(struct smu_context *smu,
else
gpu_metrics->average_gfxclk_frequency = metrics->AverageGfxclkFrequencyPreDs;
if (metrics->AverageUclkActivity <= SMU_13_0_0_BUSY_THRESHOLD)
if (smu_safe_u16_nn(metrics->AverageUclkActivity) <= SMU_13_0_0_BUSY_THRESHOLD)
gpu_metrics->average_uclk_frequency = metrics->AverageMemclkFrequencyPostDs;
else
gpu_metrics->average_uclk_frequency = metrics->AverageMemclkFrequencyPreDs;

View File

@ -783,13 +783,13 @@ static int smu_v13_0_7_get_smu_metrics_data(struct smu_context *smu,
*value = metrics->AverageGfxclkFrequencyPreDs;
break;
case METRICS_AVERAGE_FCLK:
if (metrics->AverageUclkActivity <= SMU_13_0_7_BUSY_THRESHOLD)
if (smu_safe_u16_nn(metrics->AverageUclkActivity) <= SMU_13_0_7_BUSY_THRESHOLD)
*value = metrics->AverageFclkFrequencyPostDs;
else
*value = metrics->AverageFclkFrequencyPreDs;
break;
case METRICS_AVERAGE_UCLK:
if (metrics->AverageUclkActivity <= SMU_13_0_7_BUSY_THRESHOLD)
if (smu_safe_u16_nn(metrics->AverageUclkActivity) <= SMU_13_0_7_BUSY_THRESHOLD)
*value = metrics->AverageMemclkFrequencyPostDs;
else
*value = metrics->AverageMemclkFrequencyPreDs;
@ -814,7 +814,7 @@ static int smu_v13_0_7_get_smu_metrics_data(struct smu_context *smu,
*value = metrics->AverageGfxActivity;
break;
case METRICS_AVERAGE_MEMACTIVITY:
*value = metrics->AverageUclkActivity;
*value = smu_safe_u16_nn(metrics->AverageUclkActivity);
break;
case METRICS_AVERAGE_SOCKETPOWER:
*value = metrics->AverageSocketPower << 8;
@ -2091,7 +2091,7 @@ static ssize_t smu_v13_0_7_get_gpu_metrics(struct smu_context *smu,
metrics->AvgTemperature[TEMP_VR_MEM1]);
gpu_metrics->average_gfx_activity = metrics->AverageGfxActivity;
gpu_metrics->average_umc_activity = metrics->AverageUclkActivity;
gpu_metrics->average_umc_activity = smu_safe_u16_nn(metrics->AverageUclkActivity);
gpu_metrics->average_mm_activity = max(metrics->Vcn0ActivityPercentage,
metrics->Vcn1ActivityPercentage);
@ -2104,7 +2104,7 @@ static ssize_t smu_v13_0_7_get_gpu_metrics(struct smu_context *smu,
else
gpu_metrics->average_gfxclk_frequency = metrics->AverageGfxclkFrequencyPreDs;
if (metrics->AverageUclkActivity <= SMU_13_0_7_BUSY_THRESHOLD)
if (smu_safe_u16_nn(metrics->AverageUclkActivity) <= SMU_13_0_7_BUSY_THRESHOLD)
gpu_metrics->average_uclk_frequency = metrics->AverageMemclkFrequencyPostDs;
else
gpu_metrics->average_uclk_frequency = metrics->AverageMemclkFrequencyPreDs;

View File

@ -661,13 +661,13 @@ static int smu_v14_0_2_get_smu_metrics_data(struct smu_context *smu,
*value = metrics->AverageGfxclkFrequencyPreDs;
break;
case METRICS_AVERAGE_FCLK:
if (metrics->AverageUclkActivity <= SMU_14_0_2_BUSY_THRESHOLD)
if (smu_safe_u16_nn(metrics->AverageUclkActivity) <= SMU_14_0_2_BUSY_THRESHOLD)
*value = metrics->AverageFclkFrequencyPostDs;
else
*value = metrics->AverageFclkFrequencyPreDs;
break;
case METRICS_AVERAGE_UCLK:
if (metrics->AverageUclkActivity <= SMU_14_0_2_BUSY_THRESHOLD)
if (smu_safe_u16_nn(metrics->AverageUclkActivity) <= SMU_14_0_2_BUSY_THRESHOLD)
*value = metrics->AverageMemclkFrequencyPostDs;
else
*value = metrics->AverageMemclkFrequencyPreDs;
@ -688,7 +688,7 @@ static int smu_v14_0_2_get_smu_metrics_data(struct smu_context *smu,
*value = metrics->AverageGfxActivity;
break;
case METRICS_AVERAGE_MEMACTIVITY:
*value = metrics->AverageUclkActivity;
*value = smu_safe_u16_nn(metrics->AverageUclkActivity);
break;
case METRICS_AVERAGE_VCNACTIVITY:
*value = max(metrics->AverageVcn0ActivityPercentage,
@ -2147,7 +2147,7 @@ static ssize_t smu_v14_0_2_get_gpu_metrics(struct smu_context *smu,
metrics->AvgTemperature[TEMP_VR_MEM1]);
gpu_metrics->average_gfx_activity = metrics->AverageGfxActivity;
gpu_metrics->average_umc_activity = metrics->AverageUclkActivity;
gpu_metrics->average_umc_activity = smu_safe_u16_nn(metrics->AverageUclkActivity);
gpu_metrics->average_mm_activity = max(metrics->AverageVcn0ActivityPercentage,
metrics->Vcn1ActivityPercentage);
@ -2159,7 +2159,7 @@ static ssize_t smu_v14_0_2_get_gpu_metrics(struct smu_context *smu,
else
gpu_metrics->average_gfxclk_frequency = metrics->AverageGfxclkFrequencyPreDs;
if (metrics->AverageUclkActivity <= SMU_14_0_2_BUSY_THRESHOLD)
if (smu_safe_u16_nn(metrics->AverageUclkActivity) <= SMU_14_0_2_BUSY_THRESHOLD)
gpu_metrics->average_uclk_frequency = metrics->AverageMemclkFrequencyPostDs;
else
gpu_metrics->average_uclk_frequency = metrics->AverageMemclkFrequencyPreDs;