drm/amdgpu: retry reading the RAS bad page number if it has not been updated

Updating the RAS info in PMFW takes time, so poll for the new bad page number for up to one second instead of reading it only once.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
Tao Zhou 2025-08-27 19:33:02 +08:00 committed by Alex Deucher
parent ec49374ccb
commit 334b27bf71

View File

@ -874,13 +874,33 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
int amdgpu_ras_eeprom_update_record_num(struct amdgpu_ras_eeprom_control *control)
{
struct amdgpu_device *adev = to_amdgpu_device(control);
int ret, timeout = 1000;
if (!amdgpu_ras_smu_eeprom_supported(adev))
return 0;
control->ras_num_recs_old = control->ras_num_recs;
return amdgpu_ras_smu_get_badpage_count(adev,
do {
ret = amdgpu_ras_smu_get_badpage_count(adev,
&(control->ras_num_recs), 12);
if (!ret &&
(control->ras_num_recs_old == control->ras_num_recs)) {
/* record number update in PMFW needs some time */
msleep(50);
timeout -= 50;
} else {
break;
}
} while (timeout);
/* no update of record number is not a real failure,
* don't print warning here
*/
if (!ret && (control->ras_num_recs_old == control->ras_num_recs))
ret = -EINVAL;
return ret;
}
/**