drm/amdgpu: try for more times if RAS bad page number is not updated

Updating RAS info in PMFW takes time, so wait for it. Signed-off-by: Tao Zhou <tao.zhou1@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
2026-05-31 10:33:41 +02:00 · 2025-08-27 19:33:02 +08:00 · 2025-08-27 19:33:02 +08:00 · 334b27bf71
commit 334b27bf71
parent ec49374ccb
1 changed files with 21 additions and 1 deletions
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@ -874,13 +874,33 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
 int amdgpu_ras_eeprom_update_record_num(struct amdgpu_ras_eeprom_control *control)
 {
 	struct amdgpu_device *adev = to_amdgpu_device(control);
+	int ret, timeout = 1000;

 	if (!amdgpu_ras_smu_eeprom_supported(adev))
 		return 0;

 	control->ras_num_recs_old = control->ras_num_recs;
-	return amdgpu_ras_smu_get_badpage_count(adev,
+
+	do {
+		ret = amdgpu_ras_smu_get_badpage_count(adev,
 			&(control->ras_num_recs), 12);
+		if (!ret &&
+		    (control->ras_num_recs_old == control->ras_num_recs)) {
+			/* PMFW needs some time to update the record number; sleep, then poll again */
+			msleep(50);
+			timeout -= 50;
+		} else {
+			break;
+		}
+	} while (timeout);
+
+	/* An unchanged record number is not a real failure, so no warning
+	 * is printed here; the caller still gets -EINVAL in that case.
+	 */
+	if (!ret && (control->ras_num_recs_old == control->ras_num_recs))
+		ret = -EINVAL;
+
+	return ret;
 }

 /**