mirror of
https://github.com/torvalds/linux.git
synced 2026-05-24 15:12:13 +02:00
drm/amd/ras: add check func for pmfw eeprom
add check func for pmfw eeprom Signed-off-by: Gangliang Xie <ganglxie@amd.com> Reviewed-by: Tao Zhou <tao.zhou1@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
parent
b2d13a41da
commit
e82f9aac39
|
|
@ -137,7 +137,8 @@ static int amdgpu_ras_sys_event_notifier(struct ras_core_context *ras_core,
|
|||
break;
|
||||
case RAS_EVENT_ID__DEVICE_RMA:
|
||||
ras_log_ring_add_log_event(ras_core, RAS_LOG_EVENT_RMA, NULL, NULL);
|
||||
ret = amdgpu_dpm_send_rma_reason(ras_core->dev);
|
||||
if (!ras_fw_eeprom_supported(ras_core))
|
||||
ret = amdgpu_dpm_send_rma_reason(ras_core->dev);
|
||||
break;
|
||||
case RAS_EVENT_ID__RESET_GPU:
|
||||
ret = amdgpu_ras_mgr_reset_gpu(ras_core->dev, *(uint32_t *)data);
|
||||
|
|
|
|||
|
|
@ -50,6 +50,13 @@
|
|||
#define GPU_RESET_CAUSE_FATAL (RAS_CORE_RESET_GPU | 0x0002)
|
||||
#define GPU_RESET_CAUSE_RMA (RAS_CORE_RESET_GPU | 0x0004)
|
||||
|
||||
/*
 * GPU health verdict returned by the RAS EEPROM status helpers.
 *
 * As used by ras_fw_eeprom_check_gpu_status():
 *   RAS_GPU_HEALTH_NONE                  - returned when record_threshold_config
 *                                          is zero.
 *   RAS_GPU_HEALTH_USABLE                - threshold config is non-zero and the
 *                                          core is not flagged is_rma.
 *   RAS_GPU_RETIRED__ECC_REACH_THRESHOLD - threshold config is non-zero and the
 *                                          core is flagged is_rma.
 *   RAS_GPU_IN_BAD_STATUS                - NOTE(review): no producer is visible
 *                                          in this change; confirm its meaning
 *                                          against the other users of this enum.
 *
 * The numeric values are spelled out explicitly; keep them stable.
 */
enum ras_gpu_health_status {
|
||||
RAS_GPU_HEALTH_NONE = 0,
|
||||
RAS_GPU_HEALTH_USABLE = 1,
|
||||
RAS_GPU_RETIRED__ECC_REACH_THRESHOLD = 2,
|
||||
RAS_GPU_IN_BAD_STATUS = 3,
|
||||
};
|
||||
|
||||
/*
 * Bit flags describing RAS features provided by firmware.
 *
 * RAS_CORE_FW_FEATURE_BIT__RAS_EEPROM occupies bit 0.
 * NOTE(review): presumably this is the flag that ras_fw_eeprom_supported()
 * tests - confirm against its definition.
 */
enum ras_core_fw_feature_flags {
|
||||
RAS_CORE_FW_FEATURE_BIT__RAS_EEPROM = BIT_ULL(0),
|
||||
};
|
||||
|
|
|
|||
|
|
@ -403,7 +403,10 @@ int ras_core_hw_init(struct ras_core_context *ras_core)
|
|||
goto init_err6;
|
||||
}
|
||||
|
||||
ret = ras_eeprom_check_storage_status(ras_core);
|
||||
if (ras_fw_eeprom_supported(ras_core))
|
||||
ret = ras_fw_eeprom_check_storage_status(ras_core);
|
||||
else
|
||||
ret = ras_eeprom_check_storage_status(ras_core);
|
||||
if (ret)
|
||||
goto init_err6;
|
||||
|
||||
|
|
|
|||
|
|
@ -57,13 +57,6 @@ do { \
|
|||
(RECORD)->retired_row_pfn = tmp; \
|
||||
} while (0)
|
||||
|
||||
/*
 * GPU health verdict returned by the RAS EEPROM status helpers.
 * NOTE(review): an identical definition of this enum appears in another hunk
 * of the same change; this copy looks like the one being moved out of this
 * header - confirm only one definition remains after the change is applied.
 */
enum ras_gpu_health_status {
|
||||
RAS_GPU_HEALTH_NONE = 0,
|
||||
RAS_GPU_HEALTH_USABLE = 1,
|
||||
RAS_GPU_RETIRED__ECC_REACH_THRESHOLD = 2,
|
||||
RAS_GPU_IN_BAD_STATUS = 3,
|
||||
};
|
||||
|
||||
enum ras_eeprom_err_type {
|
||||
RAS_EEPROM_ERR_NA,
|
||||
RAS_EEPROM_ERR_RECOVERABLE,
|
||||
|
|
|
|||
|
|
@ -453,3 +453,54 @@ int ras_fw_eeprom_hw_fini(struct ras_core_context *ras_core)
|
|||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int ras_fw_eeprom_check_storage_status(struct ras_core_context *ras_core)
|
||||
{
|
||||
struct ras_fw_eeprom_control *control = &ras_core->ras_fw_eeprom;
|
||||
int bad_page_count;
|
||||
|
||||
bad_page_count = ras_umc_get_badpage_count(ras_core);
|
||||
|
||||
if ((control->record_threshold_count < bad_page_count) &&
|
||||
(control->record_threshold_config != 0)) {
|
||||
RAS_DEV_ERR(ras_core->dev, "RAS records:%d exceed threshold:%d",
|
||||
bad_page_count, control->record_threshold_count);
|
||||
if ((control->record_threshold_config == WARN_NONSTOP_OVER_THRESHOLD) ||
|
||||
(control->record_threshold_config == NONSTOP_OVER_THRESHOLD)) {
|
||||
RAS_DEV_WARN(ras_core->dev,
|
||||
"Please consult AMD Service Action Guide (SAG) for appropriate service procedures\n");
|
||||
} else {
|
||||
ras_core->is_rma = true;
|
||||
RAS_DEV_ERR(ras_core->dev,
|
||||
"User defined threshold is set, runtime service will be halt when threshold is reached\n");
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
RAS_DEV_INFO(ras_core->dev,
|
||||
"Found existing EEPROM table with %d records\n",
|
||||
bad_page_count);
|
||||
/* Warn if we are at 90% of the threshold or above
|
||||
*/
|
||||
if (10 * bad_page_count >= 9 * control->record_threshold_count)
|
||||
RAS_DEV_WARN(ras_core->dev,
|
||||
"RAS records:%u exceeds 90%% of threshold:%d\n",
|
||||
bad_page_count,
|
||||
control->record_threshold_count);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
enum ras_gpu_health_status
|
||||
ras_fw_eeprom_check_gpu_status(struct ras_core_context *ras_core)
|
||||
{
|
||||
struct ras_fw_eeprom_control *control = &ras_core->ras_fw_eeprom;
|
||||
|
||||
if (!control->record_threshold_config)
|
||||
return RAS_GPU_HEALTH_NONE;
|
||||
|
||||
if (ras_core->is_rma)
|
||||
return RAS_GPU_RETIRED__ECC_REACH_THRESHOLD;
|
||||
|
||||
return RAS_GPU_HEALTH_USABLE;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -79,5 +79,8 @@ int ras_fw_eeprom_update_record(struct ras_core_context *ras_core,
|
|||
struct ras_bank_ecc *ras_ecc);
|
||||
int ras_fw_eeprom_hw_init(struct ras_core_context *ras_core);
|
||||
int ras_fw_eeprom_hw_fini(struct ras_core_context *ras_core);
|
||||
/*
 * Compare the bad page count against the firmware EEPROM record threshold,
 * log the result and set ras_core->is_rma when a stopping threshold
 * configuration is exceeded. The visible implementation always returns 0.
 */
int ras_fw_eeprom_check_storage_status(struct ras_core_context *ras_core);
|
||||
/*
 * Report GPU health: RAS_GPU_HEALTH_NONE when record_threshold_config is zero,
 * RAS_GPU_RETIRED__ECC_REACH_THRESHOLD when ras_core->is_rma is set,
 * RAS_GPU_HEALTH_USABLE otherwise.
 */
enum ras_gpu_health_status
|
||||
ras_fw_eeprom_check_gpu_status(struct ras_core_context *ras_core);
|
||||
|
||||
#endif
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user