mirror of
https://github.com/torvalds/linux.git
synced 2026-05-25 07:33:19 +02:00
drm/amdgpu: Update usage for bad page threshold
The driver's behavior varies based on the configuration of the amdgpu_bad_page_threshold setting.
Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
parent
c003b5ccaf
commit
16b85a0942
|
|
@ -964,7 +964,7 @@ module_param_named_unsafe(reset_method, amdgpu_reset_method, int, 0644);
|
|||
* result in the GPU entering bad status when the number of total
|
||||
* faulty pages by ECC exceeds the threshold value.
|
||||
*/
|
||||
MODULE_PARM_DESC(bad_page_threshold, "Bad page threshold(-1 = ignore threshold (default value), 0 = disable bad page retirement, -2 = driver sets threshold)");
|
||||
MODULE_PARM_DESC(bad_page_threshold, "Bad page threshold(-1 = ignore threshold (default value), 0 = disable bad page retirement, -2 = threshold determined by a formula, 0 < threshold < max records, user-defined threshold)");
|
||||
module_param_named(bad_page_threshold, amdgpu_bad_page_threshold, int, 0444);
|
||||
|
||||
MODULE_PARM_DESC(num_kcq, "number of kernel compute queue user want to setup (8 if set to greater than 8 or less than 0, only affect gfx 8+)");
|
||||
|
|
|
|||
|
|
@ -3080,31 +3080,29 @@ static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
|
|||
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
||||
|
||||
/*
|
||||
* Justification of value bad_page_cnt_threshold in ras structure
|
||||
*
|
||||
* Generally, 0 <= amdgpu_bad_page_threshold <= max record length
|
||||
* in eeprom or amdgpu_bad_page_threshold == -2, introduce two
|
||||
* scenarios accordingly.
|
||||
*
|
||||
* Bad page retirement enablement:
|
||||
* - If amdgpu_bad_page_threshold = -2,
|
||||
* bad_page_cnt_threshold = typical value by formula.
|
||||
*
|
||||
* - When the value from user is 0 < amdgpu_bad_page_threshold <
|
||||
* max record length in eeprom, use it directly.
|
||||
*
|
||||
* Bad page retirement disablement:
|
||||
* - If amdgpu_bad_page_threshold = 0, bad page retirement
|
||||
* functionality is disabled, and bad_page_cnt_threshold will
|
||||
* take no effect.
|
||||
* amdgpu_bad_page_threshold is used to config
|
||||
* the threshold for the number of bad pages.
|
||||
* -1: Threshold is set to default value
|
||||
* Driver will issue a warning message when threshold is reached
|
||||
* and continue runtime services.
|
||||
* 0: Disable bad page retirement
|
||||
* Driver will not retire bad pages
|
||||
* which is intended for debugging purpose.
|
||||
* -2: Threshold is determined by a formula
|
||||
* that assumes 1 bad page per 100M of local memory.
|
||||
* Driver will continue runtime services when threshold is reached.
|
||||
* 0 < threshold < max number of bad page records in EEPROM,
|
||||
* A user-defined threshold is set
|
||||
* Driver will halt runtime services when this custom threshold is reached.
|
||||
*/
|
||||
|
||||
if (amdgpu_bad_page_threshold < 0) {
|
||||
if (amdgpu_bad_page_threshold == -2) {
|
||||
u64 val = adev->gmc.mc_vram_size;
|
||||
|
||||
do_div(val, RAS_BAD_PAGE_COVER);
|
||||
con->bad_page_cnt_threshold = min(lower_32_bits(val),
|
||||
max_count);
|
||||
} else if (amdgpu_bad_page_threshold == -1) {
|
||||
con->bad_page_cnt_threshold = ((con->reserved_pages_in_bytes) >> 21) << 4;
|
||||
} else {
|
||||
con->bad_page_cnt_threshold = min_t(int, max_count,
|
||||
amdgpu_bad_page_threshold);
|
||||
|
|
@ -3848,8 +3846,10 @@ static void amdgpu_ras_init_reserved_vram_size(struct amdgpu_device *adev)
|
|||
case IP_VERSION(13, 0, 2):
|
||||
case IP_VERSION(13, 0, 6):
|
||||
case IP_VERSION(13, 0, 12):
|
||||
con->reserved_pages_in_bytes = AMDGPU_RAS_RESERVED_VRAM_SIZE_DEFAULT;
|
||||
break;
|
||||
case IP_VERSION(13, 0, 14):
|
||||
con->reserved_pages_in_bytes = AMDGPU_RAS_RESERVED_VRAM_SIZE;
|
||||
con->reserved_pages_in_bytes = (AMDGPU_RAS_RESERVED_VRAM_SIZE_DEFAULT << 1);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
|
|
|
|||
|
|
@ -65,7 +65,7 @@ struct amdgpu_iv_entry;
|
|||
|
||||
/* Reserve 8 physical dram rows for possible retirement.
|
||||
* In worst cases, it will lose 8 * 2MB memory in vram domain */
|
||||
#define AMDGPU_RAS_RESERVED_VRAM_SIZE (16ULL << 20)
|
||||
#define AMDGPU_RAS_RESERVED_VRAM_SIZE_DEFAULT (16ULL << 20)
|
||||
/* The high three bits indicates socketid */
|
||||
#define AMDGPU_RAS_GET_FEATURES(val) ((val) & ~AMDGPU_RAS_FEATURES_SOCKETID_MASK)
|
||||
|
||||
|
|
|
|||
|
|
@ -558,16 +558,17 @@ bool amdgpu_ras_eeprom_check_err_threshold(struct amdgpu_device *adev)
|
|||
return false;
|
||||
|
||||
if (con->eeprom_control.tbl_hdr.header == RAS_TABLE_HDR_BAD) {
|
||||
if (amdgpu_bad_page_threshold == -1) {
|
||||
if (con->eeprom_control.ras_num_bad_pages > con->bad_page_cnt_threshold)
|
||||
dev_warn(adev->dev, "RAS records:%d exceed threshold:%d",
|
||||
con->eeprom_control.ras_num_bad_pages, con->bad_page_cnt_threshold);
|
||||
con->eeprom_control.ras_num_bad_pages, con->bad_page_cnt_threshold);
|
||||
if ((amdgpu_bad_page_threshold == -1) ||
|
||||
(amdgpu_bad_page_threshold == -2)) {
|
||||
dev_warn(adev->dev,
|
||||
"But GPU can be operated due to bad_page_threshold = -1.\n");
|
||||
"Please consult AMD Service Action Guide (SAG) for appropriate service procedures.\n");
|
||||
return false;
|
||||
} else {
|
||||
dev_warn(adev->dev, "This GPU is in BAD status.");
|
||||
dev_warn(adev->dev, "Please retire it or set a larger "
|
||||
"threshold value when reloading driver.\n");
|
||||
dev_warn(adev->dev,
|
||||
"Please consider adjusting the customized threshold.\n");
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
|
@ -758,7 +759,8 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
|
|||
control->tbl_rai.health_percent = 0;
|
||||
}
|
||||
|
||||
if (amdgpu_bad_page_threshold != -1)
|
||||
if ((amdgpu_bad_page_threshold != -1) &&
|
||||
(amdgpu_bad_page_threshold != -2))
|
||||
ras->is_rma = true;
|
||||
|
||||
/* ignore the -ENOTSUPP return value */
|
||||
|
|
@ -1428,8 +1430,9 @@ int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control)
|
|||
|
||||
res = __verify_ras_table_checksum(control);
|
||||
if (res)
|
||||
DRM_ERROR("RAS table incorrect checksum or error:%d\n",
|
||||
res);
|
||||
dev_err(adev->dev,
|
||||
"RAS table incorrect checksum or error:%d\n",
|
||||
res);
|
||||
|
||||
/* Warn if we are at 90% of the threshold or above
|
||||
*/
|
||||
|
|
@ -1447,8 +1450,9 @@ int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control)
|
|||
|
||||
res = __verify_ras_table_checksum(control);
|
||||
if (res) {
|
||||
dev_err(adev->dev, "RAS Table incorrect checksum or error:%d\n",
|
||||
res);
|
||||
dev_err(adev->dev,
|
||||
"RAS Table incorrect checksum or error:%d\n",
|
||||
res);
|
||||
return -EINVAL;
|
||||
}
|
||||
if (ras->bad_page_cnt_threshold > control->ras_num_bad_pages) {
|
||||
|
|
@ -1466,17 +1470,18 @@ int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control)
|
|||
res = amdgpu_ras_eeprom_correct_header_tag(control,
|
||||
RAS_TABLE_HDR_VAL);
|
||||
} else {
|
||||
dev_err(adev->dev, "RAS records:%d exceed threshold:%d",
|
||||
dev_warn(adev->dev,
|
||||
"RAS records:%d exceed threshold:%d\n",
|
||||
control->ras_num_bad_pages, ras->bad_page_cnt_threshold);
|
||||
if (amdgpu_bad_page_threshold == -1) {
|
||||
dev_warn(adev->dev, "GPU will be initialized due to bad_page_threshold = -1.");
|
||||
if ((amdgpu_bad_page_threshold == -1) ||
|
||||
(amdgpu_bad_page_threshold == -2)) {
|
||||
res = 0;
|
||||
dev_warn(adev->dev,
|
||||
"Please consult AMD Service Action Guide (SAG) for appropriate service procedures\n");
|
||||
} else {
|
||||
ras->is_rma = true;
|
||||
dev_err(adev->dev,
|
||||
"RAS records:%d exceed threshold:%d, "
|
||||
"GPU will not be initialized. Replace this GPU or increase the threshold",
|
||||
control->ras_num_bad_pages, ras->bad_page_cnt_threshold);
|
||||
dev_warn(adev->dev,
|
||||
"User defined threshold is set, runtime service will be halt when threshold is reached\n");
|
||||
}
|
||||
}
|
||||
} else {
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user