drm/amdgpu: do RAS MCA2PA conversion in device init phase

With the introduction of NPS mode, the memory physical address (PA)
corresponding to an MCA address varies per NPS mode. We need to rely on
the MCA address and convert it into a PA according to the NPS mode.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
Tao Zhou 2024-10-18 14:49:00 +08:00 committed by Alex Deucher
parent 772df3df80
commit 0eecff79e4

View File

@ -2717,40 +2717,110 @@ static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
return 0;
}
/*
 * amdgpu_ras_mca2pa - convert the MCA address of one bad-page record
 * @adev: amdgpu device whose smuio/umc callbacks are used
 * @bps: record supplying the MCA error address and memory channel
 * @err_data: conversion output; its address count is reset to 0 and its
 *            length is set to adev->umc.retire_unit before the hook runs.
 *            NOTE(review): the err_addr buffer is presumably provided by
 *            the caller (the caller in this file allocates retire_unit
 *            entries) - confirm for any new caller.
 *
 * Builds a ta_ras_query_address_input from @bps and hands it to the
 * umc.ras->convert_ras_err_addr callback.
 *
 * Return: the callback's return value, or -EINVAL when the callback is not
 * provided, so that a caller never mistakes "nothing was converted" for a
 * successful conversion.
 */
static int amdgpu_ras_mca2pa(struct amdgpu_device *adev,
			     struct eeprom_table_record *bps,
			     struct ras_err_data *err_data)
{
	struct ta_ras_query_address_input addr_in;
	uint32_t socket = 0;

	/* socket id stays 0 when the smuio callback is not implemented */
	if (adev->smuio.funcs && adev->smuio.funcs->get_socket_id)
		socket = adev->smuio.funcs->get_socket_id(adev);

	/* reinit err_data */
	err_data->err_addr_cnt = 0;
	err_data->err_addr_len = adev->umc.retire_unit;

	memset(&addr_in, 0, sizeof(addr_in));
	addr_in.ma.err_addr = bps->address;
	addr_in.ma.socket_id = socket;
	addr_in.ma.ch_inst = bps->mem_channel;
	/* tell RAS TA the node instance is not used */
	addr_in.ma.node_inst = TA_RAS_INV_NODE;

	/* without the conversion hook no address can be produced: report it */
	if (!adev->umc.ras || !adev->umc.ras->convert_ras_err_addr)
		return -EINVAL;

	return adev->umc.ras->convert_ras_err_addr(adev, err_data,
						   &addr_in, NULL, false);
}
/* It deals with VRAM only. */
int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
struct eeprom_table_record *bps, int pages)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_err_handler_data *data;
struct ras_err_data err_data;
struct eeprom_table_record *err_rec;
int ret = 0;
uint32_t i;
uint32_t i, j, loop_cnt = 1;
bool is_mca_add = true;
if (!con || !con->eh_data || !bps || pages <= 0)
return 0;
if (!adev->umc.ras || !adev->umc.ras->convert_ras_err_addr) {
is_mca_add = false;
} else {
if ((pages > 1) &&
(bps[0].address == bps[1].address) &&
(bps[0].mem_channel == bps[1].mem_channel))
is_mca_add = false;
}
mutex_lock(&con->recovery_lock);
data = con->eh_data;
if (!data)
goto out;
for (i = 0; i < pages; i++) {
if (amdgpu_ras_check_bad_page_unlock(con,
bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT))
continue;
if (!data->space_left &&
amdgpu_ras_realloc_eh_data_space(adev, data, 256)) {
if (is_mca_add) {
err_data.err_addr =
kcalloc(adev->umc.retire_unit,
sizeof(struct eeprom_table_record), GFP_KERNEL);
if (!err_data.err_addr) {
dev_warn(adev->dev, "Failed to alloc UMC error address record in mca2pa conversion!\n");
ret = -ENOMEM;
goto out;
}
amdgpu_ras_reserve_page(adev, bps[i].retired_page);
memcpy(&data->bps[data->count], &bps[i], sizeof(*data->bps));
data->count++;
data->space_left--;
loop_cnt = adev->umc.retire_unit;
}
for (i = 0; i < pages; i++) {
if (is_mca_add) {
if (amdgpu_ras_mca2pa(adev, &bps[i], &err_data))
goto free;
err_rec = err_data.err_addr;
} else {
err_rec = &bps[i];
}
for (j = 0; j < loop_cnt; j++) {
if (amdgpu_ras_check_bad_page_unlock(con,
err_rec[j].retired_page << AMDGPU_GPU_PAGE_SHIFT))
continue;
if (!data->space_left &&
amdgpu_ras_realloc_eh_data_space(adev, data, 256)) {
ret = -ENOMEM;
goto free;
}
amdgpu_ras_reserve_page(adev, err_rec[j].retired_page);
memcpy(&data->bps[data->count], &(err_rec[j]),
sizeof(struct eeprom_table_record));
data->count++;
data->space_left--;
}
}
free:
if (is_mca_add)
kfree(err_data.err_addr);
out:
mutex_unlock(&con->recovery_lock);