mirror of
https://github.com/torvalds/linux.git
synced 2026-05-31 10:33:41 +02:00
drm/amdgpu: do RAS MCA2PA conversion in device init phase
With the introduction of NPS mode, the memory physical address (PA) that corresponds to an MCA address varies per NPS mode. We need to rely on the MCA address and convert it into a PA according to the NPS mode. Signed-off-by: Tao Zhou <tao.zhou1@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
parent
772df3df80
commit
0eecff79e4
|
|
@ -2717,40 +2717,110 @@ static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
|
|||
return 0;
|
||||
}
|
||||
|
||||
static int amdgpu_ras_mca2pa(struct amdgpu_device *adev,
|
||||
struct eeprom_table_record *bps,
|
||||
struct ras_err_data *err_data)
|
||||
{
|
||||
struct ta_ras_query_address_input addr_in;
|
||||
uint32_t socket = 0;
|
||||
int ret = 0;
|
||||
|
||||
if (adev->smuio.funcs && adev->smuio.funcs->get_socket_id)
|
||||
socket = adev->smuio.funcs->get_socket_id(adev);
|
||||
|
||||
/* reinit err_data */
|
||||
err_data->err_addr_cnt = 0;
|
||||
err_data->err_addr_len = adev->umc.retire_unit;
|
||||
|
||||
memset(&addr_in, 0, sizeof(addr_in));
|
||||
addr_in.ma.err_addr = bps->address;
|
||||
addr_in.ma.socket_id = socket;
|
||||
addr_in.ma.ch_inst = bps->mem_channel;
|
||||
/* tell RAS TA the node instance is not used */
|
||||
addr_in.ma.node_inst = TA_RAS_INV_NODE;
|
||||
|
||||
if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr)
|
||||
ret = adev->umc.ras->convert_ras_err_addr(adev, err_data,
|
||||
&addr_in, NULL, false);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* It deals with VRAM only. */
|
||||
int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
|
||||
struct eeprom_table_record *bps, int pages)
|
||||
{
|
||||
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
||||
struct ras_err_handler_data *data;
|
||||
struct ras_err_data err_data;
|
||||
struct eeprom_table_record *err_rec;
|
||||
int ret = 0;
|
||||
uint32_t i;
|
||||
uint32_t i, j, loop_cnt = 1;
|
||||
bool is_mca_add = true;
|
||||
|
||||
if (!con || !con->eh_data || !bps || pages <= 0)
|
||||
return 0;
|
||||
|
||||
if (!adev->umc.ras || !adev->umc.ras->convert_ras_err_addr) {
|
||||
is_mca_add = false;
|
||||
} else {
|
||||
if ((pages > 1) &&
|
||||
(bps[0].address == bps[1].address) &&
|
||||
(bps[0].mem_channel == bps[1].mem_channel))
|
||||
is_mca_add = false;
|
||||
}
|
||||
|
||||
mutex_lock(&con->recovery_lock);
|
||||
data = con->eh_data;
|
||||
if (!data)
|
||||
goto out;
|
||||
|
||||
for (i = 0; i < pages; i++) {
|
||||
if (amdgpu_ras_check_bad_page_unlock(con,
|
||||
bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT))
|
||||
continue;
|
||||
|
||||
if (!data->space_left &&
|
||||
amdgpu_ras_realloc_eh_data_space(adev, data, 256)) {
|
||||
if (is_mca_add) {
|
||||
err_data.err_addr =
|
||||
kcalloc(adev->umc.retire_unit,
|
||||
sizeof(struct eeprom_table_record), GFP_KERNEL);
|
||||
if (!err_data.err_addr) {
|
||||
dev_warn(adev->dev, "Failed to alloc UMC error address record in mca2pa conversion!\n");
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
amdgpu_ras_reserve_page(adev, bps[i].retired_page);
|
||||
|
||||
memcpy(&data->bps[data->count], &bps[i], sizeof(*data->bps));
|
||||
data->count++;
|
||||
data->space_left--;
|
||||
loop_cnt = adev->umc.retire_unit;
|
||||
}
|
||||
|
||||
for (i = 0; i < pages; i++) {
|
||||
if (is_mca_add) {
|
||||
if (amdgpu_ras_mca2pa(adev, &bps[i], &err_data))
|
||||
goto free;
|
||||
|
||||
err_rec = err_data.err_addr;
|
||||
} else {
|
||||
err_rec = &bps[i];
|
||||
}
|
||||
|
||||
for (j = 0; j < loop_cnt; j++) {
|
||||
if (amdgpu_ras_check_bad_page_unlock(con,
|
||||
err_rec[j].retired_page << AMDGPU_GPU_PAGE_SHIFT))
|
||||
continue;
|
||||
|
||||
if (!data->space_left &&
|
||||
amdgpu_ras_realloc_eh_data_space(adev, data, 256)) {
|
||||
ret = -ENOMEM;
|
||||
goto free;
|
||||
}
|
||||
|
||||
amdgpu_ras_reserve_page(adev, err_rec[j].retired_page);
|
||||
|
||||
memcpy(&data->bps[data->count], &(err_rec[j]),
|
||||
sizeof(struct eeprom_table_record));
|
||||
data->count++;
|
||||
data->space_left--;
|
||||
}
|
||||
}
|
||||
|
||||
free:
|
||||
if (is_mca_add)
|
||||
kfree(err_data.err_addr);
|
||||
out:
|
||||
mutex_unlock(&con->recovery_lock);
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user