mirror of
https://github.com/torvalds/linux.git
synced 2026-05-27 16:44:58 +02:00
drm/amd/ras: Add ras ioctl command handler
Add ras ioctl command handler. V2: Remove ras global device list. Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com> Reviewed-by: Tao Zhou <tao.zhou1@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
parent
c49ef01183
commit
19030244e1
527
drivers/gpu/drm/amd/ras/rascore/ras_cmd.c
Normal file
527
drivers/gpu/drm/amd/ras/rascore/ras_cmd.c
Normal file
|
|
@ -0,0 +1,527 @@
|
|||
// SPDX-License-Identifier: MIT
|
||||
/*
|
||||
* Copyright 2025 Advanced Micro Devices, Inc.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
#include "ras.h"
|
||||
#include "ras_cmd.h"
|
||||
|
||||
#define RAS_CMD_MAJOR_VERSION 6
|
||||
#define RAS_CMD_MINOR_VERSION 0
|
||||
#define RAS_CMD_VERSION (((RAS_CMD_MAJOR_VERSION) << 10) | (RAS_CMD_MINOR_VERSION))
|
||||
|
||||
static int ras_cmd_add_device(struct ras_core_context *ras_core)
|
||||
{
|
||||
INIT_LIST_HEAD(&ras_core->ras_cmd.head);
|
||||
ras_core->ras_cmd.ras_core = ras_core;
|
||||
ras_core->ras_cmd.dev_handle = (uint64_t)ras_core ^ RAS_CMD_DEV_HANDLE_MAGIC;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int ras_cmd_remove_device(struct ras_core_context *ras_core)
|
||||
{
|
||||
memset(&ras_core->ras_cmd, 0, sizeof(ras_core->ras_cmd));
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int ras_get_block_ecc_info(struct ras_core_context *ras_core,
|
||||
struct ras_cmd_ioctl *cmd, void *data)
|
||||
{
|
||||
struct ras_cmd_block_ecc_info_req *input_data =
|
||||
(struct ras_cmd_block_ecc_info_req *)cmd->input_buff_raw;
|
||||
struct ras_cmd_block_ecc_info_rsp *output_data =
|
||||
(struct ras_cmd_block_ecc_info_rsp *)cmd->output_buff_raw;
|
||||
struct ras_ecc_count err_data;
|
||||
int ret;
|
||||
|
||||
if (cmd->input_size != sizeof(struct ras_cmd_block_ecc_info_req))
|
||||
return RAS_CMD__ERROR_INVALID_INPUT_SIZE;
|
||||
|
||||
memset(&err_data, 0, sizeof(err_data));
|
||||
ret = ras_aca_get_block_ecc_count(ras_core, input_data->block_id, &err_data);
|
||||
if (ret)
|
||||
return RAS_CMD__ERROR_GENERIC;
|
||||
|
||||
output_data->ce_count = err_data.total_ce_count;
|
||||
output_data->ue_count = err_data.total_ue_count;
|
||||
output_data->de_count = err_data.total_de_count;
|
||||
|
||||
cmd->output_size = sizeof(struct ras_cmd_block_ecc_info_rsp);
|
||||
return RAS_CMD__SUCCESS;
|
||||
}
|
||||
|
||||
static void ras_cmd_update_bad_page_info(struct ras_cmd_bad_page_record *ras_cmd_record,
|
||||
struct eeprom_umc_record *record)
|
||||
{
|
||||
ras_cmd_record->retired_page = record->cur_nps_retired_row_pfn;
|
||||
ras_cmd_record->ts = record->ts;
|
||||
ras_cmd_record->err_type = record->err_type;
|
||||
ras_cmd_record->mem_channel = record->mem_channel;
|
||||
ras_cmd_record->mcumc_id = record->mcumc_id;
|
||||
ras_cmd_record->address = record->address;
|
||||
ras_cmd_record->bank = record->bank;
|
||||
ras_cmd_record->valid = 1;
|
||||
}
|
||||
|
||||
static int ras_cmd_get_group_bad_pages(struct ras_core_context *ras_core,
|
||||
uint32_t group_index, struct ras_cmd_bad_pages_info_rsp *output_data)
|
||||
{
|
||||
struct eeprom_umc_record record;
|
||||
struct ras_cmd_bad_page_record *ras_cmd_record;
|
||||
uint32_t i = 0, bp_cnt = 0, group_cnt = 0;
|
||||
|
||||
output_data->bp_in_group = 0;
|
||||
output_data->group_index = 0;
|
||||
|
||||
bp_cnt = ras_umc_get_badpage_count(ras_core);
|
||||
if (bp_cnt) {
|
||||
output_data->group_index = group_index;
|
||||
group_cnt = bp_cnt / RAS_CMD_MAX_BAD_PAGES_PER_GROUP
|
||||
+ ((bp_cnt % RAS_CMD_MAX_BAD_PAGES_PER_GROUP) ? 1 : 0);
|
||||
|
||||
if (group_index >= group_cnt)
|
||||
return RAS_CMD__ERROR_INVALID_INPUT_DATA;
|
||||
|
||||
i = group_index * RAS_CMD_MAX_BAD_PAGES_PER_GROUP;
|
||||
for (;
|
||||
i < bp_cnt && output_data->bp_in_group < RAS_CMD_MAX_BAD_PAGES_PER_GROUP;
|
||||
i++) {
|
||||
if (ras_umc_get_badpage_record(ras_core, i, &record))
|
||||
return RAS_CMD__ERROR_GENERIC;
|
||||
|
||||
ras_cmd_record = &output_data->records[i % RAS_CMD_MAX_BAD_PAGES_PER_GROUP];
|
||||
|
||||
memset(ras_cmd_record, 0, sizeof(*ras_cmd_record));
|
||||
ras_cmd_update_bad_page_info(ras_cmd_record, &record);
|
||||
output_data->bp_in_group++;
|
||||
}
|
||||
}
|
||||
output_data->bp_total_cnt = bp_cnt;
|
||||
return RAS_CMD__SUCCESS;
|
||||
}
|
||||
|
||||
static int ras_cmd_get_bad_pages(struct ras_core_context *ras_core,
|
||||
struct ras_cmd_ioctl *cmd, void *data)
|
||||
{
|
||||
struct ras_cmd_bad_pages_info_req *input_data =
|
||||
(struct ras_cmd_bad_pages_info_req *)cmd->input_buff_raw;
|
||||
struct ras_cmd_bad_pages_info_rsp *output_data =
|
||||
(struct ras_cmd_bad_pages_info_rsp *)cmd->output_buff_raw;
|
||||
int ret;
|
||||
|
||||
if (cmd->input_size != sizeof(struct ras_cmd_bad_pages_info_req))
|
||||
return RAS_CMD__ERROR_INVALID_INPUT_SIZE;
|
||||
|
||||
ret = ras_cmd_get_group_bad_pages(ras_core, input_data->group_index, output_data);
|
||||
if (ret)
|
||||
return RAS_CMD__ERROR_GENERIC;
|
||||
|
||||
output_data->version = 0;
|
||||
|
||||
cmd->output_size = sizeof(struct ras_cmd_bad_pages_info_rsp);
|
||||
return RAS_CMD__SUCCESS;
|
||||
}
|
||||
|
||||
static int ras_cmd_clear_bad_page_info(struct ras_core_context *ras_core,
|
||||
struct ras_cmd_ioctl *cmd, void *data)
|
||||
{
|
||||
if (cmd->input_size != sizeof(struct ras_cmd_dev_handle))
|
||||
return RAS_CMD__ERROR_INVALID_INPUT_SIZE;
|
||||
|
||||
if (ras_eeprom_reset_table(ras_core))
|
||||
return RAS_CMD__ERROR_GENERIC;
|
||||
|
||||
if (ras_umc_clean_badpage_data(ras_core))
|
||||
return RAS_CMD__ERROR_GENERIC;
|
||||
|
||||
return RAS_CMD__SUCCESS;
|
||||
}
|
||||
|
||||
static int ras_cmd_reset_all_error_counts(struct ras_core_context *ras_core,
|
||||
struct ras_cmd_ioctl *cmd, void *data)
|
||||
{
|
||||
if (cmd->input_size != sizeof(struct ras_cmd_dev_handle))
|
||||
return RAS_CMD__ERROR_INVALID_INPUT_SIZE;
|
||||
|
||||
if (ras_aca_clear_all_blocks_ecc_count(ras_core))
|
||||
return RAS_CMD__ERROR_GENERIC;
|
||||
|
||||
if (ras_umc_clear_logged_ecc(ras_core))
|
||||
return RAS_CMD__ERROR_GENERIC;
|
||||
|
||||
return RAS_CMD__SUCCESS;
|
||||
}
|
||||
|
||||
static int ras_cmd_get_cper_snapshot(struct ras_core_context *ras_core,
|
||||
struct ras_cmd_ioctl *cmd, void *data)
|
||||
{
|
||||
struct ras_cmd_cper_snapshot_rsp *output_data =
|
||||
(struct ras_cmd_cper_snapshot_rsp *)cmd->output_buff_raw;
|
||||
struct ras_log_batch_overview overview;
|
||||
|
||||
if (cmd->input_size != sizeof(struct ras_cmd_cper_snapshot_req))
|
||||
return RAS_CMD__ERROR_INVALID_INPUT_SIZE;
|
||||
|
||||
ras_log_ring_get_batch_overview(ras_core, &overview);
|
||||
|
||||
output_data->total_cper_num = overview.logged_batch_count;
|
||||
output_data->start_cper_id = overview.first_batch_id;
|
||||
output_data->latest_cper_id = overview.last_batch_id;
|
||||
|
||||
output_data->version = 0;
|
||||
|
||||
cmd->output_size = sizeof(struct ras_cmd_cper_snapshot_rsp);
|
||||
return RAS_CMD__SUCCESS;
|
||||
}
|
||||
|
||||
static int ras_cmd_get_cper_records(struct ras_core_context *ras_core,
|
||||
struct ras_cmd_ioctl *cmd, void *data)
|
||||
{
|
||||
struct ras_cmd_cper_record_req *req =
|
||||
(struct ras_cmd_cper_record_req *)cmd->input_buff_raw;
|
||||
struct ras_cmd_cper_record_rsp *rsp =
|
||||
(struct ras_cmd_cper_record_rsp *)cmd->output_buff_raw;
|
||||
struct ras_log_info *trace[MAX_RECORD_PER_BATCH] = {0};
|
||||
struct ras_log_batch_overview overview;
|
||||
uint32_t offset = 0, real_data_len = 0;
|
||||
uint64_t batch_id;
|
||||
uint8_t *buffer;
|
||||
int ret = 0, i, count;
|
||||
|
||||
if (cmd->input_size != sizeof(struct ras_cmd_cper_record_req))
|
||||
return RAS_CMD__ERROR_INVALID_INPUT_SIZE;
|
||||
|
||||
if (!req->buf_size || !req->buf_ptr || !req->cper_num)
|
||||
return RAS_CMD__ERROR_INVALID_INPUT_DATA;
|
||||
|
||||
if (!access_ok((void *)req->buf_ptr, req->buf_size)) {
|
||||
RAS_DEV_ERR(ras_core->dev, "Invalid cper buffer memory!\n");
|
||||
return RAS_CMD__ERROR_INVALID_INPUT_DATA;
|
||||
}
|
||||
|
||||
buffer = kzalloc(req->buf_size, GFP_KERNEL);
|
||||
if (!buffer)
|
||||
return RAS_CMD__ERROR_GENERIC;
|
||||
|
||||
ras_log_ring_get_batch_overview(ras_core, &overview);
|
||||
for (i = 0; i < req->cper_num; i++) {
|
||||
batch_id = req->cper_start_id + i;
|
||||
if (batch_id >= overview.last_batch_id)
|
||||
break;
|
||||
|
||||
count = ras_log_ring_get_batch_records(ras_core, batch_id, trace,
|
||||
ARRAY_SIZE(trace));
|
||||
if (count > 0) {
|
||||
ret = ras_cper_generate_cper(ras_core, trace, count,
|
||||
&buffer[offset], req->buf_size - offset, &real_data_len);
|
||||
if (ret)
|
||||
break;
|
||||
|
||||
offset += real_data_len;
|
||||
}
|
||||
}
|
||||
|
||||
if ((ret && (ret != -ENOMEM)) ||
|
||||
copy_to_user((void *)req->buf_ptr, buffer, offset)) {
|
||||
kfree(buffer);
|
||||
return RAS_CMD__ERROR_GENERIC;
|
||||
}
|
||||
|
||||
rsp->real_data_size = offset;
|
||||
rsp->real_cper_num = i;
|
||||
rsp->remain_num = (ret == -ENOMEM) ? (req->cper_num - i) : 0;
|
||||
rsp->version = 0;
|
||||
|
||||
cmd->output_size = sizeof(struct ras_cmd_cper_record_rsp);
|
||||
|
||||
kfree(buffer);
|
||||
|
||||
return RAS_CMD__SUCCESS;
|
||||
}
|
||||
|
||||
static int ras_cmd_get_batch_trace_snapshot(struct ras_core_context *ras_core,
|
||||
struct ras_cmd_ioctl *cmd, void *data)
|
||||
{
|
||||
struct ras_cmd_batch_trace_snapshot_rsp *rsp =
|
||||
(struct ras_cmd_batch_trace_snapshot_rsp *)cmd->output_buff_raw;
|
||||
struct ras_log_batch_overview overview;
|
||||
|
||||
|
||||
if (cmd->input_size != sizeof(struct ras_cmd_batch_trace_snapshot_req))
|
||||
return RAS_CMD__ERROR_INVALID_INPUT_SIZE;
|
||||
|
||||
ras_log_ring_get_batch_overview(ras_core, &overview);
|
||||
|
||||
rsp->total_batch_num = overview.logged_batch_count;
|
||||
rsp->start_batch_id = overview.first_batch_id;
|
||||
rsp->latest_batch_id = overview.last_batch_id;
|
||||
rsp->version = 0;
|
||||
|
||||
cmd->output_size = sizeof(struct ras_cmd_batch_trace_snapshot_rsp);
|
||||
return RAS_CMD__SUCCESS;
|
||||
}
|
||||
|
||||
static int ras_cmd_get_batch_trace_records(struct ras_core_context *ras_core,
|
||||
struct ras_cmd_ioctl *cmd, void *data)
|
||||
{
|
||||
struct ras_cmd_batch_trace_record_req *input_data =
|
||||
(struct ras_cmd_batch_trace_record_req *)cmd->input_buff_raw;
|
||||
struct ras_cmd_batch_trace_record_rsp *output_data =
|
||||
(struct ras_cmd_batch_trace_record_rsp *)cmd->output_buff_raw;
|
||||
struct ras_log_batch_overview overview;
|
||||
struct ras_log_info *trace_arry[MAX_RECORD_PER_BATCH] = {0};
|
||||
struct ras_log_info *record;
|
||||
int i, j, count = 0, offset = 0;
|
||||
uint64_t id;
|
||||
bool completed = false;
|
||||
|
||||
if (cmd->input_size != sizeof(struct ras_cmd_batch_trace_record_req))
|
||||
return RAS_CMD__ERROR_INVALID_INPUT_SIZE;
|
||||
|
||||
if ((!input_data->batch_num) || (input_data->batch_num > RAS_CMD_MAX_BATCH_NUM))
|
||||
return RAS_CMD__ERROR_INVALID_INPUT_DATA;
|
||||
|
||||
ras_log_ring_get_batch_overview(ras_core, &overview);
|
||||
if ((input_data->start_batch_id < overview.first_batch_id) ||
|
||||
(input_data->start_batch_id >= overview.last_batch_id))
|
||||
return RAS_CMD__ERROR_INVALID_INPUT_SIZE;
|
||||
|
||||
for (i = 0; i < input_data->batch_num; i++) {
|
||||
id = input_data->start_batch_id + i;
|
||||
if (id >= overview.last_batch_id) {
|
||||
completed = true;
|
||||
break;
|
||||
}
|
||||
|
||||
count = ras_log_ring_get_batch_records(ras_core,
|
||||
id, trace_arry, ARRAY_SIZE(trace_arry));
|
||||
if (count > 0) {
|
||||
if ((offset + count) > RAS_CMD_MAX_TRACE_NUM)
|
||||
break;
|
||||
for (j = 0; j < count; j++) {
|
||||
record = &output_data->records[offset + j];
|
||||
record->seqno = trace_arry[j]->seqno;
|
||||
record->timestamp = trace_arry[j]->timestamp;
|
||||
record->event = trace_arry[j]->event;
|
||||
memcpy(&record->aca_reg,
|
||||
&trace_arry[j]->aca_reg, sizeof(trace_arry[j]->aca_reg));
|
||||
}
|
||||
} else {
|
||||
count = 0;
|
||||
}
|
||||
|
||||
output_data->batchs[i].batch_id = id;
|
||||
output_data->batchs[i].offset = offset;
|
||||
output_data->batchs[i].trace_num = count;
|
||||
offset += count;
|
||||
}
|
||||
|
||||
output_data->start_batch_id = input_data->start_batch_id;
|
||||
output_data->real_batch_num = i;
|
||||
output_data->remain_num = completed ? 0 : (input_data->batch_num - i);
|
||||
output_data->version = 0;
|
||||
|
||||
cmd->output_size = sizeof(struct ras_cmd_batch_trace_record_rsp);
|
||||
|
||||
return RAS_CMD__SUCCESS;
|
||||
}
|
||||
|
||||
static enum ras_ta_block __get_ras_ta_block(enum ras_block_id block)
|
||||
{
|
||||
switch (block) {
|
||||
case RAS_BLOCK_ID__UMC:
|
||||
return RAS_TA_BLOCK__UMC;
|
||||
case RAS_BLOCK_ID__SDMA:
|
||||
return RAS_TA_BLOCK__SDMA;
|
||||
case RAS_BLOCK_ID__GFX:
|
||||
return RAS_TA_BLOCK__GFX;
|
||||
case RAS_BLOCK_ID__MMHUB:
|
||||
return RAS_TA_BLOCK__MMHUB;
|
||||
case RAS_BLOCK_ID__ATHUB:
|
||||
return RAS_TA_BLOCK__ATHUB;
|
||||
case RAS_BLOCK_ID__PCIE_BIF:
|
||||
return RAS_TA_BLOCK__PCIE_BIF;
|
||||
case RAS_BLOCK_ID__HDP:
|
||||
return RAS_TA_BLOCK__HDP;
|
||||
case RAS_BLOCK_ID__XGMI_WAFL:
|
||||
return RAS_TA_BLOCK__XGMI_WAFL;
|
||||
case RAS_BLOCK_ID__DF:
|
||||
return RAS_TA_BLOCK__DF;
|
||||
case RAS_BLOCK_ID__SMN:
|
||||
return RAS_TA_BLOCK__SMN;
|
||||
case RAS_BLOCK_ID__SEM:
|
||||
return RAS_TA_BLOCK__SEM;
|
||||
case RAS_BLOCK_ID__MP0:
|
||||
return RAS_TA_BLOCK__MP0;
|
||||
case RAS_BLOCK_ID__MP1:
|
||||
return RAS_TA_BLOCK__MP1;
|
||||
case RAS_BLOCK_ID__FUSE:
|
||||
return RAS_TA_BLOCK__FUSE;
|
||||
case RAS_BLOCK_ID__MCA:
|
||||
return RAS_TA_BLOCK__MCA;
|
||||
case RAS_BLOCK_ID__VCN:
|
||||
return RAS_TA_BLOCK__VCN;
|
||||
case RAS_BLOCK_ID__JPEG:
|
||||
return RAS_TA_BLOCK__JPEG;
|
||||
default:
|
||||
return RAS_TA_BLOCK__UMC;
|
||||
}
|
||||
}
|
||||
|
||||
static enum ras_ta_error_type __get_ras_ta_err_type(enum ras_ecc_err_type error)
|
||||
{
|
||||
switch (error) {
|
||||
case RAS_ECC_ERR__NONE:
|
||||
return RAS_TA_ERROR__NONE;
|
||||
case RAS_ECC_ERR__PARITY:
|
||||
return RAS_TA_ERROR__PARITY;
|
||||
case RAS_ECC_ERR__SINGLE_CORRECTABLE:
|
||||
return RAS_TA_ERROR__SINGLE_CORRECTABLE;
|
||||
case RAS_ECC_ERR__MULTI_UNCORRECTABLE:
|
||||
return RAS_TA_ERROR__MULTI_UNCORRECTABLE;
|
||||
case RAS_ECC_ERR__POISON:
|
||||
return RAS_TA_ERROR__POISON;
|
||||
default:
|
||||
return RAS_TA_ERROR__NONE;
|
||||
}
|
||||
}
|
||||
|
||||
static int ras_cmd_inject_error(struct ras_core_context *ras_core,
|
||||
struct ras_cmd_ioctl *cmd, void *data)
|
||||
{
|
||||
struct ras_cmd_inject_error_req *req =
|
||||
(struct ras_cmd_inject_error_req *)cmd->input_buff_raw;
|
||||
struct ras_cmd_inject_error_rsp *output_data =
|
||||
(struct ras_cmd_inject_error_rsp *)cmd->output_buff_raw;
|
||||
int ret = 0;
|
||||
struct ras_ta_trigger_error_input block_info = {
|
||||
.block_id = __get_ras_ta_block(req->block_id),
|
||||
.sub_block_index = req->subblock_id,
|
||||
.inject_error_type = __get_ras_ta_err_type(req->error_type),
|
||||
.address = req->address,
|
||||
.value = req->method,
|
||||
};
|
||||
|
||||
ret = ras_psp_trigger_error(ras_core, &block_info, req->instance_mask);
|
||||
if (!ret) {
|
||||
output_data->version = 0;
|
||||
output_data->address = block_info.address;
|
||||
cmd->output_size = sizeof(struct ras_cmd_inject_error_rsp);
|
||||
} else {
|
||||
RAS_DEV_ERR(ras_core->dev, "ras inject block %u failed %d\n", req->block_id, ret);
|
||||
ret = RAS_CMD__ERROR_ACCESS_DENIED;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static struct ras_cmd_func_map ras_cmd_maps[] = {
|
||||
{RAS_CMD__INJECT_ERROR, ras_cmd_inject_error},
|
||||
{RAS_CMD__GET_BLOCK_ECC_STATUS, ras_get_block_ecc_info},
|
||||
{RAS_CMD__GET_BAD_PAGES, ras_cmd_get_bad_pages},
|
||||
{RAS_CMD__CLEAR_BAD_PAGE_INFO, ras_cmd_clear_bad_page_info},
|
||||
{RAS_CMD__RESET_ALL_ERROR_COUNTS, ras_cmd_reset_all_error_counts},
|
||||
{RAS_CMD__GET_CPER_SNAPSHOT, ras_cmd_get_cper_snapshot},
|
||||
{RAS_CMD__GET_CPER_RECORD, ras_cmd_get_cper_records},
|
||||
{RAS_CMD__GET_BATCH_TRACE_SNAPSHOT, ras_cmd_get_batch_trace_snapshot},
|
||||
{RAS_CMD__GET_BATCH_TRACE_RECORD, ras_cmd_get_batch_trace_records},
|
||||
};
|
||||
|
||||
int rascore_handle_cmd(struct ras_core_context *ras_core,
|
||||
struct ras_cmd_ioctl *cmd, void *data)
|
||||
{
|
||||
struct ras_cmd_func_map *ras_cmd = NULL;
|
||||
int i;
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(ras_cmd_maps); i++) {
|
||||
if (cmd->cmd_id == ras_cmd_maps[i].cmd_id) {
|
||||
ras_cmd = &ras_cmd_maps[i];
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!ras_cmd)
|
||||
return RAS_CMD__ERROR_UKNOWN_CMD;
|
||||
|
||||
return ras_cmd->func(ras_core, cmd, data);
|
||||
}
|
||||
|
||||
int ras_cmd_init(struct ras_core_context *ras_core)
|
||||
{
|
||||
return ras_cmd_add_device(ras_core);
|
||||
}
|
||||
|
||||
int ras_cmd_fini(struct ras_core_context *ras_core)
|
||||
{
|
||||
ras_cmd_remove_device(ras_core);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int ras_cmd_query_interface_info(struct ras_core_context *ras_core,
|
||||
struct ras_query_interface_info_rsp *rsp)
|
||||
{
|
||||
rsp->ras_cmd_major_ver = RAS_CMD_MAJOR_VERSION;
|
||||
rsp->ras_cmd_minor_ver = RAS_CMD_MINOR_VERSION;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int ras_cmd_translate_soc_pa_to_bank(struct ras_core_context *ras_core,
|
||||
uint64_t soc_pa, struct ras_fb_bank_addr *bank_addr)
|
||||
{
|
||||
struct umc_bank_addr umc_bank = {0};
|
||||
int ret;
|
||||
|
||||
ret = ras_umc_translate_soc_pa_and_bank(ras_core, &soc_pa, &umc_bank, false);
|
||||
if (ret)
|
||||
return RAS_CMD__ERROR_GENERIC;
|
||||
|
||||
bank_addr->stack_id = umc_bank.stack_id;
|
||||
bank_addr->bank_group = umc_bank.bank_group;
|
||||
bank_addr->bank = umc_bank.bank;
|
||||
bank_addr->row = umc_bank.row;
|
||||
bank_addr->column = umc_bank.column;
|
||||
bank_addr->channel = umc_bank.channel;
|
||||
bank_addr->subchannel = umc_bank.subchannel;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int ras_cmd_translate_bank_to_soc_pa(struct ras_core_context *ras_core,
|
||||
struct ras_fb_bank_addr bank_addr, uint64_t *soc_pa)
|
||||
{
|
||||
struct umc_bank_addr umc_bank = {0};
|
||||
|
||||
umc_bank.stack_id = bank_addr.stack_id;
|
||||
umc_bank.bank_group = bank_addr.bank_group;
|
||||
umc_bank.bank = bank_addr.bank;
|
||||
umc_bank.row = bank_addr.row;
|
||||
umc_bank.column = bank_addr.column;
|
||||
umc_bank.channel = bank_addr.channel;
|
||||
umc_bank.subchannel = bank_addr.subchannel;
|
||||
|
||||
return ras_umc_translate_soc_pa_and_bank(ras_core, soc_pa, &umc_bank, true);
|
||||
}
|
||||
|
||||
uint64_t ras_cmd_get_dev_handle(struct ras_core_context *ras_core)
|
||||
{
|
||||
return ras_core->ras_cmd.dev_handle;
|
||||
}
|
||||
425
drivers/gpu/drm/amd/ras/rascore/ras_cmd.h
Normal file
425
drivers/gpu/drm/amd/ras/rascore/ras_cmd.h
Normal file
|
|
@ -0,0 +1,425 @@
|
|||
/* SPDX-License-Identifier: MIT */
|
||||
/*
|
||||
* Copyright 2025 Advanced Micro Devices, Inc.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef __RAS_CMD_H__
|
||||
#define __RAS_CMD_H__
|
||||
#include "ras.h"
|
||||
#include "ras_eeprom.h"
|
||||
#include "ras_log_ring.h"
|
||||
#include "ras_cper.h"
|
||||
|
||||
#define RAS_CMD_DEV_HANDLE_MAGIC 0xFEEDAD00UL
|
||||
|
||||
#define RAS_CMD_MAX_IN_SIZE 256
|
||||
#define RAS_CMD_MAX_GPU_NUM 32
|
||||
#define RAS_CMD_MAX_BAD_PAGES_PER_GROUP 32
|
||||
|
||||
/* position of instance value in sub_block_index of
|
||||
* ta_ras_trigger_error_input, the sub block uses lower 12 bits
|
||||
*/
|
||||
#define RAS_TA_INST_MASK 0xfffff000
|
||||
#define RAS_TA_INST_SHIFT 0xc
|
||||
|
||||
enum ras_cmd_interface_type {
|
||||
RAS_CMD_INTERFACE_TYPE_NONE,
|
||||
RAS_CMD_INTERFACE_TYPE_AMDGPU,
|
||||
RAS_CMD_INTERFACE_TYPE_VF,
|
||||
RAS_CMD_INTERFACE_TYPE_PF,
|
||||
};
|
||||
|
||||
enum ras_cmd_id_range {
|
||||
RAS_CMD_ID_COMMON_START = 0,
|
||||
RAS_CMD_ID_COMMON_END = 0x10000,
|
||||
RAS_CMD_ID_AMDGPU_START = RAS_CMD_ID_COMMON_END,
|
||||
RAS_CMD_ID_AMDGPU_END = 0x20000,
|
||||
RAS_CMD_ID_MXGPU_START = RAS_CMD_ID_AMDGPU_END,
|
||||
RAS_CMD_ID_MXGPU_END = 0x30000,
|
||||
RAS_CMD_ID_MXGPU_VF_START = RAS_CMD_ID_MXGPU_END,
|
||||
RAS_CMD_ID_MXGPU_VF_END = 0x40000,
|
||||
};
|
||||
|
||||
enum ras_cmd_id {
|
||||
RAS_CMD__BEGIN = RAS_CMD_ID_COMMON_START,
|
||||
RAS_CMD__QUERY_INTERFACE_INFO,
|
||||
RAS_CMD__GET_DEVICES_INFO,
|
||||
RAS_CMD__GET_BLOCK_ECC_STATUS,
|
||||
RAS_CMD__INJECT_ERROR,
|
||||
RAS_CMD__GET_BAD_PAGES,
|
||||
RAS_CMD__CLEAR_BAD_PAGE_INFO,
|
||||
RAS_CMD__RESET_ALL_ERROR_COUNTS,
|
||||
RAS_CMD__GET_SAFE_FB_ADDRESS_RANGES,
|
||||
RAS_CMD__TRANSLATE_FB_ADDRESS,
|
||||
RAS_CMD__GET_LINK_TOPOLOGY,
|
||||
RAS_CMD__GET_CPER_SNAPSHOT,
|
||||
RAS_CMD__GET_CPER_RECORD,
|
||||
RAS_CMD__GET_BATCH_TRACE_SNAPSHOT,
|
||||
RAS_CMD__GET_BATCH_TRACE_RECORD,
|
||||
RAS_CMD__SUPPORTED_MAX = RAS_CMD_ID_COMMON_END,
|
||||
};
|
||||
|
||||
enum ras_cmd_response {
|
||||
RAS_CMD__SUCCESS = 0,
|
||||
RAS_CMD__SUCCESS_EXEED_BUFFER,
|
||||
RAS_CMD__ERROR_UKNOWN_CMD,
|
||||
RAS_CMD__ERROR_INVALID_CMD,
|
||||
RAS_CMD__ERROR_VERSION,
|
||||
RAS_CMD__ERROR_INVALID_INPUT_SIZE,
|
||||
RAS_CMD__ERROR_INVALID_INPUT_DATA,
|
||||
RAS_CMD__ERROR_DRV_INIT_FAIL,
|
||||
RAS_CMD__ERROR_ACCESS_DENIED,
|
||||
RAS_CMD__ERROR_GENERIC,
|
||||
RAS_CMD__ERROR_TIMEOUT,
|
||||
};
|
||||
|
||||
enum ras_error_type {
|
||||
RAS_TYPE_ERROR__NONE = 0,
|
||||
RAS_TYPE_ERROR__PARITY = 1,
|
||||
RAS_TYPE_ERROR__SINGLE_CORRECTABLE = 2,
|
||||
RAS_TYPE_ERROR__MULTI_UNCORRECTABLE = 4,
|
||||
RAS_TYPE_ERROR__POISON = 8,
|
||||
};
|
||||
|
||||
struct ras_core_context;
|
||||
struct ras_cmd_ioctl;
|
||||
|
||||
struct ras_cmd_mgr {
|
||||
struct list_head head;
|
||||
struct ras_core_context *ras_core;
|
||||
uint64_t dev_handle;
|
||||
};
|
||||
|
||||
struct ras_cmd_func_map {
|
||||
uint32_t cmd_id;
|
||||
int (*func)(struct ras_core_context *ras_core,
|
||||
struct ras_cmd_ioctl *cmd, void *data);
|
||||
};
|
||||
|
||||
struct ras_device_bdf {
|
||||
union {
|
||||
struct {
|
||||
uint32_t function : 3;
|
||||
uint32_t device : 5;
|
||||
uint32_t bus : 8;
|
||||
uint32_t domain : 16;
|
||||
};
|
||||
uint32_t u32_all;
|
||||
};
|
||||
};
|
||||
|
||||
struct ras_cmd_param {
|
||||
uint32_t idx_vf;
|
||||
void *data;
|
||||
};
|
||||
|
||||
#pragma pack(push, 8)
|
||||
struct ras_cmd_ioctl {
|
||||
uint32_t magic;
|
||||
union {
|
||||
struct {
|
||||
uint16_t ras_cmd_minor_ver : 10;
|
||||
uint16_t ras_cmd_major_ver : 6;
|
||||
};
|
||||
uint16_t ras_cmd_ver;
|
||||
};
|
||||
union {
|
||||
struct {
|
||||
uint16_t plat_major_ver : 10;
|
||||
uint16_t plat_minor_ver : 6;
|
||||
};
|
||||
uint16_t plat_ver;
|
||||
};
|
||||
uint32_t cmd_id;
|
||||
uint32_t cmd_res;
|
||||
uint32_t input_size;
|
||||
uint32_t output_size;
|
||||
uint32_t reserved[6];
|
||||
uint8_t input_buff_raw[RAS_CMD_MAX_IN_SIZE];
|
||||
uint8_t output_buff_raw[];
|
||||
};
|
||||
|
||||
struct ras_cmd_dev_handle {
|
||||
uint64_t dev_handle;
|
||||
};
|
||||
|
||||
struct ras_cmd_block_ecc_info_req {
|
||||
struct ras_cmd_dev_handle dev;
|
||||
uint32_t block_id;
|
||||
uint32_t subblock_id;
|
||||
uint32_t reserved[4];
|
||||
};
|
||||
|
||||
struct ras_cmd_block_ecc_info_rsp {
|
||||
uint32_t version;
|
||||
uint32_t ce_count;
|
||||
uint32_t ue_count;
|
||||
uint32_t de_count;
|
||||
uint32_t reserved[6];
|
||||
};
|
||||
|
||||
struct ras_cmd_inject_error_req {
|
||||
struct ras_cmd_dev_handle dev;
|
||||
uint32_t block_id;
|
||||
uint32_t subblock_id;
|
||||
uint64_t address;
|
||||
uint32_t error_type;
|
||||
uint32_t instance_mask;
|
||||
union {
|
||||
struct {
|
||||
/* vf index */
|
||||
uint64_t vf_idx : 6;
|
||||
/* method of error injection. i.e persistent, coherent etc */
|
||||
uint64_t method : 10;
|
||||
uint64_t rsv : 48;
|
||||
};
|
||||
uint64_t value;
|
||||
};
|
||||
uint32_t reserved[8];
|
||||
};
|
||||
|
||||
struct ras_cmd_inject_error_rsp {
|
||||
uint32_t version;
|
||||
uint32_t reserved[5];
|
||||
uint64_t address;
|
||||
};
|
||||
|
||||
struct ras_cmd_dev_info {
|
||||
uint64_t dev_handle;
|
||||
uint32_t location_id;
|
||||
uint32_t ecc_enabled;
|
||||
uint32_t ecc_supported;
|
||||
uint32_t vf_num;
|
||||
uint32_t asic_type;
|
||||
uint32_t oam_id;
|
||||
uint32_t reserved[8];
|
||||
};
|
||||
|
||||
struct ras_cmd_devices_info_rsp {
|
||||
uint32_t version;
|
||||
uint32_t dev_num;
|
||||
uint32_t reserved[6];
|
||||
struct ras_cmd_dev_info devs[RAS_CMD_MAX_GPU_NUM];
|
||||
};
|
||||
|
||||
struct ras_cmd_bad_page_record {
|
||||
union {
|
||||
uint64_t address;
|
||||
uint64_t offset;
|
||||
};
|
||||
uint64_t retired_page;
|
||||
uint64_t ts;
|
||||
|
||||
uint32_t err_type;
|
||||
|
||||
union {
|
||||
unsigned char bank;
|
||||
unsigned char cu;
|
||||
};
|
||||
|
||||
unsigned char mem_channel;
|
||||
unsigned char mcumc_id;
|
||||
|
||||
unsigned char valid;
|
||||
unsigned char reserved[8];
|
||||
};
|
||||
|
||||
struct ras_cmd_bad_pages_info_req {
|
||||
struct ras_cmd_dev_handle device;
|
||||
uint32_t group_index;
|
||||
uint32_t reserved[5];
|
||||
};
|
||||
|
||||
struct ras_cmd_bad_pages_info_rsp {
|
||||
uint32_t version;
|
||||
uint32_t group_index;
|
||||
uint32_t bp_in_group;
|
||||
uint32_t bp_total_cnt;
|
||||
uint32_t reserved[4];
|
||||
struct ras_cmd_bad_page_record records[RAS_CMD_MAX_BAD_PAGES_PER_GROUP];
|
||||
};
|
||||
|
||||
struct ras_query_interface_info_req {
|
||||
uint32_t reserved[8];
|
||||
};
|
||||
|
||||
struct ras_query_interface_info_rsp {
|
||||
uint32_t version;
|
||||
uint32_t ras_cmd_major_ver;
|
||||
uint32_t ras_cmd_minor_ver;
|
||||
uint32_t plat_major_ver;
|
||||
uint32_t plat_minor_ver;
|
||||
uint8_t interface_type;
|
||||
uint8_t rsv[3];
|
||||
uint32_t reserved[8];
|
||||
};
|
||||
|
||||
#define RAS_MAX_NUM_SAFE_RANGES 64
|
||||
struct ras_cmd_ras_safe_fb_address_ranges_rsp {
|
||||
uint32_t version;
|
||||
uint32_t num_ranges;
|
||||
uint32_t reserved[4];
|
||||
struct {
|
||||
uint64_t start;
|
||||
uint64_t size;
|
||||
uint32_t idx;
|
||||
uint32_t reserved[3];
|
||||
} range[RAS_MAX_NUM_SAFE_RANGES];
|
||||
};
|
||||
|
||||
enum ras_fb_addr_type {
|
||||
RAS_FB_ADDR_SOC_PHY, /* SPA */
|
||||
RAS_FB_ADDR_BANK,
|
||||
RAS_FB_ADDR_VF_PHY, /* GPA */
|
||||
RAS_FB_ADDR_UNKNOWN
|
||||
};
|
||||
|
||||
struct ras_fb_bank_addr {
|
||||
uint32_t stack_id; /* SID */
|
||||
uint32_t bank_group;
|
||||
uint32_t bank;
|
||||
uint32_t row;
|
||||
uint32_t column;
|
||||
uint32_t channel;
|
||||
uint32_t subchannel; /* Also called Pseudochannel (PC) */
|
||||
uint32_t reserved[3];
|
||||
};
|
||||
|
||||
struct ras_fb_vf_phy_addr {
|
||||
uint32_t vf_idx;
|
||||
uint32_t reserved;
|
||||
uint64_t addr;
|
||||
};
|
||||
|
||||
union ras_translate_fb_address {
|
||||
struct ras_fb_bank_addr bank_addr;
|
||||
uint64_t soc_phy_addr;
|
||||
struct ras_fb_vf_phy_addr vf_phy_addr;
|
||||
};
|
||||
|
||||
struct ras_cmd_translate_fb_address_req {
|
||||
struct ras_cmd_dev_handle dev;
|
||||
enum ras_fb_addr_type src_addr_type;
|
||||
enum ras_fb_addr_type dest_addr_type;
|
||||
union ras_translate_fb_address trans_addr;
|
||||
};
|
||||
|
||||
struct ras_cmd_translate_fb_address_rsp {
|
||||
uint32_t version;
|
||||
uint32_t reserved[5];
|
||||
union ras_translate_fb_address trans_addr;
|
||||
};
|
||||
|
||||
struct ras_dev_link_topology_req {
|
||||
struct ras_cmd_dev_handle src;
|
||||
struct ras_cmd_dev_handle dst;
|
||||
};
|
||||
|
||||
struct ras_dev_link_topology_rsp {
|
||||
uint32_t version;
|
||||
uint32_t link_status; /* HW status of the link */
|
||||
uint32_t link_type; /* type of the link */
|
||||
uint32_t num_hops; /* number of hops */
|
||||
uint32_t reserved[8];
|
||||
};
|
||||
|
||||
struct ras_cmd_cper_snapshot_req {
|
||||
struct ras_cmd_dev_handle dev;
|
||||
};
|
||||
|
||||
struct ras_cmd_cper_snapshot_rsp {
|
||||
uint32_t version;
|
||||
uint32_t reserved[4];
|
||||
uint32_t total_cper_num;
|
||||
uint64_t start_cper_id;
|
||||
uint64_t latest_cper_id;
|
||||
};
|
||||
|
||||
struct ras_cmd_cper_record_req {
|
||||
struct ras_cmd_dev_handle dev;
|
||||
uint64_t cper_start_id;
|
||||
uint32_t cper_num;
|
||||
uint32_t buf_size;
|
||||
uint64_t buf_ptr;
|
||||
uint32_t reserved[4];
|
||||
};
|
||||
|
||||
struct ras_cmd_cper_record_rsp {
|
||||
uint32_t version;
|
||||
uint32_t real_data_size;
|
||||
uint32_t real_cper_num;
|
||||
uint32_t remain_num;
|
||||
uint32_t reserved[4];
|
||||
};
|
||||
|
||||
struct ras_cmd_batch_trace_snapshot_req {
|
||||
struct ras_cmd_dev_handle dev;
|
||||
};
|
||||
|
||||
struct ras_cmd_batch_trace_snapshot_rsp {
|
||||
uint32_t version;
|
||||
uint32_t reserved[4];
|
||||
uint32_t total_batch_num;
|
||||
uint64_t start_batch_id;
|
||||
uint64_t latest_batch_id;
|
||||
};
|
||||
|
||||
struct ras_cmd_batch_trace_record_req {
|
||||
struct ras_cmd_dev_handle dev;
|
||||
uint64_t start_batch_id;
|
||||
uint32_t batch_num;
|
||||
uint32_t reserved[5];
|
||||
};
|
||||
|
||||
struct batch_ras_trace_info {
|
||||
uint64_t batch_id;
|
||||
uint16_t offset;
|
||||
uint8_t trace_num;
|
||||
uint8_t rsv;
|
||||
uint32_t reserved;
|
||||
};
|
||||
|
||||
#define RAS_CMD_MAX_BATCH_NUM 300
|
||||
#define RAS_CMD_MAX_TRACE_NUM 300
|
||||
struct ras_cmd_batch_trace_record_rsp {
|
||||
uint32_t version;
|
||||
uint16_t real_batch_num;
|
||||
uint16_t remain_num;
|
||||
uint64_t start_batch_id;
|
||||
uint32_t reserved[2];
|
||||
struct batch_ras_trace_info batchs[RAS_CMD_MAX_BATCH_NUM];
|
||||
struct ras_log_info records[RAS_CMD_MAX_TRACE_NUM];
|
||||
};
|
||||
|
||||
#pragma pack(pop)
|
||||
|
||||
int ras_cmd_init(struct ras_core_context *ras_core);
|
||||
int ras_cmd_fini(struct ras_core_context *ras_core);
|
||||
int rascore_handle_cmd(struct ras_core_context *ras_core, struct ras_cmd_ioctl *cmd, void *data);
|
||||
uint64_t ras_cmd_get_dev_handle(struct ras_core_context *ras_core);
|
||||
int ras_cmd_query_interface_info(struct ras_core_context *ras_core,
|
||||
struct ras_query_interface_info_rsp *rsp);
|
||||
int ras_cmd_translate_soc_pa_to_bank(struct ras_core_context *ras_core,
|
||||
uint64_t soc_pa, struct ras_fb_bank_addr *bank_addr);
|
||||
int ras_cmd_translate_bank_to_soc_pa(struct ras_core_context *ras_core,
|
||||
struct ras_fb_bank_addr bank_addr, uint64_t *soc_pa);
|
||||
#endif
|
||||
Loading…
Reference in New Issue
Block a user