drm/amd/ras: Add ras ioctl command handler

Add ras ioctl command handler.

V2:
  Remove ras global device list.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
YiPeng Chai 2025-03-17 17:31:24 +08:00 committed by Alex Deucher
parent c49ef01183
commit 19030244e1
2 changed files with 952 additions and 0 deletions

View File

@ -0,0 +1,527 @@
// SPDX-License-Identifier: MIT
/*
* Copyright 2025 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
*/
#include "ras.h"
#include "ras_cmd.h"
/* Major version of the RAS command interface (see ras_cmd_query_interface_info()). */
#define RAS_CMD_MAJOR_VERSION 6
/* Minor version of the RAS command interface (see ras_cmd_query_interface_info()). */
#define RAS_CMD_MINOR_VERSION 0
/* Combined version word: the major version shifted left by 10 bits, OR-ed with the minor version. */
#define RAS_CMD_VERSION (((RAS_CMD_MAJOR_VERSION) << 10) | (RAS_CMD_MINOR_VERSION))
/*
 * Set up the command-manager state embedded in @ras_core.
 *
 * Initialises the list head, records the owning core context and derives the
 * device handle by XOR-ing the context address with RAS_CMD_DEV_HANDLE_MAGIC.
 *
 * Always returns 0.
 */
static int ras_cmd_add_device(struct ras_core_context *ras_core)
{
	INIT_LIST_HEAD(&ras_core->ras_cmd.head);
	ras_core->ras_cmd.ras_core = ras_core;

	/*
	 * Convert the pointer through uintptr_t first: a direct pointer ->
	 * uint64_t cast triggers "cast from pointer to integer of different
	 * size" where pointers are narrower than 64 bits.
	 */
	ras_core->ras_cmd.dev_handle =
		(uint64_t)(uintptr_t)ras_core ^ RAS_CMD_DEV_HANDLE_MAGIC;

	return 0;
}
/*
 * Tear down the command-manager state of @ras_core by wiping the whole
 * embedded ras_cmd member (list head, back pointer and device handle).
 *
 * Always returns 0.
 */
static int ras_cmd_remove_device(struct ras_core_context *ras_core)
{
	void *mgr = &ras_core->ras_cmd;

	memset(mgr, 0, sizeof(ras_core->ras_cmd));

	return 0;
}
/*
 * RAS_CMD__GET_BLOCK_ECC_STATUS handler.
 *
 * Reads the block id from the request in cmd->input_buff_raw, queries the
 * ECC counters for that block through ras_aca_get_block_ecc_count() and
 * writes the CE/UE/DE totals to the response in cmd->output_buff_raw.
 *
 * @ras_core: RAS core context of the target device
 * @cmd:      command envelope; output_size is set on success
 * @data:     unused
 *
 * Returns RAS_CMD__SUCCESS, RAS_CMD__ERROR_INVALID_INPUT_SIZE when the
 * request size does not match, or RAS_CMD__ERROR_GENERIC when the counter
 * query fails.
 */
static int ras_get_block_ecc_info(struct ras_core_context *ras_core,
				  struct ras_cmd_ioctl *cmd, void *data)
{
	struct ras_cmd_block_ecc_info_req *input_data =
		(struct ras_cmd_block_ecc_info_req *)cmd->input_buff_raw;
	struct ras_cmd_block_ecc_info_rsp *output_data =
		(struct ras_cmd_block_ecc_info_rsp *)cmd->output_buff_raw;
	struct ras_ecc_count err_data;
	int ret;

	if (cmd->input_size != sizeof(struct ras_cmd_block_ecc_info_req))
		return RAS_CMD__ERROR_INVALID_INPUT_SIZE;

	memset(&err_data, 0, sizeof(err_data));
	ret = ras_aca_get_block_ecc_count(ras_core, input_data->block_id, &err_data);
	if (ret)
		return RAS_CMD__ERROR_GENERIC;

	/*
	 * Report version 0 explicitly, like every other handler in this file
	 * that fills a response, instead of leaving whatever the output
	 * buffer happened to contain.
	 */
	output_data->version = 0;
	output_data->ce_count = err_data.total_ce_count;
	output_data->ue_count = err_data.total_ue_count;
	output_data->de_count = err_data.total_de_count;

	cmd->output_size = sizeof(struct ras_cmd_block_ecc_info_rsp);
	return RAS_CMD__SUCCESS;
}
/*
 * Translate one EEPROM UMC record into the command-interface bad page record.
 *
 * Only the fields listed below are written; the caller is responsible for
 * the initial state of everything else in @ras_cmd_record. The record is
 * flagged valid.
 */
static void ras_cmd_update_bad_page_info(struct ras_cmd_bad_page_record *ras_cmd_record,
					 struct eeprom_umc_record *record)
{
	/* location of the faulty memory */
	ras_cmd_record->address = record->address;
	ras_cmd_record->retired_page = record->cur_nps_retired_row_pfn;
	ras_cmd_record->bank = record->bank;
	ras_cmd_record->mem_channel = record->mem_channel;
	ras_cmd_record->mcumc_id = record->mcumc_id;

	/* when and what */
	ras_cmd_record->ts = record->ts;
	ras_cmd_record->err_type = record->err_type;

	ras_cmd_record->valid = 1;
}
static int ras_cmd_get_group_bad_pages(struct ras_core_context *ras_core,
uint32_t group_index, struct ras_cmd_bad_pages_info_rsp *output_data)
{
struct eeprom_umc_record record;
struct ras_cmd_bad_page_record *ras_cmd_record;
uint32_t i = 0, bp_cnt = 0, group_cnt = 0;
output_data->bp_in_group = 0;
output_data->group_index = 0;
bp_cnt = ras_umc_get_badpage_count(ras_core);
if (bp_cnt) {
output_data->group_index = group_index;
group_cnt = bp_cnt / RAS_CMD_MAX_BAD_PAGES_PER_GROUP
+ ((bp_cnt % RAS_CMD_MAX_BAD_PAGES_PER_GROUP) ? 1 : 0);
if (group_index >= group_cnt)
return RAS_CMD__ERROR_INVALID_INPUT_DATA;
i = group_index * RAS_CMD_MAX_BAD_PAGES_PER_GROUP;
for (;
i < bp_cnt && output_data->bp_in_group < RAS_CMD_MAX_BAD_PAGES_PER_GROUP;
i++) {
if (ras_umc_get_badpage_record(ras_core, i, &record))
return RAS_CMD__ERROR_GENERIC;
ras_cmd_record = &output_data->records[i % RAS_CMD_MAX_BAD_PAGES_PER_GROUP];
memset(ras_cmd_record, 0, sizeof(*ras_cmd_record));
ras_cmd_update_bad_page_info(ras_cmd_record, &record);
output_data->bp_in_group++;
}
}
output_data->bp_total_cnt = bp_cnt;
return RAS_CMD__SUCCESS;
}
/*
 * RAS_CMD__GET_BAD_PAGES handler.
 *
 * Validates the request size, then lets ras_cmd_get_group_bad_pages() fill
 * the response for the requested group. Any failure of that helper is
 * reported to the caller as RAS_CMD__ERROR_GENERIC.
 *
 * @data is unused.
 */
static int ras_cmd_get_bad_pages(struct ras_core_context *ras_core,
				 struct ras_cmd_ioctl *cmd, void *data)
{
	struct ras_cmd_bad_pages_info_req *req;
	struct ras_cmd_bad_pages_info_rsp *rsp;

	if (cmd->input_size != sizeof(*req))
		return RAS_CMD__ERROR_INVALID_INPUT_SIZE;

	req = (struct ras_cmd_bad_pages_info_req *)cmd->input_buff_raw;
	rsp = (struct ras_cmd_bad_pages_info_rsp *)cmd->output_buff_raw;

	if (ras_cmd_get_group_bad_pages(ras_core, req->group_index, rsp))
		return RAS_CMD__ERROR_GENERIC;

	rsp->version = 0;
	cmd->output_size = sizeof(*rsp);

	return RAS_CMD__SUCCESS;
}
/*
 * RAS_CMD__CLEAR_BAD_PAGE_INFO handler.
 *
 * The request carries only a device handle. Resets the EEPROM table first
 * and, if that worked, cleans the UMC bad page data; either step failing
 * yields RAS_CMD__ERROR_GENERIC.
 *
 * @data is unused.
 */
static int ras_cmd_clear_bad_page_info(struct ras_core_context *ras_core,
				       struct ras_cmd_ioctl *cmd, void *data)
{
	if (cmd->input_size != sizeof(struct ras_cmd_dev_handle))
		return RAS_CMD__ERROR_INVALID_INPUT_SIZE;

	/* short-circuit keeps the original order: EEPROM table, then UMC data */
	if (ras_eeprom_reset_table(ras_core) ||
	    ras_umc_clean_badpage_data(ras_core))
		return RAS_CMD__ERROR_GENERIC;

	return RAS_CMD__SUCCESS;
}
/*
 * RAS_CMD__RESET_ALL_ERROR_COUNTS handler.
 *
 * The request carries only a device handle. Clears the ACA ECC counters of
 * all blocks first and, if that worked, the logged UMC ECC data; either step
 * failing yields RAS_CMD__ERROR_GENERIC.
 *
 * @data is unused.
 */
static int ras_cmd_reset_all_error_counts(struct ras_core_context *ras_core,
					  struct ras_cmd_ioctl *cmd, void *data)
{
	if (cmd->input_size != sizeof(struct ras_cmd_dev_handle))
		return RAS_CMD__ERROR_INVALID_INPUT_SIZE;

	/* short-circuit keeps the original order: ACA counters, then UMC log */
	if (ras_aca_clear_all_blocks_ecc_count(ras_core) ||
	    ras_umc_clear_logged_ecc(ras_core))
		return RAS_CMD__ERROR_GENERIC;

	return RAS_CMD__SUCCESS;
}
static int ras_cmd_get_cper_snapshot(struct ras_core_context *ras_core,
struct ras_cmd_ioctl *cmd, void *data)
{
struct ras_cmd_cper_snapshot_rsp *output_data =
(struct ras_cmd_cper_snapshot_rsp *)cmd->output_buff_raw;
struct ras_log_batch_overview overview;
if (cmd->input_size != sizeof(struct ras_cmd_cper_snapshot_req))
return RAS_CMD__ERROR_INVALID_INPUT_SIZE;
ras_log_ring_get_batch_overview(ras_core, &overview);
output_data->total_cper_num = overview.logged_batch_count;
output_data->start_cper_id = overview.first_batch_id;
output_data->latest_cper_id = overview.last_batch_id;
output_data->version = 0;
cmd->output_size = sizeof(struct ras_cmd_cper_snapshot_rsp);
return RAS_CMD__SUCCESS;
}
static int ras_cmd_get_cper_records(struct ras_core_context *ras_core,
struct ras_cmd_ioctl *cmd, void *data)
{
struct ras_cmd_cper_record_req *req =
(struct ras_cmd_cper_record_req *)cmd->input_buff_raw;
struct ras_cmd_cper_record_rsp *rsp =
(struct ras_cmd_cper_record_rsp *)cmd->output_buff_raw;
struct ras_log_info *trace[MAX_RECORD_PER_BATCH] = {0};
struct ras_log_batch_overview overview;
uint32_t offset = 0, real_data_len = 0;
uint64_t batch_id;
uint8_t *buffer;
int ret = 0, i, count;
if (cmd->input_size != sizeof(struct ras_cmd_cper_record_req))
return RAS_CMD__ERROR_INVALID_INPUT_SIZE;
if (!req->buf_size || !req->buf_ptr || !req->cper_num)
return RAS_CMD__ERROR_INVALID_INPUT_DATA;
if (!access_ok((void *)req->buf_ptr, req->buf_size)) {
RAS_DEV_ERR(ras_core->dev, "Invalid cper buffer memory!\n");
return RAS_CMD__ERROR_INVALID_INPUT_DATA;
}
buffer = kzalloc(req->buf_size, GFP_KERNEL);
if (!buffer)
return RAS_CMD__ERROR_GENERIC;
ras_log_ring_get_batch_overview(ras_core, &overview);
for (i = 0; i < req->cper_num; i++) {
batch_id = req->cper_start_id + i;
if (batch_id >= overview.last_batch_id)
break;
count = ras_log_ring_get_batch_records(ras_core, batch_id, trace,
ARRAY_SIZE(trace));
if (count > 0) {
ret = ras_cper_generate_cper(ras_core, trace, count,
&buffer[offset], req->buf_size - offset, &real_data_len);
if (ret)
break;
offset += real_data_len;
}
}
if ((ret && (ret != -ENOMEM)) ||
copy_to_user((void *)req->buf_ptr, buffer, offset)) {
kfree(buffer);
return RAS_CMD__ERROR_GENERIC;
}
rsp->real_data_size = offset;
rsp->real_cper_num = i;
rsp->remain_num = (ret == -ENOMEM) ? (req->cper_num - i) : 0;
rsp->version = 0;
cmd->output_size = sizeof(struct ras_cmd_cper_record_rsp);
kfree(buffer);
return RAS_CMD__SUCCESS;
}
static int ras_cmd_get_batch_trace_snapshot(struct ras_core_context *ras_core,
struct ras_cmd_ioctl *cmd, void *data)
{
struct ras_cmd_batch_trace_snapshot_rsp *rsp =
(struct ras_cmd_batch_trace_snapshot_rsp *)cmd->output_buff_raw;
struct ras_log_batch_overview overview;
if (cmd->input_size != sizeof(struct ras_cmd_batch_trace_snapshot_req))
return RAS_CMD__ERROR_INVALID_INPUT_SIZE;
ras_log_ring_get_batch_overview(ras_core, &overview);
rsp->total_batch_num = overview.logged_batch_count;
rsp->start_batch_id = overview.first_batch_id;
rsp->latest_batch_id = overview.last_batch_id;
rsp->version = 0;
cmd->output_size = sizeof(struct ras_cmd_batch_trace_snapshot_rsp);
return RAS_CMD__SUCCESS;
}
static int ras_cmd_get_batch_trace_records(struct ras_core_context *ras_core,
struct ras_cmd_ioctl *cmd, void *data)
{
struct ras_cmd_batch_trace_record_req *input_data =
(struct ras_cmd_batch_trace_record_req *)cmd->input_buff_raw;
struct ras_cmd_batch_trace_record_rsp *output_data =
(struct ras_cmd_batch_trace_record_rsp *)cmd->output_buff_raw;
struct ras_log_batch_overview overview;
struct ras_log_info *trace_arry[MAX_RECORD_PER_BATCH] = {0};
struct ras_log_info *record;
int i, j, count = 0, offset = 0;
uint64_t id;
bool completed = false;
if (cmd->input_size != sizeof(struct ras_cmd_batch_trace_record_req))
return RAS_CMD__ERROR_INVALID_INPUT_SIZE;
if ((!input_data->batch_num) || (input_data->batch_num > RAS_CMD_MAX_BATCH_NUM))
return RAS_CMD__ERROR_INVALID_INPUT_DATA;
ras_log_ring_get_batch_overview(ras_core, &overview);
if ((input_data->start_batch_id < overview.first_batch_id) ||
(input_data->start_batch_id >= overview.last_batch_id))
return RAS_CMD__ERROR_INVALID_INPUT_SIZE;
for (i = 0; i < input_data->batch_num; i++) {
id = input_data->start_batch_id + i;
if (id >= overview.last_batch_id) {
completed = true;
break;
}
count = ras_log_ring_get_batch_records(ras_core,
id, trace_arry, ARRAY_SIZE(trace_arry));
if (count > 0) {
if ((offset + count) > RAS_CMD_MAX_TRACE_NUM)
break;
for (j = 0; j < count; j++) {
record = &output_data->records[offset + j];
record->seqno = trace_arry[j]->seqno;
record->timestamp = trace_arry[j]->timestamp;
record->event = trace_arry[j]->event;
memcpy(&record->aca_reg,
&trace_arry[j]->aca_reg, sizeof(trace_arry[j]->aca_reg));
}
} else {
count = 0;
}
output_data->batchs[i].batch_id = id;
output_data->batchs[i].offset = offset;
output_data->batchs[i].trace_num = count;
offset += count;
}
output_data->start_batch_id = input_data->start_batch_id;
output_data->real_batch_num = i;
output_data->remain_num = completed ? 0 : (input_data->batch_num - i);
output_data->version = 0;
cmd->output_size = sizeof(struct ras_cmd_batch_trace_record_rsp);
return RAS_CMD__SUCCESS;
}
static enum ras_ta_block __get_ras_ta_block(enum ras_block_id block)
{
switch (block) {
case RAS_BLOCK_ID__UMC:
return RAS_TA_BLOCK__UMC;
case RAS_BLOCK_ID__SDMA:
return RAS_TA_BLOCK__SDMA;
case RAS_BLOCK_ID__GFX:
return RAS_TA_BLOCK__GFX;
case RAS_BLOCK_ID__MMHUB:
return RAS_TA_BLOCK__MMHUB;
case RAS_BLOCK_ID__ATHUB:
return RAS_TA_BLOCK__ATHUB;
case RAS_BLOCK_ID__PCIE_BIF:
return RAS_TA_BLOCK__PCIE_BIF;
case RAS_BLOCK_ID__HDP:
return RAS_TA_BLOCK__HDP;
case RAS_BLOCK_ID__XGMI_WAFL:
return RAS_TA_BLOCK__XGMI_WAFL;
case RAS_BLOCK_ID__DF:
return RAS_TA_BLOCK__DF;
case RAS_BLOCK_ID__SMN:
return RAS_TA_BLOCK__SMN;
case RAS_BLOCK_ID__SEM:
return RAS_TA_BLOCK__SEM;
case RAS_BLOCK_ID__MP0:
return RAS_TA_BLOCK__MP0;
case RAS_BLOCK_ID__MP1:
return RAS_TA_BLOCK__MP1;
case RAS_BLOCK_ID__FUSE:
return RAS_TA_BLOCK__FUSE;
case RAS_BLOCK_ID__MCA:
return RAS_TA_BLOCK__MCA;
case RAS_BLOCK_ID__VCN:
return RAS_TA_BLOCK__VCN;
case RAS_BLOCK_ID__JPEG:
return RAS_TA_BLOCK__JPEG;
default:
return RAS_TA_BLOCK__UMC;
}
}
static enum ras_ta_error_type __get_ras_ta_err_type(enum ras_ecc_err_type error)
{
switch (error) {
case RAS_ECC_ERR__NONE:
return RAS_TA_ERROR__NONE;
case RAS_ECC_ERR__PARITY:
return RAS_TA_ERROR__PARITY;
case RAS_ECC_ERR__SINGLE_CORRECTABLE:
return RAS_TA_ERROR__SINGLE_CORRECTABLE;
case RAS_ECC_ERR__MULTI_UNCORRECTABLE:
return RAS_TA_ERROR__MULTI_UNCORRECTABLE;
case RAS_ECC_ERR__POISON:
return RAS_TA_ERROR__POISON;
default:
return RAS_TA_ERROR__NONE;
}
}
static int ras_cmd_inject_error(struct ras_core_context *ras_core,
struct ras_cmd_ioctl *cmd, void *data)
{
struct ras_cmd_inject_error_req *req =
(struct ras_cmd_inject_error_req *)cmd->input_buff_raw;
struct ras_cmd_inject_error_rsp *output_data =
(struct ras_cmd_inject_error_rsp *)cmd->output_buff_raw;
int ret = 0;
struct ras_ta_trigger_error_input block_info = {
.block_id = __get_ras_ta_block(req->block_id),
.sub_block_index = req->subblock_id,
.inject_error_type = __get_ras_ta_err_type(req->error_type),
.address = req->address,
.value = req->method,
};
ret = ras_psp_trigger_error(ras_core, &block_info, req->instance_mask);
if (!ret) {
output_data->version = 0;
output_data->address = block_info.address;
cmd->output_size = sizeof(struct ras_cmd_inject_error_rsp);
} else {
RAS_DEV_ERR(ras_core->dev, "ras inject block %u failed %d\n", req->block_id, ret);
ret = RAS_CMD__ERROR_ACCESS_DENIED;
}
return ret;
}
/*
 * Command id -> handler dispatch table used by rascore_handle_cmd().
 * The table is never modified at run time, so it is const: the function
 * pointers then live in read-only data.
 */
static const struct ras_cmd_func_map ras_cmd_maps[] = {
	{RAS_CMD__INJECT_ERROR, ras_cmd_inject_error},
	{RAS_CMD__GET_BLOCK_ECC_STATUS, ras_get_block_ecc_info},
	{RAS_CMD__GET_BAD_PAGES, ras_cmd_get_bad_pages},
	{RAS_CMD__CLEAR_BAD_PAGE_INFO, ras_cmd_clear_bad_page_info},
	{RAS_CMD__RESET_ALL_ERROR_COUNTS, ras_cmd_reset_all_error_counts},
	{RAS_CMD__GET_CPER_SNAPSHOT, ras_cmd_get_cper_snapshot},
	{RAS_CMD__GET_CPER_RECORD, ras_cmd_get_cper_records},
	{RAS_CMD__GET_BATCH_TRACE_SNAPSHOT, ras_cmd_get_batch_trace_snapshot},
	{RAS_CMD__GET_BATCH_TRACE_RECORD, ras_cmd_get_batch_trace_records},
};

/*
 * Dispatch one RAS command to its handler.
 *
 * Looks cmd->cmd_id up in ras_cmd_maps and calls the matching handler with
 * the same arguments.
 *
 * @ras_core: RAS core context of the target device
 * @cmd:      command envelope (request in, response out)
 * @data:     opaque pointer forwarded unchanged to the handler
 *
 * Returns the handler's result, or RAS_CMD__ERROR_UKNOWN_CMD when no handler
 * is registered for cmd->cmd_id.
 */
int rascore_handle_cmd(struct ras_core_context *ras_core,
		       struct ras_cmd_ioctl *cmd, void *data)
{
	unsigned int i;	/* unsigned: compared against ARRAY_SIZE() */

	for (i = 0; i < ARRAY_SIZE(ras_cmd_maps); i++) {
		if (cmd->cmd_id == ras_cmd_maps[i].cmd_id)
			return ras_cmd_maps[i].func(ras_core, cmd, data);
	}

	return RAS_CMD__ERROR_UKNOWN_CMD;
}
/*
 * Initialise the RAS command layer for @ras_core.
 *
 * Returns the result of ras_cmd_add_device().
 */
int ras_cmd_init(struct ras_core_context *ras_core)
{
	int ret = ras_cmd_add_device(ras_core);

	return ret;
}
/*
 * Shut down the RAS command layer for @ras_core.
 *
 * The result of ras_cmd_remove_device() is deliberately ignored; this
 * function always returns 0.
 */
int ras_cmd_fini(struct ras_core_context *ras_core)
{
	(void)ras_cmd_remove_device(ras_core);

	return 0;
}
/*
 * Report the RAS command interface version in @rsp.
 *
 * Only ras_cmd_major_ver and ras_cmd_minor_ver are written; @ras_core is not
 * used. Always returns 0.
 */
int ras_cmd_query_interface_info(struct ras_core_context *ras_core,
				 struct ras_query_interface_info_rsp *rsp)
{
	rsp->ras_cmd_minor_ver = RAS_CMD_MINOR_VERSION;
	rsp->ras_cmd_major_ver = RAS_CMD_MAJOR_VERSION;

	return 0;
}
/*
 * Translate a SoC physical address into a framebuffer bank address.
 *
 * @ras_core:  RAS core context of the target device
 * @soc_pa:    SoC physical address to translate
 * @bank_addr: receives the bank coordinates; not written on failure
 *
 * Returns 0 on success or RAS_CMD__ERROR_GENERIC when
 * ras_umc_translate_soc_pa_and_bank() fails.
 */
int ras_cmd_translate_soc_pa_to_bank(struct ras_core_context *ras_core,
		uint64_t soc_pa, struct ras_fb_bank_addr *bank_addr)
{
	struct umc_bank_addr umc_bank = {0};

	if (ras_umc_translate_soc_pa_and_bank(ras_core, &soc_pa, &umc_bank, false))
		return RAS_CMD__ERROR_GENERIC;

	/* copy the UMC result field by field into the command-interface layout */
	bank_addr->channel = umc_bank.channel;
	bank_addr->subchannel = umc_bank.subchannel;
	bank_addr->stack_id = umc_bank.stack_id;
	bank_addr->bank_group = umc_bank.bank_group;
	bank_addr->bank = umc_bank.bank;
	bank_addr->row = umc_bank.row;
	bank_addr->column = umc_bank.column;

	return 0;
}
int ras_cmd_translate_bank_to_soc_pa(struct ras_core_context *ras_core,
struct ras_fb_bank_addr bank_addr, uint64_t *soc_pa)
{
struct umc_bank_addr umc_bank = {0};
umc_bank.stack_id = bank_addr.stack_id;
umc_bank.bank_group = bank_addr.bank_group;
umc_bank.bank = bank_addr.bank;
umc_bank.row = bank_addr.row;
umc_bank.column = bank_addr.column;
umc_bank.channel = bank_addr.channel;
umc_bank.subchannel = bank_addr.subchannel;
return ras_umc_translate_soc_pa_and_bank(ras_core, soc_pa, &umc_bank, true);
}
/* Return the device handle stored in the command manager of @ras_core. */
uint64_t ras_cmd_get_dev_handle(struct ras_core_context *ras_core)
{
	uint64_t handle = ras_core->ras_cmd.dev_handle;

	return handle;
}

View File

@ -0,0 +1,425 @@
/* SPDX-License-Identifier: MIT */
/*
* Copyright 2025 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
*/
#ifndef __RAS_CMD_H__
#define __RAS_CMD_H__
#include "ras.h"
#include "ras_eeprom.h"
#include "ras_log_ring.h"
#include "ras_cper.h"
/* Constant XOR-ed with the ras_core_context address to form a device handle. */
#define RAS_CMD_DEV_HANDLE_MAGIC 0xFEEDAD00UL
/* Size in bytes of the fixed request area (input_buff_raw) of struct ras_cmd_ioctl. */
#define RAS_CMD_MAX_IN_SIZE 256
/* Number of entries in the devs[] array of struct ras_cmd_devices_info_rsp. */
#define RAS_CMD_MAX_GPU_NUM 32
/* Number of bad page records returned per group in struct ras_cmd_bad_pages_info_rsp. */
#define RAS_CMD_MAX_BAD_PAGES_PER_GROUP 32
/* Position of the instance value in sub_block_index of
* ta_ras_trigger_error_input; the sub block uses the lower 12 bits.
*/
#define RAS_TA_INST_MASK 0xfffff000
#define RAS_TA_INST_SHIFT 0xc
/*
 * Kinds of RAS command interface.
 * NOTE(review): presumably the value carried in
 * ras_query_interface_info_rsp.interface_type - confirm against the caller.
 */
enum ras_cmd_interface_type {
RAS_CMD_INTERFACE_TYPE_NONE,
RAS_CMD_INTERFACE_TYPE_AMDGPU,
RAS_CMD_INTERFACE_TYPE_VF,
RAS_CMD_INTERFACE_TYPE_PF,
};
/*
 * Command id ranges, 0x10000 ids each. Every *_START equals the previous
 * *_END, so the ranges are contiguous.
 */
enum ras_cmd_id_range {
RAS_CMD_ID_COMMON_START = 0,
RAS_CMD_ID_COMMON_END = 0x10000,
RAS_CMD_ID_AMDGPU_START = RAS_CMD_ID_COMMON_END,
RAS_CMD_ID_AMDGPU_END = 0x20000,
RAS_CMD_ID_MXGPU_START = RAS_CMD_ID_AMDGPU_END,
RAS_CMD_ID_MXGPU_END = 0x30000,
RAS_CMD_ID_MXGPU_VF_START = RAS_CMD_ID_MXGPU_END,
RAS_CMD_ID_MXGPU_VF_END = 0x40000,
};
/*
 * Command ids of the common range. Values are assigned sequentially from
 * RAS_CMD_ID_COMMON_START, so the order of the entries is part of the
 * interface. RAS_CMD__SUPPORTED_MAX marks the end of the common range.
 */
enum ras_cmd_id {
RAS_CMD__BEGIN = RAS_CMD_ID_COMMON_START,
RAS_CMD__QUERY_INTERFACE_INFO,
RAS_CMD__GET_DEVICES_INFO,
RAS_CMD__GET_BLOCK_ECC_STATUS,
RAS_CMD__INJECT_ERROR,
RAS_CMD__GET_BAD_PAGES,
RAS_CMD__CLEAR_BAD_PAGE_INFO,
RAS_CMD__RESET_ALL_ERROR_COUNTS,
RAS_CMD__GET_SAFE_FB_ADDRESS_RANGES,
RAS_CMD__TRANSLATE_FB_ADDRESS,
RAS_CMD__GET_LINK_TOPOLOGY,
RAS_CMD__GET_CPER_SNAPSHOT,
RAS_CMD__GET_CPER_RECORD,
RAS_CMD__GET_BATCH_TRACE_SNAPSHOT,
RAS_CMD__GET_BATCH_TRACE_RECORD,
RAS_CMD__SUPPORTED_MAX = RAS_CMD_ID_COMMON_END,
};
/*
 * Result codes returned by the command handlers; RAS_CMD__SUCCESS is 0 and
 * the remaining values are assigned sequentially.
 * The spellings "EXEED" and "UKNOWN" are part of the identifiers used by
 * existing code and must be kept as they are.
 */
enum ras_cmd_response {
RAS_CMD__SUCCESS = 0,
RAS_CMD__SUCCESS_EXEED_BUFFER,
RAS_CMD__ERROR_UKNOWN_CMD,
RAS_CMD__ERROR_INVALID_CMD,
RAS_CMD__ERROR_VERSION,
RAS_CMD__ERROR_INVALID_INPUT_SIZE,
RAS_CMD__ERROR_INVALID_INPUT_DATA,
RAS_CMD__ERROR_DRV_INIT_FAIL,
RAS_CMD__ERROR_ACCESS_DENIED,
RAS_CMD__ERROR_GENERIC,
RAS_CMD__ERROR_TIMEOUT,
};
/*
 * RAS error types; the non-zero values are distinct single bits.
 * NOTE(review): the handlers visible in ras_cmd.c pass the request's error
 * type to a helper taking enum ras_ecc_err_type, not this enum - confirm
 * where this one is consumed.
 */
enum ras_error_type {
RAS_TYPE_ERROR__NONE = 0,
RAS_TYPE_ERROR__PARITY = 1,
RAS_TYPE_ERROR__SINGLE_CORRECTABLE = 2,
RAS_TYPE_ERROR__MULTI_UNCORRECTABLE = 4,
RAS_TYPE_ERROR__POISON = 8,
};
/* Forward declarations: only pointers to these types are needed before their definitions. */
struct ras_core_context;
struct ras_cmd_ioctl;
/*
 * Per-device command manager state.
 * head:       list head, initialised empty when the device is added
 * ras_core:   back pointer to the owning core context
 * dev_handle: handle derived from the core context address and
 *             RAS_CMD_DEV_HANDLE_MAGIC
 */
struct ras_cmd_mgr {
struct list_head head;
struct ras_core_context *ras_core;
uint64_t dev_handle;
};
/*
 * One entry of a command dispatch table: the command id and the handler
 * invoked for it. The handler receives the core context, the command
 * envelope and an opaque data pointer, and returns a result code.
 */
struct ras_cmd_func_map {
uint32_t cmd_id;
int (*func)(struct ras_core_context *ras_core,
struct ras_cmd_ioctl *cmd, void *data);
};
/*
 * Device location packed into 32 bits: function (3 bits), device (5 bits),
 * bus (8 bits) and domain (16 bits), also accessible as one word through
 * u32_all.
 */
struct ras_device_bdf {
union {
struct {
uint32_t function : 3;
uint32_t device : 5;
uint32_t bus : 8;
uint32_t domain : 16;
};
uint32_t u32_all;
};
};
/*
 * Extra command parameters: a VF index and an opaque data pointer.
 * NOTE(review): not referenced by the handlers visible in ras_cmd.c;
 * presumably what callers pass as the handlers' "data" argument - confirm.
 */
struct ras_cmd_param {
uint32_t idx_vf;
void *data;
};
#pragma pack(push, 8)
/*
 * Command envelope exchanged with the command handlers.
 *
 * cmd_id selects the handler. input_size is checked by the handlers against
 * the size of the expected request structure, which they read from
 * input_buff_raw. Handlers write the response to output_buff_raw (a flexible
 * array, so the buffer must be allocated with room for the response) and set
 * output_size to the size of the response structure.
 *
 * NOTE(review): the ras_cmd version packs minor in 10 bits then major in
 * 6 bits, while the plat version packs major in 10 bits then minor in
 * 6 bits - confirm the differing layouts are intentional.
 */
struct ras_cmd_ioctl {
uint32_t magic;
union {
struct {
uint16_t ras_cmd_minor_ver : 10;
uint16_t ras_cmd_major_ver : 6;
};
uint16_t ras_cmd_ver;
};
union {
struct {
uint16_t plat_major_ver : 10;
uint16_t plat_minor_ver : 6;
};
uint16_t plat_ver;
};
uint32_t cmd_id;
uint32_t cmd_res;
uint32_t input_size;
uint32_t output_size;
uint32_t reserved[6];
uint8_t input_buff_raw[RAS_CMD_MAX_IN_SIZE];
uint8_t output_buff_raw[];
};
/*
 * Device handle wrapper; the first member of most request structures and the
 * whole request of the clear-bad-page-info and reset-all-error-counts
 * commands.
 */
struct ras_cmd_dev_handle {
uint64_t dev_handle;
};
/*
 * Request of RAS_CMD__GET_BLOCK_ECC_STATUS. The handler in ras_cmd.c reads
 * only block_id.
 */
struct ras_cmd_block_ecc_info_req {
struct ras_cmd_dev_handle dev;
uint32_t block_id;
uint32_t subblock_id;
uint32_t reserved[4];
};
/*
 * Response of RAS_CMD__GET_BLOCK_ECC_STATUS: the total CE, UE and DE counts
 * of the queried block.
 */
struct ras_cmd_block_ecc_info_rsp {
uint32_t version;
uint32_t ce_count;
uint32_t ue_count;
uint32_t de_count;
uint32_t reserved[6];
};
/*
 * Request of RAS_CMD__INJECT_ERROR. block_id and error_type are translated
 * to their RAS TA equivalents by the handler; subblock_id, address, method
 * and instance_mask are forwarded as given.
 */
struct ras_cmd_inject_error_req {
struct ras_cmd_dev_handle dev;
uint32_t block_id;
uint32_t subblock_id;
uint64_t address;
uint32_t error_type;
uint32_t instance_mask;
union {
struct {
/* vf index */
uint64_t vf_idx : 6;
/* method of error injection, e.g. persistent, coherent etc. */
uint64_t method : 10;
uint64_t rsv : 48;
};
uint64_t value;
};
uint32_t reserved[8];
};
/*
 * Response of RAS_CMD__INJECT_ERROR; address is the address held in the
 * trigger request after the injection call.
 */
struct ras_cmd_inject_error_rsp {
uint32_t version;
uint32_t reserved[5];
uint64_t address;
};
/*
 * Description of one device as reported in struct ras_cmd_devices_info_rsp.
 * NOTE(review): the code filling these fields is not in ras_cmd.c; field
 * meanings are taken from their names only.
 */
struct ras_cmd_dev_info {
uint64_t dev_handle;
uint32_t location_id;
uint32_t ecc_enabled;
uint32_t ecc_supported;
uint32_t vf_num;
uint32_t asic_type;
uint32_t oam_id;
uint32_t reserved[8];
};
/*
 * Device list response with room for RAS_CMD_MAX_GPU_NUM entries.
 * NOTE(review): presumably dev_num gives the number of valid devs[] entries
 * - confirm against the producer, which is not in ras_cmd.c.
 */
struct ras_cmd_devices_info_rsp {
uint32_t version;
uint32_t dev_num;
uint32_t reserved[6];
struct ras_cmd_dev_info devs[RAS_CMD_MAX_GPU_NUM];
};
/*
 * One bad page record as returned by RAS_CMD__GET_BAD_PAGES. The handler
 * fills address, retired_page, ts, err_type, bank, mem_channel and mcumc_id
 * from the EEPROM UMC record and sets valid to 1.
 */
struct ras_cmd_bad_page_record {
union {
uint64_t address;
uint64_t offset;
};
uint64_t retired_page;
uint64_t ts;
uint32_t err_type;
union {
unsigned char bank;
unsigned char cu;
};
unsigned char mem_channel;
unsigned char mcumc_id;
unsigned char valid;
unsigned char reserved[8];
};
/*
 * Request of RAS_CMD__GET_BAD_PAGES; group_index selects which group of
 * RAS_CMD_MAX_BAD_PAGES_PER_GROUP records to return.
 */
struct ras_cmd_bad_pages_info_req {
struct ras_cmd_dev_handle device;
uint32_t group_index;
uint32_t reserved[5];
};
/*
 * Response of RAS_CMD__GET_BAD_PAGES.
 * group_index:  the group that was returned (0 when there are no bad pages)
 * bp_in_group:  number of records filled in records[]
 * bp_total_cnt: total number of bad pages across all groups
 */
struct ras_cmd_bad_pages_info_rsp {
uint32_t version;
uint32_t group_index;
uint32_t bp_in_group;
uint32_t bp_total_cnt;
uint32_t reserved[4];
struct ras_cmd_bad_page_record records[RAS_CMD_MAX_BAD_PAGES_PER_GROUP];
};
/* Interface-info query request; it has reserved fields only. */
struct ras_query_interface_info_req {
uint32_t reserved[8];
};
/*
 * Interface-info query response. ras_cmd_query_interface_info() fills only
 * ras_cmd_major_ver and ras_cmd_minor_ver; the remaining fields are left to
 * its caller.
 */
struct ras_query_interface_info_rsp {
uint32_t version;
uint32_t ras_cmd_major_ver;
uint32_t ras_cmd_minor_ver;
uint32_t plat_major_ver;
uint32_t plat_minor_ver;
uint8_t interface_type;
uint8_t rsv[3];
uint32_t reserved[8];
};
/* Capacity of the range[] array in the safe framebuffer address range response. */
#define RAS_MAX_NUM_SAFE_RANGES 64
/*
 * Safe framebuffer address ranges response: up to RAS_MAX_NUM_SAFE_RANGES
 * entries, each with a start, a size and an index.
 * NOTE(review): presumably num_ranges gives the number of valid entries -
 * confirm against the producer, which is not in ras_cmd.c.
 */
struct ras_cmd_ras_safe_fb_address_ranges_rsp {
uint32_t version;
uint32_t num_ranges;
uint32_t reserved[4];
struct {
uint64_t start;
uint64_t size;
uint32_t idx;
uint32_t reserved[3];
} range[RAS_MAX_NUM_SAFE_RANGES];
};
/* Address representations that a framebuffer address translation can convert between. */
enum ras_fb_addr_type {
RAS_FB_ADDR_SOC_PHY, /* SPA */
RAS_FB_ADDR_BANK,
RAS_FB_ADDR_VF_PHY, /* GPA */
RAS_FB_ADDR_UNKNOWN
};
/*
 * Framebuffer address expressed as memory bank coordinates; converted
 * to/from a SoC physical address by ras_cmd_translate_soc_pa_to_bank() and
 * ras_cmd_translate_bank_to_soc_pa().
 */
struct ras_fb_bank_addr {
uint32_t stack_id; /* SID */
uint32_t bank_group;
uint32_t bank;
uint32_t row;
uint32_t column;
uint32_t channel;
uint32_t subchannel; /* Also called Pseudochannel (PC) */
uint32_t reserved[3];
};
/* Framebuffer address expressed as a VF index plus an address (RAS_FB_ADDR_VF_PHY form). */
struct ras_fb_vf_phy_addr {
uint32_t vf_idx;
uint32_t reserved;
uint64_t addr;
};
/*
 * A framebuffer address in one of the representations named by
 * enum ras_fb_addr_type.
 * NOTE(review): presumably the active member is selected by the accompanying
 * address-type field - confirm against the translate handler.
 */
union ras_translate_fb_address {
struct ras_fb_bank_addr bank_addr;
uint64_t soc_phy_addr;
struct ras_fb_vf_phy_addr vf_phy_addr;
};
/*
 * Framebuffer address translation request: source and destination
 * representation plus the address to convert.
 * NOTE(review): the two type fields are C enums inside a packed interface
 * structure, so their size depends on the compiler - confirm that all users
 * of this layout agree on it.
 */
struct ras_cmd_translate_fb_address_req {
struct ras_cmd_dev_handle dev;
enum ras_fb_addr_type src_addr_type;
enum ras_fb_addr_type dest_addr_type;
union ras_translate_fb_address trans_addr;
};
/* Framebuffer address translation response carrying the translated address. */
struct ras_cmd_translate_fb_address_rsp {
uint32_t version;
uint32_t reserved[5];
union ras_translate_fb_address trans_addr;
};
/* Link topology query request: handles of the source and destination devices. */
struct ras_dev_link_topology_req {
struct ras_cmd_dev_handle src;
struct ras_cmd_dev_handle dst;
};
/* Link topology query response describing the link between the two devices. */
struct ras_dev_link_topology_rsp {
uint32_t version;
uint32_t link_status; /* HW status of the link */
uint32_t link_type; /* type of the link */
uint32_t num_hops; /* number of hops */
uint32_t reserved[8];
};
/* Request of RAS_CMD__GET_CPER_SNAPSHOT; carries the device handle only. */
struct ras_cmd_cper_snapshot_req {
struct ras_cmd_dev_handle dev;
};
/*
 * Response of RAS_CMD__GET_CPER_SNAPSHOT, filled from the log ring batch
 * overview: total_cper_num is the logged batch count, start_cper_id the
 * first batch id and latest_cper_id the last batch id.
 */
struct ras_cmd_cper_snapshot_rsp {
uint32_t version;
uint32_t reserved[4];
uint32_t total_cper_num;
uint64_t start_cper_id;
uint64_t latest_cper_id;
};
/*
 * Request of RAS_CMD__GET_CPER_RECORD.
 * cper_start_id: id of the first batch to convert
 * cper_num:      number of batches requested (must be non-zero)
 * buf_size:      size in bytes of the caller buffer (must be non-zero)
 * buf_ptr:       address of the caller buffer that receives the CPER data
 *                via copy_to_user() (must be non-zero)
 */
struct ras_cmd_cper_record_req {
struct ras_cmd_dev_handle dev;
uint64_t cper_start_id;
uint32_t cper_num;
uint32_t buf_size;
uint64_t buf_ptr;
uint32_t reserved[4];
};
/*
 * Response of RAS_CMD__GET_CPER_RECORD.
 * real_data_size: number of bytes copied to the caller buffer
 * real_cper_num:  number of batches processed
 * remain_num:     requested batches not delivered because the buffer filled
 *                 up, otherwise 0
 */
struct ras_cmd_cper_record_rsp {
uint32_t version;
uint32_t real_data_size;
uint32_t real_cper_num;
uint32_t remain_num;
uint32_t reserved[4];
};
/* Request of RAS_CMD__GET_BATCH_TRACE_SNAPSHOT; carries the device handle only. */
struct ras_cmd_batch_trace_snapshot_req {
struct ras_cmd_dev_handle dev;
};
/*
 * Response of RAS_CMD__GET_BATCH_TRACE_SNAPSHOT, filled from the log ring
 * batch overview: total_batch_num is the logged batch count, start_batch_id
 * the first batch id and latest_batch_id the last batch id.
 */
struct ras_cmd_batch_trace_snapshot_rsp {
uint32_t version;
uint32_t reserved[4];
uint32_t total_batch_num;
uint64_t start_batch_id;
uint64_t latest_batch_id;
};
/*
 * Request of RAS_CMD__GET_BATCH_TRACE_RECORD.
 * start_batch_id: first batch to return; must lie between the first
 *                 (inclusive) and last (exclusive) batch id of the log ring
 * batch_num:      number of batches requested, 1..RAS_CMD_MAX_BATCH_NUM
 */
struct ras_cmd_batch_trace_record_req {
struct ras_cmd_dev_handle dev;
uint64_t start_batch_id;
uint32_t batch_num;
uint32_t reserved[5];
};
/*
 * Index entry for one returned batch.
 * batch_id:  id of the batch
 * offset:    index of the batch's first record in the response's records[]
 * trace_num: number of records belonging to the batch
 */
struct batch_ras_trace_info {
uint64_t batch_id;
uint16_t offset;
uint8_t trace_num;
uint8_t rsv;
uint32_t reserved;
};
/* Maximum number of batches per RAS_CMD__GET_BATCH_TRACE_RECORD request/response. */
#define RAS_CMD_MAX_BATCH_NUM 300
/* Maximum number of trace records per RAS_CMD__GET_BATCH_TRACE_RECORD response. */
#define RAS_CMD_MAX_TRACE_NUM 300
/*
 * Response of RAS_CMD__GET_BATCH_TRACE_RECORD.
 * real_batch_num: number of batchs[] entries filled
 * remain_num:     requested batches not delivered, 0 when the end of the
 *                 log ring was reached
 * start_batch_id: echo of the request's start_batch_id
 * batchs:         per-batch index into records[]
 * records:        trace records of all returned batches, back to back
 */
struct ras_cmd_batch_trace_record_rsp {
uint32_t version;
uint16_t real_batch_num;
uint16_t remain_num;
uint64_t start_batch_id;
uint32_t reserved[2];
struct batch_ras_trace_info batchs[RAS_CMD_MAX_BATCH_NUM];
struct ras_log_info records[RAS_CMD_MAX_TRACE_NUM];
};
#pragma pack(pop)
/* Set up the command manager state of @ras_core. */
int ras_cmd_init(struct ras_core_context *ras_core);
/* Clear the command manager state of @ras_core; always returns 0. */
int ras_cmd_fini(struct ras_core_context *ras_core);
/* Dispatch @cmd to the handler registered for cmd->cmd_id; returns an enum ras_cmd_response value. */
int rascore_handle_cmd(struct ras_core_context *ras_core, struct ras_cmd_ioctl *cmd, void *data);
/* Return the device handle of @ras_core. */
uint64_t ras_cmd_get_dev_handle(struct ras_core_context *ras_core);
/* Fill the RAS command major/minor version fields of @rsp; always returns 0. */
int ras_cmd_query_interface_info(struct ras_core_context *ras_core,
struct ras_query_interface_info_rsp *rsp);
/* Translate @soc_pa into bank coordinates; returns 0 or RAS_CMD__ERROR_GENERIC. */
int ras_cmd_translate_soc_pa_to_bank(struct ras_core_context *ras_core,
uint64_t soc_pa, struct ras_fb_bank_addr *bank_addr);
/* Translate @bank_addr into a SoC physical address; returns the UMC translation result unchanged. */
int ras_cmd_translate_bank_to_soc_pa(struct ras_core_context *ras_core,
struct ras_fb_bank_addr bank_addr, uint64_t *soc_pa);
#endif