mirror of
https://github.com/torvalds/linux.git
synced 2026-05-23 14:42:08 +02:00
drm/amd/ras: Support high-frequency querying sriov ras block error count
Support high-frequency querying of the sriov ras block error count: 1. Create shared memory and fill it with the RAS_CMD__GET_ALL_BLOCK_ECC_STATUS ras command. 2. The RAS_CMD__GET_ALL_BLOCK_ECC_STATUS command and the shared memory are registered to the sriov host ras auto-update list via the RAS_CMD__SET_CMD_AUTO_UPDATE command. 3. Once the sriov host detects a ras error, it automatically executes the RAS_CMD__GET_ALL_BLOCK_ECC_STATUS command and writes the result to the shared memory. Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com> Reviewed-by: Tao Zhou <tao.zhou1@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
parent
fcfa8dbb08
commit
11dcf72eb5
|
|
@ -235,9 +235,90 @@ static int amdgpu_virt_ras_get_cper_records(struct ras_core_context *ras_core,
|
|||
return RAS_CMD__SUCCESS;
|
||||
}
|
||||
|
||||
static int __fill_get_blocks_ecc_cmd(struct amdgpu_device *adev,
|
||||
struct vram_blocks_ecc *blks_ecc)
|
||||
{
|
||||
struct ras_cmd_ctx *rcmd;
|
||||
|
||||
if (!blks_ecc || !blks_ecc->bo || !blks_ecc->cpu_addr)
|
||||
return -EINVAL;
|
||||
|
||||
rcmd = (struct ras_cmd_ctx *)blks_ecc->cpu_addr;
|
||||
|
||||
rcmd->cmd_id = RAS_CMD__GET_ALL_BLOCK_ECC_STATUS;
|
||||
rcmd->input_size = sizeof(struct ras_cmd_blocks_ecc_req);
|
||||
rcmd->output_buf_size = blks_ecc->size - sizeof(*rcmd);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int __set_cmd_auto_update(struct amdgpu_device *adev,
|
||||
enum ras_cmd_id cmd_id, uint64_t gpa_addr, uint32_t len, bool reg)
|
||||
{
|
||||
struct ras_cmd_auto_update_req req = {0};
|
||||
struct ras_cmd_auto_update_rsp rsp = {0};
|
||||
int ret;
|
||||
|
||||
req.mode = reg ? 1 : 0;
|
||||
req.cmd_id = cmd_id;
|
||||
req.addr = gpa_addr;
|
||||
req.len = len;
|
||||
ret = amdgpu_ras_mgr_handle_ras_cmd(adev, RAS_CMD__SET_CMD_AUTO_UPDATE,
|
||||
&req, sizeof(req), &rsp, sizeof(rsp));
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int amdgpu_virt_ras_get_block_ecc(struct ras_core_context *ras_core,
|
||||
struct ras_cmd_ctx *cmd, void *data)
|
||||
{
|
||||
struct amdgpu_device *adev = ras_core->dev;
|
||||
struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
|
||||
struct amdgpu_virt_ras_cmd *virt_ras =
|
||||
(struct amdgpu_virt_ras_cmd *)ras_mgr->virt_ras_cmd;
|
||||
struct vram_blocks_ecc *blks_ecc = &virt_ras->blocks_ecc;
|
||||
struct ras_cmd_ctx *blks_ecc_cmd_ctx;
|
||||
struct ras_cmd_blocks_ecc_rsp *blks_ecc_rsp;
|
||||
struct ras_cmd_block_ecc_info_req *input_data =
|
||||
(struct ras_cmd_block_ecc_info_req *)cmd->input_buff_raw;
|
||||
struct ras_cmd_block_ecc_info_rsp *output_data =
|
||||
(struct ras_cmd_block_ecc_info_rsp *)cmd->output_buff_raw;
|
||||
int ret = 0;
|
||||
|
||||
if (cmd->input_size != sizeof(struct ras_cmd_block_ecc_info_req))
|
||||
return RAS_CMD__ERROR_INVALID_INPUT_SIZE;
|
||||
|
||||
if (input_data->block_id >= MAX_RAS_BLOCK_NUM)
|
||||
return RAS_CMD__ERROR_INVALID_INPUT_DATA;
|
||||
|
||||
if (__fill_get_blocks_ecc_cmd(adev, blks_ecc))
|
||||
return RAS_CMD__ERROR_GENERIC;
|
||||
|
||||
if (!virt_ras->blocks_ecc.auto_update_actived) {
|
||||
ret = __set_cmd_auto_update(adev, RAS_CMD__GET_ALL_BLOCK_ECC_STATUS,
|
||||
blks_ecc->mc_addr - adev->gmc.vram_start,
|
||||
blks_ecc->size, true);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
blks_ecc->auto_update_actived = true;
|
||||
}
|
||||
|
||||
blks_ecc_cmd_ctx = blks_ecc->cpu_addr;
|
||||
blks_ecc_rsp = (struct ras_cmd_blocks_ecc_rsp *)blks_ecc_cmd_ctx->output_buff_raw;
|
||||
|
||||
output_data->ce_count = blks_ecc_rsp->blocks[input_data->block_id].ce_count;
|
||||
output_data->ue_count = blks_ecc_rsp->blocks[input_data->block_id].ue_count;
|
||||
output_data->de_count = blks_ecc_rsp->blocks[input_data->block_id].de_count;
|
||||
|
||||
cmd->output_size = sizeof(struct ras_cmd_block_ecc_info_rsp);
|
||||
return RAS_CMD__SUCCESS;
|
||||
}
|
||||
|
||||
/*
 * Table pairing each RAS command ID with the function that services it.
 * NOTE(review): presumably walked by amdgpu_virt_ras_handle_cmd() to pick a
 * handler - confirm against that function.
 */
static struct ras_cmd_func_map amdgpu_virt_ras_cmd_maps[] = {
|
||||
{RAS_CMD__GET_CPER_SNAPSHOT, amdgpu_virt_ras_get_cper_snapshot},
|
||||
{RAS_CMD__GET_CPER_RECORD, amdgpu_virt_ras_get_cper_records},
|
||||
{RAS_CMD__GET_BLOCK_ECC_STATUS, amdgpu_virt_ras_get_block_ecc},
|
||||
};
|
||||
|
||||
int amdgpu_virt_ras_handle_cmd(struct ras_core_context *ras_core,
|
||||
|
|
@ -294,10 +375,41 @@ int amdgpu_virt_ras_sw_fini(struct amdgpu_device *adev)
|
|||
|
||||
int amdgpu_virt_ras_hw_init(struct amdgpu_device *adev)
|
||||
{
|
||||
struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
|
||||
struct amdgpu_virt_ras_cmd *virt_ras =
|
||||
(struct amdgpu_virt_ras_cmd *)ras_mgr->virt_ras_cmd;
|
||||
struct vram_blocks_ecc *blks_ecc = &virt_ras->blocks_ecc;
|
||||
|
||||
memset(blks_ecc, 0, sizeof(*blks_ecc));
|
||||
blks_ecc->size = PAGE_SIZE;
|
||||
if (amdgpu_bo_create_kernel(adev, blks_ecc->size,
|
||||
PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
|
||||
&blks_ecc->bo, &blks_ecc->mc_addr,
|
||||
(void **)&blks_ecc->cpu_addr))
|
||||
return -ENOMEM;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int amdgpu_virt_ras_hw_fini(struct amdgpu_device *adev)
|
||||
{
|
||||
struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
|
||||
struct amdgpu_virt_ras_cmd *virt_ras =
|
||||
(struct amdgpu_virt_ras_cmd *)ras_mgr->virt_ras_cmd;
|
||||
struct vram_blocks_ecc *blks_ecc = &virt_ras->blocks_ecc;
|
||||
|
||||
if (blks_ecc->bo) {
|
||||
__set_cmd_auto_update(adev,
|
||||
RAS_CMD__GET_ALL_BLOCK_ECC_STATUS,
|
||||
blks_ecc->mc_addr - adev->gmc.vram_start,
|
||||
blks_ecc->size, false);
|
||||
|
||||
memset(blks_ecc->cpu_addr, 0, blks_ecc->size);
|
||||
amdgpu_bo_free_kernel(&blks_ecc->bo,
|
||||
&blks_ecc->mc_addr, &blks_ecc->cpu_addr);
|
||||
|
||||
memset(blks_ecc, 0, sizeof(*blks_ecc));
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -30,8 +30,17 @@ struct remote_batch_trace_mgr {
|
|||
struct ras_cmd_batch_trace_record_rsp batch_trace;
|
||||
};
|
||||
|
||||
/*
 * Descriptor of the VRAM buffer that holds a
 * RAS_CMD__GET_ALL_BLOCK_ECC_STATUS command context and its response.
 * Created in amdgpu_virt_ras_hw_init() and released in
 * amdgpu_virt_ras_hw_fini().
 */
struct vram_blocks_ecc {
|
||||
struct amdgpu_bo *bo; /* backing buffer object; NULL when not allocated */
|
||||
uint64_t mc_addr; /* MC address reported by amdgpu_bo_create_kernel() */
|
||||
void *cpu_addr; /* CPU mapping; accessed as a struct ras_cmd_ctx */
|
||||
uint32_t size; /* buffer size in bytes (PAGE_SIZE when allocated) */
|
||||
bool auto_update_actived; /* set once the auto-update registration succeeded */
|
||||
};
|
||||
|
||||
/*
 * State for the virtualized RAS command path; reached through the
 * virt_ras_cmd pointer of the RAS manager context.
 */
struct amdgpu_virt_ras_cmd {
|
||||
struct remote_batch_trace_mgr batch_mgr;
|
||||
struct vram_blocks_ecc blocks_ecc; /* shared buffer for block ECC count queries */
|
||||
};
|
||||
|
||||
int amdgpu_virt_ras_sw_init(struct amdgpu_device *adev);
|
||||
|
|
|
|||
|
|
@ -75,6 +75,8 @@ enum ras_cmd_id {
|
|||
RAS_CMD__GET_CPER_RECORD,
|
||||
RAS_CMD__GET_BATCH_TRACE_SNAPSHOT,
|
||||
RAS_CMD__GET_BATCH_TRACE_RECORD,
|
||||
RAS_CMD__GET_ALL_BLOCK_ECC_STATUS,
|
||||
RAS_CMD__SET_CMD_AUTO_UPDATE,
|
||||
RAS_CMD__SUPPORTED_MAX = RAS_CMD_ID_COMMON_END,
|
||||
};
|
||||
|
||||
|
|
@ -411,6 +413,37 @@ struct ras_cmd_batch_trace_record_rsp {
|
|||
struct ras_log_info records[RAS_CMD_MAX_TRACE_NUM];
|
||||
};
|
||||
|
||||
/*
 * Input of RAS_CMD__SET_CMD_AUTO_UPDATE: adds a command buffer to, or
 * removes it from, the auto-update list.
 */
struct ras_cmd_auto_update_req {
|
||||
struct ras_cmd_dev_handle dev; /* NOTE(review): left zeroed by the visible sender - confirm who fills it */
|
||||
uint32_t mode; /* 1 = register, 0 = unregister */
|
||||
uint32_t cmd_id; /* command to keep updated (enum ras_cmd_id value) */
|
||||
uint64_t addr; /* buffer address; the sender passes MC address minus VRAM start */
|
||||
uint32_t len; /* buffer length in bytes */
|
||||
uint32_t reserved[5];
|
||||
};
|
||||
|
||||
/* Output of RAS_CMD__SET_CMD_AUTO_UPDATE. */
struct ras_cmd_auto_update_rsp {
|
||||
uint32_t version;
|
||||
uint32_t reserved[4];
|
||||
};
|
||||
|
||||
/*
 * Input of RAS_CMD__GET_ALL_BLOCK_ECC_STATUS; its size is what the guest
 * writes into the command context's input_size.
 */
struct ras_cmd_blocks_ecc_req {
|
||||
struct ras_cmd_dev_handle dev;
|
||||
};
|
||||
|
||||
/*
 * Error counters of one RAS block.
 * NOTE(review): ce/ue/de presumably stand for correctable, uncorrectable
 * and deferred errors - confirm against the RAS core definitions.
 */
struct ras_cmd_block_ecc {
|
||||
uint32_t ce_count;
|
||||
uint32_t ue_count;
|
||||
uint32_t de_count;
|
||||
};
|
||||
|
||||
/* Number of entries in ras_cmd_blocks_ecc_rsp.blocks; valid block ids are below this. */
#define MAX_RAS_BLOCK_NUM 20
|
||||
/* Output of RAS_CMD__GET_ALL_BLOCK_ECC_STATUS: counters for every block, indexed by block id. */
struct ras_cmd_blocks_ecc_rsp {
|
||||
uint32_t version;
|
||||
uint32_t reserved[5];
|
||||
struct ras_cmd_block_ecc blocks[MAX_RAS_BLOCK_NUM];
|
||||
};
|
||||
|
||||
#pragma pack(pop)
|
||||
|
||||
int ras_cmd_init(struct ras_core_context *ras_core);
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user