drm/amdgpu: Introduce funcs for generating cper record

Introduce new functions that generate CPER records for
UE or CE errors.

v2: return -ENOMEM instead of false
v2: check return value of fill section function

Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Xiang Liu <xiang.liu@amd.com>
Reviewed-by: Yang Wang <keivnyang.wang@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
Hawking Zhang 2025-01-26 17:15:48 +08:00 committed by Alex Deucher
parent 56316ee91b
commit ad97840f95
4 changed files with 128 additions and 13 deletions

View File

@ -30,16 +30,6 @@
/*
 * Signature of an ACA bank handler: receives the ACA handle, one bank, the
 * SMU type the bank belongs to and an opaque data pointer, and returns an int
 * (presumably 0 or a negative errno - confirm against the callers).
 */
typedef int bank_handler_t(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type, void *data);
/*
 * A list of ACA banks. amdgpu_cper_generate_ce_records() walks ->list as
 * struct aca_bank_node entries linked through their node member.
 */
struct aca_banks {
/* presumably the number of entries currently on list - confirm */
int nr_banks;
/* list head the bank nodes are linked on */
struct list_head list;
};
/* Element type of the aca_hwid_mcatypes[] table: one pair of integers per entry. */
struct aca_hwip {
/* presumably the hardware block ID - confirm against ACA_BANK_HWID() */
int hwid;
/* presumably the MCA type within that hardware block - confirm */
int mcatype;
};
static struct aca_hwip aca_hwid_mcatypes[ACA_HWIP_TYPE_COUNT] = {
ACA_BANK_HWID(SMU, 0x01, 0x01),
ACA_BANK_HWID(PCS_XGMI, 0x50, 0x00),
@ -111,7 +101,7 @@ static struct aca_regs_dump {
{"STATUS", ACA_REG_IDX_STATUS},
{"ADDR", ACA_REG_IDX_ADDR},
{"MISC", ACA_REG_IDX_MISC0},
{"CONFIG", ACA_REG_IDX_CONFG},
{"CONFIG", ACA_REG_IDX_CONFIG},
{"IPID", ACA_REG_IDX_IPID},
{"SYND", ACA_REG_IDX_SYND},
{"DESTAT", ACA_REG_IDX_DESTAT},

View File

@ -81,7 +81,7 @@ enum aca_reg_idx {
ACA_REG_IDX_STATUS = 1,
ACA_REG_IDX_ADDR = 2,
ACA_REG_IDX_MISC0 = 3,
ACA_REG_IDX_CONFG = 4,
ACA_REG_IDX_CONFIG = 4,
ACA_REG_IDX_IPID = 5,
ACA_REG_IDX_SYND = 6,
ACA_REG_IDX_DESTAT = 8,
@ -114,6 +114,11 @@ enum aca_smu_type {
ACA_SMU_TYPE_COUNT,
};
/* Element type of the aca_hwid_mcatypes[] table: one pair of integers per entry. */
struct aca_hwip {
/* presumably the hardware block ID - confirm against ACA_BANK_HWID() */
int hwid;
/* presumably the MCA type within that hardware block - confirm */
int mcatype;
};
struct aca_bank {
enum aca_error_type aca_err_type;
enum aca_smu_type smu_err_type;
@ -125,6 +130,11 @@ struct aca_bank_node {
struct list_head node;
};
/*
 * A list of ACA banks. amdgpu_cper_generate_ce_records() walks ->list as
 * struct aca_bank_node entries linked through their node member.
 */
struct aca_banks {
/* presumably the number of entries currently on list - confirm */
int nr_banks;
/* list head the bank nodes are linked on */
struct list_head list;
};
struct aca_bank_info {
int die_id;
int socket_id;

View File

@ -21,6 +21,7 @@
* OTHER DEALINGS IN THE SOFTWARE.
*
*/
#include <linux/list.h>
#include <linux/slab.h>
#include "amdgpu.h"
static const guid_t MCE = CPER_NOTIFY_MCE;
@ -257,6 +258,113 @@ struct cper_hdr *amdgpu_cper_alloc_entry(struct amdgpu_device *adev,
return hdr;
}
/**
 * amdgpu_cper_generate_ue_record - build a fatal CPER record from one ACA bank
 * @adev: amdgpu device the record is generated for
 * @bank: ACA bank whose STATUS, ADDR, IPID and SYND registers are reported
 *
 * Allocates a single-section AMDGPU_CPER_TYPE_FATAL entry, fills its header
 * with CPER_SEV_FATAL and stores the low/high 32-bit halves of the four bank
 * registers in the fatal section.
 *
 * Return: 0 on success, -ENOMEM if the entry cannot be allocated, or the
 * error returned by amdgpu_cper_entry_fill_fatal_section().
 */
int amdgpu_cper_generate_ue_record(struct amdgpu_device *adev,
				   struct aca_bank *bank)
{
	struct cper_sec_crashdump_reg_data reg_data = { 0 };
	struct cper_hdr *fatal;
	int ret;

	fatal = amdgpu_cper_alloc_entry(adev, AMDGPU_CPER_TYPE_FATAL, 1);
	if (!fatal) {
		dev_err(adev->dev, "fail to alloc cper entry for ue record\n");
		return -ENOMEM;
	}

	/* Split each 64-bit bank register into the lo/hi words of the section */
	reg_data.status_lo = lower_32_bits(bank->regs[ACA_REG_IDX_STATUS]);
	reg_data.status_hi = upper_32_bits(bank->regs[ACA_REG_IDX_STATUS]);
	reg_data.addr_lo   = lower_32_bits(bank->regs[ACA_REG_IDX_ADDR]);
	reg_data.addr_hi   = upper_32_bits(bank->regs[ACA_REG_IDX_ADDR]);
	reg_data.ipid_lo   = lower_32_bits(bank->regs[ACA_REG_IDX_IPID]);
	reg_data.ipid_hi   = upper_32_bits(bank->regs[ACA_REG_IDX_IPID]);
	reg_data.synd_lo   = lower_32_bits(bank->regs[ACA_REG_IDX_SYND]);
	reg_data.synd_hi   = upper_32_bits(bank->regs[ACA_REG_IDX_SYND]);

	amdgpu_cper_entry_fill_hdr(adev, fatal, AMDGPU_CPER_TYPE_FATAL, CPER_SEV_FATAL);
	ret = amdgpu_cper_entry_fill_fatal_section(adev, fatal, 0, reg_data);
	if (ret)
		goto out_free;

	/*TODO: commit the cper entry to cper ring */

out_free:
	/*
	 * Nothing holds a reference to the entry once this function returns,
	 * so release it on the success path as well as on the error path
	 * instead of leaking it.
	 * NOTE(review): assumes amdgpu_cper_alloc_entry() returns
	 * kmalloc-family memory - confirm against its definition.
	 */
	kfree(fatal);
	return ret;
}
/*
 * Translate an ACA error type into the CPER severity reported for it:
 * CE -> non-fatal corrected, DEFERRED -> non-fatal uncorrected, UE -> fatal.
 * Any other value is logged as unknown and also reported as fatal.
 */
static enum cper_error_severity amdgpu_aca_err_type_to_cper_sev(struct amdgpu_device *adev,
								 enum aca_error_type aca_err_type)
{
	if (aca_err_type == ACA_ERROR_TYPE_CE)
		return CPER_SEV_NON_FATAL_CORRECTED;

	if (aca_err_type == ACA_ERROR_TYPE_DEFERRED)
		return CPER_SEV_NON_FATAL_UNCORRECTED;

	/* UE and unrecognised types share the fatal severity; only the latter is logged */
	if (aca_err_type != ACA_ERROR_TYPE_UE)
		dev_err(adev->dev, "Unknown ACA error type!\n");

	return CPER_SEV_FATAL;
}
/**
 * amdgpu_cper_generate_ce_records - build one runtime CPER record from a bank list
 * @adev: amdgpu device the record is generated for
 * @banks: list of ACA banks, one CPER section is filled per bank on the list
 * @bank_count: number of sections the CPER entry is allocated with
 *
 * Allocates an AMDGPU_CPER_TYPE_RUNTIME entry with @bank_count sections. The
 * header severity is CPER_SEV_NON_FATAL_CORRECTED unless at least one bank on
 * the list is ACA_ERROR_TYPE_DEFERRED, in which case it is raised to
 * CPER_SEV_NON_FATAL_UNCORRECTED. Each bank's CTL, STATUS, ADDR, MISC0,
 * CONFIG, IPID and SYND registers are then written to its own section.
 *
 * Return: 0 on success, -ENOMEM if the entry cannot be allocated, -EINVAL if
 * the list holds more banks than @bank_count, or the error returned by
 * amdgpu_cper_entry_fill_runtime_section().
 */
int amdgpu_cper_generate_ce_records(struct amdgpu_device *adev,
				    struct aca_banks *banks,
				    uint16_t bank_count)
{
	struct cper_hdr *corrected = NULL;
	enum cper_error_severity sev = CPER_SEV_NON_FATAL_CORRECTED;
	uint32_t reg_data[CPER_ACA_REG_COUNT] = { 0 };
	struct aca_bank_node *node;
	struct aca_bank *bank;
	uint32_t i = 0;
	int ret = 0;

	corrected = amdgpu_cper_alloc_entry(adev, AMDGPU_CPER_TYPE_RUNTIME, bank_count);
	if (!corrected) {
		dev_err(adev->dev, "fail to allocate cper entry for ce records\n");
		return -ENOMEM;
	}

	/* Raise severity if any DE is detected in the ACA bank list */
	list_for_each_entry(node, &banks->list, node) {
		bank = &node->bank;
		if (bank->aca_err_type == ACA_ERROR_TYPE_DEFERRED) {
			sev = CPER_SEV_NON_FATAL_UNCORRECTED;
			break;
		}
	}

	amdgpu_cper_entry_fill_hdr(adev, corrected, AMDGPU_CPER_TYPE_RUNTIME, sev);

	/* Combine CE and DE in cper record, one section per bank */
	list_for_each_entry(node, &banks->list, node) {
		bank = &node->bank;

		/*
		 * The entry was allocated for bank_count sections, so never
		 * hand a section index at or beyond that to the fill helper.
		 */
		if (i >= bank_count) {
			dev_err(adev->dev,
				"aca bank list exceeds bank count %u for ce records\n",
				bank_count);
			ret = -EINVAL;
			goto out_free;
		}

		reg_data[CPER_ACA_REG_CTL_LO]    = lower_32_bits(bank->regs[ACA_REG_IDX_CTL]);
		reg_data[CPER_ACA_REG_CTL_HI]    = upper_32_bits(bank->regs[ACA_REG_IDX_CTL]);
		reg_data[CPER_ACA_REG_STATUS_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_STATUS]);
		reg_data[CPER_ACA_REG_STATUS_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_STATUS]);
		reg_data[CPER_ACA_REG_ADDR_LO]   = lower_32_bits(bank->regs[ACA_REG_IDX_ADDR]);
		reg_data[CPER_ACA_REG_ADDR_HI]   = upper_32_bits(bank->regs[ACA_REG_IDX_ADDR]);
		reg_data[CPER_ACA_REG_MISC0_LO]  = lower_32_bits(bank->regs[ACA_REG_IDX_MISC0]);
		reg_data[CPER_ACA_REG_MISC0_HI]  = upper_32_bits(bank->regs[ACA_REG_IDX_MISC0]);
		reg_data[CPER_ACA_REG_CONFIG_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_CONFIG]);
		reg_data[CPER_ACA_REG_CONFIG_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_CONFIG]);
		reg_data[CPER_ACA_REG_IPID_LO]   = lower_32_bits(bank->regs[ACA_REG_IDX_IPID]);
		reg_data[CPER_ACA_REG_IPID_HI]   = upper_32_bits(bank->regs[ACA_REG_IDX_IPID]);
		reg_data[CPER_ACA_REG_SYND_LO]   = lower_32_bits(bank->regs[ACA_REG_IDX_SYND]);
		reg_data[CPER_ACA_REG_SYND_HI]   = upper_32_bits(bank->regs[ACA_REG_IDX_SYND]);

		/* Each section carries the severity of its own bank */
		ret = amdgpu_cper_entry_fill_runtime_section(adev, corrected, i++,
				amdgpu_aca_err_type_to_cper_sev(adev, bank->aca_err_type),
				reg_data, CPER_ACA_REG_COUNT);
		if (ret)
			goto out_free;
	}

	/*TODO: commit the cper entry to cper ring */

out_free:
	/*
	 * Nothing holds a reference to the entry once this function returns,
	 * so release it on the success path as well as on the error paths
	 * instead of leaking it.
	 * NOTE(review): assumes amdgpu_cper_alloc_entry() returns
	 * kmalloc-family memory - confirm against its definition.
	 */
	kfree(corrected);
	return ret;
}
int amdgpu_cper_init(struct amdgpu_device *adev)
{
mutex_init(&adev->cper.cper_lock);

View File

@ -26,6 +26,7 @@
#define __AMDGPU_CPER_H__
#include "amd_cper.h"
#include "amdgpu_aca.h"
#define CPER_MAX_ALLOWED_COUNT 0x1000
#define HDR_LEN (sizeof(struct cper_hdr))
@ -84,7 +85,13 @@ int amdgpu_cper_entry_fill_bad_page_threshold_section(struct amdgpu_device *adev
struct cper_hdr *amdgpu_cper_alloc_entry(struct amdgpu_device *adev,
enum amdgpu_cper_type type,
uint16_t section_count);
/* Each UE must be encoded into a separate cper entry: 1 UE, 1 cper */
int amdgpu_cper_generate_ue_record(struct amdgpu_device *adev,
struct aca_bank *bank);
/* CEs and DEs are combined into 1 cper entry */
int amdgpu_cper_generate_ce_records(struct amdgpu_device *adev,
struct aca_banks *banks,
uint16_t bank_count);
int amdgpu_cper_init(struct amdgpu_device *adev);
int amdgpu_cper_fini(struct amdgpu_device *adev);