diff --git a/drivers/gpu/drm/xe/regs/xe_hw_error_regs.h b/drivers/gpu/drm/xe/regs/xe_hw_error_regs.h
index c146b9ef44eb..cd17d7d7372c 100644
--- a/drivers/gpu/drm/xe/regs/xe_hw_error_regs.h
+++ b/drivers/gpu/drm/xe/regs/xe_hw_error_regs.h
@@ -6,15 +6,58 @@
 #ifndef _XE_HW_ERROR_REGS_H_
 #define _XE_HW_ERROR_REGS_H_
 
-#define HEC_UNCORR_ERR_STATUS(base)                    XE_REG((base) + 0x118)
-#define    UNCORR_FW_REPORTED_ERR                      BIT(6)
+#define HEC_UNCORR_ERR_STATUS(base)			XE_REG((base) + 0x118)
+#define   UNCORR_FW_REPORTED_ERR			REG_BIT(6)
 
-#define HEC_UNCORR_FW_ERR_DW0(base)                    XE_REG((base) + 0x124)
+#define HEC_UNCORR_FW_ERR_DW0(base)			XE_REG((base) + 0x124)
 
-#define DEV_ERR_STAT_NONFATAL			0x100178
-#define DEV_ERR_STAT_CORRECTABLE		0x10017c
-#define DEV_ERR_STAT_REG(x)			XE_REG(_PICK_EVEN((x), \
-								  DEV_ERR_STAT_CORRECTABLE, \
-								  DEV_ERR_STAT_NONFATAL))
-#define   XE_CSC_ERROR				BIT(17)
+#define ERR_STAT_GT_COR					0x100160
+#define   EU_GRF_COR_ERR				REG_BIT(15)
+#define   EU_IC_COR_ERR					REG_BIT(14)
+#define   SLM_COR_ERR					REG_BIT(13)
+#define   GUC_COR_ERR					REG_BIT(1)
+
+#define ERR_STAT_GT_NONFATAL				0x100164
+#define ERR_STAT_GT_FATAL				0x100168
+#define   EU_GRF_FAT_ERR				REG_BIT(15)
+#define   SLM_FAT_ERR					REG_BIT(13)
+#define   GUC_FAT_ERR					REG_BIT(6)
+#define   FPU_FAT_ERR					REG_BIT(3)
+
+#define ERR_STAT_GT_REG(x)				XE_REG(_PICK_EVEN((x), \
+								  ERR_STAT_GT_COR, \
+								  ERR_STAT_GT_NONFATAL))
+
+#define PVC_COR_ERR_MASK				(GUC_COR_ERR | SLM_COR_ERR | \
+							 EU_IC_COR_ERR | EU_GRF_COR_ERR)
+
+#define PVC_FAT_ERR_MASK				(FPU_FAT_ERR | GUC_FAT_ERR | \
+							 EU_GRF_FAT_ERR | SLM_FAT_ERR)
+
+#define DEV_ERR_STAT_NONFATAL				0x100178
+#define DEV_ERR_STAT_CORRECTABLE			0x10017c
+#define DEV_ERR_STAT_REG(x)				XE_REG(_PICK_EVEN((x), \
+								  DEV_ERR_STAT_CORRECTABLE, \
+								  DEV_ERR_STAT_NONFATAL))
+
+#define XE_CSC_ERROR					17
+#define XE_GT_ERROR					0
+
+#define ERR_STAT_GT_FATAL_VECTOR_0			0x100260
+#define ERR_STAT_GT_FATAL_VECTOR_1			0x100264
+
+#define ERR_STAT_GT_FATAL_VECTOR_REG(x)			XE_REG(_PICK_EVEN((x), \
+								  ERR_STAT_GT_FATAL_VECTOR_0, \
+								  ERR_STAT_GT_FATAL_VECTOR_1))
+
+#define ERR_STAT_GT_COR_VECTOR_0			0x1002a0
+#define ERR_STAT_GT_COR_VECTOR_1			0x1002a4
+
+#define ERR_STAT_GT_COR_VECTOR_REG(x)			XE_REG(_PICK_EVEN((x), \
+								  ERR_STAT_GT_COR_VECTOR_0, \
+								  ERR_STAT_GT_COR_VECTOR_1))
+
+#define ERR_STAT_GT_VECTOR_REG(hw_err, x)	((hw_err) == HARDWARE_ERROR_CORRECTABLE ? \
+						 ERR_STAT_GT_COR_VECTOR_REG(x) : \
+						 ERR_STAT_GT_FATAL_VECTOR_REG(x))
 #endif
diff --git a/drivers/gpu/drm/xe/xe_hw_error.c b/drivers/gpu/drm/xe/xe_hw_error.c
index baae050163df..8062a6ff98a4 100644
--- a/drivers/gpu/drm/xe/xe_hw_error.c
+++ b/drivers/gpu/drm/xe/xe_hw_error.c
@@ -3,6 +3,7 @@
  * Copyright © 2025 Intel Corporation
  */
 
+#include <linux/bitmap.h>
 #include <linux/fault-inject.h>
 
 #include "regs/xe_gsc_regs.h"
@@ -15,7 +16,13 @@
 #include "xe_mmio.h"
 #include "xe_survivability_mode.h"
 
-#define HEC_UNCORR_FW_ERR_BITS 4
+#define GT_HW_ERROR_MAX_ERR_BITS	16
+#define HEC_UNCORR_FW_ERR_BITS		4
+#define XE_RAS_REG_SIZE			32
+
+#define PVC_ERROR_MASK_SET(hw_err, err_bit)	(((hw_err) == HARDWARE_ERROR_CORRECTABLE) ? \
+						(PVC_COR_ERR_MASK & REG_BIT(err_bit)) : \
+						(PVC_FAT_ERR_MASK & REG_BIT(err_bit)))
 
 extern struct fault_attr inject_csc_hw_error;
 
@@ -28,10 +35,24 @@ static const char * const hec_uncorrected_fw_errors[] = {
 	"Data Corruption"
 };
 
-static bool fault_inject_csc_hw_error(void)
-{
-	return IS_ENABLED(CONFIG_DEBUG_FS) && should_fail(&inject_csc_hw_error, 1);
-}
+static const unsigned long xe_hw_error_map[] = {
+	[XE_GT_ERROR] = DRM_XE_RAS_ERR_COMP_CORE_COMPUTE,
+};
+
+enum gt_vector_regs {
+	ERR_STAT_GT_VECTOR0 = 0,
+	ERR_STAT_GT_VECTOR1,
+	ERR_STAT_GT_VECTOR2,
+	ERR_STAT_GT_VECTOR3,
+	ERR_STAT_GT_VECTOR4,
+	ERR_STAT_GT_VECTOR5,
+	ERR_STAT_GT_VECTOR6,
+	ERR_STAT_GT_VECTOR7,
+	ERR_STAT_GT_VECTOR_MAX
+};
+
+#define PVC_GT_VECTOR_LEN(hw_err)	(((hw_err) == HARDWARE_ERROR_CORRECTABLE) ? \
+					 ERR_STAT_GT_VECTOR4 : ERR_STAT_GT_VECTOR_MAX)
 
 static enum drm_xe_ras_error_severity hw_err_to_severity(const enum hardware_error hw_err)
 {
@@ -42,6 +63,11 @@ static enum drm_xe_ras_error_severity hw_err_to_severity(const enum hardware_err
 	return DRM_XE_RAS_ERR_SEV_UNCORRECTABLE;
 }
 
+static bool fault_inject_csc_hw_error(void)
+{
+	return IS_ENABLED(CONFIG_DEBUG_FS) && should_fail(&inject_csc_hw_error, 1);
+}
+
 static void csc_hw_error_work(struct work_struct *work)
 {
 	struct xe_tile *tile = container_of(work, typeof(*tile), csc_hw_error_work);
@@ -89,15 +115,137 @@ static void csc_hw_error_handler(struct xe_tile *tile, const enum hardware_error
 	xe_mmio_write32(mmio, HEC_UNCORR_ERR_STATUS(base), err_src);
 }
 
+/*
+ * Log a hardware error by name and severity. Correctable errors are logged
+ * as warnings, all other severities as rate-limited errors.
+ */
+static void log_hw_error(struct xe_tile *tile, const char *name,
+			 const enum drm_xe_ras_error_severity severity)
+{
+	const char *severity_str = error_severity[severity];
+	struct xe_device *xe = tile_to_xe(tile);
+
+	if (severity == DRM_XE_RAS_ERR_SEV_CORRECTABLE)
+		drm_warn(&xe->drm, "%s %s detected\n", name, severity_str);
+	else
+		drm_err_ratelimited(&xe->drm, "%s %s detected\n", name, severity_str);
+}
+
+/*
+ * Log a GT error by name and severity together with the index @i and value
+ * @err of the error vector register. Correctable errors are logged as
+ * warnings, all other severities as rate-limited errors.
+ */
+static void log_gt_err(struct xe_tile *tile, const char *name, int i, u32 err,
+		       const enum drm_xe_ras_error_severity severity)
+{
+	const char *severity_str = error_severity[severity];
+	struct xe_device *xe = tile_to_xe(tile);
+
+	if (severity == DRM_XE_RAS_ERR_SEV_CORRECTABLE)
+		drm_warn(&xe->drm, "%s %s detected, ERROR_STAT_GT_VECTOR%d:0x%08x\n",
+			 name, severity_str, i, err);
+	else
+		drm_err_ratelimited(&xe->drm, "%s %s detected, ERROR_STAT_GT_VECTOR%d:0x%08x\n",
+				    name, severity_str, i, err);
+}
+
+/*
+ * Account for and log GT errors of type @hw_err. Only PVC is handled.
+ *
+ * Non-fatal errors bump the counter at @error_id once and return. For other
+ * error types, each non-zero GT error vector register is logged and written
+ * back with the value read; vectors 0-3, 6 and 7 also add their number of set
+ * bits to that counter.
+ */
+static void gt_hw_error_handler(struct xe_tile *tile, const enum hardware_error hw_err,
+				u32 error_id)
+{
+	const enum drm_xe_ras_error_severity severity = hw_err_to_severity(hw_err);
+	struct xe_device *xe = tile_to_xe(tile);
+	struct xe_drm_ras *ras = &xe->ras;
+	struct xe_drm_ras_counter *info = ras->info[severity];
+	struct xe_mmio *mmio = &tile->mmio;
+	unsigned long err_stat = 0;
+	int i;
+
+	if (xe->info.platform != XE_PVC)
+		return;
+
+	if (hw_err == HARDWARE_ERROR_NONFATAL) {
+		atomic_inc(&info[error_id].counter);
+		log_hw_error(tile, info[error_id].name, severity);
+		return;
+	}
+
+	for (i = 0; i < PVC_GT_VECTOR_LEN(hw_err); i++) {
+		u32 vector, val;
+
+		vector = xe_mmio_read32(mmio, ERR_STAT_GT_VECTOR_REG(hw_err, i));
+		if (!vector)
+			continue;
+
+		switch (i) {
+		case ERR_STAT_GT_VECTOR0:
+		case ERR_STAT_GT_VECTOR1: {
+			u32 errbit;
+
+			val = hweight32(vector);
+			atomic_add(val, &info[error_id].counter);
+			log_gt_err(tile, "Subslice", i, vector, severity);
+
+			/*
+			 * Error status register is only populated once per error.
+			 * Read the register and clear once.
+			 */
+			if (err_stat)
+				break;
+
+			err_stat = xe_mmio_read32(mmio, ERR_STAT_GT_REG(hw_err));
+			for_each_set_bit(errbit, &err_stat, GT_HW_ERROR_MAX_ERR_BITS) {
+				if (PVC_ERROR_MASK_SET(hw_err, errbit))
+					atomic_inc(&info[error_id].counter);
+			}
+			if (err_stat)
+				xe_mmio_write32(mmio, ERR_STAT_GT_REG(hw_err), err_stat);
+			break;
+		}
+		case ERR_STAT_GT_VECTOR2:
+		case ERR_STAT_GT_VECTOR3:
+			val = hweight32(vector);
+			atomic_add(val, &info[error_id].counter);
+			log_gt_err(tile, "L3 BANK", i, vector, severity);
+			break;
+		case ERR_STAT_GT_VECTOR6:
+			val = hweight32(vector);
+			atomic_add(val, &info[error_id].counter);
+			log_gt_err(tile, "TLB", i, vector, severity);
+			break;
+		case ERR_STAT_GT_VECTOR7:
+			val = hweight32(vector);
+			atomic_add(val, &info[error_id].counter);
+			log_gt_err(tile, "L3 Fabric", i, vector, severity);
+			break;
+		default:
+			log_gt_err(tile, "Undefined", i, vector, severity);
+			break;
+		}
+
+		xe_mmio_write32(mmio, ERR_STAT_GT_VECTOR_REG(hw_err, i), vector);
+	}
+}
+
 static void hw_error_source_handler(struct xe_tile *tile, const enum hardware_error hw_err)
 {
 	const enum drm_xe_ras_error_severity severity = hw_err_to_severity(hw_err);
 	const char *severity_str = error_severity[severity];
 	struct xe_device *xe = tile_to_xe(tile);
-	unsigned long flags;
-	u32 err_src;
+	struct xe_drm_ras *ras = &xe->ras;
+	struct xe_drm_ras_counter *info = ras->info[severity];
+	unsigned long flags, err_src;
+	u32 err_bit;
 
-	if (xe->info.platform != XE_BATTLEMAGE)
+	if (!IS_DGFX(xe))
 		return;
 
 	spin_lock_irqsave(&xe->irq.lock, flags);
@@ -108,11 +256,53 @@ static void hw_error_source_handler(struct xe_tile *tile, const enum hardware_er
 		goto unlock;
 	}
 
-	if (err_src & XE_CSC_ERROR)
+	/*
+	 * On encountering CSC firmware errors, the graphics device becomes unrecoverable
+	 * so return immediately on error. The only way to recover from these errors is
+	 * firmware flash. The device will enter Runtime Survivability mode when such
+	 * errors are detected.
+	 */
+	if (err_src & REG_BIT(XE_CSC_ERROR)) {
 		csc_hw_error_handler(tile, hw_err);
+		goto clear_reg;
+	}
 
+	if (!info)
+		goto clear_reg;
+
+	for_each_set_bit(err_bit, &err_src, XE_RAS_REG_SIZE) {
+		const char *name;
+		u32 error_id;
+
+		/* Check error bit is within bounds */
+		if (err_bit >= ARRAY_SIZE(xe_hw_error_map))
+			break;
+
+		error_id = xe_hw_error_map[err_bit];
+
+		/* Check error component is within max */
+		if (!error_id || error_id >= DRM_XE_RAS_ERR_COMP_MAX)
+			continue;
+
+		name = info[error_id].name;
+		if (!name)
+			continue;
+
+		if (severity == DRM_XE_RAS_ERR_SEV_CORRECTABLE) {
+			drm_warn(&xe->drm, HW_ERR
+				 "TILE%d reported %s %s, bit[%d] is set\n",
+				 tile->id, name, severity_str, err_bit);
+		} else {
+			drm_err_ratelimited(&xe->drm, HW_ERR
+					    "TILE%d reported %s %s, bit[%d] is set\n",
+					    tile->id, name, severity_str, err_bit);
+		}
+		if (err_bit == XE_GT_ERROR)
+			gt_hw_error_handler(tile, hw_err, error_id);
+	}
+
+clear_reg:
 	xe_mmio_write32(&tile->mmio, DEV_ERR_STAT_REG(hw_err), err_src);
-
 unlock:
 	spin_unlock_irqrestore(&xe->irq.lock, flags);
 }
@@ -134,9 +324,10 @@ void xe_hw_error_irq_handler(struct xe_tile *tile, const u32 master_ctl)
 	if (fault_inject_csc_hw_error())
 		schedule_work(&tile->csc_hw_error_work);
 
-	for (hw_err = 0; hw_err < HARDWARE_ERROR_MAX; hw_err++)
+	for (hw_err = 0; hw_err < HARDWARE_ERROR_MAX; hw_err++) {
 		if (master_ctl & ERROR_IRQ(hw_err))
 			hw_error_source_handler(tile, hw_err);
+	}
 }
 
 static int hw_error_info_init(struct xe_device *xe)