diff --git a/drivers/gpu/drm/xe/regs/xe_gsc_regs.h b/drivers/gpu/drm/xe/regs/xe_gsc_regs.h index 9b66cc972a63..180be82672ab 100644 --- a/drivers/gpu/drm/xe/regs/xe_gsc_regs.h +++ b/drivers/gpu/drm/xe/regs/xe_gsc_regs.h @@ -13,6 +13,8 @@ /* Definitions of GSC H/W registers, bits, etc */ +#define BMG_GSC_HECI1_BASE 0x373000 /* base that csc_hw_error_handler() passes to the HEC_UNCORR_* register macros on Battlemage */ + #define MTL_GSC_HECI1_BASE 0x00116000 #define MTL_GSC_HECI2_BASE 0x00117000 diff --git a/drivers/gpu/drm/xe/regs/xe_hw_error_regs.h b/drivers/gpu/drm/xe/regs/xe_hw_error_regs.h index ed9b81fb28a0..c146b9ef44eb 100644 --- a/drivers/gpu/drm/xe/regs/xe_hw_error_regs.h +++ b/drivers/gpu/drm/xe/regs/xe_hw_error_regs.h @@ -6,10 +6,15 @@ #ifndef _XE_HW_ERROR_REGS_H_ #define _XE_HW_ERROR_REGS_H_ +#define HEC_UNCORR_ERR_STATUS(base) XE_REG((base) + 0x118) /* read by csc_hw_error_handler() and then written back with the value read; presumably write-to-clear - TODO confirm against the register spec */ +#define UNCORR_FW_REPORTED_ERR BIT(6) /* HEC_UNCORR_ERR_STATUS bit that gates the read of HEC_UNCORR_FW_ERR_DW0 */ + +#define HEC_UNCORR_FW_ERR_DW0(base) XE_REG((base) + 0x124) /* its low HEC_UNCORR_FW_ERR_BITS bits are decoded through hec_uncorrected_fw_errors[] */ + #define DEV_ERR_STAT_NONFATAL 0x100178 #define DEV_ERR_STAT_CORRECTABLE 0x10017c #define DEV_ERR_STAT_REG(x) XE_REG(_PICK_EVEN((x), \ DEV_ERR_STAT_CORRECTABLE, \ DEV_ERR_STAT_NONFATAL)) - +#define XE_CSC_ERROR BIT(17) /* tested by hw_error_source_handler() to decide whether csc_hw_error_handler() is called */ #endif diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h index 333c7aa6bbb3..389996970564 100644 --- a/drivers/gpu/drm/xe/xe_device_types.h +++ b/drivers/gpu/drm/xe/xe_device_types.h @@ -192,6 +192,9 @@ struct xe_tile { /** @memirq: Memory Based Interrupts. 
*/ struct xe_memirq memirq; + /** @csc_hw_error_work: worker to report CSC HW errors */ + struct work_struct csc_hw_error_work; + /** @pcode: tile's PCODE */ struct { /** @pcode.lock: protecting tile's PCODE mailbox data */ diff --git a/drivers/gpu/drm/xe/xe_hw_error.c b/drivers/gpu/drm/xe/xe_hw_error.c index 0fa45792f1c0..4099ed1262e1 100644 --- a/drivers/gpu/drm/xe/xe_hw_error.c +++ b/drivers/gpu/drm/xe/xe_hw_error.c @@ -3,12 +3,16 @@ * Copyright © 2025 Intel Corporation */ +#include "regs/xe_gsc_regs.h" #include "regs/xe_hw_error_regs.h" #include "regs/xe_irq_regs.h" #include "xe_device.h" #include "xe_hw_error.h" #include "xe_mmio.h" +#include "xe_survivability_mode.h" + +#define HEC_UNCORR_FW_ERR_BITS 4 /* number of low bits of HEC_UNCORR_FW_ERR_DW0 that are scanned; must not exceed the entry count of hec_uncorrected_fw_errors[] */ /* Error categories reported by hardware */ enum hardware_error { @@ -18,6 +22,13 @@ enum hardware_error { HARDWARE_ERROR_MAX, }; +static const char * const hec_uncorrected_fw_errors[] = { /* indexed by the bit position found in HEC_UNCORR_FW_ERR_DW0; four entries, matching HEC_UNCORR_FW_ERR_BITS */ + "Fatal", + "CSE Disabled", + "FD Corruption", + "Data Corruption" +}; + static const char *hw_error_to_str(const enum hardware_error hw_err) { switch (hw_err) { @@ -32,6 +43,52 @@ static const char *hw_error_to_str(const enum hardware_error hw_err) } } +static void csc_hw_error_work(struct work_struct *work) +{ /* Work callback: recovers the owning tile from the embedded work item and calls xe_survivability_mode_runtime_enable() on its device; a non-zero return is only logged. 
*/ + struct xe_tile *tile = container_of(work, typeof(*tile), csc_hw_error_work); + struct xe_device *xe = tile_to_xe(tile); + int ret; + + ret = xe_survivability_mode_runtime_enable(xe); + if (ret) + drm_err(&xe->drm, "Failed to enable runtime survivability mode\n"); +} + /* Handle a CSC error for one tile: read HEC_UNCORR_ERR_STATUS, log every firmware-reported bit set in HEC_UNCORR_FW_ERR_DW0, schedule csc_hw_error_work for each of them, then write the status value back. Does nothing unless the platform is XE_BATTLEMAGE. */ +static void csc_hw_error_handler(struct xe_tile *tile, const enum hardware_error hw_err) +{ + const char *hw_err_str = hw_error_to_str(hw_err); + struct xe_device *xe = tile_to_xe(tile); + struct xe_mmio *mmio = &tile->mmio; + u32 base, err_bit, err_src; + unsigned long fw_err; + + if (xe->info.platform != XE_BATTLEMAGE) /* no register is read or written on any other platform */ + return; + + base = BMG_GSC_HECI1_BASE; + lockdep_assert_held(&xe->irq.lock); /* the caller is expected to already hold xe->irq.lock */ + err_src = xe_mmio_read32(mmio, HEC_UNCORR_ERR_STATUS(base)); + if (!err_src) { /* status reads as zero: log it and return without writing the register back */ + 
drm_err_ratelimited(&xe->drm, HW_ERR "Tile%d reported HEC_ERR_STATUS_%s blank\n", + tile->id, hw_err_str); + return; + } + + if (err_src & UNCORR_FW_REPORTED_ERR) { + fw_err = xe_mmio_read32(mmio, HEC_UNCORR_FW_ERR_DW0(base)); + for_each_set_bit(err_bit, &fw_err, HEC_UNCORR_FW_ERR_BITS) { /* only bits below HEC_UNCORR_FW_ERR_BITS are visited, which keeps the name lookup in range */ + drm_err_ratelimited(&xe->drm, HW_ERR + "%s: HEC Uncorrected FW %s error reported, bit[%d] is set\n", + hw_err_str, hec_uncorrected_fw_errors[err_bit], + err_bit); + + schedule_work(&tile->csc_hw_error_work); /* called once per set bit; NOTE(review): repeat calls while the work is still pending are expected to be no-ops, so a single call after the loop looks equivalent - confirm */ + } + } + + xe_mmio_write32(mmio, HEC_UNCORR_ERR_STATUS(base), err_src); /* write back exactly the value read above */ +} + static void hw_error_source_handler(struct xe_tile *tile, const enum hardware_error hw_err) { const char *hw_err_str = hw_error_to_str(hw_err); @@ -50,7 +107,8 @@ static void hw_error_source_handler(struct xe_tile *tile, const enum hardware_er goto unlock; } - /* TODO: Process errrors per source */ + if (err_src & XE_CSC_ERROR) + csc_hw_error_handler(tile, hw_err); /* runs before the DEV_ERR_STAT_REG write-back that follows */ xe_mmio_write32(&tile->mmio, DEV_ERR_STAT_REG(hw_err), err_src); @@ -102,8 +160,12 @@ static void process_hw_errors(struct xe_device *xe) */ void xe_hw_error_init(struct xe_device *xe) { + struct xe_tile *tile = xe_device_get_root_tile(xe); + if (!IS_DGFX(xe) || IS_SRIOV_VF(xe)) return; + INIT_WORK(&tile->csc_hw_error_work, csc_hw_error_work); /* only the root tile work item is initialised; NOTE(review): csc_hw_error_handler() schedules the work item of whichever tile it is handed, and no cancel or flush of this work is visible in this patch - confirm both are covered elsewhere */ + process_hw_errors(xe); } diff --git a/drivers/gpu/drm/xe/xe_survivability_mode.c b/drivers/gpu/drm/xe/xe_survivability_mode.c index 86ba767c4e44..53c5af4b810c 100644 --- a/drivers/gpu/drm/xe/xe_survivability_mode.c +++ b/drivers/gpu/drm/xe/xe_survivability_mode.c @@ -346,7 +346,7 @@ int xe_survivability_mode_runtime_enable(struct xe_device *xe) xe_device_set_wedged_method(xe, DRM_WEDGE_RECOVERY_VENDOR); xe_device_declare_wedged(xe); - dev_err(&pdev->dev, "Firmware flash required, Refer the userspace documentation for more details!\n"); + dev_err(&pdev->dev, "Firmware flash required, Please refer to the userspace documentation for more details!\n"); return 0; }