drm/xe/hwmon: Expose GPU PCIe temperature

Expose GPU PCIe average temperature and its limits via hwmon sysfs entry
temp5_xxx.
Update Xe hwmon sysfs documentation for this.

v2: Update kernel version in Xe hwmon documentation. (Raag)

v3:
 - Address review comments from Raag.
 - Remove redundant debug log.
 - Update kernel version in Xe hwmon documentation. (Raag)

v4:
 - Address review comments from Raag.
 - Group new temperature attributes with existing temperature attributes
   as per channel index in Xe hwmon documentation.
 - Use TEMP_MASK instead of TEMP_MASK_MAILBOX.
 - Add PCIE_SENSOR_MASK which uses REG_FIELD_GET as replacement of
   PCIE_SENSOR_SHIFT.

v5:
 - Address review comments from Raag.
 - Use REG_FIELD_GET to get PCIe temperature.
 - Move PCIE_SENSOR_GROUP_ID and PCIE_SENSOR_MASK to xe_pcode_api.h
 - Cosmetic change.

Signed-off-by: Karthik Poosa <karthik.poosa@intel.com>
Reviewed-by: Raag Jadav <raag.jadav@intel.com>
Link: https://patch.msgid.link/20260112203521.1014388-4-karthik.poosa@intel.com
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
This commit is contained in:
Karthik Poosa 2026-01-13 02:05:20 +05:30 committed by Rodrigo Vivi
parent 3a0cb885e1
commit 8d2511686e
No known key found for this signature in database
GPG Key ID: FA625F640EEB13CA
3 changed files with 58 additions and 0 deletions

View File

@ -189,6 +189,30 @@ Description: RO. Memory controller average temperature in millidegree Celsius.
Only supported for particular Intel Xe graphics platforms.
What: /sys/bus/pci/drivers/xe/.../hwmon/hwmon<i>/temp5_crit
Date: January 2026
KernelVersion: 7.0
Contact: intel-xe@lists.freedesktop.org
Description: RO. GPU PCIe critical temperature in millidegree Celsius.
Only supported for particular Intel Xe graphics platforms.
What: /sys/bus/pci/drivers/xe/.../hwmon/hwmon<i>/temp5_emergency
Date: January 2026
KernelVersion: 7.0
Contact: intel-xe@lists.freedesktop.org
Description: RO. GPU PCIe shutdown temperature in millidegree Celsius.
Only supported for particular Intel Xe graphics platforms.
What: /sys/bus/pci/drivers/xe/.../hwmon/hwmon<i>/temp5_input
Date: January 2026
KernelVersion: 7.0
Contact: intel-xe@lists.freedesktop.org
Description: RO. GPU PCIe temperature in millidegree Celsius.
Only supported for particular Intel Xe graphics platforms.
What: /sys/bus/pci/drivers/xe/.../hwmon/hwmon<i>/fan1_input
Date: March 2025
KernelVersion: 6.16

View File

@ -44,6 +44,7 @@ enum xe_hwmon_channel {
CHANNEL_PKG,
CHANNEL_VRAM,
CHANNEL_MCTRL,
CHANNEL_PCIE,
CHANNEL_MAX,
};
@ -712,6 +713,7 @@ static const struct hwmon_channel_info * const hwmon_info[] = {
HWMON_T_CRIT | HWMON_T_EMERGENCY | HWMON_T_INPUT | HWMON_T_LABEL |
HWMON_T_MAX,
HWMON_T_CRIT | HWMON_T_EMERGENCY | HWMON_T_INPUT | HWMON_T_LABEL,
HWMON_T_CRIT | HWMON_T_EMERGENCY | HWMON_T_INPUT | HWMON_T_LABEL,
HWMON_T_CRIT | HWMON_T_EMERGENCY | HWMON_T_INPUT | HWMON_T_LABEL),
HWMON_CHANNEL_INFO(power, HWMON_P_MAX | HWMON_P_RATED_MAX | HWMON_P_LABEL | HWMON_P_CRIT |
HWMON_P_CAP,
@ -771,6 +773,27 @@ static int get_mc_temp(struct xe_hwmon *hwmon, long *val)
return 0;
}
static int get_pcie_temp(struct xe_hwmon *hwmon, long *val)
{
struct xe_tile *root_tile = xe_device_get_root_tile(hwmon->xe);
u32 data = 0;
int ret;
ret = xe_pcode_read(root_tile, PCODE_MBOX(PCODE_THERMAL_INFO, READ_THERMAL_DATA,
PCIE_SENSOR_GROUP_ID), &data, NULL);
if (ret)
return ret;
/* Sensor offset is different for G21 */
if (hwmon->xe->info.subplatform != XE_SUBPLATFORM_BATTLEMAGE_G21)
data = REG_FIELD_GET(PCIE_SENSOR_MASK, data);
data = REG_FIELD_GET(TEMP_MASK, data);
*val = (s8)data * MILLIDEGREE_PER_DEGREE;
return 0;
}
/* I1 is exposed as power_crit or as curr_crit depending on bit 31 */
static int xe_hwmon_pcode_read_i1(const struct xe_hwmon *hwmon, u32 *uval)
{
@ -876,6 +899,7 @@ xe_hwmon_temp_is_visible(struct xe_hwmon *hwmon, u32 attr, int channel)
case CHANNEL_VRAM:
return hwmon->temp.limit[TEMP_LIMIT_MEM_SHUTDOWN] ? 0444 : 0;
case CHANNEL_MCTRL:
case CHANNEL_PCIE:
return hwmon->temp.count ? 0444 : 0;
default:
return 0;
@ -887,6 +911,7 @@ xe_hwmon_temp_is_visible(struct xe_hwmon *hwmon, u32 attr, int channel)
case CHANNEL_VRAM:
return hwmon->temp.limit[TEMP_LIMIT_MEM_CRIT] ? 0444 : 0;
case CHANNEL_MCTRL:
case CHANNEL_PCIE:
return hwmon->temp.count ? 0444 : 0;
default:
return 0;
@ -906,6 +931,7 @@ xe_hwmon_temp_is_visible(struct xe_hwmon *hwmon, u32 attr, int channel)
return xe_reg_is_valid(xe_hwmon_get_reg(hwmon, REG_TEMP,
channel)) ? 0444 : 0;
case CHANNEL_MCTRL:
case CHANNEL_PCIE:
return hwmon->temp.count ? 0444 : 0;
default:
return 0;
@ -933,6 +959,8 @@ xe_hwmon_temp_read(struct xe_hwmon *hwmon, u32 attr, int channel, long *val)
return 0;
case CHANNEL_MCTRL:
return get_mc_temp(hwmon, val);
case CHANNEL_PCIE:
return get_pcie_temp(hwmon, val);
default:
return -EOPNOTSUPP;
}
@ -940,6 +968,7 @@ xe_hwmon_temp_read(struct xe_hwmon *hwmon, u32 attr, int channel, long *val)
switch (channel) {
case CHANNEL_PKG:
case CHANNEL_MCTRL:
case CHANNEL_PCIE:
*val = hwmon->temp.limit[TEMP_LIMIT_PKG_SHUTDOWN] * MILLIDEGREE_PER_DEGREE;
return 0;
case CHANNEL_VRAM:
@ -952,6 +981,7 @@ xe_hwmon_temp_read(struct xe_hwmon *hwmon, u32 attr, int channel, long *val)
switch (channel) {
case CHANNEL_PKG:
case CHANNEL_MCTRL:
case CHANNEL_PCIE:
*val = hwmon->temp.limit[TEMP_LIMIT_PKG_CRIT] * MILLIDEGREE_PER_DEGREE;
return 0;
case CHANNEL_VRAM:
@ -1331,6 +1361,8 @@ static int xe_hwmon_read_label(struct device *dev,
*str = "vram";
else if (channel == CHANNEL_MCTRL)
*str = "mctrl";
else if (channel == CHANNEL_PCIE)
*str = "pcie";
return 0;
case hwmon_power:
case hwmon_energy:

View File

@ -54,6 +54,8 @@
#define READ_THERMAL_LIMITS 0x0
#define READ_THERMAL_CONFIG 0x1
#define READ_THERMAL_DATA 0x2
/* Group id passed along with READ_THERMAL_DATA to request the PCIe sensor data */
#define PCIE_SENSOR_GROUP_ID 0x2
/* Bits 31:16 of that reply; selected on subplatforms other than Battlemage G21 */
#define PCIE_SENSOR_MASK REG_GENMASK(31, 16)
#define PCODE_LATE_BINDING 0x5C
#define GET_CAPABILITY_STATUS 0x0