diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile
index 4a7eaeeca293..a32d370c3d30 100644
--- a/drivers/gpu/drm/xe/Makefile
+++ b/drivers/gpu/drm/xe/Makefile
@@ -41,6 +41,7 @@ xe-y += xe_bb.o \
 	xe_device_sysfs.o \
 	xe_dma_buf.o \
 	xe_drm_client.o \
+	xe_drm_ras.o \
 	xe_eu_stall.o \
 	xe_exec.o \
 	xe_exec_queue.o \
diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
index e9032014923d..3e04e80e0815 100644
--- a/drivers/gpu/drm/xe/xe_device_types.h
+++ b/drivers/gpu/drm/xe/xe_device_types.h
@@ -13,6 +13,7 @@
 #include <drm/ttm/ttm_device.h>
 
 #include "xe_devcoredump_types.h"
+#include "xe_drm_ras_types.h"
 #include "xe_heci_gsc.h"
 #include "xe_late_bind_fw_types.h"
 #include "xe_oa_types.h"
@@ -511,6 +512,9 @@ struct xe_device {
 	/** @pmu: performance monitoring unit */
 	struct xe_pmu pmu;
 
+	/** @ras: RAS structure for device */
+	struct xe_drm_ras ras;
+
 	/** @i2c: I2C host controller */
 	struct xe_i2c *i2c;
 
diff --git a/drivers/gpu/drm/xe/xe_drm_ras.c b/drivers/gpu/drm/xe/xe_drm_ras.c
new file mode 100644
index 000000000000..e07dc23a155e
--- /dev/null
+++ b/drivers/gpu/drm/xe/xe_drm_ras.c
@@ -0,0 +1,186 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright © 2026 Intel Corporation
+ */
+
+#include <linux/slab.h>
+
+#include <drm/drm_managed.h>
+#include <drm/drm_print.h>
+#include <drm/drm_ras.h>
+
+#include "xe_device_types.h"
+#include "xe_drm_ras.h"
+
+static const char * const error_components[] = DRM_XE_RAS_ERROR_COMPONENT_NAMES;
+static const char * const error_severity[] = DRM_XE_RAS_ERROR_SEVERITY_NAMES;
+
+/*
+ * Look up one counter in a per-severity counter table.
+ *
+ * @info: counter table indexed by error id, may be NULL
+ * @error_id: error id to look up
+ * @name: set to the counter name on success
+ * @val: set to the current counter value on success
+ *
+ * Returns 0 on success, or -ENOENT when the table is missing, @error_id is
+ * outside the table, or the slot carries no name.
+ */
+static int hw_query_error_counter(struct xe_drm_ras_counter *info,
+				  u32 error_id, const char **name, u32 *val)
+{
+	/* Tables are allocated with DRM_XE_RAS_ERR_COMP_MAX slots; never index past them. */
+	if (!info || error_id >= DRM_XE_RAS_ERR_COMP_MAX || !info[error_id].name)
+		return -ENOENT;
+
+	*name = info[error_id].name;
+	*val = atomic_read(&info[error_id].counter);
+
+	return 0;
+}
+
+/*
+ * query_error_counter callback installed on the uncorrectable node: resolves
+ * the xe device from @ep->priv and reads @error_id from the uncorrectable table.
+ */
+static int query_uncorrectable_error_counter(struct drm_ras_node *ep, u32 error_id,
+					     const char **name, u32 *val)
+{
+	struct xe_device *xe = ep->priv;
+	struct xe_drm_ras *ras = &xe->ras;
+	struct xe_drm_ras_counter *info = ras->info[DRM_XE_RAS_ERR_SEV_UNCORRECTABLE];
+
+	return hw_query_error_counter(info, error_id, name, val);
+}
+
+/*
+ * query_error_counter callback installed on the correctable node: resolves
+ * the xe device from @ep->priv and reads @error_id from the correctable table.
+ */
+static int query_correctable_error_counter(struct drm_ras_node *ep, u32 error_id,
+					   const char **name, u32 *val)
+{
+	struct xe_device *xe = ep->priv;
+	struct xe_drm_ras *ras = &xe->ras;
+	struct xe_drm_ras_counter *info = ras->info[DRM_XE_RAS_ERR_SEV_CORRECTABLE];
+
+	return hw_query_error_counter(info, error_id, name, val);
+}
+
+/*
+ * Allocate a zeroed table of DRM_XE_RAS_ERR_COMP_MAX counters and give a name
+ * to every slot that has an entry in error_components[]. Slots without an
+ * entry keep a NULL name. @xe is currently unused.
+ *
+ * Returns the table, or ERR_PTR(-ENOMEM) if the allocation fails.
+ */
+static struct xe_drm_ras_counter *allocate_and_copy_counters(struct xe_device *xe)
+{
+	struct xe_drm_ras_counter *counter;
+	int i;
+
+	counter = kcalloc(DRM_XE_RAS_ERR_COMP_MAX, sizeof(*counter), GFP_KERNEL);
+	if (!counter)
+		return ERR_PTR(-ENOMEM);
+
+	for (i = DRM_XE_RAS_ERR_COMP_CORE_COMPUTE; i < DRM_XE_RAS_ERR_COMP_MAX; i++) {
+		if (!error_components[i])
+			continue;
+
+		counter[i].name = error_components[i];
+		atomic_set(&counter[i].counter, 0);
+	}
+
+	return counter;
+}
+
+/*
+ * Fill in @node for @severity and allocate the matching counter table in
+ * xe->ras.info[@severity].
+ *
+ * @xe: xe device instance, stored as the node's private data
+ * @node: node to populate
+ * @severity: severity this node reports; selects the node name and callback
+ *
+ * On failure after the device name was allocated, @node->device_name is left
+ * set; the caller releases it through cleanup_node_param().
+ *
+ * Returns 0 on success, -ENOMEM if either allocation fails.
+ */
+static int assign_node_params(struct xe_device *xe, struct drm_ras_node *node,
+			      const enum drm_xe_ras_error_severity severity)
+{
+	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
+	struct xe_drm_ras *ras = &xe->ras;
+	struct xe_drm_ras_counter *counters;
+	const char *device_name;
+
+	device_name = kasprintf(GFP_KERNEL, "%04x:%02x:%02x.%d",
+				pci_domain_nr(pdev->bus), pdev->bus->number,
+				PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
+
+	if (!device_name)
+		return -ENOMEM;
+
+	node->device_name = device_name;
+	node->node_name = error_severity[severity];
+	node->type = DRM_RAS_NODE_TYPE_ERROR_COUNTER;
+	node->error_counter_range.first = DRM_XE_RAS_ERR_COMP_CORE_COMPUTE;
+	node->error_counter_range.last = DRM_XE_RAS_ERR_COMP_MAX - 1;
+	node->priv = xe;
+
+	counters = allocate_and_copy_counters(xe);
+	if (IS_ERR(counters)) {
+		/*
+		 * Keep the slot NULL rather than an ERR_PTR: cleanup_node_param()
+		 * hands it straight to kfree().
+		 */
+		ras->info[severity] = NULL;
+		return PTR_ERR(counters);
+	}
+	ras->info[severity] = counters;
+
+	if (severity == DRM_XE_RAS_ERR_SEV_CORRECTABLE)
+		node->query_error_counter = query_correctable_error_counter;
+	else
+		node->query_error_counter = query_uncorrectable_error_counter;
+
+	return 0;
+}
+
+/*
+ * Free the counter table and device name owned by the node of @severity and
+ * clear both pointers so a repeated call is harmless.
+ */
+static void cleanup_node_param(struct xe_drm_ras *ras,
+			       const enum drm_xe_ras_error_severity severity)
+{
+	struct drm_ras_node *node = &ras->node[severity];
+
+	/* The slot may hold an ERR_PTR() if counter allocation failed; never kfree() that. */
+	if (!IS_ERR(ras->info[severity]))
+		kfree(ras->info[severity]);
+	ras->info[severity] = NULL;
+
+	kfree(node->device_name);
+	node->device_name = NULL;
+}
+
+/*
+ * Populate and register one DRM RAS node per error severity.
+ *
+ * If any step fails, everything set up so far is undone (including nodes
+ * registered by earlier iterations), so the caller never has to clean up
+ * after a failed call.
+ *
+ * Returns 0 on success, negative error code otherwise.
+ */
+static int register_nodes(struct xe_device *xe)
+{
+	struct xe_drm_ras *ras = &xe->ras;
+	int i, ret;
+
+	for_each_error_severity(i) {
+		struct drm_ras_node *node = &ras->node[i];
+
+		ret = assign_node_params(xe, node, i);
+		if (ret)
+			goto err_unwind;
+
+		ret = drm_ras_node_register(node);
+		if (ret)
+			goto err_unwind;
+	}
+
+	return 0;
+
+err_unwind:
+	/* Node i never got registered; only its parameters need freeing. */
+	cleanup_node_param(ras, i);
+
+	/* Every node below i was fully registered and must be torn down as well. */
+	while (i--) {
+		drm_ras_node_unregister(&ras->node[i]);
+		cleanup_node_param(ras, i);
+	}
+
+	return ret;
+}
+
+/*
+ * Release action added by xe_drm_ras_init(): unregisters every severity node
+ * and frees its parameters. @arg is the xe device; @device is unused.
+ */
+static void xe_drm_ras_unregister_nodes(struct drm_device *device, void *arg)
+{
+	struct xe_device *xe = arg;
+	struct xe_drm_ras *ras = &xe->ras;
+	int i;
+
+	for_each_error_severity(i) {
+		struct drm_ras_node *node = &ras->node[i];
+
+		drm_ras_node_unregister(node);
+		cleanup_node_param(ras, i);
+	}
+}
+
+/**
+ * xe_drm_ras_init() - Initialize DRM RAS
+ * @xe: xe device instance
+ *
+ * Allocate and register DRM RAS nodes per device
+ *
+ * Return: 0 on success, negative error code otherwise.
+ */
+int xe_drm_ras_init(struct xe_device *xe)
+{
+	struct xe_drm_ras *ras = &xe->ras;
+	struct drm_ras_node *node;
+	int err;
+
+	/* One node per severity; drm-managed, so no explicit free is needed here. */
+	node = drmm_kcalloc(&xe->drm, DRM_XE_RAS_ERR_SEV_MAX, sizeof(*node), GFP_KERNEL);
+	if (!node)
+		return -ENOMEM;
+
+	ras->node = node;
+
+	err = register_nodes(xe);
+	if (err) {
+		drm_err(&xe->drm, "Failed to register DRM RAS nodes (%pe)\n", ERR_PTR(err));
+		return err;
+	}
+
+	/* Hook teardown of the registered nodes into the drm device's managed release. */
+	err = drmm_add_action_or_reset(&xe->drm, xe_drm_ras_unregister_nodes, xe);
+	if (err) {
+		drm_err(&xe->drm, "Failed to add action for Xe DRM RAS (%pe)\n", ERR_PTR(err));
+		return err;
+	}
+
+	return 0;
+}
diff --git a/drivers/gpu/drm/xe/xe_drm_ras.h b/drivers/gpu/drm/xe/xe_drm_ras.h
new file mode 100644
index 000000000000..5cc8f0124411
--- /dev/null
+++ b/drivers/gpu/drm/xe/xe_drm_ras.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright © 2026 Intel Corporation
+ */
+#ifndef XE_DRM_RAS_H_
+#define XE_DRM_RAS_H_
+
+struct xe_device;
+
+/**
+ * for_each_error_severity() - Iterate over every DRM RAS error severity
+ * @i: integer lvalue used as loop cursor; runs from 0 to DRM_XE_RAS_ERR_SEV_MAX - 1
+ */
+#define for_each_error_severity(i)	\
+	for ((i) = 0; (i) < DRM_XE_RAS_ERR_SEV_MAX; (i)++)
+
+int xe_drm_ras_init(struct xe_device *xe);
+
+#endif
diff --git a/drivers/gpu/drm/xe/xe_drm_ras_types.h b/drivers/gpu/drm/xe/xe_drm_ras_types.h
new file mode 100644
index 000000000000..7acc5e7377b2
--- /dev/null
+++ b/drivers/gpu/drm/xe/xe_drm_ras_types.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright © 2026 Intel Corporation
+ */
+
+#ifndef _XE_DRM_RAS_TYPES_H_
+#define _XE_DRM_RAS_TYPES_H_
+
+#include <linux/atomic.h>
+#include <drm/xe_drm.h>
+
+struct drm_ras_node;
+
+/**
+ * struct xe_drm_ras_counter - XE RAS counter
+ *
+ * Pairs one error component name with the running count of that error.
+ */
+struct xe_drm_ras_counter {
+	/** @name: error component name */
+	const char *name;
+
+	/** @counter: count of error */
+	atomic_t counter;
+};
+
+/**
+ * struct xe_drm_ras - XE DRM RAS structure
+ *
+ * Holds the DRM RAS nodes of a device and the error counters behind them.
+ */
+struct xe_drm_ras {
+	/** @node: array of DRM RAS nodes, one per error severity */
+	struct drm_ras_node *node;
+
+	/** @info: per-severity counter tables, indexed by enum drm_xe_ras_error_severity */
+	struct xe_drm_ras_counter *info[DRM_XE_RAS_ERR_SEV_MAX];
+};
+
+#endif
diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h
index ef2565048bdf..b0264c32ceb2 100644
--- a/include/uapi/drm/xe_drm.h
+++ b/include/uapi/drm/xe_drm.h
@@ -2357,6 +2357,85 @@ struct drm_xe_exec_queue_set_property {
 	__u64 reserved[2];
 };
 
+/**
+ * DOC: Xe DRM RAS
+ *
+ * The enums and strings defined below map to the attributes of the DRM RAS Netlink Interface.
+ * Refer to Documentation/netlink/specs/drm_ras.yaml for the complete interface specification.
+ *
+ * Node Registration
+ * =================
+ *
+ * The driver registers one DRM RAS node for each error severity level.
+ * enum drm_xe_ras_error_severity defines the node-id, while DRM_XE_RAS_ERROR_SEVERITY_NAMES maps
+ * node-id to node-name.
+ *
+ * Error Classification
+ * ====================
+ *
+ * Each node contains a list of error counters. Each error is identified by an error-id and
+ * an error-name. enum drm_xe_ras_error_component defines the error-id, while
+ * DRM_XE_RAS_ERROR_COMPONENT_NAMES maps error-id to error-name.
+ *
+ * User Interface
+ * ==============
+ *
+ * To retrieve the error values of an error counter, userspace applications should
+ * follow the steps below:
+ *
+ * 1. Use command LIST_NODES to enumerate all available nodes
+ * 2. Select node by node-id or node-name
+ * 3. Use command GET_ERROR_COUNTERS to list errors of a specific node
+ * 4. Query specific error values using either error-id or error-name
+ *
+ * .. code-block:: C
+ *
+ *	// Lookup tables for ID-to-name resolution
+ *	static const char *nodes[] = DRM_XE_RAS_ERROR_SEVERITY_NAMES;
+ *	static const char *errors[] = DRM_XE_RAS_ERROR_COMPONENT_NAMES;
+ *
+ */
+
+/**
+ * enum drm_xe_ras_error_severity - DRM RAS error severity.
+ */
+enum drm_xe_ras_error_severity {
+	/** @DRM_XE_RAS_ERR_SEV_CORRECTABLE: Correctable Error */
+	DRM_XE_RAS_ERR_SEV_CORRECTABLE = 0,
+	/** @DRM_XE_RAS_ERR_SEV_UNCORRECTABLE: Uncorrectable Error */
+	DRM_XE_RAS_ERR_SEV_UNCORRECTABLE,
+	/** @DRM_XE_RAS_ERR_SEV_MAX: Number of severities; an array/loop bound, not a severity */
+	DRM_XE_RAS_ERR_SEV_MAX /* non-ABI */
+};
+
+/**
+ * enum drm_xe_ras_error_component - DRM RAS error component (error-ids start at 1).
+ */
+enum drm_xe_ras_error_component {
+	/** @DRM_XE_RAS_ERR_COMP_CORE_COMPUTE: Core Compute Error */
+	DRM_XE_RAS_ERR_COMP_CORE_COMPUTE = 1,
+	/** @DRM_XE_RAS_ERR_COMP_SOC_INTERNAL: SoC Internal Error */
+	DRM_XE_RAS_ERR_COMP_SOC_INTERNAL,
+	/** @DRM_XE_RAS_ERR_COMP_MAX: One past the last valid error component */
+	DRM_XE_RAS_ERR_COMP_MAX /* non-ABI */
+};
+
+/*
+ * Array initializer mapping each enum drm_xe_ras_error_severity value to its node-name.
+ */
+#define DRM_XE_RAS_ERROR_SEVERITY_NAMES {					\
+	[DRM_XE_RAS_ERR_SEV_CORRECTABLE] = "correctable-errors",			\
+	[DRM_XE_RAS_ERR_SEV_UNCORRECTABLE] = "uncorrectable-errors",			\
+}
+
+/*
+ * Array initializer mapping each enum drm_xe_ras_error_component value to its error-name.
+ */
+#define DRM_XE_RAS_ERROR_COMPONENT_NAMES {					\
+	[DRM_XE_RAS_ERR_COMP_CORE_COMPUTE] = "core-compute",				\
+	[DRM_XE_RAS_ERR_COMP_SOC_INTERNAL] = "soc-internal"				\
+}
+
 #if defined(__cplusplus)
 }
 #endif