drm/amdgpu: support ras critical address check

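Support RAS critical address check: keep a per-device list of critical
VRAM regions (currently the blocks backing the firmware reserved memory
BO) and add amdgpu_ras_check_critical_address() to test whether an
address falls inside one of them.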

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
YiPeng Chai 2025-07-24 15:34:29 +08:00 committed by Alex Deucher
parent d45c5e6845
commit f348691897
2 changed files with 103 additions and 0 deletions

View File

@ -143,6 +143,10 @@ static int amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
uint64_t addr);
static int amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
uint64_t addr);
static void amdgpu_ras_critical_region_init(struct amdgpu_device *adev);
static void amdgpu_ras_critical_region_fini(struct amdgpu_device *adev);
#ifdef CONFIG_X86_MCE_AMD
static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev);
struct mce_notifier_adev_list {
@ -3728,6 +3732,8 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
kfree(data);
mutex_unlock(&con->recovery_lock);
amdgpu_ras_critical_region_init(adev);
return 0;
}
/* recovery end */
@ -4157,6 +4163,9 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
con->init_task_pid = task_pid_nr(current);
get_task_comm(con->init_task_comm, current);
mutex_init(&con->critical_region_lock);
INIT_LIST_HEAD(&con->critical_region_head);
dev_info(adev->dev, "RAS INFO: ras initialized successfully, "
"hardware ability[%x] ras_mask[%x]\n",
adev->ras_hw_enabled, adev->ras_enabled);
@ -4436,6 +4445,9 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
if (!adev->ras_enabled || !con)
return 0;
amdgpu_ras_critical_region_fini(adev);
mutex_destroy(&con->critical_region_lock);
list_for_each_entry_safe(ras_node, tmp, &adev->ras_list, node) {
if (ras_node->ras_obj) {
obj = ras_node->ras_obj;
@ -5380,3 +5392,80 @@ bool amdgpu_ras_is_rma(struct amdgpu_device *adev)
return con->is_rma;
}
int amdgpu_ras_add_critical_region(struct amdgpu_device *adev,
struct amdgpu_bo *bo)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct amdgpu_vram_mgr_resource *vres;
struct ras_critical_region *region;
struct drm_buddy_block *block;
int ret = 0;
if (!bo || !bo->tbo.resource)
return -EINVAL;
vres = to_amdgpu_vram_mgr_resource(bo->tbo.resource);
mutex_lock(&con->critical_region_lock);
/* Check if the bo had been recorded */
list_for_each_entry(region, &con->critical_region_head, node)
if (region->bo == bo)
goto out;
/* Record new critical amdgpu bo */
list_for_each_entry(block, &vres->blocks, link) {
region = kzalloc(sizeof(*region), GFP_KERNEL);
if (!region) {
ret = -ENOMEM;
goto out;
}
region->bo = bo;
region->start = amdgpu_vram_mgr_block_start(block);
region->size = amdgpu_vram_mgr_block_size(block);
list_add_tail(&region->node, &con->critical_region_head);
}
out:
mutex_unlock(&con->critical_region_lock);
return ret;
}
/*
 * Register the firmware reserved memory BO of @adev as a RAS critical
 * region. The result of the registration is not propagated to the caller.
 */
static void amdgpu_ras_critical_region_init(struct amdgpu_device *adev)
{
	struct amdgpu_bo *fw_bo = adev->mman.fw_reserved_memory;

	amdgpu_ras_add_critical_region(adev, fw_bo);
}
static void amdgpu_ras_critical_region_fini(struct amdgpu_device *adev)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_critical_region *region, *tmp;
mutex_lock(&con->critical_region_lock);
list_for_each_entry_safe(region, tmp, &con->critical_region_head, node) {
list_del(&region->node);
kfree(region);
}
mutex_unlock(&con->critical_region_lock);
}
/**
 * amdgpu_ras_check_critical_address - test whether an address is critical
 * @adev: amdgpu device
 * @addr: address to look up, in the same address space as the recorded
 *        region start values
 *
 * Walks the RAS context's critical region list under critical_region_lock
 * and checks whether @addr lies in the half-open range
 * [start, start + size) of any recorded region.
 *
 * Return: true if @addr falls inside a recorded critical region, false
 * otherwise (including when the device has no RAS context).
 */
bool amdgpu_ras_check_critical_address(struct amdgpu_device *adev, uint64_t addr)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_critical_region *region;
	bool ret = false;

	/* Without a RAS context nothing can have been recorded */
	if (!con)
		return false;

	mutex_lock(&con->critical_region_lock);
	list_for_each_entry(region, &con->critical_region_head, node) {
		/*
		 * Compare the offset into the region against its size rather
		 * than computing start + size, which could wrap around for a
		 * region ending at the top of the 64-bit range.
		 */
		if (addr >= region->start &&
		    addr - region->start < region->size) {
			ret = true;
			break;
		}
	}
	mutex_unlock(&con->critical_region_lock);

	return ret;
}

View File

@ -496,6 +496,13 @@ struct ras_ecc_log_info {
uint64_t prev_de_queried_count;
};
/*
 * One recorded critical range. amdgpu_ras_add_critical_region() creates one
 * of these per block of a BO's VRAM manager resource, and
 * amdgpu_ras_check_critical_address() treats it as the half-open range
 * [start, start + size).
 */
struct ras_critical_region {
/* Link in amdgpu_ras.critical_region_head */
struct list_head node;
/* BO this range was recorded for; used to detect duplicate registration */
struct amdgpu_bo *bo;
/* Value of amdgpu_vram_mgr_block_start() for the block */
uint64_t start;
/* Value of amdgpu_vram_mgr_block_size() for the block (presumably bytes; TODO confirm) */
uint64_t size;
};
struct amdgpu_ras {
/* ras infrastructure */
/* for ras itself. */
@ -575,6 +582,10 @@ struct amdgpu_ras {
char init_task_comm[TASK_COMM_LEN];
int bad_page_num;
struct list_head critical_region_head;
struct mutex critical_region_lock;
};
struct ras_fs_data {
@ -979,6 +990,9 @@ int amdgpu_ras_mark_ras_event_caller(struct amdgpu_device *adev, enum ras_event_
int amdgpu_ras_reserve_page(struct amdgpu_device *adev, uint64_t pfn);
int amdgpu_ras_add_critical_region(struct amdgpu_device *adev, struct amdgpu_bo *bo);
bool amdgpu_ras_check_critical_address(struct amdgpu_device *adev, uint64_t addr);
int amdgpu_ras_put_poison_req(struct amdgpu_device *adev,
enum amdgpu_ras_block block, uint16_t pasid,
pasid_notify pasid_fn, void *data, uint32_t reset);