drm/amdgpu: unmap all user mappings of framebuffer and doorbell before mode1 reset

During Mode 1 reset, the ASIC undergoes a reset cycle and becomes temporarily
inaccessible via PCIe. Any attempt to access framebuffer or MMIO registers during
this window can result in uncompleted PCIe transactions, leading to NMI panics or
system hangs.

To prevent this, Unmap all of the applications mappings of the framebuffer
and doorbell BARs before mode1 reset. Also prevent new mappings from coming in
during the reset process.

v2: remove inode in kfd_dev (Christian)
v3: correct unmap offset (Felix), remove prevent new mappings part
to avoid deadlock (Christian)

Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>
Signed-off-by: Yifan Zhang <yifan1.zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
(cherry picked from commit 70cadefcc6160c575b04f763ada34c20e868d577)
This commit is contained in:
Yifan Zhang 2026-05-11 22:14:23 +08:00 committed by Alex Deucher
parent 6c92f6d960
commit 353f7430d1
5 changed files with 55 additions and 0 deletions

View File

@ -36,6 +36,9 @@
#include "amdgpu_ras.h"
#include "amdgpu_umc.h"
#include "amdgpu_reset.h"
#if IS_ENABLED(CONFIG_HSA_AMD)
#include "kfd_priv.h"
#endif
/* Total memory size in system memory and all GPU VRAM. Used to
* estimate worst case amount of memory to reserve for page tables
@ -320,6 +323,28 @@ void amdgpu_amdkfd_gpu_reset(struct amdgpu_device *adev)
(void)amdgpu_reset_domain_schedule(adev->reset_domain, &adev->kfd.reset_work);
}
void amdgpu_amdkfd_clear_kfd_mapping(struct amdgpu_device *adev)
{
#if IS_ENABLED(CONFIG_HSA_AMD)
struct kfd_dev *kfd = adev->kfd.dev;
unsigned int i;
if (!kfd)
return;
for (i = 0; i < kfd->num_nodes; i++) {
struct kfd_node *node = kfd->nodes[i];
kfd_dev_unmap_mapping_range(KFD_MMAP_TYPE_DOORBELL |
KFD_MMAP_GPU_ID(node->id),
kfd_doorbell_process_slice(kfd));
kfd_dev_unmap_mapping_range(KFD_MMAP_TYPE_MMIO |
KFD_MMAP_GPU_ID(node->id),
PAGE_SIZE);
}
#endif
}
int amdgpu_amdkfd_alloc_kernel_mem(struct amdgpu_device *adev, size_t size,
u32 domain, void **mem_obj, uint64_t *gpu_addr,
void **cpu_ptr, bool cp_mqd_gfx9)

View File

@ -358,6 +358,7 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
uint64_t size, u32 alloc_flag, int8_t xcp_id);
void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device *adev,
uint64_t size, u32 alloc_flag, int8_t xcp_id);
void amdgpu_amdkfd_clear_kfd_mapping(struct amdgpu_device *adev);
u64 amdgpu_amdkfd_xcp_memory_size(struct amdgpu_device *adev, int xcp_id);

View File

@ -5835,6 +5835,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
/* We need to lock reset domain only once both for XGMI and single device */
amdgpu_device_recovery_get_reset_lock(adev, &device_list);
/* unmap all the mappings of doorbell and framebuffer to prevent user space from
* accessing them
*/
unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
amdgpu_amdkfd_clear_kfd_mapping(adev);
amdgpu_device_halt_activities(adev, job, reset_context, &device_list,
hive, need_emergency_restart);
if (need_emergency_restart)

View File

@ -67,6 +67,21 @@ static const struct class kfd_class = {
.name = kfd_dev_name,
};
/*
* Cache the address space of the chardev on first open so that the reset
* path can drop all userspace mappings of doorbell and MMIO ranges via
* unmap_mapping_range().
*/
static struct address_space *kfd_dev_mapping;
void kfd_dev_unmap_mapping_range(loff_t const holebegin, loff_t const holelen)
{
struct address_space *mapping = READ_ONCE(kfd_dev_mapping);
if (mapping)
unmap_mapping_range(mapping, holebegin, holelen, 1);
}
static inline struct kfd_process_device *kfd_lock_pdd_by_id(struct kfd_process *p, __u32 gpu_id)
{
struct kfd_process_device *pdd;
@ -133,6 +148,13 @@ static int kfd_open(struct inode *inode, struct file *filep)
if (iminor(inode) != 0)
return -ENODEV;
/*
* /dev/kfd is a single chardev so all opens share one inode. Cache
* its address_space on the first open for use by the reset path.
*/
if (!READ_ONCE(kfd_dev_mapping))
cmpxchg(&kfd_dev_mapping, NULL, inode->i_mapping);
is_32bit_user_mode = in_compat_syscall();
if (is_32bit_user_mode) {

View File

@ -395,6 +395,7 @@ enum kfd_mempool {
/* Character device interface */
int kfd_chardev_init(void);
void kfd_chardev_exit(void);
void kfd_dev_unmap_mapping_range(loff_t const holebegin, loff_t const holelen);
/**
* enum kfd_unmap_queues_filter - Enum for queue filters.