drm/amdgpu: rework userq reset work handling

It is illegal to schedule reset work from another reset work!

Fix this by scheduling the userq reset work directly on the work queue
of the reset domain.

Not fully tested, I leave that to the IGT test cases.

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Prike Liang <Prike.Liang@amd.com>
Reviewed-by: Sunil Khatri <sunil.khatri@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
(cherry picked from commit fd9200ccefab94f27877d1943761d6b0ccbd89c8)
This commit is contained in:
Christian König 2026-04-21 12:39:54 +02:00 committed by Alex Deucher
parent be045c5c83
commit c8ed2de0f2
4 changed files with 60 additions and 44 deletions

View File

@ -1190,7 +1190,6 @@ struct amdgpu_device {
bool apu_prefer_gtt;
bool userq_halt_for_enforce_isolation;
struct work_struct userq_reset_work;
struct amdgpu_uid *uid_info;
struct amdgpu_uma_carveout_info uma_info;

View File

@ -3787,7 +3787,6 @@ int amdgpu_device_init(struct amdgpu_device *adev,
}
INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
INIT_WORK(&adev->userq_reset_work, amdgpu_userq_reset_work);
amdgpu_coredump_init(adev);
@ -5478,7 +5477,7 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
if (!amdgpu_sriov_vf(adev))
cancel_work(&adev->reset_work);
#endif
cancel_work(&adev->userq_reset_work);
amdgpu_userq_mgr_cancel_reset_work(adev);
if (adev->kfd.dev)
cancel_work(&adev->kfd.reset_work);

View File

@ -82,19 +82,11 @@ static bool amdgpu_userq_is_reset_type_supported(struct amdgpu_device *adev,
return false;
}
static void amdgpu_userq_gpu_reset(struct amdgpu_device *adev)
{
if (amdgpu_device_should_recover_gpu(adev)) {
amdgpu_reset_domain_schedule(adev->reset_domain,
&adev->userq_reset_work);
/* Wait for the reset job to complete */
flush_work(&adev->userq_reset_work);
}
}
static int
amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
static void amdgpu_userq_mgr_reset_work(struct work_struct *work)
{
struct amdgpu_userq_mgr *uq_mgr =
container_of(work, struct amdgpu_userq_mgr,
reset_work);
struct amdgpu_device *adev = uq_mgr->adev;
const int queue_types[] = {
AMDGPU_RING_TYPE_COMPUTE,
@ -103,12 +95,11 @@ amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
};
const int num_queue_types = ARRAY_SIZE(queue_types);
bool gpu_reset = false;
int r = 0;
int i;
int i, r;
if (unlikely(adev->debug_disable_gpu_ring_reset)) {
dev_err(adev->dev, "userq reset disabled by debug mask\n");
return 0;
return;
}
/*
@ -116,7 +107,7 @@ amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
* skip all reset detection logic
*/
if (!amdgpu_gpu_recovery)
return 0;
return;
/*
* Iterate through all queue types to detect and reset problematic queues
@ -141,10 +132,19 @@ amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
}
}
if (gpu_reset)
amdgpu_userq_gpu_reset(adev);
if (gpu_reset) {
struct amdgpu_reset_context reset_context;
return r;
memset(&reset_context, 0, sizeof(reset_context));
reset_context.method = AMD_RESET_METHOD_NONE;
reset_context.reset_req_dev = adev;
reset_context.src = AMDGPU_RESET_SRC_USERQ;
set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
/*set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);*/
amdgpu_device_gpu_recover(adev, NULL, &reset_context);
}
}
static void amdgpu_userq_hang_detect_work(struct work_struct *work)
@ -153,7 +153,11 @@ static void amdgpu_userq_hang_detect_work(struct work_struct *work)
container_of(work, struct amdgpu_usermode_queue,
hang_detect_work.work);
amdgpu_userq_detect_and_reset_queues(queue->userq_mgr);
/*
* Don't schedule the work here! Scheduling or queue work from one reset
* handler to another is illegal if you don't take extra precautions!
*/
amdgpu_userq_mgr_reset_work(&queue->userq_mgr->reset_work);
}
/*
@ -182,8 +186,8 @@ void amdgpu_userq_start_hang_detect_work(struct amdgpu_usermode_queue *queue)
break;
}
schedule_delayed_work(&queue->hang_detect_work,
msecs_to_jiffies(timeout_ms));
queue_delayed_work(adev->reset_domain->wq, &queue->hang_detect_work,
msecs_to_jiffies(timeout_ms));
}
void amdgpu_userq_process_fence_irq(struct amdgpu_device *adev, u32 doorbell)
@ -1259,28 +1263,13 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
if (ret) {
drm_file_err(uq_mgr->file,
"Couldn't unmap all the queues, eviction failed ret=%d\n", ret);
amdgpu_userq_detect_and_reset_queues(uq_mgr);
amdgpu_reset_domain_schedule(uq_mgr->adev->reset_domain,
&uq_mgr->reset_work);
flush_work(&uq_mgr->reset_work);
}
return ret;
}
void amdgpu_userq_reset_work(struct work_struct *work)
{
struct amdgpu_device *adev = container_of(work, struct amdgpu_device,
userq_reset_work);
struct amdgpu_reset_context reset_context;
memset(&reset_context, 0, sizeof(reset_context));
reset_context.method = AMD_RESET_METHOD_NONE;
reset_context.reset_req_dev = adev;
reset_context.src = AMDGPU_RESET_SRC_USERQ;
set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
/*set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);*/
amdgpu_device_gpu_recover(adev, NULL, &reset_context);
}
static void
amdgpu_userq_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr)
{
@ -1314,9 +1303,24 @@ int amdgpu_userq_mgr_init(struct amdgpu_userq_mgr *userq_mgr, struct drm_file *f
userq_mgr->file = file_priv;
INIT_DELAYED_WORK(&userq_mgr->resume_work, amdgpu_userq_restore_worker);
INIT_WORK(&userq_mgr->reset_work, amdgpu_userq_mgr_reset_work);
return 0;
}
void amdgpu_userq_mgr_cancel_reset_work(struct amdgpu_device *adev)
{
struct xarray *xa = &adev->userq_doorbell_xa;
struct amdgpu_usermode_queue *queue;
unsigned long flags, queue_id;
xa_lock_irqsave(xa, flags);
xa_for_each(xa, queue_id, queue) {
cancel_delayed_work(&queue->hang_detect_work);
cancel_work(&queue->userq_mgr->reset_work);
}
xa_unlock_irqrestore(xa, flags);
}
void amdgpu_userq_mgr_cancel_resume(struct amdgpu_userq_mgr *userq_mgr)
{
cancel_delayed_work_sync(&userq_mgr->resume_work);

View File

@ -84,7 +84,13 @@ struct amdgpu_usermode_queue {
u32 xcp_id;
int priority;
struct dentry *debugfs_queue;
struct delayed_work hang_detect_work;
/**
* @hang_detect_work:
*
* Delayed work which runs when userq_fences time out.
*/
struct delayed_work hang_detect_work;
struct kref refcount;
struct list_head userq_va_list;
@ -116,6 +122,13 @@ struct amdgpu_userq_mgr {
struct amdgpu_device *adev;
struct delayed_work resume_work;
struct drm_file *file;
/**
* @reset_work:
*
* Reset work which is used when eviction fails.
*/
struct work_struct reset_work;
atomic_t userq_count[AMDGPU_RING_TYPE_MAX];
};
@ -134,6 +147,7 @@ int amdgpu_userq_ioctl(struct drm_device *dev, void *data, struct drm_file *filp
int amdgpu_userq_mgr_init(struct amdgpu_userq_mgr *userq_mgr, struct drm_file *file_priv,
struct amdgpu_device *adev);
void amdgpu_userq_mgr_cancel_reset_work(struct amdgpu_device *adev);
void amdgpu_userq_mgr_cancel_resume(struct amdgpu_userq_mgr *userq_mgr);
void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr *userq_mgr);