drm/amdgpu: remove almost all calls to amdgpu_userq_detect_and_reset_queues

Well the reset handling seems broken on multiple levels.

As first step of fixing this remove most calls to the hang detection.
That function should only be called after we run into a timeout! And *NOT*
as random check spread over the code in multiple places.

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Sunil Khatri <sunil.khatri@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
(cherry picked from commit 71bea36b54ccfb14cbc90f94267af6369af4e702)
This commit is contained in:
Christian König 2026-04-20 15:13:57 +02:00 committed by Alex Deucher
parent 44e5bc73bd
commit d0053441ad

View File

@ -345,23 +345,18 @@ static int amdgpu_userq_preempt_helper(struct amdgpu_usermode_queue *queue)
struct amdgpu_device *adev = uq_mgr->adev;
const struct amdgpu_userq_funcs *userq_funcs =
adev->userq_funcs[queue->queue_type];
bool found_hung_queue = false;
int r = 0;
int r;
if (queue->state == AMDGPU_USERQ_STATE_MAPPED) {
r = userq_funcs->preempt(queue);
if (r) {
queue->state = AMDGPU_USERQ_STATE_HUNG;
found_hung_queue = true;
return r;
} else {
queue->state = AMDGPU_USERQ_STATE_PREEMPTED;
}
}
if (found_hung_queue)
amdgpu_userq_detect_and_reset_queues(uq_mgr);
return r;
return 0;
}
static int amdgpu_userq_restore_helper(struct amdgpu_usermode_queue *queue)
@ -390,24 +385,21 @@ static int amdgpu_userq_unmap_helper(struct amdgpu_usermode_queue *queue)
struct amdgpu_device *adev = uq_mgr->adev;
const struct amdgpu_userq_funcs *userq_funcs =
adev->userq_funcs[queue->queue_type];
bool found_hung_queue = false;
int r = 0;
int r;
if ((queue->state == AMDGPU_USERQ_STATE_MAPPED) ||
(queue->state == AMDGPU_USERQ_STATE_PREEMPTED)) {
(queue->state == AMDGPU_USERQ_STATE_PREEMPTED)) {
r = userq_funcs->unmap(queue);
if (r) {
queue->state = AMDGPU_USERQ_STATE_HUNG;
found_hung_queue = true;
return r;
} else {
queue->state = AMDGPU_USERQ_STATE_UNMAPPED;
}
}
if (found_hung_queue)
amdgpu_userq_detect_and_reset_queues(uq_mgr);
return r;
return 0;
}
static int amdgpu_userq_map_helper(struct amdgpu_usermode_queue *queue)
@ -416,19 +408,19 @@ static int amdgpu_userq_map_helper(struct amdgpu_usermode_queue *queue)
struct amdgpu_device *adev = uq_mgr->adev;
const struct amdgpu_userq_funcs *userq_funcs =
adev->userq_funcs[queue->queue_type];
int r = 0;
int r;
if (queue->state == AMDGPU_USERQ_STATE_UNMAPPED) {
r = userq_funcs->map(queue);
if (r) {
queue->state = AMDGPU_USERQ_STATE_HUNG;
amdgpu_userq_detect_and_reset_queues(uq_mgr);
return r;
} else {
queue->state = AMDGPU_USERQ_STATE_MAPPED;
}
}
return r;
return 0;
}
static void amdgpu_userq_wait_for_last_fence(struct amdgpu_usermode_queue *queue)
@ -654,7 +646,6 @@ amdgpu_userq_destroy(struct amdgpu_userq_mgr *uq_mgr, struct amdgpu_usermode_que
#if defined(CONFIG_DEBUG_FS)
debugfs_remove_recursive(queue->debugfs_queue);
#endif
amdgpu_userq_detect_and_reset_queues(uq_mgr);
r = amdgpu_userq_unmap_helper(queue);
atomic_dec(&uq_mgr->userq_count[queue->queue_type]);
amdgpu_userq_cleanup(queue);
@ -1264,7 +1255,6 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
unsigned long queue_id;
int ret = 0, r;
amdgpu_userq_detect_and_reset_queues(uq_mgr);
/* Try to unmap all the queues in this process ctx */
xa_for_each(&uq_mgr->userq_xa, queue_id, queue) {
r = amdgpu_userq_preempt_helper(queue);
@ -1272,9 +1262,11 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
ret = r;
}
if (ret)
if (ret) {
drm_file_err(uq_mgr->file,
"Couldn't unmap all the queues, eviction failed ret=%d\n", ret);
amdgpu_userq_detect_and_reset_queues(uq_mgr);
}
return ret;
}
@ -1374,7 +1366,6 @@ int amdgpu_userq_suspend(struct amdgpu_device *adev)
uqm = queue->userq_mgr;
cancel_delayed_work_sync(&uqm->resume_work);
guard(mutex)(&uqm->userq_mutex);
amdgpu_userq_detect_and_reset_queues(uqm);
if (adev->in_s0ix)
r = amdgpu_userq_preempt_helper(queue);
else
@ -1433,7 +1424,6 @@ int amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev,
if (((queue->queue_type == AMDGPU_HW_IP_GFX) ||
(queue->queue_type == AMDGPU_HW_IP_COMPUTE)) &&
(queue->xcp_id == idx)) {
amdgpu_userq_detect_and_reset_queues(uqm);
r = amdgpu_userq_preempt_helper(queue);
if (r)
ret = r;