From d0053441ad7eaf7920b71e2263097ece53c5af34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Mon, 20 Apr 2026 15:13:57 +0200 Subject: [PATCH] drm/amdgpu: remove almost all calls to amdgpu_userq_detect_and_reset_queues MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Well the reset handling seems broken on multiple levels. As first step of fixing this remove most calls to the hang detection. That function should only be called after we run into a timeout! And *NOT* as random check spread over the code in multiple places. Signed-off-by: Christian König Reviewed-by: Sunil Khatri Signed-off-by: Alex Deucher (cherry picked from commit 71bea36b54ccfb14cbc90f94267af6369af4e702) --- drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 38 +++++++++-------------- 1 file changed, 14 insertions(+), 24 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c index 2d4f159f3362..ba03d2a42e1e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c @@ -345,23 +345,18 @@ static int amdgpu_userq_preempt_helper(struct amdgpu_usermode_queue *queue) struct amdgpu_device *adev = uq_mgr->adev; const struct amdgpu_userq_funcs *userq_funcs = adev->userq_funcs[queue->queue_type]; - bool found_hung_queue = false; - int r = 0; + int r; if (queue->state == AMDGPU_USERQ_STATE_MAPPED) { r = userq_funcs->preempt(queue); if (r) { queue->state = AMDGPU_USERQ_STATE_HUNG; - found_hung_queue = true; + return r; } else { queue->state = AMDGPU_USERQ_STATE_PREEMPTED; } } - - if (found_hung_queue) - amdgpu_userq_detect_and_reset_queues(uq_mgr); - - return r; + return 0; } static int amdgpu_userq_restore_helper(struct amdgpu_usermode_queue *queue) @@ -390,24 +385,21 @@ static int amdgpu_userq_unmap_helper(struct amdgpu_usermode_queue *queue) struct amdgpu_device *adev = uq_mgr->adev; const struct amdgpu_userq_funcs *userq_funcs = adev->userq_funcs[queue->queue_type]; - bool found_hung_queue = false; - int r = 0; + int r; if ((queue->state == AMDGPU_USERQ_STATE_MAPPED) || - (queue->state == AMDGPU_USERQ_STATE_PREEMPTED)) { + (queue->state == AMDGPU_USERQ_STATE_PREEMPTED)) { + r = userq_funcs->unmap(queue); if (r) { queue->state = AMDGPU_USERQ_STATE_HUNG; - found_hung_queue = true; + return r; } else { queue->state = AMDGPU_USERQ_STATE_UNMAPPED; } } - if (found_hung_queue) - amdgpu_userq_detect_and_reset_queues(uq_mgr); - - return r; + return 0; } static int amdgpu_userq_map_helper(struct amdgpu_usermode_queue *queue) @@ -416,19 +408,19 @@ static int amdgpu_userq_map_helper(struct amdgpu_usermode_queue *queue) struct amdgpu_device *adev = uq_mgr->adev; const struct amdgpu_userq_funcs *userq_funcs = adev->userq_funcs[queue->queue_type]; - int r = 0; + int r; if (queue->state == AMDGPU_USERQ_STATE_UNMAPPED) { r = userq_funcs->map(queue); if (r) { queue->state = AMDGPU_USERQ_STATE_HUNG; - amdgpu_userq_detect_and_reset_queues(uq_mgr); + return r; } else { queue->state = AMDGPU_USERQ_STATE_MAPPED; } } - return r; + return 0; } static void amdgpu_userq_wait_for_last_fence(struct amdgpu_usermode_queue *queue) @@ -654,7 +646,6 @@ amdgpu_userq_destroy(struct amdgpu_userq_mgr *uq_mgr, struct amdgpu_usermode_que #if defined(CONFIG_DEBUG_FS) debugfs_remove_recursive(queue->debugfs_queue); #endif - amdgpu_userq_detect_and_reset_queues(uq_mgr); r = amdgpu_userq_unmap_helper(queue); atomic_dec(&uq_mgr->userq_count[queue->queue_type]); amdgpu_userq_cleanup(queue); @@ -1264,7 +1255,6 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr) unsigned long queue_id; int ret = 0, r; - amdgpu_userq_detect_and_reset_queues(uq_mgr); /* Try to unmap all the queues in this process ctx */ xa_for_each(&uq_mgr->userq_xa, queue_id, queue) { r = amdgpu_userq_preempt_helper(queue); @@ -1272,9 +1262,11 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr) ret = r; } - if (ret) + if (ret) { drm_file_err(uq_mgr->file, "Couldn't unmap all the queues, eviction failed ret=%d\n", ret); + amdgpu_userq_detect_and_reset_queues(uq_mgr); + } return ret; } @@ -1374,7 +1366,6 @@ int amdgpu_userq_suspend(struct amdgpu_device *adev) uqm = queue->userq_mgr; cancel_delayed_work_sync(&uqm->resume_work); guard(mutex)(&uqm->userq_mutex); - amdgpu_userq_detect_and_reset_queues(uqm); if (adev->in_s0ix) r = amdgpu_userq_preempt_helper(queue); else @@ -1433,7 +1424,6 @@ int amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev, if (((queue->queue_type == AMDGPU_HW_IP_GFX) || (queue->queue_type == AMDGPU_HW_IP_COMPUTE)) && (queue->xcp_id == idx)) { - amdgpu_userq_detect_and_reset_queues(uqm); r = amdgpu_userq_preempt_helper(queue); if (r) ret = r;