drm/amdgpu: move scheduler wqueue handling into callbacks

Move the stopping and starting of the scheduler work queue (wqueue)
out of the common job-timeout path and into the per-IP ring reset
callbacks.  On some IPs we have to reset an entire engine, which may
have multiple queues.  Handling the wqueues in the backend lets each
IP stop and restart every affected queue as needed, based on the type
of reset available.

Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
Alex Deucher 2025-06-16 17:45:05 -04:00
parent 43ca5eb94b
commit 38b20968f3
19 changed files with 55 additions and 21 deletions

View File

@ -135,17 +135,9 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
} else if (amdgpu_gpu_recovery && ring->funcs->reset) {
dev_err(adev->dev, "Starting %s ring reset\n",
s_job->sched->name);
/*
* Stop the scheduler to prevent anybody else from touching the
* ring buffer.
*/
drm_sched_wqueue_stop(&ring->sched);
r = amdgpu_ring_reset(ring, job->vmid, NULL);
if (!r) {
atomic_inc(&ring->adev->gpu_reset_counter);
drm_sched_wqueue_start(&ring->sched);
dev_err(adev->dev, "Ring %s reset succeeded\n",
ring->sched.name);
drm_dev_wedged_event(adev_to_drm(adev),

View File

@ -554,22 +554,16 @@ int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id)
struct amdgpu_sdma_instance *sdma_instance = &adev->sdma.instance[instance_id];
struct amdgpu_ring *gfx_ring = &sdma_instance->ring;
struct amdgpu_ring *page_ring = &sdma_instance->page;
bool gfx_sched_stopped = false, page_sched_stopped = false;
mutex_lock(&sdma_instance->engine_reset_mutex);
/* Stop the scheduler's work queue for the GFX and page rings if they are running.
* This ensures that no new tasks are submitted to the queues while
* the reset is in progress.
*/
if (!amdgpu_ring_sched_ready(gfx_ring)) {
drm_sched_wqueue_stop(&gfx_ring->sched);
gfx_sched_stopped = true;
}
drm_sched_wqueue_stop(&gfx_ring->sched);
if (adev->sdma.has_page_queue && !amdgpu_ring_sched_ready(page_ring)) {
if (adev->sdma.has_page_queue)
drm_sched_wqueue_stop(&page_ring->sched);
page_sched_stopped = true;
}
if (sdma_instance->funcs->stop_kernel_queue) {
sdma_instance->funcs->stop_kernel_queue(gfx_ring);
@ -596,12 +590,9 @@ int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id)
* to be submitted to the queues after the reset is complete.
*/
if (!ret) {
if (gfx_sched_stopped && amdgpu_ring_sched_ready(gfx_ring)) {
drm_sched_wqueue_start(&gfx_ring->sched);
}
if (page_sched_stopped && amdgpu_ring_sched_ready(page_ring)) {
drm_sched_wqueue_start(&gfx_ring->sched);
if (adev->sdma.has_page_queue)
drm_sched_wqueue_start(&page_ring->sched);
}
}
mutex_unlock(&sdma_instance->engine_reset_mutex);

View File

@ -9540,6 +9540,8 @@ static int gfx_v10_0_reset_kgq(struct amdgpu_ring *ring,
if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues)
return -EINVAL;
drm_sched_wqueue_stop(&ring->sched);
spin_lock_irqsave(&kiq->ring_lock, flags);
if (amdgpu_ring_alloc(kiq_ring, 5 + 7 + 7 + kiq->pmf->map_queues_size)) {
@ -9581,6 +9583,7 @@ static int gfx_v10_0_reset_kgq(struct amdgpu_ring *ring,
if (r)
return r;
amdgpu_fence_driver_force_completion(ring);
drm_sched_wqueue_start(&ring->sched);
return 0;
}
@ -9600,6 +9603,8 @@ static int gfx_v10_0_reset_kcq(struct amdgpu_ring *ring,
if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues)
return -EINVAL;
drm_sched_wqueue_stop(&ring->sched);
spin_lock_irqsave(&kiq->ring_lock, flags);
if (amdgpu_ring_alloc(kiq_ring, kiq->pmf->unmap_queues_size)) {
@ -9658,6 +9663,7 @@ static int gfx_v10_0_reset_kcq(struct amdgpu_ring *ring,
if (r)
return r;
amdgpu_fence_driver_force_completion(ring);
drm_sched_wqueue_start(&ring->sched);
return 0;
}

View File

@ -6821,6 +6821,8 @@ static int gfx_v11_0_reset_kgq(struct amdgpu_ring *ring,
if (amdgpu_sriov_vf(adev))
return -EINVAL;
drm_sched_wqueue_stop(&ring->sched);
r = amdgpu_mes_reset_legacy_queue(ring->adev, ring, vmid, false);
if (r) {
@ -6846,6 +6848,7 @@ static int gfx_v11_0_reset_kgq(struct amdgpu_ring *ring,
if (r)
return r;
amdgpu_fence_driver_force_completion(ring);
drm_sched_wqueue_start(&ring->sched);
return 0;
}
@ -6989,6 +6992,8 @@ static int gfx_v11_0_reset_kcq(struct amdgpu_ring *ring,
if (amdgpu_sriov_vf(adev))
return -EINVAL;
drm_sched_wqueue_stop(&ring->sched);
r = amdgpu_mes_reset_legacy_queue(ring->adev, ring, vmid, true);
if (r) {
dev_warn(adev->dev, "fail(%d) to reset kcq and try pipe reset\n", r);
@ -7012,6 +7017,7 @@ static int gfx_v11_0_reset_kcq(struct amdgpu_ring *ring,
if (r)
return r;
amdgpu_fence_driver_force_completion(ring);
drm_sched_wqueue_start(&ring->sched);
return 0;
}

View File

@ -5317,6 +5317,8 @@ static int gfx_v12_0_reset_kgq(struct amdgpu_ring *ring,
if (amdgpu_sriov_vf(adev))
return -EINVAL;
drm_sched_wqueue_stop(&ring->sched);
r = amdgpu_mes_reset_legacy_queue(ring->adev, ring, vmid, false);
if (r) {
dev_warn(adev->dev, "reset via MES failed and try pipe reset %d\n", r);
@ -5341,6 +5343,7 @@ static int gfx_v12_0_reset_kgq(struct amdgpu_ring *ring,
if (r)
return r;
amdgpu_fence_driver_force_completion(ring);
drm_sched_wqueue_start(&ring->sched);
return 0;
}
@ -5437,6 +5440,8 @@ static int gfx_v12_0_reset_kcq(struct amdgpu_ring *ring,
if (amdgpu_sriov_vf(adev))
return -EINVAL;
drm_sched_wqueue_stop(&ring->sched);
r = amdgpu_mes_reset_legacy_queue(ring->adev, ring, vmid, true);
if (r) {
dev_warn(adev->dev, "fail(%d) to reset kcq and try pipe reset\n", r);
@ -5460,6 +5465,7 @@ static int gfx_v12_0_reset_kcq(struct amdgpu_ring *ring,
if (r)
return r;
amdgpu_fence_driver_force_completion(ring);
drm_sched_wqueue_start(&ring->sched);
return 0;
}

View File

@ -7187,6 +7187,8 @@ static int gfx_v9_0_reset_kcq(struct amdgpu_ring *ring,
if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues)
return -EINVAL;
drm_sched_wqueue_stop(&ring->sched);
spin_lock_irqsave(&kiq->ring_lock, flags);
if (amdgpu_ring_alloc(kiq_ring, kiq->pmf->unmap_queues_size)) {
@ -7247,6 +7249,7 @@ static int gfx_v9_0_reset_kcq(struct amdgpu_ring *ring,
if (r)
return r;
amdgpu_fence_driver_force_completion(ring);
drm_sched_wqueue_start(&ring->sched);
return 0;
}

View File

@ -3567,6 +3567,8 @@ static int gfx_v9_4_3_reset_kcq(struct amdgpu_ring *ring,
if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues)
return -EINVAL;
drm_sched_wqueue_stop(&ring->sched);
spin_lock_irqsave(&kiq->ring_lock, flags);
if (amdgpu_ring_alloc(kiq_ring, kiq->pmf->unmap_queues_size)) {
@ -3625,6 +3627,7 @@ static int gfx_v9_4_3_reset_kcq(struct amdgpu_ring *ring,
if (r)
return r;
amdgpu_fence_driver_force_completion(ring);
drm_sched_wqueue_start(&ring->sched);
return 0;
}

View File

@ -770,12 +770,14 @@ static int jpeg_v2_0_ring_reset(struct amdgpu_ring *ring,
{
int r;
drm_sched_wqueue_stop(&ring->sched);
jpeg_v2_0_stop(ring->adev);
jpeg_v2_0_start(ring->adev);
r = amdgpu_ring_test_helper(ring);
if (r)
return r;
amdgpu_fence_driver_force_completion(ring);
drm_sched_wqueue_start(&ring->sched);
return 0;
}

View File

@ -649,12 +649,14 @@ static int jpeg_v2_5_ring_reset(struct amdgpu_ring *ring,
{
int r;
drm_sched_wqueue_stop(&ring->sched);
jpeg_v2_5_stop_inst(ring->adev, ring->me);
jpeg_v2_5_start_inst(ring->adev, ring->me);
r = amdgpu_ring_test_helper(ring);
if (r)
return r;
amdgpu_fence_driver_force_completion(ring);
drm_sched_wqueue_start(&ring->sched);
return 0;
}

View File

@ -561,12 +561,14 @@ static int jpeg_v3_0_ring_reset(struct amdgpu_ring *ring,
{
int r;
drm_sched_wqueue_stop(&ring->sched);
jpeg_v3_0_stop(ring->adev);
jpeg_v3_0_start(ring->adev);
r = amdgpu_ring_test_helper(ring);
if (r)
return r;
amdgpu_fence_driver_force_completion(ring);
drm_sched_wqueue_start(&ring->sched);
return 0;
}

View File

@ -729,12 +729,14 @@ static int jpeg_v4_0_ring_reset(struct amdgpu_ring *ring,
if (amdgpu_sriov_vf(ring->adev))
return -EINVAL;
drm_sched_wqueue_stop(&ring->sched);
jpeg_v4_0_stop(ring->adev);
jpeg_v4_0_start(ring->adev);
r = amdgpu_ring_test_helper(ring);
if (r)
return r;
amdgpu_fence_driver_force_completion(ring);
drm_sched_wqueue_start(&ring->sched);
return 0;
}

View File

@ -1152,12 +1152,14 @@ static int jpeg_v4_0_3_ring_reset(struct amdgpu_ring *ring,
if (amdgpu_sriov_vf(ring->adev))
return -EOPNOTSUPP;
drm_sched_wqueue_stop(&ring->sched);
jpeg_v4_0_3_core_stall_reset(ring);
jpeg_v4_0_3_start_jrbc(ring);
r = amdgpu_ring_test_helper(ring);
if (r)
return r;
amdgpu_fence_driver_force_completion(ring);
drm_sched_wqueue_start(&ring->sched);
return 0;
}

View File

@ -843,12 +843,14 @@ static int jpeg_v5_0_1_ring_reset(struct amdgpu_ring *ring,
if (amdgpu_sriov_vf(ring->adev))
return -EOPNOTSUPP;
drm_sched_wqueue_stop(&ring->sched);
jpeg_v5_0_1_core_stall_reset(ring);
jpeg_v5_0_1_init_jrbc(ring);
r = amdgpu_ring_test_helper(ring);
if (r)
return r;
amdgpu_fence_driver_force_completion(ring);
drm_sched_wqueue_start(&ring->sched);
return 0;
}

View File

@ -1570,6 +1570,8 @@ static int sdma_v6_0_reset_queue(struct amdgpu_ring *ring,
return -EINVAL;
}
drm_sched_wqueue_stop(&ring->sched);
r = amdgpu_mes_reset_legacy_queue(adev, ring, vmid, true);
if (r)
return r;
@ -1578,6 +1580,7 @@ static int sdma_v6_0_reset_queue(struct amdgpu_ring *ring,
if (r)
return r;
amdgpu_fence_driver_force_completion(ring);
drm_sched_wqueue_start(&ring->sched);
return 0;
}

View File

@ -822,6 +822,8 @@ static int sdma_v7_0_reset_queue(struct amdgpu_ring *ring,
return -EINVAL;
}
drm_sched_wqueue_stop(&ring->sched);
r = amdgpu_mes_reset_legacy_queue(adev, ring, vmid, true);
if (r)
return r;
@ -830,6 +832,7 @@ static int sdma_v7_0_reset_queue(struct amdgpu_ring *ring,
if (r)
return r;
amdgpu_fence_driver_force_completion(ring);
drm_sched_wqueue_start(&ring->sched);
return 0;
}

View File

@ -1978,6 +1978,7 @@ static int vcn_v4_0_ring_reset(struct amdgpu_ring *ring,
if (!(adev->vcn.supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE))
return -EOPNOTSUPP;
drm_sched_wqueue_stop(&ring->sched);
vcn_v4_0_stop(vinst);
vcn_v4_0_start(vinst);
@ -1985,6 +1986,7 @@ static int vcn_v4_0_ring_reset(struct amdgpu_ring *ring,
if (r)
return r;
amdgpu_fence_driver_force_completion(ring);
drm_sched_wqueue_start(&ring->sched);
return 0;
}

View File

@ -1609,6 +1609,8 @@ static int vcn_v4_0_3_ring_reset(struct amdgpu_ring *ring,
if (!(adev->vcn.supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE))
return -EOPNOTSUPP;
drm_sched_wqueue_stop(&ring->sched);
vcn_inst = GET_INST(VCN, ring->me);
r = amdgpu_dpm_reset_vcn(adev, 1 << vcn_inst);
@ -1626,6 +1628,7 @@ static int vcn_v4_0_3_ring_reset(struct amdgpu_ring *ring,
if (r)
return r;
amdgpu_fence_driver_force_completion(ring);
drm_sched_wqueue_start(&ring->sched);
return 0;
}

View File

@ -1476,6 +1476,7 @@ static int vcn_v4_0_5_ring_reset(struct amdgpu_ring *ring,
if (!(adev->vcn.supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE))
return -EOPNOTSUPP;
drm_sched_wqueue_stop(&ring->sched);
vcn_v4_0_5_stop(vinst);
vcn_v4_0_5_start(vinst);
@ -1483,6 +1484,7 @@ static int vcn_v4_0_5_ring_reset(struct amdgpu_ring *ring,
if (r)
return r;
amdgpu_fence_driver_force_completion(ring);
drm_sched_wqueue_start(&ring->sched);
return 0;
}

View File

@ -1203,6 +1203,7 @@ static int vcn_v5_0_0_ring_reset(struct amdgpu_ring *ring,
if (!(adev->vcn.supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE))
return -EOPNOTSUPP;
drm_sched_wqueue_stop(&ring->sched);
vcn_v5_0_0_stop(vinst);
vcn_v5_0_0_start(vinst);
@ -1210,6 +1211,7 @@ static int vcn_v5_0_0_ring_reset(struct amdgpu_ring *ring,
if (r)
return r;
amdgpu_fence_driver_force_completion(ring);
drm_sched_wqueue_start(&ring->sched);
return 0;
}