drm/amdgpu: set an error on all fences from a bad context

When we backup ring contents to reemit after a queue reset,
we don't backup ring contents from the bad context.  When
we signal the fences, we should set an error on those
fences as well.

v2: misc cleanups
v3: add locking for fence error, fix comment (Christian)
v4: fix wrap around, locking (Christian)

Fixes: 77cc0da39c ("drm/amdgpu: track ring state associated with a fence")
Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
Alex Deucher 2025-09-03 13:48:23 -04:00
parent 1f22fcb88b
commit ff780f4f80
3 changed files with 37 additions and 6 deletions

View File

@ -758,11 +758,42 @@ void amdgpu_fence_driver_force_completion(struct amdgpu_ring *ring)
* @fence: fence of the ring to signal
*
*/
void amdgpu_fence_driver_guilty_force_completion(struct amdgpu_fence *fence)
void amdgpu_fence_driver_guilty_force_completion(struct amdgpu_fence *af)
{
dma_fence_set_error(&fence->base, -ETIME);
amdgpu_fence_write(fence->ring, fence->seq);
amdgpu_fence_process(fence->ring);
struct dma_fence *unprocessed;
struct dma_fence __rcu **ptr;
struct amdgpu_fence *fence;
struct amdgpu_ring *ring = af->ring;
unsigned long flags;
u32 seq, last_seq;
last_seq = amdgpu_fence_read(ring) & ring->fence_drv.num_fences_mask;
seq = ring->fence_drv.sync_seq & ring->fence_drv.num_fences_mask;
/* mark all fences from the guilty context with an error */
spin_lock_irqsave(&ring->fence_drv.lock, flags);
do {
last_seq++;
last_seq &= ring->fence_drv.num_fences_mask;
ptr = &ring->fence_drv.fences[last_seq];
rcu_read_lock();
unprocessed = rcu_dereference(*ptr);
if (unprocessed && !dma_fence_is_signaled_locked(unprocessed)) {
fence = container_of(unprocessed, struct amdgpu_fence, base);
if (fence == af)
dma_fence_set_error(&fence->base, -ETIME);
else if (fence->context == af->context)
dma_fence_set_error(&fence->base, -ECANCELED);
}
rcu_read_unlock();
} while (last_seq != seq);
spin_unlock_irqrestore(&ring->fence_drv.lock, flags);
/* signal the guilty fence */
amdgpu_fence_write(ring, af->seq);
amdgpu_fence_process(ring);
}
void amdgpu_fence_save_wptr(struct dma_fence *fence)

View File

@ -811,7 +811,7 @@ int amdgpu_ring_reset_helper_end(struct amdgpu_ring *ring,
if (r)
return r;
/* signal the fence of the bad job */
/* signal the guilty fence and set an error on all fences from the context */
if (guilty_fence)
amdgpu_fence_driver_guilty_force_completion(guilty_fence);
/* Re-emit the non-guilty commands */

View File

@ -155,7 +155,7 @@ extern const struct drm_sched_backend_ops amdgpu_sched_ops;
void amdgpu_fence_driver_clear_job_fences(struct amdgpu_ring *ring);
void amdgpu_fence_driver_set_error(struct amdgpu_ring *ring, int error);
void amdgpu_fence_driver_force_completion(struct amdgpu_ring *ring);
void amdgpu_fence_driver_guilty_force_completion(struct amdgpu_fence *fence);
void amdgpu_fence_driver_guilty_force_completion(struct amdgpu_fence *af);
void amdgpu_fence_save_wptr(struct dma_fence *fence);
int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring);