drm/amdgpu: completely rework eviction fence handling v2

Well that was broken on multiple levels.

First of all a lot of checks were placed at incorrect locations, especially if
the resume worker should run or not.

Then a bunch of code was just mid-layering because of incorrect assignment who
should do what.

And finally comments explaining what happens instead of why.

Just re-write it from scratch, that should at least fix some of the hangs we
are seeing.

Use RCU for the eviction fence pointer in the manager, the spinlock usage was
mostly incorrect as well. Then finally remove all the nonsense checks and
actually add them in the correct locations.

v2: some typo fixes and cleanups suggested by Sunil

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Sunil Khatri <sunil.khatri@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
Christian König 2026-01-28 13:58:14 +01:00 committed by Alex Deucher
parent 87327658c8
commit 2cd7284ba5
7 changed files with 113 additions and 209 deletions

View File

@ -2952,9 +2952,9 @@ static int amdgpu_drm_release(struct inode *inode, struct file *filp)
int idx;
if (fpriv && drm_dev_enter(dev, &idx)) {
fpriv->evf_mgr.fd_closing = true;
amdgpu_eviction_fence_destroy(&fpriv->evf_mgr);
amdgpu_evf_mgr_shutdown(&fpriv->evf_mgr);
amdgpu_userq_mgr_fini(&fpriv->userq_mgr);
amdgpu_evf_mgr_fini(&fpriv->evf_mgr);
drm_dev_exit(idx);
}

View File

@ -25,9 +25,6 @@
#include <drm/drm_exec.h>
#include "amdgpu.h"
#define work_to_evf_mgr(w, name) container_of(w, struct amdgpu_eviction_fence_mgr, name)
#define evf_mgr_to_fpriv(e) container_of(e, struct amdgpu_fpriv, evf_mgr)
static const char *
amdgpu_eviction_fence_get_driver_name(struct dma_fence *fence)
{
@ -43,102 +40,11 @@ amdgpu_eviction_fence_get_timeline_name(struct dma_fence *f)
return ef->timeline_name;
}
int
amdgpu_eviction_fence_replace_fence(struct amdgpu_eviction_fence_mgr *evf_mgr,
struct drm_exec *exec)
{
struct amdgpu_eviction_fence *old_ef, *new_ef;
struct drm_gem_object *obj;
unsigned long index;
int ret;
if (evf_mgr->ev_fence &&
!dma_fence_is_signaled(&evf_mgr->ev_fence->base))
return 0;
/*
* Steps to replace eviction fence:
* * lock all objects in exec (caller)
* * create a new eviction fence
* * update new eviction fence in evf_mgr
* * attach the new eviction fence to BOs
* * release the old fence
* * unlock the objects (caller)
*/
new_ef = amdgpu_eviction_fence_create(evf_mgr);
if (!new_ef) {
DRM_ERROR("Failed to create new eviction fence\n");
return -ENOMEM;
}
/* Update the eviction fence now */
spin_lock(&evf_mgr->ev_fence_lock);
old_ef = evf_mgr->ev_fence;
evf_mgr->ev_fence = new_ef;
spin_unlock(&evf_mgr->ev_fence_lock);
/* Attach the new fence */
drm_exec_for_each_locked_object(exec, index, obj) {
struct amdgpu_bo *bo = gem_to_amdgpu_bo(obj);
if (!bo)
continue;
ret = amdgpu_eviction_fence_attach(evf_mgr, bo);
if (ret) {
DRM_ERROR("Failed to attch new eviction fence\n");
goto free_err;
}
}
/* Free old fence */
if (old_ef)
dma_fence_put(&old_ef->base);
return 0;
free_err:
kfree(new_ef);
return ret;
}
static void
amdgpu_eviction_fence_suspend_worker(struct work_struct *work)
{
struct amdgpu_eviction_fence_mgr *evf_mgr = work_to_evf_mgr(work, suspend_work.work);
struct amdgpu_fpriv *fpriv = evf_mgr_to_fpriv(evf_mgr);
struct amdgpu_userq_mgr *uq_mgr = &fpriv->userq_mgr;
struct amdgpu_eviction_fence *ev_fence;
mutex_lock(&uq_mgr->userq_mutex);
spin_lock(&evf_mgr->ev_fence_lock);
ev_fence = evf_mgr->ev_fence;
if (ev_fence)
dma_fence_get(&ev_fence->base);
else
goto unlock;
spin_unlock(&evf_mgr->ev_fence_lock);
amdgpu_userq_evict(uq_mgr, ev_fence);
mutex_unlock(&uq_mgr->userq_mutex);
dma_fence_put(&ev_fence->base);
return;
unlock:
spin_unlock(&evf_mgr->ev_fence_lock);
mutex_unlock(&uq_mgr->userq_mutex);
}
static bool amdgpu_eviction_fence_enable_signaling(struct dma_fence *f)
{
struct amdgpu_eviction_fence_mgr *evf_mgr;
struct amdgpu_eviction_fence *ev_fence;
struct amdgpu_eviction_fence *ev_fence = to_ev_fence(f);
if (!f)
return true;
ev_fence = to_ev_fence(f);
evf_mgr = ev_fence->evf_mgr;
schedule_delayed_work(&evf_mgr->suspend_work, 0);
schedule_work(&ev_fence->evf_mgr->suspend_work);
return true;
}
@ -148,22 +54,52 @@ static const struct dma_fence_ops amdgpu_eviction_fence_ops = {
.enable_signaling = amdgpu_eviction_fence_enable_signaling,
};
void amdgpu_eviction_fence_signal(struct amdgpu_eviction_fence_mgr *evf_mgr,
struct amdgpu_eviction_fence *ev_fence)
static void
amdgpu_eviction_fence_suspend_worker(struct work_struct *work)
{
spin_lock(&evf_mgr->ev_fence_lock);
dma_fence_signal(&ev_fence->base);
spin_unlock(&evf_mgr->ev_fence_lock);
struct amdgpu_eviction_fence_mgr *evf_mgr =
container_of(work, struct amdgpu_eviction_fence_mgr,
suspend_work);
struct amdgpu_fpriv *fpriv =
container_of(evf_mgr, struct amdgpu_fpriv, evf_mgr);
struct amdgpu_userq_mgr *uq_mgr = &fpriv->userq_mgr;
struct dma_fence *ev_fence;
mutex_lock(&uq_mgr->userq_mutex);
ev_fence = amdgpu_evf_mgr_get_fence(evf_mgr);
amdgpu_userq_evict(uq_mgr, !evf_mgr->shutdown);
/*
* Signaling the eviction fence must be done while holding the
* userq_mutex. Otherwise we won't resume the queues before issuing the
* next fence.
*/
dma_fence_signal(ev_fence);
dma_fence_put(ev_fence);
mutex_unlock(&uq_mgr->userq_mutex);
}
struct amdgpu_eviction_fence *
amdgpu_eviction_fence_create(struct amdgpu_eviction_fence_mgr *evf_mgr)
void amdgpu_evf_mgr_attach_fence(struct amdgpu_eviction_fence_mgr *evf_mgr,
struct amdgpu_bo *bo)
{
struct dma_fence *ev_fence = amdgpu_evf_mgr_get_fence(evf_mgr);
struct dma_resv *resv = bo->tbo.base.resv;
dma_resv_add_fence(resv, ev_fence, DMA_RESV_USAGE_BOOKKEEP);
dma_fence_put(ev_fence);
}
int amdgpu_evf_mgr_rearm(struct amdgpu_eviction_fence_mgr *evf_mgr,
struct drm_exec *exec)
{
struct amdgpu_eviction_fence *ev_fence;
struct drm_gem_object *obj;
unsigned long index;
/* Create and initialize a new eviction fence */
ev_fence = kzalloc_obj(*ev_fence);
if (!ev_fence)
return NULL;
return -ENOMEM;
ev_fence->evf_mgr = evf_mgr;
get_task_comm(ev_fence->timeline_name, current);
@ -171,56 +107,22 @@ amdgpu_eviction_fence_create(struct amdgpu_eviction_fence_mgr *evf_mgr)
dma_fence_init64(&ev_fence->base, &amdgpu_eviction_fence_ops,
&ev_fence->lock, evf_mgr->ev_fence_ctx,
atomic_inc_return(&evf_mgr->ev_fence_seq));
return ev_fence;
}
void amdgpu_eviction_fence_destroy(struct amdgpu_eviction_fence_mgr *evf_mgr)
{
struct amdgpu_eviction_fence *ev_fence;
/* Remember it for newly added BOs */
dma_fence_put(evf_mgr->ev_fence);
evf_mgr->ev_fence = &ev_fence->base;
/* Wait for any pending work to execute */
flush_delayed_work(&evf_mgr->suspend_work);
/* And add it to all existing BOs */
drm_exec_for_each_locked_object(exec, index, obj) {
struct amdgpu_bo *bo = gem_to_amdgpu_bo(obj);
spin_lock(&evf_mgr->ev_fence_lock);
ev_fence = evf_mgr->ev_fence;
spin_unlock(&evf_mgr->ev_fence_lock);
if (!ev_fence)
return;
dma_fence_wait(&ev_fence->base, false);
/* Last unref of ev_fence */
dma_fence_put(&ev_fence->base);
}
int amdgpu_eviction_fence_attach(struct amdgpu_eviction_fence_mgr *evf_mgr,
struct amdgpu_bo *bo)
{
struct amdgpu_eviction_fence *ev_fence;
struct dma_resv *resv = bo->tbo.base.resv;
int ret;
if (!resv)
return 0;
ret = dma_resv_reserve_fences(resv, 1);
if (ret) {
DRM_DEBUG_DRIVER("Failed to resv fence space\n");
return ret;
amdgpu_evf_mgr_attach_fence(evf_mgr, bo);
}
spin_lock(&evf_mgr->ev_fence_lock);
ev_fence = evf_mgr->ev_fence;
if (ev_fence)
dma_resv_add_fence(resv, &ev_fence->base, DMA_RESV_USAGE_BOOKKEEP);
spin_unlock(&evf_mgr->ev_fence_lock);
return 0;
}
void amdgpu_eviction_fence_detach(struct amdgpu_eviction_fence_mgr *evf_mgr,
struct amdgpu_bo *bo)
void amdgpu_evf_mgr_detach_fence(struct amdgpu_eviction_fence_mgr *evf_mgr,
struct amdgpu_bo *bo)
{
struct dma_fence *stub = dma_fence_get_stub();
@ -229,13 +131,25 @@ void amdgpu_eviction_fence_detach(struct amdgpu_eviction_fence_mgr *evf_mgr,
dma_fence_put(stub);
}
int amdgpu_eviction_fence_init(struct amdgpu_eviction_fence_mgr *evf_mgr)
void amdgpu_evf_mgr_init(struct amdgpu_eviction_fence_mgr *evf_mgr)
{
/* This needs to be done one time per open */
atomic_set(&evf_mgr->ev_fence_seq, 0);
evf_mgr->ev_fence_ctx = dma_fence_context_alloc(1);
spin_lock_init(&evf_mgr->ev_fence_lock);
evf_mgr->ev_fence = dma_fence_get_stub();
INIT_DELAYED_WORK(&evf_mgr->suspend_work, amdgpu_eviction_fence_suspend_worker);
return 0;
INIT_WORK(&evf_mgr->suspend_work, amdgpu_eviction_fence_suspend_worker);
}
void amdgpu_evf_mgr_shutdown(struct amdgpu_eviction_fence_mgr *evf_mgr)
{
evf_mgr->shutdown = true;
flush_work(&evf_mgr->suspend_work);
}
void amdgpu_evf_mgr_fini(struct amdgpu_eviction_fence_mgr *evf_mgr)
{
dma_fence_wait(rcu_dereference_protected(evf_mgr->ev_fence, true),
false);
flush_work(&evf_mgr->suspend_work);
dma_fence_put(evf_mgr->ev_fence);
}

View File

@ -25,6 +25,8 @@
#ifndef AMDGPU_EV_FENCE_H_
#define AMDGPU_EV_FENCE_H_
#include <linux/dma-fence.h>
struct amdgpu_eviction_fence {
struct dma_fence base;
spinlock_t lock;
@ -35,35 +37,35 @@ struct amdgpu_eviction_fence {
struct amdgpu_eviction_fence_mgr {
u64 ev_fence_ctx;
atomic_t ev_fence_seq;
spinlock_t ev_fence_lock;
struct amdgpu_eviction_fence *ev_fence;
struct delayed_work suspend_work;
uint8_t fd_closing;
/*
* Only updated while holding the VM resv lock.
* Only signaled while holding the userq mutex.
*/
struct dma_fence __rcu *ev_fence;
struct work_struct suspend_work;
bool shutdown;
};
/* Eviction fence helper functions */
struct amdgpu_eviction_fence *
amdgpu_eviction_fence_create(struct amdgpu_eviction_fence_mgr *evf_mgr);
static inline struct dma_fence *
amdgpu_evf_mgr_get_fence(struct amdgpu_eviction_fence_mgr *evf_mgr)
{
struct dma_fence *ev_fence;
void
amdgpu_eviction_fence_destroy(struct amdgpu_eviction_fence_mgr *evf_mgr);
rcu_read_lock();
ev_fence = dma_fence_get_rcu_safe(&evf_mgr->ev_fence);
rcu_read_unlock();
return ev_fence;
}
int
amdgpu_eviction_fence_attach(struct amdgpu_eviction_fence_mgr *evf_mgr,
struct amdgpu_bo *bo);
void amdgpu_evf_mgr_attach_fence(struct amdgpu_eviction_fence_mgr *evf_mgr,
struct amdgpu_bo *bo);
int amdgpu_evf_mgr_rearm(struct amdgpu_eviction_fence_mgr *evf_mgr,
struct drm_exec *exec);
void amdgpu_evf_mgr_detach_fence(struct amdgpu_eviction_fence_mgr *evf_mgr,
struct amdgpu_bo *bo);
void amdgpu_evf_mgr_init(struct amdgpu_eviction_fence_mgr *evf_mgr);
void amdgpu_evf_mgr_shutdown(struct amdgpu_eviction_fence_mgr *evf_mgr);
void amdgpu_evf_mgr_fini(struct amdgpu_eviction_fence_mgr *evf_mgr);
void
amdgpu_eviction_fence_detach(struct amdgpu_eviction_fence_mgr *evf_mgr,
struct amdgpu_bo *bo);
int
amdgpu_eviction_fence_init(struct amdgpu_eviction_fence_mgr *evf_mgr);
void
amdgpu_eviction_fence_signal(struct amdgpu_eviction_fence_mgr *evf_mgr,
struct amdgpu_eviction_fence *ev_fence);
int
amdgpu_eviction_fence_replace_fence(struct amdgpu_eviction_fence_mgr *evf_mgr,
struct drm_exec *exec);
#endif

View File

@ -263,13 +263,7 @@ static int amdgpu_gem_object_open(struct drm_gem_object *obj,
else
++bo_va->ref_count;
/* attach gfx eviction fence */
r = amdgpu_eviction_fence_attach(&fpriv->evf_mgr, abo);
if (r) {
DRM_DEBUG_DRIVER("Failed to attach eviction fence to BO\n");
amdgpu_bo_unreserve(abo);
return r;
}
amdgpu_evf_mgr_attach_fence(&fpriv->evf_mgr, abo);
drm_exec_fini(&exec);
/* Validate and add eviction fence to DMABuf imports with dynamic
@ -337,7 +331,7 @@ static void amdgpu_gem_object_close(struct drm_gem_object *obj,
}
if (!amdgpu_vm_is_bo_always_valid(vm, bo))
amdgpu_eviction_fence_detach(&fpriv->evf_mgr, bo);
amdgpu_evf_mgr_detach_fence(&fpriv->evf_mgr, bo);
bo_va = amdgpu_vm_bo_find(vm, bo);
if (!bo_va || --bo_va->ref_count)

View File

@ -1522,10 +1522,7 @@ int amdgpu_driver_open_kms(struct drm_device *dev, struct drm_file *file_priv)
"Failed to init usermode queue manager (%d), use legacy workload submission only\n",
r);
r = amdgpu_eviction_fence_init(&fpriv->evf_mgr);
if (r)
goto error_vm;
amdgpu_evf_mgr_init(&fpriv->evf_mgr);
amdgpu_ctx_mgr_init(&fpriv->ctx_mgr, adev);
file_priv->driver_priv = fpriv;

View File

@ -472,17 +472,16 @@ void
amdgpu_userq_ensure_ev_fence(struct amdgpu_userq_mgr *uq_mgr,
struct amdgpu_eviction_fence_mgr *evf_mgr)
{
struct amdgpu_eviction_fence *ev_fence;
struct dma_fence *ev_fence;
retry:
/* Flush any pending resume work to create ev_fence */
flush_delayed_work(&uq_mgr->resume_work);
mutex_lock(&uq_mgr->userq_mutex);
spin_lock(&evf_mgr->ev_fence_lock);
ev_fence = evf_mgr->ev_fence;
spin_unlock(&evf_mgr->ev_fence_lock);
if (!ev_fence || dma_fence_is_signaled(&ev_fence->base)) {
ev_fence = amdgpu_evf_mgr_get_fence(evf_mgr);
if (dma_fence_is_signaled(ev_fence)) {
dma_fence_put(ev_fence);
mutex_unlock(&uq_mgr->userq_mutex);
/*
* Looks like there was no pending resume work,
@ -491,6 +490,7 @@ amdgpu_userq_ensure_ev_fence(struct amdgpu_userq_mgr *uq_mgr,
schedule_delayed_work(&uq_mgr->resume_work, 0);
goto retry;
}
dma_fence_put(ev_fence);
}
int amdgpu_userq_create_object(struct amdgpu_userq_mgr *uq_mgr,
@ -1197,7 +1197,7 @@ amdgpu_userq_vm_validate(struct amdgpu_userq_mgr *uq_mgr)
dma_fence_wait(bo_va->last_pt_update, false);
dma_fence_wait(vm->last_update, false);
ret = amdgpu_eviction_fence_replace_fence(&fpriv->evf_mgr, &exec);
ret = amdgpu_evf_mgr_rearm(&fpriv->evf_mgr, &exec);
if (ret)
drm_file_err(uq_mgr->file, "Failed to replace eviction fence\n");
@ -1217,11 +1217,13 @@ static void amdgpu_userq_restore_worker(struct work_struct *work)
{
struct amdgpu_userq_mgr *uq_mgr = work_to_uq_mgr(work, resume_work.work);
struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr);
struct dma_fence *ev_fence;
int ret;
flush_delayed_work(&fpriv->evf_mgr.suspend_work);
mutex_lock(&uq_mgr->userq_mutex);
ev_fence = amdgpu_evf_mgr_get_fence(&fpriv->evf_mgr);
if (!dma_fence_is_signaled(ev_fence))
goto unlock;
ret = amdgpu_userq_vm_validate(uq_mgr);
if (ret) {
@ -1237,6 +1239,7 @@ static void amdgpu_userq_restore_worker(struct work_struct *work)
unlock:
mutex_unlock(&uq_mgr->userq_mutex);
dma_fence_put(ev_fence);
}
static int
@ -1312,11 +1315,8 @@ amdgpu_userq_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr)
}
void
amdgpu_userq_evict(struct amdgpu_userq_mgr *uq_mgr,
struct amdgpu_eviction_fence *ev_fence)
amdgpu_userq_evict(struct amdgpu_userq_mgr *uq_mgr, bool schedule_resume)
{
struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr);
struct amdgpu_eviction_fence_mgr *evf_mgr = &fpriv->evf_mgr;
struct amdgpu_device *adev = uq_mgr->adev;
int ret;
@ -1329,10 +1329,7 @@ amdgpu_userq_evict(struct amdgpu_userq_mgr *uq_mgr,
if (ret)
dev_err(adev->dev, "Failed to evict userqueue\n");
/* Signal current eviction fence */
amdgpu_eviction_fence_signal(evf_mgr, ev_fence);
if (!evf_mgr->fd_closing)
if (schedule_resume)
schedule_delayed_work(&uq_mgr->resume_work, 0);
}

View File

@ -133,7 +133,7 @@ void amdgpu_userq_destroy_object(struct amdgpu_userq_mgr *uq_mgr,
struct amdgpu_userq_obj *userq_obj);
void amdgpu_userq_evict(struct amdgpu_userq_mgr *uq_mgr,
struct amdgpu_eviction_fence *ev_fence);
bool schedule_resume);
void amdgpu_userq_ensure_ev_fence(struct amdgpu_userq_mgr *userq_mgr,
struct amdgpu_eviction_fence_mgr *evf_mgr);