drm/xe/vf: Avoid indefinite blocking in preempt rebind worker for VFs supporting migration

Blocking in work queues on a hardware action that may never occur —
especially when it depends on a software fixup also scheduled on a
work queue — is a recipe for deadlock. This situation arises with
the preempt rebind worker and VF post-migration recovery. To prevent
potential deadlocks, avoid indefinite blocking in the preempt rebind
worker for VFs that support migration.

v4:
 - Use dma_fence_wait_timeout (CI)

Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Reviewed-by: Tomasz Lis <tomasz.lis@intel.com>
Link: https://lore.kernel.org/r/20251008214532.3442967-19-matthew.brost@intel.com
This commit is contained in:
Matthew Brost 2025-10-08 14:45:16 -07:00
parent a4dae94aad
commit 1faeeea056

View File

@ -35,6 +35,7 @@
#include "xe_pt.h"
#include "xe_pxp.h"
#include "xe_res_cursor.h"
#include "xe_sriov_vf.h"
#include "xe_svm.h"
#include "xe_sync.h"
#include "xe_tile.h"
@ -111,12 +112,22 @@ static int alloc_preempt_fences(struct xe_vm *vm, struct list_head *list,
static int wait_for_existing_preempt_fences(struct xe_vm *vm)
{
struct xe_exec_queue *q;
bool vf_migration = IS_SRIOV_VF(vm->xe) &&
xe_sriov_vf_migration_supported(vm->xe);
signed long wait_time = vf_migration ? HZ / 5 : MAX_SCHEDULE_TIMEOUT;
xe_vm_assert_held(vm);
list_for_each_entry(q, &vm->preempt.exec_queues, lr.link) {
if (q->lr.pfence) {
long timeout = dma_fence_wait(q->lr.pfence, false);
long timeout;
timeout = dma_fence_wait_timeout(q->lr.pfence, false,
wait_time);
if (!timeout) {
xe_assert(vm->xe, vf_migration);
return -EAGAIN;
}
/* Only -ETIME on fence indicates VM needs to be killed */
if (timeout < 0 || q->lr.pfence->error == -ETIME)
@ -541,6 +552,19 @@ static void preempt_rebind_work_func(struct work_struct *w)
out_unlock_outer:
if (err == -EAGAIN) {
trace_xe_vm_rebind_worker_retry(vm);
/*
* We can't block in workers on a VF which supports migration
* given this can block the VF post-migration workers from
* getting scheduled.
*/
if (IS_SRIOV_VF(vm->xe) &&
xe_sriov_vf_migration_supported(vm->xe)) {
up_write(&vm->lock);
xe_vm_queue_rebind_worker(vm);
return;
}
goto retry;
}