drm/amdkfd: Handle GPU reset and drain retry fault race

Only check and drain the IH1 ring if the retry CAM is not enabled.

If the GPU is under reset, don't access the IH rings to drain retry faults.

Signed-off-by: Philip Yang <Philip.Yang@amd.com>
Reviewed-by: Harish Kasiviswanathan <Harish.Kasiviswanathan@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
Philip Yang 2025-11-19 16:32:45 -05:00 committed by Alex Deucher
parent e1c94109c7
commit 5b57c3c3f2

View File

@@ -33,6 +33,7 @@
#include "amdgpu_hmm.h"
#include "amdgpu.h"
#include "amdgpu_xgmi.h"
#include "amdgpu_reset.h"
#include "kfd_priv.h"
#include "kfd_svm.h"
#include "kfd_migrate.h"
@@ -2369,6 +2370,9 @@ static void svm_range_drain_retry_fault(struct svm_range_list *svms)
pr_debug("drain retry fault gpu %d svms %p\n", i, svms);
if (!down_read_trylock(&pdd->dev->adev->reset_domain->sem))
continue;
amdgpu_ih_wait_on_checkpoint_process_ts(pdd->dev->adev,
pdd->dev->adev->irq.retry_cam_enabled ?
&pdd->dev->adev->irq.ih :
@@ -2378,6 +2382,7 @@ static void svm_range_drain_retry_fault(struct svm_range_list *svms)
amdgpu_ih_wait_on_checkpoint_process_ts(pdd->dev->adev,
&pdd->dev->adev->irq.ih_soft);
up_read(&pdd->dev->adev->reset_domain->sem);
pr_debug("drain retry fault gpu %d svms 0x%p done\n", i, svms);
}
@@ -2561,7 +2566,7 @@ svm_range_unmap_from_cpu(struct mm_struct *mm, struct svm_range *prange,
adev = pdd->dev->adev;
/* Check and drain ih1 ring if cam not available */
if (adev->irq.ih1.ring_size) {
if (!adev->irq.retry_cam_enabled && adev->irq.ih1.ring_size) {
ih = &adev->irq.ih1;
checkpoint_wptr = amdgpu_ih_get_wptr(adev, ih);
if (ih->rptr != checkpoint_wptr) {