mirror of
https://github.com/torvalds/linux.git
synced 2026-05-22 22:22:08 +02:00
drm/amdgpu: Fix two resets triggered in a row
Sometimes a hung GPU causes multiple reset sources to schedule resets. The second source is able to trigger an unnecessary reset if it schedules after we call amdgpu_device_stop_pending_resets. Move amdgpu_device_stop_pending_resets to after the reset is done. Since at this point the GPU is supposedly in a good state, any reset scheduled after this point would be a legitimate reset. Remove the unnecessary and incorrect checks for amdgpu_in_reset that were kind of serving this purpose. Signed-off-by: Yunxiang Li <Yunxiang.Li@amd.com> Reviewed-by: Lijo Lazar <lijo.lazar@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
parent
f5007c67fc
commit
f4322b9f8a
|
|
@ -5070,8 +5070,6 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
|
|||
retry:
|
||||
amdgpu_amdkfd_pre_reset(adev);
|
||||
|
||||
amdgpu_device_stop_pending_resets(adev);
|
||||
|
||||
if (from_hypervisor)
|
||||
r = amdgpu_virt_request_full_gpu(adev, true);
|
||||
else
|
||||
|
|
@ -5823,13 +5821,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
|
|||
r, adev_to_drm(tmp_adev)->unique);
|
||||
tmp_adev->asic_reset_res = r;
|
||||
}
|
||||
|
||||
if (!amdgpu_sriov_vf(tmp_adev))
|
||||
/*
|
||||
* Drop all pending non scheduler resets. Scheduler resets
|
||||
* were already dropped during drm_sched_stop
|
||||
*/
|
||||
amdgpu_device_stop_pending_resets(tmp_adev);
|
||||
}
|
||||
|
||||
/* Actual ASIC resets if needed.*/
|
||||
|
|
@ -5851,6 +5842,16 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
|
|||
goto retry;
|
||||
}
|
||||
|
||||
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
|
||||
/*
|
||||
* Drop any pending non scheduler resets queued before reset is done.
|
||||
* Any reset scheduled after this point would be valid. Scheduler resets
|
||||
* were already dropped during drm_sched_stop and no new ones can come
|
||||
* in before drm_sched_start.
|
||||
*/
|
||||
amdgpu_device_stop_pending_resets(tmp_adev);
|
||||
}
|
||||
|
||||
skip_hw_reset:
|
||||
|
||||
/* Post ASIC reset for all devs .*/
|
||||
|
|
|
|||
|
|
@ -599,7 +599,7 @@ static void amdgpu_virt_update_vf2pf_work_item(struct work_struct *work)
|
|||
if (ret) {
|
||||
adev->virt.vf2pf_update_retry_cnt++;
|
||||
if ((adev->virt.vf2pf_update_retry_cnt >= AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT) &&
|
||||
amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev)) {
|
||||
amdgpu_sriov_runtime(adev)) {
|
||||
amdgpu_ras_set_fed(adev, true);
|
||||
if (amdgpu_reset_domain_schedule(adev->reset_domain,
|
||||
&adev->virt.flr_work))
|
||||
|
|
|
|||
|
|
@ -319,7 +319,7 @@ static int xgpu_ai_mailbox_rcv_irq(struct amdgpu_device *adev,
|
|||
|
||||
switch (event) {
|
||||
case IDH_FLR_NOTIFICATION:
|
||||
if (amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev))
|
||||
if (amdgpu_sriov_runtime(adev))
|
||||
WARN_ONCE(!amdgpu_reset_domain_schedule(adev->reset_domain,
|
||||
&adev->virt.flr_work),
|
||||
"Failed to queue work! at %s",
|
||||
|
|
|
|||
|
|
@ -358,7 +358,7 @@ static int xgpu_nv_mailbox_rcv_irq(struct amdgpu_device *adev,
|
|||
|
||||
switch (event) {
|
||||
case IDH_FLR_NOTIFICATION:
|
||||
if (amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev))
|
||||
if (amdgpu_sriov_runtime(adev))
|
||||
WARN_ONCE(!amdgpu_reset_domain_schedule(adev->reset_domain,
|
||||
&adev->virt.flr_work),
|
||||
"Failed to queue work! at %s",
|
||||
|
|
|
|||
|
|
@ -560,7 +560,7 @@ static int xgpu_vi_mailbox_rcv_irq(struct amdgpu_device *adev,
|
|||
r = xgpu_vi_mailbox_rcv_msg(adev, IDH_FLR_NOTIFICATION);
|
||||
|
||||
/* only handle FLR_NOTIFY now */
|
||||
if (!r && !amdgpu_in_reset(adev))
|
||||
if (!r)
|
||||
WARN_ONCE(!amdgpu_reset_domain_schedule(adev->reset_domain,
|
||||
&adev->virt.flr_work),
|
||||
"Failed to queue work! at %s",
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user