mirror of
https://github.com/torvalds/linux.git
synced 2026-05-28 17:13:52 +02:00
drm/amdgpu: Do a basic health check before reset
Check if the device is present on the bus before trying to recover. It could be that the device itself is lost from the bus in some hang situations. Signed-off-by: Lijo Lazar <lijo.lazar@amd.com> Reviewed-by: Asad Kamal <asad.kamal@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
parent
f1b8479dc9
commit
dfe9c3cde2
|
|
@ -5532,6 +5532,23 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
|
|||
|
||||
}
|
||||
|
||||
static int amdgpu_device_health_check(struct list_head *device_list_handle)
|
||||
{
|
||||
struct amdgpu_device *tmp_adev;
|
||||
int ret = 0;
|
||||
u32 status;
|
||||
|
||||
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
|
||||
pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status);
|
||||
if (PCI_POSSIBLE_ERROR(status)) {
|
||||
dev_err(tmp_adev->dev, "device lost from bus!");
|
||||
ret = -ENODEV;
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* amdgpu_device_gpu_recover - reset the asic and recover scheduler
|
||||
*
|
||||
|
|
@ -5603,6 +5620,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
|
|||
device_list_handle = &device_list;
|
||||
}
|
||||
|
||||
if (!amdgpu_sriov_vf(adev)) {
|
||||
r = amdgpu_device_health_check(device_list_handle);
|
||||
if (r)
|
||||
goto end_reset;
|
||||
}
|
||||
|
||||
/* We need to lock reset domain only once both for XGMI and single device */
|
||||
tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
|
||||
reset_list);
|
||||
|
|
@ -5768,6 +5791,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
|
|||
reset_list);
|
||||
amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
|
||||
|
||||
end_reset:
|
||||
if (hive) {
|
||||
mutex_unlock(&hive->hive_lock);
|
||||
amdgpu_put_xgmi_hive(hive);
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user