mirror of
https://github.com/torvalds/linux.git
synced 2026-05-27 00:22:00 +02:00
drm/amdgpu: Add fatal error detected flag
For a RAS error that needs a full reset to recover, set the fatal error
status. Clear the status once the device is reset.

Signed-off-by: Lijo Lazar <lijo.lazar@amd.com>
Reviewed-by: Asad Kamal <asad.kamal@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
parent
34b811a281
commit
1b6ef74b2b
|
|
@ -5321,6 +5321,7 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
|
|||
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
|
||||
if (need_full_reset) {
|
||||
/* post card */
|
||||
amdgpu_ras_set_fed(tmp_adev, false);
|
||||
r = amdgpu_device_asic_init(tmp_adev);
|
||||
if (r) {
|
||||
dev_warn(tmp_adev->dev, "asic atom init failed!");
|
||||
|
|
|
|||
|
|
@ -2439,6 +2439,18 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
|
|||
ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE1_RESET;
|
||||
set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
|
||||
|
||||
/* For any RAS error that needs a full reset to
|
||||
* recover, set the fatal error status
|
||||
*/
|
||||
if (hive) {
|
||||
list_for_each_entry(remote_adev,
|
||||
&hive->device_list,
|
||||
gmc.xgmi.head)
|
||||
amdgpu_ras_set_fed(remote_adev,
|
||||
true);
|
||||
} else {
|
||||
amdgpu_ras_set_fed(adev, true);
|
||||
}
|
||||
psp_fatal_error_recovery_quirk(&adev->psp);
|
||||
}
|
||||
}
|
||||
|
|
@ -3440,6 +3452,26 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
|
|||
return 0;
|
||||
}
|
||||
|
||||
bool amdgpu_ras_get_fed_status(struct amdgpu_device *adev)
|
||||
{
|
||||
struct amdgpu_ras *ras;
|
||||
|
||||
ras = amdgpu_ras_get_context(adev);
|
||||
if (!ras)
|
||||
return false;
|
||||
|
||||
return atomic_read(&ras->fed);
|
||||
}
|
||||
|
||||
void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status)
|
||||
{
|
||||
struct amdgpu_ras *ras;
|
||||
|
||||
ras = amdgpu_ras_get_context(adev);
|
||||
if (ras)
|
||||
atomic_set(&ras->fed, !!status);
|
||||
}
|
||||
|
||||
void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
|
||||
{
|
||||
if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {
|
||||
|
|
|
|||
|
|
@ -477,6 +477,8 @@ struct amdgpu_ras {
|
|||
wait_queue_head_t page_retirement_wq;
|
||||
struct mutex page_retirement_lock;
|
||||
atomic_t page_retirement_req_cnt;
|
||||
/* Fatal error detected flag */
|
||||
atomic_t fed;
|
||||
};
|
||||
|
||||
struct ras_fs_data {
|
||||
|
|
@ -873,4 +875,8 @@ void amdgpu_ras_add_mca_err_addr(struct ras_err_info *err_info,
|
|||
|
||||
void amdgpu_ras_del_mca_err_addr(struct ras_err_info *err_info,
|
||||
struct ras_err_addr *mca_err_addr);
|
||||
|
||||
/* Set (true) or clear (false) the fatal error detected flag of adev's RAS context. */
void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status);
|
||||
/* Read the fatal error detected flag; returns false if adev has no RAS context. */
bool amdgpu_ras_get_fed_status(struct amdgpu_device *adev);
|
||||
|
||||
#endif
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user