drm/amd/ras: Add sriov ras preprocessing before gpu reset

The SR-IOV host may clear all VF commands registered to its auto
update list during VF reset. Set blocks_ecc.auto_update_actived
to false before VF reset; after VF reset is complete, the
RAS_CMD__GET_ALL_BLOCK_ECC_STATUS command will be re-registered
to the auto update list of the SR-IOV host.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
YiPeng Chai 2025-10-30 16:49:14 +08:00 committed by Alex Deucher
parent 61a9a4138b
commit 73c6c22694
3 changed files with 23 additions and 1 deletions

View File

@ -642,6 +642,9 @@ int amdgpu_ras_mgr_handle_ras_cmd(struct amdgpu_device *adev,
int amdgpu_ras_mgr_pre_reset(struct amdgpu_device *adev)
{
if (amdgpu_sriov_vf(adev))
return amdgpu_virt_ras_pre_reset(adev);
if (!amdgpu_ras_mgr_is_ready(adev)) {
RAS_DEV_ERR(adev, "Invalid ras suspend!\n");
return -EPERM;
@ -653,6 +656,9 @@ int amdgpu_ras_mgr_pre_reset(struct amdgpu_device *adev)
int amdgpu_ras_mgr_post_reset(struct amdgpu_device *adev)
{
if (amdgpu_sriov_vf(adev))
return amdgpu_virt_ras_post_reset(adev);
if (!amdgpu_ras_mgr_is_ready(adev)) {
RAS_DEV_ERR(adev, "Invalid ras resume!\n");
return -EPERM;

View File

@ -413,3 +413,18 @@ int amdgpu_virt_ras_hw_fini(struct amdgpu_device *adev)
return 0;
}
/*
 * amdgpu_virt_ras_pre_reset - RAS step run for a VF before a GPU reset.
 * @adev: device whose RAS manager context is looked up
 *
 * Clears the blocks_ecc.auto_update_actived flag held in the virt RAS
 * command context attached to the RAS manager of @adev.
 *
 * Return: always 0.
 */
int amdgpu_virt_ras_pre_reset(struct amdgpu_device *adev)
{
	struct amdgpu_ras_mgr *mgr;
	struct amdgpu_virt_ras_cmd *vcmd;

	mgr = amdgpu_ras_mgr_get_context(adev);
	vcmd = (struct amdgpu_virt_ras_cmd *)mgr->virt_ras_cmd;

	/* Mark the block ECC auto update as inactive ahead of the reset. */
	vcmd->blocks_ecc.auto_update_actived = false;

	return 0;
}
/*
 * amdgpu_virt_ras_post_reset - RAS step run for a VF after a GPU reset.
 * @adev: device that was reset (currently unused)
 *
 * No work is performed here at present.
 *
 * Return: always 0.
 */
int amdgpu_virt_ras_post_reset(struct amdgpu_device *adev)
{
	return 0;
}

View File

@ -49,5 +49,6 @@ int amdgpu_virt_ras_hw_init(struct amdgpu_device *adev);
int amdgpu_virt_ras_hw_fini(struct amdgpu_device *adev);
int amdgpu_virt_ras_handle_cmd(struct ras_core_context *ras_core,
struct ras_cmd_ctx *cmd);
/* Clears blocks_ecc.auto_update_actived before a reset; returns 0. */
int amdgpu_virt_ras_pre_reset(struct amdgpu_device *adev);
/* Post-reset counterpart; currently performs no work and returns 0. */
int amdgpu_virt_ras_post_reset(struct amdgpu_device *adev);
#endif