mirror of
https://github.com/torvalds/linux.git
synced 2026-06-01 02:53:36 +02:00
drm/amdkfd: Implement hang detection in KFD and call amdgpu
The reset will be performed in a new hw_exception work thread, so that an HWS hang can be handled without blocking the thread that detected the hang.

Signed-off-by: Shaoyun Liu <Shaoyun.Liu@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
Acked-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
This commit is contained in:
parent
24da5a9ca6
commit
73ea648d92
|
|
@ -61,6 +61,8 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
|
|||
static void deallocate_sdma_queue(struct device_queue_manager *dqm,
|
||||
unsigned int sdma_queue_id);
|
||||
|
||||
static void kfd_process_hw_exception(struct work_struct *work);
|
||||
|
||||
static inline
|
||||
enum KFD_MQD_TYPE get_mqd_type_from_queue_type(enum kfd_queue_type type)
|
||||
{
|
||||
|
|
@ -1010,6 +1012,8 @@ static int initialize_cpsch(struct device_queue_manager *dqm)
|
|||
dqm->active_runlist = false;
|
||||
dqm->sdma_bitmap = (1 << CIK_SDMA_QUEUES) - 1;
|
||||
|
||||
INIT_WORK(&dqm->hw_exception_work, kfd_process_hw_exception);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
@ -1042,6 +1046,8 @@ static int start_cpsch(struct device_queue_manager *dqm)
|
|||
init_interrupts(dqm);
|
||||
|
||||
dqm_lock(dqm);
|
||||
/* clear the hang status when the driver tries to start the hw scheduler */
|
||||
dqm->is_hws_hang = false;
|
||||
execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
|
||||
dqm_unlock(dqm);
|
||||
|
||||
|
|
@ -1255,6 +1261,8 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
|
|||
{
|
||||
int retval = 0;
|
||||
|
||||
if (dqm->is_hws_hang)
|
||||
return -EIO;
|
||||
if (!dqm->active_runlist)
|
||||
return retval;
|
||||
|
||||
|
|
@ -1293,9 +1301,13 @@ static int execute_queues_cpsch(struct device_queue_manager *dqm,
|
|||
{
|
||||
int retval;
|
||||
|
||||
if (dqm->is_hws_hang)
|
||||
return -EIO;
|
||||
retval = unmap_queues_cpsch(dqm, filter, filter_param);
|
||||
if (retval) {
|
||||
pr_err("The cp might be in an unrecoverable state due to an unsuccessful queues preemption\n");
|
||||
dqm->is_hws_hang = true;
|
||||
schedule_work(&dqm->hw_exception_work);
|
||||
return retval;
|
||||
}
|
||||
|
||||
|
|
@ -1543,7 +1555,7 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
|
|||
}
|
||||
|
||||
retval = execute_queues_cpsch(dqm, filter, 0);
|
||||
if (retval || qpd->reset_wavefronts) {
|
||||
if ((!dqm->is_hws_hang) && (retval || qpd->reset_wavefronts)) {
|
||||
pr_warn("Resetting wave fronts (cpsch) on dev %p\n", dqm->dev);
|
||||
dbgdev_wave_reset_wavefronts(dqm->dev, qpd->pqm->process);
|
||||
qpd->reset_wavefronts = false;
|
||||
|
|
@ -1701,6 +1713,13 @@ int kfd_process_vm_fault(struct device_queue_manager *dqm,
|
|||
return ret;
|
||||
}
|
||||
|
||||
static void kfd_process_hw_exception(struct work_struct *work)
|
||||
{
|
||||
struct device_queue_manager *dqm = container_of(work,
|
||||
struct device_queue_manager, hw_exception_work);
|
||||
dqm->dev->kfd2kgd->gpu_recover(dqm->dev->kgd);
|
||||
}
|
||||
|
||||
#if defined(CONFIG_DEBUG_FS)
|
||||
|
||||
static void seq_reg_dump(struct seq_file *m,
|
||||
|
|
|
|||
|
|
@ -193,6 +193,10 @@ struct device_queue_manager {
|
|||
struct kfd_mem_obj *fence_mem;
|
||||
bool active_runlist;
|
||||
int sched_policy;
|
||||
|
||||
/* hw exception */
|
||||
bool is_hws_hang;
|
||||
struct work_struct hw_exception_work;
|
||||
};
|
||||
|
||||
void device_queue_manager_init_cik(
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user