mirror of
https://github.com/torvalds/linux.git
synced 2026-05-27 08:33:17 +02:00
drm/amdkfd: Fix a concurrency issue during kfd recovery
start_cpsch and stop_cpsch can be called during kfd device initialization or during gpu reset/recovery. So they can run concurrently. Currently in start_cpsch and stop_cpsch, pm_init and pm_uninit is not protected by the dpm lock. Imagine such a case that user use packet manager's function to submit a pm4 packet to hang hws (ie through command cat /sys/class/kfd/kfd/topology/nodes/1/gpu_id | sudo tee /sys/kernel/debug/kfd/hang_hws), while kfd device is under device reset/recovery so packet manager can be not initialized. There will be unpredictable protection fault in such case. This patch moves pm_init/uninit inside the dpm lock and check packet manager is initialized before using packet manager function. Signed-off-by: Oak Zeng <Oak.Zeng@amd.com> Acked-by: Christian Konig <christian.koenig@amd.com> Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
parent
78ccea9ff2
commit
4f942aaeb1
|
|
@ -1382,18 +1382,12 @@ void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t throttle_bitmask)
|
|||
*/
|
||||
int kfd_debugfs_hang_hws(struct kfd_dev *dev)
|
||||
{
|
||||
int r = 0;
|
||||
|
||||
if (dev->dqm->sched_policy != KFD_SCHED_POLICY_HWS) {
|
||||
pr_err("HWS is not enabled");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
r = pm_debugfs_hang_hws(&dev->dqm->packet_mgr);
|
||||
if (!r)
|
||||
r = dqm_debugfs_execute_queues(dev->dqm);
|
||||
|
||||
return r;
|
||||
return dqm_debugfs_hang_hws(dev->dqm);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -1164,6 +1164,7 @@ static int start_cpsch(struct device_queue_manager *dqm)
|
|||
|
||||
retval = 0;
|
||||
|
||||
dqm_lock(dqm);
|
||||
retval = pm_init(&dqm->packet_mgr, dqm);
|
||||
if (retval)
|
||||
goto fail_packet_manager_init;
|
||||
|
|
@ -1186,7 +1187,6 @@ static int start_cpsch(struct device_queue_manager *dqm)
|
|||
|
||||
init_interrupts(dqm);
|
||||
|
||||
dqm_lock(dqm);
|
||||
/* clear hang status when driver try to start the hw scheduler */
|
||||
dqm->is_hws_hang = false;
|
||||
dqm->is_resetting = false;
|
||||
|
|
@ -1199,6 +1199,7 @@ static int start_cpsch(struct device_queue_manager *dqm)
|
|||
fail_set_sched_resources:
|
||||
pm_uninit(&dqm->packet_mgr, false);
|
||||
fail_packet_manager_init:
|
||||
dqm_unlock(dqm);
|
||||
return retval;
|
||||
}
|
||||
|
||||
|
|
@ -1211,12 +1212,12 @@ static int stop_cpsch(struct device_queue_manager *dqm)
|
|||
unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
|
||||
hanging = dqm->is_hws_hang || dqm->is_resetting;
|
||||
dqm->sched_running = false;
|
||||
dqm_unlock(dqm);
|
||||
|
||||
pm_release_ib(&dqm->packet_mgr);
|
||||
|
||||
kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
|
||||
pm_uninit(&dqm->packet_mgr, hanging);
|
||||
dqm_unlock(dqm);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -2099,11 +2100,16 @@ int dqm_debugfs_hqds(struct seq_file *m, void *data)
|
|||
return r;
|
||||
}
|
||||
|
||||
int dqm_debugfs_execute_queues(struct device_queue_manager *dqm)
|
||||
int dqm_debugfs_hang_hws(struct device_queue_manager *dqm)
|
||||
{
|
||||
int r = 0;
|
||||
|
||||
dqm_lock(dqm);
|
||||
r = pm_debugfs_hang_hws(&dqm->packet_mgr);
|
||||
if (r) {
|
||||
dqm_unlock(dqm);
|
||||
return r;
|
||||
}
|
||||
dqm->active_runlist = true;
|
||||
r = execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
|
||||
dqm_unlock(dqm);
|
||||
|
|
|
|||
|
|
@ -448,6 +448,9 @@ int pm_debugfs_hang_hws(struct packet_manager *pm)
|
|||
uint32_t *buffer, size;
|
||||
int r = 0;
|
||||
|
||||
if (!pm->priv_queue)
|
||||
return -EAGAIN;
|
||||
|
||||
size = pm->pmf->query_status_size;
|
||||
mutex_lock(&pm->lock);
|
||||
kq_acquire_packet_buffer(pm->priv_queue,
|
||||
|
|
|
|||
|
|
@ -1194,7 +1194,7 @@ int pm_debugfs_runlist(struct seq_file *m, void *data);
|
|||
|
||||
int kfd_debugfs_hang_hws(struct kfd_dev *dev);
|
||||
int pm_debugfs_hang_hws(struct packet_manager *pm);
|
||||
int dqm_debugfs_execute_queues(struct device_queue_manager *dqm);
|
||||
int dqm_debugfs_hang_hws(struct device_queue_manager *dqm);
|
||||
|
||||
#else
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user