diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 67a01c4f3885..f958fef253da 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -248,6 +248,12 @@ void amdgpu_amdkfd_interrupt(struct amdgpu_device *adev,
 		kgd2kfd_interrupt(adev->kfd.dev, ih_ring_entry);
 }
 
+/* Thin amdgpu-side wrapper around kgd2kfd_teardown_processes(). */
+void amdgpu_amdkfd_teardown_processes(struct amdgpu_device *adev)
+{
+	kgd2kfd_teardown_processes(adev);
+}
+
 void amdgpu_amdkfd_suspend(struct amdgpu_device *adev, bool suspend_proc)
 {
 	if (adev->kfd.dev) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index da4575676335..eba9556ece9a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -158,6 +158,7 @@ struct amdkfd_process_info {
 
 int amdgpu_amdkfd_init(void);
 void amdgpu_amdkfd_fini(void);
+void amdgpu_amdkfd_teardown_processes(struct amdgpu_device *adev);
 
 void amdgpu_amdkfd_suspend(struct amdgpu_device *adev, bool suspend_proc);
 int amdgpu_amdkfd_resume(struct amdgpu_device *adev, bool resume_proc);
@@ -438,6 +439,8 @@ int kgd2kfd_stop_sched_all_nodes(struct kfd_dev *kfd);
 bool kgd2kfd_compute_active(struct kfd_dev *kfd, uint32_t node_id);
 bool kgd2kfd_vmfault_fast_path(struct amdgpu_device *adev, struct amdgpu_iv_entry *entry,
 			       bool retry_fault);
+void kgd2kfd_lock_kfd(void);
+void kgd2kfd_teardown_processes(struct amdgpu_device *adev);
 
 #else
 static inline int kgd2kfd_init(void)
@@ -550,5 +553,13 @@ static inline bool kgd2kfd_vmfault_fast_path(struct amdgpu_device *adev, struct
 	return false;
 }
 
+static inline void kgd2kfd_lock_kfd(void)
+{
+}
+
+static inline void kgd2kfd_teardown_processes(struct amdgpu_device *adev)
+{
+}
+
 #endif
 #endif /* AMDGPU_AMDKFD_H_INCLUDED */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 33135b185fed..b2deb6a74eb2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3510,6 +3510,7 @@ static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
 
 	amdgpu_amdkfd_suspend(adev, true);
+	amdgpu_amdkfd_teardown_processes(adev);
 	amdgpu_userq_suspend(adev);
 
 	/* Workaround for ASICs need to disable SMC first */
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index db37c2949d19..640a9ab39fcd 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -973,6 +973,9 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd)
 	}
 
 	kfree(kfd);
+
+	/* after removing a kfd device, unlock the kfd driver */
+	kgd2kfd_unlock_kfd(NULL);
 }
 
 int kgd2kfd_pre_reset(struct kfd_dev *kfd,
@@ -1557,10 +1560,14 @@ int kgd2kfd_check_and_lock_kfd(struct kfd_dev *kfd)
 	return r;
 }
 
+/* unlock a kfd dev (kfd != NULL) or the kfd driver (kfd == NULL) */
 void kgd2kfd_unlock_kfd(struct kfd_dev *kfd)
 {
 	mutex_lock(&kfd_processes_mutex);
-	--kfd->kfd_dev_lock;
+	if (kfd)
+		--kfd->kfd_dev_lock;
+	else
+		--kfd_locked;
 	mutex_unlock(&kfd_processes_mutex);
 }
 
@@ -1729,6 +1736,75 @@ bool kgd2kfd_vmfault_fast_path(struct amdgpu_device *adev, struct amdgpu_iv_entr
 	return false;
 }
 
+/*
+ * kgd2kfd_check_device_idle - check if any kfd process still uses adev
+ *
+ * Return: true when no process in kfd_processes_table has a pdd whose
+ * device belongs to @adev, false otherwise.
+ */
+static bool kgd2kfd_check_device_idle(struct amdgpu_device *adev)
+{
+	struct hlist_node *p_temp;
+	struct kfd_process *p;
+	unsigned int temp;
+	bool idle = true;
+
+	mutex_lock(&kfd_processes_mutex);
+
+	hash_for_each_safe(kfd_processes_table, temp, p_temp, p, kfd_processes) {
+		for (int i = 0; i < p->n_pdds; i++) {
+			if (p->pdds[i]->dev->adev == adev) {
+				idle = false;
+				goto out_unlock;
+			}
+		}
+	}
+
+out_unlock:
+	mutex_unlock(&kfd_processes_mutex);
+
+	return idle;
+}
+
+/**
+ * kgd2kfd_teardown_processes - gracefully tear down existing
+ * kfd processes that use adev
+ *
+ * @adev: amdgpu_device where kfd processes run on and will be
+ * torn down
+ *
+ * Each process with a pdd on @adev is signalled once; the function
+ * then polls until kgd2kfd_check_device_idle() reports @adev idle.
+ */
+void kgd2kfd_teardown_processes(struct amdgpu_device *adev)
+{
+	struct hlist_node *p_temp;
+	struct kfd_process *p;
+	unsigned int temp;
+
+	mutex_lock(&kfd_processes_mutex);
+
+	hash_for_each_safe(kfd_processes_table, temp, p_temp, p, kfd_processes) {
+		for (int i = 0; i < p->n_pdds; i++) {
+			if (p->pdds[i]->dev->adev != adev)
+				continue;
+
+			/* several pdds of p may sit on adev: signal p only once */
+			kfd_signal_process_terminate_event(p);
+			break;
+		}
+	}
+
+	mutex_unlock(&kfd_processes_mutex);
+
+	/*
+	 * wait until all kfd processes using adev have terminated; sleep one
+	 * tick between polls rather than busy-looping on cond_resched()
+	 */
+	while (!kgd2kfd_check_device_idle(adev))
+		schedule_timeout_uninterruptible(1);
+}
+
 #if defined(CONFIG_DEBUG_FS)
 
 /* This function will send a package to HIQ to hang the HWS
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
index 5a190dd6be4e..1ad312af8ff0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
@@ -1380,3 +1380,33 @@ void kfd_signal_poison_consumed_event(struct kfd_node *dev, u32 pasid)
 
 	kfd_unref_process(p);
 }
+
+/*
+ * kfd_signal_process_terminate_event - set every KFD_EVENT_TYPE_SIGNAL
+ * event of process p, then send SIGBUS to its lead thread.
+ */
+void kfd_signal_process_terminate_event(struct kfd_process *p)
+{
+	struct kfd_event *ev;
+	u32 id = 1;	/* iterate from id 1 for KFD_EVENT_TYPE_SIGNAL events */
+
+	rcu_read_lock();
+
+	idr_for_each_entry_continue(&p->event_idr, ev, id) {
+		if (ev->type != KFD_EVENT_TYPE_SIGNAL)
+			continue;
+
+		spin_lock(&ev->lock);
+		set_event(ev);
+		spin_unlock(&ev->lock);
+	}
+
+	/* Send SIGBUS to p->lead_thread */
+	dev_notice(kfd_device,
+		"Sending SIGBUS to process %d",
+		p->lead_thread->pid);
+
+	send_sig(SIGBUS, p->lead_thread, 0);
+
+	rcu_read_unlock();
+}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index d798baa7e52e..eeeff9ffc1e3 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -1192,6 +1192,7 @@ static inline struct kfd_node *kfd_node_by_irq_ids(struct amdgpu_device *adev,
 }
 int kfd_topology_enum_kfd_devices(uint8_t idx, struct kfd_node **kdev);
 int kfd_numa_node_to_apic_id(int numa_node_id);
+uint32_t kfd_gpu_node_num(void);
 
 /* Interrupts */
 #define	KFD_IRQ_FENCE_CLIENTID	0xff
@@ -1547,6 +1548,7 @@ void kfd_signal_vm_fault_event(struct kfd_process_device *pdd,
 void kfd_signal_reset_event(struct kfd_node *dev);
 
 void kfd_signal_poison_consumed_event(struct kfd_node *dev, u32 pasid);
+void kfd_signal_process_terminate_event(struct kfd_process *p);
 
 static inline void kfd_flush_tlb(struct kfd_process_device *pdd,
 				 enum TLB_FLUSH_TYPE type)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 593b3af10241..1f2d82730c44 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -949,6 +949,12 @@ struct kfd_process *kfd_create_process(struct task_struct *thread)
 	 */
 	mutex_lock(&kfd_processes_mutex);
 
+	if (kfd_gpu_node_num() == 0) {
+		pr_warn("no gpu node! Cannot create KFD process");
+		process = ERR_PTR(-EINVAL);
+		goto out;
+	}
+
 	if (kfd_is_locked(NULL)) {
 		pr_debug("KFD is locked! Cannot create process");
 		process = ERR_PTR(-EINVAL);
@@ -1235,7 +1241,6 @@ static void kfd_process_wq_release(struct work_struct *work)
 	else
 		ida_destroy(&p->id_table);
 
-	kfd_process_remove_sysfs(p);
 	kfd_debugfs_remove_process(p);
 
 	kfd_process_kunmap_signal_bo(p);
@@ -1251,6 +1256,11 @@ static void kfd_process_wq_release(struct work_struct *work)
 
 	put_task_struct(p->lead_thread);
 
+	/* the last step is removing process entries under /sys
+	 * to indicate the process has been terminated.
+	 */
+	kfd_process_remove_sysfs(p);
+
 	kfree(p);
 }
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
index 005a19602513..1ccd4514d3ee 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
@@ -2357,6 +2357,29 @@ int kfd_numa_node_to_apic_id(int numa_node_id)
 	return kfd_cpumask_to_apic_id(cpumask_of_node(numa_node_id));
 }
 
+/*
+ * kfd_gpu_node_num - count the enumerated topology devices that are GPUs
+ * and pass kfd_devcgroup_check_permission().
+ */
+uint32_t kfd_gpu_node_num(void)
+{
+	struct kfd_node *dev;
+	uint32_t gpu_num = 0;
+	u8 id;
+
+	for (id = 0; kfd_topology_enum_kfd_devices(id, &dev) == 0; id++) {
+		/* Skip non GPU devices and devices to which the
+		 * current process has no access
+		 */
+		if (!dev || kfd_devcgroup_check_permission(dev))
+			continue;
+
+		gpu_num++;
+	}
+
+	return gpu_num;
+}
+
 #if defined(CONFIG_DEBUG_FS)
 
 int kfd_debugfs_hqds_by_device(struct seq_file *m, void *data)