From d701cf6578e8447af4ac0fa08e7c5a0fad6df1ae Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Tue, 21 Feb 2023 17:10:43 +0100 Subject: [PATCH 1/5] MAINTAINERS: Update s390-iommu driver maintainer information The s390 DMA API conversion changes currently under review will extend the use of the s390-iommu driver to the DMA API. With s390's mandatory use of an IOMMU this means all DMA for PCI devices will then use the s390-iommu driver. With this in mind and considering my involvement in these changes it makes sense to reflect this increased interdependence in the maintainer structure. Thus add myself as first maintainer and move Gerald to reviewer status. Reviewed-by: Gerald Schaefer Reviewed-by: Matthew Rosato Acked-by: Peter Oberparleiter Signed-off-by: Niklas Schnelle Link: https://lore.kernel.org/r/20230221161043.37065-1-schnelle@linux.ibm.com Signed-off-by: Joerg Roedel --- MAINTAINERS | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index d8ebab595b2a..33a61d516446 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18291,8 +18291,9 @@ F: drivers/s390/block/dasd* F: include/linux/dasd_mod.h S390 IOMMU (PCI) +M: Niklas Schnelle M: Matthew Rosato -M: Gerald Schaefer +R: Gerald Schaefer L: linux-s390@vger.kernel.org S: Supported F: drivers/iommu/s390-iommu.c From f91bf3272a18356e8585f6bbba896d794632f2af Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Thu, 16 Mar 2023 00:25:14 +0100 Subject: [PATCH 2/5] iommu/exynos: Fix set_platform_dma_ops() callback There are some subtle differences between release_device() and set_platform_dma_ops() callbacks, so separate those two callbacks. Device links should be removed only in release_device(), because they were created in probe_device() on purpose and they are needed for proper Exynos IOMMU driver operation. While fixing this, remove the conditional code as it is not really needed. Reported-by: Jason Gunthorpe Fixes: 189d496b48b1 ("iommu/exynos: Add missing set_platform_dma_ops callback") Signed-off-by: Marek Szyprowski Reviewed-by: Sam Protsenko Link: https://lore.kernel.org/r/20230315232514.1046589-1-m.szyprowski@samsung.com Signed-off-by: Joerg Roedel --- drivers/iommu/exynos-iommu.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/drivers/iommu/exynos-iommu.c b/drivers/iommu/exynos-iommu.c index 483aaaeb6dae..1abd187c6075 100644 --- a/drivers/iommu/exynos-iommu.c +++ b/drivers/iommu/exynos-iommu.c @@ -1415,23 +1415,26 @@ static struct iommu_device *exynos_iommu_probe_device(struct device *dev) return &data->iommu; } -static void exynos_iommu_release_device(struct device *dev) +static void exynos_iommu_set_platform_dma(struct device *dev) { struct exynos_iommu_owner *owner = dev_iommu_priv_get(dev); - struct sysmmu_drvdata *data; if (owner->domain) { struct iommu_group *group = iommu_group_get(dev); if (group) { -#ifndef CONFIG_ARM - WARN_ON(owner->domain != - iommu_group_default_domain(group)); -#endif exynos_iommu_detach_device(owner->domain, dev); iommu_group_put(group); } } +} + +static void exynos_iommu_release_device(struct device *dev) +{ + struct exynos_iommu_owner *owner = dev_iommu_priv_get(dev); + struct sysmmu_drvdata *data; + + exynos_iommu_set_platform_dma(dev); list_for_each_entry(data, &owner->controllers, owner_node) device_link_del(data->link); @@ -1479,7 +1482,7 @@ static const struct iommu_ops exynos_iommu_ops = { .domain_alloc = exynos_iommu_domain_alloc, .device_group = generic_device_group, #ifdef CONFIG_ARM - .set_platform_dma_ops = exynos_iommu_release_device, + .set_platform_dma_ops = exynos_iommu_set_platform_dma, #endif .probe_device = exynos_iommu_probe_device, .release_device = exynos_iommu_release_device, From c7d624520c1bd4e42d8ceb8283d6505fc90acccb Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 29 Mar 2023 21:47:19 +0800 Subject: [PATCH 3/5] iommu/vt-d: Remove unnecessary locking in intel_irq_remapping_alloc() The global rwsem dmar_global_lock was introduced by commit 3a5670e8ac932 ("iommu/vt-d: Introduce a rwsem to protect global data structures"). It is used to protect DMAR related global data from DMAR hotplug operations. Using dmar_global_lock in intel_irq_remapping_alloc() is unnecessary as the DMAR global data structures are not touched there. Remove it to avoid below lockdep warning. ====================================================== WARNING: possible circular locking dependency detected 6.3.0-rc2 #468 Not tainted ------------------------------------------------------ swapper/0/1 is trying to acquire lock: ff1db4cb40178698 (&domain->mutex){+.+.}-{3:3}, at: __irq_domain_alloc_irqs+0x3b/0xa0 but task is already holding lock: ffffffffa0c1cdf0 (dmar_global_lock){++++}-{3:3}, at: intel_iommu_init+0x58e/0x880 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (dmar_global_lock){++++}-{3:3}: lock_acquire+0xd6/0x320 down_read+0x42/0x180 intel_irq_remapping_alloc+0xad/0x750 mp_irqdomain_alloc+0xb8/0x2b0 irq_domain_alloc_irqs_locked+0x12f/0x2d0 __irq_domain_alloc_irqs+0x56/0xa0 alloc_isa_irq_from_domain.isra.7+0xa0/0xe0 mp_map_pin_to_irq+0x1dc/0x330 setup_IO_APIC+0x128/0x210 apic_intr_mode_init+0x67/0x110 x86_late_time_init+0x24/0x40 start_kernel+0x41e/0x7e0 secondary_startup_64_no_verify+0xe0/0xeb -> #0 (&domain->mutex){+.+.}-{3:3}: check_prevs_add+0x160/0xef0 __lock_acquire+0x147d/0x1950 lock_acquire+0xd6/0x320 __mutex_lock+0x9c/0xfc0 __irq_domain_alloc_irqs+0x3b/0xa0 dmar_alloc_hwirq+0x9e/0x120 iommu_pmu_register+0x11d/0x200 intel_iommu_init+0x5de/0x880 pci_iommu_init+0x12/0x40 do_one_initcall+0x65/0x350 kernel_init_freeable+0x3ca/0x610 kernel_init+0x1a/0x140 ret_from_fork+0x29/0x50 other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(dmar_global_lock); lock(&domain->mutex); lock(dmar_global_lock); lock(&domain->mutex); *** DEADLOCK *** Fixes: 9dbb8e3452ab ("irqdomain: Switch to per-domain locking") Reviewed-by: Jacob Pan Tested-by: Jason Gunthorpe Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20230314051836.23817-1-baolu.lu@linux.intel.com Link: https://lore.kernel.org/r/20230329134721.469447-2-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/irq_remapping.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c index 6d01fa078c36..df9e261af0b5 100644 --- a/drivers/iommu/intel/irq_remapping.c +++ b/drivers/iommu/intel/irq_remapping.c @@ -311,14 +311,12 @@ static int set_ioapic_sid(struct irte *irte, int apic) if (!irte) return -1; - down_read(&dmar_global_lock); for (i = 0; i < MAX_IO_APICS; i++) { if (ir_ioapic[i].iommu && ir_ioapic[i].id == apic) { sid = (ir_ioapic[i].bus << 8) | ir_ioapic[i].devfn; break; } } - up_read(&dmar_global_lock); if (sid == 0) { pr_warn("Failed to set source-id of IOAPIC (%d)\n", apic); @@ -338,14 +336,12 @@ static int set_hpet_sid(struct irte *irte, u8 id) if (!irte) return -1; - down_read(&dmar_global_lock); for (i = 0; i < MAX_HPET_TBS; i++) { if (ir_hpet[i].iommu && ir_hpet[i].id == id) { sid = (ir_hpet[i].bus << 8) | ir_hpet[i].devfn; break; } } - up_read(&dmar_global_lock); if (sid == 0) { pr_warn("Failed to set source-id of HPET block (%d)\n", id); @@ -1339,9 +1335,7 @@ static int intel_irq_remapping_alloc(struct irq_domain *domain, if (!data) goto out_free_parent; - down_read(&dmar_global_lock); index = alloc_irte(iommu, &data->irq_2_iommu, nr_irqs); - up_read(&dmar_global_lock); if (index < 0) { pr_warn("Failed to allocate IRTE\n"); kfree(data); From bfd3c6b9fa4a1dc78139dd1621d5bea321ffa69d Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 29 Mar 2023 21:47:20 +0800 Subject: [PATCH 4/5] iommu/vt-d: Allow zero SAGAW if second-stage not supported The VT-d spec states (in section 11.4.2) that hardware implementations reporting second-stage translation support (SSTS) field as Clear also report the SAGAW field as 0. Fix an inappropriate check in alloc_iommu(). Fixes: 792fb43ce2c9 ("iommu/vt-d: Enable Intel IOMMU scalable mode by default") Suggested-by: Raghunathan Srinivasan Reviewed-by: Kevin Tian Signed-off-by: Jacob Pan Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20230318024824.124542-1-baolu.lu@linux.intel.com Link: https://lore.kernel.org/r/20230329134721.469447-3-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/dmar.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c index 6acfe879589c..23828d189c2a 100644 --- a/drivers/iommu/intel/dmar.c +++ b/drivers/iommu/intel/dmar.c @@ -1071,7 +1071,8 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd) } err = -EINVAL; - if (cap_sagaw(iommu->cap) == 0) { + if (!cap_sagaw(iommu->cap) && + (!ecap_smts(iommu->ecap) || ecap_slts(iommu->ecap))) { pr_info("%s: No supported address widths. Not attempting DMA translation.\n", iommu->name); drhd->ignored = 1; From 16812c96550c30a8d5743167ef4e462d6fbe7472 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 29 Mar 2023 21:47:21 +0800 Subject: [PATCH 5/5] iommu/vt-d: Fix an IOMMU perfmon warning when CPU hotplug A warning can be triggered when hotplug CPU 0. $ echo 0 > /sys/devices/system/cpu/cpu0/online ------------[ cut here ]------------ Voluntary context switch within RCU read-side critical section! WARNING: CPU: 0 PID: 19 at kernel/rcu/tree_plugin.h:318 rcu_note_context_switch+0x4f4/0x580 RIP: 0010:rcu_note_context_switch+0x4f4/0x580 Call Trace: ? perf_event_update_userpage+0x104/0x150 __schedule+0x8d/0x960 ? perf_event_set_state.part.82+0x11/0x50 schedule+0x44/0xb0 schedule_timeout+0x226/0x310 ? __perf_event_disable+0x64/0x1a0 ? _raw_spin_unlock+0x14/0x30 wait_for_completion+0x94/0x130 __wait_rcu_gp+0x108/0x130 synchronize_rcu+0x67/0x70 ? invoke_rcu_core+0xb0/0xb0 ? __bpf_trace_rcu_stall_warning+0x10/0x10 perf_pmu_migrate_context+0x121/0x370 iommu_pmu_cpu_offline+0x6a/0xa0 ? iommu_pmu_del+0x1e0/0x1e0 cpuhp_invoke_callback+0x129/0x510 cpuhp_thread_fun+0x94/0x150 smpboot_thread_fn+0x183/0x220 ? sort_range+0x20/0x20 kthread+0xe6/0x110 ? kthread_complete_and_exit+0x20/0x20 ret_from_fork+0x1f/0x30 ---[ end trace 0000000000000000 ]--- The synchronize_rcu() will be invoked in the perf_pmu_migrate_context(), when migrating a PMU to a new CPU. However, the current for_each_iommu() is within RCU read-side critical section. Two methods were considered to fix the issue. - Use the dmar_global_lock to replace the RCU read lock when going through the drhd list. But it triggers a lockdep warning. - Use the cpuhp_setup_state_multi() to set up a dedicated state for each IOMMU PMU. The lock can be avoided. The latter method is implemented in this patch. Since each IOMMU PMU has a dedicated state, add cpuhp_node and cpu in struct iommu_pmu to track the state. The state can be dynamically allocated now. Remove the CPUHP_AP_PERF_X86_IOMMU_PERF_ONLINE. Fixes: 46284c6ceb5e ("iommu/vt-d: Support cpumask for IOMMU perfmon") Reported-by: Ammy Yi Signed-off-by: Kan Liang Link: https://lore.kernel.org/r/20230328182028.1366416-1-kan.liang@linux.intel.com Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20230329134721.469447-4-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.h | 2 ++ drivers/iommu/intel/perfmon.c | 68 ++++++++++++++++++++++------------- include/linux/cpuhotplug.h | 1 - 3 files changed, 46 insertions(+), 25 deletions(-) diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index d6df3b865812..694ab9b7d3e9 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -641,6 +641,8 @@ struct iommu_pmu { DECLARE_BITMAP(used_mask, IOMMU_PMU_IDX_MAX); struct perf_event *event_list[IOMMU_PMU_IDX_MAX]; unsigned char irq_name[16]; + struct hlist_node cpuhp_node; + int cpu; }; #define IOMMU_IRQ_ID_OFFSET_PRQ (DMAR_UNITS_SUPPORTED) diff --git a/drivers/iommu/intel/perfmon.c b/drivers/iommu/intel/perfmon.c index e17d9743a0d8..cf43e798eca4 100644 --- a/drivers/iommu/intel/perfmon.c +++ b/drivers/iommu/intel/perfmon.c @@ -773,19 +773,34 @@ static void iommu_pmu_unset_interrupt(struct intel_iommu *iommu) iommu->perf_irq = 0; } -static int iommu_pmu_cpu_online(unsigned int cpu) +static int iommu_pmu_cpu_online(unsigned int cpu, struct hlist_node *node) { + struct iommu_pmu *iommu_pmu = hlist_entry_safe(node, typeof(*iommu_pmu), cpuhp_node); + if (cpumask_empty(&iommu_pmu_cpu_mask)) cpumask_set_cpu(cpu, &iommu_pmu_cpu_mask); + if (cpumask_test_cpu(cpu, &iommu_pmu_cpu_mask)) + iommu_pmu->cpu = cpu; + return 0; } -static int iommu_pmu_cpu_offline(unsigned int cpu) +static int iommu_pmu_cpu_offline(unsigned int cpu, struct hlist_node *node) { - struct dmar_drhd_unit *drhd; - struct intel_iommu *iommu; - int target; + struct iommu_pmu *iommu_pmu = hlist_entry_safe(node, typeof(*iommu_pmu), cpuhp_node); + int target = cpumask_first(&iommu_pmu_cpu_mask); + + /* + * The iommu_pmu_cpu_mask has been updated when offline the CPU + * for the first iommu_pmu. Migrate the other iommu_pmu to the + * new target. + */ + if (target < nr_cpu_ids && target != iommu_pmu->cpu) { + perf_pmu_migrate_context(&iommu_pmu->pmu, cpu, target); + iommu_pmu->cpu = target; + return 0; + } if (!cpumask_test_and_clear_cpu(cpu, &iommu_pmu_cpu_mask)) return 0; @@ -795,45 +810,50 @@ static int iommu_pmu_cpu_offline(unsigned int cpu) if (target < nr_cpu_ids) cpumask_set_cpu(target, &iommu_pmu_cpu_mask); else - target = -1; + return 0; - rcu_read_lock(); - - for_each_iommu(iommu, drhd) { - if (!iommu->pmu) - continue; - perf_pmu_migrate_context(&iommu->pmu->pmu, cpu, target); - } - rcu_read_unlock(); + perf_pmu_migrate_context(&iommu_pmu->pmu, cpu, target); + iommu_pmu->cpu = target; return 0; } static int nr_iommu_pmu; +static enum cpuhp_state iommu_cpuhp_slot; static int iommu_pmu_cpuhp_setup(struct iommu_pmu *iommu_pmu) { int ret; - if (nr_iommu_pmu++) - return 0; + if (!nr_iommu_pmu) { + ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, + "driver/iommu/intel/perfmon:online", + iommu_pmu_cpu_online, + iommu_pmu_cpu_offline); + if (ret < 0) + return ret; + iommu_cpuhp_slot = ret; + } - ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_IOMMU_PERF_ONLINE, - "driver/iommu/intel/perfmon:online", - iommu_pmu_cpu_online, - iommu_pmu_cpu_offline); - if (ret) - nr_iommu_pmu = 0; + ret = cpuhp_state_add_instance(iommu_cpuhp_slot, &iommu_pmu->cpuhp_node); + if (ret) { + if (!nr_iommu_pmu) + cpuhp_remove_multi_state(iommu_cpuhp_slot); + return ret; + } + nr_iommu_pmu++; - return ret; + return 0; } static void iommu_pmu_cpuhp_free(struct iommu_pmu *iommu_pmu) { + cpuhp_state_remove_instance(iommu_cpuhp_slot, &iommu_pmu->cpuhp_node); + if (--nr_iommu_pmu) return; - cpuhp_remove_state(CPUHP_AP_PERF_X86_IOMMU_PERF_ONLINE); + cpuhp_remove_multi_state(iommu_cpuhp_slot); } void iommu_pmu_register(struct intel_iommu *iommu) diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index c6fab004104a..5b2f8147d1ae 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -218,7 +218,6 @@ enum cpuhp_state { CPUHP_AP_PERF_X86_CQM_ONLINE, CPUHP_AP_PERF_X86_CSTATE_ONLINE, CPUHP_AP_PERF_X86_IDXD_ONLINE, - CPUHP_AP_PERF_X86_IOMMU_PERF_ONLINE, CPUHP_AP_PERF_S390_CF_ONLINE, CPUHP_AP_PERF_S390_SF_ONLINE, CPUHP_AP_PERF_ARM_CCI_ONLINE,