From bd26cd9d815acba56204c8b2af45af96d221c962 Mon Sep 17 00:00:00 2001 From: "Vineeth Pillai (Google)" Date: Mon, 14 Jul 2025 12:50:18 +0800 Subject: [PATCH 01/13] iommu/vt-d: Remove the CONFIG_X86 wrapping from iommu init hook iommu init hook is wrapped in CONFI_X86 and is a remnant of dmar.c when it was a common code in "drivers/pci/dmar.c". This was added in commit (9d5ce73a64be2 x86: intel-iommu: Convert detect_intel_iommu to use iommu_init hook) Now this is built only for x86. This config wrap could be removed. Signed-off-by: Vineeth Pillai (Google) Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20250616131740.3499289-1-vineeth@bitbyteword.org Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20250714045028.958850-2-baolu.lu@linux.intel.com Signed-off-by: Will Deacon --- drivers/iommu/intel/dmar.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c index b61d9ea27aa9..ec975c73cfe6 100644 --- a/drivers/iommu/intel/dmar.c +++ b/drivers/iommu/intel/dmar.c @@ -935,14 +935,11 @@ void __init detect_intel_iommu(void) pci_request_acs(); } -#ifdef CONFIG_X86 if (!ret) { x86_init.iommu.iommu_init = intel_iommu_init; x86_platform.iommu_shutdown = intel_iommu_shutdown; } -#endif - if (dmar_tbl) { acpi_put_table(dmar_tbl); dmar_tbl = NULL; From 12724ce3fe1a3d8f30d56e48b4f272d8860d1970 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 14 Jul 2025 12:50:19 +0800 Subject: [PATCH 02/13] iommu/vt-d: Optimize iotlb_sync_map for non-caching/non-RWBF modes The iotlb_sync_map iommu ops allows drivers to perform necessary cache flushes when new mappings are established. For the Intel iommu driver, this callback specifically serves two purposes: - To flush caches when a second-stage page table is attached to a device whose iommu is operating in caching mode (CAP_REG.CM==1). - To explicitly flush internal write buffers to ensure updates to memory- resident remapping structures are visible to hardware (CAP_REG.RWBF==1). However, in scenarios where neither caching mode nor the RWBF flag is active, the cache_tag_flush_range_np() helper, which is called in the iotlb_sync_map path, effectively becomes a no-op. Despite being a no-op, cache_tag_flush_range_np() involves iterating through all cache tags of the iommu's attached to the domain, protected by a spinlock. This unnecessary execution path introduces overhead, leading to a measurable I/O performance regression. On systems with NVMes under the same bridge, performance was observed to drop from approximately ~6150 MiB/s down to ~4985 MiB/s. Introduce a flag in the dmar_domain structure. This flag will only be set when iotlb_sync_map is required (i.e., when CM or RWBF is set). The cache_tag_flush_range_np() is called only for domains where this flag is set. This flag, once set, is immutable, given that there won't be mixed configurations in real-world scenarios where some IOMMUs in a system operate in caching mode while others do not. Theoretically, the immutability of this flag does not impact functionality. Reported-by: Ioanna Alifieraki Closes: https://bugs.launchpad.net/ubuntu/+source/linux/+bug/2115738 Link: https://lore.kernel.org/r/20250701171154.52435-1-ioanna-maria.alifieraki@canonical.com Fixes: 129dab6e1286 ("iommu/vt-d: Use cache_tag_flush_range_np() in iotlb_sync_map") Cc: stable@vger.kernel.org Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20250703031545.3378602-1-baolu.lu@linux.intel.com Link: https://lore.kernel.org/r/20250714045028.958850-3-baolu.lu@linux.intel.com Signed-off-by: Will Deacon --- drivers/iommu/intel/iommu.c | 19 ++++++++++++++++++- drivers/iommu/intel/iommu.h | 3 +++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 7aa3932251b2..f60201ee4be0 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1796,6 +1796,18 @@ static int domain_setup_first_level(struct intel_iommu *iommu, (pgd_t *)pgd, flags, old); } +static bool domain_need_iotlb_sync_map(struct dmar_domain *domain, + struct intel_iommu *iommu) +{ + if (cap_caching_mode(iommu->cap) && !domain->use_first_level) + return true; + + if (rwbf_quirk || cap_rwbf(iommu->cap)) + return true; + + return false; +} + static int dmar_domain_attach_device(struct dmar_domain *domain, struct device *dev) { @@ -1833,6 +1845,8 @@ static int dmar_domain_attach_device(struct dmar_domain *domain, if (ret) goto out_block_translation; + domain->iotlb_sync_map |= domain_need_iotlb_sync_map(domain, iommu); + return 0; out_block_translation: @@ -3945,7 +3959,10 @@ static bool risky_device(struct pci_dev *pdev) static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain, unsigned long iova, size_t size) { - cache_tag_flush_range_np(to_dmar_domain(domain), iova, iova + size - 1); + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + + if (dmar_domain->iotlb_sync_map) + cache_tag_flush_range_np(dmar_domain, iova, iova + size - 1); return 0; } diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index 3ddbcc603de2..7ab2c34a5ecc 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -614,6 +614,9 @@ struct dmar_domain { u8 has_mappings:1; /* Has mappings configured through * iommu_map() interface. */ + u8 iotlb_sync_map:1; /* Need to flush IOTLB cache or write + * buffer when creating mappings. + */ spinlock_t lock; /* Protect device tracking lists */ struct list_head devices; /* all devices' list */ From cd0d0e4e48d817215695e1cc9114c6f614fb629f Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 14 Jul 2025 12:50:20 +0800 Subject: [PATCH 03/13] iommu/vt-d: Lift the __pa to domain_setup_first_level/intel_svm_set_dev_pasid() Pass the phys_addr_t down through the call chain from the top instead of passing a pgd_t * KVA. This moves the __pa() into domain_setup_first_level() which is the first function to obtain the pgd from the IOMMU page table in this call chain. The SVA flow is also adjusted to get the pa of the mm->pgd. iommput will move the __pa() into iommupt code, it never shares the KVA of the page table with the driver. Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/1-v3-dbbe6f7e7ae3+124ffe-vtd_prep_jgg@nvidia.com Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20250714045028.958850-4-baolu.lu@linux.intel.com Signed-off-by: Will Deacon --- drivers/iommu/intel/iommu.c | 15 +++++++-------- drivers/iommu/intel/iommu.h | 7 +++---- drivers/iommu/intel/pasid.c | 17 +++++++++-------- drivers/iommu/intel/pasid.h | 11 +++++------ drivers/iommu/intel/svm.c | 2 +- 5 files changed, 25 insertions(+), 27 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index f60201ee4be0..bb29c4a635ea 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1736,15 +1736,14 @@ static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 intel_context_flush_no_pasid(info, context, did); } -int __domain_setup_first_level(struct intel_iommu *iommu, - struct device *dev, ioasid_t pasid, - u16 did, pgd_t *pgd, int flags, - struct iommu_domain *old) +int __domain_setup_first_level(struct intel_iommu *iommu, struct device *dev, + ioasid_t pasid, u16 did, phys_addr_t fsptptr, + int flags, struct iommu_domain *old) { if (!old) - return intel_pasid_setup_first_level(iommu, dev, pgd, - pasid, did, flags); - return intel_pasid_replace_first_level(iommu, dev, pgd, pasid, did, + return intel_pasid_setup_first_level(iommu, dev, fsptptr, pasid, + did, flags); + return intel_pasid_replace_first_level(iommu, dev, fsptptr, pasid, did, iommu_domain_did(old, iommu), flags); } @@ -1793,7 +1792,7 @@ static int domain_setup_first_level(struct intel_iommu *iommu, return __domain_setup_first_level(iommu, dev, pasid, domain_id_iommu(domain, iommu), - (pgd_t *)pgd, flags, old); + __pa(pgd), flags, old); } static bool domain_need_iotlb_sync_map(struct dmar_domain *domain, diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index 7ab2c34a5ecc..30baee6f6519 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -1255,10 +1255,9 @@ domain_add_dev_pasid(struct iommu_domain *domain, void domain_remove_dev_pasid(struct iommu_domain *domain, struct device *dev, ioasid_t pasid); -int __domain_setup_first_level(struct intel_iommu *iommu, - struct device *dev, ioasid_t pasid, - u16 did, pgd_t *pgd, int flags, - struct iommu_domain *old); +int __domain_setup_first_level(struct intel_iommu *iommu, struct device *dev, + ioasid_t pasid, u16 did, phys_addr_t fsptptr, + int flags, struct iommu_domain *old); int dmar_ir_support(void); diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index ac67a056b6c8..52f678975da7 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -348,14 +348,15 @@ static void intel_pasid_flush_present(struct intel_iommu *iommu, */ static void pasid_pte_config_first_level(struct intel_iommu *iommu, struct pasid_entry *pte, - pgd_t *pgd, u16 did, int flags) + phys_addr_t fsptptr, u16 did, + int flags) { lockdep_assert_held(&iommu->lock); pasid_clear_entry(pte); /* Setup the first level page table pointer: */ - pasid_set_flptr(pte, (u64)__pa(pgd)); + pasid_set_flptr(pte, fsptptr); if (flags & PASID_FLAG_FL5LP) pasid_set_flpm(pte, 1); @@ -372,9 +373,9 @@ static void pasid_pte_config_first_level(struct intel_iommu *iommu, pasid_set_present(pte); } -int intel_pasid_setup_first_level(struct intel_iommu *iommu, - struct device *dev, pgd_t *pgd, - u32 pasid, u16 did, int flags) +int intel_pasid_setup_first_level(struct intel_iommu *iommu, struct device *dev, + phys_addr_t fsptptr, u32 pasid, u16 did, + int flags) { struct pasid_entry *pte; @@ -402,7 +403,7 @@ int intel_pasid_setup_first_level(struct intel_iommu *iommu, return -EBUSY; } - pasid_pte_config_first_level(iommu, pte, pgd, did, flags); + pasid_pte_config_first_level(iommu, pte, fsptptr, did, flags); spin_unlock(&iommu->lock); @@ -412,7 +413,7 @@ int intel_pasid_setup_first_level(struct intel_iommu *iommu, } int intel_pasid_replace_first_level(struct intel_iommu *iommu, - struct device *dev, pgd_t *pgd, + struct device *dev, phys_addr_t fsptptr, u32 pasid, u16 did, u16 old_did, int flags) { @@ -430,7 +431,7 @@ int intel_pasid_replace_first_level(struct intel_iommu *iommu, return -EINVAL; } - pasid_pte_config_first_level(iommu, &new_pte, pgd, did, flags); + pasid_pte_config_first_level(iommu, &new_pte, fsptptr, did, flags); spin_lock(&iommu->lock); pte = intel_pasid_get_entry(dev, pasid); diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h index fd0fd1a0df84..a771a77d4239 100644 --- a/drivers/iommu/intel/pasid.h +++ b/drivers/iommu/intel/pasid.h @@ -288,9 +288,9 @@ extern unsigned int intel_pasid_max_id; int intel_pasid_alloc_table(struct device *dev); void intel_pasid_free_table(struct device *dev); struct pasid_table *intel_pasid_get_table(struct device *dev); -int intel_pasid_setup_first_level(struct intel_iommu *iommu, - struct device *dev, pgd_t *pgd, - u32 pasid, u16 did, int flags); +int intel_pasid_setup_first_level(struct intel_iommu *iommu, struct device *dev, + phys_addr_t fsptptr, u32 pasid, u16 did, + int flags); int intel_pasid_setup_second_level(struct intel_iommu *iommu, struct dmar_domain *domain, struct device *dev, u32 pasid); @@ -302,9 +302,8 @@ int intel_pasid_setup_pass_through(struct intel_iommu *iommu, int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev, u32 pasid, struct dmar_domain *domain); int intel_pasid_replace_first_level(struct intel_iommu *iommu, - struct device *dev, pgd_t *pgd, - u32 pasid, u16 did, u16 old_did, - int flags); + struct device *dev, phys_addr_t fsptptr, + u32 pasid, u16 did, u16 old_did, int flags); int intel_pasid_replace_second_level(struct intel_iommu *iommu, struct dmar_domain *domain, struct device *dev, u16 old_did, diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index f3da596410b5..8c0bed36c587 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -171,7 +171,7 @@ static int intel_svm_set_dev_pasid(struct iommu_domain *domain, /* Setup the pasid table: */ sflags = cpu_feature_enabled(X86_FEATURE_LA57) ? PASID_FLAG_FL5LP : 0; ret = __domain_setup_first_level(iommu, dev, pasid, - FLPT_DEFAULT_DID, mm->pgd, + FLPT_DEFAULT_DID, __pa(mm->pgd), sflags, old); if (ret) goto out_unwind_iopf; From 00939bebe51c23d325c9796ad57ace49833a5813 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 14 Jul 2025 12:50:21 +0800 Subject: [PATCH 04/13] iommu/vt-d: Fold domain_exit() into intel_iommu_domain_free() It has only one caller, no need for two functions. Correct the WARN_ON() error handling to leak the entire page table if the HW is still referencing it so we don't UAF during WARN_ON recovery. Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/2-v3-dbbe6f7e7ae3+124ffe-vtd_prep_jgg@nvidia.com Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20250714045028.958850-5-baolu.lu@linux.intel.com Signed-off-by: Will Deacon --- drivers/iommu/intel/iommu.c | 38 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index bb29c4a635ea..8521566ccf9b 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1396,23 +1396,6 @@ void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) } } -static void domain_exit(struct dmar_domain *domain) -{ - if (domain->pgd) { - struct iommu_pages_list freelist = - IOMMU_PAGES_LIST_INIT(freelist); - - domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist); - iommu_put_pages_list(&freelist); - } - - if (WARN_ON(!list_empty(&domain->devices))) - return; - - kfree(domain->qi_batch); - kfree(domain); -} - /* * For kdump cases, old valid entries may be cached due to the * in-flight DMA and copied pgtable, but there is no unmapping @@ -3406,9 +3389,24 @@ static void intel_iommu_domain_free(struct iommu_domain *domain) { struct dmar_domain *dmar_domain = to_dmar_domain(domain); - WARN_ON(dmar_domain->nested_parent && - !list_empty(&dmar_domain->s1_domains)); - domain_exit(dmar_domain); + if (WARN_ON(dmar_domain->nested_parent && + !list_empty(&dmar_domain->s1_domains))) + return; + + if (WARN_ON(!list_empty(&dmar_domain->devices))) + return; + + if (dmar_domain->pgd) { + struct iommu_pages_list freelist = + IOMMU_PAGES_LIST_INIT(freelist); + + domain_unmap(dmar_domain, 0, DOMAIN_MAX_PFN(dmar_domain->gaw), + &freelist); + iommu_put_pages_list(&freelist); + } + + kfree(dmar_domain->qi_batch); + kfree(dmar_domain); } int paging_domain_compatible(struct iommu_domain *domain, struct device *dev) From 5c3687d5789cfff8d285a2c76bceb47f145bf01f Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 14 Jul 2025 12:50:22 +0800 Subject: [PATCH 05/13] iommu/vt-d: Do not wipe out the page table NID when devices detach The NID is used to control which NUMA node memory for the page table is allocated it from. It should be a permanent property of the page table when it was allocated and not change during attach/detach of devices. Reviewed-by: Wei Wang Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/3-v3-dbbe6f7e7ae3+124ffe-vtd_prep_jgg@nvidia.com Signed-off-by: Lu Baolu Fixes: 7c204426b818 ("iommu/vt-d: Add domain_alloc_paging support") Link: https://lore.kernel.org/r/20250714045028.958850-6-baolu.lu@linux.intel.com Signed-off-by: Will Deacon --- drivers/iommu/intel/iommu.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 8521566ccf9b..8c99f8bf2065 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1391,7 +1391,6 @@ void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) if (--info->refcnt == 0) { ida_free(&iommu->domain_ida, info->did); xa_erase(&domain->iommu_array, iommu->seq_id); - domain->nid = NUMA_NO_NODE; kfree(info); } } From b9434ba97c44f5744ea537adfd1f9f3fe102681c Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 14 Jul 2025 12:50:23 +0800 Subject: [PATCH 06/13] iommu/vt-d: Split intel_iommu_domain_alloc_paging_flags() Create stage specific functions that check the stage specific conditions if each stage can be supported. Have intel_iommu_domain_alloc_paging_flags() call both stages in sequence until one does not return EOPNOTSUPP and prefer to use the first stage if available and suitable for the requested flags. Move second stage only operations like nested_parent and dirty_tracking into the second stage function for clarity. Move initialization of the iommu_domain members into paging_domain_alloc(). Drop initialization of domain->owner as the callers all do it. Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/4-v3-dbbe6f7e7ae3+124ffe-vtd_prep_jgg@nvidia.com Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20250714045028.958850-7-baolu.lu@linux.intel.com Signed-off-by: Will Deacon --- drivers/iommu/intel/iommu.c | 98 +++++++++++++++++++++---------------- 1 file changed, 57 insertions(+), 41 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 8c99f8bf2065..553f0e32be9a 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -3281,10 +3281,15 @@ static struct dmar_domain *paging_domain_alloc(struct device *dev, bool first_st spin_lock_init(&domain->lock); spin_lock_init(&domain->cache_lock); xa_init(&domain->iommu_array); + INIT_LIST_HEAD(&domain->s1_domains); + spin_lock_init(&domain->s1_lock); domain->nid = dev_to_node(dev); domain->use_first_level = first_stage; + domain->domain.type = IOMMU_DOMAIN_UNMANAGED; + domain->domain.ops = intel_iommu_ops.default_domain_ops; + /* calculate the address width */ addr_width = agaw_to_width(iommu->agaw); if (addr_width > cap_mgaw(iommu->cap)) @@ -3326,62 +3331,73 @@ static struct dmar_domain *paging_domain_alloc(struct device *dev, bool first_st } static struct iommu_domain * -intel_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags, - const struct iommu_user_data *user_data) +intel_iommu_domain_alloc_first_stage(struct device *dev, + struct intel_iommu *iommu, u32 flags) +{ + struct dmar_domain *dmar_domain; + + if (flags & ~IOMMU_HWPT_ALLOC_PASID) + return ERR_PTR(-EOPNOTSUPP); + + /* Only SL is available in legacy mode */ + if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) + return ERR_PTR(-EOPNOTSUPP); + + dmar_domain = paging_domain_alloc(dev, true); + if (IS_ERR(dmar_domain)) + return ERR_CAST(dmar_domain); + return &dmar_domain->domain; +} + +static struct iommu_domain * +intel_iommu_domain_alloc_second_stage(struct device *dev, + struct intel_iommu *iommu, u32 flags) { - struct device_domain_info *info = dev_iommu_priv_get(dev); - bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; - bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT; - struct intel_iommu *iommu = info->iommu; struct dmar_domain *dmar_domain; - struct iommu_domain *domain; - bool first_stage; if (flags & (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING | IOMMU_HWPT_ALLOC_PASID))) return ERR_PTR(-EOPNOTSUPP); - if (nested_parent && !nested_supported(iommu)) - return ERR_PTR(-EOPNOTSUPP); - if (user_data || (dirty_tracking && !ssads_supported(iommu))) + + if (((flags & IOMMU_HWPT_ALLOC_NEST_PARENT) && + !nested_supported(iommu)) || + ((flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) && + !ssads_supported(iommu))) return ERR_PTR(-EOPNOTSUPP); - /* - * Always allocate the guest compatible page table unless - * IOMMU_HWPT_ALLOC_NEST_PARENT or IOMMU_HWPT_ALLOC_DIRTY_TRACKING - * is specified. - */ - if (nested_parent || dirty_tracking) { - if (!sm_supported(iommu) || !ecap_slts(iommu->ecap)) - return ERR_PTR(-EOPNOTSUPP); - first_stage = false; - } else { - first_stage = first_level_by_default(iommu); - } + /* Legacy mode always supports second stage */ + if (sm_supported(iommu) && !ecap_slts(iommu->ecap)) + return ERR_PTR(-EOPNOTSUPP); - dmar_domain = paging_domain_alloc(dev, first_stage); + dmar_domain = paging_domain_alloc(dev, false); if (IS_ERR(dmar_domain)) return ERR_CAST(dmar_domain); - domain = &dmar_domain->domain; - domain->type = IOMMU_DOMAIN_UNMANAGED; - domain->owner = &intel_iommu_ops; - domain->ops = intel_iommu_ops.default_domain_ops; - if (nested_parent) { - dmar_domain->nested_parent = true; - INIT_LIST_HEAD(&dmar_domain->s1_domains); - spin_lock_init(&dmar_domain->s1_lock); - } + dmar_domain->nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT; - if (dirty_tracking) { - if (dmar_domain->use_first_level) { - iommu_domain_free(domain); - return ERR_PTR(-EOPNOTSUPP); - } - domain->dirty_ops = &intel_dirty_ops; - } + if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) + dmar_domain->domain.dirty_ops = &intel_dirty_ops; - return domain; + return &dmar_domain->domain; +} + +static struct iommu_domain * +intel_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags, + const struct iommu_user_data *user_data) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + struct iommu_domain *domain; + + if (user_data) + return ERR_PTR(-EOPNOTSUPP); + + /* Prefer first stage if possible by default. */ + domain = intel_iommu_domain_alloc_first_stage(dev, iommu, flags); + if (domain != ERR_PTR(-EOPNOTSUPP)) + return domain; + return intel_iommu_domain_alloc_second_stage(dev, iommu, flags); } static void intel_iommu_domain_free(struct iommu_domain *domain) From b33125296b5047115469b8a3b74c0fdbf4976548 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 14 Jul 2025 12:50:24 +0800 Subject: [PATCH 07/13] iommu/vt-d: Create unique domain ops for each stage Use the domain ops pointer to tell what kind of domain it is instead of the internal use_first_level indication. This also protects against wrongly using a SVA/nested/IDENTITY/BLOCKED domain type in places they should not be. The only remaining uses of use_first_level outside the paging domain are in paging_domain_compatible() and intel_iommu_enforce_cache_coherency(). Thus, remove the useless sets of use_first_level in intel_svm_domain_alloc() and intel_iommu_domain_alloc_nested(). None of the unique ops for these domain types ever reference it on their call chains. Add a WARN_ON() check in domain_context_mapping_one() as it only works with second stage. This is preparation for iommupt which will have different ops for each of the stages. Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/5-v3-dbbe6f7e7ae3+124ffe-vtd_prep_jgg@nvidia.com Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20250714045028.958850-8-baolu.lu@linux.intel.com Signed-off-by: Will Deacon --- drivers/iommu/intel/cache.c | 5 +-- drivers/iommu/intel/iommu.c | 60 +++++++++++++++++++++++++----------- drivers/iommu/intel/iommu.h | 12 ++++++++ drivers/iommu/intel/nested.c | 4 +-- drivers/iommu/intel/svm.c | 1 - 5 files changed, 58 insertions(+), 24 deletions(-) diff --git a/drivers/iommu/intel/cache.c b/drivers/iommu/intel/cache.c index fc35cba59145..f8bcc5f67463 100644 --- a/drivers/iommu/intel/cache.c +++ b/drivers/iommu/intel/cache.c @@ -371,7 +371,7 @@ static void cache_tag_flush_iotlb(struct dmar_domain *domain, struct cache_tag * struct intel_iommu *iommu = tag->iommu; u64 type = DMA_TLB_PSI_FLUSH; - if (domain->use_first_level) { + if (intel_domain_is_fs_paging(domain)) { qi_batch_add_piotlb(iommu, tag->domain_id, tag->pasid, addr, pages, ih, domain->qi_batch); return; @@ -546,7 +546,8 @@ void cache_tag_flush_range_np(struct dmar_domain *domain, unsigned long start, qi_batch_flush_descs(iommu, domain->qi_batch); iommu = tag->iommu; - if (!cap_caching_mode(iommu->cap) || domain->use_first_level) { + if (!cap_caching_mode(iommu->cap) || + intel_domain_is_fs_paging(domain)) { iommu_flush_write_buffer(iommu); continue; } diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 553f0e32be9a..940d1c6c2248 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1462,6 +1462,9 @@ static int domain_context_mapping_one(struct dmar_domain *domain, struct context_entry *context; int ret; + if (WARN_ON(!intel_domain_is_ss_paging(domain))) + return -EINVAL; + pr_debug("Set context mapping for %02x:%02x.%d\n", bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); @@ -1780,7 +1783,7 @@ static int domain_setup_first_level(struct intel_iommu *iommu, static bool domain_need_iotlb_sync_map(struct dmar_domain *domain, struct intel_iommu *iommu) { - if (cap_caching_mode(iommu->cap) && !domain->use_first_level) + if (cap_caching_mode(iommu->cap) && intel_domain_is_ss_paging(domain)) return true; if (rwbf_quirk || cap_rwbf(iommu->cap)) @@ -1812,12 +1815,14 @@ static int dmar_domain_attach_device(struct dmar_domain *domain, if (!sm_supported(iommu)) ret = domain_context_mapping(domain, dev); - else if (domain->use_first_level) + else if (intel_domain_is_fs_paging(domain)) ret = domain_setup_first_level(iommu, domain, dev, IOMMU_NO_PASID, NULL); - else + else if (intel_domain_is_ss_paging(domain)) ret = domain_setup_second_level(iommu, domain, dev, IOMMU_NO_PASID, NULL); + else if (WARN_ON(true)) + ret = -EINVAL; if (ret) goto out_block_translation; @@ -3288,7 +3293,6 @@ static struct dmar_domain *paging_domain_alloc(struct device *dev, bool first_st domain->use_first_level = first_stage; domain->domain.type = IOMMU_DOMAIN_UNMANAGED; - domain->domain.ops = intel_iommu_ops.default_domain_ops; /* calculate the address width */ addr_width = agaw_to_width(iommu->agaw); @@ -3346,6 +3350,8 @@ intel_iommu_domain_alloc_first_stage(struct device *dev, dmar_domain = paging_domain_alloc(dev, true); if (IS_ERR(dmar_domain)) return ERR_CAST(dmar_domain); + + dmar_domain->domain.ops = &intel_fs_paging_domain_ops; return &dmar_domain->domain; } @@ -3374,6 +3380,7 @@ intel_iommu_domain_alloc_second_stage(struct device *dev, if (IS_ERR(dmar_domain)) return ERR_CAST(dmar_domain); + dmar_domain->domain.ops = &intel_ss_paging_domain_ops; dmar_domain->nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT; if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) @@ -4098,12 +4105,15 @@ static int intel_iommu_set_dev_pasid(struct iommu_domain *domain, if (ret) goto out_remove_dev_pasid; - if (dmar_domain->use_first_level) + if (intel_domain_is_fs_paging(dmar_domain)) ret = domain_setup_first_level(iommu, dmar_domain, dev, pasid, old); - else + else if (intel_domain_is_ss_paging(dmar_domain)) ret = domain_setup_second_level(iommu, dmar_domain, dev, pasid, old); + else if (WARN_ON(true)) + ret = -EINVAL; + if (ret) goto out_unwind_iopf; @@ -4378,6 +4388,32 @@ static struct iommu_domain identity_domain = { }, }; +const struct iommu_domain_ops intel_fs_paging_domain_ops = { + .attach_dev = intel_iommu_attach_device, + .set_dev_pasid = intel_iommu_set_dev_pasid, + .map_pages = intel_iommu_map_pages, + .unmap_pages = intel_iommu_unmap_pages, + .iotlb_sync_map = intel_iommu_iotlb_sync_map, + .flush_iotlb_all = intel_flush_iotlb_all, + .iotlb_sync = intel_iommu_tlb_sync, + .iova_to_phys = intel_iommu_iova_to_phys, + .free = intel_iommu_domain_free, + .enforce_cache_coherency = intel_iommu_enforce_cache_coherency, +}; + +const struct iommu_domain_ops intel_ss_paging_domain_ops = { + .attach_dev = intel_iommu_attach_device, + .set_dev_pasid = intel_iommu_set_dev_pasid, + .map_pages = intel_iommu_map_pages, + .unmap_pages = intel_iommu_unmap_pages, + .iotlb_sync_map = intel_iommu_iotlb_sync_map, + .flush_iotlb_all = intel_flush_iotlb_all, + .iotlb_sync = intel_iommu_tlb_sync, + .iova_to_phys = intel_iommu_iova_to_phys, + .free = intel_iommu_domain_free, + .enforce_cache_coherency = intel_iommu_enforce_cache_coherency, +}; + const struct iommu_ops intel_iommu_ops = { .blocked_domain = &blocking_domain, .release_domain = &blocking_domain, @@ -4396,18 +4432,6 @@ const struct iommu_ops intel_iommu_ops = { .def_domain_type = device_def_domain_type, .pgsize_bitmap = SZ_4K, .page_response = intel_iommu_page_response, - .default_domain_ops = &(const struct iommu_domain_ops) { - .attach_dev = intel_iommu_attach_device, - .set_dev_pasid = intel_iommu_set_dev_pasid, - .map_pages = intel_iommu_map_pages, - .unmap_pages = intel_iommu_unmap_pages, - .iotlb_sync_map = intel_iommu_iotlb_sync_map, - .flush_iotlb_all = intel_flush_iotlb_all, - .iotlb_sync = intel_iommu_tlb_sync, - .iova_to_phys = intel_iommu_iova_to_phys, - .free = intel_iommu_domain_free, - .enforce_cache_coherency = intel_iommu_enforce_cache_coherency, - } }; static void quirk_iommu_igfx(struct pci_dev *dev) diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index 30baee6f6519..eac6fda8e59c 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -1378,6 +1378,18 @@ struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, u8 devfn, int alloc); extern const struct iommu_ops intel_iommu_ops; +extern const struct iommu_domain_ops intel_fs_paging_domain_ops; +extern const struct iommu_domain_ops intel_ss_paging_domain_ops; + +static inline bool intel_domain_is_fs_paging(struct dmar_domain *domain) +{ + return domain->domain.ops == &intel_fs_paging_domain_ops; +} + +static inline bool intel_domain_is_ss_paging(struct dmar_domain *domain) +{ + return domain->domain.ops == &intel_ss_paging_domain_ops; +} #ifdef CONFIG_INTEL_IOMMU extern int intel_iommu_sm; diff --git a/drivers/iommu/intel/nested.c b/drivers/iommu/intel/nested.c index fc312f649f9e..1b6ad9c900a5 100644 --- a/drivers/iommu/intel/nested.c +++ b/drivers/iommu/intel/nested.c @@ -216,8 +216,7 @@ intel_iommu_domain_alloc_nested(struct device *dev, struct iommu_domain *parent, /* Must be nested domain */ if (user_data->type != IOMMU_HWPT_DATA_VTD_S1) return ERR_PTR(-EOPNOTSUPP); - if (parent->ops != intel_iommu_ops.default_domain_ops || - !s2_domain->nested_parent) + if (!intel_domain_is_ss_paging(s2_domain) || !s2_domain->nested_parent) return ERR_PTR(-EINVAL); ret = iommu_copy_struct_from_user(&vtd, user_data, @@ -229,7 +228,6 @@ intel_iommu_domain_alloc_nested(struct device *dev, struct iommu_domain *parent, if (!domain) return ERR_PTR(-ENOMEM); - domain->use_first_level = true; domain->s2_domain = s2_domain; domain->s1_cfg = vtd; domain->domain.ops = &intel_nested_domain_ops; diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 8c0bed36c587..e147f71f91b7 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -214,7 +214,6 @@ struct iommu_domain *intel_svm_domain_alloc(struct device *dev, return ERR_PTR(-ENOMEM); domain->domain.ops = &intel_svm_domain_ops; - domain->use_first_level = true; INIT_LIST_HEAD(&domain->dev_pasids); INIT_LIST_HEAD(&domain->cache_tags); spin_lock_init(&domain->cache_lock); From 0fa6f0893466c9fc0a149d215cfcee760e41cb00 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 14 Jul 2025 12:50:25 +0800 Subject: [PATCH 08/13] iommu/vt-d: Split intel_iommu_enforce_cache_coherency() First Stage and Second Stage have very different ways to deny no-snoop. The first stage uses the PGSNP bit which is global per-PASID so enabling requires loading new PASID entries for all the attached devices. Second stage uses a bit per PTE, so enabling just requires telling future maps to set the bit. Since we now have two domain ops we can have two functions that can directly code their required actions instead of a bunch of logic dancing around use_first_level. Combine domain_set_force_snooping() into the new functions since they are the only caller. Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/6-v3-dbbe6f7e7ae3+124ffe-vtd_prep_jgg@nvidia.com Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20250714045028.958850-9-baolu.lu@linux.intel.com Signed-off-by: Will Deacon --- drivers/iommu/intel/iommu.c | 57 ++++++++++++++++++------------------- 1 file changed, 27 insertions(+), 30 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 940d1c6c2248..d2451a8ee6e7 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -3643,44 +3643,41 @@ static bool domain_support_force_snooping(struct dmar_domain *domain) return support; } -static void domain_set_force_snooping(struct dmar_domain *domain) -{ - struct device_domain_info *info; - - assert_spin_locked(&domain->lock); - /* - * Second level page table supports per-PTE snoop control. The - * iommu_map() interface will handle this by setting SNP bit. - */ - if (!domain->use_first_level) { - domain->set_pte_snp = true; - return; - } - - list_for_each_entry(info, &domain->devices, link) - intel_pasid_setup_page_snoop_control(info->iommu, info->dev, - IOMMU_NO_PASID); -} - -static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain) +static bool intel_iommu_enforce_cache_coherency_fs(struct iommu_domain *domain) { struct dmar_domain *dmar_domain = to_dmar_domain(domain); - unsigned long flags; + struct device_domain_info *info; + + guard(spinlock_irqsave)(&dmar_domain->lock); if (dmar_domain->force_snooping) return true; - spin_lock_irqsave(&dmar_domain->lock, flags); - if (!domain_support_force_snooping(dmar_domain) || - (!dmar_domain->use_first_level && dmar_domain->has_mappings)) { - spin_unlock_irqrestore(&dmar_domain->lock, flags); + if (!domain_support_force_snooping(dmar_domain)) return false; - } - domain_set_force_snooping(dmar_domain); dmar_domain->force_snooping = true; - spin_unlock_irqrestore(&dmar_domain->lock, flags); + list_for_each_entry(info, &dmar_domain->devices, link) + intel_pasid_setup_page_snoop_control(info->iommu, info->dev, + IOMMU_NO_PASID); + return true; +} +static bool intel_iommu_enforce_cache_coherency_ss(struct iommu_domain *domain) +{ + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + + guard(spinlock_irqsave)(&dmar_domain->lock); + if (!domain_support_force_snooping(dmar_domain) || + dmar_domain->has_mappings) + return false; + + /* + * Second level page table supports per-PTE snoop control. The + * iommu_map() interface will handle this by setting SNP bit. + */ + dmar_domain->set_pte_snp = true; + dmar_domain->force_snooping = true; return true; } @@ -4398,7 +4395,7 @@ const struct iommu_domain_ops intel_fs_paging_domain_ops = { .iotlb_sync = intel_iommu_tlb_sync, .iova_to_phys = intel_iommu_iova_to_phys, .free = intel_iommu_domain_free, - .enforce_cache_coherency = intel_iommu_enforce_cache_coherency, + .enforce_cache_coherency = intel_iommu_enforce_cache_coherency_fs, }; const struct iommu_domain_ops intel_ss_paging_domain_ops = { @@ -4411,7 +4408,7 @@ const struct iommu_domain_ops intel_ss_paging_domain_ops = { .iotlb_sync = intel_iommu_tlb_sync, .iova_to_phys = intel_iommu_iova_to_phys, .free = intel_iommu_domain_free, - .enforce_cache_coherency = intel_iommu_enforce_cache_coherency, + .enforce_cache_coherency = intel_iommu_enforce_cache_coherency_ss, }; const struct iommu_ops intel_iommu_ops = { From 85cfaacc99377a63e47412eeef66eff77197acea Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 14 Jul 2025 12:50:26 +0800 Subject: [PATCH 09/13] iommu/vt-d: Split paging_domain_compatible() Make First/Second stage specific functions that follow the same pattern in intel_iommu_domain_alloc_first/second_stage() for computing EOPNOTSUPP. This makes the code easier to understand as if we couldn't create a domain with the parameters for this IOMMU instance then we certainly are not compatible with it. Check superpage support directly against the per-stage cap bits and the pgsize_bitmap. Add a note that the force_snooping is read without locking. The locking needs to cover the compatible check and the add of the device to the list. Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/7-v3-dbbe6f7e7ae3+124ffe-vtd_prep_jgg@nvidia.com Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20250714045028.958850-10-baolu.lu@linux.intel.com Signed-off-by: Will Deacon --- drivers/iommu/intel/iommu.c | 66 ++++++++++++++++++++++++++++++------- 1 file changed, 54 insertions(+), 12 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index d2451a8ee6e7..3e774e3bb735 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -3431,33 +3431,75 @@ static void intel_iommu_domain_free(struct iommu_domain *domain) kfree(dmar_domain); } +static int paging_domain_compatible_first_stage(struct dmar_domain *dmar_domain, + struct intel_iommu *iommu) +{ + if (WARN_ON(dmar_domain->domain.dirty_ops || + dmar_domain->nested_parent)) + return -EINVAL; + + /* Only SL is available in legacy mode */ + if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) + return -EINVAL; + + /* Same page size support */ + if (!cap_fl1gp_support(iommu->cap) && + (dmar_domain->domain.pgsize_bitmap & SZ_1G)) + return -EINVAL; + return 0; +} + +static int +paging_domain_compatible_second_stage(struct dmar_domain *dmar_domain, + struct intel_iommu *iommu) +{ + unsigned int sslps = cap_super_page_val(iommu->cap); + + if (dmar_domain->domain.dirty_ops && !ssads_supported(iommu)) + return -EINVAL; + if (dmar_domain->nested_parent && !nested_supported(iommu)) + return -EINVAL; + + /* Legacy mode always supports second stage */ + if (sm_supported(iommu) && !ecap_slts(iommu->ecap)) + return -EINVAL; + + /* Same page size support */ + if (!(sslps & BIT(0)) && (dmar_domain->domain.pgsize_bitmap & SZ_2M)) + return -EINVAL; + if (!(sslps & BIT(1)) && (dmar_domain->domain.pgsize_bitmap & SZ_1G)) + return -EINVAL; + return 0; +} + int paging_domain_compatible(struct iommu_domain *domain, struct device *dev) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct dmar_domain *dmar_domain = to_dmar_domain(domain); struct intel_iommu *iommu = info->iommu; + int ret = -EINVAL; int addr_width; - if (WARN_ON_ONCE(!(domain->type & __IOMMU_DOMAIN_PAGING))) - return -EPERM; + if (intel_domain_is_fs_paging(dmar_domain)) + ret = paging_domain_compatible_first_stage(dmar_domain, iommu); + else if (intel_domain_is_ss_paging(dmar_domain)) + ret = paging_domain_compatible_second_stage(dmar_domain, iommu); + else if (WARN_ON(true)) + ret = -EINVAL; + if (ret) + return ret; + /* + * FIXME this is locked wrong, it needs to be under the + * dmar_domain->lock + */ if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap)) return -EINVAL; - if (domain->dirty_ops && !ssads_supported(iommu)) - return -EINVAL; - if (dmar_domain->iommu_coherency != iommu_paging_structure_coherency(iommu)) return -EINVAL; - if (dmar_domain->iommu_superpage != - iommu_superpage_capability(iommu, dmar_domain->use_first_level)) - return -EINVAL; - - if (dmar_domain->use_first_level && - (!sm_supported(iommu) || !ecap_flts(iommu->ecap))) - return -EINVAL; /* check if this iommu agaw is sufficient for max mapped address */ addr_width = agaw_to_width(iommu->agaw); From 3141153816bf4f0257747bd4dda176d38f1a9a49 Mon Sep 17 00:00:00 2001 From: Ethan Milon Date: Mon, 14 Jul 2025 12:50:27 +0800 Subject: [PATCH 10/13] iommu/vt-d: Fix missing PASID in dev TLB flush with cache_tag_flush_all The function cache_tag_flush_all() was originally implemented with incorrect device TLB invalidation logic that does not handle PASID, in commit c4d27ffaa8eb ("iommu/vt-d: Add cache tag invalidation helpers") This causes regressions where full address space TLB invalidations occur with a PASID attached, such as during transparent hugepage unmapping in SVA configurations or when calling iommu_flush_iotlb_all(). In these cases, the device receives a TLB invalidation that lacks PASID. This incorrect logic was later extracted into cache_tag_flush_devtlb_all(), in commit 3297d047cd7f ("iommu/vt-d: Refactor IOTLB and Dev-IOTLB flush for batching") The fix replaces the call to cache_tag_flush_devtlb_all() with cache_tag_flush_devtlb_psi(), which properly handles PASID. Fixes: 4f609dbff51b ("iommu/vt-d: Use cache helpers in arch_invalidate_secondary_tlbs") Fixes: 4e589a53685c ("iommu/vt-d: Use cache_tag_flush_all() in flush_iotlb_all") Signed-off-by: Ethan Milon Link: https://lore.kernel.org/r/20250708214821.30967-1-ethan.milon@eviden.com Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20250714045028.958850-11-baolu.lu@linux.intel.com Signed-off-by: Will Deacon --- drivers/iommu/intel/cache.c | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/drivers/iommu/intel/cache.c b/drivers/iommu/intel/cache.c index f8bcc5f67463..ff45c4c9609d 100644 --- a/drivers/iommu/intel/cache.c +++ b/drivers/iommu/intel/cache.c @@ -423,22 +423,6 @@ static void cache_tag_flush_devtlb_psi(struct dmar_domain *domain, struct cache_ domain->qi_batch); } -static void cache_tag_flush_devtlb_all(struct dmar_domain *domain, struct cache_tag *tag) -{ - struct intel_iommu *iommu = tag->iommu; - struct device_domain_info *info; - u16 sid; - - info = dev_iommu_priv_get(tag->dev); - sid = PCI_DEVID(info->bus, info->devfn); - - qi_batch_add_dev_iotlb(iommu, sid, info->pfsid, info->ats_qdep, 0, - MAX_AGAW_PFN_WIDTH, domain->qi_batch); - if (info->dtlb_extra_inval) - qi_batch_add_dev_iotlb(iommu, sid, info->pfsid, info->ats_qdep, 0, - MAX_AGAW_PFN_WIDTH, domain->qi_batch); -} - /* * Invalidates a range of IOVA from @start (inclusive) to @end (inclusive) * when the memory mappings in the target domain have been modified. @@ -509,7 +493,7 @@ void cache_tag_flush_all(struct dmar_domain *domain) break; case CACHE_TAG_DEVTLB: case CACHE_TAG_NESTING_DEVTLB: - cache_tag_flush_devtlb_all(domain, tag); + cache_tag_flush_devtlb_psi(domain, tag, 0, MAX_AGAW_PFN_WIDTH); break; } From e934464e098ebfc212c72d3022f1d31b88929768 Mon Sep 17 00:00:00 2001 From: Ethan Milon Date: Mon, 14 Jul 2025 12:50:28 +0800 Subject: [PATCH 11/13] iommu/vt-d: Deduplicate cache_tag_flush_all by reusing flush_range The logic in cache_tag_flush_all() to iterate over cache tags and issue TLB invalidations is largely duplicated in cache_tag_flush_range(), with the only difference being the range parameters. Extend cache_tag_flush_range() to handle a full address space flush when called with start = 0 and end = ULONG_MAX. This allows cache_tag_flush_all() to simply delegate to cache_tag_flush_range() Signed-off-by: Ethan Milon Link: https://lore.kernel.org/r/20250708214821.30967-2-ethan.milon@eviden.com Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20250714045028.958850-12-baolu.lu@linux.intel.com Signed-off-by: Will Deacon --- drivers/iommu/intel/cache.c | 34 ++++++++-------------------------- drivers/iommu/intel/trace.h | 5 ----- 2 files changed, 8 insertions(+), 31 deletions(-) diff --git a/drivers/iommu/intel/cache.c b/drivers/iommu/intel/cache.c index ff45c4c9609d..a0111e455762 100644 --- a/drivers/iommu/intel/cache.c +++ b/drivers/iommu/intel/cache.c @@ -435,7 +435,13 @@ void cache_tag_flush_range(struct dmar_domain *domain, unsigned long start, struct cache_tag *tag; unsigned long flags; - addr = calculate_psi_aligned_address(start, end, &pages, &mask); + if (start == 0 && end == ULONG_MAX) { + addr = 0; + pages = -1; + mask = MAX_AGAW_PFN_WIDTH; + } else { + addr = calculate_psi_aligned_address(start, end, &pages, &mask); + } spin_lock_irqsave(&domain->cache_lock, flags); list_for_each_entry(tag, &domain->cache_tags, node) { @@ -476,31 +482,7 @@ void cache_tag_flush_range(struct dmar_domain *domain, unsigned long start, */ void cache_tag_flush_all(struct dmar_domain *domain) { - struct intel_iommu *iommu = NULL; - struct cache_tag *tag; - unsigned long flags; - - spin_lock_irqsave(&domain->cache_lock, flags); - list_for_each_entry(tag, &domain->cache_tags, node) { - if (iommu && iommu != tag->iommu) - qi_batch_flush_descs(iommu, domain->qi_batch); - iommu = tag->iommu; - - switch (tag->type) { - case CACHE_TAG_IOTLB: - case CACHE_TAG_NESTING_IOTLB: - cache_tag_flush_iotlb(domain, tag, 0, -1, 0, 0); - break; - case CACHE_TAG_DEVTLB: - case CACHE_TAG_NESTING_DEVTLB: - cache_tag_flush_devtlb_psi(domain, tag, 0, MAX_AGAW_PFN_WIDTH); - break; - } - - trace_cache_tag_flush_all(tag); - } - qi_batch_flush_descs(iommu, domain->qi_batch); - spin_unlock_irqrestore(&domain->cache_lock, flags); + cache_tag_flush_range(domain, 0, ULONG_MAX, 0); } /* diff --git a/drivers/iommu/intel/trace.h b/drivers/iommu/intel/trace.h index 9defdae6ebae..6311ba3f1691 100644 --- a/drivers/iommu/intel/trace.h +++ b/drivers/iommu/intel/trace.h @@ -130,11 +130,6 @@ DEFINE_EVENT(cache_tag_log, cache_tag_unassign, TP_ARGS(tag) ); -DEFINE_EVENT(cache_tag_log, cache_tag_flush_all, - TP_PROTO(struct cache_tag *tag), - TP_ARGS(tag) -); - DECLARE_EVENT_CLASS(cache_tag_flush, TP_PROTO(struct cache_tag *tag, unsigned long start, unsigned long end, unsigned long addr, unsigned long pages, unsigned long mask), From cee686775f9cd4eae31f3c1f7ec24b2048082667 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 21 Jul 2025 13:16:57 +0800 Subject: [PATCH 12/13] iommu/vt-d: Make iotlb_sync_map a static property of dmar_domain Commit 12724ce3fe1a ("iommu/vt-d: Optimize iotlb_sync_map for non-caching/non-RWBF modes") dynamically set iotlb_sync_map. This causes synchronization issues due to lack of locking on map and attach paths, racing iommufd userspace operations. Invalidation changes must precede device attachment to ensure all flushes complete before hardware walks page tables, preventing coherence issues. Make domain->iotlb_sync_map static, set once during domain allocation. If an IOMMU requires iotlb_sync_map but the domain lacks it, attach is rejected. This won't reduce domain sharing: RWBF and shadowing page table caching are legacy uses with legacy hardware. Mixed configs (some IOMMUs in caching mode, others not) are unlikely in real-world scenarios. Fixes: 12724ce3fe1a ("iommu/vt-d: Optimize iotlb_sync_map for non-caching/non-RWBF modes") Suggested-by: Jason Gunthorpe Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20250721051657.1695788-1-baolu.lu@linux.intel.com Signed-off-by: Will Deacon --- drivers/iommu/intel/iommu.c | 43 +++++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 3e774e3bb735..d1791b50f791 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -57,6 +57,8 @@ static void __init check_tylersburg_isoch(void); static int rwbf_quirk; +#define rwbf_required(iommu) (rwbf_quirk || cap_rwbf((iommu)->cap)) + /* * set to 1 to panic kernel if can't successfully enable VT-d * (used when kernel is launched w/ TXT) @@ -1780,18 +1782,6 @@ static int domain_setup_first_level(struct intel_iommu *iommu, __pa(pgd), flags, old); } -static bool domain_need_iotlb_sync_map(struct dmar_domain *domain, - struct intel_iommu *iommu) -{ - if (cap_caching_mode(iommu->cap) && intel_domain_is_ss_paging(domain)) - return true; - - if (rwbf_quirk || cap_rwbf(iommu->cap)) - return true; - - return false; -} - static int dmar_domain_attach_device(struct dmar_domain *domain, struct device *dev) { @@ -1831,8 +1821,6 @@ static int dmar_domain_attach_device(struct dmar_domain *domain, if (ret) goto out_block_translation; - domain->iotlb_sync_map |= domain_need_iotlb_sync_map(domain, iommu); - return 0; out_block_translation: @@ -3352,6 +3340,14 @@ intel_iommu_domain_alloc_first_stage(struct device *dev, return ERR_CAST(dmar_domain); dmar_domain->domain.ops = &intel_fs_paging_domain_ops; + /* + * iotlb sync for map is only needed for legacy implementations that + * explicitly require flushing internal write buffers to ensure memory + * coherence. + */ + if (rwbf_required(iommu)) + dmar_domain->iotlb_sync_map = true; + return &dmar_domain->domain; } @@ -3386,6 +3382,14 @@ intel_iommu_domain_alloc_second_stage(struct device *dev, if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) dmar_domain->domain.dirty_ops = &intel_dirty_ops; + /* + * Besides the internal write buffer flush, the caching mode used for + * legacy nested translation (which utilizes shadowing page tables) + * also requires iotlb sync on map. + */ + if (rwbf_required(iommu) || cap_caching_mode(iommu->cap)) + dmar_domain->iotlb_sync_map = true; + return &dmar_domain->domain; } @@ -3446,6 +3450,11 @@ static int paging_domain_compatible_first_stage(struct dmar_domain *dmar_domain, if (!cap_fl1gp_support(iommu->cap) && (dmar_domain->domain.pgsize_bitmap & SZ_1G)) return -EINVAL; + + /* iotlb sync on map requirement */ + if ((rwbf_required(iommu)) && !dmar_domain->iotlb_sync_map) + return -EINVAL; + return 0; } @@ -3469,6 +3478,12 @@ paging_domain_compatible_second_stage(struct dmar_domain *dmar_domain, return -EINVAL; if (!(sslps & BIT(1)) && (dmar_domain->domain.pgsize_bitmap & SZ_1G)) return -EINVAL; + + /* iotlb sync on map requirement */ + if ((rwbf_required(iommu) || cap_caching_mode(iommu->cap)) && + !dmar_domain->iotlb_sync_map) + return -EINVAL; + return 0; } From f0b9d31c6edd50a6207489cd1bd4ddac814b9cd2 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 23 Jul 2025 15:20:45 +0800 Subject: [PATCH 13/13] iommu/vt-d: Fix UAF on sva unbind with pending IOPFs Commit 17fce9d2336d ("iommu/vt-d: Put iopf enablement in domain attach path") disables IOPF on device by removing the device from its IOMMU's IOPF queue when the last IOPF-capable domain is detached from the device. Unfortunately, it did this in a wrong place where there are still pending IOPFs. As a result, a use-after-free error is potentially triggered and eventually a kernel panic with a kernel trace similar to the following: refcount_t: underflow; use-after-free. WARNING: CPU: 3 PID: 313 at lib/refcount.c:28 refcount_warn_saturate+0xd8/0xe0 Workqueue: iopf_queue/dmar0-iopfq iommu_sva_handle_iopf Call Trace: iopf_free_group+0xe/0x20 process_one_work+0x197/0x3d0 worker_thread+0x23a/0x350 ? rescuer_thread+0x4a0/0x4a0 kthread+0xf8/0x230 ? finish_task_switch.isra.0+0x81/0x260 ? kthreads_online_cpu+0x110/0x110 ? kthreads_online_cpu+0x110/0x110 ret_from_fork+0x13b/0x170 ? kthreads_online_cpu+0x110/0x110 ret_from_fork_asm+0x11/0x20 ---[ end trace 0000000000000000 ]--- The intel_pasid_tear_down_entry() function is responsible for blocking hardware from generating new page faults and flushing all in-flight ones. Therefore, moving iopf_for_domain_remove() after this function should resolve this. Fixes: 17fce9d2336d ("iommu/vt-d: Put iopf enablement in domain attach path") Reported-by: Ethan Milon Closes: https://lore.kernel.org/r/e8b37f3e-8539-40d4-8993-43a1f3ffe5aa@eviden.com Suggested-by: Ethan Milon Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20250723072045.1853328-1-baolu.lu@linux.intel.com Signed-off-by: Will Deacon --- drivers/iommu/intel/iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index d1791b50f791..a4f53b62dfbb 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4081,8 +4081,8 @@ static int blocking_domain_set_dev_pasid(struct iommu_domain *domain, { struct device_domain_info *info = dev_iommu_priv_get(dev); - iopf_for_domain_remove(old, dev); intel_pasid_tear_down_entry(info->iommu, dev, pasid, false); + iopf_for_domain_remove(old, dev); domain_remove_dev_pasid(old, dev, pasid); return 0;