From 8dfd3d8d74435344ee8dc9237596959c8b2a6cbe Mon Sep 17 00:00:00 2001 From: Eder Zulian Date: Fri, 10 Apr 2026 14:55:50 +0200 Subject: [PATCH 01/19] iommu/amd: Remove latent out-of-bounds access in IOMMU debugfs In iommu_mmio_write() and iommu_capability_write(), the variables dbg_mmio_offset and dbg_cap_offset are declared as int. However, they are populated using kstrtou32_from_user(). If a user provides a sufficiently large value, it can become a negative integer. Prior to this patch, the AMD IOMMU debugfs implementation was already protected by different mechanisms. 1. #define OFS_IN_SZ 8 ensures the user string <= 8 bytes, so e.g. 0xffffffff isn't a valid input. if (cnt > OFS_IN_SZ) return -EINVAL; 2. Implicit type promotion in iommu_mmio_write(), dbg_mmio_offset is int and iommu->mmio_phys_end is u64 if (dbg_mmio_offset > iommu->mmio_phys_end - sizeof(u64)) return -EINVAL; 3. The show handlers would currently catch the negative number and refuse to perform the read. Replace kstrtou32_from_user() with kstrtos32_from_user() to parse the input, and check for negative values to explicitly prevent out-of-bounds memory accesses directly in iommu_mmio_write() and iommu_capability_write(). Signed-off-by: Eder Zulian Fixes: 7a4ee419e8c1 ("iommu/amd: Add debugfs support to dump IOMMU MMIO registers") Cc: stable@vger.kernel.org Signed-off-by: Joerg Roedel --- drivers/iommu/amd/debugfs.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/amd/debugfs.c b/drivers/iommu/amd/debugfs.c index 4e66473d7cea..4c53b6361314 100644 --- a/drivers/iommu/amd/debugfs.c +++ b/drivers/iommu/amd/debugfs.c @@ -31,11 +31,12 @@ static ssize_t iommu_mmio_write(struct file *filp, const char __user *ubuf, if (cnt > OFS_IN_SZ) return -EINVAL; - ret = kstrtou32_from_user(ubuf, cnt, 0, &dbg_mmio_offset); + ret = kstrtos32_from_user(ubuf, cnt, 0, &dbg_mmio_offset); if (ret) return ret; - if (dbg_mmio_offset > iommu->mmio_phys_end - sizeof(u64)) + if (dbg_mmio_offset < 0 || dbg_mmio_offset > + iommu->mmio_phys_end - sizeof(u64)) return -EINVAL; iommu->dbg_mmio_offset = dbg_mmio_offset; @@ -71,12 +72,12 @@ static ssize_t iommu_capability_write(struct file *filp, const char __user *ubuf if (cnt > OFS_IN_SZ) return -EINVAL; - ret = kstrtou32_from_user(ubuf, cnt, 0, &dbg_cap_offset); + ret = kstrtos32_from_user(ubuf, cnt, 0, &dbg_cap_offset); if (ret) return ret; /* Capability register at offset 0x14 is the last IOMMU capability register. */ - if (dbg_cap_offset > 0x14) + if (dbg_cap_offset < 0 || dbg_cap_offset > 0x14) return -EINVAL; iommu->dbg_cap_offset = dbg_cap_offset; From 07d0f496fe7ec5abe3bee7e38be709521567bb33 Mon Sep 17 00:00:00 2001 From: "Jose Fernandez (Anthropic)" Date: Tue, 21 Apr 2026 19:26:13 +0000 Subject: [PATCH 02/19] iommu/amd: Bounds-check devid in __rlookup_amd_iommu() iommu_device_register() walks every device on the PCI bus via bus_for_each_dev() and calls amd_iommu_probe_device() for each. The inlined check_device() path computes the device's sbdf, calls rlookup_amd_iommu() to find the owning IOMMU, and only afterwards verifies devid <= pci_seg->last_bdf. __rlookup_amd_iommu() indexes rlookup_table[devid] with no bounds check of its own, so for a PCI device whose BDF is not described by the IVRS, the lookup reads past the end of the allocation before the caller's bounds check can run. This was harmless before commit e874c666b15b ("iommu/amd: Change rlookup, irq_lookup, and alias to use kvalloc()"): the table was a zeroed page-order allocation, so the over-read returned NULL and the caller's NULL check skipped the device. After that commit the table is a tight kvcalloc() and the over-read returns adjacent slab contents, which check_device() then dereferences as a struct amd_iommu *, causing a boot-time GPF. Seen on Google Compute Engine ct6e VMs, where the virtualized IVRS describes only the four TPU endpoints 00:04.0-07.0; the gVNIC at 00:08.0 (devid 0x40) indexes 56 bytes past the 456-byte allocation, into the adjacent kmalloc-512 slab object: pci 0000:00:04.0: Adding to iommu group 0 pci 0000:00:05.0: Adding to iommu group 1 pci 0000:00:06.0: Adding to iommu group 2 pci 0000:00:07.0: Adding to iommu group 3 Oops: general protection fault, probably for non-canonical address 0x3a64695f78746382: 0000 [#1] SMP NOPTI CPU: 0 UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.18.22 #1 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 12/06/2025 RIP: 0010:amd_iommu_probe_device+0x54/0x3a0 Call Trace: __iommu_probe_device+0x107/0x520 probe_iommu_group+0x29/0x50 bus_for_each_dev+0x7e/0xe0 iommu_device_register+0xc9/0x240 iommu_go_to_state+0x9c0/0x1c60 amd_iommu_init+0x14/0x40 pci_iommu_init+0x16/0x60 do_one_initcall+0x47/0x2f0 Guard the array access in __rlookup_amd_iommu(). With the fix applied on 6.18.22, the gVNIC at 00:08.0 is skipped cleanly and the VM boots. Fixes: e874c666b15b ("iommu/amd: Change rlookup, irq_lookup, and alias to use kvalloc()") Cc: stable@vger.kernel.org Reported-by: Ziyuan Chen Tested-by: Ziyuan Chen Reviewed-by: Josef Bacik Assisted-by: Claude:unspecified Signed-off-by: Jose Fernandez (Anthropic) Signed-off-by: Joerg Roedel --- drivers/iommu/amd/iommu.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index f78e23f03938..57dc8fabc7d9 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -351,8 +351,12 @@ static struct amd_iommu *__rlookup_amd_iommu(u16 seg, u16 devid) struct amd_iommu_pci_seg *pci_seg; for_each_pci_segment(pci_seg) { - if (pci_seg->id == seg) - return pci_seg->rlookup_table[devid]; + if (pci_seg->id != seg) + continue; + /* IVRS may not describe every device on the bus */ + if (devid > pci_seg->last_bdf) + return NULL; + return pci_seg->rlookup_table[devid]; } return NULL; } From d769711fcddd005f1e654b3bde547140917fe696 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Fri, 24 Apr 2026 18:15:20 -0700 Subject: [PATCH 03/19] iommu: Fix NULL group->domain dereference in pci_dev_reset_iommu_done() Local sashiko review pointed it out that group->domain could be NULL when a default domain fails to allocate during the first probe, which can crash at domain->ops->attach_dev dereference in __iommu_attach_device() invoked by pci_dev_reset_iommu_done(). pci_dev_reset_iommu_prepare() is fine as an old_domain pointer can be NULL. Skip the re-attach in pci_dev_reset_iommu_done() to fix the bug. Fixes: c279e83953d9 ("iommu: Introduce pci_dev_reset_iommu_prepare/done()") Cc: stable@vger.kernel.org Signed-off-by: Nicolin Chen Reviewed-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 61c12ba78206..b8847cc43e76 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -4073,8 +4073,13 @@ void pci_dev_reset_iommu_done(struct pci_dev *pdev) if (WARN_ON(!group->blocking_domain)) return; - /* Re-attach RID domain back to group->domain */ - if (group->domain != group->blocking_domain) { + /* + * Re-attach RID domain back to group->domain + * + * Leave the device parked in the blocking_domain if group->domain isn't + * initialized yet + */ + if (group->domain && group->domain != group->blocking_domain) { WARN_ON(__iommu_attach_device(group->domain, &pdev->dev, group->blocking_domain)); } From 834ab85aa96656f87bb215a8285d34af870f4b66 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Fri, 24 Apr 2026 18:15:21 -0700 Subject: [PATCH 04/19] iommu: Fix kdocs of pci_dev_reset_iommu_done() Remove the duplicated word. No functional change. Fixes: c279e83953d9 ("iommu: Introduce pci_dev_reset_iommu_prepare/done()") Reviewed-by: Shuai Xue Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Lu Baolu Signed-off-by: Nicolin Chen Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index b8847cc43e76..66659e5d1ec2 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -4045,9 +4045,9 @@ EXPORT_SYMBOL_GPL(pci_dev_reset_iommu_prepare); * @pdev: PCI device that has finished a reset routine * * After a PCIe device finishes a reset routine, it wants to restore its IOMMU - * IOMMU activity, including new translation as well as cache invalidation, by - * re-attaching all RID/PASID of the device's back to the domains retained in - * the core-level structure. + * activity, including new translation and cache invalidation, by re-attaching + * all RID/PASID of the device back to the domains retained in the core-level + * structure. * * Caller must pair it with a successful pci_dev_reset_iommu_prepare(). * From b296ca1fb43aa435edd86131f5230f70c03b2829 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Fri, 24 Apr 2026 18:15:22 -0700 Subject: [PATCH 05/19] iommu: Replace per-group resetting_domain with per-gdev blocked flag The core tracks device resetting states with a per-group resetting_domain, while a reset is actually per group-device. Such a mismatch might lead to confusion and even difficulty to untangle per-gdev handling requirement. Shuai found that cxl_reset_bus_function() calls pci_reset_bus_function() internally while both are calling pci_dev_reset_iommu_prepare/done(). And the solution requires the core to track at the group_device level as well. Introduce a 'blocked' flag to struct group_device, to allow a multi-device group to isolate concurrent device resets independently. As the reset routine is per gdev, it cannot clear group->resetting_domain without iterating over the device list to ensure no other device is being reset. Simplify it by replacing the resetting_domain with a 'recovery_cnt' in the struct iommu_group. No functional change. But this is essential to apply following bug fixes. Fixes: c279e83953d9 ("iommu: Introduce pci_dev_reset_iommu_prepare/done()") Cc: stable@vger.kernel.org Reported-by: Shuai Xue Closes: https://lore.kernel.org/all/absKsk7qQOwzhpzv@Asurada-Nvidia/ Reviewed-by: Shuai Xue Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Lu Baolu Signed-off-by: Nicolin Chen Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 102 ++++++++++++++++++++++++++++++++---------- 1 file changed, 78 insertions(+), 24 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 66659e5d1ec2..2515786d4156 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -62,14 +62,14 @@ struct iommu_group { int id; struct iommu_domain *default_domain; struct iommu_domain *blocking_domain; - /* - * During a group device reset, @resetting_domain points to the physical - * domain, while @domain points to the attached domain before the reset. - */ - struct iommu_domain *resetting_domain; struct iommu_domain *domain; struct list_head entry; unsigned int owner_cnt; + /* + * Number of devices in the group undergoing or awaiting recovery. + * If non-zero, concurrent domain attachments are rejected. + */ + unsigned int recovery_cnt; void *owner; }; @@ -77,12 +77,32 @@ struct group_device { struct list_head list; struct device *dev; char *name; + /* + * Device is blocked for a pending recovery while its group->domain is + * retained. This can happen when: + * - Device is undergoing a reset + */ + bool blocked; }; /* Iterate over each struct group_device in a struct iommu_group */ #define for_each_group_device(group, pos) \ list_for_each_entry(pos, &(group)->devices, list) +static struct group_device *__dev_to_gdev(struct device *dev) +{ + struct iommu_group *group = dev->iommu_group; + struct group_device *gdev; + + lockdep_assert_held(&group->mutex); + + for_each_group_device(group, gdev) { + if (gdev->dev == dev) + return gdev; + } + return NULL; +} + struct iommu_group_attribute { struct attribute attr; ssize_t (*show)(struct iommu_group *group, char *buf); @@ -2196,6 +2216,8 @@ EXPORT_SYMBOL_GPL(iommu_attach_device); int iommu_deferred_attach(struct device *dev, struct iommu_domain *domain) { + struct group_device *gdev; + /* * This is called on the dma mapping fast path so avoid locking. This is * racy, but we have an expectation that the driver will setup its DMAs @@ -2206,14 +2228,18 @@ int iommu_deferred_attach(struct device *dev, struct iommu_domain *domain) guard(mutex)(&dev->iommu_group->mutex); + gdev = __dev_to_gdev(dev); + if (WARN_ON(!gdev)) + return -ENODEV; + /* - * This is a concurrent attach during a device reset. Reject it until + * This is a concurrent attach during device recovery. Reject it until * pci_dev_reset_iommu_done() attaches the device to group->domain. * * Note that this might fail the iommu_dma_map(). But there's nothing * more we can do here. */ - if (dev->iommu_group->resetting_domain) + if (gdev->blocked) return -EBUSY; return __iommu_attach_device(domain, dev, NULL); } @@ -2270,19 +2296,24 @@ EXPORT_SYMBOL_GPL(iommu_get_domain_for_dev); struct iommu_domain *iommu_driver_get_domain_for_dev(struct device *dev) { struct iommu_group *group = dev->iommu_group; + struct group_device *gdev; lockdep_assert_held(&group->mutex); + gdev = __dev_to_gdev(dev); + if (WARN_ON(!gdev)) + return NULL; + /* * Driver handles the low-level __iommu_attach_device(), including the * one invoked by pci_dev_reset_iommu_done() re-attaching the device to * the cached group->domain. In this case, the driver must get the old - * domain from group->resetting_domain rather than group->domain. This + * domain from group->blocking_domain rather than group->domain. This * prevents it from re-attaching the device from group->domain (old) to * group->domain (new). */ - if (group->resetting_domain) - return group->resetting_domain; + if (gdev->blocked) + return group->blocking_domain; return group->domain; } @@ -2441,10 +2472,10 @@ static int __iommu_group_set_domain_internal(struct iommu_group *group, return -EINVAL; /* - * This is a concurrent attach during a device reset. Reject it until + * This is a concurrent attach during device recovery. Reject it until * pci_dev_reset_iommu_done() attaches the device to group->domain. */ - if (group->resetting_domain) + if (group->recovery_cnt) return -EBUSY; /* @@ -3615,10 +3646,10 @@ int iommu_attach_device_pasid(struct iommu_domain *domain, mutex_lock(&group->mutex); /* - * This is a concurrent attach during a device reset. Reject it until + * This is a concurrent attach during device recovery. Reject it until * pci_dev_reset_iommu_done() attaches the device to group->domain. */ - if (group->resetting_domain) { + if (group->recovery_cnt) { ret = -EBUSY; goto out_unlock; } @@ -3708,10 +3739,10 @@ int iommu_replace_device_pasid(struct iommu_domain *domain, mutex_lock(&group->mutex); /* - * This is a concurrent attach during a device reset. Reject it until + * This is a concurrent attach during device recovery. Reject it until * pci_dev_reset_iommu_done() attaches the device to group->domain. */ - if (group->resetting_domain) { + if (group->recovery_cnt) { ret = -EBUSY; goto out_unlock; } @@ -3982,12 +4013,12 @@ EXPORT_SYMBOL_NS_GPL(iommu_replace_group_handle, "IOMMUFD_INTERNAL"); * routine wants to block any IOMMU activity: translation and ATS invalidation. * * This function attaches the device's RID/PASID(s) the group->blocking_domain, - * setting the group->resetting_domain. This allows the IOMMU driver pausing any + * incrementing the group->recovery_cnt, to allow the IOMMU driver pausing any * IOMMU activity while leaving the group->domain pointer intact. Later when the * reset is finished, pci_dev_reset_iommu_done() can restore everything. * * Caller must use pci_dev_reset_iommu_prepare() with pci_dev_reset_iommu_done() - * before/after the core-level reset routine, to unset the resetting_domain. + * before/after the core-level reset routine, to decrement the recovery_cnt. * * Return: 0 on success or negative error code if the preparation failed. * @@ -4000,6 +4031,7 @@ EXPORT_SYMBOL_NS_GPL(iommu_replace_group_handle, "IOMMUFD_INTERNAL"); int pci_dev_reset_iommu_prepare(struct pci_dev *pdev) { struct iommu_group *group = pdev->dev.iommu_group; + struct group_device *gdev; unsigned long pasid; void *entry; int ret; @@ -4009,8 +4041,12 @@ int pci_dev_reset_iommu_prepare(struct pci_dev *pdev) guard(mutex)(&group->mutex); + gdev = __dev_to_gdev(&pdev->dev); + if (WARN_ON(!gdev)) + return -ENODEV; + /* Re-entry is not allowed */ - if (WARN_ON(group->resetting_domain)) + if (WARN_ON(gdev->blocked)) return -EBUSY; ret = __iommu_group_alloc_blocking_domain(group); @@ -4025,6 +4061,13 @@ int pci_dev_reset_iommu_prepare(struct pci_dev *pdev) return ret; } + /* + * Update gdev->blocked upon the domain change, as it is used to return + * the correct domain in iommu_driver_get_domain_for_dev() that might be + * called in a set_dev_pasid callback function. + */ + gdev->blocked = true; + /* * Stage PASID domains at blocking_domain while retaining pasid_array. * @@ -4035,7 +4078,7 @@ int pci_dev_reset_iommu_prepare(struct pci_dev *pdev) iommu_remove_dev_pasid(&pdev->dev, pasid, pasid_array_entry_to_domain(entry)); - group->resetting_domain = group->blocking_domain; + group->recovery_cnt++; return ret; } EXPORT_SYMBOL_GPL(pci_dev_reset_iommu_prepare); @@ -4057,6 +4100,7 @@ EXPORT_SYMBOL_GPL(pci_dev_reset_iommu_prepare); void pci_dev_reset_iommu_done(struct pci_dev *pdev) { struct iommu_group *group = pdev->dev.iommu_group; + struct group_device *gdev; unsigned long pasid; void *entry; @@ -4065,11 +4109,13 @@ void pci_dev_reset_iommu_done(struct pci_dev *pdev) guard(mutex)(&group->mutex); - /* pci_dev_reset_iommu_prepare() was bypassed for the device */ - if (!group->resetting_domain) + gdev = __dev_to_gdev(&pdev->dev); + if (WARN_ON(!gdev)) + return; + + if (!gdev->blocked) return; - /* pci_dev_reset_iommu_prepare() was not successfully called */ if (WARN_ON(!group->blocking_domain)) return; @@ -4084,6 +4130,13 @@ void pci_dev_reset_iommu_done(struct pci_dev *pdev) group->blocking_domain)); } + /* + * Update gdev->blocked upon the domain change, as it is used to return + * the correct domain in iommu_driver_get_domain_for_dev() that might be + * called in a set_dev_pasid callback function. + */ + gdev->blocked = false; + /* * Re-attach PASID domains back to the domains retained in pasid_array. * @@ -4095,7 +4148,8 @@ void pci_dev_reset_iommu_done(struct pci_dev *pdev) pasid_array_entry_to_domain(entry), group, pasid, group->blocking_domain)); - group->resetting_domain = NULL; + if (!WARN_ON(group->recovery_cnt == 0)) + group->recovery_cnt--; } EXPORT_SYMBOL_GPL(pci_dev_reset_iommu_done); From 1615e8896a8f6d7b2adf6495e538a81bf6cea3e0 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Fri, 24 Apr 2026 18:15:23 -0700 Subject: [PATCH 06/19] iommu: Fix pasid attach in pci_dev_reset_iommu_prepare/done() Now the helpers handle per-gdev resets. Replace __iommu_set_group_pasid() with set_dev_pasid() accordingly, in the pci_dev_reset_iommu_done(). Also add max_pasids check as other callers. Fixes: c279e83953d9 ("iommu: Introduce pci_dev_reset_iommu_prepare/done()") Cc: stable@vger.kernel.org Reported-by: Shuai Xue Closes: https://lore.kernel.org/all/ad858513-09fc-455e-bbc5-fe38a225cc78@linux.alibaba.com/ Reviewed-by: Shuai Xue Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Signed-off-by: Nicolin Chen Reviewed-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 2515786d4156..221be84db5ad 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -4074,9 +4074,14 @@ int pci_dev_reset_iommu_prepare(struct pci_dev *pdev) * The pasid_array is mostly fenced by group->mutex, except one reader * in iommu_attach_handle_get(), so it's safe to read without xa_lock. */ - xa_for_each_start(&group->pasid_array, pasid, entry, 1) - iommu_remove_dev_pasid(&pdev->dev, pasid, - pasid_array_entry_to_domain(entry)); + if (pdev->dev.iommu->max_pasids > 0) { + xa_for_each_start(&group->pasid_array, pasid, entry, 1) { + struct iommu_domain *pasid_dom = + pasid_array_entry_to_domain(entry); + + iommu_remove_dev_pasid(&pdev->dev, pasid, pasid_dom); + } + } group->recovery_cnt++; return ret; @@ -4143,10 +4148,16 @@ void pci_dev_reset_iommu_done(struct pci_dev *pdev) * The pasid_array is mostly fenced by group->mutex, except one reader * in iommu_attach_handle_get(), so it's safe to read without xa_lock. */ - xa_for_each_start(&group->pasid_array, pasid, entry, 1) - WARN_ON(__iommu_set_group_pasid( - pasid_array_entry_to_domain(entry), group, pasid, - group->blocking_domain)); + if (pdev->dev.iommu->max_pasids > 0) { + xa_for_each_start(&group->pasid_array, pasid, entry, 1) { + struct iommu_domain *pasid_dom = + pasid_array_entry_to_domain(entry); + + WARN_ON(pasid_dom->ops->set_dev_pasid( + pasid_dom, &pdev->dev, pasid, + group->blocking_domain)); + } + } if (!WARN_ON(group->recovery_cnt == 0)) group->recovery_cnt--; From 0d5fd7a9323ce6bedd170e21e1e90b8904917c75 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Fri, 24 Apr 2026 18:15:24 -0700 Subject: [PATCH 07/19] iommu: Fix nested pci_dev_reset_iommu_prepare/done() Shuai found that cxl_reset_bus_function() calls pci_reset_bus_function() internally while both are calling pci_dev_reset_iommu_prepare/done(). As pci_dev_reset_iommu_prepare() doesn't support re-entry, the inner call will trigger a WARN_ON and return -EBUSY, resulting in failing the entire device reset. On the other hand, removing the outer calls in the PCI callers is unsafe. As pointed out by Kevin, device-specific quirks like reset_hinic_vf_dev() execute custom firmware waits after their inner pcie_flr() completes. If the IOMMU protection relies solely on the inner reset, the IOMMU will be unblocked prematurely while the device is still resetting. Instead, fix this by making pci_dev_reset_iommu_prepare/done() reentrant. Introduce gdev->reset_depth to handle the re-entries on the same device. Fixes: c279e83953d9 ("iommu: Introduce pci_dev_reset_iommu_prepare/done()") Cc: stable@vger.kernel.org Reported-by: Shuai Xue Closes: https://lore.kernel.org/all/absKsk7qQOwzhpzv@Asurada-Nvidia/ Suggested-by: Kevin Tian Reviewed-by: Shuai Xue Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Lu Baolu Signed-off-by: Nicolin Chen Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 221be84db5ad..301c76c40e3d 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -83,6 +83,7 @@ struct group_device { * - Device is undergoing a reset */ bool blocked; + unsigned int reset_depth; }; /* Iterate over each struct group_device in a struct iommu_group */ @@ -4045,20 +4046,23 @@ int pci_dev_reset_iommu_prepare(struct pci_dev *pdev) if (WARN_ON(!gdev)) return -ENODEV; - /* Re-entry is not allowed */ - if (WARN_ON(gdev->blocked)) - return -EBUSY; + if (gdev->reset_depth++) + return 0; ret = __iommu_group_alloc_blocking_domain(group); - if (ret) + if (ret) { + gdev->reset_depth--; return ret; + } /* Stage RID domain at blocking_domain while retaining group->domain */ if (group->domain != group->blocking_domain) { ret = __iommu_attach_device(group->blocking_domain, &pdev->dev, group->domain); - if (ret) + if (ret) { + gdev->reset_depth--; return ret; + } } /* @@ -4118,7 +4122,10 @@ void pci_dev_reset_iommu_done(struct pci_dev *pdev) if (WARN_ON(!gdev)) return; - if (!gdev->blocked) + /* Unbalanced done() calls would underflow the counter */ + if (WARN_ON(gdev->reset_depth == 0)) + return; + if (--gdev->reset_depth) return; if (WARN_ON(!group->blocking_domain)) From fc3523b16d2b4b88e61e69504b0ae0b18b869c8f Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Fri, 24 Apr 2026 18:15:25 -0700 Subject: [PATCH 08/19] iommu: Fix ATS invalidation timeouts during __iommu_remove_group_pasid() If a device is blocked, its PASID domains are already detached. Repeating iommu_remove_dev_pasid() is unnecessary and might trigger ATS invalidation timeouts. Skip the iommu_remove_dev_pasid() call upon gdev->blocked. Fixes: c279e83953d9 ("iommu: Introduce pci_dev_reset_iommu_prepare/done()") Cc: stable@vger.kernel.org Closes: https://sashiko.dev/#/patchset/20260407194644.171304-1-nicolinc%40nvidia.com Reviewed-by: Kevin Tian Signed-off-by: Nicolin Chen Reviewed-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 301c76c40e3d..a5e3e90883f8 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -3602,7 +3602,12 @@ static void __iommu_remove_group_pasid(struct iommu_group *group, struct group_device *device; for_each_group_device(group, device) { - if (device->dev->iommu->max_pasids > 0) + /* + * A group-level detach cannot fail, even if there is a blocked + * device. In fact, blocked devices must be already detached for + * a pending device recovery. + */ + if (!device->blocked && device->dev->iommu->max_pasids > 0) iommu_remove_dev_pasid(device->dev, pasid, domain); } } From 5474e6e17a262db45c60575c73f70210f5c7001f Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Fri, 24 Apr 2026 18:15:26 -0700 Subject: [PATCH 09/19] iommu: Fix WARN_ON in __iommu_group_set_domain_nofail() due to reset In __iommu_group_set_domain_internal(), concurrent domain attachments are rejected when any device in the group is recovering. This is necessary to fence concurrent attachments to a multi-device group where devices might share the same RID due to PCI DMA alias quirks, but triggers the WARN_ON in __iommu_group_set_domain_nofail(). Other IOMMU_SET_DOMAIN_MUST_SUCCEED callers in detach/teardown paths, such as __iommu_group_set_core_domain and __iommu_release_dma_ownership, should not be rejected, as the domain would be freed anyway in these nofail paths while group->domain is still pointing to it. So pci_dev_reset_iommu_done() could trigger a UAF when re-attaching group->domain. Honor the IOMMU_SET_DOMAIN_MUST_SUCCEED flag, allowing the callers through the group->recovery_cnt fence, so as to update the group->domain pointer. Instead add a gdev->blocked check in the device iteration loop, to prevent any concurrent per-device detachment. Fixes: c279e83953d9 ("iommu: Introduce pci_dev_reset_iommu_prepare/done()") Cc: stable@vger.kernel.org Closes: https://sashiko.dev/#/patchset/20260407194644.171304-1-nicolinc%40nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Lu Baolu Signed-off-by: Nicolin Chen Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index a5e3e90883f8..845284a9eeaf 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2474,9 +2474,10 @@ static int __iommu_group_set_domain_internal(struct iommu_group *group, /* * This is a concurrent attach during device recovery. Reject it until - * pci_dev_reset_iommu_done() attaches the device to group->domain. + * pci_dev_reset_iommu_done() attaches the device to group->domain, if + * IOMMU_SET_DOMAIN_MUST_SUCCEED is not set. */ - if (group->recovery_cnt) + if (group->recovery_cnt && !(flags & IOMMU_SET_DOMAIN_MUST_SUCCEED)) return -EBUSY; /* @@ -2487,6 +2488,13 @@ static int __iommu_group_set_domain_internal(struct iommu_group *group, */ result = 0; for_each_group_device(group, gdev) { + /* + * Device under recovery is attached to group->blocking_domain. + * Don't change that. pci_dev_reset_iommu_done() will re-attach + * its domain to the updated group->domain, after the recovery. + */ + if (gdev->blocked) + continue; ret = __iommu_device_set_domain(group, gdev->dev, new_domain, group->domain, flags); if (ret) { From 15dd29ca620648a18a0334a3f6b26af181154a23 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Fri, 24 Apr 2026 18:15:27 -0700 Subject: [PATCH 10/19] iommu: Warn on premature unblock during DMA aliased sibling reset When two aliased siblings are in the same iommu_group, they might share the same RID. The reset functions don't support this case, though it is unclear whether there is a real case of having an ATS capable device on a PCI/PCI-X bus. Theoretically, however, if two aliased devices are resetting concurrently, one might be unblocked prematurely in the middle of the reset by the other sibling who completes the reset first. This isn't a regression from this series but it's better to spit a warning, so we can know if such use case is common enough for us to make subsequent patches for its coverage. Signed-off-by: Nicolin Chen Reviewed-by: Kevin Tian Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 49 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 845284a9eeaf..e7bd28cc77ee 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -4105,6 +4105,41 @@ int pci_dev_reset_iommu_prepare(struct pci_dev *pdev) } EXPORT_SYMBOL_GPL(pci_dev_reset_iommu_prepare); +static int __group_device_cmp_dma_alias(struct pci_dev *dev, u16 alias, + void *data) +{ + return alias == *(u16 *)data; +} + +static int group_device_cmp_dma_alias(struct pci_dev *dev, u16 alias, + void *data) +{ + return pci_for_each_dma_alias(data, __group_device_cmp_dma_alias, + &alias); +} + +static bool group_device_dma_alias_is_blocked(struct iommu_group *group, + struct group_device *gdev) +{ + struct group_device *sibling; + + lockdep_assert_held(&group->mutex); + + if (!dev_is_pci(gdev->dev)) + return false; + + for_each_group_device(group, sibling) { + if (sibling == gdev || !sibling->blocked || + !dev_is_pci(sibling->dev)) + continue; + if (pci_for_each_dma_alias(to_pci_dev(gdev->dev), + group_device_cmp_dma_alias, + to_pci_dev(sibling->dev))) + return true; + } + return false; +} + /** * pci_dev_reset_iommu_done() - Restore IOMMU after a PCI device reset is done * @pdev: PCI device that has finished a reset routine @@ -4144,6 +4179,20 @@ void pci_dev_reset_iommu_done(struct pci_dev *pdev) if (WARN_ON(!group->blocking_domain)) return; + if (group_device_dma_alias_is_blocked(group, gdev)) { + /* + * FIXME: DMA aliased devices share the same RID, which would be + * convoluted to handle, as "gdev->blocked" is not sufficient: + * - "blocked" state is effectively shared across these devices + * - if the core skipped the blocking on the second device, the + * IOMMU driver's attachment state would diverge from the HW + * state + * For now, just warn and see whether real ATS use cases hit it. + */ + pci_warn(pdev, + "DMA-aliased sibling may be prematurely unblocked\n"); + } + /* * Re-attach RID domain back to group->domain * From 2cda2e10dc8343ae01eae9e999a876b7e7d37861 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Naval=20Alcal=C3=A1?= Date: Sat, 9 May 2026 10:43:44 +0800 Subject: [PATCH 11/19] iommu/vt-d: Disable DMAR for Intel Q35 IGFX MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Intel Q35 integrated graphics (8086:29b2) exhibits broken DMAR behaviour similar to other G4x/GM45 devices for which DMAR is already disabled via quirks. When DMAR is enabled, the system may hard lock up during boot or early device initialization, requiring a reset. Add the missing PCI ID to the existing quirk list to disable DMAR for this device. Fixes: 1f76249cc3be ("iommu/vt-d: Declare Broadwell igfx dmar support snafu") Cc: stable@vger.kernel.org Closes: https://bugzilla.kernel.org/show_bug.cgi?id=201185 Closes: https://bugzilla.kernel.org/show_bug.cgi?id=216064 Signed-off-by: Naval Alcalá Link: https://lore.kernel.org/r/20260410161622.13549-1-ari@naval.cat Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index c3d18cd77d2f..2a6b6813a78d 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -3937,6 +3937,9 @@ static void quirk_iommu_igfx(struct pci_dev *dev) disable_igfx_iommu = 1; } +/* Q35 integrated gfx dmar support is totally busted. */ +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x29b2, quirk_iommu_igfx); + /* G4x/GM45 integrated gfx dmar support is totally busted. */ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx); From a6dea58d8625c06b9654c0555f101742481335c3 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 9 May 2026 10:43:45 +0800 Subject: [PATCH 12/19] iommu/vt-d: Fix oops due to out of scope access Below oops triggers when kill QEMU process: Oops: general protection fault, probably for non-canonical address 0x7fffffff844eaaa7: 0000 [#1] SMP NOPTI Call Trace: do_raw_spin_lock+0xaa/0xc0 _raw_spin_lock_irqsave+0x21/0x40 domain_remove_dev_pasid+0x52/0x160 intel_nested_set_dev_pasid+0x1b9/0x1e0 __iommu_set_group_pasid+0x56/0x120 pci_dev_reset_iommu_done+0xe3/0x180 pcie_flr+0x65/0x160 __pci_reset_function_locked+0x5b/0x120 vfio_pci_core_close_device+0x63/0xe0 [vfio_pci_core] vfio_df_close+0x4f/0xa0 vfio_df_unbind_iommufd+0x2d/0x60 vfio_device_fops_release+0x3e/0x40 __fput+0xe5/0x2c0 task_work_run+0x58/0xa0 do_exit+0x2c8/0x600 do_group_exit+0x2f/0xa0 get_signal+0x863/0x8c0 arch_do_signal_or_restart+0x24/0x100 exit_to_user_mode_loop+0x87/0x380 do_syscall_64+0x2ff/0x11e0 entry_SYSCALL_64_after_hwframe+0x76/0x7e The global static blocked domain is a dummy domain without corresponding dmar_domain structure, accessing beyond iommu_domain structure triggers oops easily. Fix it by return early in domain_remove_dev_pasid() like identity domain. Fixes: 7d0c9da6c150 ("iommu/vt-d: Add set_dev_pasid callback for dma domain") Cc: stable@vger.kernel.org Signed-off-by: Zhenzhong Duan Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20260421031347.1408890-1-zhenzhong.duan@intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 2a6b6813a78d..a4b123c33022 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -3530,8 +3530,8 @@ void domain_remove_dev_pasid(struct iommu_domain *domain, if (!domain) return; - /* Identity domain has no meta data for pasid. */ - if (domain->type == IOMMU_DOMAIN_IDENTITY) + /* Identity domain and blocked domain have no meta data for pasid. */ + if (domain->type == IOMMU_DOMAIN_IDENTITY || domain->type == IOMMU_DOMAIN_BLOCKED) return; dmar_domain = to_dmar_domain(domain); From 79ea2feb917b05366b49d85573c9c5331f043b2c Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 9 May 2026 10:43:46 +0800 Subject: [PATCH 13/19] iommu/vt-d: Avoid NULL pointer dereference or refcount corruption Commit 60f030f7418d ("iommu/vt-d: Avoid use of NULL after WARN_ON_ONCE") fixed a NULL pointer dereference in an unlikely situation partly. If dev_pasid is not found in the dev_pasids list, it remains NULL. However, the teardown operations are executed unconditionally, this lead to a NULL pointer dereference or refcount corruption. If the domain was never attached to this IOMMU, info will be NULL, which would cause an immediate dereference when checking --info->refcnt. Even if info is not NULL, decrementing the refcount without having removed a valid PASID might unbalance the count. This could lead to premature dropping of the refcount to 0, potentially causing a use-after-free for the remaining active devices sharing the domain. Fix it by returning early if dev_pasid is NULL, before executing the teardown operations. Issue found by AI review and suggested by Kevin Tian. https://sashiko.dev/#/patchset/20260421031347.1408890-1-zhenzhong.duan%40intel.com Fixes: 60f030f7418d ("iommu/vt-d: Avoid use of NULL after WARN_ON_ONCE") Cc: stable@vger.kernel.org Suggested-by: Kevin Tian Signed-off-by: Zhenzhong Duan Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20260422033538.95000-1-zhenzhong.duan@intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index a4b123c33022..4d0e65bc131d 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -3545,12 +3545,13 @@ void domain_remove_dev_pasid(struct iommu_domain *domain, } spin_unlock_irqrestore(&dmar_domain->lock, flags); + if (WARN_ON_ONCE(!dev_pasid)) + return; + cache_tag_unassign_domain(dmar_domain, dev, pasid); domain_detach_iommu(dmar_domain, iommu); - if (!WARN_ON_ONCE(!dev_pasid)) { - intel_iommu_debugfs_remove_dev_pasid(dev_pasid); - kfree(dev_pasid); - } + intel_iommu_debugfs_remove_dev_pasid(dev_pasid); + kfree(dev_pasid); } static int blocking_domain_set_dev_pasid(struct iommu_domain *domain, From 6fc7e8a3b8115294f60f5c89de27330bf1b9c98e Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 12 May 2026 13:46:13 -0300 Subject: [PATCH 14/19] iommu: Fix loss of errno on map failure for classic ops A typo, likely from a rebase, inverted the condition and caused errors to be lost. Fix it to be "if (ret)". This was breaking iommu_create_device_direct_mappings() on drivers that don't use iommupt and don't fully set up their domain in alloc_pages() (i.e., SMMUv2). In this case the first call of iommu_create_device_direct_mappings() should fail due to the incompletely initialized domain. Since it wrongly returns success, the second call to iommu_create_device_direct_mappings() doesn't happen and IOMMU_RESV_DIRECT is never set up. Cc: stable@vger.kernel.org Fixes: d6c65b0fd621 ("iommupt: Avoid rewalking during map") Reported-by: Josua Mayer Closes: https://lore.kernel.org/all/321c2e57-6a17-4aef-ba42-d2ebd577e472@solid-run.com/ Signed-off-by: Jason Gunthorpe Reviewed-by: Pranjal Shrivastava Reviewed-by: Samiullah Khawaja Reviewed-by: Mostafa Saleh Tested-by: Josua Mayer Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index e7bd28cc77ee..05f78cfd1f1d 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2709,7 +2709,7 @@ int iommu_map_nosync(struct iommu_domain *domain, unsigned long iova, return 0; } ret = __iommu_map_domain_pgtbl(domain, iova, paddr, size, prot, gfp); - if (!ret) + if (ret) return ret; trace_map(iova, paddr, size); From b948a87228482235afbaf5f4d8037860b5c470fd Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 12 May 2026 13:46:14 -0300 Subject: [PATCH 15/19] iommu: Fix up map/unmap debugging for iommupt domains Sashiko noticed a few issues in this path, and a few more were found on review. Tidy them up further. These are intertwined because the debug code depends on some of the WARN_ONs to function right: Lift into iommu_map_nosync(): - The might_sleep_if() - 0 pgsize_bitmap WARN_ON - Promote the illegal domain->type to a WARN_ON - WARN_ON for illegal gfp flags Then remove the return 0 since it is now safe to call iommu_debug_map(). Lift into __iommu_unmap(): - 0 pgsize_bitmap WARN_ON - Promote the illegal domain->type to a WARN_ON - iommu_debug_unmap_begin() This now pairs with the unconditional iommu_debug_map() on the mapping side. Thus iommu debugging now works for iommupt along with some of the other debugging features. Fixes: 99fb8afa16ad ("iommupt: Directly call iommupt's unmap_range()") Fixes: d6c65b0fd621 ("iommupt: Avoid rewalking during map") Signed-off-by: Jason Gunthorpe Reviewed-by: Pranjal Shrivastava Reviewed-by: Samiullah Khawaja Reviewed-by: Mostafa Saleh Tested-by: Josua Mayer Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 43 ++++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 05f78cfd1f1d..c21d1e3c4876 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2623,19 +2623,9 @@ static int __iommu_map_domain_pgtbl(struct iommu_domain *domain, size_t orig_size = size; int ret = 0; - might_sleep_if(gfpflags_allow_blocking(gfp)); - - if (unlikely(!(domain->type & __IOMMU_DOMAIN_PAGING))) - return -EINVAL; - - if (WARN_ON(!ops->map_pages || domain->pgsize_bitmap == 0UL)) + if (WARN_ON(!ops->map_pages)) return -ENODEV; - /* Discourage passing strange GFP flags */ - if (WARN_ON_ONCE(gfp & (__GFP_COMP | __GFP_DMA | __GFP_DMA32 | - __GFP_HIGHMEM))) - return -EINVAL; - /* find out the minimum page size supported */ min_pagesz = 1 << __ffs(domain->pgsize_bitmap); @@ -2697,6 +2687,15 @@ int iommu_map_nosync(struct iommu_domain *domain, unsigned long iova, struct pt_iommu *pt = iommupt_from_domain(domain); int ret; + might_sleep_if(gfpflags_allow_blocking(gfp)); + + /* Discourage passing strange GFP flags or illegal domains */ + if (WARN_ON_ONCE(!(domain->type & __IOMMU_DOMAIN_PAGING) || + !domain->pgsize_bitmap || + (gfp & (__GFP_COMP | __GFP_DMA | __GFP_DMA32 | + __GFP_HIGHMEM)))) + return -EINVAL; + if (pt) { size_t mapped = 0; @@ -2706,11 +2705,12 @@ int iommu_map_nosync(struct iommu_domain *domain, unsigned long iova, iommu_unmap(domain, iova, mapped); return ret; } - return 0; + } else { + ret = __iommu_map_domain_pgtbl(domain, iova, paddr, size, prot, + gfp); + if (ret) + return ret; } - ret = __iommu_map_domain_pgtbl(domain, iova, paddr, size, prot, gfp); - if (ret) - return ret; trace_map(iova, paddr, size); iommu_debug_map(domain, paddr, size); @@ -2742,10 +2742,7 @@ __iommu_unmap_domain_pgtbl(struct iommu_domain *domain, unsigned long iova, size_t unmapped_page, unmapped = 0; unsigned int min_pagesz; - if (unlikely(!(domain->type & __IOMMU_DOMAIN_PAGING))) - return 0; - - if (WARN_ON(!ops->unmap_pages || domain->pgsize_bitmap == 0UL)) + if (WARN_ON(!ops->unmap_pages)) return 0; /* find out the minimum page size supported */ @@ -2764,8 +2761,6 @@ __iommu_unmap_domain_pgtbl(struct iommu_domain *domain, unsigned long iova, pr_debug("unmap this: iova 0x%lx size 0x%zx\n", iova, size); - iommu_debug_unmap_begin(domain, iova, size); - /* * Keep iterating until we either unmap 'size' bytes (or more) * or we hit an area that isn't mapped. @@ -2801,6 +2796,12 @@ static size_t __iommu_unmap(struct iommu_domain *domain, unsigned long iova, struct pt_iommu *pt = iommupt_from_domain(domain); size_t unmapped; + if (WARN_ON_ONCE(!(domain->type & __IOMMU_DOMAIN_PAGING) || + !domain->pgsize_bitmap)) + return 0; + + iommu_debug_unmap_begin(domain, iova, size); + if (pt) unmapped = pt->ops->unmap_range(pt, iova, size, iotlb_gather); else From 0735c54804c709d1b292f3b6947cfb560b2ce552 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 12 May 2026 13:46:15 -0300 Subject: [PATCH 16/19] iommu: Handle unmap error when iommu_debug is enabled Sashiko noticed a latent bug where the map error flow called iommu_unmap() which calls iommu_debug_unmap_begin()/iommu_debug_unmap_end() however since this is an error path the map flow never actually established the original iommu_debug_map() it will malfunction. Lift the unmap error handling into iommu_map_nosync() and reorder it so the trace_map()/iommu_debug_map() records the partial mapping and then immediately unmaps it. This avoid creating the unbalanced tracking and provides saner tracing instead of a unmap unmatched to any map. Fixes: ccc21213f013 ("iommu: Add calls for IOMMU_DEBUG_PAGEALLOC") Signed-off-by: Jason Gunthorpe Reviewed-by: Pranjal Shrivastava Reviewed-by: Samiullah Khawaja Reviewed-by: Mostafa Saleh Tested-by: Josua Mayer Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 49 +++++++++++++++++-------------------------- 1 file changed, 19 insertions(+), 30 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index c21d1e3c4876..d1a9e713d3a0 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2615,12 +2615,11 @@ static size_t iommu_pgsize(struct iommu_domain *domain, unsigned long iova, static int __iommu_map_domain_pgtbl(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, - size_t size, int prot, gfp_t gfp) + size_t size, int prot, gfp_t gfp, + size_t *mapped) { const struct iommu_domain_ops *ops = domain->ops; - unsigned long orig_iova = iova; unsigned int min_pagesz; - size_t orig_size = size; int ret = 0; if (WARN_ON(!ops->map_pages)) @@ -2643,31 +2642,25 @@ static int __iommu_map_domain_pgtbl(struct iommu_domain *domain, pr_debug("map: iova 0x%lx pa %pa size 0x%zx\n", iova, &paddr, size); while (size) { - size_t pgsize, count, mapped = 0; + size_t pgsize, count, op_mapped = 0; pgsize = iommu_pgsize(domain, iova, paddr, size, &count); pr_debug("mapping: iova 0x%lx pa %pa pgsize 0x%zx count %zu\n", iova, &paddr, pgsize, count); ret = ops->map_pages(domain, iova, paddr, pgsize, count, prot, - gfp, &mapped); + gfp, &op_mapped); /* * Some pages may have been mapped, even if an error occurred, * so we should account for those so they can be unmapped. */ - size -= mapped; - + *mapped += op_mapped; if (ret) - break; + return ret; - iova += mapped; - paddr += mapped; - } - - /* unroll mapping in case something went wrong */ - if (ret) { - iommu_unmap(domain, orig_iova, orig_size - size); - return ret; + size -= op_mapped; + iova += op_mapped; + paddr += op_mapped; } return 0; } @@ -2685,6 +2678,7 @@ int iommu_map_nosync(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t size, int prot, gfp_t gfp) { struct pt_iommu *pt = iommupt_from_domain(domain); + size_t mapped = 0; int ret; might_sleep_if(gfpflags_allow_blocking(gfp)); @@ -2696,24 +2690,19 @@ int iommu_map_nosync(struct iommu_domain *domain, unsigned long iova, __GFP_HIGHMEM)))) return -EINVAL; - if (pt) { - size_t mapped = 0; - + if (pt) ret = pt->ops->map_range(pt, iova, paddr, size, prot, gfp, &mapped); - if (ret) { - iommu_unmap(domain, iova, mapped); - return ret; - } - } else { + else ret = __iommu_map_domain_pgtbl(domain, iova, paddr, size, prot, - gfp); - if (ret) - return ret; - } + gfp, &mapped); - trace_map(iova, paddr, size); - iommu_debug_map(domain, paddr, size); + trace_map(iova, paddr, mapped); + iommu_debug_map(domain, paddr, mapped); + if (ret) { + iommu_unmap(domain, iova, mapped); + return ret; + } return 0; } From 8ef3f77c440005c7f04229a75976bfc078364247 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 12 May 2026 13:46:16 -0300 Subject: [PATCH 17/19] iommupt: Check for missing PAGE_SIZE in the pgsize_bitmap Sashiko pointed out that the driver could drop PAGE_SIZE from the pgsize_bitmap. That is technically allowed but nothing does it, and such an iommu_domain would not be used with the DMA API today. Still, it is against the design and it is trivial to fix up. Lift the PT_WARN_ON to the if branch and just skip the fast path. Fixes: dcd6a011a8d5 ("iommupt: Add map_pages op") Signed-off-by: Jason Gunthorpe Reviewed-by: Pranjal Shrivastava Reviewed-by: Samiullah Khawaja Tested-by: Josua Mayer Signed-off-by: Joerg Roedel --- drivers/iommu/generic_pt/iommu_pt.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/generic_pt/iommu_pt.h b/drivers/iommu/generic_pt/iommu_pt.h index 19b6daf88f2a..4877b05291c9 100644 --- a/drivers/iommu/generic_pt/iommu_pt.h +++ b/drivers/iommu/generic_pt/iommu_pt.h @@ -920,8 +920,8 @@ static int NS(map_range)(struct pt_iommu *iommu_table, dma_addr_t iova, return ret; /* Calculate target page size and level for the leaves */ - if (pt_has_system_page_size(common) && len == PAGE_SIZE) { - PT_WARN_ON(!(pgsize_bitmap & PAGE_SIZE)); + if (pt_has_system_page_size(common) && len == PAGE_SIZE && + likely(pgsize_bitmap & PAGE_SIZE)) { if (log2_mod(iova | paddr, PAGE_SHIFT)) return -ENXIO; map.leaf_pgsize_lg2 = PAGE_SHIFT; From 58829512ad461af8f35941069c209941e3a97b65 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 12 May 2026 13:46:17 -0300 Subject: [PATCH 18/19] iommupt: Fix the end_index calculation in __map_range_leaf() Sashiko noticed a mismatch of units in this math: num_leaves is actually the number of leaf *entries* (so a 16-item contiguous leaf is one num_leaves), while index is in items. The mismatch in maths causes __map_range_leaf() to exit early instead of efficiently filling a larger range of contiguous PTEs. The early exit is caught by the functions above and then __map_range_leaf() is re-invoked, so there is no functional issue. Correct the misuse of units by adjusting num_leaves with the leaf size and avoid the performance cost of looping externally. There are also some mismatched types for num_leaves; simplify things to remove the duplicated calculations. Fixes: d6c65b0fd621 ("iommupt: Avoid rewalking during map") Signed-off-by: Jason Gunthorpe Reviewed-by: Samiullah Khawaja Reviewd-by: Pranjal Shrivastava Tested-by: Josua Mayer Signed-off-by: Joerg Roedel --- drivers/iommu/generic_pt/iommu_pt.h | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/drivers/iommu/generic_pt/iommu_pt.h b/drivers/iommu/generic_pt/iommu_pt.h index 4877b05291c9..dc91fb4e2f61 100644 --- a/drivers/iommu/generic_pt/iommu_pt.h +++ b/drivers/iommu/generic_pt/iommu_pt.h @@ -534,10 +534,12 @@ static int __map_range_leaf(struct pt_range *range, void *arg, struct pt_state pts = pt_init(range, level, table); struct pt_iommu_map_args *map = arg; unsigned int leaf_pgsize_lg2 = map->leaf_pgsize_lg2; + unsigned int leaves_avail; unsigned int start_index; pt_oaddr_t oa = map->oa; - unsigned int num_leaves; + pt_vaddr_t num_leaves; unsigned int orig_end; + unsigned int step_lg2; pt_vaddr_t last_va; unsigned int step; bool need_contig; @@ -546,21 +548,25 @@ static int __map_range_leaf(struct pt_range *range, void *arg, PT_WARN_ON(map->leaf_level != level); PT_WARN_ON(!pt_can_have_leaf(&pts)); - step = log2_to_int_t(unsigned int, - leaf_pgsize_lg2 - pt_table_item_lg2sz(&pts)); - need_contig = leaf_pgsize_lg2 != pt_table_item_lg2sz(&pts); + step_lg2 = leaf_pgsize_lg2 - pt_table_item_lg2sz(&pts); + step = log2_to_int_t(unsigned int, step_lg2); + need_contig = step_lg2 != 0; _pt_iter_first(&pts); start_index = pts.index; orig_end = pts.end_index; - if (pts.index + map->num_leaves < pts.end_index) { + leaves_avail = + log2_div_t(unsigned int, pts.end_index - pts.index, step_lg2); + if (map->num_leaves <= leaves_avail) { /* Need to stop in the middle of the table to change sizes */ - pts.end_index = pts.index + map->num_leaves; + pts.end_index = pts.index + log2_mul(map->num_leaves, step_lg2); num_leaves = 0; } else { - num_leaves = map->num_leaves - (pts.end_index - pts.index); + num_leaves = map->num_leaves - leaves_avail; } + PT_WARN_ON( + log2_mod_t(unsigned int, pts.end_index - pts.index, step_lg2)); do { pts.type = pt_load_entry_raw(&pts); if (pts.type != PT_ENTRY_EMPTY || need_contig) { From 1bb54043ff309795c90ccadd8a6e6b13ac40ec4e Mon Sep 17 00:00:00 2001 From: Tomasz Jeznach Date: Tue, 12 May 2026 10:37:44 -0700 Subject: [PATCH 19/19] MAINTAINERS: update Tomasz Jeznach's email address Switch from the previous work address to a linux.dev account, as the work address is no longer actively monitored. Signed-off-by: Tomasz Jeznach Signed-off-by: Joerg Roedel --- .mailmap | 1 + MAINTAINERS | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.mailmap b/.mailmap index eec4a740f7ca..bd6a72f29d9c 100644 --- a/.mailmap +++ b/.mailmap @@ -857,6 +857,7 @@ Tobias Klauser Tobias Klauser Tobias Klauser Todor Tomov +Tomasz Jeznach Tony Luck Trilok Soni TripleX Chung diff --git a/MAINTAINERS b/MAINTAINERS index b2040011a386..55c6ab1a974a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -22944,7 +22944,7 @@ N: riscv K: riscv RISC-V IOMMU -M: Tomasz Jeznach +M: Tomasz Jeznach L: iommu@lists.linux.dev L: linux-riscv@lists.infradead.org S: Maintained