vfio/pci: Use RCU for error/request triggers to avoid circular locking

Thanks to a device generating an ACS violation during bus reset,
lockdep reported the following circular locking issue:

CPU0: SET_IRQS (MSI/X): holds igate, acquires memory_lock
CPU1: HOT_RESET: holds memory_lock, acquires pci_bus_sem
CPU2: AER: holds pci_bus_sem, acquires igate

This results in a potential 3-way deadlock.

Remove the pci_bus_sem->igate leg of the triangle by using RCU
to peek at the eventfd rather than locking it with igate.

Fixes: 3be3a074cf ("vfio-pci: Don't use device_lock around AER interrupt setup")
Signed-off-by: Alex Williamson <alex.williamson@nvidia.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/20251124223623.2770706-1-alex@shazbot.org
Signed-off-by: Alex Williamson <alex@shazbot.org>
This commit is contained in:
Alex Williamson 2025-11-24 15:36:22 -07:00 committed by Alex Williamson
parent fa804aa4ac
commit 98693e0897
4 changed files with 93 additions and 41 deletions

View File

@ -42,6 +42,40 @@ static bool nointxmask;
static bool disable_vga;
static bool disable_idle_d3;
/*
 * RCU callback for a retired struct vfio_pci_eventfd: recover the wrapper
 * from its embedded rcu_head, call eventfd_ctx_put() on the wrapped eventfd
 * context, then free the wrapper itself.
 */
static void vfio_pci_eventfd_rcu_free(struct rcu_head *rcu)
{
	struct vfio_pci_eventfd *stale;

	stale = container_of(rcu, struct vfio_pci_eventfd, rcu);

	eventfd_ctx_put(stale->ctx);
	kfree(stale);
}
/**
 * vfio_pci_eventfd_replace_locked - swap the eventfd wrapper held in a slot
 * @vdev:     device whose igate mutex must be held by the caller (checked
 *            with lockdep_assert_held())
 * @peventfd: RCU-annotated slot to update
 * @ctx:      eventfd context to install, or NULL to leave the slot empty
 *
 * When @ctx is non-NULL it is stored as-is in a freshly allocated wrapper;
 * this function does not take an additional reference on it.  Whatever
 * wrapper was previously in the slot is handed to call_rcu() and released
 * by vfio_pci_eventfd_rcu_free().
 *
 * Return: 0 on success, or -ENOMEM if the wrapper allocation fails, in
 * which case the slot is left unmodified.
 */
int vfio_pci_eventfd_replace_locked(struct vfio_pci_core_device *vdev,
				    struct vfio_pci_eventfd __rcu **peventfd,
				    struct eventfd_ctx *ctx)
{
	struct vfio_pci_eventfd *incoming = NULL;
	struct vfio_pci_eventfd *outgoing;

	lockdep_assert_held(&vdev->igate);

	/* A NULL ctx means "clear": publish NULL rather than an empty wrapper. */
	if (ctx) {
		incoming = kzalloc(sizeof(*incoming), GFP_KERNEL_ACCOUNT);
		if (!incoming)
			return -ENOMEM;

		incoming->ctx = ctx;
	}

	outgoing = rcu_replace_pointer(*peventfd, incoming,
				       lockdep_is_held(&vdev->igate));

	/* Defer teardown of the previous wrapper to the RCU callback. */
	if (outgoing)
		call_rcu(&outgoing->rcu, vfio_pci_eventfd_rcu_free);

	return 0;
}
/* List of PFs that vfio_pci_core_sriov_configure() has been called on */
static DEFINE_MUTEX(vfio_pci_sriov_pfs_mutex);
static LIST_HEAD(vfio_pci_sriov_pfs);
@ -697,14 +731,8 @@ void vfio_pci_core_close_device(struct vfio_device *core_vdev)
vfio_pci_dma_buf_cleanup(vdev);
mutex_lock(&vdev->igate);
if (vdev->err_trigger) {
eventfd_ctx_put(vdev->err_trigger);
vdev->err_trigger = NULL;
}
if (vdev->req_trigger) {
eventfd_ctx_put(vdev->req_trigger);
vdev->req_trigger = NULL;
}
vfio_pci_eventfd_replace_locked(vdev, &vdev->err_trigger, NULL);
vfio_pci_eventfd_replace_locked(vdev, &vdev->req_trigger, NULL);
mutex_unlock(&vdev->igate);
}
EXPORT_SYMBOL_GPL(vfio_pci_core_close_device);
@ -1784,21 +1812,21 @@ void vfio_pci_core_request(struct vfio_device *core_vdev, unsigned int count)
struct vfio_pci_core_device *vdev =
container_of(core_vdev, struct vfio_pci_core_device, vdev);
struct pci_dev *pdev = vdev->pdev;
struct vfio_pci_eventfd *eventfd;
mutex_lock(&vdev->igate);
if (vdev->req_trigger) {
rcu_read_lock();
eventfd = rcu_dereference(vdev->req_trigger);
if (eventfd) {
if (!(count % 10))
pci_notice_ratelimited(pdev,
"Relaying device request to user (#%u)\n",
count);
eventfd_signal(vdev->req_trigger);
eventfd_signal(eventfd->ctx);
} else if (count == 0) {
pci_warn(pdev,
"No device request channel registered, blocked until released by user\n");
}
mutex_unlock(&vdev->igate);
rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(vfio_pci_core_request);
@ -2216,13 +2244,13 @@ pci_ers_result_t vfio_pci_core_aer_err_detected(struct pci_dev *pdev,
pci_channel_state_t state)
{
struct vfio_pci_core_device *vdev = dev_get_drvdata(&pdev->dev);
struct vfio_pci_eventfd *eventfd;
mutex_lock(&vdev->igate);
if (vdev->err_trigger)
eventfd_signal(vdev->err_trigger);
mutex_unlock(&vdev->igate);
rcu_read_lock();
eventfd = rcu_dereference(vdev->err_trigger);
if (eventfd)
eventfd_signal(eventfd->ctx);
rcu_read_unlock();
return PCI_ERS_RESULT_CAN_RECOVER;
}

View File

@ -731,21 +731,27 @@ static int vfio_pci_set_msi_trigger(struct vfio_pci_core_device *vdev,
return 0;
}
static int vfio_pci_set_ctx_trigger_single(struct eventfd_ctx **ctx,
static int vfio_pci_set_ctx_trigger_single(struct vfio_pci_core_device *vdev,
struct vfio_pci_eventfd __rcu **peventfd,
unsigned int count, uint32_t flags,
void *data)
{
/* DATA_NONE/DATA_BOOL enables loopback testing */
if (flags & VFIO_IRQ_SET_DATA_NONE) {
if (*ctx) {
if (count) {
eventfd_signal(*ctx);
} else {
eventfd_ctx_put(*ctx);
*ctx = NULL;
}
struct vfio_pci_eventfd *eventfd;
eventfd = rcu_dereference_protected(*peventfd,
lockdep_is_held(&vdev->igate));
if (!eventfd)
return -EINVAL;
if (count) {
eventfd_signal(eventfd->ctx);
return 0;
}
return vfio_pci_eventfd_replace_locked(vdev, peventfd, NULL);
} else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
uint8_t trigger;
@ -753,8 +759,15 @@ static int vfio_pci_set_ctx_trigger_single(struct eventfd_ctx **ctx,
return -EINVAL;
trigger = *(uint8_t *)data;
if (trigger && *ctx)
eventfd_signal(*ctx);
if (trigger) {
struct vfio_pci_eventfd *eventfd =
rcu_dereference_protected(*peventfd,
lockdep_is_held(&vdev->igate));
if (eventfd)
eventfd_signal(eventfd->ctx);
}
return 0;
} else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
@ -765,22 +778,23 @@ static int vfio_pci_set_ctx_trigger_single(struct eventfd_ctx **ctx,
fd = *(int32_t *)data;
if (fd == -1) {
if (*ctx)
eventfd_ctx_put(*ctx);
*ctx = NULL;
return vfio_pci_eventfd_replace_locked(vdev,
peventfd, NULL);
} else if (fd >= 0) {
struct eventfd_ctx *efdctx;
int ret;
efdctx = eventfd_ctx_fdget(fd);
if (IS_ERR(efdctx))
return PTR_ERR(efdctx);
if (*ctx)
eventfd_ctx_put(*ctx);
ret = vfio_pci_eventfd_replace_locked(vdev,
peventfd, efdctx);
if (ret)
eventfd_ctx_put(efdctx);
*ctx = efdctx;
return ret;
}
return 0;
}
return -EINVAL;
@ -793,7 +807,7 @@ static int vfio_pci_set_err_trigger(struct vfio_pci_core_device *vdev,
if (index != VFIO_PCI_ERR_IRQ_INDEX || start != 0 || count > 1)
return -EINVAL;
return vfio_pci_set_ctx_trigger_single(&vdev->err_trigger,
return vfio_pci_set_ctx_trigger_single(vdev, &vdev->err_trigger,
count, flags, data);
}
@ -804,7 +818,7 @@ static int vfio_pci_set_req_trigger(struct vfio_pci_core_device *vdev,
if (index != VFIO_PCI_REQ_IRQ_INDEX || start != 0 || count > 1)
return -EINVAL;
return vfio_pci_set_ctx_trigger_single(&vdev->req_trigger,
return vfio_pci_set_ctx_trigger_single(vdev, &vdev->req_trigger,
count, flags, data);
}

View File

@ -26,6 +26,10 @@ struct vfio_pci_ioeventfd {
bool vfio_pci_intx_mask(struct vfio_pci_core_device *vdev);
void vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev);
int vfio_pci_eventfd_replace_locked(struct vfio_pci_core_device *vdev,
struct vfio_pci_eventfd __rcu **peventfd,
struct eventfd_ctx *ctx);
int vfio_pci_set_irqs_ioctl(struct vfio_pci_core_device *vdev, uint32_t flags,
unsigned index, unsigned start, unsigned count,
void *data);

View File

@ -12,6 +12,7 @@
#include <linux/pci.h>
#include <linux/vfio.h>
#include <linux/irqbypass.h>
#include <linux/rcupdate.h>
#include <linux/types.h>
#include <linux/uuid.h>
#include <linux/notifier.h>
@ -29,6 +30,11 @@ struct vfio_pci_region;
struct p2pdma_provider;
struct dma_buf_phys_vec;
/*
 * Wrapper pairing an eventfd context with an rcu_head so that the wrapper
 * can be published through an __rcu pointer and later freed via call_rcu().
 */
struct vfio_pci_eventfd {
/* eventfd context carried by this wrapper */
struct eventfd_ctx *ctx;
/* callback head used when the wrapper is freed through call_rcu() */
struct rcu_head rcu;
};
struct vfio_pci_regops {
ssize_t (*rw)(struct vfio_pci_core_device *vdev, char __user *buf,
size_t count, loff_t *ppos, bool iswrite);
@ -124,8 +130,8 @@ struct vfio_pci_core_device {
struct pci_saved_state *pci_saved_state;
struct pci_saved_state *pm_save;
int ioeventfds_nr;
struct eventfd_ctx *err_trigger;
struct eventfd_ctx *req_trigger;
struct vfio_pci_eventfd __rcu *err_trigger;
struct vfio_pci_eventfd __rcu *req_trigger;
struct eventfd_ctx *pm_wake_eventfd_ctx;
struct list_head dummy_resources_list;
struct mutex ioeventfds_lock;