From 003049b1c4fb8aabb93febb7d1e49004f6ad653b Mon Sep 17 00:00:00 2001 From: Kai Aizen Date: Wed, 18 Feb 2026 17:36:41 +0000 Subject: [PATCH 1/3] io_uring/zcrx: fix user_ref race between scrub and refill paths The io_zcrx_put_niov_uref() function uses a non-atomic check-then-decrement pattern (atomic_read followed by separate atomic_dec) to manipulate user_refs. This is serialized against other callers by rq_lock, but io_zcrx_scrub() modifies the same counter with atomic_xchg() WITHOUT holding rq_lock. On SMP systems, the following race exists:

  CPU0 (refill, holds rq_lock)          CPU1 (scrub, no rq_lock)
  put_niov_uref:
    atomic_read(uref) - 1
    // window opens
                                        atomic_xchg(uref, 0) - 1
                                        return_niov_freelist(niov) [PUSH #1]
    // window closes
    atomic_dec(uref) - wraps to -1
    returns true
    return_niov(niov)
    return_niov_freelist(niov) [PUSH #2: DOUBLE-FREE]

The same niov is pushed to the freelist twice, causing free_count to exceed nr_iovs. Subsequent freelist pushes then perform an out-of-bounds write (a u32 value) past the kvmalloc'd freelist array into the adjacent slab object. Fix this by replacing the non-atomic read-then-dec in io_zcrx_put_niov_uref() with an atomic_try_cmpxchg loop that atomically tests and decrements user_refs. This makes the operation safe against concurrent atomic_xchg from scrub without requiring scrub to acquire rq_lock.
Fixes: 34a3e60821ab ("io_uring/zcrx: implement zerocopy receive pp memory provider") Cc: stable@vger.kernel.org Signed-off-by: Kai Aizen [pavel: removed a warning and a comment] Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 28150c6578e3..97984a73a95d 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -349,10 +349,14 @@ static inline atomic_t *io_get_user_counter(struct net_iov *niov) static bool io_zcrx_put_niov_uref(struct net_iov *niov) { atomic_t *uref = io_get_user_counter(niov); + int old; + + old = atomic_read(uref); + do { + if (unlikely(old == 0)) + return false; + } while (!atomic_try_cmpxchg(uref, &old, old - 1)); - if (unlikely(!atomic_read(uref))) - return false; - atomic_dec(uref); return true; } From 42a6bd57ee9f930a72c26f863c72f666d6ed9ea5 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Wed, 18 Feb 2026 18:35:34 -0700 Subject: [PATCH 2/3] io_uring: add IORING_OP_URING_CMD128 to opcode checks io_should_commit(), io_uring_classic_poll(), and io_do_iopoll() compare struct io_kiocb's opcode against IORING_OP_URING_CMD to implement special treatment for uring_cmds. The recently added opcode IORING_OP_URING_CMD128 is meant to be equivalent to IORING_OP_URING_CMD, so treat it the same way in these functions. 
Fixes: 1cba30bf9fdd ("io_uring: add support for IORING_SETUP_SQE_MIXED") Signed-off-by: Caleb Sander Mateos Reviewed-by: Anuj Gupta Reviewed-by: Kanchan Joshi Signed-off-by: Jens Axboe --- io_uring/io_uring.h | 6 ++++++ io_uring/kbuf.c | 2 +- io_uring/rw.c | 4 ++-- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 503663d6fd6d..0fa844faf287 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -530,6 +530,12 @@ static inline bool io_file_can_poll(struct io_kiocb *req) return false; } +static inline bool io_is_uring_cmd(const struct io_kiocb *req) +{ + return req->opcode == IORING_OP_URING_CMD || + req->opcode == IORING_OP_URING_CMD128; +} + static inline ktime_t io_get_time(struct io_ring_ctx *ctx) { if (ctx->clockid == CLOCK_MONOTONIC) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 67d4fe576473..dae5b4ab3819 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -171,7 +171,7 @@ static bool io_should_commit(struct io_kiocb *req, unsigned int issue_flags) return true; /* uring_cmd commits kbuf upfront, no need to auto-commit */ - if (!io_file_can_poll(req) && req->opcode != IORING_OP_URING_CMD) + if (!io_file_can_poll(req) && !io_is_uring_cmd(req)) return true; return false; } diff --git a/io_uring/rw.c b/io_uring/rw.c index b3971171c342..1a5f262734e8 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -1254,7 +1254,7 @@ static int io_uring_classic_poll(struct io_kiocb *req, struct io_comp_batch *iob { struct file *file = req->file; - if (req->opcode == IORING_OP_URING_CMD) { + if (io_is_uring_cmd(req)) { struct io_uring_cmd *ioucmd; ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); @@ -1380,7 +1380,7 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) wq_list_add_tail(&req->comp_list, &ctx->submit_state.compl_reqs); nr_events++; req->cqe.flags = io_put_kbuf(req, req->cqe.res, NULL); - if (req->opcode != IORING_OP_URING_CMD) + if (!io_is_uring_cmd(req)) io_req_rw_cleanup(req, 0); 
} if (nr_events) From ea129e55c9e06a51a93c3f5ef3e32a6cfa3f8ec7 Mon Sep 17 00:00:00 2001 From: Govindarajulu Varadarajan Date: Wed, 18 Feb 2026 20:59:30 -0800 Subject: [PATCH 3/3] io_uring: Add size check for sqe->cmd For SQE128, sqe->cmd provides 80 bytes for uring_cmd. Add macro to check if size of user struct does not exceed 80 bytes at compile time. User doesn't have to track this manually during development. Replace io_uring_sqe_cmd() inline func with macro and add io_uring_sqe128_cmd() which checks struct size for 16 bytes cmd and 80 bytes cmd respectively. Signed-off-by: Govindarajulu Varadarajan Reviewed-by: Caleb Sander Mateos Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 12 ++++++++---- drivers/nvme/host/ioctl.c | 3 ++- fs/fuse/dev_uring.c | 6 ++++-- include/linux/io_uring/cmd.h | 15 +++++++++++---- 4 files changed, 25 insertions(+), 11 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index c13cda58a7c6..46a785ce078d 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -3255,7 +3255,8 @@ static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd, unsigned int issue_flags) { /* May point to userspace-mapped memory */ - const struct ublksrv_io_cmd *ub_src = io_uring_sqe_cmd(cmd->sqe); + const struct ublksrv_io_cmd *ub_src = io_uring_sqe_cmd(cmd->sqe, + struct ublksrv_io_cmd); u16 buf_idx = UBLK_INVALID_BUF_IDX; struct ublk_device *ub = cmd->file->private_data; struct ublk_queue *ubq; @@ -3833,7 +3834,8 @@ static int ublk_validate_batch_fetch_cmd(struct ublk_batch_io_data *data) static int ublk_handle_non_batch_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) { - const struct ublksrv_io_cmd *ub_cmd = io_uring_sqe_cmd(cmd->sqe); + const struct ublksrv_io_cmd *ub_cmd = io_uring_sqe_cmd(cmd->sqe, + struct ublksrv_io_cmd); struct ublk_device *ub = cmd->file->private_data; unsigned tag = READ_ONCE(ub_cmd->tag); unsigned q_id = READ_ONCE(ub_cmd->q_id); @@ -3862,7 +3864,8 @@ static int 
ublk_handle_non_batch_cmd(struct io_uring_cmd *cmd, static int ublk_ch_batch_io_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) { - const struct ublk_batch_io *uc = io_uring_sqe_cmd(cmd->sqe); + const struct ublk_batch_io *uc = io_uring_sqe_cmd(cmd->sqe, + struct ublk_batch_io); struct ublk_device *ub = cmd->file->private_data; struct ublk_batch_io_data data = { .ub = ub, @@ -5253,7 +5256,8 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) { /* May point to userspace-mapped memory */ - const struct ublksrv_ctrl_cmd *ub_src = io_uring_sqe_cmd(cmd->sqe); + const struct ublksrv_ctrl_cmd *ub_src = io_uring_sqe128_cmd(cmd->sqe, + struct ublksrv_ctrl_cmd); struct ublksrv_ctrl_cmd header; struct ublk_device *ub = NULL; u32 cmd_op = cmd->cmd_op; diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index fb62633ccbb0..8844bbd39515 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -447,7 +447,8 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns, struct io_uring_cmd *ioucmd, unsigned int issue_flags, bool vec) { struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); - const struct nvme_uring_cmd *cmd = io_uring_sqe_cmd(ioucmd->sqe); + const struct nvme_uring_cmd *cmd = io_uring_sqe128_cmd(ioucmd->sqe, + struct nvme_uring_cmd); struct request_queue *q = ns ? 
ns->queue : ctrl->admin_q; struct nvme_uring_data d; struct nvme_command c; diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 5ceb217ced1b..60f2058feb74 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -879,7 +879,8 @@ static int fuse_ring_ent_set_commit(struct fuse_ring_ent *ent) static int fuse_uring_commit_fetch(struct io_uring_cmd *cmd, int issue_flags, struct fuse_conn *fc) { - const struct fuse_uring_cmd_req *cmd_req = io_uring_sqe_cmd(cmd->sqe); + const struct fuse_uring_cmd_req *cmd_req = io_uring_sqe128_cmd(cmd->sqe, + struct fuse_uring_cmd_req); struct fuse_ring_ent *ent; int err; struct fuse_ring *ring = fc->ring; @@ -1083,7 +1084,8 @@ fuse_uring_create_ring_ent(struct io_uring_cmd *cmd, static int fuse_uring_register(struct io_uring_cmd *cmd, unsigned int issue_flags, struct fuse_conn *fc) { - const struct fuse_uring_cmd_req *cmd_req = io_uring_sqe_cmd(cmd->sqe); + const struct fuse_uring_cmd_req *cmd_req = io_uring_sqe128_cmd(cmd->sqe, + struct fuse_uring_cmd_req); struct fuse_ring *ring = smp_load_acquire(&fc->ring); struct fuse_ring_queue *queue; struct fuse_ring_ent *ent; diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index 375fd048c4cb..331dcbefe72f 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -20,10 +20,17 @@ struct io_uring_cmd { u8 unused[8]; }; -static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe) -{ - return sqe->cmd; -} +#define io_uring_sqe128_cmd(sqe, type) ({ \ + BUILD_BUG_ON(sizeof(type) > ((2 * sizeof(struct io_uring_sqe)) - \ + offsetof(struct io_uring_sqe, cmd))); \ + (const type *)(sqe)->cmd; \ +}) + +#define io_uring_sqe_cmd(sqe, type) ({ \ + BUILD_BUG_ON(sizeof(type) > (sizeof(struct io_uring_sqe) - \ + offsetof(struct io_uring_sqe, cmd))); \ + (const type *)(sqe)->cmd; \ +}) static inline void io_uring_cmd_private_sz_check(size_t cmd_sz) {