From 41b70df5b38bc80967d2e0ed55cc3c3896bba781 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 12 Aug 2025 08:30:11 -0600 Subject: [PATCH 1/2] io_uring/net: commit partial buffers on retry Ring provided buffers are potentially only valid within the single execution context in which they were acquired. io_uring deals with this and invalidates them on retry. But on the networking side, if MSG_WAITALL is set, or if the socket is of the streaming type and too little was processed, then it will hang on to the buffer rather than recycle or commit it. This is problematic for two reasons: 1) If someone unregisters the provided buffer ring before a later retry, then the req->buf_list will no longer be valid. 2) If multiple sockers are using the same buffer group, then multiple receives can consume the same memory. This can cause data corruption in the application, as either receive could land in the same userspace buffer. Fix this by disallowing partial retries from pinning a provided buffer across multiple executions, if ring provided buffers are used. Cc: stable@vger.kernel.org Reported-by: pt x Fixes: c56e022c0a27 ("io_uring: add support for user mapped provided buffer ring") Signed-off-by: Jens Axboe --- io_uring/net.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index dd96e355982f..d69f2afa4f7a 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -494,6 +494,15 @@ static int io_bundle_nbufs(struct io_async_msghdr *kmsg, int ret) return nbufs; } +static int io_net_kbuf_recyle(struct io_kiocb *req, + struct io_async_msghdr *kmsg, int len) +{ + req->flags |= REQ_F_BL_NO_RECYCLE; + if (req->flags & REQ_F_BUFFERS_COMMIT) + io_kbuf_commit(req, req->buf_list, len, io_bundle_nbufs(kmsg, len)); + return IOU_RETRY; +} + static inline bool io_send_finish(struct io_kiocb *req, int *ret, struct io_async_msghdr *kmsg, unsigned issue_flags) @@ -562,8 +571,7 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) kmsg->msg.msg_controllen = 0; kmsg->msg.msg_control = NULL; sr->done_io += ret; - req->flags |= REQ_F_BL_NO_RECYCLE; - return -EAGAIN; + return io_net_kbuf_recyle(req, kmsg, ret); } if (ret == -ERESTARTSYS) ret = -EINTR; @@ -674,8 +682,7 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags) sr->len -= ret; sr->buf += ret; sr->done_io += ret; - req->flags |= REQ_F_BL_NO_RECYCLE; - return -EAGAIN; + return io_net_kbuf_recyle(req, kmsg, ret); } if (ret == -ERESTARTSYS) ret = -EINTR; @@ -1071,8 +1078,7 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) } if (ret > 0 && io_net_retry(sock, flags)) { sr->done_io += ret; - req->flags |= REQ_F_BL_NO_RECYCLE; - return IOU_RETRY; + return io_net_kbuf_recyle(req, kmsg, ret); } if (ret == -ERESTARTSYS) ret = -EINTR; @@ -1218,8 +1224,7 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags) sr->len -= ret; sr->buf += ret; sr->done_io += ret; - req->flags |= REQ_F_BL_NO_RECYCLE; - return -EAGAIN; + return io_net_kbuf_recyle(req, kmsg, ret); } if (ret == -ERESTARTSYS) ret = -EINTR; @@ -1500,8 +1505,7 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) zc->len -= ret; zc->buf += ret; zc->done_io += ret; - req->flags |= REQ_F_BL_NO_RECYCLE; - return -EAGAIN; + return io_net_kbuf_recyle(req, kmsg, ret); } if (ret == -ERESTARTSYS) ret = -EINTR; @@ -1571,8 +1575,7 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) if (ret > 0 && io_net_retry(sock, flags)) { sr->done_io += ret; - req->flags |= REQ_F_BL_NO_RECYCLE; - return -EAGAIN; + return io_net_kbuf_recyle(req, kmsg, ret); } if (ret == -ERESTARTSYS) ret = -EINTR; From 9d83e1f05c98bab5de350bef89177e2be8b34db0 Mon Sep 17 00:00:00 2001 From: Fengnan Chang Date: Wed, 13 Aug 2025 20:02:14 +0800 Subject: [PATCH 2/2] io_uring/io-wq: add check free worker before create new worker After commit 0b2b066f8a85 ("io_uring/io-wq: only create a new worker if it can make progress"), in our produce environment, we still observe that part of io_worker threads keeps creating and destroying. After analysis, it was confirmed that this was due to a more complex scenario involving a large number of fsync operations, which can be abstracted as frequent write + fsync operations on multiple files in a single uring instance. Since write is a hash operation while fsync is not, and fsync is likely to be suspended during execution, the action of checking the hash value in io_wqe_dec_running cannot handle such scenarios. Similarly, if hash-based work and non-hash-based work are sent at the same time, similar issues are likely to occur. Returning to the starting point of the issue, when a new work arrives, io_wq_enqueue may wake up free worker A, while io_wq_dec_running may create worker B. Ultimately, only one of A and B can obtain and process the task, leaving the other in an idle state. In the end, the issue is caused by inconsistent logic in the checks performed by io_wq_enqueue and io_wq_dec_running. Therefore, the problem can be resolved by checking for available workers in io_wq_dec_running. Signed-off-by: Fengnan Chang Reviewed-by: Diangang Li Link: https://lore.kernel.org/r/20250813120214.18729-1-changfengnan@bytedance.com Signed-off-by: Jens Axboe --- io_uring/io-wq.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index be91edf34f01..17dfaa0395c4 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -357,6 +357,13 @@ static void create_worker_cb(struct callback_head *cb) worker = container_of(cb, struct io_worker, create_work); wq = worker->wq; acct = worker->acct; + + rcu_read_lock(); + do_create = !io_acct_activate_free_worker(acct); + rcu_read_unlock(); + if (!do_create) + goto no_need_create; + raw_spin_lock(&acct->workers_lock); if (acct->nr_workers < acct->max_workers) { @@ -367,6 +374,7 @@ static void create_worker_cb(struct callback_head *cb) if (do_create) { create_io_worker(wq, acct); } else { +no_need_create: atomic_dec(&acct->nr_running); io_worker_ref_put(wq); }