From 5fc16fa5f13b3c06fdb959ef262050bd810416a2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 1 Jun 2024 12:25:35 -0600 Subject: [PATCH 1/4] io_uring: check for non-NULL file pointer in io_file_can_poll() In earlier kernels, it was possible to trigger a NULL pointer dereference off the forced async preparation path, if no file had been assigned. The trace leading to that looks as follows: BUG: kernel NULL pointer dereference, address: 00000000000000b0 PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP CPU: 67 PID: 1633 Comm: buf-ring-invali Not tainted 6.8.0-rc3+ #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS unknown 2/2/2022 RIP: 0010:io_buffer_select+0xc3/0x210 Code: 00 00 48 39 d1 0f 82 ae 00 00 00 48 81 4b 48 00 00 01 00 48 89 73 70 0f b7 50 0c 66 89 53 42 85 ed 0f 85 d2 00 00 00 48 8b 13 <48> 8b 92 b0 00 00 00 48 83 7a 40 00 0f 84 21 01 00 00 4c 8b 20 5b RSP: 0018:ffffb7bec38c7d88 EFLAGS: 00010246 RAX: ffff97af2be61000 RBX: ffff97af234f1700 RCX: 0000000000000040 RDX: 0000000000000000 RSI: ffff97aecfb04820 RDI: ffff97af234f1700 RBP: 0000000000000000 R08: 0000000000200030 R09: 0000000000000020 R10: ffffb7bec38c7dc8 R11: 000000000000c000 R12: ffffb7bec38c7db8 R13: ffff97aecfb05800 R14: ffff97aecfb05800 R15: ffff97af2be5e000 FS: 00007f852f74b740(0000) GS:ffff97b1eeec0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000000000b0 CR3: 000000016deab005 CR4: 0000000000370ef0 Call Trace: ? __die+0x1f/0x60 ? page_fault_oops+0x14d/0x420 ? do_user_addr_fault+0x61/0x6a0 ? exc_page_fault+0x6c/0x150 ? asm_exc_page_fault+0x22/0x30 ? io_buffer_select+0xc3/0x210 __io_import_iovec+0xb5/0x120 io_readv_prep_async+0x36/0x70 io_queue_sqe_fallback+0x20/0x260 io_submit_sqes+0x314/0x630 __do_sys_io_uring_enter+0x339/0xbc0 ? __do_sys_io_uring_register+0x11b/0xc50 ? vm_mmap_pgoff+0xce/0x160 do_syscall_64+0x5f/0x180 entry_SYSCALL_64_after_hwframe+0x46/0x4e RIP: 0033:0x55e0a110a67e Code: ba cc 00 00 00 45 31 c0 44 0f b6 92 d0 00 00 00 31 d2 41 b9 08 00 00 00 41 83 e2 01 41 c1 e2 04 41 09 c2 b8 aa 01 00 00 0f 05 90 89 30 eb a9 0f 1f 40 00 48 8b 42 20 8b 00 a8 06 75 af 85 f6 because the request is marked forced ASYNC and has a bad file fd, and hence takes the forced async prep path. Current kernels with the request async prep cleaned up can no longer hit this issue, but for ease of backporting, let's add this safety check in here too as it really doesn't hurt. For both cases, this will inevitably end with a CQE posted with -EBADF. Cc: stable@vger.kernel.org Fixes: a76c0b31eef5 ("io_uring: commit non-pollable provided mapped buffers upfront") Signed-off-by: Jens Axboe --- io_uring/io_uring.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 624ca9076a50..726e6367af4d 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -433,7 +433,7 @@ static inline bool io_file_can_poll(struct io_kiocb *req) { if (req->flags & REQ_F_CAN_POLL) return true; - if (file_can_poll(req->file)) { + if (req->file && file_can_poll(req->file)) { req->flags |= REQ_F_CAN_POLL; return true; } From 415ce0ea55c5a3afea501a773e002be9ed7149f5 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 3 Jun 2024 13:56:53 -0600 Subject: [PATCH 2/4] io_uring/napi: fix timeout calculation Not quite sure what __io_napi_adjust_timeout() was attemping to do, it's adjusting both the NAPI timeout and the general overall timeout, and calculating a value that is never used. The overall timeout is a super set of the NAPI timeout, and doesn't need adjusting. The only thing we really need to care about is that the NAPI timeout doesn't exceed the overall timeout. If a user asked for a timeout of eg 5 usec and NAPI timeout is 10 usec, then we should not spin for 10 usec. While in there, sanitize the time checking a bit. If we have a negative value in the passed in timeout, discard it. Round up the value as well, so we don't end up with a NAPI timeout for the majority of the wait, with only a tiny sleep value at the end. Hence the only case we need to care about is if the NAPI timeout is larger than the overall timeout. If it is, cap the NAPI timeout at what the overall timeout is. Cc: stable@vger.kernel.org Fixes: 8d0c12a80cde ("io-uring: add napi busy poll support") Reported-by: Lewis Baker Signed-off-by: Jens Axboe --- io_uring/napi.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/io_uring/napi.c b/io_uring/napi.c index 883a1a665907..8c18ede595c4 100644 --- a/io_uring/napi.c +++ b/io_uring/napi.c @@ -261,12 +261,14 @@ int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg) } /* - * __io_napi_adjust_timeout() - Add napi id to the busy poll list + * __io_napi_adjust_timeout() - adjust busy loop timeout * @ctx: pointer to io-uring context structure * @iowq: pointer to io wait queue * @ts: pointer to timespec or NULL * * Adjust the busy loop timeout according to timespec and busy poll timeout. + * If the specified NAPI timeout is bigger than the wait timeout, then adjust + * the NAPI timeout accordingly. */ void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, struct io_wait_queue *iowq, struct timespec64 *ts) @@ -274,16 +276,16 @@ void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, struct io_wait_queue *iow unsigned int poll_to = READ_ONCE(ctx->napi_busy_poll_to); if (ts) { - struct timespec64 poll_to_ts = ns_to_timespec64(1000 * (s64)poll_to); + struct timespec64 poll_to_ts; - if (timespec64_compare(ts, &poll_to_ts) > 0) { - *ts = timespec64_sub(*ts, poll_to_ts); - } else { - u64 to = timespec64_to_ns(ts); - - do_div(to, 1000); - ts->tv_sec = 0; - ts->tv_nsec = 0; + poll_to_ts = ns_to_timespec64(1000 * (s64)poll_to); + if (timespec64_compare(ts, &poll_to_ts) < 0) { + s64 poll_to_ns = timespec64_to_ns(ts); + if (poll_to_ns > 0) { + u64 val = poll_to_ns + 999; + do_div(val, (s64) 1000); + poll_to = val; + } } } From 91215f70ea8541e9011c0b48f8b59b9e0ce6953b Mon Sep 17 00:00:00 2001 From: Su Hui Date: Tue, 4 Jun 2024 20:12:43 +0800 Subject: [PATCH 3/4] io_uring/io-wq: avoid garbage value of 'match' in io_wq_enqueue() Clang static checker (scan-build) warning: o_uring/io-wq.c:line 1051, column 3 The expression is an uninitialized value. The computed value will also be garbage. 'match.nr_pending' is used in io_acct_cancel_pending_work(), but it is not fully initialized. Change the order of assignment for 'match' to fix this problem. Fixes: 42abc95f05bf ("io-wq: decouple work_list protection from the big wqe->lock") Signed-off-by: Su Hui Link: https://lore.kernel.org/r/20240604121242.2661244-1-suhui@nfschina.com Signed-off-by: Jens Axboe --- io_uring/io-wq.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index d1c47a9d9215..7d3316fe9bfc 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -927,7 +927,11 @@ void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work) { struct io_wq_acct *acct = io_work_get_acct(wq, work); unsigned long work_flags = work->flags; - struct io_cb_cancel_data match; + struct io_cb_cancel_data match = { + .fn = io_wq_work_match_item, + .data = work, + .cancel_all = false, + }; bool do_create; /* @@ -965,10 +969,6 @@ void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work) raw_spin_unlock(&wq->lock); /* fatal condition, failed to create the first worker */ - match.fn = io_wq_work_match_item, - match.data = work, - match.cancel_all = false, - io_acct_cancel_pending_work(wq, acct, &match); } } From 73254a297c2dd094abec7c9efee32455ae875bdf Mon Sep 17 00:00:00 2001 From: Hagar Hemdan Date: Tue, 4 Jun 2024 13:05:27 +0000 Subject: [PATCH 4/4] io_uring: fix possible deadlock in io_register_iowq_max_workers() The io_register_iowq_max_workers() function calls io_put_sq_data(), which acquires the sqd->lock without releasing the uring_lock. Similar to the commit 009ad9f0c6ee ("io_uring: drop ctx->uring_lock before acquiring sqd->lock"), this can lead to a potential deadlock situation. To resolve this issue, the uring_lock is released before calling io_put_sq_data(), and then it is re-acquired after the function call. This change ensures that the locks are acquired in the correct order, preventing the possibility of a deadlock. Suggested-by: Maximilian Heyne Signed-off-by: Hagar Hemdan Link: https://lore.kernel.org/r/20240604130527.3597-1-hagarhem@amazon.com Signed-off-by: Jens Axboe --- io_uring/register.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/io_uring/register.c b/io_uring/register.c index ef8c908346a4..c0010a66a6f2 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -355,8 +355,10 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx, } if (sqd) { + mutex_unlock(&ctx->uring_lock); mutex_unlock(&sqd->lock); io_put_sq_data(sqd); + mutex_lock(&ctx->uring_lock); } if (copy_to_user(arg, new_count, sizeof(new_count))) @@ -380,8 +382,10 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx, return 0; err: if (sqd) { + mutex_unlock(&ctx->uring_lock); mutex_unlock(&sqd->lock); io_put_sq_data(sqd); + mutex_lock(&ctx->uring_lock); } return ret; }