From b8c2e9e27636b92dc96c12f16894cbc60c58a306 Mon Sep 17 00:00:00 2001
From: Yufan Chen <yufan.chen@linux.dev>
Date: Mon, 4 May 2026 01:56:10 +0800
Subject: [PATCH 1/4] io_uring/napi: clear tracked NAPI entries on unregister

IORING_UNREGISTER_NAPI disables NAPI busy polling, but it currently
leaves any previously tracked NAPI IDs on the ring context. The normal
wait path only checks whether the list is empty before entering the
busy poll helper, so an unregistered ring can still observe stale
entries and run an unexpected busy poll pass.

Make unregister switch the context to inactive and free the tracked
entries. Do the same inactive transition while changing the tracking
strategy, and recheck the expected tracking mode under napi_lock before
inserting a newly learned NAPI ID. This prevents a racing poll path
from repopulating the list after unregister or reconfiguration. Also
make the busy poll dispatcher ignore inactive mode explicitly.

Fixes: 6bf90bd8c58a ("io_uring/napi: add static napi tracking strategy")
Signed-off-by: Yufan Chen <yufan.chen@linux.dev>
Link: https://patch.msgid.link/20260503175610.35521-1-yufan.chen@linux.dev
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/napi.c | 27 ++++++++++++++++++++-------
 io_uring/napi.h |  8 +++++---
 2 files changed, 25 insertions(+), 10 deletions(-)

diff --git a/io_uring/napi.c b/io_uring/napi.c
index 8d68366a4b90..bfc771445912 100644
--- a/io_uring/napi.c
+++ b/io_uring/napi.c
@@ -38,7 +38,8 @@ static inline ktime_t net_to_ktime(unsigned long t)
 	return ns_to_ktime(t << 10);
 }
 
-int __io_napi_add_id(struct io_ring_ctx *ctx, unsigned int napi_id)
+int __io_napi_add_id(struct io_ring_ctx *ctx, unsigned int napi_id,
+		     unsigned int mode)
 {
 	struct hlist_head *hash_list;
 	struct io_napi_entry *e;
@@ -69,6 +70,11 @@ int __io_napi_add_id(struct io_ring_ctx *ctx, unsigned int napi_id)
 	 * kfree()
 	 */
 	spin_lock(&ctx->napi_lock);
+	if (unlikely(READ_ONCE(ctx->napi_track_mode) != mode)) {
+		spin_unlock(&ctx->napi_lock);
+		kfree(e);
+		return -EINVAL;
+	}
 	if (unlikely(io_napi_hash_find(hash_list, napi_id))) {
 		spin_unlock(&ctx->napi_lock);
 		kfree(e);
@@ -196,9 +202,14 @@ __io_napi_do_busy_loop(struct io_ring_ctx *ctx,
 			bool (*loop_end)(void *, unsigned long),
 			void *loop_end_arg)
 {
-	if (READ_ONCE(ctx->napi_track_mode) == IO_URING_NAPI_TRACKING_STATIC)
+	switch (READ_ONCE(ctx->napi_track_mode)) {
+	case IO_URING_NAPI_TRACKING_STATIC:
 		return static_tracking_do_busy_loop(ctx, loop_end, loop_end_arg);
-	return dynamic_tracking_do_busy_loop(ctx, loop_end, loop_end_arg);
+	case IO_URING_NAPI_TRACKING_DYNAMIC:
+		return dynamic_tracking_do_busy_loop(ctx, loop_end, loop_end_arg);
+	default:
+		return false;
+	}
 }
 
 static void io_napi_blocking_busy_loop(struct io_ring_ctx *ctx,
@@ -273,13 +284,13 @@ static int io_napi_register_napi(struct io_ring_ctx *ctx,
 	default:
 		return -EINVAL;
 	}
-	/* clean the napi list for new settings */
+	WRITE_ONCE(ctx->napi_track_mode, IO_URING_NAPI_TRACKING_INACTIVE);
 	io_napi_free(ctx);
-	WRITE_ONCE(ctx->napi_track_mode, napi->op_param);
 	/* cap NAPI at 10 msec of spin time */
 	napi->busy_poll_to = min(10000, napi->busy_poll_to);
 	WRITE_ONCE(ctx->napi_busy_poll_dt, napi->busy_poll_to * NSEC_PER_USEC);
 	WRITE_ONCE(ctx->napi_prefer_busy_poll, !!napi->prefer_busy_poll);
+	WRITE_ONCE(ctx->napi_track_mode, napi->op_param);
 
 	return 0;
 }
@@ -315,7 +326,8 @@ int io_register_napi(struct io_ring_ctx *ctx, void __user *arg)
 	case IO_URING_NAPI_STATIC_ADD_ID:
 		if (curr.op_param != IO_URING_NAPI_TRACKING_STATIC)
 			return -EINVAL;
-		return __io_napi_add_id(ctx, napi.op_param);
+		return __io_napi_add_id(ctx, napi.op_param,
+					IO_URING_NAPI_TRACKING_STATIC);
 	case IO_URING_NAPI_STATIC_DEL_ID:
 		if (curr.op_param != IO_URING_NAPI_TRACKING_STATIC)
 			return -EINVAL;
@@ -343,9 +355,10 @@ int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg)
 	if (arg && copy_to_user(arg, &curr, sizeof(curr)))
 		return -EFAULT;
 
+	WRITE_ONCE(ctx->napi_track_mode, IO_URING_NAPI_TRACKING_INACTIVE);
 	WRITE_ONCE(ctx->napi_busy_poll_dt, 0);
 	WRITE_ONCE(ctx->napi_prefer_busy_poll, false);
-	WRITE_ONCE(ctx->napi_track_mode, IO_URING_NAPI_TRACKING_INACTIVE);
+	io_napi_free(ctx);
 
 	return 0;
 }
diff --git a/io_uring/napi.h b/io_uring/napi.h
index fa742f42e09b..e0aecccc5065 100644
--- a/io_uring/napi.h
+++ b/io_uring/napi.h
@@ -15,7 +15,8 @@ void io_napi_free(struct io_ring_ctx *ctx);
 int io_register_napi(struct io_ring_ctx *ctx, void __user *arg);
 int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg);
 
-int __io_napi_add_id(struct io_ring_ctx *ctx, unsigned int napi_id);
+int __io_napi_add_id(struct io_ring_ctx *ctx, unsigned int napi_id,
+		     unsigned int mode);
 
 void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq);
 int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx);
@@ -43,13 +44,14 @@ static inline void io_napi_add(struct io_kiocb *req)
 {
	struct io_ring_ctx *ctx = req->ctx;
 	struct socket *sock;
+	unsigned int mode = IO_URING_NAPI_TRACKING_DYNAMIC;
 
-	if (READ_ONCE(ctx->napi_track_mode) != IO_URING_NAPI_TRACKING_DYNAMIC)
+	if (READ_ONCE(ctx->napi_track_mode) != mode)
 		return;
 
 	sock = sock_from_file(req->file);
 	if (sock && sock->sk)
-		__io_napi_add_id(ctx, READ_ONCE(sock->sk->sk_napi_id));
+		__io_napi_add_id(ctx, READ_ONCE(sock->sk->sk_napi_id), mode);
 }
 
 #else
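
A user-space sketch of the ordering patch 1 establishes may help:
allocate outside the lock, then re-check the tracking mode under the
same lock the unregister path holds before publishing the entry. This
is an illustrative pthread model, not the kernel code; all names here
are made up:

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdlib.h>

    enum { TRACK_INACTIVE, TRACK_DYNAMIC, TRACK_STATIC };

    struct entry { int id; struct entry *next; };

    static _Atomic int track_mode = TRACK_DYNAMIC;
    static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct entry *entries;

    /* poll path: insert an id learned while 'mode' was observed */
    static int add_id(int id, int mode)
    {
        struct entry *e = malloc(sizeof(*e));

        if (!e)
            return -1;
        e->id = id;
        pthread_mutex_lock(&list_lock);
        /* mode may have changed since the unlocked check: bail out */
        if (atomic_load(&track_mode) != mode) {
            pthread_mutex_unlock(&list_lock);
            free(e);
            return -1;
        }
        e->next = entries;
        entries = e;
        pthread_mutex_unlock(&list_lock);
        return 0;
    }

    /* unregister path: go inactive first, then free under the lock */
    static void unregister_all(void)
    {
        atomic_store(&track_mode, TRACK_INACTIVE);
        pthread_mutex_lock(&list_lock);
        while (entries) {
            struct entry *e = entries;

            entries = e->next;
            free(e);
        }
        pthread_mutex_unlock(&list_lock);
    }

    int main(void)
    {
        add_id(1, TRACK_DYNAMIC);
        unregister_all();
        return add_id(2, TRACK_DYNAMIC) == -1 ? 0 : 1; /* must refuse */
    }

Because the mode flips to inactive before the list is emptied, an add
racing with unregister sees the stale mode under the lock and backs
out, so the freed list cannot be repopulated.
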
From 04fe9aeb4f3c0999e6715385664c677469dfd8f4 Mon Sep 17 00:00:00 2001
From: Yufan Chen <yufan.chen@linux.dev>
Date: Mon, 4 May 2026 01:57:10 +0800
Subject: [PATCH 2/4] io_uring/eventfd: reset deferred signal state

Recursive eventfd wakeups must defer io_uring eventfd signaling
because eventfd_signal_mask() rejects reentry from eventfd wakeup
handlers. The IO_EVENTFD_OP_SIGNAL_BIT in io_ev_fd->ops tracks an
outstanding deferred signal so that the same rcu_head is not queued
twice.

Today that bit is only ever set, never cleared. Once the first
deferred callback has run, later recursive notifications still see the
bit set and skip queueing another deferred signal. This can leave new
completions without a matching eventfd wake after the first recursive
deferral.

Clear the pending bit before issuing the deferred signal. If the
wakeup path recurses while the callback runs, a new signal can be
queued for the next RCU grace period, while the current callback keeps
its reference until it returns.

Fixes: 60b6c075e8eb ("io_uring/eventfd: move to more idiomatic RCU free usage")
Signed-off-by: Yufan Chen <yufan.chen@linux.dev>
Link: https://patch.msgid.link/20260503175710.37209-1-yufan.chen@linux.dev
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/eventfd.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/io_uring/eventfd.c b/io_uring/eventfd.c
index 3da028500f76..d656cc2a0b9b 100644
--- a/io_uring/eventfd.c
+++ b/io_uring/eventfd.c
@@ -43,6 +43,7 @@ static void io_eventfd_do_signal(struct rcu_head *rcu)
 {
 	struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);
 
+	atomic_andnot(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops);
 	eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE);
 	io_eventfd_put(ev_fd);
 }
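
The essence of patch 2 is a pending-bit handshake: the producer queues
the deferred callback only when it sets the bit from clear, and the
callback clears the bit before signaling so a recursive wakeup can
queue the next round. A minimal C11-atomics sketch with illustrative
names, not the kernel code:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    #define SIGNAL_PENDING 1u

    static _Atomic unsigned int ops;

    /* completion path: true means the caller must queue the callback */
    static bool want_deferred_signal(void)
    {
        unsigned int prev = atomic_fetch_or(&ops, SIGNAL_PENDING);

        return !(prev & SIGNAL_PENDING);
    }

    /* deferred callback (an RCU callback in the kernel) */
    static void do_deferred_signal(void)
    {
        /*
         * Clear before signaling: a wakeup that recurses out of the
         * signal below can then queue the next deferral instead of
         * being silently dropped.
         */
        atomic_fetch_and(&ops, ~SIGNAL_PENDING);
        puts("signal");
    }

    int main(void)
    {
        if (want_deferred_signal())
            do_deferred_signal();
        /* without the clear, this second round would be lost */
        if (want_deferred_signal())
            do_deferred_signal();
        return 0;
    }

In the kernel the "queue" step is call_rcu() on ev_fd->rcu and the
reference pinning is ev_fd's refcount; the sketch only models the bit
protocol.
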
From 9cc6bac1bebf8310d2950d1411a91479e86d69a1 Mon Sep 17 00:00:00 2001
From: Maoyi Xie <maoyi.xie@ntu.edu.sg>
Date: Mon, 4 May 2026 23:37:54 +0800
Subject: [PATCH 3/4] io_uring/timeout: honour caller's time namespace for
 IORING_TIMEOUT_ABS

io_uring's IORING_OP_TIMEOUT and IORING_OP_LINK_TIMEOUT accept a
timespec from the caller via io_parse_user_time(). With
IORING_TIMEOUT_ABS, the timestamp is an absolute deadline on the
selected clock: CLOCK_MONOTONIC by default, with CLOCK_BOOTTIME and
CLOCK_REALTIME also selectable.

A submitter inside a CLONE_NEWTIME time namespace observes
CLOCK_MONOTONIC and CLOCK_BOOTTIME shifted by the namespace's offsets
relative to the host. Every other ABS timer interface in the kernel
converts the caller's absolute time to the host view via
timens_ktime_to_host() before arming an hrtimer:

  kernel/time/posix-timers.c -- timer_settime(TIMER_ABSTIME)
  kernel/time/posix-stubs.c  -- clock_nanosleep(TIMER_ABSTIME)
  kernel/time/alarmtimer.c   -- alarm_timer_nsleep(TIMER_ABSTIME)
  fs/timerfd.c               -- timerfd_settime(TFD_TIMER_ABSTIME)

io_parse_user_time() does not. As a result, an absolute timeout
submitted from within a time namespace is interpreted in the host
view, which is generally a different point in time. It may already be
in the past, causing the timer to fire immediately, or far in the
future, causing the timer not to fire when expected.

Reproducer: in unshare --user --time, with a -10s monotonic offset,
submit IORING_OP_TIMEOUT with IORING_TIMEOUT_ABS and deadline =
now + 1s. The CQE is delivered after <1ms instead of the expected ~1s.

Apply timens_ktime_to_host() to the parsed time when
IORING_TIMEOUT_ABS is set. Split the existing clock id resolver out of
io_timeout_get_clock() into a flags-only helper, io_flags_to_clock(),
so io_parse_user_time() can resolve the clock without a struct
io_timeout_data.

timens_ktime_to_host() is a no-op for clocks not affected by time
namespaces, e.g. CLOCK_REALTIME, and for callers in the initial time
namespace, so the fast path is unchanged.

SQPOLL is also covered. The SQPOLL kernel thread is created via
create_io_thread() with CLONE_THREAD and no CLONE_NEW* flag, so
copy_namespaces() shares the submitter's nsproxy by reference. Inside
the SQPOLL kthread, current->nsproxy->time_ns is the submitter's
time_ns, and timens_ktime_to_host() resolves correctly.

Suggested-by: Pavel Begunkov <asml.silence@gmail.com>
Suggested-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Maoyi Xie <maoyi.xie@ntu.edu.sg>
Link: https://patch.msgid.link/20260504153755.1293932-2-maoyi.xie@ntu.edu.sg
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/timeout.c | 35 ++++++++++++++++++++++-------------
 1 file changed, 22 insertions(+), 13 deletions(-)

diff --git a/io_uring/timeout.c b/io_uring/timeout.c
index 4cfdfc519770..e2595cae2b07 100644
--- a/io_uring/timeout.c
+++ b/io_uring/timeout.c
@@ -3,6 +3,7 @@
 #include <linux/errno.h>
 #include <linux/file.h>
 #include <linux/io_uring.h>
+#include <linux/time_namespace.h>
 
 #include <trace/events/io_uring.h>
 
@@ -35,6 +36,22 @@ struct io_timeout_rem {
 	bool				ltimeout;
 };
 
+static clockid_t io_flags_to_clock(unsigned flags)
+{
+	switch (flags & IORING_TIMEOUT_CLOCK_MASK) {
+	case IORING_TIMEOUT_BOOTTIME:
+		return CLOCK_BOOTTIME;
+	case IORING_TIMEOUT_REALTIME:
+		return CLOCK_REALTIME;
+	default:
+		/* can't happen, vetted at prep time */
+		WARN_ON_ONCE(1);
+		fallthrough;
+	case 0:
+		return CLOCK_MONOTONIC;
+	}
+}
+
 static int io_parse_user_time(ktime_t *time, u64 arg, unsigned flags)
 {
 	struct timespec64 ts;
@@ -43,7 +60,7 @@ static int io_parse_user_time(ktime_t *time, u64 arg, unsigned flags)
 		*time = ns_to_ktime(arg);
 		if (*time < 0)
 			return -EINVAL;
-		return 0;
+		goto out;
 	}
 
 	if (get_timespec64(&ts, u64_to_user_ptr(arg)))
@@ -51,6 +68,9 @@ static int io_parse_user_time(ktime_t *time, u64 arg, unsigned flags)
 	if (ts.tv_sec < 0 || ts.tv_nsec < 0)
 		return -EINVAL;
 	*time = timespec64_to_ktime(ts);
+out:
+	if (flags & IORING_TIMEOUT_ABS)
+		*time = timens_ktime_to_host(io_flags_to_clock(flags), *time);
 	return 0;
 }
 
@@ -399,18 +419,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
 
 static clockid_t io_timeout_get_clock(struct io_timeout_data *data)
 {
-	switch (data->flags & IORING_TIMEOUT_CLOCK_MASK) {
-	case IORING_TIMEOUT_BOOTTIME:
-		return CLOCK_BOOTTIME;
-	case IORING_TIMEOUT_REALTIME:
-		return CLOCK_REALTIME;
-	default:
-		/* can't happen, vetted at prep time */
-		WARN_ON_ONCE(1);
-		fallthrough;
-	case 0:
-		return CLOCK_MONOTONIC;
-	}
+	return io_flags_to_clock(data->flags);
 }
 
 static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
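
The reproducer described above can be written with liburing roughly as
follows. This is a sketch, assuming a util-linux unshare(1) with time
namespace support, run as e.g.:
unshare --user --map-root-user --time --monotonic -10 ./abs_timeout.
On an unfixed kernel the printed delay is ~0 ms; with the patch it is
~1000 ms.

    #include <liburing.h>
    #include <stdio.h>
    #include <time.h>

    static long long ms_since(const struct timespec *t0)
    {
        struct timespec t1;

        clock_gettime(CLOCK_MONOTONIC, &t1);
        return (t1.tv_sec - t0->tv_sec) * 1000LL +
               (t1.tv_nsec - t0->tv_nsec) / 1000000LL;
    }

    int main(void)
    {
        struct io_uring ring;
        struct io_uring_sqe *sqe;
        struct io_uring_cqe *cqe;
        struct __kernel_timespec ts;
        struct timespec now;

        if (io_uring_queue_init(8, &ring, 0))
            return 1;

        /* absolute deadline: namespace-view CLOCK_MONOTONIC now + 1s */
        clock_gettime(CLOCK_MONOTONIC, &now);
        ts.tv_sec = now.tv_sec + 1;
        ts.tv_nsec = now.tv_nsec;

        sqe = io_uring_get_sqe(&ring);
        io_uring_prep_timeout(sqe, &ts, 0, IORING_TIMEOUT_ABS);
        io_uring_submit(&ring);

        io_uring_wait_cqe(&ring, &cqe);
        /* expect res == -ETIME; the delay is the interesting part */
        printf("res=%d after %lldms\n", cqe->res, ms_since(&now));
        io_uring_cqe_seen(&ring, cqe);
        io_uring_queue_exit(&ring);
        return 0;
    }
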
From 45d2b37a37ab98484693533496395c610a2cab96 Mon Sep 17 00:00:00 2001
From: Maoyi Xie <maoyi.xie@ntu.edu.sg>
Date: Mon, 4 May 2026 23:37:55 +0800
Subject: [PATCH 4/4] io_uring/wait: honour caller's time namespace for
 IORING_ENTER_ABS_TIMER

io_uring_enter() with IORING_ENTER_ABS_TIMER takes an absolute
timespec from the caller via ext_arg->ts and arms an ABS-mode hrtimer
in __io_cqring_wait_schedule(). The conversion path in io_uring/wait.c
parses ext_arg->ts inline rather than going through
io_parse_user_time(), so it does not pick up the time namespace
conversion added by the previous patch.

Apply timens_ktime_to_host() to the parsed time on the
IORING_ENTER_ABS_TIMER branch, mirroring the IORING_TIMEOUT_ABS fix in
io_parse_user_time(). Use ctx->clockid as the clock id; it is set
either at ring creation or via IORING_REGISTER_CLOCK.

timens_ktime_to_host() is a no-op for clocks not affected by time
namespaces and for callers in the initial time namespace, so the fast
path is unchanged.

Reproducer: in unshare --user --time, with a -10s monotonic offset,
call io_uring_enter() with min_complete=1, IORING_ENTER_ABS_TIMER, and
ts = now + 1s. The call returns -ETIME after <1ms instead of after the
expected ~1s.

Suggested-by: Pavel Begunkov <asml.silence@gmail.com>
Suggested-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Maoyi Xie <maoyi.xie@ntu.edu.sg>
Link: https://patch.msgid.link/20260504153755.1293932-3-maoyi.xie@ntu.edu.sg
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/wait.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/io_uring/wait.c b/io_uring/wait.c
index 91df86ce0d18..ec01e78a216d 100644
--- a/io_uring/wait.c
+++ b/io_uring/wait.c
@@ -5,6 +5,7 @@
 #include
 #include
 #include
+#include <linux/time_namespace.h>
 
 #include
 
@@ -229,7 +230,10 @@ int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
 
 	if (ext_arg->ts_set) {
 		iowq.timeout = timespec64_to_ktime(ext_arg->ts);
-		if (!(flags & IORING_ENTER_ABS_TIMER))
+		if (flags & IORING_ENTER_ABS_TIMER)
+			iowq.timeout = timens_ktime_to_host(ctx->clockid,
+							    iowq.timeout);
+		else
 			iowq.timeout = ktime_add(iowq.timeout, start_time);
 	}
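
The matching reproducer for the wait path, again as a sketch: ring
setup uses liburing, but io_uring_enter() is issued raw so the
IORING_ENTER_EXT_ARG | IORING_ENTER_ABS_TIMER flags and the struct
io_uring_getevents_arg layout are explicit (both require a kernel that
has IORING_ENTER_ABS_TIMER). Run under the same unshare invocation as
above; unfixed kernels fail with ETIME almost immediately, fixed ones
after about one second.

    #include <liburing.h>
    #include <errno.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include <time.h>
    #include <unistd.h>
    #include <sys/syscall.h>

    int main(void)
    {
        struct io_uring ring;
        struct io_uring_getevents_arg arg;
        struct __kernel_timespec ts;
        struct timespec now;
        int ret;

        if (io_uring_queue_init(8, &ring, 0))
            return 1;

        /* absolute deadline: namespace-view CLOCK_MONOTONIC now + 1s;
         * the default ring clock is CLOCK_MONOTONIC */
        clock_gettime(CLOCK_MONOTONIC, &now);
        ts.tv_sec = now.tv_sec + 1;
        ts.tv_nsec = now.tv_nsec;

        memset(&arg, 0, sizeof(arg));
        arg.ts = (unsigned long long)(uintptr_t)&ts;

        /* wait for one CQE that never arrives: ~1s, not instant -ETIME */
        ret = syscall(__NR_io_uring_enter, ring.ring_fd, 0, 1,
                      IORING_ENTER_GETEVENTS | IORING_ENTER_EXT_ARG |
                      IORING_ENTER_ABS_TIMER, &arg, sizeof(arg));
        printf("ret=%d errno=%d (%s)\n", ret, errno, strerror(errno));
        io_uring_queue_exit(&ring);
        return 0;
    }
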