From b8c2e9e27636b92dc96c12f16894cbc60c58a306 Mon Sep 17 00:00:00 2001
From: Yufan Chen <yufan.chen@linux.dev>
Date: Mon, 4 May 2026 01:56:10 +0800
Subject: [PATCH 1/4] io_uring/napi: clear tracked NAPI entries on unregister

IORING_UNREGISTER_NAPI disables NAPI busy polling, but it currently
leaves any previously tracked NAPI IDs on the ring context. The normal
wait path only checks whether the list is empty before entering the
busy poll helper, so an unregistered ring can still observe stale
entries and run an unexpected busy poll pass.

Make unregister switch the context to inactive and free the tracked
entries. Do the same inactive transition while changing the tracking
strategy, and recheck the expected tracking mode under napi_lock before
inserting a newly learned NAPI ID. This prevents a racing poll path
from repopulating the list after unregister or reconfiguration. Also
make the busy poll dispatcher ignore inactive mode explicitly.

Fixes: 6bf90bd8c58a ("io_uring/napi: add static napi tracking strategy")
Signed-off-by: Yufan Chen <yufan.chen@linux.dev>
Link: https://patch.msgid.link/20260503175610.35521-1-yufan.chen@linux.dev
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/napi.c | 27 ++++++++++++++++++++-------
 io_uring/napi.h |  8 +++++---
 2 files changed, 25 insertions(+), 10 deletions(-)

diff --git a/io_uring/napi.c b/io_uring/napi.c
index 8d68366a4b90..bfc771445912 100644
--- a/io_uring/napi.c
+++ b/io_uring/napi.c
@@ -38,7 +38,8 @@ static inline ktime_t net_to_ktime(unsigned long t)
 	return ns_to_ktime(t << 10);
 }
 
-int __io_napi_add_id(struct io_ring_ctx *ctx, unsigned int napi_id)
+int __io_napi_add_id(struct io_ring_ctx *ctx, unsigned int napi_id,
+		     unsigned int mode)
 {
 	struct hlist_head *hash_list;
 	struct io_napi_entry *e;
@@ -69,6 +70,11 @@ int __io_napi_add_id(struct io_ring_ctx *ctx, unsigned int napi_id)
 	 * kfree()
 	 */
 	spin_lock(&ctx->napi_lock);
+	if (unlikely(READ_ONCE(ctx->napi_track_mode) != mode)) {
+		spin_unlock(&ctx->napi_lock);
+		kfree(e);
+		return -EINVAL;
+	}
 	if (unlikely(io_napi_hash_find(hash_list, napi_id))) {
 		spin_unlock(&ctx->napi_lock);
 		kfree(e);
@@ -196,9 +202,14 @@ __io_napi_do_busy_loop(struct io_ring_ctx *ctx,
 			bool (*loop_end)(void *, unsigned long),
 			void *loop_end_arg)
 {
-	if (READ_ONCE(ctx->napi_track_mode) == IO_URING_NAPI_TRACKING_STATIC)
+	switch (READ_ONCE(ctx->napi_track_mode)) {
+	case IO_URING_NAPI_TRACKING_STATIC:
 		return static_tracking_do_busy_loop(ctx, loop_end, loop_end_arg);
-	return dynamic_tracking_do_busy_loop(ctx, loop_end, loop_end_arg);
+	case IO_URING_NAPI_TRACKING_DYNAMIC:
+		return dynamic_tracking_do_busy_loop(ctx, loop_end, loop_end_arg);
+	default:
+		return false;
+	}
 }
 
 static void io_napi_blocking_busy_loop(struct io_ring_ctx *ctx,
@@ -273,13 +284,13 @@ static int io_napi_register_napi(struct io_ring_ctx *ctx,
 	default:
 		return -EINVAL;
 	}
-	/* clean the napi list for new settings */
+	WRITE_ONCE(ctx->napi_track_mode, IO_URING_NAPI_TRACKING_INACTIVE);
 	io_napi_free(ctx);
-	WRITE_ONCE(ctx->napi_track_mode, napi->op_param);
 	/* cap NAPI at 10 msec of spin time */
 	napi->busy_poll_to = min(10000, napi->busy_poll_to);
 	WRITE_ONCE(ctx->napi_busy_poll_dt, napi->busy_poll_to * NSEC_PER_USEC);
 	WRITE_ONCE(ctx->napi_prefer_busy_poll, !!napi->prefer_busy_poll);
+	WRITE_ONCE(ctx->napi_track_mode, napi->op_param);
 
 	return 0;
 }
@@ -315,7 +326,8 @@ int io_register_napi(struct io_ring_ctx *ctx, void __user *arg)
 	case IO_URING_NAPI_STATIC_ADD_ID:
 		if (curr.op_param != IO_URING_NAPI_TRACKING_STATIC)
 			return -EINVAL;
-		return __io_napi_add_id(ctx, napi.op_param);
+		return __io_napi_add_id(ctx, napi.op_param,
+					IO_URING_NAPI_TRACKING_STATIC);
 	case IO_URING_NAPI_STATIC_DEL_ID:
 		if (curr.op_param != IO_URING_NAPI_TRACKING_STATIC)
 			return -EINVAL;
@@ -343,9 +355,10 @@ int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg)
 	if (arg && copy_to_user(arg, &curr, sizeof(curr)))
 		return -EFAULT;
 
+	WRITE_ONCE(ctx->napi_track_mode, IO_URING_NAPI_TRACKING_INACTIVE);
 	WRITE_ONCE(ctx->napi_busy_poll_dt, 0);
 	WRITE_ONCE(ctx->napi_prefer_busy_poll, false);
-	WRITE_ONCE(ctx->napi_track_mode, IO_URING_NAPI_TRACKING_INACTIVE);
+	io_napi_free(ctx);
 
 	return 0;
 }
diff --git a/io_uring/napi.h b/io_uring/napi.h
index fa742f42e09b..e0aecccc5065 100644
--- a/io_uring/napi.h
+++ b/io_uring/napi.h
@@ -15,7 +15,8 @@ void io_napi_free(struct io_ring_ctx *ctx);
 int io_register_napi(struct io_ring_ctx *ctx, void __user *arg);
 int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg);
 
-int __io_napi_add_id(struct io_ring_ctx *ctx, unsigned int napi_id);
+int __io_napi_add_id(struct io_ring_ctx *ctx, unsigned int napi_id,
+		     unsigned int mode);
 
 void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq);
 int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx);
@@ -43,13 +44,14 @@ static inline void io_napi_add(struct io_kiocb *req)
 {
	struct io_ring_ctx *ctx = req->ctx;
 	struct socket *sock;
+	unsigned int mode = IO_URING_NAPI_TRACKING_DYNAMIC;
 
-	if (READ_ONCE(ctx->napi_track_mode) != IO_URING_NAPI_TRACKING_DYNAMIC)
+	if (READ_ONCE(ctx->napi_track_mode) != mode)
 		return;
 
 	sock = sock_from_file(req->file);
 	if (sock && sock->sk)
-		__io_napi_add_id(ctx, READ_ONCE(sock->sk->sk_napi_id));
+		__io_napi_add_id(ctx, READ_ONCE(sock->sk->sk_napi_id), mode);
 }
 
 #else
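
A user-space sketch of the ordering patch 1 establishes may help:
allocate outside the lock, then re-check the tracking mode under the
same lock the unregister path holds before publishing the entry. This
is an illustrative pthread model, not the kernel code; all names here
are made up:

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdlib.h>

    enum { TRACK_INACTIVE, TRACK_DYNAMIC, TRACK_STATIC };

    struct entry { int id; struct entry *next; };

    static _Atomic int track_mode = TRACK_DYNAMIC;
    static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct entry *entries;

    /* poll path: insert an id learned while 'mode' was observed */
    static int add_id(int id, int mode)
    {
        struct entry *e = malloc(sizeof(*e));

        if (!e)
            return -1;
        e->id = id;
        pthread_mutex_lock(&list_lock);
        /* mode may have changed since the unlocked check: bail out */
        if (atomic_load(&track_mode) != mode) {
            pthread_mutex_unlock(&list_lock);
            free(e);
            return -1;
        }
        e->next = entries;
        entries = e;
        pthread_mutex_unlock(&list_lock);
        return 0;
    }

    /* unregister path: go inactive first, then free under the lock */
    static void unregister_all(void)
    {
        atomic_store(&track_mode, TRACK_INACTIVE);
        pthread_mutex_lock(&list_lock);
        while (entries) {
            struct entry *e = entries;

            entries = e->next;
            free(e);
        }
        pthread_mutex_unlock(&list_lock);
    }

    int main(void)
    {
        add_id(1, TRACK_DYNAMIC);
        unregister_all();
        return add_id(2, TRACK_DYNAMIC) == -1 ? 0 : 1; /* must refuse */
    }

Because the mode flips to inactive before the list is emptied, an add
racing with unregister sees the stale mode under the lock and backs
out, so the freed list cannot be repopulated.
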
From 04fe9aeb4f3c0999e6715385664c677469dfd8f4 Mon Sep 17 00:00:00 2001
From: Yufan Chen <yufan.chen@linux.dev>
Date: Mon, 4 May 2026 01:57:10 +0800
Subject: [PATCH 2/4] io_uring/eventfd: reset deferred signal state

Recursive eventfd wakeups must defer io_uring eventfd signaling
because eventfd_signal_mask() rejects reentry from eventfd wakeup
handlers. The IO_EVENTFD_OP_SIGNAL_BIT in io_ev_fd->ops tracks an
outstanding deferred signal so that the same rcu_head is not queued
twice.

Today that bit is only ever set, never cleared. Once the first
deferred callback has run, later recursive notifications still see the
bit set and skip queueing another deferred signal. This can leave new
completions without a matching eventfd wake after the first recursive
deferral.

Clear the pending bit before issuing the deferred signal. If the
wakeup path recurses while the callback runs, a new signal can be
queued for the next RCU grace period, while the current callback keeps
its reference until it returns.

Fixes: 60b6c075e8eb ("io_uring/eventfd: move to more idiomatic RCU free usage")
Signed-off-by: Yufan Chen <yufan.chen@linux.dev>
Link: https://patch.msgid.link/20260503175710.37209-1-yufan.chen@linux.dev
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/eventfd.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/io_uring/eventfd.c b/io_uring/eventfd.c
index 3da028500f76..d656cc2a0b9b 100644
--- a/io_uring/eventfd.c
+++ b/io_uring/eventfd.c
@@ -43,6 +43,7 @@ static void io_eventfd_do_signal(struct rcu_head *rcu)
 {
 	struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);
 
+	atomic_andnot(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops);
 	eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE);
 	io_eventfd_put(ev_fd);
 }
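
The essence of patch 2 is a pending-bit handshake: the producer queues
the deferred callback only when it sets the bit from clear, and the
callback clears the bit before signaling so a recursive wakeup can
queue the next round. A minimal C11-atomics sketch with illustrative
names, not the kernel code:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    #define SIGNAL_PENDING 1u

    static _Atomic unsigned int ops;

    /* completion path: true means the caller must queue the callback */
    static bool want_deferred_signal(void)
    {
        unsigned int prev = atomic_fetch_or(&ops, SIGNAL_PENDING);

        return !(prev & SIGNAL_PENDING);
    }

    /* deferred callback (an RCU callback in the kernel) */
    static void do_deferred_signal(void)
    {
        /*
         * Clear before signaling: a wakeup that recurses out of the
         * signal below can then queue the next deferral instead of
         * being silently dropped.
         */
        atomic_fetch_and(&ops, ~SIGNAL_PENDING);
        puts("signal");
    }

    int main(void)
    {
        if (want_deferred_signal())
            do_deferred_signal();
        /* without the clear, this second round would be lost */
        if (want_deferred_signal())
            do_deferred_signal();
        return 0;
    }

In the kernel the "queue" step is call_rcu() on ev_fd->rcu and the
reference pinning is ev_fd's refcount; the sketch only models the bit
protocol.
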
From 9cc6bac1bebf8310d2950d1411a91479e86d69a1 Mon Sep 17 00:00:00 2001
From: Maoyi Xie <maoyi.xie@ntu.edu.sg>
Date: Mon, 4 May 2026 23:37:54 +0800
Subject: [PATCH 3/4] io_uring/timeout: honour caller's time namespace for
 IORING_TIMEOUT_ABS

io_uring's IORING_OP_TIMEOUT and IORING_OP_LINK_TIMEOUT accept a
timespec from the caller via io_parse_user_time(). With
IORING_TIMEOUT_ABS, the timestamp is an absolute deadline on the
selected clock: CLOCK_MONOTONIC by default, with CLOCK_BOOTTIME and
CLOCK_REALTIME also selectable.

A submitter inside a CLONE_NEWTIME time namespace observes
CLOCK_MONOTONIC and CLOCK_BOOTTIME shifted by the namespace's offsets
relative to the host. Every other ABS timer interface in the kernel
converts the caller's absolute time to the host view via
timens_ktime_to_host() before arming an hrtimer:

  kernel/time/posix-timers.c -- timer_settime(TIMER_ABSTIME)
  kernel/time/posix-stubs.c  -- clock_nanosleep(TIMER_ABSTIME)
  kernel/time/alarmtimer.c   -- alarm_timer_nsleep(TIMER_ABSTIME)
  fs/timerfd.c               -- timerfd_settime(TFD_TIMER_ABSTIME)

io_parse_user_time() does not. As a result, an absolute timeout
submitted from within a time namespace is interpreted in the host
view, which is generally a different point in time. It may already be
in the past, causing the timer to fire immediately, or far in the
future, causing the timer not to fire when expected.

Reproducer: in unshare --user --time, with a -10s monotonic offset,
submit IORING_OP_TIMEOUT with IORING_TIMEOUT_ABS and deadline =
now + 1s. The CQE is delivered after <1ms instead of the expected ~1s.

Apply timens_ktime_to_host() to the parsed time when
IORING_TIMEOUT_ABS is set. Split the existing clock id resolver out of
io_timeout_get_clock() into a flags-only helper, io_flags_to_clock(),
so io_parse_user_time() can resolve the clock without a struct
io_timeout_data.

timens_ktime_to_host() is a no-op for clocks not affected by time
namespaces, e.g. CLOCK_REALTIME, and for callers in the initial time
namespace, so the fast path is unchanged.

SQPOLL is also covered. The SQPOLL kernel thread is created via
create_io_thread() with CLONE_THREAD and no CLONE_NEW* flag, so
copy_namespaces() shares the submitter's nsproxy by reference. Inside
the SQPOLL kthread, current->nsproxy->time_ns is the submitter's
time_ns, and timens_ktime_to_host() resolves correctly.

Suggested-by: Pavel Begunkov <asml.silence@gmail.com>
Suggested-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Maoyi Xie <maoyi.xie@ntu.edu.sg>
Link: https://patch.msgid.link/20260504153755.1293932-2-maoyi.xie@ntu.edu.sg
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/timeout.c | 35 ++++++++++++++++++++++-------------
 1 file changed, 22 insertions(+), 13 deletions(-)

diff --git a/io_uring/timeout.c b/io_uring/timeout.c
index 4cfdfc519770..e2595cae2b07 100644
--- a/io_uring/timeout.c
+++ b/io_uring/timeout.c
@@ -3,6 +3,7 @@
 #include <linux/errno.h>
 #include <linux/file.h>
 #include <linux/io_uring.h>
+#include <linux/time_namespace.h>
 
 #include <trace/events/io_uring.h>
 
@@ -35,6 +36,22 @@ struct io_timeout_rem {
 	bool				ltimeout;
 };
 
+static clockid_t io_flags_to_clock(unsigned flags)
+{
+	switch (flags & IORING_TIMEOUT_CLOCK_MASK) {
+	case IORING_TIMEOUT_BOOTTIME:
+		return CLOCK_BOOTTIME;
+	case IORING_TIMEOUT_REALTIME:
+		return CLOCK_REALTIME;
+	default:
+		/* can't happen, vetted at prep time */
+		WARN_ON_ONCE(1);
+		fallthrough;
+	case 0:
+		return CLOCK_MONOTONIC;
+	}
+}
+
 static int io_parse_user_time(ktime_t *time, u64 arg, unsigned flags)
 {
 	struct timespec64 ts;
@@ -43,7 +60,7 @@ static int io_parse_user_time(ktime_t *time, u64 arg, unsigned flags)
 		*time = ns_to_ktime(arg);
 		if (*time < 0)
 			return -EINVAL;
-		return 0;
+		goto out;
 	}
 
 	if (get_timespec64(&ts, u64_to_user_ptr(arg)))
@@ -51,6 +68,9 @@ static int io_parse_user_time(ktime_t *time, u64 arg, unsigned flags)
 	if (ts.tv_sec < 0 || ts.tv_nsec < 0)
 		return -EINVAL;
 	*time = timespec64_to_ktime(ts);
+out:
+	if (flags & IORING_TIMEOUT_ABS)
+		*time = timens_ktime_to_host(io_flags_to_clock(flags), *time);
 	return 0;
 }
 
@@ -399,18 +419,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
 
 static clockid_t io_timeout_get_clock(struct io_timeout_data *data)
 {
-	switch (data->flags & IORING_TIMEOUT_CLOCK_MASK) {
-	case IORING_TIMEOUT_BOOTTIME:
-		return CLOCK_BOOTTIME;
-	case IORING_TIMEOUT_REALTIME:
-		return CLOCK_REALTIME;
-	default:
-		/* can't happen, vetted at prep time */
-		WARN_ON_ONCE(1);
-		fallthrough;
-	case 0:
-		return CLOCK_MONOTONIC;
-	}
+	return io_flags_to_clock(data->flags);
 }
 
 static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
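
The reproducer described above can be written with liburing roughly as
follows. This is a sketch, assuming a util-linux unshare(1) with time
namespace support, run as e.g.:
unshare --user --map-root-user --time --monotonic -10 ./abs_timeout.
On an unfixed kernel the printed delay is ~0 ms; with the patch it is
~1000 ms.

    #include <liburing.h>
    #include <stdio.h>
    #include <time.h>

    static long long ms_since(const struct timespec *t0)
    {
        struct timespec t1;

        clock_gettime(CLOCK_MONOTONIC, &t1);
        return (t1.tv_sec - t0->tv_sec) * 1000LL +
               (t1.tv_nsec - t0->tv_nsec) / 1000000LL;
    }

    int main(void)
    {
        struct io_uring ring;
        struct io_uring_sqe *sqe;
        struct io_uring_cqe *cqe;
        struct __kernel_timespec ts;
        struct timespec now;

        if (io_uring_queue_init(8, &ring, 0))
            return 1;

        /* absolute deadline: namespace-view CLOCK_MONOTONIC now + 1s */
        clock_gettime(CLOCK_MONOTONIC, &now);
        ts.tv_sec = now.tv_sec + 1;
        ts.tv_nsec = now.tv_nsec;

        sqe = io_uring_get_sqe(&ring);
        io_uring_prep_timeout(sqe, &ts, 0, IORING_TIMEOUT_ABS);
        io_uring_submit(&ring);

        io_uring_wait_cqe(&ring, &cqe);
        /* expect res == -ETIME; the delay is the interesting part */
        printf("res=%d after %lldms\n", cqe->res, ms_since(&now));
        io_uring_cqe_seen(&ring, cqe);
        io_uring_queue_exit(&ring);
        return 0;
    }
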
From 45d2b37a37ab98484693533496395c610a2cab96 Mon Sep 17 00:00:00 2001
From: Maoyi Xie <maoyi.xie@ntu.edu.sg>
Date: Mon, 4 May 2026 23:37:55 +0800
Subject: [PATCH 4/4] io_uring/wait: honour caller's time namespace for
 IORING_ENTER_ABS_TIMER

io_uring_enter() with IORING_ENTER_ABS_TIMER takes an absolute
timespec from the caller via ext_arg->ts and arms an ABS-mode hrtimer
in __io_cqring_wait_schedule(). The conversion path in io_uring/wait.c
parses ext_arg->ts inline rather than going through
io_parse_user_time(), so it does not pick up the time namespace
conversion added by the previous patch.

Apply timens_ktime_to_host() to the parsed time on the
IORING_ENTER_ABS_TIMER branch, mirroring the IORING_TIMEOUT_ABS fix in
io_parse_user_time(). Use ctx->clockid as the clock id; it is set
either at ring creation or via IORING_REGISTER_CLOCK.

timens_ktime_to_host() is a no-op for clocks not affected by time
namespaces and for callers in the initial time namespace, so the fast
path is unchanged.

Reproducer: in unshare --user --time, with a -10s monotonic offset,
call io_uring_enter() with min_complete=1, IORING_ENTER_ABS_TIMER, and
ts = now + 1s. The call returns -ETIME after <1ms instead of after the
expected ~1s.

Suggested-by: Pavel Begunkov <asml.silence@gmail.com>
Suggested-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Maoyi Xie <maoyi.xie@ntu.edu.sg>
Link: https://patch.msgid.link/20260504153755.1293932-3-maoyi.xie@ntu.edu.sg
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/wait.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/io_uring/wait.c b/io_uring/wait.c
index 91df86ce0d18..ec01e78a216d 100644
--- a/io_uring/wait.c
+++ b/io_uring/wait.c
@@ -5,6 +5,7 @@
 #include
 #include
 #include
+#include <linux/time_namespace.h>
 
 #include
 
@@ -229,7 +230,10 @@ int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
 
 	if (ext_arg->ts_set) {
 		iowq.timeout = timespec64_to_ktime(ext_arg->ts);
-		if (!(flags & IORING_ENTER_ABS_TIMER))
+		if (flags & IORING_ENTER_ABS_TIMER)
+			iowq.timeout = timens_ktime_to_host(ctx->clockid,
+							    iowq.timeout);
+		else
 			iowq.timeout = ktime_add(iowq.timeout, start_time);
 	}
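
The matching reproducer for the wait path, again as a sketch: ring
setup uses liburing, but io_uring_enter() is issued raw so the
IORING_ENTER_EXT_ARG | IORING_ENTER_ABS_TIMER flags and the struct
io_uring_getevents_arg layout are explicit (both require a kernel that
has IORING_ENTER_ABS_TIMER). Run under the same unshare invocation as
above; unfixed kernels fail with ETIME almost immediately, fixed ones
after about one second.

    #include <liburing.h>
    #include <errno.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include <time.h>
    #include <unistd.h>
    #include <sys/syscall.h>

    int main(void)
    {
        struct io_uring ring;
        struct io_uring_getevents_arg arg;
        struct __kernel_timespec ts;
        struct timespec now;
        int ret;

        if (io_uring_queue_init(8, &ring, 0))
            return 1;

        /* absolute deadline: namespace-view CLOCK_MONOTONIC now + 1s;
         * the default ring clock is CLOCK_MONOTONIC */
        clock_gettime(CLOCK_MONOTONIC, &now);
        ts.tv_sec = now.tv_sec + 1;
        ts.tv_nsec = now.tv_nsec;

        memset(&arg, 0, sizeof(arg));
        arg.ts = (unsigned long long)(uintptr_t)&ts;

        /* wait for one CQE that never arrives: ~1s, not instant -ETIME */
        ret = syscall(__NR_io_uring_enter, ring.ring_fd, 0, 1,
                      IORING_ENTER_GETEVENTS | IORING_ENTER_EXT_ARG |
                      IORING_ENTER_ABS_TIMER, &arg, sizeof(arg));
        printf("ret=%d errno=%d (%s)\n", ret, errno, strerror(errno));
        io_uring_queue_exit(&ring);
        return 0;
    }
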