From 6b47d35d4d9e6a3fc7b456bb7f84aea807df5b11 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 19 Feb 2025 10:22:24 -0700 Subject: [PATCH 1/5] eventpoll: abstract out parameter sanity checking Add a helper that checks the validity of the file descriptor and other parameters passed in to epoll_wait(). Signed-off-by: Jens Axboe Link: https://lore.kernel.org/r/20250219172552.1565603-2-axboe@kernel.dk Signed-off-by: Christian Brauner --- fs/eventpoll.c | 39 +++++++++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 7c0980db77b3..565bf451df82 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -2445,6 +2445,27 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, return do_epoll_ctl(epfd, op, fd, &epds, false); } +static int ep_check_params(struct file *file, struct epoll_event __user *evs, + int maxevents) +{ + /* The maximum number of event must be greater than zero */ + if (maxevents <= 0 || maxevents > EP_MAX_EVENTS) + return -EINVAL; + + /* Verify that the area passed by the user is writeable */ + if (!access_ok(evs, maxevents * sizeof(struct epoll_event))) + return -EFAULT; + + /* + * We have to check that the file structure underneath the fd + * the user passed to us _is_ an eventpoll file. + */ + if (!is_file_epoll(file)) + return -EINVAL; + + return 0; +} + /* * Implement the event wait interface for the eventpoll file. It is the kernel * part of the user space epoll_wait(2). @@ -2453,26 +2474,16 @@ static int do_epoll_wait(int epfd, struct epoll_event __user *events, int maxevents, struct timespec64 *to) { struct eventpoll *ep; - - /* The maximum number of event must be greater than zero */ - if (maxevents <= 0 || maxevents > EP_MAX_EVENTS) - return -EINVAL; - - /* Verify that the area passed by the user is writeable */ - if (!access_ok(events, maxevents * sizeof(struct epoll_event))) - return -EFAULT; + int ret; /* Get the "struct file *" for the eventpoll file */ CLASS(fd, f)(epfd); if (fd_empty(f)) return -EBADF; - /* - * We have to check that the file structure underneath the fd - * the user passed to us _is_ an eventpoll file. - */ - if (!is_file_epoll(fd_file(f))) - return -EINVAL; + ret = ep_check_params(fd_file(f), events, maxevents); + if (unlikely(ret)) + return ret; /* * At this point it is safe to assume that the "private_data" contains From 38d203560118a673018df5892a6555bb0aba7762 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 19 Feb 2025 10:22:25 -0700 Subject: [PATCH 2/5] eventpoll: abstract out ep_try_send_events() helper In preparation for reusing this helper in another epoll setup helper, abstract it out. Signed-off-by: Jens Axboe Link: https://lore.kernel.org/r/20250219172552.1565603-3-axboe@kernel.dk Signed-off-by: Christian Brauner --- fs/eventpoll.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 565bf451df82..14466765b85d 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1980,6 +1980,22 @@ static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry, return ret; } +static int ep_try_send_events(struct eventpoll *ep, + struct epoll_event __user *events, int maxevents) +{ + int res; + + /* + * Try to transfer events to user space. In case we get 0 events and + * there's still timeout left over, we go trying again in search of + * more luck. + */ + res = ep_send_events(ep, events, maxevents); + if (res > 0) + ep_suspend_napi_irqs(ep); + return res; +} + /** * ep_poll - Retrieves ready events, and delivers them to the caller-supplied * event buffer. @@ -2031,17 +2047,9 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, while (1) { if (eavail) { - /* - * Try to transfer events to user space. In case we get - * 0 events and there's still timeout left over, we go - * trying again in search of more luck. - */ - res = ep_send_events(ep, events, maxevents); - if (res) { - if (res > 0) - ep_suspend_napi_irqs(ep); + res = ep_try_send_events(ep, events, maxevents); + if (res) return res; - } } if (timed_out) From ae3a4f1fdc2cfad089e79e2ee4697f84941528d3 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 19 Feb 2025 10:22:26 -0700 Subject: [PATCH 3/5] eventpoll: add epoll_sendevents() helper Basic helper that copies ready events to the specified userspace address. The event checking is quick and racy, it's up to the caller to ensure it retries appropriately in case 0 events are copied. Signed-off-by: Jens Axboe Link: https://lore.kernel.org/r/20250219172552.1565603-4-axboe@kernel.dk Signed-off-by: Christian Brauner --- fs/eventpoll.c | 20 ++++++++++++++++++++ include/linux/eventpoll.h | 4 ++++ 2 files changed, 24 insertions(+) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 14466765b85d..94b87aaad0f6 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -2474,6 +2474,26 @@ static int ep_check_params(struct file *file, struct epoll_event __user *evs, return 0; } +int epoll_sendevents(struct file *file, struct epoll_event __user *events, + int maxevents) +{ + struct eventpoll *ep; + int ret; + + ret = ep_check_params(file, events, maxevents); + if (unlikely(ret)) + return ret; + + ep = file->private_data; + /* + * Racy call, but that's ok - it should get retried based on + * poll readiness anyway. + */ + if (ep_events_available(ep)) + return ep_try_send_events(ep, events, maxevents); + return 0; +} + /* * Implement the event wait interface for the eventpoll file. It is the kernel * part of the user space epoll_wait(2). diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h index 0c0d00fcd131..ccb478eb174b 100644 --- a/include/linux/eventpoll.h +++ b/include/linux/eventpoll.h @@ -25,6 +25,10 @@ struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, unsigned long t /* Used to release the epoll bits inside the "struct file" */ void eventpoll_release_file(struct file *file); +/* Copy ready events to userspace */ +int epoll_sendevents(struct file *file, struct epoll_event __user *events, + int maxevents); + /* * This is called from inside fs/file_table.c:__fput() to unlink files * from the eventpoll interface. We need to have this facility to cleanup From 0fb3f5600c5e583a1cf9dbe803ba7ef89f7fbc65 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 31 Jan 2025 14:19:11 -0700 Subject: [PATCH 4/5] io_uring/epoll: remove CONFIG_EPOLL guards Just have the Makefile add the object if epoll is enabled, then it's not necessary to guard the entire epoll.c file inside an CONFIG_EPOLL ifdef. Signed-off-by: Jens Axboe --- io_uring/Makefile | 9 +++++---- io_uring/epoll.c | 2 -- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/io_uring/Makefile b/io_uring/Makefile index 98e48339d84d..3e28a741ca15 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -11,10 +11,11 @@ obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \ eventfd.o uring_cmd.o openclose.o \ sqpoll.o xattr.o nop.o fs.o splice.o \ sync.o msg_ring.o advise.o openclose.o \ - epoll.o statx.o timeout.o fdinfo.o \ - cancel.o waitid.o register.o \ - truncate.o memmap.o alloc_cache.o + statx.o timeout.o fdinfo.o cancel.o \ + waitid.o register.o truncate.o \ + memmap.o alloc_cache.o obj-$(CONFIG_IO_URING_ZCRX) += zcrx.o obj-$(CONFIG_IO_WQ) += io-wq.o obj-$(CONFIG_FUTEX) += futex.o -obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o +obj-$(CONFIG_EPOLL) += epoll.o +obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o diff --git a/io_uring/epoll.c b/io_uring/epoll.c index 89bff2068a19..7848d9cc073d 100644 --- a/io_uring/epoll.c +++ b/io_uring/epoll.c @@ -12,7 +12,6 @@ #include "io_uring.h" #include "epoll.h" -#if defined(CONFIG_EPOLL) struct io_epoll { struct file *file; int epfd; @@ -58,4 +57,3 @@ int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags) io_req_set_res(req, ret, 0); return IOU_OK; } -#endif From 19f7e942732766aec8a51d217ab5fb4a7fe3bb0d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 31 Jan 2025 14:29:09 -0700 Subject: [PATCH 5/5] io_uring/epoll: add support for IORING_OP_EPOLL_WAIT For existing epoll event loops that can't fully convert to io_uring, the used approach is usually to add the io_uring fd to the epoll instance and use epoll_wait() to wait on both "legacy" and io_uring events. While this work, it isn't optimal as: 1) epoll_wait() is pretty limited in what it can do. It does not support partial reaping of events, or waiting on a batch of events. 2) When an io_uring ring is added to an epoll instance, it activates the io_uring "I'm being polled" logic which slows things down. Rather than use this approach, with EPOLL_WAIT support added to io_uring, event loops can use the normal io_uring wait logic for everything, as long as an epoll wait request has been armed with io_uring. Note that IORING_OP_EPOLL_WAIT does NOT take a timeout value, as this is an async request. Waiting on io_uring events in general has various timeout parameters, and those are the ones that should be used when waiting on any kind of request. If events are immediately available for reaping, then This opcode will return those immediately. If none are available, then it will post an async completion when they become available. cqe->res will contain either an error code (< 0 value) for a malformed request, invalid epoll instance, etc. It will return a positive result indicating how many events were reaped. IORING_OP_EPOLL_WAIT requests may be canceled using the normal io_uring cancelation infrastructure. Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 1 + io_uring/epoll.c | 33 +++++++++++++++++++++++++++++++++ io_uring/epoll.h | 2 ++ io_uring/opdef.c | 14 ++++++++++++++ 4 files changed, 50 insertions(+) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 05d6255b0f6a..135eb9296296 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -280,6 +280,7 @@ enum io_uring_op { IORING_OP_BIND, IORING_OP_LISTEN, IORING_OP_RECV_ZC, + IORING_OP_EPOLL_WAIT, /* this goes last, obviously */ IORING_OP_LAST, diff --git a/io_uring/epoll.c b/io_uring/epoll.c index 7848d9cc073d..6d2c48ba1923 100644 --- a/io_uring/epoll.c +++ b/io_uring/epoll.c @@ -20,6 +20,12 @@ struct io_epoll { struct epoll_event event; }; +struct io_epoll_wait { + struct file *file; + int maxevents; + struct epoll_event __user *events; +}; + int io_epoll_ctl_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_epoll *epoll = io_kiocb_to_cmd(req, struct io_epoll); @@ -57,3 +63,30 @@ int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags) io_req_set_res(req, ret, 0); return IOU_OK; } + +int io_epoll_wait_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_epoll_wait *iew = io_kiocb_to_cmd(req, struct io_epoll_wait); + + if (sqe->off || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) + return -EINVAL; + + iew->maxevents = READ_ONCE(sqe->len); + iew->events = u64_to_user_ptr(READ_ONCE(sqe->addr)); + return 0; +} + +int io_epoll_wait(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_epoll_wait *iew = io_kiocb_to_cmd(req, struct io_epoll_wait); + int ret; + + ret = epoll_sendevents(req->file, iew->events, iew->maxevents); + if (ret == 0) + return -EAGAIN; + if (ret < 0) + req_set_fail(req); + + io_req_set_res(req, ret, 0); + return IOU_OK; +} diff --git a/io_uring/epoll.h b/io_uring/epoll.h index 870cce11ba98..4111997c360b 100644 --- a/io_uring/epoll.h +++ b/io_uring/epoll.h @@ -3,4 +3,6 @@ #if defined(CONFIG_EPOLL) int io_epoll_ctl_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags); +int io_epoll_wait_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_epoll_wait(struct io_kiocb *req, unsigned int issue_flags); #endif diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 89f50ecadeaf..9344534780a0 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -527,6 +527,17 @@ const struct io_issue_def io_issue_defs[] = { .issue = io_recvzc, #else .prep = io_eopnotsupp_prep, +#endif + }, + [IORING_OP_EPOLL_WAIT] = { + .needs_file = 1, + .audit_skip = 1, + .pollin = 1, +#if defined(CONFIG_EPOLL) + .prep = io_epoll_wait_prep, + .issue = io_epoll_wait, +#else + .prep = io_eopnotsupp_prep, #endif }, }; @@ -761,6 +772,9 @@ const struct io_cold_def io_cold_defs[] = { [IORING_OP_RECV_ZC] = { .name = "RECV_ZC", }, + [IORING_OP_EPOLL_WAIT] = { + .name = "EPOLL_WAIT", + }, }; const char *io_uring_get_opcode(u8 opcode)