diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst index 893a8ba0e9fe..95fec5968362 100644 --- a/Documentation/vm/hmm.rst +++ b/Documentation/vm/hmm.rst @@ -149,14 +149,14 @@ CPU page table into a device page table; HMM helps keep both synchronized. A device driver that wants to mirror a process address space must start with the registration of a mmu_interval_notifier:: - mni->ops = &driver_ops; - int mmu_interval_notifier_insert(struct mmu_interval_notifier *mni, - unsigned long start, unsigned long length, - struct mm_struct *mm); + int mmu_interval_notifier_insert(struct mmu_interval_notifier *interval_sub, + struct mm_struct *mm, unsigned long start, + unsigned long length, + const struct mmu_interval_notifier_ops *ops); -During the driver_ops->invalidate() callback the device driver must perform -the update action to the range (mark range read only, or fully unmap, -etc.). The device must complete the update before the driver callback returns. +During the ops->invalidate() callback the device driver must perform the +update action to the range (mark range read only, or fully unmap, etc.). The +device must complete the update before the driver callback returns. When the device driver wants to populate a range of virtual addresses, it can use:: @@ -183,7 +183,7 @@ The usage pattern is:: struct hmm_range range; ... - range.notifier = &mni; + range.notifier = &interval_sub; range.start = ...; range.end = ...; range.pfns = ...; @@ -191,11 +191,11 @@ The usage pattern is:: range.values = ...; range.pfn_shift = ...; - if (!mmget_not_zero(mni->notifier.mm)) + if (!mmget_not_zero(interval_sub->notifier.mm)) return -EFAULT; again: - range.notifier_seq = mmu_interval_read_begin(&mni); + range.notifier_seq = mmu_interval_read_begin(&interval_sub); down_read(&mm->mmap_sem); ret = hmm_range_fault(&range, HMM_RANGE_SNAPSHOT); if (ret) { diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl index 4d7f2ffa957c..36d42da7466a 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -476,3 +476,4 @@ 544 common pidfd_open sys_pidfd_open # 545 reserved for clone3 547 common openat2 sys_openat2 +548 common pidfd_getfd sys_pidfd_getfd diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index 4ba54bc7e19a..4d1cf74a2caa 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -450,3 +450,4 @@ 434 common pidfd_open sys_pidfd_open 435 common clone3 sys_clone3 437 common openat2 sys_openat2 +438 common pidfd_getfd sys_pidfd_getfd diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index 0f255a23733d..1dd22da1c3a9 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -38,7 +38,7 @@ #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5) #define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800) -#define __NR_compat_syscalls 438 +#define __NR_compat_syscalls 439 #endif #define __ARCH_WANT_SYS_CLONE diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index 57f6f592d460..c1c61635f89c 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -881,6 +881,8 @@ __SYSCALL(__NR_pidfd_open, sys_pidfd_open) __SYSCALL(__NR_clone3, sys_clone3) #define __NR_openat2 437 __SYSCALL(__NR_openat2, sys_openat2) +#define __NR_pidfd_getfd 438 +__SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd) /* * Please add new compat syscalls above this comment and update diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl index 8d36f2e2dc89..042911e670b8 100644 --- a/arch/ia64/kernel/syscalls/syscall.tbl +++ b/arch/ia64/kernel/syscalls/syscall.tbl @@ -357,3 +357,4 @@ 434 common pidfd_open sys_pidfd_open # 435 reserved for clone3 437 common openat2 sys_openat2 +438 common pidfd_getfd sys_pidfd_getfd diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl index b911e0f50a71..f4f49fcb76d0 100644 --- a/arch/m68k/kernel/syscalls/syscall.tbl +++ b/arch/m68k/kernel/syscalls/syscall.tbl @@ -436,3 +436,4 @@ 434 common pidfd_open sys_pidfd_open 435 common clone3 __sys_clone3 437 common openat2 sys_openat2 +438 common pidfd_getfd sys_pidfd_getfd diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl index c04385e60833..4c67b11f9c9e 100644 --- a/arch/microblaze/kernel/syscalls/syscall.tbl +++ b/arch/microblaze/kernel/syscalls/syscall.tbl @@ -442,3 +442,4 @@ 434 common pidfd_open sys_pidfd_open 435 common clone3 sys_clone3 437 common openat2 sys_openat2 +438 common pidfd_getfd sys_pidfd_getfd diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index 68c9ec06851f..1f9e8ad636cc 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -375,3 +375,4 @@ 434 n32 pidfd_open sys_pidfd_open 435 n32 clone3 __sys_clone3 437 n32 openat2 sys_openat2 +438 n32 pidfd_getfd sys_pidfd_getfd diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl index 42a72d010050..c0b9d802dbf6 100644 --- a/arch/mips/kernel/syscalls/syscall_n64.tbl +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl @@ -351,3 +351,4 @@ 434 n64 pidfd_open sys_pidfd_open 435 n64 clone3 __sys_clone3 437 n64 openat2 sys_openat2 +438 n64 pidfd_getfd sys_pidfd_getfd diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index f114c4aed0ed..ac586774c980 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -424,3 +424,4 @@ 434 o32 pidfd_open sys_pidfd_open 435 o32 clone3 __sys_clone3 437 o32 openat2 sys_openat2 +438 o32 pidfd_getfd sys_pidfd_getfd diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index b550ae9a7fea..52a15f5cd130 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -434,3 +434,4 @@ 434 common pidfd_open sys_pidfd_open 435 common clone3 sys_clone3_wrapper 437 common openat2 sys_openat2 +438 common pidfd_getfd sys_pidfd_getfd diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index a8b5ecb5b602..35b61bfc1b1a 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -518,3 +518,4 @@ 434 common pidfd_open sys_pidfd_open 435 nospu clone3 ppc_clone3 437 common openat2 sys_openat2 +438 common pidfd_getfd sys_pidfd_getfd diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index 16b571c06161..bd7bd3581a0f 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -439,3 +439,4 @@ 434 common pidfd_open sys_pidfd_open sys_pidfd_open 435 common clone3 sys_clone3 sys_clone3 437 common openat2 sys_openat2 sys_openat2 +438 common pidfd_getfd sys_pidfd_getfd sys_pidfd_getfd diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl index a7185cc18626..c7a30fcd135f 100644 --- a/arch/sh/kernel/syscalls/syscall.tbl +++ b/arch/sh/kernel/syscalls/syscall.tbl @@ -439,3 +439,4 @@ 434 common pidfd_open sys_pidfd_open # 435 reserved for clone3 437 common openat2 sys_openat2 +438 common pidfd_getfd sys_pidfd_getfd diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index b11c19552022..f13615ecdecc 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -482,3 +482,4 @@ 434 common pidfd_open sys_pidfd_open # 435 reserved for clone3 437 common openat2 sys_openat2 +438 common pidfd_getfd sys_pidfd_getfd diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index d22a8b5c3fab..c17cb77eb150 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -441,3 +441,4 @@ 434 i386 pidfd_open sys_pidfd_open __ia32_sys_pidfd_open 435 i386 clone3 sys_clone3 __ia32_sys_clone3 437 i386 openat2 sys_openat2 __ia32_sys_openat2 +438 i386 pidfd_getfd sys_pidfd_getfd __ia32_sys_pidfd_getfd diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 9035647ef236..44d510bc9b78 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -358,6 +358,7 @@ 434 common pidfd_open __x64_sys_pidfd_open 435 common clone3 __x64_sys_clone3/ptregs 437 common openat2 __x64_sys_openat2 +438 common pidfd_getfd __x64_sys_pidfd_getfd # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl index f0a68013c038..85a9ab1bc04d 100644 --- a/arch/xtensa/kernel/syscalls/syscall.tbl +++ b/arch/xtensa/kernel/syscalls/syscall.tbl @@ -407,3 +407,4 @@ 434 common pidfd_open sys_pidfd_open 435 common clone3 sys_clone3 437 common openat2 sys_openat2 +438 common pidfd_getfd sys_pidfd_getfd diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 90c1f3ca1939..c5b717c6797f 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -2402,10 +2402,12 @@ static void binder_deferred_fd_close(int fd) return; init_task_work(&twcb->twork, binder_do_fd_close); __close_fd_get_file(fd, &twcb->file); - if (twcb->file) + if (twcb->file) { + filp_close(twcb->file, current->files); task_work_add(current, &twcb->twork, true); - else + } else { kfree(twcb); + } } static void binder_transaction_buffer_release(struct binder_proc *proc, diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 636c1cca9e1b..717b08e7bfd3 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -355,12 +355,6 @@ static inline struct epitem *ep_item_from_epqueue(poll_table *p) return container_of(p, struct ep_pqueue, pt)->epi; } -/* Tells if the epoll_ctl(2) operation needs an event copy from userspace */ -static inline int ep_op_has_event(int op) -{ - return op != EPOLL_CTL_DEL; -} - /* Initialize the poll safe wake up structure */ static void ep_nested_calls_init(struct nested_calls *ncalls) { @@ -2076,27 +2070,28 @@ SYSCALL_DEFINE1(epoll_create, int, size) return do_epoll_create(0); } -/* - * The following function implements the controller interface for - * the eventpoll file that enables the insertion/removal/change of - * file descriptors inside the interest set. - */ -SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, - struct epoll_event __user *, event) +static inline int epoll_mutex_lock(struct mutex *mutex, int depth, + bool nonblock) +{ + if (!nonblock) { + mutex_lock_nested(mutex, depth); + return 0; + } + if (mutex_trylock(mutex)) + return 0; + return -EAGAIN; +} + +int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, + bool nonblock) { int error; int full_check = 0; struct fd f, tf; struct eventpoll *ep; struct epitem *epi; - struct epoll_event epds; struct eventpoll *tep = NULL; - error = -EFAULT; - if (ep_op_has_event(op) && - copy_from_user(&epds, event, sizeof(struct epoll_event))) - goto error_return; - error = -EBADF; f = fdget(epfd); if (!f.file) @@ -2114,7 +2109,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, /* Check if EPOLLWAKEUP is allowed */ if (ep_op_has_event(op)) - ep_take_care_of_epollwakeup(&epds); + ep_take_care_of_epollwakeup(epds); /* * We have to check that the file structure underneath the file descriptor @@ -2130,11 +2125,11 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, * so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation. * Also, we do not currently supported nested exclusive wakeups. */ - if (ep_op_has_event(op) && (epds.events & EPOLLEXCLUSIVE)) { + if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) { if (op == EPOLL_CTL_MOD) goto error_tgt_fput; if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) || - (epds.events & ~EPOLLEXCLUSIVE_OK_BITS))) + (epds->events & ~EPOLLEXCLUSIVE_OK_BITS))) goto error_tgt_fput; } @@ -2159,13 +2154,17 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, * deep wakeup paths from forming in parallel through multiple * EPOLL_CTL_ADD operations. */ - mutex_lock_nested(&ep->mtx, 0); + error = epoll_mutex_lock(&ep->mtx, 0, nonblock); + if (error) + goto error_tgt_fput; if (op == EPOLL_CTL_ADD) { if (!list_empty(&f.file->f_ep_links) || is_file_epoll(tf.file)) { - full_check = 1; mutex_unlock(&ep->mtx); - mutex_lock(&epmutex); + error = epoll_mutex_lock(&epmutex, 0, nonblock); + if (error) + goto error_tgt_fput; + full_check = 1; if (is_file_epoll(tf.file)) { error = -ELOOP; if (ep_loop_check(ep, tf.file) != 0) { @@ -2175,10 +2174,19 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, } else list_add(&tf.file->f_tfile_llink, &tfile_check_list); - mutex_lock_nested(&ep->mtx, 0); + error = epoll_mutex_lock(&ep->mtx, 0, nonblock); + if (error) { +out_del: + list_del(&tf.file->f_tfile_llink); + goto error_tgt_fput; + } if (is_file_epoll(tf.file)) { tep = tf.file->private_data; - mutex_lock_nested(&tep->mtx, 1); + error = epoll_mutex_lock(&tep->mtx, 1, nonblock); + if (error) { + mutex_unlock(&ep->mtx); + goto out_del; + } } } } @@ -2194,8 +2202,8 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, switch (op) { case EPOLL_CTL_ADD: if (!epi) { - epds.events |= EPOLLERR | EPOLLHUP; - error = ep_insert(ep, &epds, tf.file, fd, full_check); + epds->events |= EPOLLERR | EPOLLHUP; + error = ep_insert(ep, epds, tf.file, fd, full_check); } else error = -EEXIST; if (full_check) @@ -2210,8 +2218,8 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, case EPOLL_CTL_MOD: if (epi) { if (!(epi->event.events & EPOLLEXCLUSIVE)) { - epds.events |= EPOLLERR | EPOLLHUP; - error = ep_modify(ep, epi, &epds); + epds->events |= EPOLLERR | EPOLLHUP; + error = ep_modify(ep, epi, epds); } } else error = -ENOENT; @@ -2233,6 +2241,23 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, return error; } +/* + * The following function implements the controller interface for + * the eventpoll file that enables the insertion/removal/change of + * file descriptors inside the interest set. + */ +SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, + struct epoll_event __user *, event) +{ + struct epoll_event epds; + + if (ep_op_has_event(op) && + copy_from_user(&epds, event, sizeof(struct epoll_event))) + return -EFAULT; + + return do_epoll_ctl(epfd, op, fd, &epds, false); +} + /* * Implement the event wait interface for the eventpoll file. It is the kernel * part of the user space epoll_wait(2). diff --git a/fs/file.c b/fs/file.c index 3da91a112bab..a364e1a9b7e8 100644 --- a/fs/file.c +++ b/fs/file.c @@ -642,7 +642,9 @@ int __close_fd(struct files_struct *files, unsigned fd) EXPORT_SYMBOL(__close_fd); /* for ksys_close() */ /* - * variant of __close_fd that gets a ref on the file for later fput + * variant of __close_fd that gets a ref on the file for later fput. + * The caller must ensure that filp_close() called on the file, and then + * an fput(). */ int __close_fd_get_file(unsigned int fd, struct file **res) { @@ -662,7 +664,7 @@ int __close_fd_get_file(unsigned int fd, struct file **res) spin_unlock(&files->file_lock); get_file(file); *res = file; - return filp_close(file, files); + return 0; out_unlock: spin_unlock(&files->file_lock); @@ -706,9 +708,9 @@ void do_close_on_exec(struct files_struct *files) spin_unlock(&files->file_lock); } -static struct file *__fget(unsigned int fd, fmode_t mask, unsigned int refs) +static struct file *__fget_files(struct files_struct *files, unsigned int fd, + fmode_t mask, unsigned int refs) { - struct files_struct *files = current->files; struct file *file; rcu_read_lock(); @@ -729,6 +731,12 @@ static struct file *__fget(unsigned int fd, fmode_t mask, unsigned int refs) return file; } +static inline struct file *__fget(unsigned int fd, fmode_t mask, + unsigned int refs) +{ + return __fget_files(current->files, fd, mask, refs); +} + struct file *fget_many(unsigned int fd, unsigned int refs) { return __fget(fd, FMODE_PATH, refs); @@ -746,6 +754,18 @@ struct file *fget_raw(unsigned int fd) } EXPORT_SYMBOL(fget_raw); +struct file *fget_task(struct task_struct *task, unsigned int fd) +{ + struct file *file = NULL; + + task_lock(task); + if (task->files) + file = __fget_files(task->files, fd, 0, 1); + task_unlock(task); + + return file; +} + /* * Lightweight file lookup - no refcnt increment if fd table isn't shared. * diff --git a/fs/internal.h b/fs/internal.h index 72b175a4f6c8..17d2d03ffd46 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -122,6 +122,8 @@ extern struct file *do_filp_open(int dfd, struct filename *pathname, const struct open_flags *op); extern struct file *do_file_open_root(struct dentry *, struct vfsmount *, const char *, const struct open_flags *); +extern struct open_how build_open_how(int flags, umode_t mode); +extern int build_open_flags(const struct open_how *how, struct open_flags *op); long do_sys_ftruncate(unsigned int fd, loff_t length, int small); long do_faccessat(int dfd, const char __user *filename, int mode); @@ -180,3 +182,9 @@ extern const struct dentry_operations ns_dentry_operations; /* direct-io.c: */ int sb_init_dio_done_wq(struct super_block *sb); + +/* + * fs/stat.c: + */ +unsigned vfs_stat_set_lookup_flags(unsigned *lookup_flags, int flags); +int cp_statx(const struct kstat *stat, struct statx __user *buffer); diff --git a/fs/io-wq.c b/fs/io-wq.c index 5147d2213b01..cb60a42b9fdf 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -56,7 +56,8 @@ struct io_worker { struct rcu_head rcu; struct mm_struct *mm; - const struct cred *creds; + const struct cred *cur_creds; + const struct cred *saved_creds; struct files_struct *restore_files; }; @@ -109,10 +110,10 @@ struct io_wq { struct task_struct *manager; struct user_struct *user; - const struct cred *creds; - struct mm_struct *mm; refcount_t refs; struct completion done; + + refcount_t use_refs; }; static bool io_worker_get(struct io_worker *worker) @@ -135,9 +136,9 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker) { bool dropped_lock = false; - if (worker->creds) { - revert_creds(worker->creds); - worker->creds = NULL; + if (worker->saved_creds) { + revert_creds(worker->saved_creds); + worker->cur_creds = worker->saved_creds = NULL; } if (current->files != worker->restore_files) { @@ -396,6 +397,43 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe, unsigned *hash) return NULL; } +static void io_wq_switch_mm(struct io_worker *worker, struct io_wq_work *work) +{ + if (worker->mm) { + unuse_mm(worker->mm); + mmput(worker->mm); + worker->mm = NULL; + } + if (!work->mm) { + set_fs(KERNEL_DS); + return; + } + if (mmget_not_zero(work->mm)) { + use_mm(work->mm); + if (!worker->mm) + set_fs(USER_DS); + worker->mm = work->mm; + /* hang on to this mm */ + work->mm = NULL; + return; + } + + /* failed grabbing mm, ensure work gets cancelled */ + work->flags |= IO_WQ_WORK_CANCEL; +} + +static void io_wq_switch_creds(struct io_worker *worker, + struct io_wq_work *work) +{ + const struct cred *old_creds = override_creds(work->creds); + + worker->cur_creds = work->creds; + if (worker->saved_creds) + put_cred(old_creds); /* creds set by previous switch */ + else + worker->saved_creds = old_creds; +} + static void io_worker_handle_work(struct io_worker *worker) __releases(wqe->lock) { @@ -438,24 +476,19 @@ static void io_worker_handle_work(struct io_worker *worker) if (work->flags & IO_WQ_WORK_CB) work->func(&work); - if ((work->flags & IO_WQ_WORK_NEEDS_FILES) && - current->files != work->files) { + if (work->files && current->files != work->files) { task_lock(current); current->files = work->files; task_unlock(current); } - if ((work->flags & IO_WQ_WORK_NEEDS_USER) && !worker->mm && - wq->mm) { - if (mmget_not_zero(wq->mm)) { - use_mm(wq->mm); - set_fs(USER_DS); - worker->mm = wq->mm; - } else { - work->flags |= IO_WQ_WORK_CANCEL; - } - } - if (!worker->creds) - worker->creds = override_creds(wq->creds); + if (work->mm != worker->mm) + io_wq_switch_mm(worker, work); + if (worker->cur_creds != work->creds) + io_wq_switch_creds(worker, work); + /* + * OK to set IO_WQ_WORK_CANCEL even for uncancellable work, + * the worker function will do the right thing. + */ if (test_bit(IO_WQ_BIT_CANCEL, &wq->state)) work->flags |= IO_WQ_WORK_CANCEL; if (worker->mm) @@ -720,6 +753,7 @@ static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct, static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) { struct io_wqe_acct *acct = io_work_get_acct(wqe, work); + int work_flags; unsigned long flags; /* @@ -734,12 +768,14 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) return; } + work_flags = work->flags; spin_lock_irqsave(&wqe->lock, flags); wq_list_add_tail(&work->list, &wqe->work_list); wqe->flags &= ~IO_WQE_FLAG_STALLED; spin_unlock_irqrestore(&wqe->lock, flags); - if (!atomic_read(&acct->nr_running)) + if ((work_flags & IO_WQ_WORK_CONCURRENT) || + !atomic_read(&acct->nr_running)) io_wqe_wake_worker(wqe, acct); } @@ -828,6 +864,7 @@ static bool io_work_cancel(struct io_worker *worker, void *cancel_data) */ spin_lock_irqsave(&worker->lock, flags); if (worker->cur_work && + !(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL) && data->cancel(worker->cur_work, data->caller_data)) { send_sig(SIGINT, worker->task, 1); ret = true; @@ -902,7 +939,8 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data) return false; spin_lock_irqsave(&worker->lock, flags); - if (worker->cur_work == work) { + if (worker->cur_work == work && + !(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL)) { send_sig(SIGINT, worker->task, 1); ret = true; } @@ -1026,7 +1064,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) /* caller must already hold a reference to this */ wq->user = data->user; - wq->creds = data->creds; for_each_node(node) { struct io_wqe *wqe; @@ -1053,9 +1090,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) init_completion(&wq->done); - /* caller must have already done mmgrab() on this mm */ - wq->mm = data->mm; - wq->manager = kthread_create(io_wq_manager, wq, "io_wq_manager"); if (!IS_ERR(wq->manager)) { wake_up_process(wq->manager); @@ -1064,6 +1098,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) ret = -ENOMEM; goto err; } + refcount_set(&wq->use_refs, 1); reinit_completion(&wq->done); return wq; } @@ -1078,13 +1113,21 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) return ERR_PTR(ret); } +bool io_wq_get(struct io_wq *wq, struct io_wq_data *data) +{ + if (data->get_work != wq->get_work || data->put_work != wq->put_work) + return false; + + return refcount_inc_not_zero(&wq->use_refs); +} + static bool io_wq_worker_wake(struct io_worker *worker, void *data) { wake_up_process(worker->task); return false; } -void io_wq_destroy(struct io_wq *wq) +static void __io_wq_destroy(struct io_wq *wq) { int node; @@ -1104,3 +1147,9 @@ void io_wq_destroy(struct io_wq *wq) kfree(wq->wqes); kfree(wq); } + +void io_wq_destroy(struct io_wq *wq) +{ + if (refcount_dec_and_test(&wq->use_refs)) + __io_wq_destroy(wq); +} diff --git a/fs/io-wq.h b/fs/io-wq.h index 3f5e356de980..50b3378febf2 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -7,11 +7,11 @@ enum { IO_WQ_WORK_CANCEL = 1, IO_WQ_WORK_HAS_MM = 2, IO_WQ_WORK_HASHED = 4, - IO_WQ_WORK_NEEDS_USER = 8, - IO_WQ_WORK_NEEDS_FILES = 16, IO_WQ_WORK_UNBOUND = 32, IO_WQ_WORK_INTERNAL = 64, IO_WQ_WORK_CB = 128, + IO_WQ_WORK_NO_CANCEL = 256, + IO_WQ_WORK_CONCURRENT = 512, IO_WQ_HASH_SHIFT = 24, /* upper 8 bits are used for hash key */ }; @@ -72,6 +72,8 @@ struct io_wq_work { }; void (*func)(struct io_wq_work **); struct files_struct *files; + struct mm_struct *mm; + const struct cred *creds; unsigned flags; }; @@ -81,21 +83,22 @@ struct io_wq_work { (work)->func = _func; \ (work)->flags = 0; \ (work)->files = NULL; \ + (work)->mm = NULL; \ + (work)->creds = NULL; \ } while (0) \ typedef void (get_work_fn)(struct io_wq_work *); typedef void (put_work_fn)(struct io_wq_work *); struct io_wq_data { - struct mm_struct *mm; struct user_struct *user; - const struct cred *creds; get_work_fn *get_work; put_work_fn *put_work; }; struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data); +bool io_wq_get(struct io_wq *wq, struct io_wq_data *data); void io_wq_destroy(struct io_wq *wq); void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work); diff --git a/fs/io_uring.c b/fs/io_uring.c index e54556b0fcc6..ac5340fdcdfe 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include @@ -70,6 +71,10 @@ #include #include #include +#include +#include +#include +#include #define CREATE_TRACE_POINTS #include @@ -177,6 +182,21 @@ struct fixed_file_table { struct file **files; }; +enum { + FFD_F_ATOMIC, +}; + +struct fixed_file_data { + struct fixed_file_table *table; + struct io_ring_ctx *ctx; + + struct percpu_ref refs; + struct llist_head put_llist; + unsigned long state; + struct work_struct ref_work; + struct completion done; +}; + struct io_ring_ctx { struct { struct percpu_ref refs; @@ -184,10 +204,11 @@ struct io_ring_ctx { struct { unsigned int flags; - bool compat; - bool account_mem; - bool cq_overflow_flushed; - bool drain_next; + int compat: 1; + int account_mem: 1; + int cq_overflow_flushed: 1; + int drain_next: 1; + int eventfd_async: 1; /* * Ring buffer of indices into array of io_uring_sqe, which is @@ -207,13 +228,14 @@ struct io_ring_ctx { unsigned sq_thread_idle; unsigned cached_sq_dropped; atomic_t cached_cq_overflow; - struct io_uring_sqe *sq_sqes; + unsigned long sq_check_overflow; struct list_head defer_list; struct list_head timeout_list; struct list_head cq_overflow_list; wait_queue_head_t inflight_wait; + struct io_uring_sqe *sq_sqes; } ____cacheline_aligned_in_smp; struct io_rings *rings; @@ -229,8 +251,10 @@ struct io_ring_ctx { * readers must ensure that ->refs is alive as long as the file* is * used. Only updated through io_uring_register(2). */ - struct fixed_file_table *file_table; + struct fixed_file_data *file_data; unsigned nr_user_files; + int ring_fd; + struct file *ring_file; /* if used, fixed mapped user buffers */ unsigned nr_user_bufs; @@ -250,11 +274,14 @@ struct io_ring_ctx { struct socket *ring_sock; #endif + struct idr personality_idr; + struct { unsigned cached_cq_tail; unsigned cq_entries; unsigned cq_mask; atomic_t cq_timeouts; + unsigned long cq_check_overflow; struct wait_queue_head cq_wait; struct fasync_struct *cq_fasync; struct eventfd_ctx *cq_ev_fd; @@ -267,7 +294,8 @@ struct io_ring_ctx { struct { spinlock_t completion_lock; - bool poll_multi_file; + struct llist_head poll_llist; + /* * ->poll_list is protected by the ctx->uring_lock for * io_uring instances that don't use IORING_SETUP_SQPOLL. @@ -277,6 +305,7 @@ struct io_ring_ctx { struct list_head poll_list; struct hlist_head *cancel_hash; unsigned cancel_hash_bits; + bool poll_multi_file; spinlock_t inflight_lock; struct list_head inflight_list; @@ -299,6 +328,12 @@ struct io_poll_iocb { struct wait_queue_entry wait; }; +struct io_close { + struct file *file; + struct file *put_file; + int fd; +}; + struct io_timeout_data { struct io_kiocb *req; struct hrtimer timer; @@ -319,6 +354,7 @@ struct io_sync { loff_t len; loff_t off; int flags; + int mode; }; struct io_cancel { @@ -348,8 +384,52 @@ struct io_connect { struct io_sr_msg { struct file *file; - struct user_msghdr __user *msg; + union { + struct user_msghdr __user *msg; + void __user *buf; + }; int msg_flags; + size_t len; +}; + +struct io_open { + struct file *file; + int dfd; + union { + unsigned mask; + }; + struct filename *filename; + struct statx __user *buffer; + struct open_how how; +}; + +struct io_files_update { + struct file *file; + u64 arg; + u32 nr_args; + u32 offset; +}; + +struct io_fadvise { + struct file *file; + u64 offset; + u32 len; + u32 advice; +}; + +struct io_madvise { + struct file *file; + u64 addr; + u32 len; + u32 advice; +}; + +struct io_epoll { + struct file *file; + int epfd; + int op; + int fd; + struct epoll_event event; }; struct io_async_connect { @@ -370,15 +450,79 @@ struct io_async_rw { ssize_t size; }; +struct io_async_open { + struct filename *filename; +}; + struct io_async_ctx { union { struct io_async_rw rw; struct io_async_msghdr msg; struct io_async_connect connect; struct io_timeout_data timeout; + struct io_async_open open; }; }; +enum { + REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT, + REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT, + REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT, + REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT, + REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT, + + REQ_F_LINK_NEXT_BIT, + REQ_F_FAIL_LINK_BIT, + REQ_F_INFLIGHT_BIT, + REQ_F_CUR_POS_BIT, + REQ_F_NOWAIT_BIT, + REQ_F_IOPOLL_COMPLETED_BIT, + REQ_F_LINK_TIMEOUT_BIT, + REQ_F_TIMEOUT_BIT, + REQ_F_ISREG_BIT, + REQ_F_MUST_PUNT_BIT, + REQ_F_TIMEOUT_NOSEQ_BIT, + REQ_F_COMP_LOCKED_BIT, +}; + +enum { + /* ctx owns file */ + REQ_F_FIXED_FILE = BIT(REQ_F_FIXED_FILE_BIT), + /* drain existing IO first */ + REQ_F_IO_DRAIN = BIT(REQ_F_IO_DRAIN_BIT), + /* linked sqes */ + REQ_F_LINK = BIT(REQ_F_LINK_BIT), + /* doesn't sever on completion < 0 */ + REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT), + /* IOSQE_ASYNC */ + REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT), + + /* already grabbed next link */ + REQ_F_LINK_NEXT = BIT(REQ_F_LINK_NEXT_BIT), + /* fail rest of links */ + REQ_F_FAIL_LINK = BIT(REQ_F_FAIL_LINK_BIT), + /* on inflight list */ + REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT), + /* read/write uses file position */ + REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT), + /* must not punt to workers */ + REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT), + /* polled IO has completed */ + REQ_F_IOPOLL_COMPLETED = BIT(REQ_F_IOPOLL_COMPLETED_BIT), + /* has linked timeout */ + REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT), + /* timeout request */ + REQ_F_TIMEOUT = BIT(REQ_F_TIMEOUT_BIT), + /* regular file */ + REQ_F_ISREG = BIT(REQ_F_ISREG_BIT), + /* must be punted even for NONBLOCK */ + REQ_F_MUST_PUNT = BIT(REQ_F_MUST_PUNT_BIT), + /* no timeout sequence */ + REQ_F_TIMEOUT_NOSEQ = BIT(REQ_F_TIMEOUT_NOSEQ_BIT), + /* completion under lock */ + REQ_F_COMP_LOCKED = BIT(REQ_F_COMP_LOCKED_BIT), +}; + /* * NOTE! Each of the iocb union members has the file pointer * as the first entry in their struct definition. So you can @@ -396,11 +540,19 @@ struct io_kiocb { struct io_timeout timeout; struct io_connect connect; struct io_sr_msg sr_msg; + struct io_open open; + struct io_close close; + struct io_files_update files_update; + struct io_fadvise fadvise; + struct io_madvise madvise; + struct io_epoll epoll; }; struct io_async_ctx *io; - struct file *ring_file; - int ring_fd; + /* + * llist_node is only used for poll deferred completions + */ + struct llist_node llist_node; bool has_user; bool in_async; bool needs_fixed_file; @@ -414,23 +566,6 @@ struct io_kiocb { struct list_head link_list; unsigned int flags; refcount_t refs; -#define REQ_F_NOWAIT 1 /* must not punt to workers */ -#define REQ_F_IOPOLL_COMPLETED 2 /* polled IO has completed */ -#define REQ_F_FIXED_FILE 4 /* ctx owns file */ -#define REQ_F_LINK_NEXT 8 /* already grabbed next link */ -#define REQ_F_IO_DRAIN 16 /* drain existing IO first */ -#define REQ_F_IO_DRAINED 32 /* drain done */ -#define REQ_F_LINK 64 /* linked sqes */ -#define REQ_F_LINK_TIMEOUT 128 /* has linked timeout */ -#define REQ_F_FAIL_LINK 256 /* fail rest of links */ -#define REQ_F_DRAIN_LINK 512 /* link should be fully drained */ -#define REQ_F_TIMEOUT 1024 /* timeout request */ -#define REQ_F_ISREG 2048 /* regular file */ -#define REQ_F_MUST_PUNT 4096 /* must be punted even for NONBLOCK */ -#define REQ_F_TIMEOUT_NOSEQ 8192 /* no timeout sequence */ -#define REQ_F_INFLIGHT 16384 /* on inflight list */ -#define REQ_F_COMP_LOCKED 32768 /* completion under lock */ -#define REQ_F_HARDLINK 65536 /* doesn't sever on completion < 0 */ u64 user_data; u32 result; u32 sequence; @@ -463,14 +598,162 @@ struct io_submit_state { unsigned int ios_left; }; +struct io_op_def { + /* needs req->io allocated for deferral/async */ + unsigned async_ctx : 1; + /* needs current->mm setup, does mm access */ + unsigned needs_mm : 1; + /* needs req->file assigned */ + unsigned needs_file : 1; + /* needs req->file assigned IFF fd is >= 0 */ + unsigned fd_non_neg : 1; + /* hash wq insertion if file is a regular file */ + unsigned hash_reg_file : 1; + /* unbound wq insertion if file is a non-regular file */ + unsigned unbound_nonreg_file : 1; + /* opcode is not supported by this kernel */ + unsigned not_supported : 1; + /* needs file table */ + unsigned file_table : 1; +}; + +static const struct io_op_def io_op_defs[] = { + [IORING_OP_NOP] = {}, + [IORING_OP_READV] = { + .async_ctx = 1, + .needs_mm = 1, + .needs_file = 1, + .unbound_nonreg_file = 1, + }, + [IORING_OP_WRITEV] = { + .async_ctx = 1, + .needs_mm = 1, + .needs_file = 1, + .hash_reg_file = 1, + .unbound_nonreg_file = 1, + }, + [IORING_OP_FSYNC] = { + .needs_file = 1, + }, + [IORING_OP_READ_FIXED] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + }, + [IORING_OP_WRITE_FIXED] = { + .needs_file = 1, + .hash_reg_file = 1, + .unbound_nonreg_file = 1, + }, + [IORING_OP_POLL_ADD] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + }, + [IORING_OP_POLL_REMOVE] = {}, + [IORING_OP_SYNC_FILE_RANGE] = { + .needs_file = 1, + }, + [IORING_OP_SENDMSG] = { + .async_ctx = 1, + .needs_mm = 1, + .needs_file = 1, + .unbound_nonreg_file = 1, + }, + [IORING_OP_RECVMSG] = { + .async_ctx = 1, + .needs_mm = 1, + .needs_file = 1, + .unbound_nonreg_file = 1, + }, + [IORING_OP_TIMEOUT] = { + .async_ctx = 1, + .needs_mm = 1, + }, + [IORING_OP_TIMEOUT_REMOVE] = {}, + [IORING_OP_ACCEPT] = { + .needs_mm = 1, + .needs_file = 1, + .unbound_nonreg_file = 1, + .file_table = 1, + }, + [IORING_OP_ASYNC_CANCEL] = {}, + [IORING_OP_LINK_TIMEOUT] = { + .async_ctx = 1, + .needs_mm = 1, + }, + [IORING_OP_CONNECT] = { + .async_ctx = 1, + .needs_mm = 1, + .needs_file = 1, + .unbound_nonreg_file = 1, + }, + [IORING_OP_FALLOCATE] = { + .needs_file = 1, + }, + [IORING_OP_OPENAT] = { + .needs_file = 1, + .fd_non_neg = 1, + .file_table = 1, + }, + [IORING_OP_CLOSE] = { + .needs_file = 1, + .file_table = 1, + }, + [IORING_OP_FILES_UPDATE] = { + .needs_mm = 1, + .file_table = 1, + }, + [IORING_OP_STATX] = { + .needs_mm = 1, + .needs_file = 1, + .fd_non_neg = 1, + }, + [IORING_OP_READ] = { + .needs_mm = 1, + .needs_file = 1, + .unbound_nonreg_file = 1, + }, + [IORING_OP_WRITE] = { + .needs_mm = 1, + .needs_file = 1, + .unbound_nonreg_file = 1, + }, + [IORING_OP_FADVISE] = { + .needs_file = 1, + }, + [IORING_OP_MADVISE] = { + .needs_mm = 1, + }, + [IORING_OP_SEND] = { + .needs_mm = 1, + .needs_file = 1, + .unbound_nonreg_file = 1, + }, + [IORING_OP_RECV] = { + .needs_mm = 1, + .needs_file = 1, + .unbound_nonreg_file = 1, + }, + [IORING_OP_OPENAT2] = { + .needs_file = 1, + .fd_non_neg = 1, + .file_table = 1, + }, + [IORING_OP_EPOLL_CTL] = { + .unbound_nonreg_file = 1, + .file_table = 1, + }, +}; + static void io_wq_submit_work(struct io_wq_work **workptr); static void io_cqring_fill_event(struct io_kiocb *req, long res); -static void __io_free_req(struct io_kiocb *req); static void io_put_req(struct io_kiocb *req); -static void io_double_put_req(struct io_kiocb *req); static void __io_double_put_req(struct io_kiocb *req); static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req); static void io_queue_linked_timeout(struct io_kiocb *req); +static int __io_sqe_files_update(struct io_ring_ctx *ctx, + struct io_uring_files_update *ip, + unsigned nr_args); +static int io_grab_files(struct io_kiocb *req); static struct kmem_cache *req_cachep; @@ -537,9 +820,11 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->cq_overflow_list); init_completion(&ctx->completions[0]); init_completion(&ctx->completions[1]); + idr_init(&ctx->personality_idr); mutex_init(&ctx->uring_lock); init_waitqueue_head(&ctx->wait); spin_lock_init(&ctx->completion_lock); + init_llist_head(&ctx->poll_llist); INIT_LIST_HEAD(&ctx->poll_list); INIT_LIST_HEAD(&ctx->defer_list); INIT_LIST_HEAD(&ctx->timeout_list); @@ -566,7 +851,7 @@ static inline bool __req_need_defer(struct io_kiocb *req) static inline bool req_need_defer(struct io_kiocb *req) { - if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) == REQ_F_IO_DRAIN) + if (unlikely(req->flags & REQ_F_IO_DRAIN)) return __req_need_defer(req); return false; @@ -606,53 +891,53 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx) { struct io_rings *rings = ctx->rings; - if (ctx->cached_cq_tail != READ_ONCE(rings->cq.tail)) { - /* order cqe stores with ring update */ - smp_store_release(&rings->cq.tail, ctx->cached_cq_tail); + /* order cqe stores with ring update */ + smp_store_release(&rings->cq.tail, ctx->cached_cq_tail); - if (wq_has_sleeper(&ctx->cq_wait)) { - wake_up_interruptible(&ctx->cq_wait); - kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN); - } + if (wq_has_sleeper(&ctx->cq_wait)) { + wake_up_interruptible(&ctx->cq_wait); + kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN); } } -static inline bool io_req_needs_user(struct io_kiocb *req) +static inline void io_req_work_grab_env(struct io_kiocb *req, + const struct io_op_def *def) { - return !(req->opcode == IORING_OP_READ_FIXED || - req->opcode == IORING_OP_WRITE_FIXED); + if (!req->work.mm && def->needs_mm) { + mmgrab(current->mm); + req->work.mm = current->mm; + } + if (!req->work.creds) + req->work.creds = get_current_cred(); +} + +static inline void io_req_work_drop_env(struct io_kiocb *req) +{ + if (req->work.mm) { + mmdrop(req->work.mm); + req->work.mm = NULL; + } + if (req->work.creds) { + put_cred(req->work.creds); + req->work.creds = NULL; + } } static inline bool io_prep_async_work(struct io_kiocb *req, struct io_kiocb **link) { + const struct io_op_def *def = &io_op_defs[req->opcode]; bool do_hashed = false; - switch (req->opcode) { - case IORING_OP_WRITEV: - case IORING_OP_WRITE_FIXED: - /* only regular files should be hashed for writes */ - if (req->flags & REQ_F_ISREG) + if (req->flags & REQ_F_ISREG) { + if (def->hash_reg_file) do_hashed = true; - /* fall-through */ - case IORING_OP_READV: - case IORING_OP_READ_FIXED: - case IORING_OP_SENDMSG: - case IORING_OP_RECVMSG: - case IORING_OP_ACCEPT: - case IORING_OP_POLL_ADD: - case IORING_OP_CONNECT: - /* - * We know REQ_F_ISREG is not set on some of these - * opcodes, but this enables us to keep the check in - * just one place. - */ - if (!(req->flags & REQ_F_ISREG)) + } else { + if (def->unbound_nonreg_file) req->work.flags |= IO_WQ_WORK_UNBOUND; - break; } - if (io_req_needs_user(req)) - req->work.flags |= IO_WQ_WORK_NEEDS_USER; + + io_req_work_grab_env(req, def); *link = io_prep_linked_timeout(req); return do_hashed; @@ -711,10 +996,8 @@ static void io_commit_cqring(struct io_ring_ctx *ctx) __io_commit_cqring(ctx); - while ((req = io_get_deferred_req(ctx)) != NULL) { - req->flags |= REQ_F_IO_DRAINED; + while ((req = io_get_deferred_req(ctx)) != NULL) io_queue_async_work(req); - } } static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx) @@ -735,13 +1018,20 @@ static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx) return &rings->cqes[tail & ctx->cq_mask]; } +static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx) +{ + if (!ctx->eventfd_async) + return true; + return io_wq_current_is_worker() || in_interrupt(); +} + static void io_cqring_ev_posted(struct io_ring_ctx *ctx) { if (waitqueue_active(&ctx->wait)) wake_up(&ctx->wait); if (waitqueue_active(&ctx->sqo_wait)) wake_up(&ctx->sqo_wait); - if (ctx->cq_ev_fd) + if (ctx->cq_ev_fd && io_should_trigger_evfd(ctx)) eventfd_signal(ctx->cq_ev_fd, 1); } @@ -766,7 +1056,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) /* if force is set, the ring is going away. always drop after that */ if (force) - ctx->cq_overflow_flushed = true; + ctx->cq_overflow_flushed = 1; cqe = NULL; while (!list_empty(&ctx->cq_overflow_list)) { @@ -788,6 +1078,10 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) } io_commit_cqring(ctx); + if (cqe) { + clear_bit(0, &ctx->sq_check_overflow); + clear_bit(0, &ctx->cq_check_overflow); + } spin_unlock_irqrestore(&ctx->completion_lock, flags); io_cqring_ev_posted(ctx); @@ -821,6 +1115,10 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res) WRITE_ONCE(ctx->rings->cq_overflow, atomic_inc_return(&ctx->cached_cq_overflow)); } else { + if (list_empty(&ctx->cq_overflow_list)) { + set_bit(0, &ctx->sq_check_overflow); + set_bit(0, &ctx->cq_check_overflow); + } refcount_inc(&req->refs); req->result = res; list_add_tail(&req->list, &ctx->cq_overflow_list); @@ -863,9 +1161,6 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx, gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; struct io_kiocb *req; - if (!percpu_ref_tryget(&ctx->refs)) - return NULL; - if (!state) { req = kmem_cache_alloc(req_cachep, gfp); if (unlikely(!req)) @@ -898,7 +1193,6 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx, got_it: req->io = NULL; - req->ring_file = NULL; req->file = NULL; req->ctx = ctx; req->flags = 0; @@ -915,24 +1209,35 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx, return NULL; } -static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr) +static void __io_req_do_free(struct io_kiocb *req) { - if (*nr) { - kmem_cache_free_bulk(req_cachep, *nr, reqs); - percpu_ref_put_many(&ctx->refs, *nr); - *nr = 0; + if (likely(!io_is_fallback_req(req))) + kmem_cache_free(req_cachep, req); + else + clear_bit_unlock(0, (unsigned long *) req->ctx->fallback_req); +} + +static void __io_req_aux_free(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + + kfree(req->io); + if (req->file) { + if (req->flags & REQ_F_FIXED_FILE) + percpu_ref_put(&ctx->file_data->refs); + else + fput(req->file); } + + io_req_work_drop_env(req); } static void __io_free_req(struct io_kiocb *req) { - struct io_ring_ctx *ctx = req->ctx; + __io_req_aux_free(req); - if (req->io) - kfree(req->io); - if (req->file && !(req->flags & REQ_F_FIXED_FILE)) - fput(req->file); if (req->flags & REQ_F_INFLIGHT) { + struct io_ring_ctx *ctx = req->ctx; unsigned long flags; spin_lock_irqsave(&ctx->inflight_lock, flags); @@ -941,11 +1246,63 @@ static void __io_free_req(struct io_kiocb *req) wake_up(&ctx->inflight_wait); spin_unlock_irqrestore(&ctx->inflight_lock, flags); } - percpu_ref_put(&ctx->refs); - if (likely(!io_is_fallback_req(req))) - kmem_cache_free(req_cachep, req); - else - clear_bit_unlock(0, (unsigned long *) ctx->fallback_req); + + percpu_ref_put(&req->ctx->refs); + __io_req_do_free(req); +} + +struct req_batch { + void *reqs[IO_IOPOLL_BATCH]; + int to_free; + int need_iter; +}; + +static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb) +{ + int fixed_refs = rb->to_free; + + if (!rb->to_free) + return; + if (rb->need_iter) { + int i, inflight = 0; + unsigned long flags; + + fixed_refs = 0; + for (i = 0; i < rb->to_free; i++) { + struct io_kiocb *req = rb->reqs[i]; + + if (req->flags & REQ_F_FIXED_FILE) { + req->file = NULL; + fixed_refs++; + } + if (req->flags & REQ_F_INFLIGHT) + inflight++; + __io_req_aux_free(req); + } + if (!inflight) + goto do_free; + + spin_lock_irqsave(&ctx->inflight_lock, flags); + for (i = 0; i < rb->to_free; i++) { + struct io_kiocb *req = rb->reqs[i]; + + if (req->flags & REQ_F_INFLIGHT) { + list_del(&req->inflight_entry); + if (!--inflight) + break; + } + } + spin_unlock_irqrestore(&ctx->inflight_lock, flags); + + if (waitqueue_active(&ctx->inflight_wait)) + wake_up(&ctx->inflight_wait); + } +do_free: + kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs); + if (fixed_refs) + percpu_ref_put_many(&ctx->file_data->refs, fixed_refs); + percpu_ref_put_many(&ctx->refs, rb->to_free); + rb->to_free = rb->need_iter = 0; } static bool io_link_cancel_timeout(struct io_kiocb *req) @@ -1118,19 +1475,21 @@ static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush) { struct io_rings *rings = ctx->rings; - /* - * noflush == true is from the waitqueue handler, just ensure we wake - * up the task, and the next invocation will flush the entries. We - * cannot safely to it from here. - */ - if (noflush && !list_empty(&ctx->cq_overflow_list)) - return -1U; + if (test_bit(0, &ctx->cq_check_overflow)) { + /* + * noflush == true is from the waitqueue handler, just ensure + * we wake up the task, and the next invocation will flush the + * entries. We cannot safely to it from here. + */ + if (noflush && !list_empty(&ctx->cq_overflow_list)) + return -1U; - io_cqring_overflow_flush(ctx, false); + io_cqring_overflow_flush(ctx, false); + } /* See comment at the top of this file */ smp_rmb(); - return READ_ONCE(rings->cq.tail) - READ_ONCE(rings->cq.head); + return ctx->cached_cq_tail - READ_ONCE(rings->cq.head); } static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) @@ -1141,17 +1500,30 @@ static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head; } +static inline bool io_req_multi_free(struct req_batch *rb, struct io_kiocb *req) +{ + if ((req->flags & REQ_F_LINK) || io_is_fallback_req(req)) + return false; + + if (!(req->flags & REQ_F_FIXED_FILE) || req->io) + rb->need_iter++; + + rb->reqs[rb->to_free++] = req; + if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs))) + io_free_req_many(req->ctx, rb); + return true; +} + /* * Find and free completed poll iocbs */ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, struct list_head *done) { - void *reqs[IO_IOPOLL_BATCH]; + struct req_batch rb; struct io_kiocb *req; - int to_free; - to_free = 0; + rb.to_free = rb.need_iter = 0; while (!list_empty(done)) { req = list_first_entry(done, struct io_kiocb, list); list_del(&req->list); @@ -1159,26 +1531,13 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, io_cqring_fill_event(req, req->result); (*nr_events)++; - if (refcount_dec_and_test(&req->refs)) { - /* If we're not using fixed files, we have to pair the - * completion part with the file put. Use regular - * completions for those, only batch free for fixed - * file and non-linked commands. - */ - if (((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) == - REQ_F_FIXED_FILE) && !io_is_fallback_req(req) && - !req->io) { - reqs[to_free++] = req; - if (to_free == ARRAY_SIZE(reqs)) - io_free_req_many(ctx, reqs, &to_free); - } else { - io_free_req(req); - } - } + if (refcount_dec_and_test(&req->refs) && + !io_req_multi_free(&rb, req)) + io_free_req(req); } io_commit_cqring(ctx); - io_free_req_many(ctx, reqs, &to_free); + io_free_req_many(ctx, &rb); } static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, @@ -1503,6 +1862,10 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, req->flags |= REQ_F_ISREG; kiocb->ki_pos = READ_ONCE(sqe->off); + if (kiocb->ki_pos == -1 && !(req->file->f_mode & FMODE_STREAM)) { + req->flags |= REQ_F_CUR_POS; + kiocb->ki_pos = req->file->f_pos; + } kiocb->ki_flags = iocb_flags(kiocb->ki_filp); kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp)); @@ -1574,6 +1937,10 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) static void kiocb_done(struct kiocb *kiocb, ssize_t ret, struct io_kiocb **nxt, bool in_async) { + struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); + + if (req->flags & REQ_F_CUR_POS) + req->file->f_pos = kiocb->ki_pos; if (in_async && ret >= 0 && kiocb->ki_complete == io_complete_rw) *nxt = __io_complete_rw(kiocb, ret); else @@ -1671,6 +2038,13 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req, if (req->rw.kiocb.private) return -EINVAL; + if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) { + ssize_t ret; + ret = import_single_range(rw, buf, sqe_len, *iovec, iter); + *iovec = NULL; + return ret; + } + if (req->io) { struct io_async_rw *iorw = &req->io->rw; @@ -1767,6 +2141,8 @@ static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size, static int io_alloc_async_ctx(struct io_kiocb *req) { + if (!io_op_defs[req->opcode].async_ctx) + return 0; req->io = kmalloc(sizeof(*req->io), GFP_KERNEL); return req->io == NULL; } @@ -1786,8 +2162,7 @@ static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size, struct iovec *iovec, struct iovec *fast_iov, struct iov_iter *iter) { - if (req->opcode == IORING_OP_READ_FIXED || - req->opcode == IORING_OP_WRITE_FIXED) + if (!io_op_defs[req->opcode].async_ctx) return 0; if (!req->io && io_alloc_async_ctx(req)) return -ENOMEM; @@ -2101,6 +2476,421 @@ static int io_fsync(struct io_kiocb *req, struct io_kiocb **nxt, return 0; } +static void io_fallocate_finish(struct io_wq_work **workptr) +{ + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); + struct io_kiocb *nxt = NULL; + int ret; + + ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off, + req->sync.len); + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req_find_next(req, &nxt); + if (nxt) + io_wq_assign_next(workptr, nxt); +} + +static int io_fallocate_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + if (sqe->ioprio || sqe->buf_index || sqe->rw_flags) + return -EINVAL; + + req->sync.off = READ_ONCE(sqe->off); + req->sync.len = READ_ONCE(sqe->addr); + req->sync.mode = READ_ONCE(sqe->len); + return 0; +} + +static int io_fallocate(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ + struct io_wq_work *work, *old_work; + + /* fallocate always requiring blocking context */ + if (force_nonblock) { + io_put_req(req); + req->work.func = io_fallocate_finish; + return -EAGAIN; + } + + work = old_work = &req->work; + io_fallocate_finish(&work); + if (work && work != old_work) + *nxt = container_of(work, struct io_kiocb, work); + + return 0; +} + +static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + const char __user *fname; + int ret; + + if (sqe->ioprio || sqe->buf_index) + return -EINVAL; + + req->open.dfd = READ_ONCE(sqe->fd); + req->open.how.mode = READ_ONCE(sqe->len); + fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); + req->open.how.flags = READ_ONCE(sqe->open_flags); + + req->open.filename = getname(fname); + if (IS_ERR(req->open.filename)) { + ret = PTR_ERR(req->open.filename); + req->open.filename = NULL; + return ret; + } + + return 0; +} + +static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct open_how __user *how; + const char __user *fname; + size_t len; + int ret; + + if (sqe->ioprio || sqe->buf_index) + return -EINVAL; + + req->open.dfd = READ_ONCE(sqe->fd); + fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); + how = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + len = READ_ONCE(sqe->len); + + if (len < OPEN_HOW_SIZE_VER0) + return -EINVAL; + + ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how, + len); + if (ret) + return ret; + + if (!(req->open.how.flags & O_PATH) && force_o_largefile()) + req->open.how.flags |= O_LARGEFILE; + + req->open.filename = getname(fname); + if (IS_ERR(req->open.filename)) { + ret = PTR_ERR(req->open.filename); + req->open.filename = NULL; + return ret; + } + + return 0; +} + +static int io_openat2(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ + struct open_flags op; + struct file *file; + int ret; + + if (force_nonblock) + return -EAGAIN; + + ret = build_open_flags(&req->open.how, &op); + if (ret) + goto err; + + ret = get_unused_fd_flags(req->open.how.flags); + if (ret < 0) + goto err; + + file = do_filp_open(req->open.dfd, req->open.filename, &op); + if (IS_ERR(file)) { + put_unused_fd(ret); + ret = PTR_ERR(file); + } else { + fsnotify_open(file); + fd_install(ret, file); + } +err: + putname(req->open.filename); + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req_find_next(req, nxt); + return 0; +} + +static int io_openat(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ + req->open.how = build_open_how(req->open.how.flags, req->open.how.mode); + return io_openat2(req, nxt, force_nonblock); +} + +static int io_epoll_ctl_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ +#if defined(CONFIG_EPOLL) + if (sqe->ioprio || sqe->buf_index) + return -EINVAL; + + req->epoll.epfd = READ_ONCE(sqe->fd); + req->epoll.op = READ_ONCE(sqe->len); + req->epoll.fd = READ_ONCE(sqe->off); + + if (ep_op_has_event(req->epoll.op)) { + struct epoll_event __user *ev; + + ev = u64_to_user_ptr(READ_ONCE(sqe->addr)); + if (copy_from_user(&req->epoll.event, ev, sizeof(*ev))) + return -EFAULT; + } + + return 0; +#else + return -EOPNOTSUPP; +#endif +} + +static int io_epoll_ctl(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ +#if defined(CONFIG_EPOLL) + struct io_epoll *ie = &req->epoll; + int ret; + + ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock); + if (force_nonblock && ret == -EAGAIN) + return -EAGAIN; + + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req_find_next(req, nxt); + return 0; +#else + return -EOPNOTSUPP; +#endif +} + +static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ +#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) + if (sqe->ioprio || sqe->buf_index || sqe->off) + return -EINVAL; + + req->madvise.addr = READ_ONCE(sqe->addr); + req->madvise.len = READ_ONCE(sqe->len); + req->madvise.advice = READ_ONCE(sqe->fadvise_advice); + return 0; +#else + return -EOPNOTSUPP; +#endif +} + +static int io_madvise(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ +#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) + struct io_madvise *ma = &req->madvise; + int ret; + + if (force_nonblock) + return -EAGAIN; + + ret = do_madvise(ma->addr, ma->len, ma->advice); + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req_find_next(req, nxt); + return 0; +#else + return -EOPNOTSUPP; +#endif +} + +static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + if (sqe->ioprio || sqe->buf_index || sqe->addr) + return -EINVAL; + + req->fadvise.offset = READ_ONCE(sqe->off); + req->fadvise.len = READ_ONCE(sqe->len); + req->fadvise.advice = READ_ONCE(sqe->fadvise_advice); + return 0; +} + +static int io_fadvise(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ + struct io_fadvise *fa = &req->fadvise; + int ret; + + /* DONTNEED may block, others _should_ not */ + if (fa->advice == POSIX_FADV_DONTNEED && force_nonblock) + return -EAGAIN; + + ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice); + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req_find_next(req, nxt); + return 0; +} + +static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + const char __user *fname; + unsigned lookup_flags; + int ret; + + if (sqe->ioprio || sqe->buf_index) + return -EINVAL; + + req->open.dfd = READ_ONCE(sqe->fd); + req->open.mask = READ_ONCE(sqe->len); + fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); + req->open.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + req->open.how.flags = READ_ONCE(sqe->statx_flags); + + if (vfs_stat_set_lookup_flags(&lookup_flags, req->open.how.flags)) + return -EINVAL; + + req->open.filename = getname_flags(fname, lookup_flags, NULL); + if (IS_ERR(req->open.filename)) { + ret = PTR_ERR(req->open.filename); + req->open.filename = NULL; + return ret; + } + + return 0; +} + +static int io_statx(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ + struct io_open *ctx = &req->open; + unsigned lookup_flags; + struct path path; + struct kstat stat; + int ret; + + if (force_nonblock) + return -EAGAIN; + + if (vfs_stat_set_lookup_flags(&lookup_flags, ctx->how.flags)) + return -EINVAL; + +retry: + /* filename_lookup() drops it, keep a reference */ + ctx->filename->refcnt++; + + ret = filename_lookup(ctx->dfd, ctx->filename, lookup_flags, &path, + NULL); + if (ret) + goto err; + + ret = vfs_getattr(&path, &stat, ctx->mask, ctx->how.flags); + path_put(&path); + if (retry_estale(ret, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } + if (!ret) + ret = cp_statx(&stat, ctx->buffer); +err: + putname(ctx->filename); + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req_find_next(req, nxt); + return 0; +} + +static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + /* + * If we queue this for async, it must not be cancellable. That would + * leave the 'file' in an undeterminate state. + */ + req->work.flags |= IO_WQ_WORK_NO_CANCEL; + + if (sqe->ioprio || sqe->off || sqe->addr || sqe->len || + sqe->rw_flags || sqe->buf_index) + return -EINVAL; + if (sqe->flags & IOSQE_FIXED_FILE) + return -EINVAL; + + req->close.fd = READ_ONCE(sqe->fd); + if (req->file->f_op == &io_uring_fops || + req->close.fd == req->ctx->ring_fd) + return -EBADF; + + return 0; +} + +static void io_close_finish(struct io_wq_work **workptr) +{ + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); + struct io_kiocb *nxt = NULL; + + /* Invoked with files, we need to do the close */ + if (req->work.files) { + int ret; + + ret = filp_close(req->close.put_file, req->work.files); + if (ret < 0) { + req_set_fail_links(req); + } + io_cqring_add_event(req, ret); + } + + fput(req->close.put_file); + + /* we bypassed the re-issue, drop the submission reference */ + io_put_req(req); + io_put_req_find_next(req, &nxt); + if (nxt) + io_wq_assign_next(workptr, nxt); +} + +static int io_close(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ + int ret; + + req->close.put_file = NULL; + ret = __close_fd_get_file(req->close.fd, &req->close.put_file); + if (ret < 0) + return ret; + + /* if the file has a flush method, be safe and punt to async */ + if (req->close.put_file->f_op->flush && !io_wq_current_is_worker()) + goto eagain; + + /* + * No ->flush(), safely close from here and just punt the + * fput() to async context. + */ + ret = filp_close(req->close.put_file, current->files); + + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + + if (io_wq_current_is_worker()) { + struct io_wq_work *old_work, *work; + + old_work = work = &req->work; + io_close_finish(&work); + if (work && work != old_work) + *nxt = container_of(work, struct io_kiocb, work); + return 0; + } + +eagain: + req->work.func = io_close_finish; + return -EAGAIN; +} + static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_ring_ctx *ctx = req->ctx; @@ -2178,8 +2968,9 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) sr->msg_flags = READ_ONCE(sqe->msg_flags); sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); + sr->len = READ_ONCE(sqe->len); - if (!io) + if (!io || req->opcode == IORING_OP_SEND) return 0; io->msg.iov = io->msg.fast_iov; @@ -2259,6 +3050,56 @@ static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt, #endif } +static int io_send(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ +#if defined(CONFIG_NET) + struct socket *sock; + int ret; + + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + + sock = sock_from_file(req->file, &ret); + if (sock) { + struct io_sr_msg *sr = &req->sr_msg; + struct msghdr msg; + struct iovec iov; + unsigned flags; + + ret = import_single_range(WRITE, sr->buf, sr->len, &iov, + &msg.msg_iter); + if (ret) + return ret; + + msg.msg_name = NULL; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_namelen = 0; + + flags = req->sr_msg.msg_flags; + if (flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; + else if (force_nonblock) + flags |= MSG_DONTWAIT; + + ret = __sys_sendmsg_sock(sock, &msg, flags); + if (force_nonblock && ret == -EAGAIN) + return -EAGAIN; + if (ret == -ERESTARTSYS) + ret = -EINTR; + } + + io_cqring_add_event(req, ret); + if (ret < 0) + req_set_fail_links(req); + io_put_req_find_next(req, nxt); + return 0; +#else + return -EOPNOTSUPP; +#endif +} + static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -2269,7 +3110,7 @@ static int io_recvmsg_prep(struct io_kiocb *req, sr->msg_flags = READ_ONCE(sqe->msg_flags); sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); - if (!io) + if (!io || req->opcode == IORING_OP_RECV) return 0; io->msg.iov = io->msg.fast_iov; @@ -2351,6 +3192,59 @@ static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt, #endif } +static int io_recv(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ +#if defined(CONFIG_NET) + struct socket *sock; + int ret; + + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + + sock = sock_from_file(req->file, &ret); + if (sock) { + struct io_sr_msg *sr = &req->sr_msg; + struct msghdr msg; + struct iovec iov; + unsigned flags; + + ret = import_single_range(READ, sr->buf, sr->len, &iov, + &msg.msg_iter); + if (ret) + return ret; + + msg.msg_name = NULL; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_namelen = 0; + msg.msg_iocb = NULL; + msg.msg_flags = 0; + + flags = req->sr_msg.msg_flags; + if (flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; + else if (force_nonblock) + flags |= MSG_DONTWAIT; + + ret = __sys_recvmsg_sock(sock, &msg, NULL, NULL, flags); + if (force_nonblock && ret == -EAGAIN) + return -EAGAIN; + if (ret == -ERESTARTSYS) + ret = -EINTR; + } + + io_cqring_add_event(req, ret); + if (ret < 0) + req_set_fail_links(req); + io_put_req_find_next(req, nxt); + return 0; +#else + return -EOPNOTSUPP; +#endif +} + + static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { #if defined(CONFIG_NET) @@ -2414,7 +3308,6 @@ static int io_accept(struct io_kiocb *req, struct io_kiocb **nxt, ret = __io_accept(req, nxt, force_nonblock); if (ret == -EAGAIN && force_nonblock) { req->work.func = io_accept_finish; - req->work.flags |= IO_WQ_WORK_NEEDS_FILES; io_put_req(req); return -EAGAIN; } @@ -2635,6 +3528,39 @@ static void io_poll_complete_work(struct io_wq_work **workptr) io_wq_assign_next(workptr, nxt); } +static void __io_poll_flush(struct io_ring_ctx *ctx, struct llist_node *nodes) +{ + struct io_kiocb *req, *tmp; + struct req_batch rb; + + rb.to_free = rb.need_iter = 0; + spin_lock_irq(&ctx->completion_lock); + llist_for_each_entry_safe(req, tmp, nodes, llist_node) { + hash_del(&req->hash_node); + io_poll_complete(req, req->result, 0); + + if (refcount_dec_and_test(&req->refs) && + !io_req_multi_free(&rb, req)) { + req->flags |= REQ_F_COMP_LOCKED; + io_free_req(req); + } + } + spin_unlock_irq(&ctx->completion_lock); + + io_cqring_ev_posted(ctx); + io_free_req_many(ctx, &rb); +} + +static void io_poll_flush(struct io_wq_work **workptr) +{ + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); + struct llist_node *nodes; + + nodes = llist_del_all(&req->ctx->poll_llist); + if (nodes) + __io_poll_flush(req->ctx, nodes); +} + static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, void *key) { @@ -2642,7 +3568,6 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, struct io_kiocb *req = container_of(poll, struct io_kiocb, poll); struct io_ring_ctx *ctx = req->ctx; __poll_t mask = key_to_poll(key); - unsigned long flags; /* for instances that support it check for an event match first: */ if (mask && !(mask & poll->events)) @@ -2656,17 +3581,31 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, * If we have a link timeout we're going to need the completion_lock * for finalizing the request, mark us as having grabbed that already. */ - if (mask && spin_trylock_irqsave(&ctx->completion_lock, flags)) { - hash_del(&req->hash_node); - io_poll_complete(req, mask, 0); - req->flags |= REQ_F_COMP_LOCKED; - io_put_req(req); - spin_unlock_irqrestore(&ctx->completion_lock, flags); + if (mask) { + unsigned long flags; - io_cqring_ev_posted(ctx); - } else { - io_queue_async_work(req); + if (llist_empty(&ctx->poll_llist) && + spin_trylock_irqsave(&ctx->completion_lock, flags)) { + hash_del(&req->hash_node); + io_poll_complete(req, mask, 0); + req->flags |= REQ_F_COMP_LOCKED; + io_put_req(req); + spin_unlock_irqrestore(&ctx->completion_lock, flags); + + io_cqring_ev_posted(ctx); + req = NULL; + } else { + req->result = mask; + req->llist_node.next = NULL; + /* if the list wasn't empty, we're done */ + if (!llist_add(&req->llist_node, &ctx->poll_llist)) + req = NULL; + else + req->work.func = io_poll_flush; + } } + if (req) + io_queue_async_work(req); return 1; } @@ -3071,20 +4010,67 @@ static int io_async_cancel(struct io_kiocb *req, struct io_kiocb **nxt) return 0; } +static int io_files_update_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + if (sqe->flags || sqe->ioprio || sqe->rw_flags) + return -EINVAL; + + req->files_update.offset = READ_ONCE(sqe->off); + req->files_update.nr_args = READ_ONCE(sqe->len); + if (!req->files_update.nr_args) + return -EINVAL; + req->files_update.arg = READ_ONCE(sqe->addr); + return 0; +} + +static int io_files_update(struct io_kiocb *req, bool force_nonblock) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_uring_files_update up; + int ret; + + if (force_nonblock) + return -EAGAIN; + + up.offset = req->files_update.offset; + up.fds = req->files_update.arg; + + mutex_lock(&ctx->uring_lock); + ret = __io_sqe_files_update(ctx, &up, req->files_update.nr_args); + mutex_unlock(&ctx->uring_lock); + + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req(req); + return 0; +} + static int io_req_defer_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { ssize_t ret = 0; + if (io_op_defs[req->opcode].file_table) { + ret = io_grab_files(req); + if (unlikely(ret)) + return ret; + } + + io_req_work_grab_env(req, &io_op_defs[req->opcode]); + switch (req->opcode) { case IORING_OP_NOP: break; case IORING_OP_READV: case IORING_OP_READ_FIXED: + case IORING_OP_READ: ret = io_read_prep(req, sqe, true); break; case IORING_OP_WRITEV: case IORING_OP_WRITE_FIXED: + case IORING_OP_WRITE: ret = io_write_prep(req, sqe, true); break; case IORING_OP_POLL_ADD: @@ -3100,9 +4086,11 @@ static int io_req_defer_prep(struct io_kiocb *req, ret = io_prep_sfr(req, sqe); break; case IORING_OP_SENDMSG: + case IORING_OP_SEND: ret = io_sendmsg_prep(req, sqe); break; case IORING_OP_RECVMSG: + case IORING_OP_RECV: ret = io_recvmsg_prep(req, sqe); break; case IORING_OP_CONNECT: @@ -3123,6 +4111,33 @@ static int io_req_defer_prep(struct io_kiocb *req, case IORING_OP_ACCEPT: ret = io_accept_prep(req, sqe); break; + case IORING_OP_FALLOCATE: + ret = io_fallocate_prep(req, sqe); + break; + case IORING_OP_OPENAT: + ret = io_openat_prep(req, sqe); + break; + case IORING_OP_CLOSE: + ret = io_close_prep(req, sqe); + break; + case IORING_OP_FILES_UPDATE: + ret = io_files_update_prep(req, sqe); + break; + case IORING_OP_STATX: + ret = io_statx_prep(req, sqe); + break; + case IORING_OP_FADVISE: + ret = io_fadvise_prep(req, sqe); + break; + case IORING_OP_MADVISE: + ret = io_madvise_prep(req, sqe); + break; + case IORING_OP_OPENAT2: + ret = io_openat2_prep(req, sqe); + break; + case IORING_OP_EPOLL_CTL: + ret = io_epoll_ctl_prep(req, sqe); + break; default: printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", req->opcode); @@ -3173,6 +4188,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, break; case IORING_OP_READV: case IORING_OP_READ_FIXED: + case IORING_OP_READ: if (sqe) { ret = io_read_prep(req, sqe, force_nonblock); if (ret < 0) @@ -3182,6 +4198,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, break; case IORING_OP_WRITEV: case IORING_OP_WRITE_FIXED: + case IORING_OP_WRITE: if (sqe) { ret = io_write_prep(req, sqe, force_nonblock); if (ret < 0) @@ -3222,20 +4239,28 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, ret = io_sync_file_range(req, nxt, force_nonblock); break; case IORING_OP_SENDMSG: + case IORING_OP_SEND: if (sqe) { ret = io_sendmsg_prep(req, sqe); if (ret < 0) break; } - ret = io_sendmsg(req, nxt, force_nonblock); + if (req->opcode == IORING_OP_SENDMSG) + ret = io_sendmsg(req, nxt, force_nonblock); + else + ret = io_send(req, nxt, force_nonblock); break; case IORING_OP_RECVMSG: + case IORING_OP_RECV: if (sqe) { ret = io_recvmsg_prep(req, sqe); if (ret) break; } - ret = io_recvmsg(req, nxt, force_nonblock); + if (req->opcode == IORING_OP_RECVMSG) + ret = io_recvmsg(req, nxt, force_nonblock); + else + ret = io_recv(req, nxt, force_nonblock); break; case IORING_OP_TIMEOUT: if (sqe) { @@ -3277,6 +4302,78 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, } ret = io_async_cancel(req, nxt); break; + case IORING_OP_FALLOCATE: + if (sqe) { + ret = io_fallocate_prep(req, sqe); + if (ret) + break; + } + ret = io_fallocate(req, nxt, force_nonblock); + break; + case IORING_OP_OPENAT: + if (sqe) { + ret = io_openat_prep(req, sqe); + if (ret) + break; + } + ret = io_openat(req, nxt, force_nonblock); + break; + case IORING_OP_CLOSE: + if (sqe) { + ret = io_close_prep(req, sqe); + if (ret) + break; + } + ret = io_close(req, nxt, force_nonblock); + break; + case IORING_OP_FILES_UPDATE: + if (sqe) { + ret = io_files_update_prep(req, sqe); + if (ret) + break; + } + ret = io_files_update(req, force_nonblock); + break; + case IORING_OP_STATX: + if (sqe) { + ret = io_statx_prep(req, sqe); + if (ret) + break; + } + ret = io_statx(req, nxt, force_nonblock); + break; + case IORING_OP_FADVISE: + if (sqe) { + ret = io_fadvise_prep(req, sqe); + if (ret) + break; + } + ret = io_fadvise(req, nxt, force_nonblock); + break; + case IORING_OP_MADVISE: + if (sqe) { + ret = io_madvise_prep(req, sqe); + if (ret) + break; + } + ret = io_madvise(req, nxt, force_nonblock); + break; + case IORING_OP_OPENAT2: + if (sqe) { + ret = io_openat2_prep(req, sqe); + if (ret) + break; + } + ret = io_openat2(req, nxt, force_nonblock); + break; + case IORING_OP_EPOLL_CTL: + if (sqe) { + ret = io_epoll_ctl_prep(req, sqe); + if (ret) + break; + } + ret = io_epoll_ctl(req, nxt, force_nonblock); + break; default: ret = -EINVAL; break; @@ -3311,8 +4408,11 @@ static void io_wq_submit_work(struct io_wq_work **workptr) struct io_kiocb *nxt = NULL; int ret = 0; - if (work->flags & IO_WQ_WORK_CANCEL) + /* if NO_CANCEL is set, we must still run the work */ + if ((work->flags & (IO_WQ_WORK_CANCEL|IO_WQ_WORK_NO_CANCEL)) == + IO_WQ_WORK_CANCEL) { ret = -ECANCELED; + } if (!ret) { req->has_user = (work->flags & IO_WQ_WORK_HAS_MM) != 0; @@ -3344,26 +4444,13 @@ static void io_wq_submit_work(struct io_wq_work **workptr) io_wq_assign_next(workptr, nxt); } -static bool io_req_op_valid(int op) +static int io_req_needs_file(struct io_kiocb *req, int fd) { - return op >= IORING_OP_NOP && op < IORING_OP_LAST; -} - -static int io_req_needs_file(struct io_kiocb *req) -{ - switch (req->opcode) { - case IORING_OP_NOP: - case IORING_OP_POLL_REMOVE: - case IORING_OP_TIMEOUT: - case IORING_OP_TIMEOUT_REMOVE: - case IORING_OP_ASYNC_CANCEL: - case IORING_OP_LINK_TIMEOUT: + if (!io_op_defs[req->opcode].needs_file) return 0; - default: - if (io_req_op_valid(req->opcode)) - return 1; - return -EINVAL; - } + if (fd == -1 && io_op_defs[req->opcode].fd_non_neg) + return 0; + return 1; } static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, @@ -3371,8 +4458,8 @@ static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, { struct fixed_file_table *table; - table = &ctx->file_table[index >> IORING_FILE_TABLE_SHIFT]; - return table->files[index & IORING_FILE_TABLE_MASK]; + table = &ctx->file_data->table[index >> IORING_FILE_TABLE_SHIFT]; + return table->files[index & IORING_FILE_TABLE_MASK];; } static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, @@ -3380,20 +4467,16 @@ static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, { struct io_ring_ctx *ctx = req->ctx; unsigned flags; - int fd, ret; + int fd; flags = READ_ONCE(sqe->flags); fd = READ_ONCE(sqe->fd); - if (flags & IOSQE_IO_DRAIN) - req->flags |= REQ_F_IO_DRAIN; - - ret = io_req_needs_file(req); - if (ret <= 0) - return ret; + if (!io_req_needs_file(req, fd)) + return 0; if (flags & IOSQE_FIXED_FILE) { - if (unlikely(!ctx->file_table || + if (unlikely(!ctx->file_data || (unsigned) fd >= ctx->nr_user_files)) return -EBADF; fd = array_index_nospec(fd, ctx->nr_user_files); @@ -3401,6 +4484,7 @@ static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, if (!req->file) return -EBADF; req->flags |= REQ_F_FIXED_FILE; + percpu_ref_get(&ctx->file_data->refs); } else { if (req->needs_fixed_file) return -EBADF; @@ -3418,6 +4502,11 @@ static int io_grab_files(struct io_kiocb *req) int ret = -EBADF; struct io_ring_ctx *ctx = req->ctx; + if (req->work.files) + return 0; + if (!ctx->ring_file) + return -EBADF; + rcu_read_lock(); spin_lock_irq(&ctx->inflight_lock); /* @@ -3426,7 +4515,7 @@ static int io_grab_files(struct io_kiocb *req) * the fd has changed since we started down this path, and disallow * this operation if it has. */ - if (fcheck(req->ring_fd) == req->ring_file) { + if (fcheck(ctx->ring_fd) == ctx->ring_file) { list_add(&req->inflight_entry, &ctx->inflight_list); req->flags |= REQ_F_INFLIGHT; req->work.files = current->files; @@ -3532,7 +4621,8 @@ static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) */ if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) || (req->flags & REQ_F_MUST_PUNT))) { - if (req->work.flags & IO_WQ_WORK_NEEDS_FILES) { +punt: + if (io_op_defs[req->opcode].file_table) { ret = io_grab_files(req); if (ret) goto err; @@ -3567,6 +4657,9 @@ static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (nxt) { req = nxt; nxt = NULL; + + if (req->flags & REQ_F_FORCE_ASYNC) + goto punt; goto again; } } @@ -3575,21 +4668,27 @@ static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) { int ret; - if (unlikely(req->ctx->drain_next)) { - req->flags |= REQ_F_IO_DRAIN; - req->ctx->drain_next = false; - } - req->ctx->drain_next = (req->flags & REQ_F_DRAIN_LINK); - ret = io_req_defer(req, sqe); if (ret) { if (ret != -EIOCBQUEUED) { +fail_req: io_cqring_add_event(req, ret); req_set_fail_links(req); io_double_put_req(req); } - } else + } else if (req->flags & REQ_F_FORCE_ASYNC) { + ret = io_req_defer_prep(req, sqe); + if (unlikely(ret < 0)) + goto fail_req; + /* + * Never try inline submit of IOSQE_ASYNC is set, go straight + * to async execution. + */ + req->work.flags |= IO_WQ_WORK_CONCURRENT; + io_queue_async_work(req); + } else { __io_queue_sqe(req, sqe); + } } static inline void io_queue_link_head(struct io_kiocb *req) @@ -3602,25 +4701,47 @@ static inline void io_queue_link_head(struct io_kiocb *req) } #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \ - IOSQE_IO_HARDLINK) + IOSQE_IO_HARDLINK | IOSQE_ASYNC) static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, struct io_submit_state *state, struct io_kiocb **link) { + const struct cred *old_creds = NULL; struct io_ring_ctx *ctx = req->ctx; - int ret; + unsigned int sqe_flags; + int ret, id; + + sqe_flags = READ_ONCE(sqe->flags); /* enforce forwards compatibility on users */ - if (unlikely(sqe->flags & ~SQE_VALID_FLAGS)) { + if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) { ret = -EINVAL; goto err_req; } + id = READ_ONCE(sqe->personality); + if (id) { + const struct cred *personality_creds; + + personality_creds = idr_find(&ctx->personality_idr, id); + if (unlikely(!personality_creds)) { + ret = -EINVAL; + goto err_req; + } + old_creds = override_creds(personality_creds); + } + + /* same numerical values with corresponding REQ_F_*, safe to copy */ + req->flags |= sqe_flags & (IOSQE_IO_DRAIN|IOSQE_IO_HARDLINK| + IOSQE_ASYNC); + ret = io_req_set_file(state, req, sqe); if (unlikely(ret)) { err_req: io_cqring_add_event(req, ret); io_double_put_req(req); + if (old_creds) + revert_creds(old_creds); return false; } @@ -3632,14 +4753,19 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, * conditions are true (normal request), then just queue it. */ if (*link) { - struct io_kiocb *prev = *link; - - if (sqe->flags & IOSQE_IO_DRAIN) - (*link)->flags |= REQ_F_DRAIN_LINK | REQ_F_IO_DRAIN; - - if (sqe->flags & IOSQE_IO_HARDLINK) - req->flags |= REQ_F_HARDLINK; + struct io_kiocb *head = *link; + /* + * Taking sequential execution of a link, draining both sides + * of the link also fullfils IOSQE_IO_DRAIN semantics for all + * requests in the link. So, it drains the head and the + * next after the link request. The last one is done via + * drain_next flag to persist the effect across calls. + */ + if (sqe_flags & IOSQE_IO_DRAIN) { + head->flags |= REQ_F_IO_DRAIN; + ctx->drain_next = 1; + } if (io_alloc_async_ctx(req)) { ret = -EAGAIN; goto err_req; @@ -3648,25 +4774,36 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, ret = io_req_defer_prep(req, sqe); if (ret) { /* fail even hard links since we don't submit */ - prev->flags |= REQ_F_FAIL_LINK; + head->flags |= REQ_F_FAIL_LINK; goto err_req; } - trace_io_uring_link(ctx, req, prev); - list_add_tail(&req->link_list, &prev->link_list); - } else if (sqe->flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) { - req->flags |= REQ_F_LINK; - if (sqe->flags & IOSQE_IO_HARDLINK) - req->flags |= REQ_F_HARDLINK; + trace_io_uring_link(ctx, req, head); + list_add_tail(&req->link_list, &head->link_list); - INIT_LIST_HEAD(&req->link_list); - ret = io_req_defer_prep(req, sqe); - if (ret) - req->flags |= REQ_F_FAIL_LINK; - *link = req; + /* last request of a link, enqueue the link */ + if (!(sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK))) { + io_queue_link_head(head); + *link = NULL; + } } else { - io_queue_sqe(req, sqe); + if (unlikely(ctx->drain_next)) { + req->flags |= REQ_F_IO_DRAIN; + req->ctx->drain_next = 0; + } + if (sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) { + req->flags |= REQ_F_LINK; + INIT_LIST_HEAD(&req->link_list); + ret = io_req_defer_prep(req, sqe); + if (ret) + req->flags |= REQ_F_FAIL_LINK; + *link = req; + } else { + io_queue_sqe(req, sqe); + } } + if (old_creds) + revert_creds(old_creds); return true; } @@ -3698,14 +4835,12 @@ static void io_commit_sqring(struct io_ring_ctx *ctx) { struct io_rings *rings = ctx->rings; - if (ctx->cached_sq_head != READ_ONCE(rings->sq.head)) { - /* - * Ensure any loads from the SQEs are done at this point, - * since once we write the new head, the application could - * write new data to them. - */ - smp_store_release(&rings->sq.head, ctx->cached_sq_head); - } + /* + * Ensure any loads from the SQEs are done at this point, + * since once we write the new head, the application could + * write new data to them. + */ + smp_store_release(&rings->sq.head, ctx->cached_sq_head); } /* @@ -3719,7 +4854,6 @@ static void io_commit_sqring(struct io_ring_ctx *ctx) static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req, const struct io_uring_sqe **sqe_ptr) { - struct io_rings *rings = ctx->rings; u32 *sq_array = ctx->sq_array; unsigned head; @@ -3731,12 +4865,7 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req, * 2) allows the kernel side to track the head on its own, even * though the application is the one updating it. */ - head = ctx->cached_sq_head; - /* make sure SQ entry isn't read before tail */ - if (unlikely(head == smp_load_acquire(&rings->sq.tail))) - return false; - - head = READ_ONCE(sq_array[head & ctx->sq_mask]); + head = READ_ONCE(sq_array[ctx->cached_sq_head & ctx->sq_mask]); if (likely(head < ctx->sq_entries)) { /* * All io need record the previous position, if LINK vs DARIN, @@ -3754,7 +4883,7 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req, /* drop invalid entries */ ctx->cached_sq_head++; ctx->cached_sq_dropped++; - WRITE_ONCE(rings->sq_dropped, ctx->cached_sq_dropped); + WRITE_ONCE(ctx->rings->sq_dropped, ctx->cached_sq_dropped); return false; } @@ -3768,19 +4897,29 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, bool mm_fault = false; /* if we have a backlog and couldn't flush it all, return BUSY */ - if (!list_empty(&ctx->cq_overflow_list) && - !io_cqring_overflow_flush(ctx, false)) - return -EBUSY; + if (test_bit(0, &ctx->sq_check_overflow)) { + if (!list_empty(&ctx->cq_overflow_list) && + !io_cqring_overflow_flush(ctx, false)) + return -EBUSY; + } + + /* make sure SQ entry isn't read before tail */ + nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx)); + + if (!percpu_ref_tryget_many(&ctx->refs, nr)) + return -EAGAIN; if (nr > IO_PLUG_THRESHOLD) { io_submit_state_start(&state, nr); statep = &state; } + ctx->ring_fd = ring_fd; + ctx->ring_file = ring_file; + for (i = 0; i < nr; i++) { const struct io_uring_sqe *sqe; struct io_kiocb *req; - unsigned int sqe_flags; req = io_get_req(ctx, statep); if (unlikely(!req)) { @@ -3789,11 +4928,20 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, break; } if (!io_get_sqring(ctx, req, &sqe)) { - __io_free_req(req); + __io_req_do_free(req); break; } - if (io_req_needs_user(req) && !*mm) { + /* will complete beyond this point, count as submitted */ + submitted++; + + if (unlikely(req->opcode >= IORING_OP_LAST)) { + io_cqring_add_event(req, -EINVAL); + io_double_put_req(req); + break; + } + + if (io_op_defs[req->opcode].needs_mm && !*mm) { mm_fault = mm_fault || !mmget_not_zero(ctx->sqo_mm); if (!mm_fault) { use_mm(ctx->sqo_mm); @@ -3801,27 +4949,20 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, } } - submitted++; - sqe_flags = sqe->flags; - - req->ring_file = ring_file; - req->ring_fd = ring_fd; req->has_user = *mm != NULL; req->in_async = async; req->needs_fixed_file = async; - trace_io_uring_submit_sqe(ctx, req->user_data, true, async); + trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data, + true, async); if (!io_submit_sqe(req, sqe, statep, &link)) break; - /* - * If previous wasn't linked and we have a linked command, - * that's the end of the chain. Submit the previous link. - */ - if (!(sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) && link) { - io_queue_link_head(link); - link = NULL; - } } + if (unlikely(submitted != nr)) { + int ref_used = (submitted == -EAGAIN) ? 0 : submitted; + + percpu_ref_put_many(&ctx->refs, nr - ref_used); + } if (link) io_queue_link_head(link); if (statep) @@ -3944,7 +5085,6 @@ static int io_sq_thread(void *data) ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP; } - to_submit = min(to_submit, ctx->sq_entries); mutex_lock(&ctx->uring_lock); ret = io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm, true); mutex_unlock(&ctx->uring_lock); @@ -4075,19 +5215,40 @@ static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) #endif } +static void io_file_ref_kill(struct percpu_ref *ref) +{ + struct fixed_file_data *data; + + data = container_of(ref, struct fixed_file_data, refs); + complete(&data->done); +} + static int io_sqe_files_unregister(struct io_ring_ctx *ctx) { + struct fixed_file_data *data = ctx->file_data; unsigned nr_tables, i; - if (!ctx->file_table) + if (!data) return -ENXIO; + /* protect against inflight atomic switch, which drops the ref */ + percpu_ref_get(&data->refs); + /* wait for existing switches */ + flush_work(&data->ref_work); + percpu_ref_kill_and_confirm(&data->refs, io_file_ref_kill); + wait_for_completion(&data->done); + percpu_ref_put(&data->refs); + /* flush potential new switch */ + flush_work(&data->ref_work); + percpu_ref_exit(&data->refs); + __io_sqe_files_unregister(ctx); nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE); for (i = 0; i < nr_tables; i++) - kfree(ctx->file_table[i].files); - kfree(ctx->file_table); - ctx->file_table = NULL; + kfree(data->table[i].files); + kfree(data->table); + kfree(data); + ctx->file_data = NULL; ctx->nr_user_files = 0; return 0; } @@ -4118,16 +5279,6 @@ static void io_finish_async(struct io_ring_ctx *ctx) } #if defined(CONFIG_UNIX) -static void io_destruct_skb(struct sk_buff *skb) -{ - struct io_ring_ctx *ctx = skb->sk->sk_user_data; - - if (ctx->io_wq) - io_wq_flush(ctx->io_wq); - - unix_destruct_scm(skb); -} - /* * Ensure the UNIX gc is aware of our file set, so we are certain that * the io_uring can be safely unregistered on process exit, even if we have @@ -4175,7 +5326,7 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset) fpl->max = SCM_MAX_FD; fpl->count = nr_files; UNIXCB(skb).fp = fpl; - skb->destructor = io_destruct_skb; + skb->destructor = unix_destruct_scm; refcount_add(skb->truesize, &sk->sk_wmem_alloc); skb_queue_head(&sk->sk_receive_queue, skb); @@ -4237,7 +5388,7 @@ static int io_sqe_alloc_file_tables(struct io_ring_ctx *ctx, unsigned nr_tables, int i; for (i = 0; i < nr_tables; i++) { - struct fixed_file_table *table = &ctx->file_table[i]; + struct fixed_file_table *table = &ctx->file_data->table[i]; unsigned this_files; this_files = min(nr_files, IORING_MAX_FILES_TABLE); @@ -4252,101 +5403,15 @@ static int io_sqe_alloc_file_tables(struct io_ring_ctx *ctx, unsigned nr_tables, return 0; for (i = 0; i < nr_tables; i++) { - struct fixed_file_table *table = &ctx->file_table[i]; + struct fixed_file_table *table = &ctx->file_data->table[i]; kfree(table->files); } return 1; } -static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, - unsigned nr_args) -{ - __s32 __user *fds = (__s32 __user *) arg; - unsigned nr_tables; - int fd, ret = 0; - unsigned i; - - if (ctx->file_table) - return -EBUSY; - if (!nr_args) - return -EINVAL; - if (nr_args > IORING_MAX_FIXED_FILES) - return -EMFILE; - - nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE); - ctx->file_table = kcalloc(nr_tables, sizeof(struct fixed_file_table), - GFP_KERNEL); - if (!ctx->file_table) - return -ENOMEM; - - if (io_sqe_alloc_file_tables(ctx, nr_tables, nr_args)) { - kfree(ctx->file_table); - ctx->file_table = NULL; - return -ENOMEM; - } - - for (i = 0; i < nr_args; i++, ctx->nr_user_files++) { - struct fixed_file_table *table; - unsigned index; - - ret = -EFAULT; - if (copy_from_user(&fd, &fds[i], sizeof(fd))) - break; - /* allow sparse sets */ - if (fd == -1) { - ret = 0; - continue; - } - - table = &ctx->file_table[i >> IORING_FILE_TABLE_SHIFT]; - index = i & IORING_FILE_TABLE_MASK; - table->files[index] = fget(fd); - - ret = -EBADF; - if (!table->files[index]) - break; - /* - * Don't allow io_uring instances to be registered. If UNIX - * isn't enabled, then this causes a reference cycle and this - * instance can never get freed. If UNIX is enabled we'll - * handle it just fine, but there's still no point in allowing - * a ring fd as it doesn't support regular read/write anyway. - */ - if (table->files[index]->f_op == &io_uring_fops) { - fput(table->files[index]); - break; - } - ret = 0; - } - - if (ret) { - for (i = 0; i < ctx->nr_user_files; i++) { - struct file *file; - - file = io_file_from_index(ctx, i); - if (file) - fput(file); - } - for (i = 0; i < nr_tables; i++) - kfree(ctx->file_table[i].files); - - kfree(ctx->file_table); - ctx->file_table = NULL; - ctx->nr_user_files = 0; - return ret; - } - - ret = io_sqe_files_scm(ctx); - if (ret) - io_sqe_files_unregister(ctx); - - return ret; -} - -static void io_sqe_file_unregister(struct io_ring_ctx *ctx, int index) +static void io_ring_file_put(struct io_ring_ctx *ctx, struct file *file) { #if defined(CONFIG_UNIX) - struct file *file = io_file_from_index(ctx, index); struct sock *sock = ctx->ring_sock->sk; struct sk_buff_head list, *head = &sock->sk_receive_queue; struct sk_buff *skb; @@ -4402,10 +5467,157 @@ static void io_sqe_file_unregister(struct io_ring_ctx *ctx, int index) spin_unlock_irq(&head->lock); } #else - fput(io_file_from_index(ctx, index)); + fput(file); #endif } +struct io_file_put { + struct llist_node llist; + struct file *file; + struct completion *done; +}; + +static void io_ring_file_ref_switch(struct work_struct *work) +{ + struct io_file_put *pfile, *tmp; + struct fixed_file_data *data; + struct llist_node *node; + + data = container_of(work, struct fixed_file_data, ref_work); + + while ((node = llist_del_all(&data->put_llist)) != NULL) { + llist_for_each_entry_safe(pfile, tmp, node, llist) { + io_ring_file_put(data->ctx, pfile->file); + if (pfile->done) + complete(pfile->done); + else + kfree(pfile); + } + } + + percpu_ref_get(&data->refs); + percpu_ref_switch_to_percpu(&data->refs); +} + +static void io_file_data_ref_zero(struct percpu_ref *ref) +{ + struct fixed_file_data *data; + + data = container_of(ref, struct fixed_file_data, refs); + + /* we can't safely switch from inside this context, punt to wq */ + queue_work(system_wq, &data->ref_work); +} + +static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, + unsigned nr_args) +{ + __s32 __user *fds = (__s32 __user *) arg; + unsigned nr_tables; + struct file *file; + int fd, ret = 0; + unsigned i; + + if (ctx->file_data) + return -EBUSY; + if (!nr_args) + return -EINVAL; + if (nr_args > IORING_MAX_FIXED_FILES) + return -EMFILE; + + ctx->file_data = kzalloc(sizeof(*ctx->file_data), GFP_KERNEL); + if (!ctx->file_data) + return -ENOMEM; + ctx->file_data->ctx = ctx; + init_completion(&ctx->file_data->done); + + nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE); + ctx->file_data->table = kcalloc(nr_tables, + sizeof(struct fixed_file_table), + GFP_KERNEL); + if (!ctx->file_data->table) { + kfree(ctx->file_data); + ctx->file_data = NULL; + return -ENOMEM; + } + + if (percpu_ref_init(&ctx->file_data->refs, io_file_data_ref_zero, + PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) { + kfree(ctx->file_data->table); + kfree(ctx->file_data); + ctx->file_data = NULL; + return -ENOMEM; + } + ctx->file_data->put_llist.first = NULL; + INIT_WORK(&ctx->file_data->ref_work, io_ring_file_ref_switch); + + if (io_sqe_alloc_file_tables(ctx, nr_tables, nr_args)) { + percpu_ref_exit(&ctx->file_data->refs); + kfree(ctx->file_data->table); + kfree(ctx->file_data); + ctx->file_data = NULL; + return -ENOMEM; + } + + for (i = 0; i < nr_args; i++, ctx->nr_user_files++) { + struct fixed_file_table *table; + unsigned index; + + ret = -EFAULT; + if (copy_from_user(&fd, &fds[i], sizeof(fd))) + break; + /* allow sparse sets */ + if (fd == -1) { + ret = 0; + continue; + } + + table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT]; + index = i & IORING_FILE_TABLE_MASK; + file = fget(fd); + + ret = -EBADF; + if (!file) + break; + + /* + * Don't allow io_uring instances to be registered. If UNIX + * isn't enabled, then this causes a reference cycle and this + * instance can never get freed. If UNIX is enabled we'll + * handle it just fine, but there's still no point in allowing + * a ring fd as it doesn't support regular read/write anyway. + */ + if (file->f_op == &io_uring_fops) { + fput(file); + break; + } + ret = 0; + table->files[index] = file; + } + + if (ret) { + for (i = 0; i < ctx->nr_user_files; i++) { + file = io_file_from_index(ctx, i); + if (file) + fput(file); + } + for (i = 0; i < nr_tables; i++) + kfree(ctx->file_data->table[i].files); + + kfree(ctx->file_data->table); + kfree(ctx->file_data); + ctx->file_data = NULL; + ctx->nr_user_files = 0; + return ret; + } + + ret = io_sqe_files_scm(ctx); + if (ret) + io_sqe_files_unregister(ctx); + + return ret; +} + static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file, int index) { @@ -4449,29 +5661,65 @@ static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file, #endif } -static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg, - unsigned nr_args) +static void io_atomic_switch(struct percpu_ref *ref) { - struct io_uring_files_update up; + struct fixed_file_data *data; + + data = container_of(ref, struct fixed_file_data, refs); + clear_bit(FFD_F_ATOMIC, &data->state); +} + +static bool io_queue_file_removal(struct fixed_file_data *data, + struct file *file) +{ + struct io_file_put *pfile, pfile_stack; + DECLARE_COMPLETION_ONSTACK(done); + + /* + * If we fail allocating the struct we need for doing async reomval + * of this file, just punt to sync and wait for it. + */ + pfile = kzalloc(sizeof(*pfile), GFP_KERNEL); + if (!pfile) { + pfile = &pfile_stack; + pfile->done = &done; + } + + pfile->file = file; + llist_add(&pfile->llist, &data->put_llist); + + if (pfile == &pfile_stack) { + if (!test_and_set_bit(FFD_F_ATOMIC, &data->state)) { + percpu_ref_put(&data->refs); + percpu_ref_switch_to_atomic(&data->refs, + io_atomic_switch); + } + wait_for_completion(&done); + flush_work(&data->ref_work); + return false; + } + + return true; +} + +static int __io_sqe_files_update(struct io_ring_ctx *ctx, + struct io_uring_files_update *up, + unsigned nr_args) +{ + struct fixed_file_data *data = ctx->file_data; + bool ref_switch = false; + struct file *file; __s32 __user *fds; int fd, i, err; __u32 done; - if (!ctx->file_table) - return -ENXIO; - if (!nr_args) - return -EINVAL; - if (copy_from_user(&up, arg, sizeof(up))) - return -EFAULT; - if (up.resv) - return -EINVAL; - if (check_add_overflow(up.offset, nr_args, &done)) + if (check_add_overflow(up->offset, nr_args, &done)) return -EOVERFLOW; if (done > ctx->nr_user_files) return -EINVAL; done = 0; - fds = u64_to_user_ptr(up.fds); + fds = u64_to_user_ptr(up->fds); while (nr_args) { struct fixed_file_table *table; unsigned index; @@ -4481,16 +5729,16 @@ static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg, err = -EFAULT; break; } - i = array_index_nospec(up.offset, ctx->nr_user_files); - table = &ctx->file_table[i >> IORING_FILE_TABLE_SHIFT]; + i = array_index_nospec(up->offset, ctx->nr_user_files); + table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT]; index = i & IORING_FILE_TABLE_MASK; if (table->files[index]) { - io_sqe_file_unregister(ctx, i); + file = io_file_from_index(ctx, index); table->files[index] = NULL; + if (io_queue_file_removal(data, file)) + ref_switch = true; } if (fd != -1) { - struct file *file; - file = fget(fd); if (!file) { err = -EBADF; @@ -4516,11 +5764,32 @@ static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg, } nr_args--; done++; - up.offset++; + up->offset++; + } + + if (ref_switch && !test_and_set_bit(FFD_F_ATOMIC, &data->state)) { + percpu_ref_put(&data->refs); + percpu_ref_switch_to_atomic(&data->refs, io_atomic_switch); } return done ? done : err; } +static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg, + unsigned nr_args) +{ + struct io_uring_files_update up; + + if (!ctx->file_data) + return -ENXIO; + if (!nr_args) + return -EINVAL; + if (copy_from_user(&up, arg, sizeof(up))) + return -EFAULT; + if (up.resv) + return -EINVAL; + + return __io_sqe_files_update(ctx, &up, nr_args); +} static void io_put_work(struct io_wq_work *work) { @@ -4536,11 +5805,56 @@ static void io_get_work(struct io_wq_work *work) refcount_inc(&req->refs); } +static int io_init_wq_offload(struct io_ring_ctx *ctx, + struct io_uring_params *p) +{ + struct io_wq_data data; + struct fd f; + struct io_ring_ctx *ctx_attach; + unsigned int concurrency; + int ret = 0; + + data.user = ctx->user; + data.get_work = io_get_work; + data.put_work = io_put_work; + + if (!(p->flags & IORING_SETUP_ATTACH_WQ)) { + /* Do QD, or 4 * CPUS, whatever is smallest */ + concurrency = min(ctx->sq_entries, 4 * num_online_cpus()); + + ctx->io_wq = io_wq_create(concurrency, &data); + if (IS_ERR(ctx->io_wq)) { + ret = PTR_ERR(ctx->io_wq); + ctx->io_wq = NULL; + } + return ret; + } + + f = fdget(p->wq_fd); + if (!f.file) + return -EBADF; + + if (f.file->f_op != &io_uring_fops) { + ret = -EINVAL; + goto out_fput; + } + + ctx_attach = f.file->private_data; + /* @io_wq is protected by holding the fd */ + if (!io_wq_get(ctx_attach->io_wq, &data)) { + ret = -EINVAL; + goto out_fput; + } + + ctx->io_wq = ctx_attach->io_wq; +out_fput: + fdput(f); + return ret; +} + static int io_sq_offload_start(struct io_ring_ctx *ctx, struct io_uring_params *p) { - struct io_wq_data data; - unsigned concurrency; int ret; init_waitqueue_head(&ctx->sqo_wait); @@ -4584,20 +5898,9 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, goto err; } - data.mm = ctx->sqo_mm; - data.user = ctx->user; - data.creds = ctx->creds; - data.get_work = io_get_work; - data.put_work = io_put_work; - - /* Do QD, or 4 * CPUS, whatever is smallest */ - concurrency = min(ctx->sq_entries, 4 * num_online_cpus()); - ctx->io_wq = io_wq_create(concurrency, &data); - if (IS_ERR(ctx->io_wq)) { - ret = PTR_ERR(ctx->io_wq); - ctx->io_wq = NULL; + ret = io_init_wq_offload(ctx, p); + if (ret) goto err; - } return 0; err: @@ -4975,6 +6278,17 @@ static int io_uring_fasync(int fd, struct file *file, int on) return fasync_helper(fd, file, on, &ctx->cq_fasync); } +static int io_remove_personalities(int id, void *p, void *data) +{ + struct io_ring_ctx *ctx = data; + const struct cred *cred; + + cred = idr_remove(&ctx->personality_idr, id); + if (cred) + put_cred(cred); + return 0; +} + static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) { mutex_lock(&ctx->uring_lock); @@ -4991,6 +6305,7 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) /* if we failed setting up the ctx, we might not have any rings */ if (ctx->rings) io_cqring_overflow_flush(ctx, true); + idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx); wait_for_completion(&ctx->completions[0]); io_ring_ctx_free(ctx); } @@ -5157,7 +6472,6 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, } else if (to_submit) { struct mm_struct *cur_mm; - to_submit = min(to_submit, ctx->sq_entries); mutex_lock(&ctx->uring_lock); /* already have mm, so io_submit_sqes() won't try to grab it */ cur_mm = ctx->sqo_mm; @@ -5273,7 +6587,6 @@ static int io_uring_get_fd(struct io_ring_ctx *ctx) #if defined(CONFIG_UNIX) ctx->ring_sock->file = file; - ctx->ring_sock->sk->sk_user_data = ctx; #endif fd_install(ret, file); return ret; @@ -5292,8 +6605,13 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) bool account_mem; int ret; - if (!entries || entries > IORING_MAX_ENTRIES) + if (!entries) return -EINVAL; + if (entries > IORING_MAX_ENTRIES) { + if (!(p->flags & IORING_SETUP_CLAMP)) + return -EINVAL; + entries = IORING_MAX_ENTRIES; + } /* * Use twice as many entries for the CQ ring. It's possible for the @@ -5310,8 +6628,13 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) * to a power-of-two, if it isn't already. We do NOT impose * any cq vs sq ring sizing. */ - if (p->cq_entries < p->sq_entries || p->cq_entries > IORING_MAX_CQ_ENTRIES) + if (p->cq_entries < p->sq_entries) return -EINVAL; + if (p->cq_entries > IORING_MAX_CQ_ENTRIES) { + if (!(p->flags & IORING_SETUP_CLAMP)) + return -EINVAL; + p->cq_entries = IORING_MAX_CQ_ENTRIES; + } p->cq_entries = roundup_pow_of_two(p->cq_entries); } else { p->cq_entries = 2 * p->sq_entries; @@ -5376,7 +6699,8 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) goto err; p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | - IORING_FEAT_SUBMIT_STABLE; + IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | + IORING_FEAT_CUR_PERSONALITY; trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags); return ret; err: @@ -5403,7 +6727,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) } if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL | - IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE)) + IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE | + IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ)) return -EINVAL; ret = io_uring_create(entries, &p); @@ -5422,6 +6747,84 @@ SYSCALL_DEFINE2(io_uring_setup, u32, entries, return io_uring_setup(entries, params); } +static int io_probe(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) +{ + struct io_uring_probe *p; + size_t size; + int i, ret; + + size = struct_size(p, ops, nr_args); + if (size == SIZE_MAX) + return -EOVERFLOW; + p = kzalloc(size, GFP_KERNEL); + if (!p) + return -ENOMEM; + + ret = -EFAULT; + if (copy_from_user(p, arg, size)) + goto out; + ret = -EINVAL; + if (memchr_inv(p, 0, size)) + goto out; + + p->last_op = IORING_OP_LAST - 1; + if (nr_args > IORING_OP_LAST) + nr_args = IORING_OP_LAST; + + for (i = 0; i < nr_args; i++) { + p->ops[i].op = i; + if (!io_op_defs[i].not_supported) + p->ops[i].flags = IO_URING_OP_SUPPORTED; + } + p->ops_len = i; + + ret = 0; + if (copy_to_user(arg, p, size)) + ret = -EFAULT; +out: + kfree(p); + return ret; +} + +static int io_register_personality(struct io_ring_ctx *ctx) +{ + const struct cred *creds = get_current_cred(); + int id; + + id = idr_alloc_cyclic(&ctx->personality_idr, (void *) creds, 1, + USHRT_MAX, GFP_KERNEL); + if (id < 0) + put_cred(creds); + return id; +} + +static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id) +{ + const struct cred *old_creds; + + old_creds = idr_remove(&ctx->personality_idr, id); + if (old_creds) { + put_cred(old_creds); + return 0; + } + + return -EINVAL; +} + +static bool io_register_op_must_quiesce(int op) +{ + switch (op) { + case IORING_UNREGISTER_FILES: + case IORING_REGISTER_FILES_UPDATE: + case IORING_REGISTER_PROBE: + case IORING_REGISTER_PERSONALITY: + case IORING_UNREGISTER_PERSONALITY: + return false; + default: + return true; + } +} + static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, void __user *arg, unsigned nr_args) __releases(ctx->uring_lock) @@ -5437,18 +6840,26 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, if (percpu_ref_is_dying(&ctx->refs)) return -ENXIO; - percpu_ref_kill(&ctx->refs); + if (io_register_op_must_quiesce(opcode)) { + percpu_ref_kill(&ctx->refs); - /* - * Drop uring mutex before waiting for references to exit. If another - * thread is currently inside io_uring_enter() it might need to grab - * the uring_lock to make progress. If we hold it here across the drain - * wait, then we can deadlock. It's safe to drop the mutex here, since - * no new references will come in after we've killed the percpu ref. - */ - mutex_unlock(&ctx->uring_lock); - wait_for_completion(&ctx->completions[0]); - mutex_lock(&ctx->uring_lock); + /* + * Drop uring mutex before waiting for references to exit. If + * another thread is currently inside io_uring_enter() it might + * need to grab the uring_lock to make progress. If we hold it + * here across the drain wait, then we can deadlock. It's safe + * to drop the mutex here, since no new references will come in + * after we've killed the percpu ref. + */ + mutex_unlock(&ctx->uring_lock); + ret = wait_for_completion_interruptible(&ctx->completions[0]); + mutex_lock(&ctx->uring_lock); + if (ret) { + percpu_ref_resurrect(&ctx->refs); + ret = -EINTR; + goto out; + } + } switch (opcode) { case IORING_REGISTER_BUFFERS: @@ -5473,10 +6884,17 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, ret = io_sqe_files_update(ctx, arg, nr_args); break; case IORING_REGISTER_EVENTFD: + case IORING_REGISTER_EVENTFD_ASYNC: ret = -EINVAL; if (nr_args != 1) break; ret = io_eventfd_register(ctx, arg); + if (ret) + break; + if (opcode == IORING_REGISTER_EVENTFD_ASYNC) + ctx->eventfd_async = 1; + else + ctx->eventfd_async = 0; break; case IORING_UNREGISTER_EVENTFD: ret = -EINVAL; @@ -5484,14 +6902,35 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_eventfd_unregister(ctx); break; + case IORING_REGISTER_PROBE: + ret = -EINVAL; + if (!arg || nr_args > 256) + break; + ret = io_probe(ctx, arg, nr_args); + break; + case IORING_REGISTER_PERSONALITY: + ret = -EINVAL; + if (arg || nr_args) + break; + ret = io_register_personality(ctx); + break; + case IORING_UNREGISTER_PERSONALITY: + ret = -EINVAL; + if (arg) + break; + ret = io_unregister_personality(ctx, nr_args); + break; default: ret = -EINVAL; break; } - /* bring the ctx back to life */ - reinit_completion(&ctx->completions[0]); - percpu_ref_reinit(&ctx->refs); + if (io_register_op_must_quiesce(opcode)) { + /* bring the ctx back to life */ + percpu_ref_reinit(&ctx->refs); +out: + reinit_completion(&ctx->completions[0]); + } return ret; } @@ -5524,6 +6963,7 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, static int __init io_uring_init(void) { + BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST); req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC); return 0; }; diff --git a/fs/open.c b/fs/open.c index 8de17281ac52..3d633d24015d 100644 --- a/fs/open.c +++ b/fs/open.c @@ -970,7 +970,7 @@ EXPORT_SYMBOL(open_with_fake_path); #define WILL_CREATE(flags) (flags & (O_CREAT | __O_TMPFILE)) #define O_PATH_FLAGS (O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC) -static inline struct open_how build_open_how(int flags, umode_t mode) +inline struct open_how build_open_how(int flags, umode_t mode) { struct open_how how = { .flags = flags & VALID_OPEN_FLAGS, @@ -986,8 +986,7 @@ static inline struct open_how build_open_how(int flags, umode_t mode) return how; } -static inline int build_open_flags(const struct open_how *how, - struct open_flags *op) +inline int build_open_flags(const struct open_how *how, struct open_flags *op) { int flags = how->flags; int lookup_flags = 0; diff --git a/fs/stat.c b/fs/stat.c index c38e4c2e1221..030008796479 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -21,6 +21,8 @@ #include #include +#include "internal.h" + /** * generic_fillattr - Fill in the basic attributes from the inode struct * @inode: Inode to use as the source @@ -150,6 +152,23 @@ int vfs_statx_fd(unsigned int fd, struct kstat *stat, } EXPORT_SYMBOL(vfs_statx_fd); +inline unsigned vfs_stat_set_lookup_flags(unsigned *lookup_flags, int flags) +{ + if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT | + AT_EMPTY_PATH | KSTAT_QUERY_FLAGS)) != 0) + return -EINVAL; + + *lookup_flags = LOOKUP_FOLLOW | LOOKUP_AUTOMOUNT; + if (flags & AT_SYMLINK_NOFOLLOW) + *lookup_flags &= ~LOOKUP_FOLLOW; + if (flags & AT_NO_AUTOMOUNT) + *lookup_flags &= ~LOOKUP_AUTOMOUNT; + if (flags & AT_EMPTY_PATH) + *lookup_flags |= LOOKUP_EMPTY; + + return 0; +} + /** * vfs_statx - Get basic and extra attributes by filename * @dfd: A file descriptor representing the base dir for a relative filename @@ -170,19 +189,10 @@ int vfs_statx(int dfd, const char __user *filename, int flags, { struct path path; int error = -EINVAL; - unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_AUTOMOUNT; + unsigned lookup_flags; - if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT | - AT_EMPTY_PATH | KSTAT_QUERY_FLAGS)) != 0) + if (vfs_stat_set_lookup_flags(&lookup_flags, flags)) return -EINVAL; - - if (flags & AT_SYMLINK_NOFOLLOW) - lookup_flags &= ~LOOKUP_FOLLOW; - if (flags & AT_NO_AUTOMOUNT) - lookup_flags &= ~LOOKUP_AUTOMOUNT; - if (flags & AT_EMPTY_PATH) - lookup_flags |= LOOKUP_EMPTY; - retry: error = user_path_at(dfd, filename, lookup_flags, &path); if (error) @@ -523,7 +533,7 @@ SYSCALL_DEFINE4(fstatat64, int, dfd, const char __user *, filename, } #endif /* __ARCH_WANT_STAT64 || __ARCH_WANT_COMPAT_STAT64 */ -static noinline_for_stack int +noinline_for_stack int cp_statx(const struct kstat *stat, struct statx __user *buffer) { struct statx tmp; diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h index bc6d79b00c4e..8f000fada5a4 100644 --- a/include/linux/eventpoll.h +++ b/include/linux/eventpoll.h @@ -61,6 +61,15 @@ static inline void eventpoll_release(struct file *file) eventpoll_release_file(file); } +int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, + bool nonblock); + +/* Tells if the epoll_ctl(2) operation needs an event copy from userspace */ +static inline int ep_op_has_event(int op) +{ + return op != EPOLL_CTL_DEL; +} + #else static inline void eventpoll_init_file(struct file *file) {} diff --git a/include/linux/file.h b/include/linux/file.h index 3fcddff56bc4..c6c7b24ea9f7 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -16,6 +16,7 @@ extern void fput(struct file *); extern void fput_many(struct file *, unsigned int); struct file_operations; +struct task_struct; struct vfsmount; struct dentry; struct inode; @@ -47,6 +48,7 @@ static inline void fdput(struct fd fd) extern struct file *fget(unsigned int fd); extern struct file *fget_many(unsigned int fd, unsigned int refs); extern struct file *fget_raw(unsigned int fd); +extern struct file *fget_task(struct task_struct *task, unsigned int fd); extern unsigned long __fdget(unsigned int fd); extern unsigned long __fdget_raw(unsigned int fd); extern unsigned long __fdget_pos(unsigned int fd); diff --git a/include/linux/mm.h b/include/linux/mm.h index 975ca39c9bc1..1dba485186b0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2324,6 +2324,7 @@ extern int __do_munmap(struct mm_struct *, unsigned long, size_t, struct list_head *uf, bool downgrade); extern int do_munmap(struct mm_struct *, unsigned long, size_t, struct list_head *uf); +extern int do_madvise(unsigned long start, size_t len_in, int behavior); static inline unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 9ddfb12308e4..e0b569da4764 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -497,7 +497,7 @@ struct mm_struct { /* store ref to file /proc//exe symlink points to */ struct file __rcu *exe_file; #ifdef CONFIG_MMU_NOTIFIER - struct mmu_notifier_mm *mmu_notifier_mm; + struct mmu_notifier_subscriptions *notifier_subscriptions; #endif #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS pgtable_t pmd_huge_pte; /* protected by page_table_lock */ diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 9e6caa8ecd19..736f6918335e 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -8,7 +8,7 @@ #include #include -struct mmu_notifier_mm; +struct mmu_notifier_subscriptions; struct mmu_notifier; struct mmu_notifier_range; struct mmu_interval_notifier; @@ -73,7 +73,7 @@ struct mmu_notifier_ops { * through the gart alias address, so leading to memory * corruption. */ - void (*release)(struct mmu_notifier *mn, + void (*release)(struct mmu_notifier *subscription, struct mm_struct *mm); /* @@ -85,7 +85,7 @@ struct mmu_notifier_ops { * Start-end is necessary in case the secondary MMU is mapping the page * at a smaller granularity than the primary MMU. */ - int (*clear_flush_young)(struct mmu_notifier *mn, + int (*clear_flush_young)(struct mmu_notifier *subscription, struct mm_struct *mm, unsigned long start, unsigned long end); @@ -95,7 +95,7 @@ struct mmu_notifier_ops { * latter, it is supposed to test-and-clear the young/accessed bitflag * in the secondary pte, but it may omit flushing the secondary tlb. */ - int (*clear_young)(struct mmu_notifier *mn, + int (*clear_young)(struct mmu_notifier *subscription, struct mm_struct *mm, unsigned long start, unsigned long end); @@ -106,7 +106,7 @@ struct mmu_notifier_ops { * frequently used without actually clearing the flag or tearing * down the secondary mapping on the page. */ - int (*test_young)(struct mmu_notifier *mn, + int (*test_young)(struct mmu_notifier *subscription, struct mm_struct *mm, unsigned long address); @@ -114,7 +114,7 @@ struct mmu_notifier_ops { * change_pte is called in cases that pte mapping to page is changed: * for example, when ksm remaps pte to point to a new shared page. */ - void (*change_pte)(struct mmu_notifier *mn, + void (*change_pte)(struct mmu_notifier *subscription, struct mm_struct *mm, unsigned long address, pte_t pte); @@ -169,9 +169,9 @@ struct mmu_notifier_ops { * invalidate_range_end. * */ - int (*invalidate_range_start)(struct mmu_notifier *mn, + int (*invalidate_range_start)(struct mmu_notifier *subscription, const struct mmu_notifier_range *range); - void (*invalidate_range_end)(struct mmu_notifier *mn, + void (*invalidate_range_end)(struct mmu_notifier *subscription, const struct mmu_notifier_range *range); /* @@ -192,8 +192,10 @@ struct mmu_notifier_ops { * of what was passed to invalidate_range_start()/end(), if * called between those functions. */ - void (*invalidate_range)(struct mmu_notifier *mn, struct mm_struct *mm, - unsigned long start, unsigned long end); + void (*invalidate_range)(struct mmu_notifier *subscription, + struct mm_struct *mm, + unsigned long start, + unsigned long end); /* * These callbacks are used with the get/put interface to manage the @@ -206,7 +208,7 @@ struct mmu_notifier_ops { * and cannot sleep. */ struct mmu_notifier *(*alloc_notifier)(struct mm_struct *mm); - void (*free_notifier)(struct mmu_notifier *mn); + void (*free_notifier)(struct mmu_notifier *subscription); }; /* @@ -235,7 +237,7 @@ struct mmu_notifier { * was required but mmu_notifier_range_blockable(range) is false. */ struct mmu_interval_notifier_ops { - bool (*invalidate)(struct mmu_interval_notifier *mni, + bool (*invalidate)(struct mmu_interval_notifier *interval_sub, const struct mmu_notifier_range *range, unsigned long cur_seq); }; @@ -265,7 +267,7 @@ struct mmu_notifier_range { static inline int mm_has_notifiers(struct mm_struct *mm) { - return unlikely(mm->mmu_notifier_mm); + return unlikely(mm->notifier_subscriptions); } struct mmu_notifier *mmu_notifier_get_locked(const struct mmu_notifier_ops *ops, @@ -280,30 +282,31 @@ mmu_notifier_get(const struct mmu_notifier_ops *ops, struct mm_struct *mm) up_write(&mm->mmap_sem); return ret; } -void mmu_notifier_put(struct mmu_notifier *mn); +void mmu_notifier_put(struct mmu_notifier *subscription); void mmu_notifier_synchronize(void); -extern int mmu_notifier_register(struct mmu_notifier *mn, +extern int mmu_notifier_register(struct mmu_notifier *subscription, struct mm_struct *mm); -extern int __mmu_notifier_register(struct mmu_notifier *mn, +extern int __mmu_notifier_register(struct mmu_notifier *subscription, struct mm_struct *mm); -extern void mmu_notifier_unregister(struct mmu_notifier *mn, +extern void mmu_notifier_unregister(struct mmu_notifier *subscription, struct mm_struct *mm); -unsigned long mmu_interval_read_begin(struct mmu_interval_notifier *mni); -int mmu_interval_notifier_insert(struct mmu_interval_notifier *mni, +unsigned long +mmu_interval_read_begin(struct mmu_interval_notifier *interval_sub); +int mmu_interval_notifier_insert(struct mmu_interval_notifier *interval_sub, struct mm_struct *mm, unsigned long start, unsigned long length, const struct mmu_interval_notifier_ops *ops); int mmu_interval_notifier_insert_locked( - struct mmu_interval_notifier *mni, struct mm_struct *mm, + struct mmu_interval_notifier *interval_sub, struct mm_struct *mm, unsigned long start, unsigned long length, const struct mmu_interval_notifier_ops *ops); -void mmu_interval_notifier_remove(struct mmu_interval_notifier *mni); +void mmu_interval_notifier_remove(struct mmu_interval_notifier *interval_sub); /** * mmu_interval_set_seq - Save the invalidation sequence - * @mni - The mni passed to invalidate + * @interval_sub - The subscription passed to invalidate * @cur_seq - The cur_seq passed to the invalidate() callback * * This must be called unconditionally from the invalidate callback of a @@ -314,15 +317,16 @@ void mmu_interval_notifier_remove(struct mmu_interval_notifier *mni); * If the caller does not call mmu_interval_read_begin() or * mmu_interval_read_retry() then this call is not required. */ -static inline void mmu_interval_set_seq(struct mmu_interval_notifier *mni, - unsigned long cur_seq) +static inline void +mmu_interval_set_seq(struct mmu_interval_notifier *interval_sub, + unsigned long cur_seq) { - WRITE_ONCE(mni->invalidate_seq, cur_seq); + WRITE_ONCE(interval_sub->invalidate_seq, cur_seq); } /** * mmu_interval_read_retry - End a read side critical section against a VA range - * mni: The range + * interval_sub: The subscription * seq: The return of the paired mmu_interval_read_begin() * * This MUST be called under a user provided lock that is also held @@ -334,15 +338,16 @@ static inline void mmu_interval_set_seq(struct mmu_interval_notifier *mni, * Returns true if an invalidation collided with this critical section, and * the caller should retry. */ -static inline bool mmu_interval_read_retry(struct mmu_interval_notifier *mni, - unsigned long seq) +static inline bool +mmu_interval_read_retry(struct mmu_interval_notifier *interval_sub, + unsigned long seq) { - return mni->invalidate_seq != seq; + return interval_sub->invalidate_seq != seq; } /** * mmu_interval_check_retry - Test if a collision has occurred - * mni: The range + * interval_sub: The subscription * seq: The return of the matching mmu_interval_read_begin() * * This can be used in the critical section between mmu_interval_read_begin() @@ -357,14 +362,15 @@ static inline bool mmu_interval_read_retry(struct mmu_interval_notifier *mni, * This call can be used as part of loops and other expensive operations to * expedite a retry. */ -static inline bool mmu_interval_check_retry(struct mmu_interval_notifier *mni, - unsigned long seq) +static inline bool +mmu_interval_check_retry(struct mmu_interval_notifier *interval_sub, + unsigned long seq) { /* Pairs with the WRITE_ONCE in mmu_interval_set_seq() */ - return READ_ONCE(mni->invalidate_seq) != seq; + return READ_ONCE(interval_sub->invalidate_seq) != seq; } -extern void __mmu_notifier_mm_destroy(struct mm_struct *mm); +extern void __mmu_notifier_subscriptions_destroy(struct mm_struct *mm); extern void __mmu_notifier_release(struct mm_struct *mm); extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm, unsigned long start, @@ -480,15 +486,15 @@ static inline void mmu_notifier_invalidate_range(struct mm_struct *mm, __mmu_notifier_invalidate_range(mm, start, end); } -static inline void mmu_notifier_mm_init(struct mm_struct *mm) +static inline void mmu_notifier_subscriptions_init(struct mm_struct *mm) { - mm->mmu_notifier_mm = NULL; + mm->notifier_subscriptions = NULL; } -static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) +static inline void mmu_notifier_subscriptions_destroy(struct mm_struct *mm) { if (mm_has_notifiers(mm)) - __mmu_notifier_mm_destroy(mm); + __mmu_notifier_subscriptions_destroy(mm); } @@ -692,11 +698,11 @@ static inline void mmu_notifier_invalidate_range(struct mm_struct *mm, { } -static inline void mmu_notifier_mm_init(struct mm_struct *mm) +static inline void mmu_notifier_subscriptions_init(struct mm_struct *mm) { } -static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) +static inline void mmu_notifier_subscriptions_destroy(struct mm_struct *mm) { } diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index 390031e816dc..22d9d183950d 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -209,6 +209,36 @@ static inline void percpu_ref_get(struct percpu_ref *ref) percpu_ref_get_many(ref, 1); } +/** + * percpu_ref_tryget_many - try to increment a percpu refcount + * @ref: percpu_ref to try-get + * @nr: number of references to get + * + * Increment a percpu refcount by @nr unless its count already reached zero. + * Returns %true on success; %false on failure. + * + * This function is safe to call as long as @ref is between init and exit. + */ +static inline bool percpu_ref_tryget_many(struct percpu_ref *ref, + unsigned long nr) +{ + unsigned long __percpu *percpu_count; + bool ret; + + rcu_read_lock(); + + if (__ref_is_percpu(ref, &percpu_count)) { + this_cpu_add(*percpu_count, nr); + ret = true; + } else { + ret = atomic_long_add_unless(&ref->count, nr, 0); + } + + rcu_read_unlock(); + + return ret; +} + /** * percpu_ref_tryget - try to increment a percpu refcount * @ref: percpu_ref to try-get @@ -220,21 +250,7 @@ static inline void percpu_ref_get(struct percpu_ref *ref) */ static inline bool percpu_ref_tryget(struct percpu_ref *ref) { - unsigned long __percpu *percpu_count; - bool ret; - - rcu_read_lock(); - - if (__ref_is_percpu(ref, &percpu_count)) { - this_cpu_inc(*percpu_count); - ret = true; - } else { - ret = atomic_long_inc_not_zero(&ref->count); - } - - rcu_read_unlock(); - - return ret; + return percpu_ref_tryget_many(ref, 1); } /** diff --git a/include/linux/sched.h b/include/linux/sched.h index 6776276c7de5..d88d1bc5f278 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -921,7 +921,7 @@ struct task_struct { /* Signal handlers: */ struct signal_struct *signal; - struct sighand_struct *sighand; + struct sighand_struct __rcu *sighand; sigset_t blocked; sigset_t real_blocked; /* Restored if set_restore_sigmask() was used: */ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index ac3a663137b6..1815065d52f3 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -1002,6 +1002,7 @@ asmlinkage long sys_fspick(int dfd, const char __user *path, unsigned int flags) asmlinkage long sys_pidfd_send_signal(int pidfd, int sig, siginfo_t __user *info, unsigned int flags); +asmlinkage long sys_pidfd_getfd(int pidfd, int fd, unsigned int flags); /* * Architecture-specific system calls diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h index b352d66b5d51..27bd9e4f927b 100644 --- a/include/trace/events/io_uring.h +++ b/include/trace/events/io_uring.h @@ -320,6 +320,7 @@ TRACE_EVENT(io_uring_complete, * io_uring_submit_sqe - called before submitting one SQE * * @ctx: pointer to a ring context structure + * @opcode: opcode of request * @user_data: user data associated with the request * @force_nonblock: whether a context blocking or not * @sq_thread: true if sq_thread has submitted this SQE @@ -329,12 +330,14 @@ TRACE_EVENT(io_uring_complete, */ TRACE_EVENT(io_uring_submit_sqe, - TP_PROTO(void *ctx, u64 user_data, bool force_nonblock, bool sq_thread), + TP_PROTO(void *ctx, u8 opcode, u64 user_data, bool force_nonblock, + bool sq_thread), - TP_ARGS(ctx, user_data, force_nonblock, sq_thread), + TP_ARGS(ctx, opcode, user_data, force_nonblock, sq_thread), TP_STRUCT__entry ( __field( void *, ctx ) + __field( u8, opcode ) __field( u64, user_data ) __field( bool, force_nonblock ) __field( bool, sq_thread ) @@ -342,13 +345,15 @@ TRACE_EVENT(io_uring_submit_sqe, TP_fast_assign( __entry->ctx = ctx; + __entry->opcode = opcode; __entry->user_data = user_data; __entry->force_nonblock = force_nonblock; __entry->sq_thread = sq_thread; ), - TP_printk("ring %p, user data 0x%llx, non block %d, sq_thread %d", - __entry->ctx, (unsigned long long) __entry->user_data, + TP_printk("ring %p, op %d, data 0x%llx, non block %d, sq_thread %d", + __entry->ctx, __entry->opcode, + (unsigned long long) __entry->user_data, __entry->force_nonblock, __entry->sq_thread) ); diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index d4122c091472..3a3201e4618e 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -853,9 +853,11 @@ __SYSCALL(__NR_clone3, sys_clone3) #define __NR_openat2 437 __SYSCALL(__NR_openat2, sys_openat2) +#define __NR_pidfd_getfd 438 +__SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd) #undef __NR_syscalls -#define __NR_syscalls 438 +#define __NR_syscalls 439 /* * 32 bit systems traditionally used different diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h index 240fdb9a60f6..272dc69fa080 100644 --- a/include/uapi/linux/capability.h +++ b/include/uapi/linux/capability.h @@ -301,6 +301,7 @@ struct vfs_ns_cap_data { /* Allow more than 64hz interrupts from the real-time clock */ /* Override max number of consoles on console allocation */ /* Override max number of keymaps */ +/* Control memory reclaim behavior */ #define CAP_SYS_RESOURCE 24 diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 55cfcb71606d..3f7961c1c243 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -34,21 +34,43 @@ struct io_uring_sqe { __u32 timeout_flags; __u32 accept_flags; __u32 cancel_flags; + __u32 open_flags; + __u32 statx_flags; + __u32 fadvise_advice; }; __u64 user_data; /* data to be passed back at completion time */ union { - __u16 buf_index; /* index into fixed buffers, if used */ + struct { + /* index into fixed buffers, if used */ + __u16 buf_index; + /* personality to use, if used */ + __u16 personality; + }; __u64 __pad2[3]; }; }; +enum { + IOSQE_FIXED_FILE_BIT, + IOSQE_IO_DRAIN_BIT, + IOSQE_IO_LINK_BIT, + IOSQE_IO_HARDLINK_BIT, + IOSQE_ASYNC_BIT, +}; + /* * sqe->flags */ -#define IOSQE_FIXED_FILE (1U << 0) /* use fixed fileset */ -#define IOSQE_IO_DRAIN (1U << 1) /* issue after inflight IO */ -#define IOSQE_IO_LINK (1U << 2) /* links next sqe */ -#define IOSQE_IO_HARDLINK (1U << 3) /* like LINK, but stronger */ +/* use fixed fileset */ +#define IOSQE_FIXED_FILE (1U << IOSQE_FIXED_FILE_BIT) +/* issue after inflight IO */ +#define IOSQE_IO_DRAIN (1U << IOSQE_IO_DRAIN_BIT) +/* links next sqe */ +#define IOSQE_IO_LINK (1U << IOSQE_IO_LINK_BIT) +/* like LINK, but stronger */ +#define IOSQE_IO_HARDLINK (1U << IOSQE_IO_HARDLINK_BIT) +/* always go async */ +#define IOSQE_ASYNC (1U << IOSQE_ASYNC_BIT) /* * io_uring_setup() flags @@ -57,6 +79,8 @@ struct io_uring_sqe { #define IORING_SETUP_SQPOLL (1U << 1) /* SQ poll thread */ #define IORING_SETUP_SQ_AFF (1U << 2) /* sq_thread_cpu is valid */ #define IORING_SETUP_CQSIZE (1U << 3) /* app defines CQ size */ +#define IORING_SETUP_CLAMP (1U << 4) /* clamp SQ/CQ ring sizes */ +#define IORING_SETUP_ATTACH_WQ (1U << 5) /* attach to existing wq */ enum { IORING_OP_NOP, @@ -76,6 +100,19 @@ enum { IORING_OP_ASYNC_CANCEL, IORING_OP_LINK_TIMEOUT, IORING_OP_CONNECT, + IORING_OP_FALLOCATE, + IORING_OP_OPENAT, + IORING_OP_CLOSE, + IORING_OP_FILES_UPDATE, + IORING_OP_STATX, + IORING_OP_READ, + IORING_OP_WRITE, + IORING_OP_FADVISE, + IORING_OP_MADVISE, + IORING_OP_SEND, + IORING_OP_RECV, + IORING_OP_OPENAT2, + IORING_OP_EPOLL_CTL, /* this goes last, obviously */ IORING_OP_LAST, @@ -153,7 +190,8 @@ struct io_uring_params { __u32 sq_thread_cpu; __u32 sq_thread_idle; __u32 features; - __u32 resv[4]; + __u32 wq_fd; + __u32 resv[3]; struct io_sqring_offsets sq_off; struct io_cqring_offsets cq_off; }; @@ -164,6 +202,8 @@ struct io_uring_params { #define IORING_FEAT_SINGLE_MMAP (1U << 0) #define IORING_FEAT_NODROP (1U << 1) #define IORING_FEAT_SUBMIT_STABLE (1U << 2) +#define IORING_FEAT_RW_CUR_POS (1U << 3) +#define IORING_FEAT_CUR_PERSONALITY (1U << 4) /* * io_uring_register(2) opcodes and arguments @@ -175,6 +215,10 @@ struct io_uring_params { #define IORING_REGISTER_EVENTFD 4 #define IORING_UNREGISTER_EVENTFD 5 #define IORING_REGISTER_FILES_UPDATE 6 +#define IORING_REGISTER_EVENTFD_ASYNC 7 +#define IORING_REGISTER_PROBE 8 +#define IORING_REGISTER_PERSONALITY 9 +#define IORING_UNREGISTER_PERSONALITY 10 struct io_uring_files_update { __u32 offset; @@ -182,4 +226,21 @@ struct io_uring_files_update { __aligned_u64 /* __s32 * */ fds; }; +#define IO_URING_OP_SUPPORTED (1U << 0) + +struct io_uring_probe_op { + __u8 op; + __u8 resv; + __u16 flags; /* IO_URING_OP_* flags */ + __u32 resv2; +}; + +struct io_uring_probe { + __u8 last_op; /* last opcode supported */ + __u8 ops_len; /* length of ops[] array below */ + __u16 resv; + __u32 resv2[3]; + struct io_uring_probe_op ops[0]; +}; + #endif diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index bc4a5d963ac8..10773270f67b 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -234,6 +234,10 @@ struct prctl_mm_map { #define PR_GET_TAGGED_ADDR_CTRL 56 # define PR_TAGGED_ADDR_ENABLE (1UL << 0) +/* Control reclaim behavior when allocating memory */ +#define PR_SET_IO_FLUSHER 57 +#define PR_GET_IO_FLUSHER 58 + #define PR_SET_VMA 0x53564d41 # define PR_SET_VMA_ANON_NAME 0 diff --git a/kernel/fork.c b/kernel/fork.c index f2f99dbfb440..550027ff6071 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -697,7 +697,7 @@ void __mmdrop(struct mm_struct *mm) WARN_ON_ONCE(mm == current->active_mm); mm_free_pgd(mm); destroy_context(mm); - mmu_notifier_mm_destroy(mm); + mmu_notifier_subscriptions_destroy(mm); check_mm(mm); put_user_ns(mm->user_ns); free_mm(mm); @@ -1036,7 +1036,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm_init_aio(mm); mm_init_owner(mm, p); RCU_INIT_POINTER(mm->exe_file, NULL); - mmu_notifier_mm_init(mm); + mmu_notifier_subscriptions_init(mm); init_tlb_flush_pending(mm); #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS mm->pmd_huge_pte = NULL; diff --git a/kernel/pid.c b/kernel/pid.c index 2278e249141d..0f4ecb57214c 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -578,3 +578,93 @@ void __init pid_idr_init(void) init_pid_ns.pid_cachep = KMEM_CACHE(pid, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); } + +static struct file *__pidfd_fget(struct task_struct *task, int fd) +{ + struct file *file; + int ret; + + ret = mutex_lock_killable(&task->signal->cred_guard_mutex); + if (ret) + return ERR_PTR(ret); + + if (ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS)) + file = fget_task(task, fd); + else + file = ERR_PTR(-EPERM); + + mutex_unlock(&task->signal->cred_guard_mutex); + + return file ?: ERR_PTR(-EBADF); +} + +static int pidfd_getfd(struct pid *pid, int fd) +{ + struct task_struct *task; + struct file *file; + int ret; + + task = get_pid_task(pid, PIDTYPE_PID); + if (!task) + return -ESRCH; + + file = __pidfd_fget(task, fd); + put_task_struct(task); + if (IS_ERR(file)) + return PTR_ERR(file); + + ret = security_file_receive(file); + if (ret) { + fput(file); + return ret; + } + + ret = get_unused_fd_flags(O_CLOEXEC); + if (ret < 0) + fput(file); + else + fd_install(ret, file); + + return ret; +} + +/** + * sys_pidfd_getfd() - Get a file descriptor from another process + * + * @pidfd: the pidfd file descriptor of the process + * @fd: the file descriptor number to get + * @flags: flags on how to get the fd (reserved) + * + * This syscall gets a copy of a file descriptor from another process + * based on the pidfd, and file descriptor number. It requires that + * the calling process has the ability to ptrace the process represented + * by the pidfd. The process which is having its file descriptor copied + * is otherwise unaffected. + * + * Return: On success, a cloexec file descriptor is returned. + * On error, a negative errno number will be returned. + */ +SYSCALL_DEFINE3(pidfd_getfd, int, pidfd, int, fd, + unsigned int, flags) +{ + struct pid *pid; + struct fd f; + int ret; + + /* flags is currently unused - make sure it's unset */ + if (flags) + return -EINVAL; + + f = fdget(pidfd); + if (!f.file) + return -EBADF; + + pid = pidfd_pid(f.file); + if (IS_ERR(pid)) + ret = PTR_ERR(pid); + else + ret = pidfd_getfd(pid, fd); + + fdput(f); + return ret; +} diff --git a/kernel/signal.c b/kernel/signal.c index bcd46f547db3..9ad8dea93dbb 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1383,7 +1383,7 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, * must see ->sighand == NULL. */ spin_lock_irqsave(&sighand->siglock, *flags); - if (likely(sighand == tsk->sighand)) + if (likely(sighand == rcu_access_pointer(tsk->sighand))) break; spin_unlock_irqrestore(&sighand->siglock, *flags); } diff --git a/kernel/sys.c b/kernel/sys.c index 21d24e2ac9b3..cc54ea5f76ca 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2410,6 +2410,8 @@ static int prctl_set_vma(unsigned long opt, unsigned long start, } #endif +#define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LESS_THROTTLE) + SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, unsigned long, arg4, unsigned long, arg5) { @@ -2640,6 +2642,29 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, return -EINVAL; error = GET_TAGGED_ADDR_CTRL(); break; + case PR_SET_IO_FLUSHER: + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + + if (arg3 || arg4 || arg5) + return -EINVAL; + + if (arg2 == 1) + current->flags |= PR_IO_FLUSHER; + else if (!arg2) + current->flags &= ~PR_IO_FLUSHER; + else + return -EINVAL; + break; + case PR_GET_IO_FLUSHER: + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + + if (arg2 || arg3 || arg4 || arg5) + return -EINVAL; + + error = (current->flags & PR_IO_FLUSHER) == PR_IO_FLUSHER; + break; default: error = -EINVAL; break; diff --git a/mm/debug.c b/mm/debug.c index 0461df1207cb..74ee73cf7079 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -153,7 +153,7 @@ void dump_mm(const struct mm_struct *mm) #endif "exe_file %px\n" #ifdef CONFIG_MMU_NOTIFIER - "mmu_notifier_mm %px\n" + "notifier_subscriptions %px\n" #endif #ifdef CONFIG_NUMA_BALANCING "numa_next_scan %lu numa_scan_offset %lu numa_scan_seq %d\n" @@ -185,7 +185,7 @@ void dump_mm(const struct mm_struct *mm) #endif mm->exe_file, #ifdef CONFIG_MMU_NOTIFIER - mm->mmu_notifier_mm, + mm->notifier_subscriptions, #endif #ifdef CONFIG_NUMA_BALANCING mm->numa_next_scan, mm->numa_scan_offset, mm->numa_scan_seq, diff --git a/mm/madvise.c b/mm/madvise.c index 4e9cf5ff8532..26160975f049 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1044,7 +1044,7 @@ madvise_behavior_valid(int behavior) * -EBADF - map exists, but area maps something that isn't a file. * -EAGAIN - a kernel resource was temporarily unavailable. */ -SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) +int do_madvise(unsigned long start, size_t len_in, int behavior) { unsigned long end, tmp; struct vm_area_struct *vma, *prev; @@ -1141,3 +1141,8 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) return error; } + +SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) +{ + return do_madvise(start, len_in, behavior); +} diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index f76ea05b1cb0..ef3973a5d34a 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -29,12 +29,12 @@ struct lockdep_map __mmu_notifier_invalidate_range_start_map = { #endif /* - * The mmu notifier_mm structure is allocated and installed in - * mm->mmu_notifier_mm inside the mm_take_all_locks() protected + * The mmu_notifier_subscriptions structure is allocated and installed in + * mm->notifier_subscriptions inside the mm_take_all_locks() protected * critical section and it's released only when mm_count reaches zero * in mmdrop(). */ -struct mmu_notifier_mm { +struct mmu_notifier_subscriptions { /* all mmu notifiers registered in this mm are queued in this list */ struct hlist_head list; bool has_itree; @@ -65,80 +65,81 @@ struct mmu_notifier_mm { * * The write side has two states, fully excluded: * - mm->active_invalidate_ranges != 0 - * - mnn->invalidate_seq & 1 == True (odd) + * - subscriptions->invalidate_seq & 1 == True (odd) * - some range on the mm_struct is being invalidated * - the itree is not allowed to change * * And partially excluded: * - mm->active_invalidate_ranges != 0 - * - mnn->invalidate_seq & 1 == False (even) + * - subscriptions->invalidate_seq & 1 == False (even) * - some range on the mm_struct is being invalidated * - the itree is allowed to change * - * Operations on mmu_notifier_mm->invalidate_seq (under spinlock): + * Operations on notifier_subscriptions->invalidate_seq (under spinlock): * seq |= 1 # Begin writing * seq++ # Release the writing state * seq & 1 # True if a writer exists * * The later state avoids some expensive work on inv_end in the common case of - * no mni monitoring the VA. + * no mmu_interval_notifier monitoring the VA. */ -static bool mn_itree_is_invalidating(struct mmu_notifier_mm *mmn_mm) +static bool +mn_itree_is_invalidating(struct mmu_notifier_subscriptions *subscriptions) { - lockdep_assert_held(&mmn_mm->lock); - return mmn_mm->invalidate_seq & 1; + lockdep_assert_held(&subscriptions->lock); + return subscriptions->invalidate_seq & 1; } static struct mmu_interval_notifier * -mn_itree_inv_start_range(struct mmu_notifier_mm *mmn_mm, +mn_itree_inv_start_range(struct mmu_notifier_subscriptions *subscriptions, const struct mmu_notifier_range *range, unsigned long *seq) { struct interval_tree_node *node; struct mmu_interval_notifier *res = NULL; - spin_lock(&mmn_mm->lock); - mmn_mm->active_invalidate_ranges++; - node = interval_tree_iter_first(&mmn_mm->itree, range->start, + spin_lock(&subscriptions->lock); + subscriptions->active_invalidate_ranges++; + node = interval_tree_iter_first(&subscriptions->itree, range->start, range->end - 1); if (node) { - mmn_mm->invalidate_seq |= 1; + subscriptions->invalidate_seq |= 1; res = container_of(node, struct mmu_interval_notifier, interval_tree); } - *seq = mmn_mm->invalidate_seq; - spin_unlock(&mmn_mm->lock); + *seq = subscriptions->invalidate_seq; + spin_unlock(&subscriptions->lock); return res; } static struct mmu_interval_notifier * -mn_itree_inv_next(struct mmu_interval_notifier *mni, +mn_itree_inv_next(struct mmu_interval_notifier *interval_sub, const struct mmu_notifier_range *range) { struct interval_tree_node *node; - node = interval_tree_iter_next(&mni->interval_tree, range->start, - range->end - 1); + node = interval_tree_iter_next(&interval_sub->interval_tree, + range->start, range->end - 1); if (!node) return NULL; return container_of(node, struct mmu_interval_notifier, interval_tree); } -static void mn_itree_inv_end(struct mmu_notifier_mm *mmn_mm) +static void mn_itree_inv_end(struct mmu_notifier_subscriptions *subscriptions) { - struct mmu_interval_notifier *mni; + struct mmu_interval_notifier *interval_sub; struct hlist_node *next; - spin_lock(&mmn_mm->lock); - if (--mmn_mm->active_invalidate_ranges || - !mn_itree_is_invalidating(mmn_mm)) { - spin_unlock(&mmn_mm->lock); + spin_lock(&subscriptions->lock); + if (--subscriptions->active_invalidate_ranges || + !mn_itree_is_invalidating(subscriptions)) { + spin_unlock(&subscriptions->lock); return; } /* Make invalidate_seq even */ - mmn_mm->invalidate_seq++; + subscriptions->invalidate_seq++; /* * The inv_end incorporates a deferred mechanism like rtnl_unlock(). @@ -146,30 +147,31 @@ static void mn_itree_inv_end(struct mmu_notifier_mm *mmn_mm) * they are progressed. This arrangement for tree updates is used to * avoid using a blocking lock during invalidate_range_start. */ - hlist_for_each_entry_safe(mni, next, &mmn_mm->deferred_list, + hlist_for_each_entry_safe(interval_sub, next, + &subscriptions->deferred_list, deferred_item) { - if (RB_EMPTY_NODE(&mni->interval_tree.rb)) - interval_tree_insert(&mni->interval_tree, - &mmn_mm->itree); + if (RB_EMPTY_NODE(&interval_sub->interval_tree.rb)) + interval_tree_insert(&interval_sub->interval_tree, + &subscriptions->itree); else - interval_tree_remove(&mni->interval_tree, - &mmn_mm->itree); - hlist_del(&mni->deferred_item); + interval_tree_remove(&interval_sub->interval_tree, + &subscriptions->itree); + hlist_del(&interval_sub->deferred_item); } - spin_unlock(&mmn_mm->lock); + spin_unlock(&subscriptions->lock); - wake_up_all(&mmn_mm->wq); + wake_up_all(&subscriptions->wq); } /** * mmu_interval_read_begin - Begin a read side critical section against a VA * range - * mni: The range to use + * interval_sub: The interval subscription * * mmu_iterval_read_begin()/mmu_iterval_read_retry() implement a - * collision-retry scheme similar to seqcount for the VA range under mni. If - * the mm invokes invalidation during the critical section then - * mmu_interval_read_retry() will return true. + * collision-retry scheme similar to seqcount for the VA range under + * subscription. If the mm invokes invalidation during the critical section + * then mmu_interval_read_retry() will return true. * * This is useful to obtain shadow PTEs where teardown or setup of the SPTEs * require a blocking context. The critical region formed by this can sleep, @@ -180,68 +182,71 @@ static void mn_itree_inv_end(struct mmu_notifier_mm *mmn_mm) * * The return value should be passed to mmu_interval_read_retry(). */ -unsigned long mmu_interval_read_begin(struct mmu_interval_notifier *mni) +unsigned long +mmu_interval_read_begin(struct mmu_interval_notifier *interval_sub) { - struct mmu_notifier_mm *mmn_mm = mni->mm->mmu_notifier_mm; + struct mmu_notifier_subscriptions *subscriptions = + interval_sub->mm->notifier_subscriptions; unsigned long seq; bool is_invalidating; /* - * If the mni has a different seq value under the user_lock than we - * started with then it has collided. + * If the subscription has a different seq value under the user_lock + * than we started with then it has collided. * - * If the mni currently has the same seq value as the mmn_mm seq, then - * it is currently between invalidate_start/end and is colliding. + * If the subscription currently has the same seq value as the + * subscriptions seq, then it is currently between + * invalidate_start/end and is colliding. * * The locking looks broadly like this: * mn_tree_invalidate_start(): mmu_interval_read_begin(): * spin_lock - * seq = READ_ONCE(mni->invalidate_seq); - * seq == mmn_mm->invalidate_seq + * seq = READ_ONCE(interval_sub->invalidate_seq); + * seq == subs->invalidate_seq * spin_unlock * spin_lock - * seq = ++mmn_mm->invalidate_seq + * seq = ++subscriptions->invalidate_seq * spin_unlock * op->invalidate_range(): * user_lock * mmu_interval_set_seq() - * mni->invalidate_seq = seq + * interval_sub->invalidate_seq = seq * user_unlock * * [Required: mmu_interval_read_retry() == true] * * mn_itree_inv_end(): * spin_lock - * seq = ++mmn_mm->invalidate_seq + * seq = ++subscriptions->invalidate_seq * spin_unlock * * user_lock * mmu_interval_read_retry(): - * mni->invalidate_seq != seq + * interval_sub->invalidate_seq != seq * user_unlock * * Barriers are not needed here as any races here are closed by an * eventual mmu_interval_read_retry(), which provides a barrier via the * user_lock. */ - spin_lock(&mmn_mm->lock); + spin_lock(&subscriptions->lock); /* Pairs with the WRITE_ONCE in mmu_interval_set_seq() */ - seq = READ_ONCE(mni->invalidate_seq); - is_invalidating = seq == mmn_mm->invalidate_seq; - spin_unlock(&mmn_mm->lock); + seq = READ_ONCE(interval_sub->invalidate_seq); + is_invalidating = seq == subscriptions->invalidate_seq; + spin_unlock(&subscriptions->lock); /* - * mni->invalidate_seq must always be set to an odd value via + * interval_sub->invalidate_seq must always be set to an odd value via * mmu_interval_set_seq() using the provided cur_seq from * mn_itree_inv_start_range(). This ensures that if seq does wrap we * will always clear the below sleep in some reasonable time as - * mmn_mm->invalidate_seq is even in the idle state. + * subscriptions->invalidate_seq is even in the idle state. */ lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); lock_map_release(&__mmu_notifier_invalidate_range_start_map); if (is_invalidating) - wait_event(mmn_mm->wq, - READ_ONCE(mmn_mm->invalidate_seq) != seq); + wait_event(subscriptions->wq, + READ_ONCE(subscriptions->invalidate_seq) != seq); /* * Notice that mmu_interval_read_retry() can already be true at this @@ -253,7 +258,7 @@ unsigned long mmu_interval_read_begin(struct mmu_interval_notifier *mni) } EXPORT_SYMBOL_GPL(mmu_interval_read_begin); -static void mn_itree_release(struct mmu_notifier_mm *mmn_mm, +static void mn_itree_release(struct mmu_notifier_subscriptions *subscriptions, struct mm_struct *mm) { struct mmu_notifier_range range = { @@ -263,17 +268,20 @@ static void mn_itree_release(struct mmu_notifier_mm *mmn_mm, .start = 0, .end = ULONG_MAX, }; - struct mmu_interval_notifier *mni; + struct mmu_interval_notifier *interval_sub; unsigned long cur_seq; bool ret; - for (mni = mn_itree_inv_start_range(mmn_mm, &range, &cur_seq); mni; - mni = mn_itree_inv_next(mni, &range)) { - ret = mni->ops->invalidate(mni, &range, cur_seq); + for (interval_sub = + mn_itree_inv_start_range(subscriptions, &range, &cur_seq); + interval_sub; + interval_sub = mn_itree_inv_next(interval_sub, &range)) { + ret = interval_sub->ops->invalidate(interval_sub, &range, + cur_seq); WARN_ON(!ret); } - mn_itree_inv_end(mmn_mm); + mn_itree_inv_end(subscriptions); } /* @@ -283,15 +291,15 @@ static void mn_itree_release(struct mmu_notifier_mm *mmn_mm, * in parallel despite there being no task using this mm any more, * through the vmas outside of the exit_mmap context, such as with * vmtruncate. This serializes against mmu_notifier_unregister with - * the mmu_notifier_mm->lock in addition to SRCU and it serializes - * against the other mmu notifiers with SRCU. struct mmu_notifier_mm + * the notifier_subscriptions->lock in addition to SRCU and it serializes + * against the other mmu notifiers with SRCU. struct mmu_notifier_subscriptions * can't go away from under us as exit_mmap holds an mm_count pin * itself. */ -static void mn_hlist_release(struct mmu_notifier_mm *mmn_mm, +static void mn_hlist_release(struct mmu_notifier_subscriptions *subscriptions, struct mm_struct *mm) { - struct mmu_notifier *mn; + struct mmu_notifier *subscription; int id; /* @@ -299,29 +307,29 @@ static void mn_hlist_release(struct mmu_notifier_mm *mmn_mm, * ->release returns. */ id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(mn, &mmn_mm->list, hlist) + hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist) /* * If ->release runs before mmu_notifier_unregister it must be * handled, as it's the only way for the driver to flush all * existing sptes and stop the driver from establishing any more * sptes before all the pages in the mm are freed. */ - if (mn->ops->release) - mn->ops->release(mn, mm); + if (subscription->ops->release) + subscription->ops->release(subscription, mm); - spin_lock(&mmn_mm->lock); - while (unlikely(!hlist_empty(&mmn_mm->list))) { - mn = hlist_entry(mmn_mm->list.first, struct mmu_notifier, - hlist); + spin_lock(&subscriptions->lock); + while (unlikely(!hlist_empty(&subscriptions->list))) { + subscription = hlist_entry(subscriptions->list.first, + struct mmu_notifier, hlist); /* * We arrived before mmu_notifier_unregister so * mmu_notifier_unregister will do nothing other than to wait * for ->release to finish and for mmu_notifier_unregister to * return. */ - hlist_del_init_rcu(&mn->hlist); + hlist_del_init_rcu(&subscription->hlist); } - spin_unlock(&mmn_mm->lock); + spin_unlock(&subscriptions->lock); srcu_read_unlock(&srcu, id); /* @@ -330,21 +338,22 @@ static void mn_hlist_release(struct mmu_notifier_mm *mmn_mm, * until the ->release method returns, if it was invoked by * mmu_notifier_unregister. * - * The mmu_notifier_mm can't go away from under us because one mm_count - * is held by exit_mmap. + * The notifier_subscriptions can't go away from under us because + * one mm_count is held by exit_mmap. */ synchronize_srcu(&srcu); } void __mmu_notifier_release(struct mm_struct *mm) { - struct mmu_notifier_mm *mmn_mm = mm->mmu_notifier_mm; + struct mmu_notifier_subscriptions *subscriptions = + mm->notifier_subscriptions; - if (mmn_mm->has_itree) - mn_itree_release(mmn_mm, mm); + if (subscriptions->has_itree) + mn_itree_release(subscriptions, mm); - if (!hlist_empty(&mmn_mm->list)) - mn_hlist_release(mmn_mm, mm); + if (!hlist_empty(&subscriptions->list)) + mn_hlist_release(subscriptions, mm); } /* @@ -356,13 +365,15 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm, unsigned long start, unsigned long end) { - struct mmu_notifier *mn; + struct mmu_notifier *subscription; int young = 0, id; id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { - if (mn->ops->clear_flush_young) - young |= mn->ops->clear_flush_young(mn, mm, start, end); + hlist_for_each_entry_rcu(subscription, + &mm->notifier_subscriptions->list, hlist) { + if (subscription->ops->clear_flush_young) + young |= subscription->ops->clear_flush_young( + subscription, mm, start, end); } srcu_read_unlock(&srcu, id); @@ -373,13 +384,15 @@ int __mmu_notifier_clear_young(struct mm_struct *mm, unsigned long start, unsigned long end) { - struct mmu_notifier *mn; + struct mmu_notifier *subscription; int young = 0, id; id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { - if (mn->ops->clear_young) - young |= mn->ops->clear_young(mn, mm, start, end); + hlist_for_each_entry_rcu(subscription, + &mm->notifier_subscriptions->list, hlist) { + if (subscription->ops->clear_young) + young |= subscription->ops->clear_young(subscription, + mm, start, end); } srcu_read_unlock(&srcu, id); @@ -389,13 +402,15 @@ int __mmu_notifier_clear_young(struct mm_struct *mm, int __mmu_notifier_test_young(struct mm_struct *mm, unsigned long address) { - struct mmu_notifier *mn; + struct mmu_notifier *subscription; int young = 0, id; id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { - if (mn->ops->test_young) { - young = mn->ops->test_young(mn, mm, address); + hlist_for_each_entry_rcu(subscription, + &mm->notifier_subscriptions->list, hlist) { + if (subscription->ops->test_young) { + young = subscription->ops->test_young(subscription, mm, + address); if (young) break; } @@ -408,28 +423,33 @@ int __mmu_notifier_test_young(struct mm_struct *mm, void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, pte_t pte) { - struct mmu_notifier *mn; + struct mmu_notifier *subscription; int id; id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { - if (mn->ops->change_pte) - mn->ops->change_pte(mn, mm, address, pte); + hlist_for_each_entry_rcu(subscription, + &mm->notifier_subscriptions->list, hlist) { + if (subscription->ops->change_pte) + subscription->ops->change_pte(subscription, mm, address, + pte); } srcu_read_unlock(&srcu, id); } -static int mn_itree_invalidate(struct mmu_notifier_mm *mmn_mm, +static int mn_itree_invalidate(struct mmu_notifier_subscriptions *subscriptions, const struct mmu_notifier_range *range) { - struct mmu_interval_notifier *mni; + struct mmu_interval_notifier *interval_sub; unsigned long cur_seq; - for (mni = mn_itree_inv_start_range(mmn_mm, range, &cur_seq); mni; - mni = mn_itree_inv_next(mni, range)) { + for (interval_sub = + mn_itree_inv_start_range(subscriptions, range, &cur_seq); + interval_sub; + interval_sub = mn_itree_inv_next(interval_sub, range)) { bool ret; - ret = mni->ops->invalidate(mni, range, cur_seq); + ret = interval_sub->ops->invalidate(interval_sub, range, + cur_seq); if (!ret) { if (WARN_ON(mmu_notifier_range_blockable(range))) continue; @@ -443,31 +463,36 @@ static int mn_itree_invalidate(struct mmu_notifier_mm *mmn_mm, * On -EAGAIN the non-blocking caller is not allowed to call * invalidate_range_end() */ - mn_itree_inv_end(mmn_mm); + mn_itree_inv_end(subscriptions); return -EAGAIN; } -static int mn_hlist_invalidate_range_start(struct mmu_notifier_mm *mmn_mm, - struct mmu_notifier_range *range) +static int mn_hlist_invalidate_range_start( + struct mmu_notifier_subscriptions *subscriptions, + struct mmu_notifier_range *range) { - struct mmu_notifier *mn; + struct mmu_notifier *subscription; int ret = 0; int id; id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(mn, &mmn_mm->list, hlist) { - if (mn->ops->invalidate_range_start) { + hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist) { + const struct mmu_notifier_ops *ops = subscription->ops; + + if (ops->invalidate_range_start) { int _ret; if (!mmu_notifier_range_blockable(range)) non_block_start(); - _ret = mn->ops->invalidate_range_start(mn, range); + _ret = ops->invalidate_range_start(subscription, range); if (!mmu_notifier_range_blockable(range)) non_block_end(); if (_ret) { pr_info("%pS callback failed with %d in %sblockable context.\n", - mn->ops->invalidate_range_start, _ret, - !mmu_notifier_range_blockable(range) ? "non-" : ""); + ops->invalidate_range_start, _ret, + !mmu_notifier_range_blockable(range) ? + "non-" : + ""); WARN_ON(mmu_notifier_range_blockable(range) || _ret != -EAGAIN); ret = _ret; @@ -481,28 +506,29 @@ static int mn_hlist_invalidate_range_start(struct mmu_notifier_mm *mmn_mm, int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range) { - struct mmu_notifier_mm *mmn_mm = range->mm->mmu_notifier_mm; + struct mmu_notifier_subscriptions *subscriptions = + range->mm->notifier_subscriptions; int ret; - if (mmn_mm->has_itree) { - ret = mn_itree_invalidate(mmn_mm, range); + if (subscriptions->has_itree) { + ret = mn_itree_invalidate(subscriptions, range); if (ret) return ret; } - if (!hlist_empty(&mmn_mm->list)) - return mn_hlist_invalidate_range_start(mmn_mm, range); + if (!hlist_empty(&subscriptions->list)) + return mn_hlist_invalidate_range_start(subscriptions, range); return 0; } -static void mn_hlist_invalidate_end(struct mmu_notifier_mm *mmn_mm, - struct mmu_notifier_range *range, - bool only_end) +static void +mn_hlist_invalidate_end(struct mmu_notifier_subscriptions *subscriptions, + struct mmu_notifier_range *range, bool only_end) { - struct mmu_notifier *mn; + struct mmu_notifier *subscription; int id; id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(mn, &mmn_mm->list, hlist) { + hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist) { /* * Call invalidate_range here too to avoid the need for the * subsystem of having to register an invalidate_range_end @@ -516,14 +542,16 @@ static void mn_hlist_invalidate_end(struct mmu_notifier_mm *mmn_mm, * is safe to do when we know that a call to invalidate_range() * already happen under page table lock. */ - if (!only_end && mn->ops->invalidate_range) - mn->ops->invalidate_range(mn, range->mm, - range->start, - range->end); - if (mn->ops->invalidate_range_end) { + if (!only_end && subscription->ops->invalidate_range) + subscription->ops->invalidate_range(subscription, + range->mm, + range->start, + range->end); + if (subscription->ops->invalidate_range_end) { if (!mmu_notifier_range_blockable(range)) non_block_start(); - mn->ops->invalidate_range_end(mn, range); + subscription->ops->invalidate_range_end(subscription, + range); if (!mmu_notifier_range_blockable(range)) non_block_end(); } @@ -534,27 +562,30 @@ static void mn_hlist_invalidate_end(struct mmu_notifier_mm *mmn_mm, void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range, bool only_end) { - struct mmu_notifier_mm *mmn_mm = range->mm->mmu_notifier_mm; + struct mmu_notifier_subscriptions *subscriptions = + range->mm->notifier_subscriptions; lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); - if (mmn_mm->has_itree) - mn_itree_inv_end(mmn_mm); + if (subscriptions->has_itree) + mn_itree_inv_end(subscriptions); - if (!hlist_empty(&mmn_mm->list)) - mn_hlist_invalidate_end(mmn_mm, range, only_end); + if (!hlist_empty(&subscriptions->list)) + mn_hlist_invalidate_end(subscriptions, range, only_end); lock_map_release(&__mmu_notifier_invalidate_range_start_map); } void __mmu_notifier_invalidate_range(struct mm_struct *mm, unsigned long start, unsigned long end) { - struct mmu_notifier *mn; + struct mmu_notifier *subscription; int id; id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { - if (mn->ops->invalidate_range) - mn->ops->invalidate_range(mn, mm, start, end); + hlist_for_each_entry_rcu(subscription, + &mm->notifier_subscriptions->list, hlist) { + if (subscription->ops->invalidate_range) + subscription->ops->invalidate_range(subscription, mm, + start, end); } srcu_read_unlock(&srcu, id); } @@ -564,9 +595,10 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm, * write mode. A NULL mn signals the notifier is being registered for itree * mode. */ -int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm) +int __mmu_notifier_register(struct mmu_notifier *subscription, + struct mm_struct *mm) { - struct mmu_notifier_mm *mmu_notifier_mm = NULL; + struct mmu_notifier_subscriptions *subscriptions = NULL; int ret; lockdep_assert_held_write(&mm->mmap_sem); @@ -579,23 +611,23 @@ int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm) fs_reclaim_release(GFP_KERNEL); } - if (!mm->mmu_notifier_mm) { + if (!mm->notifier_subscriptions) { /* * kmalloc cannot be called under mm_take_all_locks(), but we - * know that mm->mmu_notifier_mm can't change while we hold - * the write side of the mmap_sem. + * know that mm->notifier_subscriptions can't change while we + * hold the write side of the mmap_sem. */ - mmu_notifier_mm = - kzalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL); - if (!mmu_notifier_mm) + subscriptions = kzalloc( + sizeof(struct mmu_notifier_subscriptions), GFP_KERNEL); + if (!subscriptions) return -ENOMEM; - INIT_HLIST_HEAD(&mmu_notifier_mm->list); - spin_lock_init(&mmu_notifier_mm->lock); - mmu_notifier_mm->invalidate_seq = 2; - mmu_notifier_mm->itree = RB_ROOT_CACHED; - init_waitqueue_head(&mmu_notifier_mm->wq); - INIT_HLIST_HEAD(&mmu_notifier_mm->deferred_list); + INIT_HLIST_HEAD(&subscriptions->list); + spin_lock_init(&subscriptions->lock); + subscriptions->invalidate_seq = 2; + subscriptions->itree = RB_ROOT_CACHED; + init_waitqueue_head(&subscriptions->wq); + INIT_HLIST_HEAD(&subscriptions->deferred_list); } ret = mm_take_all_locks(mm); @@ -610,34 +642,36 @@ int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm) * We can't race against any other mmu notifier method either * thanks to mm_take_all_locks(). * - * release semantics on the initialization of the mmu_notifier_mm's - * contents are provided for unlocked readers. acquire can only be - * used while holding the mmgrab or mmget, and is safe because once - * created the mmu_notififer_mm is not freed until the mm is - * destroyed. As above, users holding the mmap_sem or one of the + * release semantics on the initialization of the + * mmu_notifier_subscriptions's contents are provided for unlocked + * readers. acquire can only be used while holding the mmgrab or + * mmget, and is safe because once created the + * mmu_notifier_subscriptions is not freed until the mm is destroyed. + * As above, users holding the mmap_sem or one of the * mm_take_all_locks() do not need to use acquire semantics. */ - if (mmu_notifier_mm) - smp_store_release(&mm->mmu_notifier_mm, mmu_notifier_mm); + if (subscriptions) + smp_store_release(&mm->notifier_subscriptions, subscriptions); - if (mn) { + if (subscription) { /* Pairs with the mmdrop in mmu_notifier_unregister_* */ mmgrab(mm); - mn->mm = mm; - mn->users = 1; + subscription->mm = mm; + subscription->users = 1; - spin_lock(&mm->mmu_notifier_mm->lock); - hlist_add_head_rcu(&mn->hlist, &mm->mmu_notifier_mm->list); - spin_unlock(&mm->mmu_notifier_mm->lock); + spin_lock(&mm->notifier_subscriptions->lock); + hlist_add_head_rcu(&subscription->hlist, + &mm->notifier_subscriptions->list); + spin_unlock(&mm->notifier_subscriptions->lock); } else - mm->mmu_notifier_mm->has_itree = true; + mm->notifier_subscriptions->has_itree = true; mm_drop_all_locks(mm); BUG_ON(atomic_read(&mm->mm_users) <= 0); return 0; out_clean: - kfree(mmu_notifier_mm); + kfree(subscriptions); return ret; } EXPORT_SYMBOL_GPL(__mmu_notifier_register); @@ -658,15 +692,16 @@ EXPORT_SYMBOL_GPL(__mmu_notifier_register); * mmu_notifier_unregister() or mmu_notifier_put() must be always called to * unregister the notifier. * - * While the caller has a mmu_notifier get the mn->mm pointer will remain + * While the caller has a mmu_notifier get the subscription->mm pointer will remain * valid, and can be converted to an active mm pointer via mmget_not_zero(). */ -int mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm) +int mmu_notifier_register(struct mmu_notifier *subscription, + struct mm_struct *mm) { int ret; down_write(&mm->mmap_sem); - ret = __mmu_notifier_register(mn, mm); + ret = __mmu_notifier_register(subscription, mm); up_write(&mm->mmap_sem); return ret; } @@ -675,21 +710,22 @@ EXPORT_SYMBOL_GPL(mmu_notifier_register); static struct mmu_notifier * find_get_mmu_notifier(struct mm_struct *mm, const struct mmu_notifier_ops *ops) { - struct mmu_notifier *mn; + struct mmu_notifier *subscription; - spin_lock(&mm->mmu_notifier_mm->lock); - hlist_for_each_entry_rcu (mn, &mm->mmu_notifier_mm->list, hlist) { - if (mn->ops != ops) + spin_lock(&mm->notifier_subscriptions->lock); + hlist_for_each_entry_rcu(subscription, + &mm->notifier_subscriptions->list, hlist) { + if (subscription->ops != ops) continue; - if (likely(mn->users != UINT_MAX)) - mn->users++; + if (likely(subscription->users != UINT_MAX)) + subscription->users++; else - mn = ERR_PTR(-EOVERFLOW); - spin_unlock(&mm->mmu_notifier_mm->lock); - return mn; + subscription = ERR_PTR(-EOVERFLOW); + spin_unlock(&mm->notifier_subscriptions->lock); + return subscription; } - spin_unlock(&mm->mmu_notifier_mm->lock); + spin_unlock(&mm->notifier_subscriptions->lock); return NULL; } @@ -713,37 +749,37 @@ find_get_mmu_notifier(struct mm_struct *mm, const struct mmu_notifier_ops *ops) struct mmu_notifier *mmu_notifier_get_locked(const struct mmu_notifier_ops *ops, struct mm_struct *mm) { - struct mmu_notifier *mn; + struct mmu_notifier *subscription; int ret; lockdep_assert_held_write(&mm->mmap_sem); - if (mm->mmu_notifier_mm) { - mn = find_get_mmu_notifier(mm, ops); - if (mn) - return mn; + if (mm->notifier_subscriptions) { + subscription = find_get_mmu_notifier(mm, ops); + if (subscription) + return subscription; } - mn = ops->alloc_notifier(mm); - if (IS_ERR(mn)) - return mn; - mn->ops = ops; - ret = __mmu_notifier_register(mn, mm); + subscription = ops->alloc_notifier(mm); + if (IS_ERR(subscription)) + return subscription; + subscription->ops = ops; + ret = __mmu_notifier_register(subscription, mm); if (ret) goto out_free; - return mn; + return subscription; out_free: - mn->ops->free_notifier(mn); + subscription->ops->free_notifier(subscription); return ERR_PTR(ret); } EXPORT_SYMBOL_GPL(mmu_notifier_get_locked); /* this is called after the last mmu_notifier_unregister() returned */ -void __mmu_notifier_mm_destroy(struct mm_struct *mm) +void __mmu_notifier_subscriptions_destroy(struct mm_struct *mm) { - BUG_ON(!hlist_empty(&mm->mmu_notifier_mm->list)); - kfree(mm->mmu_notifier_mm); - mm->mmu_notifier_mm = LIST_POISON1; /* debug */ + BUG_ON(!hlist_empty(&mm->notifier_subscriptions->list)); + kfree(mm->notifier_subscriptions); + mm->notifier_subscriptions = LIST_POISON1; /* debug */ } /* @@ -756,11 +792,12 @@ void __mmu_notifier_mm_destroy(struct mm_struct *mm) * and only after mmu_notifier_unregister returned we're guaranteed * that ->release or any other method can't run anymore. */ -void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) +void mmu_notifier_unregister(struct mmu_notifier *subscription, + struct mm_struct *mm) { BUG_ON(atomic_read(&mm->mm_count) <= 0); - if (!hlist_unhashed(&mn->hlist)) { + if (!hlist_unhashed(&subscription->hlist)) { /* * SRCU here will force exit_mmap to wait for ->release to * finish before freeing the pages. @@ -772,17 +809,17 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) * exit_mmap will block in mmu_notifier_release to guarantee * that ->release is called before freeing the pages. */ - if (mn->ops->release) - mn->ops->release(mn, mm); + if (subscription->ops->release) + subscription->ops->release(subscription, mm); srcu_read_unlock(&srcu, id); - spin_lock(&mm->mmu_notifier_mm->lock); + spin_lock(&mm->notifier_subscriptions->lock); /* * Can not use list_del_rcu() since __mmu_notifier_release * can delete it before we hold the lock. */ - hlist_del_init_rcu(&mn->hlist); - spin_unlock(&mm->mmu_notifier_mm->lock); + hlist_del_init_rcu(&subscription->hlist); + spin_unlock(&mm->notifier_subscriptions->lock); } /* @@ -799,10 +836,11 @@ EXPORT_SYMBOL_GPL(mmu_notifier_unregister); static void mmu_notifier_free_rcu(struct rcu_head *rcu) { - struct mmu_notifier *mn = container_of(rcu, struct mmu_notifier, rcu); - struct mm_struct *mm = mn->mm; + struct mmu_notifier *subscription = + container_of(rcu, struct mmu_notifier, rcu); + struct mm_struct *mm = subscription->mm; - mn->ops->free_notifier(mn); + subscription->ops->free_notifier(subscription); /* Pairs with the get in __mmu_notifier_register() */ mmdrop(mm); } @@ -829,39 +867,40 @@ static void mmu_notifier_free_rcu(struct rcu_head *rcu) * Modules calling this function must call mmu_notifier_synchronize() in * their __exit functions to ensure the async work is completed. */ -void mmu_notifier_put(struct mmu_notifier *mn) +void mmu_notifier_put(struct mmu_notifier *subscription) { - struct mm_struct *mm = mn->mm; + struct mm_struct *mm = subscription->mm; - spin_lock(&mm->mmu_notifier_mm->lock); - if (WARN_ON(!mn->users) || --mn->users) + spin_lock(&mm->notifier_subscriptions->lock); + if (WARN_ON(!subscription->users) || --subscription->users) goto out_unlock; - hlist_del_init_rcu(&mn->hlist); - spin_unlock(&mm->mmu_notifier_mm->lock); + hlist_del_init_rcu(&subscription->hlist); + spin_unlock(&mm->notifier_subscriptions->lock); - call_srcu(&srcu, &mn->rcu, mmu_notifier_free_rcu); + call_srcu(&srcu, &subscription->rcu, mmu_notifier_free_rcu); return; out_unlock: - spin_unlock(&mm->mmu_notifier_mm->lock); + spin_unlock(&mm->notifier_subscriptions->lock); } EXPORT_SYMBOL_GPL(mmu_notifier_put); static int __mmu_interval_notifier_insert( - struct mmu_interval_notifier *mni, struct mm_struct *mm, - struct mmu_notifier_mm *mmn_mm, unsigned long start, + struct mmu_interval_notifier *interval_sub, struct mm_struct *mm, + struct mmu_notifier_subscriptions *subscriptions, unsigned long start, unsigned long length, const struct mmu_interval_notifier_ops *ops) { - mni->mm = mm; - mni->ops = ops; - RB_CLEAR_NODE(&mni->interval_tree.rb); - mni->interval_tree.start = start; + interval_sub->mm = mm; + interval_sub->ops = ops; + RB_CLEAR_NODE(&interval_sub->interval_tree.rb); + interval_sub->interval_tree.start = start; /* * Note that the representation of the intervals in the interval tree * considers the ending point as contained in the interval. */ if (length == 0 || - check_add_overflow(start, length - 1, &mni->interval_tree.last)) + check_add_overflow(start, length - 1, + &interval_sub->interval_tree.last)) return -EOVERFLOW; /* Must call with a mmget() held */ @@ -881,38 +920,40 @@ static int __mmu_interval_notifier_insert( * possibility for live lock, instead defer the add to * mn_itree_inv_end() so this algorithm is deterministic. * - * In all cases the value for the mni->invalidate_seq should be + * In all cases the value for the interval_sub->invalidate_seq should be * odd, see mmu_interval_read_begin() */ - spin_lock(&mmn_mm->lock); - if (mmn_mm->active_invalidate_ranges) { - if (mn_itree_is_invalidating(mmn_mm)) - hlist_add_head(&mni->deferred_item, - &mmn_mm->deferred_list); + spin_lock(&subscriptions->lock); + if (subscriptions->active_invalidate_ranges) { + if (mn_itree_is_invalidating(subscriptions)) + hlist_add_head(&interval_sub->deferred_item, + &subscriptions->deferred_list); else { - mmn_mm->invalidate_seq |= 1; - interval_tree_insert(&mni->interval_tree, - &mmn_mm->itree); + subscriptions->invalidate_seq |= 1; + interval_tree_insert(&interval_sub->interval_tree, + &subscriptions->itree); } - mni->invalidate_seq = mmn_mm->invalidate_seq; + interval_sub->invalidate_seq = subscriptions->invalidate_seq; } else { - WARN_ON(mn_itree_is_invalidating(mmn_mm)); + WARN_ON(mn_itree_is_invalidating(subscriptions)); /* - * The starting seq for a mni not under invalidation should be - * odd, not equal to the current invalidate_seq and + * The starting seq for a subscription not under invalidation + * should be odd, not equal to the current invalidate_seq and * invalidate_seq should not 'wrap' to the new seq any time * soon. */ - mni->invalidate_seq = mmn_mm->invalidate_seq - 1; - interval_tree_insert(&mni->interval_tree, &mmn_mm->itree); + interval_sub->invalidate_seq = + subscriptions->invalidate_seq - 1; + interval_tree_insert(&interval_sub->interval_tree, + &subscriptions->itree); } - spin_unlock(&mmn_mm->lock); + spin_unlock(&subscriptions->lock); return 0; } /** * mmu_interval_notifier_insert - Insert an interval notifier - * @mni: Interval notifier to register + * @interval_sub: Interval subscription to register * @start: Starting virtual address to monitor * @length: Length of the range to monitor * @mm : mm_struct to attach to @@ -925,53 +966,53 @@ static int __mmu_interval_notifier_insert( * The caller must use the normal interval notifier read flow via * mmu_interval_read_begin() to establish SPTEs for this range. */ -int mmu_interval_notifier_insert(struct mmu_interval_notifier *mni, +int mmu_interval_notifier_insert(struct mmu_interval_notifier *interval_sub, struct mm_struct *mm, unsigned long start, unsigned long length, const struct mmu_interval_notifier_ops *ops) { - struct mmu_notifier_mm *mmn_mm; + struct mmu_notifier_subscriptions *subscriptions; int ret; might_lock(&mm->mmap_sem); - mmn_mm = smp_load_acquire(&mm->mmu_notifier_mm); - if (!mmn_mm || !mmn_mm->has_itree) { + subscriptions = smp_load_acquire(&mm->notifier_subscriptions); + if (!subscriptions || !subscriptions->has_itree) { ret = mmu_notifier_register(NULL, mm); if (ret) return ret; - mmn_mm = mm->mmu_notifier_mm; + subscriptions = mm->notifier_subscriptions; } - return __mmu_interval_notifier_insert(mni, mm, mmn_mm, start, length, - ops); + return __mmu_interval_notifier_insert(interval_sub, mm, subscriptions, + start, length, ops); } EXPORT_SYMBOL_GPL(mmu_interval_notifier_insert); int mmu_interval_notifier_insert_locked( - struct mmu_interval_notifier *mni, struct mm_struct *mm, + struct mmu_interval_notifier *interval_sub, struct mm_struct *mm, unsigned long start, unsigned long length, const struct mmu_interval_notifier_ops *ops) { - struct mmu_notifier_mm *mmn_mm; + struct mmu_notifier_subscriptions *subscriptions = + mm->notifier_subscriptions; int ret; lockdep_assert_held_write(&mm->mmap_sem); - mmn_mm = mm->mmu_notifier_mm; - if (!mmn_mm || !mmn_mm->has_itree) { + if (!subscriptions || !subscriptions->has_itree) { ret = __mmu_notifier_register(NULL, mm); if (ret) return ret; - mmn_mm = mm->mmu_notifier_mm; + subscriptions = mm->notifier_subscriptions; } - return __mmu_interval_notifier_insert(mni, mm, mmn_mm, start, length, - ops); + return __mmu_interval_notifier_insert(interval_sub, mm, subscriptions, + start, length, ops); } EXPORT_SYMBOL_GPL(mmu_interval_notifier_insert_locked); /** * mmu_interval_notifier_remove - Remove a interval notifier - * @mni: Interval notifier to unregister + * @interval_sub: Interval subscription to unregister * * This function must be paired with mmu_interval_notifier_insert(). It cannot * be called from any ops callback. @@ -979,32 +1020,34 @@ EXPORT_SYMBOL_GPL(mmu_interval_notifier_insert_locked); * Once this returns ops callbacks are no longer running on other CPUs and * will not be called in future. */ -void mmu_interval_notifier_remove(struct mmu_interval_notifier *mni) +void mmu_interval_notifier_remove(struct mmu_interval_notifier *interval_sub) { - struct mm_struct *mm = mni->mm; - struct mmu_notifier_mm *mmn_mm = mm->mmu_notifier_mm; + struct mm_struct *mm = interval_sub->mm; + struct mmu_notifier_subscriptions *subscriptions = + mm->notifier_subscriptions; unsigned long seq = 0; might_sleep(); - spin_lock(&mmn_mm->lock); - if (mn_itree_is_invalidating(mmn_mm)) { + spin_lock(&subscriptions->lock); + if (mn_itree_is_invalidating(subscriptions)) { /* * remove is being called after insert put this on the * deferred list, but before the deferred list was processed. */ - if (RB_EMPTY_NODE(&mni->interval_tree.rb)) { - hlist_del(&mni->deferred_item); + if (RB_EMPTY_NODE(&interval_sub->interval_tree.rb)) { + hlist_del(&interval_sub->deferred_item); } else { - hlist_add_head(&mni->deferred_item, - &mmn_mm->deferred_list); - seq = mmn_mm->invalidate_seq; + hlist_add_head(&interval_sub->deferred_item, + &subscriptions->deferred_list); + seq = subscriptions->invalidate_seq; } } else { - WARN_ON(RB_EMPTY_NODE(&mni->interval_tree.rb)); - interval_tree_remove(&mni->interval_tree, &mmn_mm->itree); + WARN_ON(RB_EMPTY_NODE(&interval_sub->interval_tree.rb)); + interval_tree_remove(&interval_sub->interval_tree, + &subscriptions->itree); } - spin_unlock(&mmn_mm->lock); + spin_unlock(&subscriptions->lock); /* * The possible sleep on progress in the invalidation requires the @@ -1013,8 +1056,8 @@ void mmu_interval_notifier_remove(struct mmu_interval_notifier *mni) lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); lock_map_release(&__mmu_notifier_invalidate_range_start_map); if (seq) - wait_event(mmn_mm->wq, - READ_ONCE(mmn_mm->invalidate_seq) != seq); + wait_event(subscriptions->wq, + READ_ONCE(subscriptions->invalidate_seq) != seq); /* pairs with mmgrab in mmu_interval_notifier_insert() */ mmdrop(mm); diff --git a/tools/testing/selftests/pidfd/.gitignore b/tools/testing/selftests/pidfd/.gitignore index 8d069490e17b..3a779c084d96 100644 --- a/tools/testing/selftests/pidfd/.gitignore +++ b/tools/testing/selftests/pidfd/.gitignore @@ -2,3 +2,4 @@ pidfd_open_test pidfd_poll_test pidfd_test pidfd_wait +pidfd_getfd_test diff --git a/tools/testing/selftests/pidfd/Makefile b/tools/testing/selftests/pidfd/Makefile index 43db1b98e845..75a545861375 100644 --- a/tools/testing/selftests/pidfd/Makefile +++ b/tools/testing/selftests/pidfd/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only CFLAGS += -g -I../../../../usr/include/ -pthread -TEST_GEN_PROGS := pidfd_test pidfd_fdinfo_test pidfd_open_test pidfd_poll_test pidfd_wait +TEST_GEN_PROGS := pidfd_test pidfd_fdinfo_test pidfd_open_test pidfd_poll_test pidfd_wait pidfd_getfd_test include ../lib.mk diff --git a/tools/testing/selftests/pidfd/pidfd.h b/tools/testing/selftests/pidfd/pidfd.h index c6bc68329f4b..d482515604db 100644 --- a/tools/testing/selftests/pidfd/pidfd.h +++ b/tools/testing/selftests/pidfd/pidfd.h @@ -36,6 +36,10 @@ #define __NR_clone3 -1 #endif +#ifndef __NR_pidfd_getfd +#define __NR_pidfd_getfd -1 +#endif + /* * The kernel reserves 300 pids via RESERVED_PIDS in kernel/pid.c * That means, when it wraps around any pid < 300 will be skipped. @@ -84,4 +88,9 @@ static inline int sys_pidfd_send_signal(int pidfd, int sig, siginfo_t *info, return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags); } +static inline int sys_pidfd_getfd(int pidfd, int fd, int flags) +{ + return syscall(__NR_pidfd_getfd, pidfd, fd, flags); +} + #endif /* __PIDFD_H */ diff --git a/tools/testing/selftests/pidfd/pidfd_getfd_test.c b/tools/testing/selftests/pidfd/pidfd_getfd_test.c new file mode 100644 index 000000000000..401a7c1d0312 --- /dev/null +++ b/tools/testing/selftests/pidfd/pidfd_getfd_test.c @@ -0,0 +1,249 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "pidfd.h" +#include "../kselftest.h" +#include "../kselftest_harness.h" + +/* + * UNKNOWN_FD is an fd number that should never exist in the child, as it is + * used to check the negative case. + */ +#define UNKNOWN_FD 111 +#define UID_NOBODY 65535 + +static int sys_kcmp(pid_t pid1, pid_t pid2, int type, unsigned long idx1, + unsigned long idx2) +{ + return syscall(__NR_kcmp, pid1, pid2, type, idx1, idx2); +} + +static int sys_memfd_create(const char *name, unsigned int flags) +{ + return syscall(__NR_memfd_create, name, flags); +} + +static int __child(int sk, int memfd) +{ + int ret; + char buf; + + /* + * Ensure we don't leave around a bunch of orphaned children if our + * tests fail. + */ + ret = prctl(PR_SET_PDEATHSIG, SIGKILL); + if (ret) { + fprintf(stderr, "%s: Child could not set DEATHSIG\n", + strerror(errno)); + return -1; + } + + ret = send(sk, &memfd, sizeof(memfd), 0); + if (ret != sizeof(memfd)) { + fprintf(stderr, "%s: Child failed to send fd number\n", + strerror(errno)); + return -1; + } + + /* + * The fixture setup is completed at this point. The tests will run. + * + * This blocking recv enables the parent to message the child. + * Either we will read 'P' off of the sk, indicating that we need + * to disable ptrace, or we will read a 0, indicating that the other + * side has closed the sk. This occurs during fixture teardown time, + * indicating that the child should exit. + */ + while ((ret = recv(sk, &buf, sizeof(buf), 0)) > 0) { + if (buf == 'P') { + ret = prctl(PR_SET_DUMPABLE, 0); + if (ret < 0) { + fprintf(stderr, + "%s: Child failed to disable ptrace\n", + strerror(errno)); + return -1; + } + } else { + fprintf(stderr, "Child received unknown command %c\n", + buf); + return -1; + } + ret = send(sk, &buf, sizeof(buf), 0); + if (ret != 1) { + fprintf(stderr, "%s: Child failed to ack\n", + strerror(errno)); + return -1; + } + } + if (ret < 0) { + fprintf(stderr, "%s: Child failed to read from socket\n", + strerror(errno)); + return -1; + } + + return 0; +} + +static int child(int sk) +{ + int memfd, ret; + + memfd = sys_memfd_create("test", 0); + if (memfd < 0) { + fprintf(stderr, "%s: Child could not create memfd\n", + strerror(errno)); + ret = -1; + } else { + ret = __child(sk, memfd); + close(memfd); + } + + close(sk); + return ret; +} + +FIXTURE(child) +{ + /* + * remote_fd is the number of the FD which we are trying to retrieve + * from the child. + */ + int remote_fd; + /* pid points to the child which we are fetching FDs from */ + pid_t pid; + /* pidfd is the pidfd of the child */ + int pidfd; + /* + * sk is our side of the socketpair used to communicate with the child. + * When it is closed, the child will exit. + */ + int sk; +}; + +FIXTURE_SETUP(child) +{ + int ret, sk_pair[2]; + + ASSERT_EQ(0, socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair)) { + TH_LOG("%s: failed to create socketpair", strerror(errno)); + } + self->sk = sk_pair[0]; + + self->pid = fork(); + ASSERT_GE(self->pid, 0); + + if (self->pid == 0) { + close(sk_pair[0]); + if (child(sk_pair[1])) + _exit(EXIT_FAILURE); + _exit(EXIT_SUCCESS); + } + + close(sk_pair[1]); + + self->pidfd = sys_pidfd_open(self->pid, 0); + ASSERT_GE(self->pidfd, 0); + + /* + * Wait for the child to complete setup. It'll send the remote memfd's + * number when ready. + */ + ret = recv(sk_pair[0], &self->remote_fd, sizeof(self->remote_fd), 0); + ASSERT_EQ(sizeof(self->remote_fd), ret); +} + +FIXTURE_TEARDOWN(child) +{ + EXPECT_EQ(0, close(self->pidfd)); + EXPECT_EQ(0, close(self->sk)); + + EXPECT_EQ(0, wait_for_pid(self->pid)); +} + +TEST_F(child, disable_ptrace) +{ + int uid, fd; + char c; + + /* + * Turn into nobody if we're root, to avoid CAP_SYS_PTRACE + * + * The tests should run in their own process, so even this test fails, + * it shouldn't result in subsequent tests failing. + */ + uid = getuid(); + if (uid == 0) + ASSERT_EQ(0, seteuid(UID_NOBODY)); + + ASSERT_EQ(1, send(self->sk, "P", 1, 0)); + ASSERT_EQ(1, recv(self->sk, &c, 1, 0)); + + fd = sys_pidfd_getfd(self->pidfd, self->remote_fd, 0); + EXPECT_EQ(-1, fd); + EXPECT_EQ(EPERM, errno); + + if (uid == 0) + ASSERT_EQ(0, seteuid(0)); +} + +TEST_F(child, fetch_fd) +{ + int fd, ret; + + fd = sys_pidfd_getfd(self->pidfd, self->remote_fd, 0); + ASSERT_GE(fd, 0); + + EXPECT_EQ(0, sys_kcmp(getpid(), self->pid, KCMP_FILE, fd, self->remote_fd)); + + ret = fcntl(fd, F_GETFD); + ASSERT_GE(ret, 0); + EXPECT_GE(ret & FD_CLOEXEC, 0); + + close(fd); +} + +TEST_F(child, test_unknown_fd) +{ + int fd; + + fd = sys_pidfd_getfd(self->pidfd, UNKNOWN_FD, 0); + EXPECT_EQ(-1, fd) { + TH_LOG("getfd succeeded while fetching unknown fd"); + }; + EXPECT_EQ(EBADF, errno) { + TH_LOG("%s: getfd did not get EBADF", strerror(errno)); + } +} + +TEST(flags_set) +{ + ASSERT_EQ(-1, sys_pidfd_getfd(0, 0, 1)); + EXPECT_EQ(errno, EINVAL); +} + +#if __NR_pidfd_getfd == -1 +int main(void) +{ + fprintf(stderr, "__NR_pidfd_getfd undefined. The pidfd_getfd syscall is unavailable. Test aborting\n"); + return KSFT_SKIP; +} +#else +TEST_HARNESS_MAIN +#endif