mirror of
https://github.com/torvalds/linux.git
synced 2026-05-26 16:12:59 +02:00
pidfd: add CLONE_PIDFD_AUTOKILL
Add a new clone3() flag CLONE_PIDFD_AUTOKILL that ties a child's lifetime to the pidfd returned from clone3(). When the last reference to the struct file created by clone3() is closed, the kernel sends SIGKILL to the child. A pidfd obtained via pidfd_open() for the same process does not keep the child alive and does not trigger autokill - only the specific struct file from clone3() has this property.

This is useful for container runtimes, service managers, and sandboxed subprocess execution - any scenario where the child must die if the parent crashes or abandons the pidfd.

CLONE_PIDFD_AUTOKILL requires both CLONE_PIDFD (the whole point is tying lifetime to the pidfd file) and CLONE_AUTOREAP (a killed child with no one to reap it would become a zombie). CLONE_THREAD is rejected because autokill targets a process, not a thread.

The clone3 pidfd is identified by the PIDFD_AUTOKILL file flag set on the struct file at clone3() time. The pidfs .release handler checks this flag and sends SIGKILL via do_send_sig_info(SIGKILL, SEND_SIG_PRIV, ...) only when it is set. Files from pidfd_open() or open_by_handle_at() are distinct struct files that do not carry this flag. dup()/fork() share the same struct file, so they extend the child's lifetime until the last reference drops.

CLONE_PIDFD_AUTOKILL uses a privilege model based on CLONE_NNP: without CLONE_NNP the child could escalate privileges via setuid/setgid exec after being spawned, so the caller must have CAP_SYS_ADMIN in its user namespace. With CLONE_NNP the child can never gain new privileges, so unprivileged usage is allowed. This is a deliberate departure from the pdeath_signal model, which is reset during secureexec and commit_creds(), rendering it useless for container runtimes that need to deprivilege themselves.

Link: https://patch.msgid.link/20260226-work-pidfs-autoreap-v5-3-d148b984a989@kernel.org
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
This commit is contained in:
parent
24baca56fa
commit
c8134b5f13
38
fs/pidfs.c
38
fs/pidfs.c
|
|
@ -8,6 +8,8 @@
|
|||
#include <linux/mount.h>
|
||||
#include <linux/pid.h>
|
||||
#include <linux/pidfs.h>
|
||||
#include <linux/sched/signal.h>
|
||||
#include <linux/signal.h>
|
||||
#include <linux/pid_namespace.h>
|
||||
#include <linux/poll.h>
|
||||
#include <linux/proc_fs.h>
|
||||
|
|
@ -637,7 +639,28 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
|
|||
return open_namespace(ns_common);
|
||||
}
|
||||
|
||||
static int pidfs_file_release(struct inode *inode, struct file *file)
|
||||
{
|
||||
struct pid *pid = inode->i_private;
|
||||
struct task_struct *task;
|
||||
|
||||
if (!(file->f_flags & PIDFD_AUTOKILL))
|
||||
return 0;
|
||||
|
||||
guard(rcu)();
|
||||
task = pid_task(pid, PIDTYPE_TGID);
|
||||
if (!task)
|
||||
return 0;
|
||||
|
||||
/* Not available for kthreads or user workers for now. */
|
||||
if (WARN_ON_ONCE(task->flags & (PF_KTHREAD | PF_USER_WORKER)))
|
||||
return 0;
|
||||
do_send_sig_info(SIGKILL, SEND_SIG_PRIV, task, PIDTYPE_TGID);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static const struct file_operations pidfs_file_operations = {
|
||||
.release = pidfs_file_release,
|
||||
.poll = pidfd_poll,
|
||||
#ifdef CONFIG_PROC_FS
|
||||
.show_fdinfo = pidfd_show_fdinfo,
|
||||
|
|
@ -1093,11 +1116,11 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
|
|||
int ret;
|
||||
|
||||
/*
|
||||
* Ensure that PIDFD_STALE can be passed as a flag without
|
||||
* overloading other uapi pidfd flags.
|
||||
* Ensure that internal pidfd flags don't overlap with each
|
||||
* other or with uapi pidfd flags.
|
||||
*/
|
||||
BUILD_BUG_ON(PIDFD_STALE == PIDFD_THREAD);
|
||||
BUILD_BUG_ON(PIDFD_STALE == PIDFD_NONBLOCK);
|
||||
BUILD_BUG_ON(hweight32(PIDFD_THREAD | PIDFD_NONBLOCK |
|
||||
PIDFD_STALE | PIDFD_AUTOKILL) != 4);
|
||||
|
||||
ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path);
|
||||
if (ret < 0)
|
||||
|
|
@ -1108,9 +1131,12 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
|
|||
flags &= ~PIDFD_STALE;
|
||||
flags |= O_RDWR;
|
||||
pidfd_file = dentry_open(&path, flags, current_cred());
|
||||
/* Raise PIDFD_THREAD explicitly as do_dentry_open() strips it. */
|
||||
/*
|
||||
* Raise PIDFD_THREAD and PIDFD_AUTOKILL explicitly as
|
||||
* do_dentry_open() strips O_EXCL and O_TRUNC.
|
||||
*/
|
||||
if (!IS_ERR(pidfd_file))
|
||||
pidfd_file->f_flags |= (flags & PIDFD_THREAD);
|
||||
pidfd_file->f_flags |= (flags & (PIDFD_THREAD | PIDFD_AUTOKILL));
|
||||
|
||||
return pidfd_file;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -13,6 +13,7 @@
|
|||
#ifdef __KERNEL__
|
||||
#include <linux/sched.h>
|
||||
#define PIDFD_STALE CLONE_PIDFD
|
||||
#define PIDFD_AUTOKILL O_TRUNC
|
||||
#endif
|
||||
|
||||
/* Flags for pidfd_send_signal(). */
|
||||
|
|
|
|||
|
|
@ -38,6 +38,7 @@
|
|||
#define CLONE_INTO_CGROUP (1ULL << 33) /* Clone into a specific cgroup given the right permissions. */
|
||||
#define CLONE_AUTOREAP (1ULL << 34) /* Auto-reap child on exit. */
|
||||
#define CLONE_NNP (1ULL << 35) /* Set no_new_privs on child. */
|
||||
#define CLONE_PIDFD_AUTOKILL (1ULL << 36) /* Kill child when clone pidfd closes. */
|
||||
|
||||
/*
|
||||
* cloning flags intersect with CSIGNAL so can be used with unshare and clone3
|
||||
|
|
|
|||
|
|
@ -2045,6 +2045,24 @@ __latent_entropy struct task_struct *copy_process(
|
|||
return ERR_PTR(-EINVAL);
|
||||
}
|
||||
|
||||
if (clone_flags & CLONE_PIDFD_AUTOKILL) {
|
||||
if (!(clone_flags & CLONE_PIDFD))
|
||||
return ERR_PTR(-EINVAL);
|
||||
if (!(clone_flags & CLONE_AUTOREAP))
|
||||
return ERR_PTR(-EINVAL);
|
||||
if (clone_flags & CLONE_THREAD)
|
||||
return ERR_PTR(-EINVAL);
|
||||
/*
|
||||
* Without CLONE_NNP the child could escalate privileges
|
||||
* after being spawned, so require CAP_SYS_ADMIN.
|
||||
* With CLONE_NNP the child can't gain new privileges,
|
||||
* so allow unprivileged usage.
|
||||
*/
|
||||
if (!(clone_flags & CLONE_NNP) &&
|
||||
!ns_capable(current_user_ns(), CAP_SYS_ADMIN))
|
||||
return ERR_PTR(-EPERM);
|
||||
}
|
||||
|
||||
/*
|
||||
* Force any signals received before this point to be delivered
|
||||
* before the fork happens. Collect up signals sent to multiple
|
||||
|
|
@ -2267,13 +2285,18 @@ __latent_entropy struct task_struct *copy_process(
|
|||
* if the fd table isn't shared).
|
||||
*/
|
||||
if (clone_flags & CLONE_PIDFD) {
|
||||
int flags = (clone_flags & CLONE_THREAD) ? PIDFD_THREAD : 0;
|
||||
unsigned flags = PIDFD_STALE;
|
||||
|
||||
if (clone_flags & CLONE_THREAD)
|
||||
flags |= PIDFD_THREAD;
|
||||
if (clone_flags & CLONE_PIDFD_AUTOKILL)
|
||||
flags |= PIDFD_AUTOKILL;
|
||||
|
||||
/*
|
||||
* Note that no task has been attached to @pid yet; indicate
|
||||
* that via PIDFD_STALE (defined as CLONE_PIDFD).
|
||||
*/
|
||||
retval = pidfd_prepare(pid, flags | PIDFD_STALE, &pidfile);
|
||||
retval = pidfd_prepare(pid, flags, &pidfile);
|
||||
if (retval < 0)
|
||||
goto bad_fork_free_pid;
|
||||
pidfd = retval;
|
||||
|
|
@ -2920,7 +2943,7 @@ static bool clone3_args_valid(struct kernel_clone_args *kargs)
|
|||
/* Verify that no unknown flags are passed along. */
|
||||
if (kargs->flags &
|
||||
~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP |
|
||||
CLONE_AUTOREAP | CLONE_NNP))
|
||||
CLONE_AUTOREAP | CLONE_NNP | CLONE_PIDFD_AUTOKILL))
|
||||
return false;
|
||||
|
||||
/*
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user