Merge patch series "fhandle, pidfs: allow open_by_handle_at() purely based on file handle"

Christian Brauner <brauner@kernel.org> says:

Various filesystems such as pidfs and drm support opening file handles
without having to require a file descriptor to identify the filesystem.
The filesystem are global single instances and can be trivially
identified solely on the information encoded in the file handle.

This makes it possible to not have to keep or acquire a sentinal file
descriptor just to pass it to open_by_handle_at() to identify the
filesystem. That's especially useful when such sentinel file descriptor
cannot or should not be acquired.

For pidfs this means a file handle can function as full replacement for
storing a pid in a file. Instead a file handle can be stored and
reopened purely based on the file handle.

Such autonomous file handles can be opened with or without specifying a
a file descriptor. If no proper file descriptor is used the
FD_PIDFS_ROOT sentinel must be passed. This allows us to define further
special negative fd sentinels in the future.

Userspace can trivially test for support by trying to open the file
handle with an invalid file descriptor.

* patches from https://lore.kernel.org/20250624-work-pidfs-fhandle-v2-0-d02a04858fe3@kernel.org:
  selftests/pidfd: decode pidfd file handles withou having to specify an fd
  fhandle, pidfs: support open_by_handle_at() purely based on file handle
  uapi/fcntl: add FD_PIDFS_ROOT
  uapi/fcntl: add FD_INVALID
  uapi/fcntl: mark range as reserved
  fhandle: reflow get_path_anchor()
  pidfs: add pidfs_root_path() helper
  fhandle: rename to get_path_anchor()
  fhandle: hoist copy_from_user() above get_path_from_fd()
  fhandle: raise FILEID_IS_DIR in handle_type

Link: https://lore.kernel.org/20250624-work-pidfs-fhandle-v2-0-d02a04858fe3@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
This commit is contained in:
Christian Brauner 2025-06-24 13:00:16 +02:00
commit 867673063e
No known key found for this signature in database
GPG Key ID: 91C61BC06578DCA2
8 changed files with 129 additions and 48 deletions

View File

@ -88,7 +88,7 @@ static long do_sys_name_to_handle(const struct path *path,
if (fh_flags & EXPORT_FH_CONNECTABLE) {
handle->handle_type |= FILEID_IS_CONNECTABLE;
if (d_is_dir(path->dentry))
fh_flags |= FILEID_IS_DIR;
handle->handle_type |= FILEID_IS_DIR;
}
retval = 0;
}
@ -168,23 +168,32 @@ SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name,
return err;
}
static int get_path_from_fd(int fd, struct path *root)
static int get_path_anchor(int fd, struct path *root)
{
if (fd >= 0) {
CLASS(fd, f)(fd);
if (fd_empty(f))
return -EBADF;
*root = fd_file(f)->f_path;
path_get(root);
return 0;
}
if (fd == AT_FDCWD) {
struct fs_struct *fs = current->fs;
spin_lock(&fs->lock);
*root = fs->pwd;
path_get(root);
spin_unlock(&fs->lock);
} else {
CLASS(fd, f)(fd);
if (fd_empty(f))
return -EBADF;
*root = fd_file(f)->f_path;
path_get(root);
return 0;
}
return 0;
if (fd == FD_PIDFS_ROOT) {
pidfs_get_root(root);
return 0;
}
return -EBADF;
}
static int vfs_dentry_acceptable(void *context, struct dentry *dentry)
@ -323,13 +332,24 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh,
{
int retval = 0;
struct file_handle f_handle;
struct file_handle *handle = NULL;
struct file_handle *handle __free(kfree) = NULL;
struct handle_to_path_ctx ctx = {};
const struct export_operations *eops;
retval = get_path_from_fd(mountdirfd, &ctx.root);
if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle)))
return -EFAULT;
if ((f_handle.handle_bytes > MAX_HANDLE_SZ) ||
(f_handle.handle_bytes == 0))
return -EINVAL;
if (f_handle.handle_type < 0 ||
FILEID_USER_FLAGS(f_handle.handle_type) & ~FILEID_VALID_USER_FLAGS)
return -EINVAL;
retval = get_path_anchor(mountdirfd, &ctx.root);
if (retval)
goto out_err;
return retval;
eops = ctx.root.mnt->mnt_sb->s_export_op;
if (eops && eops->permission)
@ -339,21 +359,6 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh,
if (retval)
goto out_path;
if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) {
retval = -EFAULT;
goto out_path;
}
if ((f_handle.handle_bytes > MAX_HANDLE_SZ) ||
(f_handle.handle_bytes == 0)) {
retval = -EINVAL;
goto out_path;
}
if (f_handle.handle_type < 0 ||
FILEID_USER_FLAGS(f_handle.handle_type) & ~FILEID_VALID_USER_FLAGS) {
retval = -EINVAL;
goto out_path;
}
handle = kmalloc(struct_size(handle, f_handle, f_handle.handle_bytes),
GFP_KERNEL);
if (!handle) {
@ -366,7 +371,7 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh,
&ufh->f_handle,
f_handle.handle_bytes)) {
retval = -EFAULT;
goto out_handle;
goto out_path;
}
/*
@ -384,11 +389,8 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh,
handle->handle_type &= ~FILEID_USER_FLAGS_MASK;
retval = do_handle_to_path(handle, path, &ctx);
out_handle:
kfree(handle);
out_path:
path_put(&ctx.root);
out_err:
return retval;
}

View File

@ -353,3 +353,4 @@ int anon_inode_getattr(struct mnt_idmap *idmap, const struct path *path,
unsigned int query_flags);
int anon_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
struct iattr *attr);
void pidfs_get_root(struct path *path);

View File

@ -31,6 +31,14 @@
static struct kmem_cache *pidfs_attr_cachep __ro_after_init;
static struct kmem_cache *pidfs_xattr_cachep __ro_after_init;
static struct path pidfs_root_path = {};
void pidfs_get_root(struct path *path)
{
*path = pidfs_root_path;
path_get(path);
}
/*
* Stashes information that userspace needs to access even after the
* process has been reaped.
@ -1068,4 +1076,7 @@ void __init pidfs_init(void)
pidfs_mnt = kern_mount(&pidfs_type);
if (IS_ERR(pidfs_mnt))
panic("Failed to mount pidfs pseudo filesystem");
pidfs_root_path.mnt = pidfs_mnt;
pidfs_root_path.dentry = pidfs_mnt->mnt_root;
}

View File

@ -90,10 +90,28 @@
#define DN_ATTRIB 0x00000020 /* File changed attibutes */
#define DN_MULTISHOT 0x80000000 /* Don't remove notifier */
/* Reserved kernel ranges [-100], [-10000, -40000]. */
#define AT_FDCWD -100 /* Special value for dirfd used to
indicate openat should use the
current working directory. */
/*
* The concept of process and threads in userland and the kernel is a confusing
* one - within the kernel every thread is a 'task' with its own individual PID,
* however from userland's point of view threads are grouped by a single PID,
* which is that of the 'thread group leader', typically the first thread
* spawned.
*
* To cut the Gideon knot, for internal kernel usage, we refer to
* PIDFD_SELF_THREAD to refer to the current thread (or task from a kernel
* perspective), and PIDFD_SELF_THREAD_GROUP to refer to the current thread
* group leader...
*/
#define PIDFD_SELF_THREAD -10000 /* Current thread. */
#define PIDFD_SELF_THREAD_GROUP -10001 /* Current thread group leader. */
#define FD_PIDFS_ROOT -10002 /* Root of the pidfs filesystem */
#define FD_INVALID -10009 /* Invalid file descriptor: -10000 - EBADF = -10009 */
/* Generic flags for the *at(2) family of syscalls. */

View File

@ -42,21 +42,6 @@
#define PIDFD_COREDUMP_USER (1U << 2) /* coredump was done as the user. */
#define PIDFD_COREDUMP_ROOT (1U << 3) /* coredump was done as root. */
/*
* The concept of process and threads in userland and the kernel is a confusing
* one - within the kernel every thread is a 'task' with its own individual PID,
* however from userland's point of view threads are grouped by a single PID,
* which is that of the 'thread group leader', typically the first thread
* spawned.
*
* To cut the Gideon knot, for internal kernel usage, we refer to
* PIDFD_SELF_THREAD to refer to the current thread (or task from a kernel
* perspective), and PIDFD_SELF_THREAD_GROUP to refer to the current thread
* group leader...
*/
#define PIDFD_SELF_THREAD -10000 /* Current thread. */
#define PIDFD_SELF_THREAD_GROUP -20000 /* Current thread group leader. */
/*
* ...and for userland we make life simpler - PIDFD_SELF refers to the current
* thread, PIDFD_SELF_PROCESS refers to the process thread group leader.

View File

@ -1,5 +1,5 @@
# SPDX-License-Identifier: GPL-2.0-only
CFLAGS += -g $(KHDR_INCLUDES) -pthread -Wall
CFLAGS += -g $(KHDR_INCLUDES) $(TOOLS_INCLUDES) -pthread -Wall
TEST_GEN_PROGS := pidfd_test pidfd_fdinfo_test pidfd_open_test \
pidfd_poll_test pidfd_wait pidfd_getfd_test pidfd_setns_test \

View File

@ -19,6 +19,10 @@
#include "../kselftest.h"
#include "../clone3/clone3_selftests.h"
#ifndef FD_PIDFS_ROOT
#define FD_PIDFS_ROOT -10002
#endif
#ifndef P_PIDFD
#define P_PIDFD 3
#endif
@ -56,7 +60,7 @@
#endif
#ifndef PIDFD_SELF_THREAD_GROUP
#define PIDFD_SELF_THREAD_GROUP -20000 /* Current thread group leader. */
#define PIDFD_SELF_THREAD_GROUP -10001 /* Current thread group leader. */
#endif
#ifndef PIDFD_SELF

View File

@ -500,4 +500,64 @@ TEST_F(file_handle, valid_name_to_handle_at_flags)
ASSERT_EQ(close(pidfd), 0);
}
/*
* That we decode a file handle without having to pass a pidfd.
*/
TEST_F(file_handle, decode_purely_based_on_file_handle)
{
int mnt_id;
struct file_handle *fh;
int pidfd = -EBADF;
struct stat st1, st2;
fh = malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ);
ASSERT_NE(fh, NULL);
memset(fh, 0, sizeof(struct file_handle) + MAX_HANDLE_SZ);
fh->handle_bytes = MAX_HANDLE_SZ;
ASSERT_EQ(name_to_handle_at(self->child_pidfd1, "", fh, &mnt_id, AT_EMPTY_PATH), 0);
ASSERT_EQ(fstat(self->child_pidfd1, &st1), 0);
pidfd = open_by_handle_at(FD_PIDFS_ROOT, fh, 0);
ASSERT_GE(pidfd, 0);
ASSERT_EQ(fstat(pidfd, &st2), 0);
ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino);
ASSERT_EQ(close(pidfd), 0);
pidfd = open_by_handle_at(FD_PIDFS_ROOT, fh, O_CLOEXEC);
ASSERT_GE(pidfd, 0);
ASSERT_EQ(fstat(pidfd, &st2), 0);
ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino);
ASSERT_EQ(close(pidfd), 0);
pidfd = open_by_handle_at(FD_PIDFS_ROOT, fh, O_NONBLOCK);
ASSERT_GE(pidfd, 0);
ASSERT_EQ(fstat(pidfd, &st2), 0);
ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino);
ASSERT_EQ(close(pidfd), 0);
pidfd = open_by_handle_at(self->pidfd, fh, 0);
ASSERT_GE(pidfd, 0);
ASSERT_EQ(fstat(pidfd, &st2), 0);
ASSERT_TRUE(st1.st_dev == st2.st_dev && st1.st_ino == st2.st_ino);
ASSERT_EQ(close(pidfd), 0);
pidfd = open_by_handle_at(-EBADF, fh, 0);
ASSERT_LT(pidfd, 0);
pidfd = open_by_handle_at(AT_FDCWD, fh, 0);
ASSERT_LT(pidfd, 0);
free(fh);
}
TEST_HARNESS_MAIN