Merge patch series "fsmount: add FSMOUNT_NAMESPACE"

Christian Brauner <brauner@kernel.org> says:

Add FSMOUNT_NAMESPACE flag to fsmount() that creates a new mount
namespace with the newly created filesystem attached to a copy of the
real rootfs. This returns a namespace file descriptor instead of an
O_PATH mount fd, similar to how OPEN_TREE_NAMESPACE works for
open_tree().

This allows creating a new filesystem and immediately placing it in a
new mount namespace in a single operation, which is useful for container
runtimes and other namespace-based isolation mechanisms.

This accompanies OPEN_TREE_NAMESPACE and avoids a needless detour via
OPEN_TREE_NAMESPACE to get the same effect. It will be especially useful
when mounting an actual filesystem to be used as the container rootfs.

* patches from https://patch.msgid.link/20260122-work-fsmount-namespace-v1-0-5ef0a886e646@kernel.org:
  selftests/open_tree_ns: fix compilation
  selftests: add FSMOUNT_NAMESPACE tests
  selftests/statmount: add statmount_alloc() helper
  tools: update mount.h header
  mount: add FSMOUNT_NAMESPACE
  mount: simplify __do_loopback()
  mount: start iterating from start of rbtree

Link: https://patch.msgid.link/20260122-work-fsmount-namespace-v1-0-5ef0a886e646@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
This commit is contained in:
Christian Brauner 2026-01-23 16:58:20 +01:00
commit 0209e31659
No known key found for this signature in database
GPG Key ID: 91C61BC06578DCA2
10 changed files with 1271 additions and 112 deletions

View File

@ -2958,10 +2958,9 @@ static inline bool may_copy_tree(const struct path *path)
}
static struct mount *__do_loopback(const struct path *old_path,
unsigned int flags, unsigned int copy_flags)
bool recurse, unsigned int copy_flags)
{
struct mount *old = real_mount(old_path->mnt);
bool recurse = flags & AT_RECURSIVE;
if (IS_MNT_UNBINDABLE(old))
return ERR_PTR(-EINVAL);
@ -2972,18 +2971,6 @@ static struct mount *__do_loopback(const struct path *old_path,
if (!recurse && __has_locked_children(old, old_path->dentry))
return ERR_PTR(-EINVAL);
/*
* When creating a new mount namespace we don't want to copy over
* mounts of mount namespaces to avoid the risk of cycles and also to
* minimize the default complex interdependencies between mount
* namespaces.
*
* We could ofc just check whether all mount namespace files aren't
* creating cycles but really let's keep this simple.
*/
if (!(flags & OPEN_TREE_NAMESPACE))
copy_flags |= CL_COPY_MNT_NS_FILE;
if (recurse)
return copy_tree(old, old_path->dentry, copy_flags);
@ -2998,7 +2985,6 @@ static int do_loopback(const struct path *path, const char *old_name,
{
struct path old_path __free(path_put) = {};
struct mount *mnt = NULL;
unsigned int flags = recurse ? AT_RECURSIVE : 0;
int err;
if (!old_name || !*old_name)
@ -3017,7 +3003,7 @@ static int do_loopback(const struct path *path, const char *old_name,
if (!check_mnt(mp.parent))
return -EINVAL;
mnt = __do_loopback(&old_path, flags, 0);
mnt = __do_loopback(&old_path, recurse, CL_COPY_MNT_NS_FILE);
if (IS_ERR(mnt))
return PTR_ERR(mnt);
@ -3055,7 +3041,7 @@ static struct mnt_namespace *get_detached_copy(const struct path *path, unsigned
ns->seq_origin = src_mnt_ns->ns.ns_id;
}
mnt = __do_loopback(path, flags, 0);
mnt = __do_loopback(path, (flags & AT_RECURSIVE), CL_COPY_MNT_NS_FILE);
if (IS_ERR(mnt)) {
emptied_ns = ns;
return ERR_CAST(mnt);
@ -3087,7 +3073,8 @@ static struct file *open_detached_copy(struct path *path, unsigned int flags)
return file;
}
static struct mnt_namespace *create_new_namespace(struct path *path, unsigned int flags)
static struct mnt_namespace *create_new_namespace(struct path *path,
bool recurse)
{
struct mnt_namespace *ns = current->nsproxy->mnt_ns;
struct user_namespace *user_ns = current_user_ns();
@ -3131,11 +3118,26 @@ static struct mnt_namespace *create_new_namespace(struct path *path, unsigned in
}
/*
* We don't emulate unshare()ing a mount namespace. We stick
* to the restrictions of creating detached bind-mounts. It
* has a lot saner and simpler semantics.
* We don't emulate unshare()ing a mount namespace. We stick to
* the restrictions of creating detached bind-mounts. It has a
* lot saner and simpler semantics.
*/
mnt = __do_loopback(path, flags, copy_flags);
mnt = real_mount(path->mnt);
if (!mnt->mnt_ns) {
/*
* If we're moving into a new mount namespace via
* fsmount(), swap the mount ids so the nullfs mount id
* is the lowest in the mount namespace, avoiding another
* useless copy. This is fine because we're not attached
* to any mount namespace, so the mount ids are pure
* decoration at that point.
*/
swap(mnt->mnt_id_unique, new_ns_root->mnt_id_unique);
swap(mnt->mnt_id, new_ns_root->mnt_id);
mntget(&mnt->mnt);
} else {
mnt = __do_loopback(path, recurse, copy_flags);
}
scoped_guard(mount_writer) {
if (IS_ERR(mnt)) {
emptied_ns = new_ns;
@ -3164,11 +3166,11 @@ static struct mnt_namespace *create_new_namespace(struct path *path, unsigned in
return new_ns;
}
static struct file *open_new_namespace(struct path *path, unsigned int flags)
static struct file *open_new_namespace(struct path *path, bool recurse)
{
struct mnt_namespace *new_ns;
new_ns = create_new_namespace(path, flags);
new_ns = create_new_namespace(path, recurse);
if (IS_ERR(new_ns))
return ERR_CAST(new_ns);
return open_namespace_file(to_ns_common(new_ns));
@ -3217,7 +3219,7 @@ static struct file *vfs_open_tree(int dfd, const char __user *filename, unsigned
return ERR_PTR(ret);
if (flags & OPEN_TREE_NAMESPACE)
return open_new_namespace(&path, flags);
return open_new_namespace(&path, (flags & AT_RECURSIVE));
if (flags & OPEN_TREE_CLONE)
return open_detached_copy(&path, flags);
@ -4414,11 +4416,15 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
unsigned int mnt_flags = 0;
long ret;
if (!may_mount())
if ((flags & ~(FSMOUNT_CLOEXEC | FSMOUNT_NAMESPACE)) != 0)
return -EINVAL;
if ((flags & FSMOUNT_NAMESPACE) &&
!ns_capable(current_user_ns(), CAP_SYS_ADMIN))
return -EPERM;
if ((flags & ~(FSMOUNT_CLOEXEC)) != 0)
return -EINVAL;
if (!(flags & FSMOUNT_NAMESPACE) && !may_mount())
return -EPERM;
if (attr_flags & ~FSMOUNT_VALID_FLAGS)
return -EINVAL;
@ -4485,6 +4491,10 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
*/
vfs_clean_context(fc);
if (flags & FSMOUNT_NAMESPACE)
return FD_ADD((flags & FSMOUNT_CLOEXEC) ? O_CLOEXEC : 0,
open_new_namespace(&new_path, 0));
ns = alloc_mnt_ns(current->nsproxy->mnt_ns->user_ns, true);
if (IS_ERR(ns))
return PTR_ERR(ns);
@ -5649,14 +5659,14 @@ static int grab_requested_root(struct mnt_namespace *ns, struct path *root)
if (mnt_ns_empty(ns))
return -ENOENT;
first = child = ns->root;
for (;;) {
child = listmnt_next(child, false);
if (!child)
return -ENOENT;
if (child->mnt_parent == first)
first = ns->root;
for (child = node_to_mount(ns->mnt_first_node); child;
child = listmnt_next(child, false)) {
if (child != first && child->mnt_parent == first)
break;
}
if (!child)
return -ENOENT;
root->mnt = mntget(&child->mnt);
root->dentry = dget(root->mnt->mnt_root);

View File

@ -110,6 +110,7 @@ enum fsconfig_command {
* fsmount() flags.
*/
#define FSMOUNT_CLOEXEC 0x00000001
#define FSMOUNT_NAMESPACE 0x00000002 /* Create the mount in a new mount namespace */
/*
* Mount attributes.

View File

@ -61,7 +61,8 @@
/*
* open_tree() flags.
*/
#define OPEN_TREE_CLONE 1 /* Clone the target tree and attach the clone */
#define OPEN_TREE_CLONE (1 << 0) /* Clone the target tree and attach the clone */
#define OPEN_TREE_NAMESPACE (1 << 1) /* Clone the target tree into a new mount namespace */
#define OPEN_TREE_CLOEXEC O_CLOEXEC /* Close the file on execve() */
/*
@ -109,6 +110,7 @@ enum fsconfig_command {
* fsmount() flags.
*/
#define FSMOUNT_CLOEXEC 0x00000001
#define FSMOUNT_NAMESPACE 0x00000002 /* Create the mount in a new mount namespace */
/*
* Mount attributes.
@ -197,7 +199,10 @@ struct statmount {
*/
struct mnt_id_req {
__u32 size;
__u32 spare;
union {
__u32 mnt_ns_fd;
__u32 mnt_fd;
};
__u64 mnt_id;
__u64 param;
__u64 mnt_ns_id;
@ -232,4 +237,9 @@ struct mnt_id_req {
#define LSMT_ROOT 0xffffffffffffffff /* root mount */
#define LISTMOUNT_REVERSE (1 << 0) /* List later mounts first */
/*
* @flag bits for statmount(2)
*/
#define STATMOUNT_BY_FD 0x00000001U /* want mountinfo for given fd */
#endif /* _UAPI_LINUX_MOUNT_H */

View File

@ -0,0 +1 @@
fsmount_ns_test

View File

@ -0,0 +1,10 @@
# SPDX-License-Identifier: GPL-2.0
TEST_GEN_PROGS := fsmount_ns_test
CFLAGS += -Wall -O0 -g $(KHDR_INCLUDES) $(TOOLS_INCLUDES)
LDLIBS := -lcap
include ../../lib.mk
$(OUTPUT)/fsmount_ns_test: fsmount_ns_test.c ../utils.c
$(CC) $(CFLAGS) -o $@ $^ $(LDLIBS)

File diff suppressed because it is too large Load Diff

View File

@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
TEST_GEN_PROGS := open_tree_ns_test
CFLAGS := -Wall -Werror -g $(KHDR_INCLUDES)
CFLAGS += -Wall -O0 -g $(KHDR_INCLUDES) $(TOOLS_INCLUDES)
LDLIBS := -lcap
include ../../lib.mk

View File

@ -1,5 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2026 Christian Brauner <brauner@kernel.org>
*
* Test for OPEN_TREE_NAMESPACE flag.
*
* Test that open_tree() with OPEN_TREE_NAMESPACE creates a new mount
@ -50,31 +52,6 @@ static int get_mnt_ns_id_from_path(const char *path, uint64_t *mnt_ns_id)
return ret;
}
#define STATMOUNT_BUFSIZE (1 << 15)
/*
 * Test-local helper: call statmount() for @mnt_id in @mnt_ns_id with request
 * mask @mask into a heap buffer, starting at STATMOUNT_BUFSIZE bytes and
 * doubling the size each time statmount() fails with EOVERFLOW.
 *
 * Returns the malloc()ed buffer on success (the caller frees it), or NULL if
 * malloc() fails or statmount() fails with any error other than EOVERFLOW.
 */
static struct statmount *statmount_alloc(uint64_t mnt_id, uint64_t mnt_ns_id, uint64_t mask)
{
struct statmount *buf;
size_t bufsize = STATMOUNT_BUFSIZE;
int ret;
for (;;) {
buf = malloc(bufsize);
if (!buf)
return NULL;
ret = statmount(mnt_id, mnt_ns_id, mask, buf, bufsize, 0);
if (ret == 0)
return buf;
/*
 * NOTE(review): errno is inspected only after free(); this relies on
 * free() leaving errno untouched, which older C libraries do not
 * guarantee.
 */
free(buf);
if (errno != EOVERFLOW)
return NULL;
/* Buffer was too small: retry with twice the size. */
bufsize <<= 1;
}
}
static void log_mount(struct __test_metadata *_metadata, struct statmount *sm)
{
const char *fs_type = "";
@ -115,7 +92,7 @@ static void dump_mounts(struct __test_metadata *_metadata, uint64_t mnt_ns_id)
STATMOUNT_MNT_BASIC |
STATMOUNT_FS_TYPE |
STATMOUNT_MNT_ROOT |
STATMOUNT_MNT_POINT);
STATMOUNT_MNT_POINT, 0);
if (!sm) {
TH_LOG(" [%zd] mnt_id %llu: statmount failed: %s",
i, (unsigned long long)list[i], strerror(errno));
@ -221,7 +198,7 @@ FIXTURE_SETUP(open_tree_ns)
SKIP(return, "open_tree() syscall not supported");
/* Check if statmount/listmount are supported */
ret = statmount(0, 0, 0, NULL, 0, 0);
ret = statmount(0, 0, 0, 0, NULL, 0, 0);
if (ret == -1 && errno == ENOSYS)
SKIP(return, "statmount() syscall not supported");
@ -340,7 +317,7 @@ TEST_F(open_tree_ns, verify_mount_properties)
ASSERT_GE(nr_mounts, 1);
/* Get info about the root mount (the bind mount, rootfs is hidden) */
ret = statmount(list[0], new_ns_id, STATMOUNT_MNT_BASIC, &sm, sizeof(sm), 0);
ret = statmount(list[0], new_ns_id, 0, STATMOUNT_MNT_BASIC, &sm, sizeof(sm), 0);
ASSERT_EQ(ret, 0);
ASSERT_NE(sm.mnt_id, sm.mnt_parent_id);
@ -452,7 +429,7 @@ FIXTURE_SETUP(open_tree_ns_userns)
SKIP(return, "open_tree() syscall not supported");
/* Check if statmount/listmount are supported */
ret = statmount(0, 0, 0, NULL, 0, 0);
ret = statmount(0, 0, 0, 0, NULL, 0, 0);
if (ret == -1 && errno == ENOSYS)
SKIP(return, "statmount() syscall not supported");
}
@ -746,7 +723,7 @@ TEST_F(open_tree_ns_userns, umount_fails_einval)
const char *mnt_point;
sm = statmount_alloc(list[i], new_ns_id,
STATMOUNT_MNT_POINT);
STATMOUNT_MNT_POINT, 0);
if (!sm)
_exit(11);
@ -863,7 +840,7 @@ TEST_F(open_tree_ns_userns, umount_succeeds)
const char *mnt_point;
sm = statmount_alloc(list[i], new_ns_id,
STATMOUNT_MNT_POINT);
STATMOUNT_MNT_POINT, 0);
if (!sm)
_exit(11);
@ -904,7 +881,7 @@ TEST_F(open_tree_ns_userns, umount_succeeds)
ASSERT_FALSE(true) TH_LOG("setns into new namespace failed");
break;
case 7:
ASSERT_FALSE(true) TH_LOG("umount succeeded but should have failed with EINVAL");
ASSERT_FALSE(true) TH_LOG("umount failed but should have succeeded");
break;
case 9:
ASSERT_FALSE(true) TH_LOG("listmount failed");
@ -1003,7 +980,7 @@ TEST_F(open_tree_ns_unbindable, recursive_skips_on_unbindable)
struct statmount *sm;
const char *mnt_point;
sm = statmount_alloc(list[i], new_ns_id, STATMOUNT_MNT_POINT);
sm = statmount_alloc(list[i], new_ns_id, STATMOUNT_MNT_POINT, 0);
ASSERT_NE(sm, NULL) {
TH_LOG("statmount_alloc failed for mnt_id %llu",
(unsigned long long)list[i]);

View File

@ -3,10 +3,14 @@
#ifndef __STATMOUNT_H
#define __STATMOUNT_H
#include <errno.h>
#include <stdint.h>
#include <stdlib.h>
#include <linux/mount.h>
#include <asm/unistd.h>
#define STATMOUNT_BUFSIZE (1 << 15)
#ifndef __NR_statmount
#if defined __alpha__
#define __NR_statmount 567
@ -84,4 +88,51 @@ static inline ssize_t listmount(uint64_t mnt_id, uint64_t mnt_ns_id,
return syscall(__NR_listmount, &req, list, num, flags);
}
/*
 * statmount_alloc - call statmount() into a heap buffer that grows on demand
 * @mnt_id:    mount id forwarded to statmount()
 * @mnt_ns_id: mount namespace id forwarded to statmount()
 * @mask:      request mask forwarded to statmount()
 * @flags:     flags forwarded to statmount()
 *
 * The third statmount() argument is always passed as 0. The buffer starts at
 * STATMOUNT_BUFSIZE bytes and is doubled each time statmount() fails with
 * EOVERFLOW.
 *
 * Return: a malloc()ed buffer that the caller must free() on success, NULL on
 * failure. When statmount() itself failed, errno still holds the error from
 * that call when this function returns.
 */
static inline struct statmount *statmount_alloc(uint64_t mnt_id, uint64_t mnt_ns_id,
						uint64_t mask, unsigned int flags)
{
	size_t bufsize = STATMOUNT_BUFSIZE;

	for (;;) {
		struct statmount *buf;
		int saved_errno;

		buf = malloc(bufsize);
		if (!buf)
			return NULL;

		if (statmount(mnt_id, mnt_ns_id, 0, mask, buf, bufsize, flags) == 0)
			return buf;

		/*
		 * Capture errno before free(): older C libraries may clobber
		 * it, which would break both the EOVERFLOW retry decision and
		 * the error reported to the caller.
		 */
		saved_errno = errno;
		free(buf);
		errno = saved_errno;

		if (saved_errno != EOVERFLOW)
			return NULL;

		/* Buffer was too small: retry with twice the size. */
		bufsize <<= 1;
	}
}
/*
 * statmount_alloc_by_fd - statmount() by file descriptor into a growing buffer
 * @fd:   file descriptor passed as the third statmount() argument
 * @mask: request mask forwarded to statmount()
 *
 * Calls statmount() with STATMOUNT_BY_FD and zero mount/namespace ids. The
 * buffer starts at STATMOUNT_BUFSIZE bytes and is doubled each time
 * statmount() fails with EOVERFLOW.
 *
 * Return: a malloc()ed buffer that the caller must free() on success, NULL on
 * failure. When statmount() itself failed, errno still holds the error from
 * that call when this function returns.
 */
static inline struct statmount *statmount_alloc_by_fd(int fd, uint64_t mask)
{
	size_t bufsize = STATMOUNT_BUFSIZE;

	for (;;) {
		struct statmount *buf;
		int saved_errno;

		buf = malloc(bufsize);
		if (!buf)
			return NULL;

		if (statmount(0, 0, fd, mask, buf, bufsize, STATMOUNT_BY_FD) == 0)
			return buf;

		/*
		 * Capture errno before free(): older C libraries may clobber
		 * it, which would break both the EOVERFLOW retry decision and
		 * the error reported to the caller.
		 */
		saved_errno = errno;
		free(buf);
		errno = saved_errno;

		if (saved_errno != EOVERFLOW)
			return NULL;

		/* Buffer was too small: retry with twice the size. */
		bufsize <<= 1;
	}
}
#endif /* __STATMOUNT_H */

View File

@ -33,45 +33,6 @@ static const char *const known_fs[] = {
"sysv", "tmpfs", "tracefs", "ubifs", "udf", "ufs", "v7", "vboxsf",
"vfat", "virtiofs", "vxfs", "xenfs", "xfs", "zonefs", NULL };
/*
 * Test-local helper: run statmount() either by mount id (@mnt_id) or, when
 * @flags contains STATMOUNT_BY_FD, by file descriptor (@fd), and return a
 * malloc()ed copy of the result sized to exactly the reported tmp->size.
 *
 * The first attempt uses a 32 KiB (1 << 15) alloca() scratch buffer; after an
 * EOVERFLOW failure the scratch buffer is doubled and comes from malloc()
 * instead. "tofree" records whether the current scratch buffer is heap memory.
 *
 * Returns NULL if STATMOUNT_BY_FD is requested with a negative @fd, if
 * statmount() fails with anything other than EOVERFLOW, or if an allocation
 * fails. The caller frees the returned buffer.
 */
static struct statmount *statmount_alloc(uint64_t mnt_id, int fd, uint64_t mask, unsigned int flags)
{
size_t bufsize = 1 << 15;
struct statmount *buf = NULL, *tmp = NULL;
int tofree = 0;
int ret;
if (flags & STATMOUNT_BY_FD && fd < 0)
return NULL;
/* First attempt uses stack memory, so there is nothing to free yet. */
tmp = alloca(bufsize);
for (;;) {
if (flags & STATMOUNT_BY_FD)
ret = statmount(0, 0, (uint32_t) fd, mask, tmp, bufsize, flags);
else
ret = statmount(mnt_id, 0, 0, mask, tmp, bufsize, flags);
if (ret != -1)
break;
/*
 * NOTE(review): errno is inspected after the optional free() below; this
 * relies on free() leaving errno untouched.
 */
if (tofree)
free(tmp);
if (errno != EOVERFLOW)
return NULL;
/* Scratch buffer was too small: retry with a heap buffer twice as large. */
bufsize <<= 1;
tofree = 1;
tmp = malloc(bufsize);
if (!tmp)
return NULL;
}
/* Hand back a right-sized copy; buf stays NULL if this malloc() fails. */
buf = malloc(tmp->size);
if (buf)
memcpy(buf, tmp, tmp->size);
if (tofree)
free(tmp);
return buf;
}
static void write_file(const char *path, const char *val)
{
int fd = open(path, O_WRONLY);
@ -715,7 +676,7 @@ static void test_statmount_by_fd(void)
goto err_fd;
}
sm = statmount_alloc(0, fd, STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT, STATMOUNT_BY_FD);
sm = statmount_alloc_by_fd(fd, STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT);
if (!sm) {
ksft_test_result_fail("statmount by fd failed: %s\n", strerror(errno));
goto err_chroot;
@ -750,7 +711,7 @@ static void test_statmount_by_fd(void)
}
free(sm);
sm = statmount_alloc(0, fd, STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT, STATMOUNT_BY_FD);
sm = statmount_alloc_by_fd(fd, STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT);
if (!sm) {
ksft_test_result_fail("statmount by fd failed: %s\n", strerror(errno));
goto err_fd;
@ -844,7 +805,7 @@ static void test_statmount_by_fd_unmounted(void)
goto err_fd;
}
sm = statmount_alloc(0, fd, STATMOUNT_MNT_POINT | STATMOUNT_MNT_ROOT, STATMOUNT_BY_FD);
sm = statmount_alloc_by_fd(fd, STATMOUNT_MNT_POINT | STATMOUNT_MNT_ROOT);
if (!sm) {
ksft_test_result_fail("statmount by fd unmounted: %s\n",
strerror(errno));