mirror of
https://github.com/torvalds/linux.git
synced 2026-05-12 16:18:45 +02:00
The rlimit counter is tied to uid in the user_namespace. This allows rlimit values to be specified in userns even if they are already globally exceeded by the user. However, the values set in the previous (ancestor) user_namespaces cannot be exceeded. To illustrate the impact of rlimits, let's say there is a program that does not fork. Some service-A wants to run this program as user X in multiple containers. Since the program never forks, the service wants to set RLIMIT_NPROC=1. service-A \- program (uid=1000, container1, rlimit_nproc=1) \- program (uid=1000, container2, rlimit_nproc=1) The service-A sets RLIMIT_NPROC=1 and runs the program in container1. When the service-A tries to run a program with RLIMIT_NPROC=1 in container2, it fails since user X already has one running process. We cannot use the existing inc_ucounts / dec_ucounts because they do not allow us to exceed the maximum for the counter. Some rlimits can be overlimited by root or if the user has the appropriate capability. Changelog v11: * Change inc_rlimit_ucounts() which now returns top value of ucounts. * Drop inc_rlimit_ucounts_and_test() because the return code of inc_rlimit_ucounts() can be checked. Signed-off-by: Alexey Gladkov <legion@kernel.org> Link: https://lkml.kernel.org/r/c5286a8aa16d2d698c222f7532f3d735c82bc6bc.1619094428.git.legion@kernel.org Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
306 lines
7.3 KiB
C
306 lines
7.3 KiB
C
// SPDX-License-Identifier: GPL-2.0-only
|
|
|
|
#include <linux/stat.h>
|
|
#include <linux/sysctl.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/cred.h>
|
|
#include <linux/hash.h>
|
|
#include <linux/kmemleak.h>
|
|
#include <linux/user_namespace.h>
|
|
|
|
struct ucounts init_ucounts = {
|
|
.ns = &init_user_ns,
|
|
.uid = GLOBAL_ROOT_UID,
|
|
.count = ATOMIC_INIT(1),
|
|
};
|
|
|
|
#define UCOUNTS_HASHTABLE_BITS 10
|
|
static struct hlist_head ucounts_hashtable[(1 << UCOUNTS_HASHTABLE_BITS)];
|
|
static DEFINE_SPINLOCK(ucounts_lock);
|
|
|
|
#define ucounts_hashfn(ns, uid) \
|
|
hash_long((unsigned long)__kuid_val(uid) + (unsigned long)(ns), \
|
|
UCOUNTS_HASHTABLE_BITS)
|
|
#define ucounts_hashentry(ns, uid) \
|
|
(ucounts_hashtable + ucounts_hashfn(ns, uid))
|
|
|
|
|
|
#ifdef CONFIG_SYSCTL
|
|
static struct ctl_table_set *
|
|
set_lookup(struct ctl_table_root *root)
|
|
{
|
|
return ¤t_user_ns()->set;
|
|
}
|
|
|
|
static int set_is_seen(struct ctl_table_set *set)
|
|
{
|
|
return ¤t_user_ns()->set == set;
|
|
}
|
|
|
|
static int set_permissions(struct ctl_table_header *head,
|
|
struct ctl_table *table)
|
|
{
|
|
struct user_namespace *user_ns =
|
|
container_of(head->set, struct user_namespace, set);
|
|
int mode;
|
|
|
|
/* Allow users with CAP_SYS_RESOURCE unrestrained access */
|
|
if (ns_capable(user_ns, CAP_SYS_RESOURCE))
|
|
mode = (table->mode & S_IRWXU) >> 6;
|
|
else
|
|
/* Allow all others at most read-only access */
|
|
mode = table->mode & S_IROTH;
|
|
return (mode << 6) | (mode << 3) | mode;
|
|
}
|
|
|
|
static struct ctl_table_root set_root = {
|
|
.lookup = set_lookup,
|
|
.permissions = set_permissions,
|
|
};
|
|
|
|
/*
 * Template for one entry of user_table: a 0644 integer sysctl called
 * @name, handled by proc_dointvec_minmax and clamped to
 * [SYSCTL_ZERO, SYSCTL_INT_MAX].  The .data pointer is filled in per
 * namespace by setup_userns_sysctls().
 *
 * NOTE(review): maxlen is sizeof(int); if the ucount_max[] elements the
 * entries point at are wider than int, the handler and maxlen need to
 * match that width -- confirm against struct user_namespace.
 */
#define UCOUNT_ENTRY(name)					\
	{							\
		.procname	= name,				\
		.maxlen		= sizeof(int),			\
		.mode		= 0644,				\
		.proc_handler	= proc_dointvec_minmax,		\
		.extra1		= SYSCTL_ZERO,			\
		.extra2		= SYSCTL_INT_MAX,		\
	}
|
|
/*
 * Template sysctl table duplicated for every user namespace by
 * setup_userns_sysctls(), which points entry i at ns->ucount_max[i] and
 * asserts at build time that this array has UCOUNT_COUNTS + 1 elements.
 */
static struct ctl_table user_table[] = {
	UCOUNT_ENTRY("max_user_namespaces"),
	UCOUNT_ENTRY("max_pid_namespaces"),
	UCOUNT_ENTRY("max_uts_namespaces"),
	UCOUNT_ENTRY("max_ipc_namespaces"),
	UCOUNT_ENTRY("max_net_namespaces"),
	UCOUNT_ENTRY("max_mnt_namespaces"),
	UCOUNT_ENTRY("max_cgroup_namespaces"),
	UCOUNT_ENTRY("max_time_namespaces"),
#ifdef CONFIG_INOTIFY_USER
	UCOUNT_ENTRY("max_inotify_instances"),
	UCOUNT_ENTRY("max_inotify_watches"),
#endif
	/*
	 * Two nameless entries keep the array at UCOUNT_COUNTS + 1 elements;
	 * presumably the first is a counter with no sysctl file and the last
	 * is the table terminator -- confirm against enum ucount_type.
	 */
	{ },
	{ }
};
|
|
#endif /* CONFIG_SYSCTL */
|
|
|
|
bool setup_userns_sysctls(struct user_namespace *ns)
|
|
{
|
|
#ifdef CONFIG_SYSCTL
|
|
struct ctl_table *tbl;
|
|
|
|
BUILD_BUG_ON(ARRAY_SIZE(user_table) != UCOUNT_COUNTS + 1);
|
|
setup_sysctl_set(&ns->set, &set_root, set_is_seen);
|
|
tbl = kmemdup(user_table, sizeof(user_table), GFP_KERNEL);
|
|
if (tbl) {
|
|
int i;
|
|
for (i = 0; i < UCOUNT_COUNTS; i++) {
|
|
tbl[i].data = &ns->ucount_max[i];
|
|
}
|
|
ns->sysctls = __register_sysctl_table(&ns->set, "user", tbl);
|
|
}
|
|
if (!ns->sysctls) {
|
|
kfree(tbl);
|
|
retire_sysctl_set(&ns->set);
|
|
return false;
|
|
}
|
|
#endif
|
|
return true;
|
|
}
|
|
|
|
/*
 * Tear down what setup_userns_sysctls() created for @ns: unregister the
 * "user" table, retire the table set and free the duplicated table.
 * Does nothing without CONFIG_SYSCTL.
 */
void retire_userns_sysctls(struct user_namespace *ns)
{
#ifdef CONFIG_SYSCTL
	/* Fetch the table pointer before the header is unregistered. */
	struct ctl_table *table_copy = ns->sysctls->ctl_table_arg;

	unregister_sysctl_table(ns->sysctls);
	retire_sysctl_set(&ns->set);
	kfree(table_copy);
#endif
}
|
|
|
|
/*
 * Search bucket @hashent for the entry that belongs to (@ns, @uid).
 *
 * Returns the matching entry, or NULL if the bucket holds none.  No
 * reference is taken.  The callers in this file hold ucounts_lock.
 */
static struct ucounts *find_ucounts(struct user_namespace *ns, kuid_t uid, struct hlist_head *hashent)
{
	struct ucounts *pos;

	hlist_for_each_entry(pos, hashent, node) {
		if (pos->ns != ns)
			continue;
		if (uid_eq(pos->uid, uid))
			return pos;
	}

	return NULL;
}
|
|
|
|
static void hlist_add_ucounts(struct ucounts *ucounts)
|
|
{
|
|
struct hlist_head *hashent = ucounts_hashentry(ucounts->ns, ucounts->uid);
|
|
spin_lock_irq(&ucounts_lock);
|
|
hlist_add_head(&ucounts->node, hashent);
|
|
spin_unlock_irq(&ucounts_lock);
|
|
}
|
|
|
|
struct ucounts *get_ucounts(struct ucounts *ucounts)
|
|
{
|
|
if (ucounts && atomic_add_negative(1, &ucounts->count)) {
|
|
put_ucounts(ucounts);
|
|
ucounts = NULL;
|
|
}
|
|
return ucounts;
|
|
}
|
|
|
|
struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid)
|
|
{
|
|
struct hlist_head *hashent = ucounts_hashentry(ns, uid);
|
|
struct ucounts *ucounts, *new;
|
|
|
|
spin_lock_irq(&ucounts_lock);
|
|
ucounts = find_ucounts(ns, uid, hashent);
|
|
if (!ucounts) {
|
|
spin_unlock_irq(&ucounts_lock);
|
|
|
|
new = kzalloc(sizeof(*new), GFP_KERNEL);
|
|
if (!new)
|
|
return NULL;
|
|
|
|
new->ns = ns;
|
|
new->uid = uid;
|
|
atomic_set(&new->count, 1);
|
|
|
|
spin_lock_irq(&ucounts_lock);
|
|
ucounts = find_ucounts(ns, uid, hashent);
|
|
if (ucounts) {
|
|
kfree(new);
|
|
} else {
|
|
hlist_add_head(&new->node, hashent);
|
|
spin_unlock_irq(&ucounts_lock);
|
|
return new;
|
|
}
|
|
}
|
|
spin_unlock_irq(&ucounts_lock);
|
|
ucounts = get_ucounts(ucounts);
|
|
return ucounts;
|
|
}
|
|
|
|
void put_ucounts(struct ucounts *ucounts)
|
|
{
|
|
unsigned long flags;
|
|
|
|
if (atomic_dec_and_test(&ucounts->count)) {
|
|
spin_lock_irqsave(&ucounts_lock, flags);
|
|
hlist_del_init(&ucounts->node);
|
|
spin_unlock_irqrestore(&ucounts_lock, flags);
|
|
kfree(ucounts);
|
|
}
|
|
}
|
|
|
|
/*
 * Atomically increment *@v, but only while its value is below @u.
 *
 * @u is a long so that the long limit passed by the caller is compared
 * at full width instead of being narrowed to int.
 *
 * Returns true if the counter was incremented, false if it was already
 * at or above @u.
 */
static inline bool atomic_long_inc_below(atomic_long_t *v, long u)
{
	long c, old;

	c = atomic_long_read(v);
	for (;;) {
		if (unlikely(c >= u))
			return false;
		/* Retry with the freshly observed value if we lost the race. */
		old = atomic_long_cmpxchg(v, c, c+1);
		if (likely(old == c))
			return true;
		c = old;
	}
}
|
|
|
|
/*
 * Charge one unit of @type to (@ns, @uid) and to every ancestor level
 * reached through ns->ucounts.
 *
 * At each level the counter is incremented only while it is below that
 * namespace's ucount_max[@type].  If any level refuses, the increments
 * already made on the lower levels are rolled back and the reference
 * obtained from alloc_ucounts() is dropped.
 *
 * Returns the ucounts entry (reference held) on success, NULL when
 * alloc_ucounts() fails or a limit is hit.
 */
struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid,
			   enum ucount_type type)
{
	struct ucounts *ucounts, *cur, *stop;

	ucounts = alloc_ucounts(ns, uid);
	cur = ucounts;
	while (cur) {
		struct user_namespace *cur_ns = cur->ns;
		long limit = READ_ONCE(cur_ns->ucount_max[type]);

		if (!atomic_long_inc_below(&cur->ucount[type], limit))
			goto undo;
		cur = cur_ns->ucounts;
	}
	return ucounts;

undo:
	/* Undo every level that was charged before the failing one. */
	stop = cur;
	for (cur = ucounts; cur != stop; cur = cur->ns->ucounts)
		atomic_long_dec(&cur->ucount[type]);

	put_ucounts(ucounts);
	return NULL;
}
|
|
|
|
void dec_ucount(struct ucounts *ucounts, enum ucount_type type)
|
|
{
|
|
struct ucounts *iter;
|
|
for (iter = ucounts; iter; iter = iter->ns->ucounts) {
|
|
long dec = atomic_long_dec_if_positive(&iter->ucount[type]);
|
|
WARN_ON_ONCE(dec < 0);
|
|
}
|
|
put_ucounts(ucounts);
|
|
}
|
|
|
|
long inc_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v)
|
|
{
|
|
struct ucounts *iter;
|
|
long ret = 0;
|
|
|
|
for (iter = ucounts; iter; iter = iter->ns->ucounts) {
|
|
long max = READ_ONCE(iter->ns->ucount_max[type]);
|
|
long new = atomic_long_add_return(v, &iter->ucount[type]);
|
|
if (new < 0 || new > max)
|
|
ret = LONG_MAX;
|
|
else if (iter == ucounts)
|
|
ret = new;
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
bool dec_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v)
|
|
{
|
|
struct ucounts *iter;
|
|
long new;
|
|
for (iter = ucounts; iter; iter = iter->ns->ucounts) {
|
|
long dec = atomic_long_add_return(-v, &iter->ucount[type]);
|
|
WARN_ON_ONCE(dec < 0);
|
|
if (iter == ucounts)
|
|
new = dec;
|
|
}
|
|
return (new == 0);
|
|
}
|
|
|
|
bool is_ucounts_overlimit(struct ucounts *ucounts, enum ucount_type type, unsigned long max)
|
|
{
|
|
struct ucounts *iter;
|
|
if (get_ucounts_value(ucounts, type) > max)
|
|
return true;
|
|
for (iter = ucounts; iter; iter = iter->ns->ucounts) {
|
|
max = READ_ONCE(iter->ns->ucount_max[type]);
|
|
if (get_ucounts_value(iter, type) > max)
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
static __init int user_namespace_sysctl_init(void)
|
|
{
|
|
#ifdef CONFIG_SYSCTL
|
|
static struct ctl_table_header *user_header;
|
|
static struct ctl_table empty[1];
|
|
/*
|
|
* It is necessary to register the user directory in the
|
|
* default set so that registrations in the child sets work
|
|
* properly.
|
|
*/
|
|
user_header = register_sysctl("user", empty);
|
|
kmemleak_ignore(user_header);
|
|
BUG_ON(!user_header);
|
|
BUG_ON(!setup_userns_sysctls(&init_user_ns));
|
|
#endif
|
|
hlist_add_ucounts(&init_ucounts);
|
|
inc_rlimit_ucounts(&init_ucounts, UCOUNT_RLIMIT_NPROC, 1);
|
|
return 0;
|
|
}
|
|
subsys_initcall(user_namespace_sysctl_init);
|