15bc01effe
In commit fda31c50292a ("signal: avoid double atomic counter increments for user accounting") Linus made a clever optimization to how rlimits and the struct user_struct interact. Unfortunately that optimization does not work in the obvious way when moved to nested rlimits. The problem is that the last decrement of the per user namespace per user sigpending counter might also be the last decrement of the sigpending counter in the parent user namespace as well. Which means that simply freeing the leaf ucount in __free_sigqueue is not enough. Maintain the optimization and handle the tricky cases by introducing inc_rlimit_get_ucounts and dec_rlimit_put_ucounts. By moving the entire optimization into functions that perform all of the work it becomes possible to ensure that every level is handled properly. The new function inc_rlimit_get_ucounts returns 0 on failure to increment the ucount. This is different than inc_rlimit_ucounts which increments the ucounts and returns LONG_MAX if the ucount counter has exceeded its maximum or it wrapped (to indicate the counter needs to be decremented). I wish we had a single user to account all pending signals to across all of the threads of a process so this complexity was not necessary. Cc: stable@vger.kernel.org Fixes: d64696905554 ("Reimplement RLIMIT_SIGPENDING on top of ucounts") v1: https://lkml.kernel.org/r/87mtnavszx.fsf_-_@disp2133 Link: https://lkml.kernel.org/r/87fssytizw.fsf_-_@disp2133 Reviewed-by: Alexey Gladkov <legion@kernel.org> Tested-by: Rune Kleveland <rune.kleveland@infomedia.dk> Tested-by: Yu Zhao <yuzhao@google.com> Tested-by: Jordan Glover <Golden_Miller83@protonmail.ch> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
369 lines
8.8 KiB
C
369 lines
8.8 KiB
C
// SPDX-License-Identifier: GPL-2.0-only
|
|
|
|
#include <linux/stat.h>
|
|
#include <linux/sysctl.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/cred.h>
|
|
#include <linux/hash.h>
|
|
#include <linux/kmemleak.h>
|
|
#include <linux/user_namespace.h>
|
|
|
|
struct ucounts init_ucounts = {
|
|
.ns = &init_user_ns,
|
|
.uid = GLOBAL_ROOT_UID,
|
|
.count = ATOMIC_INIT(1),
|
|
};
|
|
|
|
#define UCOUNTS_HASHTABLE_BITS 10
|
|
static struct hlist_head ucounts_hashtable[(1 << UCOUNTS_HASHTABLE_BITS)];
|
|
static DEFINE_SPINLOCK(ucounts_lock);
|
|
|
|
/* Reduce a (user namespace pointer, kuid) pair to a bucket index. */
#define ucounts_hashfn(ns, uid)						\
	hash_long((unsigned long)__kuid_val(uid) + (unsigned long)(ns),	\
		  UCOUNTS_HASHTABLE_BITS)
/* Address of the hash bucket that (ns, uid) maps to. */
#define ucounts_hashentry(ns, uid)					\
	(&ucounts_hashtable[ucounts_hashfn(ns, uid)])
|
|
|
|
|
|
#ifdef CONFIG_SYSCTL
|
|
static struct ctl_table_set *
|
|
set_lookup(struct ctl_table_root *root)
|
|
{
|
|
return ¤t_user_ns()->set;
|
|
}
|
|
|
|
static int set_is_seen(struct ctl_table_set *set)
|
|
{
|
|
return ¤t_user_ns()->set == set;
|
|
}
|
|
|
|
static int set_permissions(struct ctl_table_header *head,
|
|
struct ctl_table *table)
|
|
{
|
|
struct user_namespace *user_ns =
|
|
container_of(head->set, struct user_namespace, set);
|
|
int mode;
|
|
|
|
/* Allow users with CAP_SYS_RESOURCE unrestrained access */
|
|
if (ns_capable(user_ns, CAP_SYS_RESOURCE))
|
|
mode = (table->mode & S_IRWXU) >> 6;
|
|
else
|
|
/* Allow all others at most read-only access */
|
|
mode = table->mode & S_IROTH;
|
|
return (mode << 6) | (mode << 3) | mode;
|
|
}
|
|
|
|
static struct ctl_table_root set_root = {
|
|
.lookup = set_lookup,
|
|
.permissions = set_permissions,
|
|
};
|
|
|
|
/* Lower and upper bounds referenced by every UCOUNT_ENTRY() (extra1/extra2). */
static long ue_zero;
static long ue_int_max = INT_MAX;
|
|
|
|
/*
 * Build one ctl_table entry called @name: a long-sized, mode 0644 value
 * handled by proc_doulongvec_minmax and bounded by ue_zero..ue_int_max.
 * The .data pointer is left unset here; setup_userns_sysctls() fills it in.
 */
#define UCOUNT_ENTRY(name)					\
	{							\
		.procname	= name,				\
		.maxlen		= sizeof(long),			\
		.mode		= 0644,				\
		.proc_handler	= proc_doulongvec_minmax,	\
		.extra1		= &ue_zero,			\
		.extra2		= &ue_int_max,			\
	}
|
|
/*
 * Template for the "user" sysctl directory of a user namespace.
 * setup_userns_sysctls() duplicates it and points entry i at
 * ns->ucount_max[i].
 * NOTE(review): that indexing implies the entries must follow the order of
 * enum ucount_type - confirm against the enum definition.
 */
static struct ctl_table user_table[] = {
	UCOUNT_ENTRY("max_user_namespaces"),
	UCOUNT_ENTRY("max_pid_namespaces"),
	UCOUNT_ENTRY("max_uts_namespaces"),
	UCOUNT_ENTRY("max_ipc_namespaces"),
	UCOUNT_ENTRY("max_net_namespaces"),
	UCOUNT_ENTRY("max_mnt_namespaces"),
	UCOUNT_ENTRY("max_cgroup_namespaces"),
	UCOUNT_ENTRY("max_time_namespaces"),
#ifdef CONFIG_INOTIFY_USER
	UCOUNT_ENTRY("max_inotify_instances"),
	UCOUNT_ENTRY("max_inotify_watches"),
#endif
#ifdef CONFIG_FANOTIFY
	UCOUNT_ENTRY("max_fanotify_groups"),
	UCOUNT_ENTRY("max_fanotify_marks"),
#endif
	/*
	 * Unnamed slots.  They keep ARRAY_SIZE(user_table) equal to
	 * UCOUNT_COUNTS + 1, which setup_userns_sysctls() enforces with a
	 * BUILD_BUG_ON; presumably they stand for the ucount types that have
	 * no sysctl file, plus the table terminator.
	 */
	{ },
	{ },
	{ },
	{ },
	{ }
};
|
|
#endif /* CONFIG_SYSCTL */
|
|
|
|
bool setup_userns_sysctls(struct user_namespace *ns)
|
|
{
|
|
#ifdef CONFIG_SYSCTL
|
|
struct ctl_table *tbl;
|
|
|
|
BUILD_BUG_ON(ARRAY_SIZE(user_table) != UCOUNT_COUNTS + 1);
|
|
setup_sysctl_set(&ns->set, &set_root, set_is_seen);
|
|
tbl = kmemdup(user_table, sizeof(user_table), GFP_KERNEL);
|
|
if (tbl) {
|
|
int i;
|
|
for (i = 0; i < UCOUNT_COUNTS; i++) {
|
|
tbl[i].data = &ns->ucount_max[i];
|
|
}
|
|
ns->sysctls = __register_sysctl_table(&ns->set, "user", tbl);
|
|
}
|
|
if (!ns->sysctls) {
|
|
kfree(tbl);
|
|
retire_sysctl_set(&ns->set);
|
|
return false;
|
|
}
|
|
#endif
|
|
return true;
|
|
}
|
|
|
|
/*
 * Tear down what setup_userns_sysctls() built for @ns: unregister the
 * table, retire the sysctl set and free the duplicated table.  The table
 * pointer is fetched from the header before the header is unregistered.
 */
void retire_userns_sysctls(struct user_namespace *ns)
{
#ifdef CONFIG_SYSCTL
	struct ctl_table_header *hdr = ns->sysctls;
	struct ctl_table *copy = hdr->ctl_table_arg;

	unregister_sysctl_table(hdr);
	retire_sysctl_set(&ns->set);
	kfree(copy);
#endif
}
|
|
|
|
/*
 * Walk the bucket @hashent and return the entry that matches both @ns and
 * @uid, or NULL when there is none.  No reference is taken here.
 */
static struct ucounts *find_ucounts(struct user_namespace *ns, kuid_t uid, struct hlist_head *hashent)
{
	struct ucounts *pos;

	hlist_for_each_entry(pos, hashent, node) {
		if (pos->ns != ns)
			continue;
		if (uid_eq(pos->uid, uid))
			return pos;
	}
	return NULL;
}
|
|
|
|
static void hlist_add_ucounts(struct ucounts *ucounts)
|
|
{
|
|
struct hlist_head *hashent = ucounts_hashentry(ucounts->ns, ucounts->uid);
|
|
spin_lock_irq(&ucounts_lock);
|
|
hlist_add_head(&ucounts->node, hashent);
|
|
spin_unlock_irq(&ucounts_lock);
|
|
}
|
|
|
|
struct ucounts *get_ucounts(struct ucounts *ucounts)
|
|
{
|
|
if (ucounts && atomic_add_negative(1, &ucounts->count)) {
|
|
put_ucounts(ucounts);
|
|
ucounts = NULL;
|
|
}
|
|
return ucounts;
|
|
}
|
|
|
|
/*
 * Look up the ucounts entry for (@ns, @uid), creating it if necessary, and
 * return it with a reference held for the caller.
 *
 * The allocation is done with ucounts_lock dropped, so the table is
 * searched a second time after re-taking the lock; if another entry for
 * the same pair appeared meanwhile, the fresh allocation is discarded and
 * the existing entry is used.
 *
 * Returns NULL if the allocation fails or if taking a reference on an
 * existing entry makes its reference count negative.
 */
struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid)
{
	struct hlist_head *bucket = ucounts_hashentry(ns, uid);
	struct ucounts *found, *fresh;
	bool wrapped;

	spin_lock_irq(&ucounts_lock);
	found = find_ucounts(ns, uid, bucket);
	if (found)
		goto take_ref;
	spin_unlock_irq(&ucounts_lock);

	fresh = kzalloc(sizeof(*fresh), GFP_KERNEL);
	if (!fresh)
		return NULL;

	fresh->ns = ns;
	fresh->uid = uid;
	atomic_set(&fresh->count, 1);

	spin_lock_irq(&ucounts_lock);
	found = find_ucounts(ns, uid, bucket);
	if (!found) {
		/* Still absent: publish the new entry, count already 1. */
		hlist_add_head(&fresh->node, bucket);
		spin_unlock_irq(&ucounts_lock);
		return fresh;
	}
	/* An entry for this pair was added while the lock was dropped. */
	kfree(fresh);

take_ref:
	/* ucounts_lock is held here. */
	wrapped = atomic_add_negative(1, &found->count);
	spin_unlock_irq(&ucounts_lock);
	if (wrapped) {
		put_ucounts(found);
		return NULL;
	}
	return found;
}
|
|
|
|
void put_ucounts(struct ucounts *ucounts)
|
|
{
|
|
unsigned long flags;
|
|
|
|
if (atomic_dec_and_lock_irqsave(&ucounts->count, &ucounts_lock, flags)) {
|
|
hlist_del_init(&ucounts->node);
|
|
spin_unlock_irqrestore(&ucounts_lock, flags);
|
|
kfree(ucounts);
|
|
}
|
|
}
|
|
|
|
/*
 * Atomically increment @v, but only while its current value is below @u.
 *
 * Returns true when the increment was performed and false when the value
 * was already >= @u.  The cmpxchg is retried with the freshly observed
 * value whenever another CPU changed @v in between.
 *
 * @u is a long so that it matches the width of the counter and of the
 * long limit the caller passes in; an int parameter would silently
 * truncate limits above INT_MAX on 64-bit builds.
 */
static inline bool atomic_long_inc_below(atomic_long_t *v, long u)
{
	long c, old;

	c = atomic_long_read(v);
	for (;;) {
		if (unlikely(c >= u))
			return false;
		old = atomic_long_cmpxchg(v, c, c + 1);
		if (likely(old == c))
			return true;
		/* Lost a race: retry against the value that is there now. */
		c = old;
	}
}
|
|
|
|
/*
 * Charge one unit of @type to (@ns, @uid) and to the ucounts of every
 * ancestor reached through ns->ucounts.
 *
 * Each level is incremented only while it stays below that namespace's
 * ucount_max[@type].  If a level refuses, the levels already charged are
 * decremented again, the reference obtained from alloc_ucounts() is
 * dropped and NULL is returned.  NULL is also returned when
 * alloc_ucounts() fails.  On success the leaf ucounts is returned with
 * its reference still held.
 */
struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid,
			   enum ucount_type type)
{
	struct ucounts *leaf, *pos, *failed;

	leaf = alloc_ucounts(ns, uid);
	for (pos = leaf; pos; pos = pos->ns->ucounts) {
		long limit = READ_ONCE(pos->ns->ucount_max[type]);

		if (atomic_long_inc_below(&pos->ucount[type], limit))
			continue;

		/* Roll back every level below the one that refused. */
		failed = pos;
		for (pos = leaf; pos != failed; pos = pos->ns->ucounts)
			atomic_long_dec(&pos->ucount[type]);

		put_ucounts(leaf);
		return NULL;
	}
	return leaf;
}
|
|
|
|
void dec_ucount(struct ucounts *ucounts, enum ucount_type type)
|
|
{
|
|
struct ucounts *iter;
|
|
for (iter = ucounts; iter; iter = iter->ns->ucounts) {
|
|
long dec = atomic_long_dec_if_positive(&iter->ucount[type]);
|
|
WARN_ON_ONCE(dec < 0);
|
|
}
|
|
put_ucounts(ucounts);
|
|
}
|
|
|
|
long inc_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v)
|
|
{
|
|
struct ucounts *iter;
|
|
long ret = 0;
|
|
|
|
for (iter = ucounts; iter; iter = iter->ns->ucounts) {
|
|
long max = READ_ONCE(iter->ns->ucount_max[type]);
|
|
long new = atomic_long_add_return(v, &iter->ucount[type]);
|
|
if (new < 0 || new > max)
|
|
ret = LONG_MAX;
|
|
else if (iter == ucounts)
|
|
ret = new;
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
bool dec_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v)
|
|
{
|
|
struct ucounts *iter;
|
|
long new = -1; /* Silence compiler warning */
|
|
for (iter = ucounts; iter; iter = iter->ns->ucounts) {
|
|
long dec = atomic_long_add_return(-v, &iter->ucount[type]);
|
|
WARN_ON_ONCE(dec < 0);
|
|
if (iter == ucounts)
|
|
new = dec;
|
|
}
|
|
return (new == 0);
|
|
}
|
|
|
|
static void do_dec_rlimit_put_ucounts(struct ucounts *ucounts,
|
|
struct ucounts *last, enum ucount_type type)
|
|
{
|
|
struct ucounts *iter, *next;
|
|
for (iter = ucounts; iter != last; iter = next) {
|
|
long dec = atomic_long_add_return(-1, &iter->ucount[type]);
|
|
WARN_ON_ONCE(dec < 0);
|
|
next = iter->ns->ucounts;
|
|
if (dec == 0)
|
|
put_ucounts(iter);
|
|
}
|
|
}
|
|
|
|
void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum ucount_type type)
|
|
{
|
|
do_dec_rlimit_put_ucounts(ucounts, NULL, type);
|
|
}
|
|
|
|
long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum ucount_type type)
|
|
{
|
|
/* Caller must hold a reference to ucounts */
|
|
struct ucounts *iter;
|
|
long dec, ret = 0;
|
|
|
|
for (iter = ucounts; iter; iter = iter->ns->ucounts) {
|
|
long max = READ_ONCE(iter->ns->ucount_max[type]);
|
|
long new = atomic_long_add_return(1, &iter->ucount[type]);
|
|
if (new < 0 || new > max)
|
|
goto unwind;
|
|
if (iter == ucounts)
|
|
ret = new;
|
|
/*
|
|
* Grab an extra ucount reference for the caller when
|
|
* the rlimit count was previously 0.
|
|
*/
|
|
if (new != 1)
|
|
continue;
|
|
if (!get_ucounts(iter))
|
|
goto dec_unwind;
|
|
}
|
|
return ret;
|
|
dec_unwind:
|
|
dec = atomic_long_add_return(-1, &iter->ucount[type]);
|
|
WARN_ON_ONCE(dec < 0);
|
|
unwind:
|
|
do_dec_rlimit_put_ucounts(ucounts, iter, type);
|
|
return 0;
|
|
}
|
|
|
|
bool is_ucounts_overlimit(struct ucounts *ucounts, enum ucount_type type, unsigned long max)
|
|
{
|
|
struct ucounts *iter;
|
|
if (get_ucounts_value(ucounts, type) > max)
|
|
return true;
|
|
for (iter = ucounts; iter; iter = iter->ns->ucounts) {
|
|
max = READ_ONCE(iter->ns->ucount_max[type]);
|
|
if (get_ucounts_value(iter, type) > max)
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
static __init int user_namespace_sysctl_init(void)
|
|
{
|
|
#ifdef CONFIG_SYSCTL
|
|
static struct ctl_table_header *user_header;
|
|
static struct ctl_table empty[1];
|
|
/*
|
|
* It is necessary to register the user directory in the
|
|
* default set so that registrations in the child sets work
|
|
* properly.
|
|
*/
|
|
user_header = register_sysctl("user", empty);
|
|
kmemleak_ignore(user_header);
|
|
BUG_ON(!user_header);
|
|
BUG_ON(!setup_userns_sysctls(&init_user_ns));
|
|
#endif
|
|
hlist_add_ucounts(&init_ucounts);
|
|
inc_rlimit_ucounts(&init_ucounts, UCOUNT_RLIMIT_NPROC, 1);
|
|
return 0;
|
|
}
|
|
subsys_initcall(user_namespace_sysctl_init);
|