2019-06-01 10:08:55 +02:00
// SPDX-License-Identifier: GPL-2.0-only
2016-07-30 13:58:49 -05:00
# include <linux/stat.h>
# include <linux/sysctl.h>
# include <linux/slab.h>
2017-02-02 17:54:15 +01:00
# include <linux/cred.h>
2016-08-08 13:54:50 -05:00
# include <linux/hash.h>
2018-04-05 16:25:34 -07:00
# include <linux/kmemleak.h>
2016-07-30 13:58:49 -05:00
# include <linux/user_namespace.h>
2016-08-08 13:54:50 -05:00
# define UCOUNTS_HASHTABLE_BITS 10
static struct hlist_head ucounts_hashtable [ ( 1 < < UCOUNTS_HASHTABLE_BITS ) ] ;
static DEFINE_SPINLOCK ( ucounts_lock ) ;
/*
 * Map a (user namespace, uid) pair to a bucket index: the namespace pointer
 * value and the numeric uid are summed and hashed down to
 * UCOUNTS_HASHTABLE_BITS bits.
 */
#define ucounts_hashfn(ns, uid)						\
	hash_long((unsigned long)(ns) + (unsigned long)__kuid_val(uid),	\
		  UCOUNTS_HASHTABLE_BITS)
/* Bucket head in ucounts_hashtable for a (user namespace, uid) pair. */
#define ucounts_hashentry(ns, uid)	\
	(&ucounts_hashtable[ucounts_hashfn(ns, uid)])
2016-07-30 13:58:49 -05:00
# ifdef CONFIG_SYSCTL
static struct ctl_table_set *
set_lookup ( struct ctl_table_root * root )
{
return & current_user_ns ( ) - > set ;
}
static int set_is_seen ( struct ctl_table_set * set )
{
return & current_user_ns ( ) - > set = = set ;
}
static int set_permissions ( struct ctl_table_header * head ,
struct ctl_table * table )
{
struct user_namespace * user_ns =
container_of ( head - > set , struct user_namespace , set ) ;
int mode ;
/* Allow users with CAP_SYS_RESOURCE unrestrained access */
if ( ns_capable ( user_ns , CAP_SYS_RESOURCE ) )
mode = ( table - > mode & S_IRWXU ) > > 6 ;
else
/* Allow all others at most read-only access */
mode = table - > mode & S_IROTH ;
return ( mode < < 6 ) | ( mode < < 3 ) | mode ;
}
/*
 * Sysctl root used for the per-user-namespace tables: set_lookup() picks the
 * table set and set_permissions() computes the effective access mode.
 */
static struct ctl_table_root set_root = {
	.permissions	= set_permissions,
	.lookup		= set_lookup,
};
2016-12-14 15:56:33 +02:00
/*
 * Initializer for one integer limit entry in user_table[].
 *
 * proc_dointvec_minmax() validates written values against the range given
 * by extra1 (minimum) and extra2 (maximum).  The shared SYSCTL_ZERO and
 * SYSCTL_INT_MAX constants are used for the bounds instead of file-local
 * zero/int_max variables, limiting values to 0..INT_MAX.
 * .data is not set here.
 */
#define UCOUNT_ENTRY(name)				\
	{						\
		.procname	= name,			\
		.maxlen		= sizeof(int),		\
		.mode		= 0644,			\
		.proc_handler	= proc_dointvec_minmax,	\
		.extra1		= SYSCTL_ZERO,		\
		.extra2		= SYSCTL_INT_MAX,	\
	}
2016-08-08 13:54:50 -05:00
/*
 * Template for the per-namespace limit files.  setup_userns_sysctls() copies
 * this array and points entry i at ns->ucount_max[i], so the order of the
 * entries has to match the indices used for ucount_max[].  The empty final
 * element terminates the table.
 */
static struct ctl_table user_table[] = {
	UCOUNT_ENTRY("max_user_namespaces"),
	UCOUNT_ENTRY("max_pid_namespaces"),
	UCOUNT_ENTRY("max_uts_namespaces"),
	UCOUNT_ENTRY("max_ipc_namespaces"),
	UCOUNT_ENTRY("max_net_namespaces"),
	UCOUNT_ENTRY("max_mnt_namespaces"),
	UCOUNT_ENTRY("max_cgroup_namespaces"),
#ifdef CONFIG_INOTIFY_USER
	UCOUNT_ENTRY("max_inotify_instances"),
	UCOUNT_ENTRY("max_inotify_watches"),
#endif
	{ }
};
# endif /* CONFIG_SYSCTL */
/*
 * Create the "user" sysctl table for @ns.
 *
 * A private copy of user_table is made so that each entry's ->data can point
 * at the matching slot of ns->ucount_max[], and the copy is registered in the
 * namespace's own table set.
 *
 * Returns true on success (always when CONFIG_SYSCTL is disabled), false if
 * the table could not be registered; in that case the copy is freed and the
 * table set is retired again.
 *
 * NOTE(review): when kmemdup() fails, the failure check depends on
 * ns->sysctls already being NULL on entry -- confirm against the callers.
 */
bool setup_userns_sysctls(struct user_namespace *ns)
{
#ifdef CONFIG_SYSCTL
	struct ctl_table *tbl;

	setup_sysctl_set(&ns->set, &set_root, set_is_seen);
	tbl = kmemdup(user_table, sizeof(user_table), GFP_KERNEL);
	if (tbl) {
		int i;

		/* Entry i of the table exposes limit i of this namespace. */
		for (i = 0; i < UCOUNT_COUNTS; i++) {
			tbl[i].data = &ns->ucount_max[i];
		}
		ns->sysctls = __register_sysctl_table(&ns->set, "user", tbl);
	}
	if (!ns->sysctls) {
		/* kfree(NULL) is a no-op, so this also covers a failed kmemdup(). */
		kfree(tbl);
		retire_sysctl_set(&ns->set);
		return false;
	}
#endif
	return true;
}
/*
 * Undo setup_userns_sysctls(): unregister the namespace's sysctl table,
 * retire its table set and free the duplicated table.
 */
void retire_userns_sysctls(struct user_namespace *ns)
{
#ifdef CONFIG_SYSCTL
	struct ctl_table *tbl;

	/*
	 * Fetch the table pointer before unregistering; presumably the header
	 * must not be dereferenced afterwards.
	 */
	tbl = ns->sysctls->ctl_table_arg;
	unregister_sysctl_table(ns->sysctls);
	retire_sysctl_set(&ns->set);
	kfree(tbl);
#endif
}
2016-08-08 13:54:50 -05:00
/*
 * Search bucket @hashent for the entry matching both @uid and @ns.
 * Returns the entry, or NULL if the bucket holds no match.  The callers in
 * this file invoke it with ucounts_lock held.
 */
static struct ucounts *find_ucounts(struct user_namespace *ns, kuid_t uid, struct hlist_head *hashent)
{
	struct ucounts *ucounts;

	hlist_for_each_entry(ucounts, hashent, node) {
		if (uid_eq(ucounts->uid, uid) && (ucounts->ns == ns))
			return ucounts;
	}
	return NULL;
}
/*
 * Look up, or create, the ucounts entry for (@ns, @uid) and take a reference
 * on it by incrementing ->count.
 *
 * Returns the entry, or NULL if a new entry could not be allocated or the
 * reference count is already at INT_MAX.
 */
static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid)
{
	struct hlist_head *hashent = ucounts_hashentry(ns, uid);
	struct ucounts *ucounts, *new;

	spin_lock_irq(&ucounts_lock);
	ucounts = find_ucounts(ns, uid, hashent);
	if (!ucounts) {
		/* The lock is dropped across the allocation. */
		spin_unlock_irq(&ucounts_lock);

		new = kzalloc(sizeof(*new), GFP_KERNEL);
		if (!new)
			return NULL;

		new->ns = ns;
		new->uid = uid;
		new->count = 0;

		spin_lock_irq(&ucounts_lock);
		/*
		 * Look again: another caller may have inserted the same entry
		 * while the lock was released.
		 */
		ucounts = find_ucounts(ns, uid, hashent);
		if (ucounts) {
			kfree(new);
		} else {
			hlist_add_head(&new->node, hashent);
			ucounts = new;
		}
	}
	/* Refuse to take a reference that would overflow the counter. */
	if (ucounts->count == INT_MAX)
		ucounts = NULL;
	else
		ucounts->count += 1;
	spin_unlock_irq(&ucounts_lock);
	return ucounts;
}
static void put_ucounts ( struct ucounts * ucounts )
{
2017-01-20 15:21:35 +02:00
unsigned long flags ;
2017-03-05 15:03:22 -06:00
spin_lock_irqsave ( & ucounts_lock , flags ) ;
ucounts - > count - = 1 ;
if ( ! ucounts - > count )
2016-08-08 13:54:50 -05:00
hlist_del_init ( & ucounts - > node ) ;
2017-03-05 15:03:22 -06:00
else
ucounts = NULL ;
spin_unlock_irqrestore ( & ucounts_lock , flags ) ;
2016-08-08 13:54:50 -05:00
2017-03-05 15:03:22 -06:00
kfree ( ucounts ) ;
2016-08-08 13:54:50 -05:00
}
2016-08-08 13:41:24 -05:00
/*
 * Atomically increment *@v, but only while its value is below @u.
 *
 * Returns true if the increment was performed, false if the value observed
 * was already >= @u (the counter is then left unchanged).
 */
static inline bool atomic_inc_below(atomic_t *v, int u)
{
	int c, old;

	c = atomic_read(v);
	for (;;) {
		if (unlikely(c >= u))
			return false;
		old = atomic_cmpxchg(v, c, c + 1);
		if (likely(old == c))
			return true;
		/* The value changed underneath us; retry with what was seen. */
		c = old;
	}
}
2016-08-08 14:41:52 -05:00
/*
 * Charge one object of kind @type to @uid in @ns.
 *
 * Starting at the ucounts entry for (@ns, @uid), the counter for @type is
 * incremented at every level reached by following iter->ns->ucounts, each
 * time checked against that namespace's ucount_max[@type].
 *
 * Returns the entry for (@ns, @uid) with a reference held (to be released
 * through dec_ucount()), or NULL if the entry could not be obtained or a
 * limit was hit.  On a limit failure the increments already made are undone.
 */
struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid,
			   enum ucount_type type)
{
	struct ucounts *ucounts, *iter, *bad;
	struct user_namespace *tns;

	ucounts = get_ucounts(ns, uid);
	for (iter = ucounts; iter; iter = tns->ucounts) {
		int max;

		tns = iter->ns;
		max = READ_ONCE(tns->ucount_max[type]);
		if (!atomic_inc_below(&iter->ucount[type], max))
			goto fail;
	}
	return ucounts;
fail:
	/* Roll back every level that was incremented before the failing one. */
	bad = iter;
	for (iter = ucounts; iter != bad; iter = iter->ns->ucounts)
		atomic_dec(&iter->ucount[type]);

	put_ucounts(ucounts);
	return NULL;
}
2016-08-08 14:41:52 -05:00
void dec_ucount ( struct ucounts * ucounts , enum ucount_type type )
2016-08-08 13:41:24 -05:00
{
2016-08-08 13:54:50 -05:00
struct ucounts * iter ;
for ( iter = ucounts ; iter ; iter = iter - > ns - > ucounts ) {
2016-08-08 14:41:52 -05:00
int dec = atomic_dec_if_positive ( & iter - > ucount [ type ] ) ;
2016-08-08 13:41:24 -05:00
WARN_ON_ONCE ( dec < 0 ) ;
}
2016-08-08 13:54:50 -05:00
put_ucounts ( ucounts ) ;
2016-08-08 13:41:24 -05:00
}
2016-07-30 13:58:49 -05:00
/*
 * Boot-time setup: register an empty "user" sysctl directory and create the
 * sysctl table for init_user_ns.  Either step failing is fatal (BUG_ON).
 * Always returns 0.
 */
static __init int user_namespace_sysctl_init(void)
{
#ifdef CONFIG_SYSCTL
	static struct ctl_table_header *user_header;
	static struct ctl_table empty[1];
	/*
	 * It is necessary to register the user directory in the
	 * default set so that registrations in the child sets work
	 * properly.
	 */
	user_header = register_sysctl("user", empty);
	/* The header is never unregistered; keep kmemleak from reporting it. */
	kmemleak_ignore(user_header);
	BUG_ON(!user_header);
	BUG_ON(!setup_userns_sysctls(&init_user_ns));
#endif
	return 0;
}
subsys_initcall(user_namespace_sysctl_init);