2005-04-17 02:20:36 +04:00
/*
* sysctl . c : General linux system control interface
*
* Begun 24 March 1995 , Stephen Tweedie
* Added / proc support , Dec 1995
* Added bdflush entry and intvec min / max checking , 2 / 23 / 96 , Tom Dyas .
* Added hooks for / proc / sys / net ( minor , minor patch ) , 96 / 4 / 1 , Mike Shaver .
* Added kernel / java - { interpreter , appletviewer } , 96 / 5 / 10 , Mike Shaver .
* Dynamic registration fixes , Stephen Tweedie .
* Added kswapd - interval , ctrl - alt - del , printk stuff , 1 / 8 / 97 , Chris Horn .
* Made sysctl support optional via CONFIG_SYSCTL , 1 / 10 / 97 , Chris
* Horn .
* Added proc_doulongvec_ms_jiffies_minmax , 09 / 08 / 99 , Carlos H . Bauer .
* Added proc_doulongvec_minmax , 09 / 08 / 99 , Carlos H . Bauer .
* Changed linked lists to use list . h instead of lists . h , 02 / 24 / 00 , Bill
* Wendling .
* The list_for_each ( ) macro wasn ' t appropriate for the sysctl loop .
* Removed it and replaced it with older style , 03 / 23 / 00 , Bill Wendling
*/
# include <linux/module.h>
# include <linux/mm.h>
# include <linux/swap.h>
# include <linux/slab.h>
# include <linux/sysctl.h>
# include <linux/proc_fs.h>
V3 file capabilities: alter behavior of cap_setpcap
The non-filesystem capability meaning of CAP_SETPCAP is that a process, p1,
can change the capabilities of another process, p2. This is not the
meaning that was intended for this capability at all, and this
implementation came about purely because, without filesystem capabilities,
there was no way to use capabilities without one process bestowing them on
another.
Since we now have filesystem support for capabilities we can fix the
implementation of CAP_SETPCAP.
The most significant thing about this change is that, with it in effect, no
process can set the capabilities of another process.
The capabilities of a program are set via the capability convolution
rules:
pI(post-exec) = pI(pre-exec)
pP(post-exec) = (X(aka cap_bset) & fP) | (pI(post-exec) & fI)
pE(post-exec) = fE ? pP(post-exec) : 0
at exec() time. As such, the only influence the pre-exec() program can
have on the post-exec() program's capabilities are through the pI
capability set.
The correct implementation for CAP_SETPCAP (and that enabled by this patch)
is that it can be used to add extra pI capabilities to the current process
- to be picked up by subsequent exec()s when the above convolution rules
are applied.
Here is how it works:
Let's say we have a process, p. It has capability sets, pE, pP and pI.
Generally, p, can change the value of its own pI to pI' where
(pI' & ~pI) & ~pP = 0.
That is, the only new things in pI' that were not present in pI need to
be present in pP.
The role of CAP_SETPCAP is basically to permit changes to pI beyond
the above:
if (pE & CAP_SETPCAP) {
pI' = anything; /* ie., even (pI' & ~pI) & ~pP != 0 */
}
This capability is useful for things like login, which (say, via
pam_cap) might want to raise certain inheritable capabilities for use
by the children of the logged-in user's shell, but those capabilities
are not useful to or needed by the login program itself.
One such use might be to limit who can run ping. You set the
capabilities of the 'ping' program to be "= cap_net_raw+i", and then
only shells that have (pI & CAP_NET_RAW) will be able to run
it. Without CAP_SETPCAP implemented as described above, login(pam_cap)
would have to also have (pP & CAP_NET_RAW) in order to raise this
capability and pass it on through the inheritable set.
Signed-off-by: Andrew Morgan <morgan@kernel.org>
Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
Cc: Stephen Smalley <sds@tycho.nsa.gov>
Cc: James Morris <jmorris@namei.org>
Cc: Casey Schaufler <casey@schaufler-ca.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-18 14:05:59 +04:00
# include <linux/security.h>
2005-04-17 02:20:36 +04:00
# include <linux/ctype.h>
# include <linux/utsname.h>
# include <linux/smp_lock.h>
2007-07-17 15:03:45 +04:00
# include <linux/fs.h>
2005-04-17 02:20:36 +04:00
# include <linux/init.h>
# include <linux/kernel.h>
2005-11-11 07:33:52 +03:00
# include <linux/kobject.h>
2005-08-16 09:18:02 +04:00
# include <linux/net.h>
2005-04-17 02:20:36 +04:00
# include <linux/sysrq.h>
# include <linux/highuid.h>
# include <linux/writeback.h>
# include <linux/hugetlb.h>
# include <linux/security.h>
# include <linux/initrd.h>
# include <linux/times.h>
# include <linux/limits.h>
# include <linux/dcache.h>
# include <linux/syscalls.h>
2006-02-21 05:27:58 +03:00
# include <linux/nfs_fs.h>
# include <linux/acpi.h>
2007-07-18 05:37:02 +04:00
# include <linux/reboot.h>
2005-04-17 02:20:36 +04:00
# include <asm/uaccess.h>
# include <asm/processor.h>
2006-09-30 03:47:55 +04:00
# ifdef CONFIG_X86
# include <asm/nmi.h>
2006-12-07 04:14:11 +03:00
# include <asm/stacktrace.h>
2006-09-30 03:47:55 +04:00
# endif
2007-10-18 14:05:58 +04:00
static int deprecated_sysctl_warning ( struct __sysctl_args * args ) ;
2005-04-17 02:20:36 +04:00
# if defined(CONFIG_SYSCTL)
/* External variables not in a header file. */
extern int C_A_D ;
2007-07-16 10:40:10 +04:00
extern int print_fatal_signals ;
2005-04-17 02:20:36 +04:00
extern int sysctl_overcommit_memory ;
extern int sysctl_overcommit_ratio ;
2006-06-23 13:03:13 +04:00
extern int sysctl_panic_on_oom ;
2007-10-17 10:25:56 +04:00
extern int sysctl_oom_kill_allocating_task ;
2005-04-17 02:20:36 +04:00
extern int max_threads ;
extern int core_uses_pid ;
2005-06-23 11:09:43 +04:00
extern int suid_dumpable ;
2005-04-17 02:20:36 +04:00
extern char core_pattern [ ] ;
extern int pid_max ;
extern int min_free_kbytes ;
extern int printk_ratelimit_jiffies ;
extern int printk_ratelimit_burst ;
extern int pid_max_min , pid_max_max ;
2006-01-08 12:00:39 +03:00
extern int sysctl_drop_caches ;
2006-01-08 12:00:40 +03:00
extern int percpu_pagelist_fraction ;
2006-06-26 15:56:52 +04:00
extern int compat_log ;
2007-05-08 11:26:04 +04:00
extern int maps_protect ;
2007-05-09 13:35:13 +04:00
extern int sysctl_stat_interval ;
2007-07-19 12:48:15 +04:00
extern int audit_argv_kb ;
2005-04-17 02:20:36 +04:00
2007-10-17 10:26:09 +04:00
/* Constants used for minimum and maximum */
# ifdef CONFIG_DETECT_SOFTLOCKUP
static int one = 1 ;
static int sixty = 60 ;
# endif
# ifdef CONFIG_MMU
static int two = 2 ;
# endif
static int zero ;
static int one_hundred = 100 ;
2005-04-17 02:20:36 +04:00
/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
static int maxolduid = 65535 ;
static int minolduid ;
2006-01-08 12:00:40 +03:00
static int min_percpu_pagelist_fract = 8 ;
2005-04-17 02:20:36 +04:00
static int ngroups_max = NGROUPS_MAX ;
# ifdef CONFIG_KMOD
extern char modprobe_path [ ] ;
# endif
# ifdef CONFIG_CHR_DEV_SG
extern int sg_big_buff ;
# endif
# ifdef __sparc__
extern char reboot_command [ ] ;
extern int stop_a_enabled ;
extern int scons_pwroff ;
# endif
# ifdef __hppa__
extern int pwrsw_enabled ;
extern int unaligned_enabled ;
# endif
2006-01-06 11:19:28 +03:00
# ifdef CONFIG_S390
2005-04-17 02:20:36 +04:00
# ifdef CONFIG_MATHEMU
extern int sysctl_ieee_emulation_warnings ;
# endif
extern int sysctl_userprocess_debug ;
2005-07-27 22:44:57 +04:00
extern int spin_retry ;
2005-04-17 02:20:36 +04:00
# endif
extern int sysctl_hz_timer ;
# ifdef CONFIG_BSD_PROCESS_ACCT
extern int acct_parm [ ] ;
# endif
2006-02-28 20:42:23 +03:00
# ifdef CONFIG_IA64
extern int no_unaligned_warning ;
# endif
2006-06-27 13:54:53 +04:00
# ifdef CONFIG_RT_MUTEXES
extern int max_lock_depth ;
# endif
2006-09-27 12:51:04 +04:00
# ifdef CONFIG_SYSCTL_SYSCALL
static int parse_table ( int __user * , int , void __user * , size_t __user * ,
2007-10-18 14:05:22 +04:00
void __user * , size_t , struct ctl_table * ) ;
2006-09-27 12:51:04 +04:00
# endif
2006-12-08 13:39:57 +03:00
2006-10-20 10:28:34 +04:00
# ifdef CONFIG_PROC_SYSCTL
2007-10-18 14:05:22 +04:00
static int proc_do_cad_pid ( struct ctl_table * table , int write , struct file * filp ,
2006-10-02 13:19:00 +04:00
void __user * buffer , size_t * lenp , loff_t * ppos ) ;
2007-10-18 14:05:22 +04:00
static int proc_dointvec_taint ( struct ctl_table * table , int write , struct file * filp ,
2007-02-10 12:45:24 +03:00
void __user * buffer , size_t * lenp , loff_t * ppos ) ;
2006-10-20 10:28:34 +04:00
# endif
2006-10-02 13:19:00 +04:00
2007-10-18 14:05:22 +04:00
static struct ctl_table root_table [ ] ;
2005-04-17 02:20:36 +04:00
static struct ctl_table_header root_table_header =
{ root_table , LIST_HEAD_INIT ( root_table_header . ctl_entry ) } ;
2007-10-18 14:05:22 +04:00
static struct ctl_table kern_table [ ] ;
static struct ctl_table vm_table [ ] ;
static struct ctl_table fs_table [ ] ;
static struct ctl_table debug_table [ ] ;
static struct ctl_table dev_table [ ] ;
extern struct ctl_table random_table [ ] ;
2006-06-02 00:10:59 +04:00
# ifdef CONFIG_INOTIFY_USER
2007-10-18 14:05:22 +04:00
extern struct ctl_table inotify_table [ ] ;
2005-07-13 20:38:18 +04:00
# endif
2005-04-17 02:20:36 +04:00
# ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
int sysctl_legacy_va_layout ;
# endif
2007-07-19 12:48:56 +04:00
extern int prove_locking ;
extern int lock_stat ;
2006-12-08 13:39:56 +03:00
2005-04-17 02:20:36 +04:00
/* The default sysctl tables: */
2007-10-18 14:05:22 +04:00
/*
 * Top-level sysctl table. Each named entry below is a directory node:
 * it carries no .data of its own, has mode 0555, and its .child points
 * at the table that holds the entries of that subtree (kern_table,
 * vm_table, fs_table, debug_table, dev_table). The "net" entry is only
 * compiled in when CONFIG_NET is defined. The final entry has
 * ctl_name 0 -- presumably the end-of-table marker (confirm against
 * the table walker).
 */
static struct ctl_table root_table [ ] = {
2005-04-17 02:20:36 +04:00
{
. ctl_name = CTL_KERN ,
. procname = " kernel " ,
. mode = 0555 ,
. child = kern_table ,
} ,
{
. ctl_name = CTL_VM ,
. procname = " vm " ,
. mode = 0555 ,
. child = vm_table ,
} ,
# ifdef CONFIG_NET
{
. ctl_name = CTL_NET ,
. procname = " net " ,
. mode = 0555 ,
. child = net_table ,
} ,
# endif
{
. ctl_name = CTL_FS ,
. procname = " fs " ,
. mode = 0555 ,
. child = fs_table ,
} ,
{
. ctl_name = CTL_DEBUG ,
. procname = " debug " ,
. mode = 0555 ,
. child = debug_table ,
} ,
{
. ctl_name = CTL_DEV ,
. procname = " dev " ,
. mode = 0555 ,
. child = dev_table ,
} ,
2007-07-16 10:41:21 +04:00
/*
* NOTE : do not add new entries to this table unless you have read
* Documentation / sysctl / ctl_unnumbered . txt
*/
2005-04-17 02:20:36 +04:00
{ . ctl_name = 0 }
} ;
2007-07-09 20:52:00 +04:00
# ifdef CONFIG_SCHED_DEBUG
2007-12-18 17:21:13 +03:00
/*
 * Lower/upper bounds passed via .extra1/.extra2 to the scheduler
 * tunables in kern_table: the *_sched_granularity_ns pair bounds both
 * sched_min_granularity_ns and sched_latency_ns; the
 * *_wakeup_granularity_ns pair bounds sched_wakeup_granularity_ns and
 * sched_batch_wakeup_granularity_ns.
 */
static int min_sched_granularity_ns = 100000 ; /* 100 usecs */
static int max_sched_granularity_ns = NSEC_PER_SEC ; /* 1 second */
static int min_wakeup_granularity_ns ; /* 0 usecs */
static int max_wakeup_granularity_ns = NSEC_PER_SEC ; /* 1 second */
2007-07-09 20:52:00 +04:00
# endif
2007-10-18 14:05:22 +04:00
static struct ctl_table kern_table [ ] = {
2007-07-09 20:52:00 +04:00
# ifdef CONFIG_SCHED_DEBUG
{
. ctl_name = CTL_UNNUMBERED ,
2007-11-10 00:39:37 +03:00
. procname = " sched_min_granularity_ns " ,
. data = & sysctl_sched_min_granularity ,
2007-07-09 20:52:00 +04:00
. maxlen = sizeof ( unsigned int ) ,
. mode = 0644 ,
2007-11-10 00:39:37 +03:00
. proc_handler = & sched_nr_latency_handler ,
. strategy = & sysctl_intvec ,
. extra1 = & min_sched_granularity_ns ,
. extra2 = & max_sched_granularity_ns ,
2007-07-09 20:52:00 +04:00
} ,
2007-08-25 20:41:53 +04:00
{
. ctl_name = CTL_UNNUMBERED ,
. procname = " sched_latency_ns " ,
. data = & sysctl_sched_latency ,
. maxlen = sizeof ( unsigned int ) ,
. mode = 0644 ,
2007-11-10 00:39:37 +03:00
. proc_handler = & sched_nr_latency_handler ,
2007-08-25 20:41:53 +04:00
. strategy = & sysctl_intvec ,
. extra1 = & min_sched_granularity_ns ,
. extra2 = & max_sched_granularity_ns ,
} ,
2007-07-09 20:52:00 +04:00
{
. ctl_name = CTL_UNNUMBERED ,
. procname = " sched_wakeup_granularity_ns " ,
. data = & sysctl_sched_wakeup_granularity ,
. maxlen = sizeof ( unsigned int ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec_minmax ,
. strategy = & sysctl_intvec ,
. extra1 = & min_wakeup_granularity_ns ,
. extra2 = & max_wakeup_granularity_ns ,
} ,
{
. ctl_name = CTL_UNNUMBERED ,
. procname = " sched_batch_wakeup_granularity_ns " ,
. data = & sysctl_sched_batch_wakeup_granularity ,
. maxlen = sizeof ( unsigned int ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec_minmax ,
. strategy = & sysctl_intvec ,
. extra1 = & min_wakeup_granularity_ns ,
. extra2 = & max_wakeup_granularity_ns ,
} ,
{
. ctl_name = CTL_UNNUMBERED ,
. procname = " sched_child_runs_first " ,
. data = & sysctl_sched_child_runs_first ,
. maxlen = sizeof ( unsigned int ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec ,
} ,
2007-08-25 20:41:52 +04:00
{
. ctl_name = CTL_UNNUMBERED ,
. procname = " sched_features " ,
. data = & sysctl_sched_features ,
. maxlen = sizeof ( unsigned int ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec ,
} ,
2007-10-15 19:00:18 +04:00
{
. ctl_name = CTL_UNNUMBERED ,
. procname = " sched_migration_cost " ,
. data = & sysctl_sched_migration_cost ,
. maxlen = sizeof ( unsigned int ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec ,
} ,
2007-11-10 00:39:39 +03:00
{
. ctl_name = CTL_UNNUMBERED ,
. procname = " sched_nr_migrate " ,
. data = & sysctl_sched_nr_migrate ,
. maxlen = sizeof ( unsigned int ) ,
2008-01-25 23:08:29 +03:00
. mode = 0644 ,
. proc_handler = & proc_dointvec ,
} ,
{
. ctl_name = CTL_UNNUMBERED ,
. procname = " sched_rt_period_ms " ,
. data = & sysctl_sched_rt_period ,
. maxlen = sizeof ( unsigned int ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec ,
} ,
{
. ctl_name = CTL_UNNUMBERED ,
. procname = " sched_rt_ratio " ,
. data = & sysctl_sched_rt_ratio ,
. maxlen = sizeof ( unsigned int ) ,
. mode = 0644 ,
2007-11-10 00:39:39 +03:00
. proc_handler = & proc_dointvec ,
} ,
sched: group scheduler, fix fairness of cpu bandwidth allocation for task groups
The current load balancing scheme isn't good enough for precise
group fairness.
For example: on a 8-cpu system, I created 3 groups as under:
a = 8 tasks (cpu.shares = 1024)
b = 4 tasks (cpu.shares = 1024)
c = 3 tasks (cpu.shares = 1024)
a, b and c are task groups that have equal weight. We would expect each
of the groups to receive 33.33% of cpu bandwidth under a fair scheduler.
This is what I get with the latest scheduler git tree:
Signed-off-by: Ingo Molnar <mingo@elte.hu>
--------------------------------------------------------------------------------
Col1 | Col2 | Col3 | Col4
------|---------|-------|-------------------------------------------------------
a | 277.676 | 57.8% | 54.1% 54.1% 54.1% 54.2% 56.7% 62.2% 62.8% 64.5%
b | 116.108 | 24.2% | 47.4% 48.1% 48.7% 49.3%
c | 86.326 | 18.0% | 47.5% 47.9% 48.5%
--------------------------------------------------------------------------------
Explanation of o/p:
Col1 -> Group name
Col2 -> Cumulative execution time (in seconds) received by all tasks of that
group in a 60sec window across 8 cpus
Col3 -> CPU bandwidth received by the group in the 60sec window, expressed in
percentage. Col3 data is derived as:
Col3 = 100 * Col2 / (NR_CPUS * 60)
Col4 -> CPU bandwidth received by each individual task of the group.
Col4 = 100 * cpu_time_recd_by_task / 60
[I can share the test case that produces a similar o/p if reqd]
The deviation from desired group fairness is as below:
a = +24.47%
b = -9.13%
c = -15.33%
which is quite high.
After the patch below is applied, here are the results:
--------------------------------------------------------------------------------
Col1 | Col2 | Col3 | Col4
------|---------|-------|-------------------------------------------------------
a | 163.112 | 34.0% | 33.2% 33.4% 33.5% 33.5% 33.7% 34.4% 34.8% 35.3%
b | 156.220 | 32.5% | 63.3% 64.5% 66.1% 66.5%
c | 160.653 | 33.5% | 85.8% 90.6% 91.4%
--------------------------------------------------------------------------------
Deviation from desired group fairness is as below:
a = +0.67%
b = -0.83%
c = +0.17%
which is far better IMO. Most of other runs have yielded a deviation within
+-2% at the most, which is good.
Why do we see bad (group) fairness with current scheduler?
=========================================================
Currently cpu's weight is just the summation of individual task weights.
This can yield incorrect results. For ex: consider three groups as below
on a 2-cpu system:
CPU0 CPU1
---------------------------
A (10) B(5)
C(5)
---------------------------
Group A has 10 tasks, all on CPU0, Group B and C have 5 tasks each all
of which are on CPU1. Each task has the same weight (NICE_0_LOAD =
1024).
The current scheme would yield a cpu weight of 10240 (10*1024) for each cpu and
the load balancer will think both CPUs are perfectly balanced and won't
move around any tasks. This, however, would yield this bandwidth:
A = 50%
B = 25%
C = 25%
which is not the desired result.
What's changing in the patch?
=============================
- How cpu weights are calculated when CONFIG_FAIR_GROUP_SCHED is
defined (see below)
- API Change
- Two tunables introduced in sysfs (under SCHED_DEBUG) to
control the frequency at which the load balance monitor
thread runs.
The basic change made in this patch is how cpu weight (rq->load.weight) is
calculated. It's now calculated as the summation of group weights on a cpu,
rather than summation of task weights. Weight exerted by a group on a
cpu is dependent on the shares allocated to it and also the number of
tasks the group has on that cpu compared to the total number of
(runnable) tasks the group has in the system.
Let,
W(K,i) = Weight of group K on cpu i
T(K,i) = Task load present in group K's cfs_rq on cpu i
T(K) = Total task load of group K across various cpus
S(K) = Shares allocated to group K
NRCPUS = Number of online cpus in the scheduler domain to
which group K is assigned.
Then,
W(K,i) = S(K) * NRCPUS * T(K,i) / T(K)
A load balance monitor thread is created at bootup, which periodically
runs and adjusts group's weight on each cpu. To avoid its overhead, two
min/max tunables are introduced (under SCHED_DEBUG) to control the rate
at which it runs.
Fixes from: Peter Zijlstra <a.p.zijlstra@chello.nl>
- don't start the load_balance_monitor when there is only a single cpu.
- rename the kthread because it's currently longer than TASK_COMM_LEN
Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-01-25 23:08:00 +03:00
# if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
{
. ctl_name = CTL_UNNUMBERED ,
. procname = " sched_min_bal_int_shares " ,
. data = & sysctl_sched_min_bal_int_shares ,
. maxlen = sizeof ( unsigned int ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec ,
} ,
{
. ctl_name = CTL_UNNUMBERED ,
. procname = " sched_max_bal_int_shares " ,
. data = & sysctl_sched_max_bal_int_shares ,
. maxlen = sizeof ( unsigned int ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec ,
} ,
# endif
2007-08-25 20:41:52 +04:00
# endif
2007-09-20 01:34:46 +04:00
{
. ctl_name = CTL_UNNUMBERED ,
. procname = " sched_compat_yield " ,
. data = & sysctl_sched_compat_yield ,
. maxlen = sizeof ( unsigned int ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec ,
} ,
2007-07-19 12:48:56 +04:00
# ifdef CONFIG_PROVE_LOCKING
{
. ctl_name = CTL_UNNUMBERED ,
. procname = " prove_locking " ,
. data = & prove_locking ,
. maxlen = sizeof ( int ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec ,
} ,
# endif
# ifdef CONFIG_LOCK_STAT
{
. ctl_name = CTL_UNNUMBERED ,
. procname = " lock_stat " ,
. data = & lock_stat ,
. maxlen = sizeof ( int ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec ,
} ,
2007-07-09 20:52:00 +04:00
# endif
2005-04-17 02:20:36 +04:00
{
. ctl_name = KERN_PANIC ,
. procname = " panic " ,
. data = & panic_timeout ,
. maxlen = sizeof ( int ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec ,
} ,
{
. ctl_name = KERN_CORE_USES_PID ,
. procname = " core_uses_pid " ,
. data = & core_uses_pid ,
. maxlen = sizeof ( int ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec ,
} ,
2007-07-19 12:48:15 +04:00
# ifdef CONFIG_AUDITSYSCALL
{
. ctl_name = CTL_UNNUMBERED ,
. procname = " audit_argv_kb " ,
. data = & audit_argv_kb ,
. maxlen = sizeof ( int ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec ,
} ,
# endif
2005-04-17 02:20:36 +04:00
{
. ctl_name = KERN_CORE_PATTERN ,
. procname = " core_pattern " ,
. data = core_pattern ,
2007-05-17 09:11:16 +04:00
. maxlen = CORENAME_MAX_SIZE ,
2005-04-17 02:20:36 +04:00
. mode = 0644 ,
. proc_handler = & proc_dostring ,
. strategy = & sysctl_string ,
} ,
2007-02-10 12:45:24 +03:00
# ifdef CONFIG_PROC_SYSCTL
2005-04-17 02:20:36 +04:00
{
. procname = " tainted " ,
. data = & tainted ,
. maxlen = sizeof ( int ) ,
2007-02-10 12:45:24 +03:00
. mode = 0644 ,
. proc_handler = & proc_dointvec_taint ,
2005-04-17 02:20:36 +04:00
} ,
2007-02-10 12:45:24 +03:00
# endif
V3 file capabilities: alter behavior of cap_setpcap
The non-filesystem capability meaning of CAP_SETPCAP is that a process, p1,
can change the capabilities of another process, p2. This is not the
meaning that was intended for this capability at all, and this
implementation came about purely because, without filesystem capabilities,
there was no way to use capabilities without one process bestowing them on
another.
Since we now have filesystem support for capabilities we can fix the
implementation of CAP_SETPCAP.
The most significant thing about this change is that, with it in effect, no
process can set the capabilities of another process.
The capabilities of a program are set via the capability convolution
rules:
pI(post-exec) = pI(pre-exec)
pP(post-exec) = (X(aka cap_bset) & fP) | (pI(post-exec) & fI)
pE(post-exec) = fE ? pP(post-exec) : 0
at exec() time. As such, the only influence the pre-exec() program can
have on the post-exec() program's capabilities are through the pI
capability set.
The correct implementation for CAP_SETPCAP (and that enabled by this patch)
is that it can be used to add extra pI capabilities to the current process
- to be picked up by subsequent exec()s when the above convolution rules
are applied.
Here is how it works:
Let's say we have a process, p. It has capability sets, pE, pP and pI.
Generally, p, can change the value of its own pI to pI' where
(pI' & ~pI) & ~pP = 0.
That is, the only new things in pI' that were not present in pI need to
be present in pP.
The role of CAP_SETPCAP is basically to permit changes to pI beyond
the above:
if (pE & CAP_SETPCAP) {
pI' = anything; /* ie., even (pI' & ~pI) & ~pP != 0 */
}
This capability is useful for things like login, which (say, via
pam_cap) might want to raise certain inheritable capabilities for use
by the children of the logged-in user's shell, but those capabilities
are not useful to or needed by the login program itself.
One such use might be to limit who can run ping. You set the
capabilities of the 'ping' program to be "= cap_net_raw+i", and then
only shells that have (pI & CAP_NET_RAW) will be able to run
it. Without CAP_SETPCAP implemented as described above, login(pam_cap)
would have to also have (pP & CAP_NET_RAW) in order to raise this
capability and pass it on through the inheritable set.
Signed-off-by: Andrew Morgan <morgan@kernel.org>
Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
Cc: Stephen Smalley <sds@tycho.nsa.gov>
Cc: James Morris <jmorris@namei.org>
Cc: Casey Schaufler <casey@schaufler-ca.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-18 14:05:59 +04:00
# ifdef CONFIG_SECURITY_CAPABILITIES
2005-04-17 02:20:36 +04:00
{
. procname = " cap-bound " ,
. data = & cap_bset ,
. maxlen = sizeof ( kernel_cap_t ) ,
. mode = 0600 ,
. proc_handler = & proc_dointvec_bset ,
} ,
V3 file capabilities: alter behavior of cap_setpcap
The non-filesystem capability meaning of CAP_SETPCAP is that a process, p1,
can change the capabilities of another process, p2. This is not the
meaning that was intended for this capability at all, and this
implementation came about purely because, without filesystem capabilities,
there was no way to use capabilities without one process bestowing them on
another.
Since we now have filesystem support for capabilities we can fix the
implementation of CAP_SETPCAP.
The most significant thing about this change is that, with it in effect, no
process can set the capabilities of another process.
The capabilities of a program are set via the capability convolution
rules:
pI(post-exec) = pI(pre-exec)
pP(post-exec) = (X(aka cap_bset) & fP) | (pI(post-exec) & fI)
pE(post-exec) = fE ? pP(post-exec) : 0
at exec() time. As such, the only influence the pre-exec() program can
have on the post-exec() program's capabilities are through the pI
capability set.
The correct implementation for CAP_SETPCAP (and that enabled by this patch)
is that it can be used to add extra pI capabilities to the current process
- to be picked up by subsequent exec()s when the above convolution rules
are applied.
Here is how it works:
Let's say we have a process, p. It has capability sets, pE, pP and pI.
Generally, p, can change the value of its own pI to pI' where
(pI' & ~pI) & ~pP = 0.
That is, the only new things in pI' that were not present in pI need to
be present in pP.
The role of CAP_SETPCAP is basically to permit changes to pI beyond
the above:
if (pE & CAP_SETPCAP) {
pI' = anything; /* ie., even (pI' & ~pI) & ~pP != 0 */
}
This capability is useful for things like login, which (say, via
pam_cap) might want to raise certain inheritable capabilities for use
by the children of the logged-in user's shell, but those capabilities
are not useful to or needed by the login program itself.
One such use might be to limit who can run ping. You set the
capabilities of the 'ping' program to be "= cap_net_raw+i", and then
only shells that have (pI & CAP_NET_RAW) will be able to run
it. Without CAP_SETPCAP implemented as described above, login(pam_cap)
would have to also have (pP & CAP_NET_RAW) in order to raise this
capability and pass it on through the inheritable set.
Signed-off-by: Andrew Morgan <morgan@kernel.org>
Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
Cc: Stephen Smalley <sds@tycho.nsa.gov>
Cc: James Morris <jmorris@namei.org>
Cc: Casey Schaufler <casey@schaufler-ca.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-18 14:05:59 +04:00
# endif /* def CONFIG_SECURITY_CAPABILITIES */
2005-04-17 02:20:36 +04:00
# ifdef CONFIG_BLK_DEV_INITRD
{
. ctl_name = KERN_REALROOTDEV ,
. procname = " real-root-dev " ,
. data = & real_root_dev ,
. maxlen = sizeof ( int ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec ,
} ,
# endif
2007-07-16 10:40:10 +04:00
{
. ctl_name = CTL_UNNUMBERED ,
. procname = " print-fatal-signals " ,
. data = & print_fatal_signals ,
. maxlen = sizeof ( int ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec ,
} ,
2005-04-17 02:20:36 +04:00
# ifdef __sparc__
{
. ctl_name = KERN_SPARC_REBOOT ,
. procname = " reboot-cmd " ,
. data = reboot_command ,
. maxlen = 256 ,
. mode = 0644 ,
. proc_handler = & proc_dostring ,
. strategy = & sysctl_string ,
} ,
{
. ctl_name = KERN_SPARC_STOP_A ,
. procname = " stop-a " ,
. data = & stop_a_enabled ,
. maxlen = sizeof ( int ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec ,
} ,
{
. ctl_name = KERN_SPARC_SCONS_PWROFF ,
. procname = " scons-poweroff " ,
. data = & scons_pwroff ,
. maxlen = sizeof ( int ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec ,
} ,
# endif
# ifdef __hppa__
{
. ctl_name = KERN_HPPA_PWRSW ,
. procname = " soft-power " ,
. data = & pwrsw_enabled ,
. maxlen = sizeof ( int ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec ,
} ,
{
. ctl_name = KERN_HPPA_UNALIGNED ,
. procname = " unaligned-trap " ,
. data = & unaligned_enabled ,
. maxlen = sizeof ( int ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec ,
} ,
# endif
{
. ctl_name = KERN_CTLALTDEL ,
. procname = " ctrl-alt-del " ,
. data = & C_A_D ,
. maxlen = sizeof ( int ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec ,
} ,
{
. ctl_name = KERN_PRINTK ,
. procname = " printk " ,
. data = & console_loglevel ,
. maxlen = 4 * sizeof ( int ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec ,
} ,
# ifdef CONFIG_KMOD
{
. ctl_name = KERN_MODPROBE ,
. procname = " modprobe " ,
. data = & modprobe_path ,
. maxlen = KMOD_PATH_LEN ,
. mode = 0644 ,
. proc_handler = & proc_dostring ,
. strategy = & sysctl_string ,
} ,
# endif
2006-06-23 13:05:47 +04:00
# if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
2005-04-17 02:20:36 +04:00
{
. ctl_name = KERN_HOTPLUG ,
. procname = " hotplug " ,
2005-11-16 11:00:00 +03:00
. data = & uevent_helper ,
. maxlen = UEVENT_HELPER_PATH_LEN ,
2005-04-17 02:20:36 +04:00
. mode = 0644 ,
. proc_handler = & proc_dostring ,
. strategy = & sysctl_string ,
} ,
# endif
# ifdef CONFIG_CHR_DEV_SG
{
. ctl_name = KERN_SG_BIG_BUFF ,
. procname = " sg-big-buff " ,
. data = & sg_big_buff ,
. maxlen = sizeof ( int ) ,
. mode = 0444 ,
. proc_handler = & proc_dointvec ,
} ,
# endif
# ifdef CONFIG_BSD_PROCESS_ACCT
{
. ctl_name = KERN_ACCT ,
. procname = " acct " ,
. data = & acct_parm ,
. maxlen = 3 * sizeof ( int ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec ,
} ,
# endif
# ifdef CONFIG_MAGIC_SYSRQ
{
. ctl_name = KERN_SYSRQ ,
. procname = " sysrq " ,
2006-12-13 11:34:36 +03:00
. data = & __sysrq_enabled ,
2005-04-17 02:20:36 +04:00
. maxlen = sizeof ( int ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec ,
} ,
# endif
2006-10-20 10:28:34 +04:00
# ifdef CONFIG_PROC_SYSCTL
2005-04-17 02:20:36 +04:00
{
. procname = " cad_pid " ,
2006-10-02 13:19:00 +04:00
. data = NULL ,
2005-04-17 02:20:36 +04:00
. maxlen = sizeof ( int ) ,
. mode = 0600 ,
2006-10-02 13:19:00 +04:00
. proc_handler = & proc_do_cad_pid ,
2005-04-17 02:20:36 +04:00
} ,
2006-10-20 10:28:34 +04:00
# endif
2005-04-17 02:20:36 +04:00
{
. ctl_name = KERN_MAX_THREADS ,
. procname = " threads-max " ,
. data = & max_threads ,
. maxlen = sizeof ( int ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec ,
} ,
{
. ctl_name = KERN_RANDOM ,
. procname = " random " ,
. mode = 0555 ,
. child = random_table ,
} ,
{
. ctl_name = KERN_OVERFLOWUID ,
. procname = " overflowuid " ,
. data = & overflowuid ,
. maxlen = sizeof ( int ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec_minmax ,
. strategy = & sysctl_intvec ,
. extra1 = & minolduid ,
. extra2 = & maxolduid ,
} ,
{
. ctl_name = KERN_OVERFLOWGID ,
. procname = " overflowgid " ,
. data = & overflowgid ,
. maxlen = sizeof ( int ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec_minmax ,
. strategy = & sysctl_intvec ,
. extra1 = & minolduid ,
. extra2 = & maxolduid ,
} ,
2006-01-06 11:19:28 +03:00
# ifdef CONFIG_S390
2005-04-17 02:20:36 +04:00
# ifdef CONFIG_MATHEMU
{
. ctl_name = KERN_IEEE_EMULATION_WARNINGS ,
. procname = " ieee_emulation_warnings " ,
. data = & sysctl_ieee_emulation_warnings ,
. maxlen = sizeof ( int ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec ,
} ,
# endif
# ifdef CONFIG_NO_IDLE_HZ
{
. ctl_name = KERN_HZ_TIMER ,
. procname = " hz_timer " ,
. data = & sysctl_hz_timer ,
. maxlen = sizeof ( int ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec ,
} ,
# endif
{
. ctl_name = KERN_S390_USER_DEBUG_LOGGING ,
. procname = " userprocess_debug " ,
. data = & sysctl_userprocess_debug ,
. maxlen = sizeof ( int ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec ,
} ,
# endif
{
. ctl_name = KERN_PIDMAX ,
. procname = " pid_max " ,
. data = & pid_max ,
. maxlen = sizeof ( int ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec_minmax ,
. strategy = sysctl_intvec ,
. extra1 = & pid_max_min ,
. extra2 = & pid_max_max ,
} ,
{
. ctl_name = KERN_PANIC_ON_OOPS ,
. procname = " panic_on_oops " ,
. data = & panic_on_oops ,
. maxlen = sizeof ( int ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec ,
} ,
{
. ctl_name = KERN_PRINTK_RATELIMIT ,
. procname = " printk_ratelimit " ,
. data = & printk_ratelimit_jiffies ,
. maxlen = sizeof ( int ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec_jiffies ,
. strategy = & sysctl_jiffies ,
} ,
{
. ctl_name = KERN_PRINTK_RATELIMIT_BURST ,
. procname = " printk_ratelimit_burst " ,
. data = & printk_ratelimit_burst ,
. maxlen = sizeof ( int ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec ,
} ,
{
. ctl_name = KERN_NGROUPS_MAX ,
. procname = " ngroups_max " ,
. data = & ngroups_max ,
. maxlen = sizeof ( int ) ,
. mode = 0444 ,
. proc_handler = & proc_dointvec ,
} ,
# if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
{
. ctl_name = KERN_UNKNOWN_NMI_PANIC ,
. procname = " unknown_nmi_panic " ,
. data = & unknown_nmi_panic ,
. maxlen = sizeof ( int ) ,
. mode = 0644 ,
2006-09-26 12:52:27 +04:00
. proc_handler = & proc_dointvec ,
2005-04-17 02:20:36 +04:00
} ,
2006-09-26 12:52:27 +04:00
{
. procname = " nmi_watchdog " ,
. data = & nmi_watchdog_enabled ,
. maxlen = sizeof ( int ) ,
. mode = 0644 ,
. proc_handler = & proc_nmi_enabled ,
2005-04-17 02:20:36 +04:00
} ,
# endif
# if defined(CONFIG_X86)
2006-09-26 12:52:27 +04:00
{
. ctl_name = KERN_PANIC_ON_NMI ,
. procname = " panic_on_unrecovered_nmi " ,
. data = & panic_on_unrecovered_nmi ,
. maxlen = sizeof ( int ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec ,
} ,
2005-04-17 02:20:36 +04:00
{
. ctl_name = KERN_BOOTLOADER_TYPE ,
. procname = " bootloader_type " ,
. data = & bootloader_type ,
. maxlen = sizeof ( int ) ,
. mode = 0444 ,
. proc_handler = & proc_dointvec ,
} ,
2006-12-07 04:14:11 +03:00
{
. ctl_name = CTL_UNNUMBERED ,
. procname = " kstack_depth_to_print " ,
. data = & kstack_depth_to_print ,
. maxlen = sizeof ( int ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec ,
} ,
2005-04-17 02:20:36 +04:00
# endif
2006-02-21 05:28:07 +03:00
# if defined(CONFIG_MMU)
2005-04-17 02:20:36 +04:00
{
. ctl_name = KERN_RANDOMIZE ,
. procname = " randomize_va_space " ,
. data = & randomize_va_space ,
. maxlen = sizeof ( int ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec ,
} ,
2006-02-21 05:28:07 +03:00
# endif
2006-01-15 00:21:00 +03:00
# if defined(CONFIG_S390) && defined(CONFIG_SMP)
2005-07-27 22:44:57 +04:00
{
. ctl_name = KERN_SPIN_RETRY ,
. procname = " spin_retry " ,
. data = & spin_retry ,
. maxlen = sizeof ( int ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec ,
} ,
2006-02-21 05:27:58 +03:00
# endif
2007-07-28 11:33:16 +04:00
# if defined(CONFIG_ACPI_SLEEP) && defined(CONFIG_X86)
2006-02-21 05:27:58 +03:00
{
. procname = " acpi_video_flags " ,
2007-07-19 12:47:41 +04:00
. data = & acpi_realmode_flags ,
2006-02-21 05:27:58 +03:00
. maxlen = sizeof ( unsigned long ) ,
. mode = 0644 ,
2006-03-02 13:54:34 +03:00
. proc_handler = & proc_doulongvec_minmax ,
2006-02-21 05:27:58 +03:00
} ,
2006-02-28 20:42:23 +03:00
# endif
# ifdef CONFIG_IA64
{
. ctl_name = KERN_IA64_UNALIGNED ,
. procname = " ignore-unaligned-usertrap " ,
. data = & no_unaligned_warning ,
. maxlen = sizeof ( int ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec ,
} ,
2006-06-26 15:56:52 +04:00
# endif
2007-10-17 10:26:09 +04:00
# ifdef CONFIG_DETECT_SOFTLOCKUP
{
. ctl_name = CTL_UNNUMBERED ,
. procname = " softlockup_thresh " ,
. data = & softlockup_thresh ,
. maxlen = sizeof ( int ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec_minmax ,
. strategy = & sysctl_intvec ,
. extra1 = & one ,
. extra2 = & sixty ,
} ,
2008-01-25 23:08:02 +03:00
{
. ctl_name = CTL_UNNUMBERED ,
. procname = " hung_task_check_count " ,
. data = & sysctl_hung_task_check_count ,
. maxlen = sizeof ( int ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec_minmax ,
. strategy = & sysctl_intvec ,
} ,
{
. ctl_name = CTL_UNNUMBERED ,
. procname = " hung_task_timeout_secs " ,
. data = & sysctl_hung_task_timeout_secs ,
. maxlen = sizeof ( int ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec_minmax ,
. strategy = & sysctl_intvec ,
} ,
{
. ctl_name = CTL_UNNUMBERED ,
. procname = " hung_task_warnings " ,
. data = & sysctl_hung_task_warnings ,
. maxlen = sizeof ( int ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec_minmax ,
. strategy = & sysctl_intvec ,
} ,
2007-10-17 10:26:09 +04:00
# endif
2006-06-26 15:56:52 +04:00
# ifdef CONFIG_COMPAT
{
. ctl_name = KERN_COMPAT_LOG ,
. procname = " compat-log " ,
. data = & compat_log ,
. maxlen = sizeof ( int ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec ,
} ,
2005-07-27 22:44:57 +04:00
# endif
2006-06-27 13:54:53 +04:00
# ifdef CONFIG_RT_MUTEXES
{
. ctl_name = KERN_MAX_LOCK_DEPTH ,
. procname = " max_lock_depth " ,
. data = & max_lock_depth ,
. maxlen = sizeof ( int ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec ,
} ,
# endif
2007-05-08 11:26:04 +04:00
# ifdef CONFIG_PROC_FS
{
. ctl_name = CTL_UNNUMBERED ,
. procname = " maps_protect " ,
. data = & maps_protect ,
. maxlen = sizeof ( int ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec ,
} ,
# endif
2007-07-18 05:37:02 +04:00
{
. ctl_name = CTL_UNNUMBERED ,
. procname = " poweroff_cmd " ,
. data = & poweroff_cmd ,
. maxlen = POWEROFF_CMD_PATH_LEN ,
. mode = 0644 ,
. proc_handler = & proc_dostring ,
. strategy = & sysctl_string ,
} ,
2007-07-19 12:50:35 +04:00
/*
* NOTE : do not add new entries to this table unless you have read
* Documentation / sysctl / ctl_unnumbered . txt
*/
2005-04-17 02:20:36 +04:00
{ . ctl_name = 0 }
} ;
2007-10-18 14:05:22 +04:00
static struct ctl_table vm_table [ ] = {
2005-04-17 02:20:36 +04:00
{
. ctl_name = VM_OVERCOMMIT_MEMORY ,
. procname = " overcommit_memory " ,
. data = & sysctl_overcommit_memory ,
. maxlen = sizeof ( sysctl_overcommit_memory ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec ,
} ,
2006-06-23 13:03:13 +04:00
{
. ctl_name = VM_PANIC_ON_OOM ,
. procname = " panic_on_oom " ,
. data = & sysctl_panic_on_oom ,
. maxlen = sizeof ( sysctl_panic_on_oom ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec ,
} ,
2007-10-17 10:25:56 +04:00
{
. ctl_name = CTL_UNNUMBERED ,
. procname = " oom_kill_allocating_task " ,
. data = & sysctl_oom_kill_allocating_task ,
. maxlen = sizeof ( sysctl_oom_kill_allocating_task ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec ,
} ,
2005-04-17 02:20:36 +04:00
{
. ctl_name = VM_OVERCOMMIT_RATIO ,
. procname = " overcommit_ratio " ,
. data = & sysctl_overcommit_ratio ,
. maxlen = sizeof ( sysctl_overcommit_ratio ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec ,
} ,
{
. ctl_name = VM_PAGE_CLUSTER ,
. procname = " page-cluster " ,
. data = & page_cluster ,
. maxlen = sizeof ( int ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec ,
} ,
{
. ctl_name = VM_DIRTY_BACKGROUND ,
. procname = " dirty_background_ratio " ,
. data = & dirty_background_ratio ,
. maxlen = sizeof ( dirty_background_ratio ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec_minmax ,
. strategy = & sysctl_intvec ,
. extra1 = & zero ,
. extra2 = & one_hundred ,
} ,
{
. ctl_name = VM_DIRTY_RATIO ,
. procname = " dirty_ratio " ,
. data = & vm_dirty_ratio ,
. maxlen = sizeof ( vm_dirty_ratio ) ,
. mode = 0644 ,
2007-10-17 10:25:50 +04:00
. proc_handler = & dirty_ratio_handler ,
2005-04-17 02:20:36 +04:00
. strategy = & sysctl_intvec ,
. extra1 = & zero ,
. extra2 = & one_hundred ,
} ,
{
. procname = " dirty_writeback_centisecs " ,
2006-03-24 14:15:48 +03:00
. data = & dirty_writeback_interval ,
. maxlen = sizeof ( dirty_writeback_interval ) ,
2005-04-17 02:20:36 +04:00
. mode = 0644 ,
. proc_handler = & dirty_writeback_centisecs_handler ,
} ,
{
. procname = " dirty_expire_centisecs " ,
2006-03-24 14:15:48 +03:00
. data = & dirty_expire_interval ,
. maxlen = sizeof ( dirty_expire_interval ) ,
2005-04-17 02:20:36 +04:00
. mode = 0644 ,
2006-03-24 14:15:48 +03:00
. proc_handler = & proc_dointvec_userhz_jiffies ,
2005-04-17 02:20:36 +04:00
} ,
{
. ctl_name = VM_NR_PDFLUSH_THREADS ,
. procname = " nr_pdflush_threads " ,
. data = & nr_pdflush_threads ,
. maxlen = sizeof nr_pdflush_threads ,
. mode = 0444 /* read-only*/ ,
. proc_handler = & proc_dointvec ,
} ,
{
. ctl_name = VM_SWAPPINESS ,
. procname = " swappiness " ,
. data = & vm_swappiness ,
. maxlen = sizeof ( vm_swappiness ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec_minmax ,
. strategy = & sysctl_intvec ,
. extra1 = & zero ,
. extra2 = & one_hundred ,
} ,
# ifdef CONFIG_HUGETLB_PAGE
{
. procname = " nr_hugepages " ,
. data = & max_huge_pages ,
. maxlen = sizeof ( unsigned long ) ,
. mode = 0644 ,
. proc_handler = & hugetlb_sysctl_handler ,
. extra1 = ( void * ) & hugetlb_zero ,
. extra2 = ( void * ) & hugetlb_infinity ,
} ,
{
. ctl_name = VM_HUGETLB_GROUP ,
. procname = " hugetlb_shm_group " ,
. data = & sysctl_hugetlb_shm_group ,
. maxlen = sizeof ( gid_t ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec ,
} ,
2007-07-17 15:03:13 +04:00
{
. ctl_name = CTL_UNNUMBERED ,
. procname = " hugepages_treat_as_movable " ,
. data = & hugepages_treat_as_movable ,
. maxlen = sizeof ( int ) ,
. mode = 0644 ,
. proc_handler = & hugetlb_treat_movable_handler ,
} ,
hugetlb: introduce nr_overcommit_hugepages sysctl
hugetlb: introduce nr_overcommit_hugepages sysctl
While examining the code to support /proc/sys/vm/hugetlb_dynamic_pool, I
became convinced that having a boolean sysctl was insufficient:
1) To support per-node control of hugepages, I have previously submitted
patches to add a sysfs attribute related to nr_hugepages. However, with
a boolean global value and per-mount quota enforcement constraining the
dynamic pool, adding corresponding control of the dynamic pool on a
per-node basis seems inconsistent to me.
2) Administration of the hugetlb dynamic pool with multiple hugetlbfs
mount points is, arguably, more arduous than it needs to be. Each quota
would need to be set separately, and the sum would need to be monitored.
To ease the administration, and to help make the way for per-node
control of the static & dynamic hugepage pool, I added a separate
sysctl, nr_overcommit_hugepages. This value serves as a high watermark
for the overall hugepage pool, while nr_hugepages serves as a low
watermark. The boolean sysctl can then be removed, as the condition
nr_overcommit_hugepages > 0
indicates the same administrative setting as
hugetlb_dynamic_pool == 1
Quotas still serve as local enforcement of the size of the pool on a
per-mount basis.
A few caveats:
1) There is a race whereby the global surplus huge page counter is
incremented before a hugepage has allocated. Another process could then
try grow the pool, and fail to convert a surplus huge page to a normal
huge page and instead allocate a fresh huge page. I believe this is
benign, as no memory is leaked (the actual pages are still tracked
correctly) and the counters won't go out of sync.
2) Shrinking the static pool while a surplus is in effect will allow the
number of surplus huge pages to exceed the overcommit value. As long as
this condition holds, however, no more surplus huge pages will be
allowed on the system until one of the two sysctls are increased
sufficiently, or the surplus huge pages go out of use and are freed.
Successfully tested on x86_64 with the current libhugetlbfs snapshot,
modified to use the new sysctl.
Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>
Acked-by: Adam Litke <agl@us.ibm.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-12-18 03:20:12 +03:00
{
. ctl_name = CTL_UNNUMBERED ,
. procname = " nr_overcommit_hugepages " ,
. data = & nr_overcommit_huge_pages ,
. maxlen = sizeof ( nr_overcommit_huge_pages ) ,
. mode = 0644 ,
. proc_handler = & proc_doulongvec_minmax ,
} ,
2005-04-17 02:20:36 +04:00
# endif
{
. ctl_name = VM_LOWMEM_RESERVE_RATIO ,
. procname = " lowmem_reserve_ratio " ,
. data = & sysctl_lowmem_reserve_ratio ,
. maxlen = sizeof ( sysctl_lowmem_reserve_ratio ) ,
. mode = 0644 ,
. proc_handler = & lowmem_reserve_ratio_sysctl_handler ,
. strategy = & sysctl_intvec ,
} ,
2006-01-08 12:00:39 +03:00
{
. ctl_name = VM_DROP_PAGECACHE ,
. procname = " drop_caches " ,
. data = & sysctl_drop_caches ,
. maxlen = sizeof ( int ) ,
. mode = 0644 ,
. proc_handler = drop_caches_sysctl_handler ,
. strategy = & sysctl_intvec ,
} ,
2005-04-17 02:20:36 +04:00
{
. ctl_name = VM_MIN_FREE_KBYTES ,
. procname = " min_free_kbytes " ,
. data = & min_free_kbytes ,
. maxlen = sizeof ( min_free_kbytes ) ,
. mode = 0644 ,
. proc_handler = & min_free_kbytes_sysctl_handler ,
. strategy = & sysctl_intvec ,
. extra1 = & zero ,
} ,
2006-01-08 12:00:40 +03:00
{
. ctl_name = VM_PERCPU_PAGELIST_FRACTION ,
. procname = " percpu_pagelist_fraction " ,
. data = & percpu_pagelist_fraction ,
. maxlen = sizeof ( percpu_pagelist_fraction ) ,
. mode = 0644 ,
. proc_handler = & percpu_pagelist_fraction_sysctl_handler ,
. strategy = & sysctl_intvec ,
. extra1 = & min_percpu_pagelist_fract ,
} ,
2005-04-17 02:20:36 +04:00
# ifdef CONFIG_MMU
{
. ctl_name = VM_MAX_MAP_COUNT ,
. procname = " max_map_count " ,
. data = & sysctl_max_map_count ,
. maxlen = sizeof ( sysctl_max_map_count ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec
} ,
# endif
{
. ctl_name = VM_LAPTOP_MODE ,
. procname = " laptop_mode " ,
. data = & laptop_mode ,
. maxlen = sizeof ( laptop_mode ) ,
. mode = 0644 ,
2006-03-24 14:15:49 +03:00
. proc_handler = & proc_dointvec_jiffies ,
. strategy = & sysctl_jiffies ,
2005-04-17 02:20:36 +04:00
} ,
{
. ctl_name = VM_BLOCK_DUMP ,
. procname = " block_dump " ,
. data = & block_dump ,
. maxlen = sizeof ( block_dump ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec ,
. strategy = & sysctl_intvec ,
. extra1 = & zero ,
} ,
{
. ctl_name = VM_VFS_CACHE_PRESSURE ,
. procname = " vfs_cache_pressure " ,
. data = & sysctl_vfs_cache_pressure ,
. maxlen = sizeof ( sysctl_vfs_cache_pressure ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec ,
. strategy = & sysctl_intvec ,
. extra1 = & zero ,
} ,
# ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
{
. ctl_name = VM_LEGACY_VA_LAYOUT ,
. procname = " legacy_va_layout " ,
. data = & sysctl_legacy_va_layout ,
. maxlen = sizeof ( sysctl_legacy_va_layout ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec ,
. strategy = & sysctl_intvec ,
. extra1 = & zero ,
} ,
# endif
2006-01-19 04:42:32 +03:00
# ifdef CONFIG_NUMA
{
. ctl_name = VM_ZONE_RECLAIM_MODE ,
. procname = " zone_reclaim_mode " ,
. data = & zone_reclaim_mode ,
. maxlen = sizeof ( zone_reclaim_mode ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec ,
2006-02-01 14:05:29 +03:00
. strategy = & sysctl_intvec ,
. extra1 = & zero ,
2006-01-19 04:42:32 +03:00
} ,
2006-07-03 11:24:13 +04:00
{
. ctl_name = VM_MIN_UNMAPPED ,
. procname = " min_unmapped_ratio " ,
. data = & sysctl_min_unmapped_ratio ,
. maxlen = sizeof ( sysctl_min_unmapped_ratio ) ,
. mode = 0644 ,
. proc_handler = & sysctl_min_unmapped_ratio_sysctl_handler ,
. strategy = & sysctl_intvec ,
. extra1 = & zero ,
. extra2 = & one_hundred ,
} ,
2006-09-26 10:31:52 +04:00
{
. ctl_name = VM_MIN_SLAB ,
. procname = " min_slab_ratio " ,
. data = & sysctl_min_slab_ratio ,
. maxlen = sizeof ( sysctl_min_slab_ratio ) ,
. mode = 0644 ,
. proc_handler = & sysctl_min_slab_ratio_sysctl_handler ,
. strategy = & sysctl_intvec ,
. extra1 = & zero ,
. extra2 = & one_hundred ,
} ,
[PATCH] vdso: randomize the i386 vDSO by moving it into a vma
Move the i386 VDSO down into a vma and thus randomize it.
Besides the security implications, this feature also helps debuggers, which
can COW a vma-backed VDSO just like a normal DSO and can thus do
single-stepping and other debugging features.
It's good for hypervisors (Xen, VMWare) too, which typically live in the same
high-mapped address space as the VDSO, hence whenever the VDSO is used, they
get lots of guest pagefaults and have to fix such guest accesses up - which
slows things down instead of speeding things up (the primary purpose of the
VDSO).
There's a new CONFIG_COMPAT_VDSO (default=y) option, which provides support
for older glibcs that still rely on a prelinked high-mapped VDSO. Newer
distributions (using glibc 2.3.3 or later) can turn this option off. Turning
it off is also recommended for security reasons: attackers cannot use the
predictable high-mapped VDSO page as syscall trampoline anymore.
There is a new vdso=[0|1] boot option as well, and a runtime
/proc/sys/vm/vdso_enabled sysctl switch, that allows the VDSO to be turned
on/off.
(This version of the VDSO-randomization patch also has working ELF
coredumping, the previous patch crashed in the coredumping code.)
This code is a combined work of the exec-shield VDSO randomization
code and Gerd Hoffmann's hypervisor-centric VDSO patch. Rusty Russell
started this patch and i completed it.
[akpm@osdl.org: cleanups]
[akpm@osdl.org: compile fix]
[akpm@osdl.org: compile fix 2]
[akpm@osdl.org: compile fix 3]
[akpm@osdl.org: revernt MAXMEM change]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Arjan van de Ven <arjan@infradead.org>
Cc: Gerd Hoffmann <kraxel@suse.de>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Zachary Amsden <zach@vmware.com>
Cc: Andi Kleen <ak@muc.de>
Cc: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 13:53:50 +04:00
# endif
2007-05-09 13:35:13 +04:00
# ifdef CONFIG_SMP
{
. ctl_name = CTL_UNNUMBERED ,
. procname = " stat_interval " ,
. data = & sysctl_stat_interval ,
. maxlen = sizeof ( sysctl_stat_interval ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec_jiffies ,
. strategy = & sysctl_jiffies ,
} ,
# endif
2007-06-28 23:55:21 +04:00
# ifdef CONFIG_SECURITY
{
. ctl_name = CTL_UNNUMBERED ,
. procname = " mmap_min_addr " ,
. data = & mmap_min_addr ,
. maxlen = sizeof ( unsigned long ) ,
. mode = 0644 ,
. proc_handler = & proc_doulongvec_minmax ,
} ,
2007-08-11 00:00:51 +04:00
# endif
2007-07-16 10:38:01 +04:00
# ifdef CONFIG_NUMA
{
. ctl_name = CTL_UNNUMBERED ,
. procname = " numa_zonelist_order " ,
. data = & numa_zonelist_order ,
. maxlen = NUMA_ZONELIST_ORDER_LEN ,
. mode = 0644 ,
. proc_handler = & numa_zonelist_order_handler ,
. strategy = & sysctl_string ,
} ,
# endif
2007-10-13 11:16:04 +04:00
# if (defined(CONFIG_X86_32) && !defined(CONFIG_UML))|| \
2007-03-01 04:07:42 +03:00
( defined ( CONFIG_SUPERH ) & & defined ( CONFIG_VSYSCALL ) )
[PATCH] vdso: randomize the i386 vDSO by moving it into a vma
Move the i386 VDSO down into a vma and thus randomize it.
Besides the security implications, this feature also helps debuggers, which
can COW a vma-backed VDSO just like a normal DSO and can thus do
single-stepping and other debugging features.
It's good for hypervisors (Xen, VMWare) too, which typically live in the same
high-mapped address space as the VDSO, hence whenever the VDSO is used, they
get lots of guest pagefaults and have to fix such guest accesses up - which
slows things down instead of speeding things up (the primary purpose of the
VDSO).
There's a new CONFIG_COMPAT_VDSO (default=y) option, which provides support
for older glibcs that still rely on a prelinked high-mapped VDSO. Newer
distributions (using glibc 2.3.3 or later) can turn this option off. Turning
it off is also recommended for security reasons: attackers cannot use the
predictable high-mapped VDSO page as syscall trampoline anymore.
There is a new vdso=[0|1] boot option as well, and a runtime
/proc/sys/vm/vdso_enabled sysctl switch, that allows the VDSO to be turned
on/off.
(This version of the VDSO-randomization patch also has working ELF
coredumping, the previous patch crashed in the coredumping code.)
This code is a combined work of the exec-shield VDSO randomization
code and Gerd Hoffmann's hypervisor-centric VDSO patch. Rusty Russell
started this patch and i completed it.
[akpm@osdl.org: cleanups]
[akpm@osdl.org: compile fix]
[akpm@osdl.org: compile fix 2]
[akpm@osdl.org: compile fix 3]
[akpm@osdl.org: revernt MAXMEM change]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Arjan van de Ven <arjan@infradead.org>
Cc: Gerd Hoffmann <kraxel@suse.de>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Zachary Amsden <zach@vmware.com>
Cc: Andi Kleen <ak@muc.de>
Cc: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 13:53:50 +04:00
{
. ctl_name = VM_VDSO_ENABLED ,
. procname = " vdso_enabled " ,
. data = & vdso_enabled ,
. maxlen = sizeof ( vdso_enabled ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec ,
. strategy = & sysctl_intvec ,
. extra1 = & zero ,
} ,
2005-04-17 02:20:36 +04:00
# endif
2007-07-16 10:41:21 +04:00
/*
* NOTE : do not add new entries to this table unless you have read
* Documentation / sysctl / ctl_unnumbered . txt
*/
2005-04-17 02:20:36 +04:00
{ . ctl_name = 0 }
} ;
2007-02-14 11:34:07 +03:00
#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE)
/*
 * Child table of the " binfmt_misc " directory entry in fs_table.  It holds
 * nothing but the terminator, so the directory it describes has no sysctl
 * files of its own.
 * NOTE(review): presumably the empty directory exists only so something else
 * can be mounted on it -- confirm against the binfmt_misc code.
 */
static struct ctl_table binfmt_misc_table[] = {
	{ .ctl_name = 0 }	/* terminator */
};
#endif
2007-10-18 14:05:22 +04:00
static struct ctl_table fs_table [ ] = {
2005-04-17 02:20:36 +04:00
{
. ctl_name = FS_NRINODE ,
. procname = " inode-nr " ,
. data = & inodes_stat ,
. maxlen = 2 * sizeof ( int ) ,
. mode = 0444 ,
. proc_handler = & proc_dointvec ,
} ,
{
. ctl_name = FS_STATINODE ,
. procname = " inode-state " ,
. data = & inodes_stat ,
. maxlen = 7 * sizeof ( int ) ,
. mode = 0444 ,
. proc_handler = & proc_dointvec ,
} ,
{
. procname = " file-nr " ,
. data = & files_stat ,
. maxlen = 3 * sizeof ( int ) ,
. mode = 0444 ,
2006-03-08 08:55:35 +03:00
. proc_handler = & proc_nr_files ,
2005-04-17 02:20:36 +04:00
} ,
{
. ctl_name = FS_MAXFILE ,
. procname = " file-max " ,
. data = & files_stat . max_files ,
. maxlen = sizeof ( int ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec ,
} ,
{
. ctl_name = FS_DENTRY ,
. procname = " dentry-state " ,
. data = & dentry_stat ,
. maxlen = 6 * sizeof ( int ) ,
. mode = 0444 ,
. proc_handler = & proc_dointvec ,
} ,
{
. ctl_name = FS_OVERFLOWUID ,
. procname = " overflowuid " ,
. data = & fs_overflowuid ,
. maxlen = sizeof ( int ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec_minmax ,
. strategy = & sysctl_intvec ,
. extra1 = & minolduid ,
. extra2 = & maxolduid ,
} ,
{
. ctl_name = FS_OVERFLOWGID ,
. procname = " overflowgid " ,
. data = & fs_overflowgid ,
. maxlen = sizeof ( int ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec_minmax ,
. strategy = & sysctl_intvec ,
. extra1 = & minolduid ,
. extra2 = & maxolduid ,
} ,
{
. ctl_name = FS_LEASES ,
. procname = " leases-enable " ,
. data = & leases_enable ,
. maxlen = sizeof ( int ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec ,
} ,
# ifdef CONFIG_DNOTIFY
{
. ctl_name = FS_DIR_NOTIFY ,
. procname = " dir-notify-enable " ,
. data = & dir_notify_enable ,
. maxlen = sizeof ( int ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec ,
} ,
# endif
# ifdef CONFIG_MMU
{
. ctl_name = FS_LEASE_TIME ,
. procname = " lease-break-time " ,
. data = & lease_break_time ,
. maxlen = sizeof ( int ) ,
. mode = 0644 ,
2007-07-19 12:48:26 +04:00
. proc_handler = & proc_dointvec_minmax ,
. strategy = & sysctl_intvec ,
. extra1 = & zero ,
. extra2 = & two ,
2005-04-17 02:20:36 +04:00
} ,
{
. procname = " aio-nr " ,
. data = & aio_nr ,
. maxlen = sizeof ( aio_nr ) ,
. mode = 0444 ,
2005-11-07 11:59:31 +03:00
. proc_handler = & proc_doulongvec_minmax ,
2005-04-17 02:20:36 +04:00
} ,
{
. procname = " aio-max-nr " ,
. data = & aio_max_nr ,
. maxlen = sizeof ( aio_max_nr ) ,
. mode = 0644 ,
2005-11-07 11:59:31 +03:00
. proc_handler = & proc_doulongvec_minmax ,
2005-04-17 02:20:36 +04:00
} ,
2006-06-02 00:10:59 +04:00
# ifdef CONFIG_INOTIFY_USER
2005-07-13 20:38:18 +04:00
{
. ctl_name = FS_INOTIFY ,
. procname = " inotify " ,
. mode = 0555 ,
. child = inotify_table ,
} ,
# endif
2005-04-17 02:20:36 +04:00
# endif
2005-06-23 11:09:43 +04:00
{
. ctl_name = KERN_SETUID_DUMPABLE ,
. procname = " suid_dumpable " ,
. data = & suid_dumpable ,
. maxlen = sizeof ( int ) ,
. mode = 0644 ,
. proc_handler = & proc_dointvec ,
} ,
2007-02-14 11:34:07 +03:00
# if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE)
{
. ctl_name = CTL_UNNUMBERED ,
. procname = " binfmt_misc " ,
. mode = 0555 ,
. child = binfmt_misc_table ,
} ,
# endif
2007-07-16 10:41:21 +04:00
/*
* NOTE : do not add new entries to this table unless you have read
* Documentation / sysctl / ctl_unnumbered . txt
*/
2005-04-17 02:20:36 +04:00
{ . ctl_name = 0 }
} ;
2007-10-18 14:05:22 +04:00
/*
 * Debugging tunables exported through sysctl.  The only entry is compiled in
 * on x86 and PowerPC; everywhere else the table is just the terminator.
 */
static struct ctl_table debug_table[] = {
#if defined(CONFIG_X86) || defined(CONFIG_PPC)
	{
		/* integer switch backed by show_unhandled_signals */
		.ctl_name	= CTL_UNNUMBERED,
		.procname	= " exception-trace ",
		.data		= &show_unhandled_signals,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
#endif
	{ .ctl_name = 0 }	/* terminator */
};
2007-10-18 14:05:22 +04:00
/*
 * Device sysctl table.  It contains only the terminator entry here.
 * NOTE(review): presumably other code registers its own tables under the
 * same directory -- confirm against the users of register_sysctl_table().
 */
static struct ctl_table dev_table[] = {
	{ .ctl_name = 0 }	/* terminator */
};
2005-04-17 02:20:36 +04:00
2005-11-04 13:18:40 +03:00
static DEFINE_SPINLOCK ( sysctl_lock ) ;
/* called under sysctl_lock */
static int use_table ( struct ctl_table_header * p )
{
if ( unlikely ( p - > unregistering ) )
return 0 ;
p - > used + + ;
return 1 ;
}
/* called under sysctl_lock */
static void unuse_table ( struct ctl_table_header * p )
{
if ( ! - - p - > used )
if ( unlikely ( p - > unregistering ) )
complete ( p - > unregistering ) ;
}
/* called under sysctl_lock, will reacquire if has to wait */
static void start_unregistering ( struct ctl_table_header * p )
{
/*
* if p - > used is 0 , nobody will ever touch that entry again ;
* we ' ll eliminate all paths to it before dropping sysctl_lock
*/
if ( unlikely ( p - > used ) ) {
struct completion wait ;
init_completion ( & wait ) ;
p - > unregistering = & wait ;
spin_unlock ( & sysctl_lock ) ;
wait_for_completion ( & wait ) ;
spin_lock ( & sysctl_lock ) ;
}
/*
* do not remove from the list until nobody holds it ; walking the
* list in do_sysctl ( ) relies on that .
*/
list_del_init ( & p - > ctl_entry ) ;
}
2007-02-14 11:34:11 +03:00
/*
 * Release a header obtained from sysctl_head_next() when the caller stops
 * iterating early.  A NULL @head is accepted and ignored.
 */
void sysctl_head_finish(struct ctl_table_header *head)
{
	if (head) {
		spin_lock(&sysctl_lock);
		unuse_table(head);
		spin_unlock(&sysctl_lock);
	}
}
struct ctl_table_header * sysctl_head_next ( struct ctl_table_header * prev )
{
struct ctl_table_header * head ;
struct list_head * tmp ;
spin_lock ( & sysctl_lock ) ;
if ( prev ) {
tmp = & prev - > ctl_entry ;
unuse_table ( prev ) ;
goto next ;
}
tmp = & root_table_header . ctl_entry ;
for ( ; ; ) {
head = list_entry ( tmp , struct ctl_table_header , ctl_entry ) ;
if ( ! use_table ( head ) )
goto next ;
spin_unlock ( & sysctl_lock ) ;
return head ;
next :
tmp = tmp - > next ;
if ( tmp = = & root_table_header . ctl_entry )
break ;
}
spin_unlock ( & sysctl_lock ) ;
return NULL ;
}
2006-09-27 12:51:04 +04:00
# ifdef CONFIG_SYSCTL_SYSCALL
2005-04-17 02:20:36 +04:00
/*
 * Resolve a binary sysctl name and perform the requested read and/or write.
 *
 * @name/@nlen:      user-space array of name components and its length
 * @oldval/@oldlenp: user buffer (and its length word) for the current value
 * @newval/@newlen:  user buffer holding the value to store
 *
 * Every registered table is tried in turn until one yields something other
 * than -ENOTDIR; that result is returned.
 *
 * Returns -ENOTDIR when @nlen is not in 1..CTL_MAXNAME-1 or no table knows
 * the name, and -EFAULT when @oldval is given but @oldlenp is NULL or cannot
 * be read.
 */
int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp,
	       void __user *newval, size_t newlen)
{
	struct ctl_table_header *head;
	int error = -ENOTDIR;

	if (nlen <= 0 || nlen >= CTL_MAXNAME)
		return -ENOTDIR;

	if (oldval) {
		int probe;	/* value unused; only checks *oldlenp is readable */

		if (!oldlenp || get_user(probe, oldlenp))
			return -EFAULT;
	}

	head = sysctl_head_next(NULL);
	while (head) {
		error = parse_table(name, nlen, oldval, oldlenp,
				    newval, newlen, head->ctl_table);
		if (error != -ENOTDIR) {
			/* Leaving the walk early: drop our reference. */
			sysctl_head_finish(head);
			return error;
		}
		head = sysctl_head_next(head);
	}

	return error;
}
/*
 * sysctl(2) entry point: copy the argument block in from user space, emit
 * the deprecation warning, then run do_sysctl() under the big kernel lock.
 *
 * Returns -EFAULT if @args cannot be copied, the non-zero result of
 * deprecated_sysctl_warning() if there is one, otherwise the result of
 * do_sysctl().
 */
asmlinkage long sys_sysctl(struct __sysctl_args __user *args)
{
	struct __sysctl_args tmp;
	int error;

	if (copy_from_user(&tmp, args, sizeof(tmp)))
		return -EFAULT;

	error = deprecated_sysctl_warning(&tmp);
	if (!error) {
		lock_kernel();
		error = do_sysctl(tmp.name, tmp.nlen, tmp.oldval, tmp.oldlenp,
				  tmp.newval, tmp.newlen);
		unlock_kernel();
	}

	return error;
}
2006-09-27 12:51:04 +04:00
# endif /* CONFIG_SYSCTL_SYSCALL */
2005-04-17 02:20:36 +04:00
/*
 * Check the access bits in @op against a sysctl entry's @mode.
 *
 * The superuser gets no automatic pass (sysctl_perm relies on that), because
 * some sysctl variables are read-only even to root.  Instead the caller is
 * classified like a file access: effective uid 0 is checked against the
 * owner bits, membership of group 0 against the group bits, everybody else
 * against the "other" bits.
 *
 * Returns 0 if every bit in @op is granted, -EACCES otherwise.
 */
static int test_perm(int mode, int op)
{
	if (current->euid == 0)
		mode >>= 6;
	else if (in_egroup_p(0))
		mode >>= 3;

	return ((mode & op & 0007) == op) ? 0 : -EACCES;
}
2007-10-18 14:05:22 +04:00
int sysctl_perm ( struct ctl_table * table , int op )
2005-04-17 02:20:36 +04:00
{
int error ;
error = security_sysctl ( table , op ) ;
if ( error )
return error ;
return test_perm ( table - > mode , op ) ;
}
2006-09-27 12:51:04 +04:00
# ifdef CONFIG_SYSCTL_SYSCALL
2005-04-17 02:20:36 +04:00
/*
 * Walk the binary name @name (with @nlen components left) down from @table.
 *
 * At each level the next component is fetched from user space and looked up
 * among the entries that carry a binary number; entries whose ctl_name is 0
 * are never matched.  A matching directory needs 001 permission and the walk
 * descends into its child table; a matching leaf is handed to
 * do_sysctl_strategy() together with the remaining name.
 *
 * Returns -ENOTDIR if the name runs out or a component is not found, -EFAULT
 * if a component cannot be read, -EPERM if a directory may not be entered,
 * otherwise the result of do_sysctl_strategy().
 */
static int parse_table(int __user *name, int nlen,
		       void __user *oldval, size_t __user *oldlenp,
		       void __user *newval, size_t newlen,
		       struct ctl_table *table)
{
	while (nlen) {
		struct ctl_table *match = NULL;
		int n;

		if (get_user(n, name))
			return -EFAULT;

		/* A table ends at the entry with neither number nor name. */
		for (; table->ctl_name || table->procname; table++) {
			if (table->ctl_name && table->ctl_name == n) {
				match = table;
				break;
			}
		}
		if (!match)
			return -ENOTDIR;

		if (!match->child)
			return do_sysctl_strategy(match, name, nlen,
						  oldval, oldlenp,
						  newval, newlen);

		if (sysctl_perm(match, 001))
			return -EPERM;

		/* Consume this component and descend one level. */
		name++;
		nlen--;
		table = match->child;
	}

	return -ENOTDIR;
}
/* Perform the actual read/write of a sysctl table entry. */
2007-10-18 14:05:22 +04:00
int do_sysctl_strategy ( struct ctl_table * table ,
2005-04-17 02:20:36 +04:00
int __user * name , int nlen ,
void __user * oldval , size_t __user * oldlenp ,
2006-12-10 13:19:10 +03:00
void __user * newval , size_t newlen )
2005-04-17 02:20:36 +04:00
{
int op = 0 , rc ;
if ( oldval )
op | = 004 ;
if ( newval )
op | = 002 ;
2007-02-14 11:34:11 +03:00
if ( sysctl_perm ( table , op ) )
2005-04-17 02:20:36 +04:00
return - EPERM ;
if ( table - > strategy ) {
rc = table - > strategy ( table , name , nlen , oldval , oldlenp ,
2006-12-10 13:19:10 +03:00
newval , newlen ) ;
2005-04-17 02:20:36 +04:00
if ( rc < 0 )
return rc ;
if ( rc > 0 )
return 0 ;
}
/* If there is no strategy routine, or if the strategy returns
* zero , proceed with automatic r / w */
if ( table - > data & & table - > maxlen ) {
2007-10-18 14:05:23 +04:00
rc = sysctl_data ( table , name , nlen , oldval , oldlenp ,
newval , newlen ) ;
if ( rc < 0 )
return rc ;
2005-04-17 02:20:36 +04:00
}
return 0 ;
}
2006-09-27 12:51:04 +04:00
# endif /* CONFIG_SYSCTL_SYSCALL */
2005-04-17 02:20:36 +04:00
2007-02-14 11:34:13 +03:00
static void sysctl_set_parent ( struct ctl_table * parent , struct ctl_table * table )
{
for ( ; table - > ctl_name | | table - > procname ; table + + ) {
table - > parent = parent ;
if ( table - > child )
sysctl_set_parent ( table , table - > child ) ;
}
}
static __init int sysctl_init ( void )
{
2007-10-18 14:05:54 +04:00
int err ;
2007-02-14 11:34:13 +03:00
sysctl_set_parent ( NULL , root_table ) ;
2007-10-18 14:05:54 +04:00
err = sysctl_check_table ( root_table ) ;
2007-02-14 11:34:13 +03:00
return 0 ;
}
core_initcall ( sysctl_init ) ;
2005-04-17 02:20:36 +04:00
/**
* register_sysctl_table - register a sysctl hierarchy
* @ table : the top - level table structure
*
* Register a sysctl table hierarchy . @ table should be a filled in ctl_table
* array . An entry with a ctl_name of 0 and a % NULL procname terminates the table .
*
2007-10-18 14:05:22 +04:00
* The members of the & struct ctl_table structure are used as follows :
2005-04-17 02:20:36 +04:00
*
* ctl_name - This is the numeric sysctl value used by sysctl ( 2 ) . The number
* must be unique within that level of sysctl
*
* procname - the name of the sysctl file under / proc / sys . Set to % NULL to not
* enter a sysctl file
*
* data - a pointer to data for use by proc_handler
*
* maxlen - the maximum size in bytes of the data
*
* mode - the file permissions for the / proc / sys file , and for sysctl ( 2 )
*
* child - a pointer to the child sysctl table if this entry is a directory , or
* % NULL .
*
* proc_handler - the text handler routine ( described below )
*
* strategy - the strategy routine ( described below )
*
* de - for internal use by the sysctl routines
*
* extra1 , extra2 - extra pointers usable by the proc handler routines
*
* Leaf nodes in the sysctl tree will be represented by a single file
* under / proc ; non - leaf nodes will be represented by directories .
*
* sysctl ( 2 ) can automatically manage read and write requests through
* the sysctl table . The data and maxlen fields of the ctl_table
* struct enable minimal validation of the values being written to be
* performed , and the mode field allows minimal authentication .
*
* More sophisticated management can be enabled by the provision of a
* strategy routine with the table entry . This will be called before
* any automatic read or write of the data is performed .
*
* The strategy routine may return
*
* < 0 - Error occurred ( error is passed to user process )
*
* 0 - OK - proceed with automatic read or write .
*
* > 0 - OK - read or write has been done by the strategy routine , so
* return immediately .
*
* There must be a proc_handler routine for any terminal nodes
* mirrored under / proc / sys ( non - terminals are handled by a built - in
* directory handler ) . Several default handlers are available to
* cover common cases -
*
* proc_dostring ( ) , proc_dointvec ( ) , proc_dointvec_jiffies ( ) ,
* proc_dointvec_userhz_jiffies ( ) , proc_dointvec_minmax ( ) ,
* proc_doulongvec_ms_jiffies_minmax ( ) , proc_doulongvec_minmax ( )
*
* It is the handler ' s job to read the input buffer from user memory
* and process it . The handler should return 0 on success .
*
* This routine returns % NULL on a failure to register , and a pointer
* to the table header on success .
*/
2007-10-18 14:05:22 +04:00
struct ctl_table_header * register_sysctl_table ( struct ctl_table * table )
2005-04-17 02:20:36 +04:00
{
struct ctl_table_header * tmp ;
tmp = kmalloc ( sizeof ( struct ctl_table_header ) , GFP_KERNEL ) ;
if ( ! tmp )
return NULL ;
tmp - > ctl_table = table ;
INIT_LIST_HEAD ( & tmp - > ctl_entry ) ;
2005-11-04 13:18:40 +03:00
tmp - > used = 0 ;
tmp - > unregistering = NULL ;
2007-02-14 11:34:13 +03:00
sysctl_set_parent ( NULL , table ) ;
2007-10-18 14:05:54 +04:00
if ( sysctl_check_table ( tmp - > ctl_table ) ) {
kfree ( tmp ) ;
return NULL ;
}
2005-11-04 13:18:40 +03:00
spin_lock ( & sysctl_lock ) ;
2007-02-14 11:34:09 +03:00
list_add_tail ( & tmp - > ctl_entry , & root_table_header . ctl_entry ) ;
2005-11-04 13:18:40 +03:00
spin_unlock ( & sysctl_lock ) ;
2005-04-17 02:20:36 +04:00
return tmp ;
}
/**
* unregister_sysctl_table - unregister a sysctl table hierarchy
* @ header : the header returned from register_sysctl_table
*
* Unregisters the sysctl table and all children . proc entries may not
* actually be removed until they are no longer used by anyone .
*/
void unregister_sysctl_table(struct ctl_table_header *header)
{
	might_sleep();

	/* A NULL header is accepted and ignored. */
	if (!header)
		return;

	spin_lock(&sysctl_lock);
	start_unregistering(header);
	spin_unlock(&sysctl_lock);

	kfree(header);
}
2006-09-27 12:51:04 +04:00
# else /* !CONFIG_SYSCTL */
2007-10-18 14:05:22 +04:00
/* Stub for builds without CONFIG_SYSCTL: nothing is registered. */
struct ctl_table_header *register_sysctl_table(struct ctl_table *table)
{
	return NULL;
}
/* Stub for builds without CONFIG_SYSCTL: there is nothing to undo. */
void unregister_sysctl_table(struct ctl_table_header *table)
{
}
# endif /* CONFIG_SYSCTL */
2005-04-17 02:20:36 +04:00
/*
* / proc / sys support
*/
2006-09-27 12:51:04 +04:00
# ifdef CONFIG_PROC_SYSCTL
2005-04-17 02:20:36 +04:00
2006-10-02 13:18:05 +04:00
/*
 * Copy a string between the kernel buffer @data (capacity @maxlen bytes)
 * and the user buffer @buffer.
 *
 * Write: the user data is scanned up to the first NUL or newline (or
 * *@lenp bytes), truncated to @maxlen - 1 bytes, copied into @data and
 * NUL-terminated; *@ppos advances by the full *@lenp.
 *
 * Read: the string is copied out starting at offset *@ppos, limited to
 * *@lenp bytes, and a newline is appended when there is room; *@lenp and
 * *@ppos are updated with the number of bytes produced.
 *
 * Returns 0 on success or -EFAULT when a user access fails.
 */
static int _proc_do_string(void *data, int maxlen, int write,
			   struct file *filp, void __user *buffer,
			   size_t *lenp, loff_t *ppos)
{
	char *kbuf = data;
	size_t count;

	if (!data || !maxlen || !*lenp) {
		*lenp = 0;
		return 0;
	}

	if (write) {
		char __user *up = buffer;
		char ch;

		/* Measure the input up to the first terminator. */
		for (count = 0; count < *lenp; count++) {
			if (get_user(ch, up++))
				return -EFAULT;
			if (ch == 0 || ch == '\n')
				break;
		}

		/* Leave room for the terminating NUL. */
		if (count >= maxlen)
			count = maxlen - 1;

		if (copy_from_user(kbuf, buffer, count))
			return -EFAULT;
		kbuf[count] = 0;

		*ppos += *lenp;
		return 0;
	}

	count = strlen(kbuf);
	if (count > maxlen)
		count = maxlen;

	/* Reading past the end of the string yields nothing. */
	if (*ppos > count) {
		*lenp = 0;
		return 0;
	}

	kbuf += *ppos;
	count -= *ppos;
	if (count > *lenp)
		count = *lenp;

	if (count && copy_to_user(buffer, kbuf, count))
		return -EFAULT;

	if (count < *lenp) {
		if (put_user('\n', ((char __user *) buffer) + count))
			return -EFAULT;
		count++;
	}

	*lenp = count;
	*ppos += count;
	return 0;
}
2006-10-02 13:18:04 +04:00
/**
* proc_dostring - read a string sysctl
* @ table : the sysctl table
* @ write : % TRUE if this is a write to the sysctl file
* @ filp : the file structure
* @ buffer : the user buffer
* @ lenp : the size of the user buffer
* @ ppos : file position
*
* Reads / writes a string from / to the user buffer . If the kernel
* buffer provided is not large enough to hold the string , the
* string is truncated . The copied string is NUL - terminated .
* If the string is being read by the user process , it is copied
* and a newline ' \n ' is added . It is truncated if the buffer is
* not large enough .
*
* Returns 0 on success .
*/
2007-10-18 14:05:22 +04:00
int proc_dostring ( struct ctl_table * table , int write , struct file * filp ,
2006-10-02 13:18:04 +04:00
void __user * buffer , size_t * lenp , loff_t * ppos )
{
return _proc_do_string ( table - > data , table - > maxlen , write , filp ,
buffer , lenp , ppos ) ;
}
2005-04-17 02:20:36 +04:00
/*
 * Default conversion between the parsed (sign, magnitude) pair and the
 * stored int.
 *
 * @negp:  sign flag; read on write, set on read (-1 for negative, else 0)
 * @lvalp: magnitude; read on write, set on read
 * @valp:  the stored integer
 * @write: non-zero to store into *@valp, zero to load from it
 * @data:  unused
 *
 * Always returns 0.
 */
static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp,
				 int *valp,
				 int write, void *data)
{
	if (write) {
		*valp = *negp ? -*lvalp : *lvalp;
	} else {
		int val = *valp;

		if (val < 0) {
			*negp = -1;
			/*
			 * Negate in unsigned arithmetic: -val on the int
			 * would overflow when val is INT_MIN.
			 */
			*lvalp = -(unsigned long)val;
		} else {
			*negp = 0;
			*lvalp = (unsigned long)val;
		}
	}
	return 0;
}
2007-10-18 14:05:22 +04:00
/*
 * Read or write a vector of ints stored at @tbl_data as ASCII text in the
 * user buffer.
 *
 * @tbl_data: kernel int array; table->maxlen / sizeof(int) elements
 * @table:    sysctl entry, used here only for its maxlen
 * @write:    non-zero to parse user text into the array, zero to format
 *            the array out to the user buffer
 * @filp:     unused
 * @buffer:   user buffer
 * @lenp:     in: size of the user buffer; out: bytes consumed/produced
 * @ppos:     file position; a read at a non-zero position returns nothing
 * @conv:     conversion callback, do_proc_dointvec_conv() when NULL
 * @data:     opaque argument handed to @conv
 *
 * On write, values are separated by whitespace and parsed with
 * simple_strtoul() (base auto-detected) with an optional leading '-'.
 * On read, values are separated by tabs and followed by a newline.
 *
 * Returns 0 on success, -EFAULT on a failed user access, or -EINVAL when
 * a write did not complete the conversion of a single value.
 */
static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
		  int write, struct file *filp, void __user *buffer,
		  size_t *lenp, loff_t *ppos,
		  int (*conv)(int *negp, unsigned long *lvalp, int *valp,
			      int write, void *data),
		  void *data)
{
#define TMPBUFLEN 21
	int *i, vleft, first = 1, neg;
	unsigned long lval;
	size_t left, len;

	char buf[TMPBUFLEN], *p;
	char __user *s = buffer;

	if (!tbl_data || !table->maxlen || !*lenp ||
	    (*ppos && !write)) {
		*lenp = 0;
		return 0;
	}

	i = (int *) tbl_data;
	vleft = table->maxlen / sizeof(*i);
	left = *lenp;

	if (!conv)
		conv = do_proc_dointvec_conv;

	for (; left && vleft--; i++, first = 0) {
		if (write) {
			/* Skip leading whitespace. */
			while (left) {
				char c;
				if (get_user(c, s))
					return -EFAULT;
				if (!isspace(c))
					break;
				left--;
				s++;
			}
			if (!left)
				break;
			neg = 0;
			/* Pull at most one buffer's worth of text in. */
			len = left;
			if (len > sizeof(buf) - 1)
				len = sizeof(buf) - 1;
			if (copy_from_user(buf, s, len))
				return -EFAULT;
			buf[len] = 0;
			p = buf;
			if (*p == '-' && left > 1) {
				neg = 1;
				p++;
			}
			if (*p < '0' || *p > '9')
				break;

			lval = simple_strtoul(p, &p, 0);

			/* The number must be followed by whitespace or end. */
			len = p - buf;
			if ((len < left) && *p && !isspace(*p))
				break;
			/*
			 * The sign is applied by conv() from neg/lval; there
			 * is no separate signed copy to negate here.
			 */
			s += len;
			left -= len;

			if (conv(&neg, &lval, i, 1, data))
				break;
		} else {
			p = buf;
			if (!first)
				*p++ = '\t';

			if (conv(&neg, &lval, i, 0, data))
				break;

			sprintf(p, "%s%lu", neg ? "-" : "", lval);
			len = strlen(buf);
			if (len > left)
				len = left;
			if (copy_to_user(s, buf, len))
				return -EFAULT;
			left -= len;
			s += len;
		}
	}

	if (!write && !first && left) {
		if (put_user('\n', s))
			return -EFAULT;
		left--, s++;
	}
	if (write) {
		/* Consume trailing whitespace. */
		while (left) {
			char c;
			if (get_user(c, s++))
				return -EFAULT;
			if (!isspace(c))
				break;
			left--;
		}
	}
	/* first is still set when no value completed an iteration. */
	if (write && first)
		return -EINVAL;
	*lenp -= left;
	*ppos += *lenp;
	return 0;
#undef TMPBUFLEN
}
2007-10-18 14:05:22 +04:00
static int do_proc_dointvec ( struct ctl_table * table , int write , struct file * filp ,
2006-10-02 13:18:23 +04:00
void __user * buffer , size_t * lenp , loff_t * ppos ,
int ( * conv ) ( int * negp , unsigned long * lvalp , int * valp ,
int write , void * data ) ,
void * data )
{
return __do_proc_dointvec ( table - > data , table , write , filp ,
buffer , lenp , ppos , conv , data ) ;
}
2005-04-17 02:20:36 +04:00
/**
* proc_dointvec - read a vector of integers
* @ table : the sysctl table
* @ write : % TRUE if this is a write to the sysctl file
* @ filp : the file structure
* @ buffer : the user buffer
* @ lenp : the size of the user buffer
* @ ppos : file position
*
* Reads / writes up to table - > maxlen / sizeof ( unsigned int ) integer
* values from / to the user buffer , treated as an ASCII string .
*
* Returns 0 on success .
*/
2007-10-18 14:05:22 +04:00
int proc_dointvec ( struct ctl_table * table , int write , struct file * filp ,
2005-04-17 02:20:36 +04:00
void __user * buffer , size_t * lenp , loff_t * ppos )
{
return do_proc_dointvec ( table , write , filp , buffer , lenp , ppos ,
NULL , NULL ) ;
}
/* How a written value is combined with the stored one. */
#define OP_SET	0
#define OP_AND	1
#define OP_OR	2

/*
 * Conversion callback that merges a written value into the stored int.
 *
 * @negp:  sign flag; read on write, set on read (-1 for negative, else 0)
 * @lvalp: magnitude; read on write, set on read
 * @valp:  the stored integer
 * @write: non-zero to merge into *@valp, zero to load from it
 * @data:  points to an int holding OP_SET, OP_AND or OP_OR
 *
 * On write the new value replaces, is ANDed into, or is ORed into
 * *@valp according to the operation; any other operation leaves *@valp
 * untouched.  Always returns 0.
 */
static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp,
				      int *valp,
				      int write, void *data)
{
	int op = *(int *)data;

	if (write) {
		int val = *negp ? -*lvalp : *lvalp;

		switch (op) {
		case OP_SET:
			*valp = val;
			break;
		case OP_AND:
			*valp &= val;
			break;
		case OP_OR:
			*valp |= val;
			break;
		default:
			break;
		}
	} else {
		int val = *valp;

		if (val < 0) {
			*negp = -1;
			/*
			 * Negate in unsigned arithmetic: -val on the int
			 * would overflow when val is INT_MIN.
			 */
			*lvalp = -(unsigned long)val;
		} else {
			*negp = 0;
			*lvalp = (unsigned long)val;
		}
	}
	return 0;
}
V3 file capabilities: alter behavior of cap_setpcap
The non-filesystem capability meaning of CAP_SETPCAP is that a process, p1,
can change the capabilities of another process, p2. This is not the
meaning that was intended for this capability at all, and this
implementation came about purely because, without filesystem capabilities,
there was no way to use capabilities without one process bestowing them on
another.
Since we now have a filesystem support for capabilities we can fix the
implementation of CAP_SETPCAP.
The most significant thing about this change is that, with it in effect, no
process can set the capabilities of another process.
The capabilities of a program are set via the capability convolution
rules:
pI(post-exec) = pI(pre-exec)
pP(post-exec) = (X(aka cap_bset) & fP) | (pI(post-exec) & fI)
pE(post-exec) = fE ? pP(post-exec) : 0
at exec() time. As such, the only influence the pre-exec() program can
have on the post-exec() program's capabilities are through the pI
capability set.
The correct implementation for CAP_SETPCAP (and that enabled by this patch)
is that it can be used to add extra pI capabilities to the current process
- to be picked up by subsequent exec()s when the above convolution rules
are applied.
Here is how it works:
Let's say we have a process, p. It has capability sets, pE, pP and pI.
Generally, p, can change the value of its own pI to pI' where
(pI' & ~pI) & ~pP = 0.
That is, the only new things in pI' that were not present in pI need to
be present in pP.
The role of CAP_SETPCAP is basically to permit changes to pI beyond
the above:
if (pE & CAP_SETPCAP) {
pI' = anything; /* ie., even (pI' & ~pI) & ~pP != 0 */
}
This capability is useful for things like login, which (say, via
pam_cap) might want to raise certain inheritable capabilities for use
by the children of the logged-in user's shell, but those capabilities
are not useful to or needed by the login program itself.
One such use might be to limit who can run ping. You set the
capabilities of the 'ping' program to be "= cap_net_raw+i", and then
only shells that have (pI & CAP_NET_RAW) will be able to run
it. Without CAP_SETPCAP implemented as described above, login(pam_cap)
would have to also have (pP & CAP_NET_RAW) in order to raise this
capability and pass it on through the inheritable set.
Signed-off-by: Andrew Morgan <morgan@kernel.org>
Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
Cc: Stephen Smalley <sds@tycho.nsa.gov>
Cc: James Morris <jmorris@namei.org>
Cc: Casey Schaufler <casey@schaufler-ca.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-18 14:05:59 +04:00
# ifdef CONFIG_SECURITY_CAPABILITIES
2005-04-17 02:20:36 +04:00
/*
* init may raise the set .
*/
V3 file capabilities: alter behavior of cap_setpcap
The non-filesystem capability meaning of CAP_SETPCAP is that a process, p1,
can change the capabilities of another process, p2. This is not the
meaning that was intended for this capability at all, and this
implementation came about purely because, without filesystem capabilities,
there was no way to use capabilities without one process bestowing them on
another.
Since we now have a filesystem support for capabilities we can fix the
implementation of CAP_SETPCAP.
The most significant thing about this change is that, with it in effect, no
process can set the capabilities of another process.
The capabilities of a program are set via the capability convolution
rules:
pI(post-exec) = pI(pre-exec)
pP(post-exec) = (X(aka cap_bset) & fP) | (pI(post-exec) & fI)
pE(post-exec) = fE ? pP(post-exec) : 0
at exec() time. As such, the only influence the pre-exec() program can
have on the post-exec() program's capabilities are through the pI
capability set.
The correct implementation for CAP_SETPCAP (and that enabled by this patch)
is that it can be used to add extra pI capabilities to the current process
- to be picked up by subsequent exec()s when the above convolution rules
are applied.
Here is how it works:
Let's say we have a process, p. It has capability sets, pE, pP and pI.
Generally, p, can change the value of its own pI to pI' where
(pI' & ~pI) & ~pP = 0.
That is, the only new things in pI' that were not present in pI need to
be present in pP.
The role of CAP_SETPCAP is basically to permit changes to pI beyond
the above:
if (pE & CAP_SETPCAP) {
pI' = anything; /* ie., even (pI' & ~pI) & ~pP != 0 */
}
This capability is useful for things like login, which (say, via
pam_cap) might want to raise certain inheritable capabilities for use
by the children of the logged-in user's shell, but those capabilities
are not useful to or needed by the login program itself.
One such use might be to limit who can run ping. You set the
capabilities of the 'ping' program to be "= cap_net_raw+i", and then
only shells that have (pI & CAP_NET_RAW) will be able to run
it. Without CAP_SETPCAP implemented as described above, login(pam_cap)
would have to also have (pP & CAP_NET_RAW) in order to raise this
capability and pass it on through the inheritable set.
Signed-off-by: Andrew Morgan <morgan@kernel.org>
Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
Cc: Stephen Smalley <sds@tycho.nsa.gov>
Cc: James Morris <jmorris@namei.org>
Cc: Casey Schaufler <casey@schaufler-ca.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-18 14:05:59 +04:00
2007-10-18 14:05:22 +04:00
int proc_dointvec_bset ( struct ctl_table * table , int write , struct file * filp ,
2005-04-17 02:20:36 +04:00
void __user * buffer , size_t * lenp , loff_t * ppos )
{
int op ;
2007-02-10 12:43:19 +03:00
if ( write & & ! capable ( CAP_SYS_MODULE ) ) {
2005-04-17 02:20:36 +04:00
return - EPERM ;
}
2007-10-19 10:39:52 +04:00
op = is_global_init ( current ) ? OP_SET : OP_AND ;
2005-04-17 02:20:36 +04:00
return do_proc_dointvec ( table , write , filp , buffer , lenp , ppos ,
do_proc_dointvec_bset_conv , & op ) ;
}
V3 file capabilities: alter behavior of cap_setpcap
The non-filesystem capability meaning of CAP_SETPCAP is that a process, p1,
can change the capabilities of another process, p2. This is not the
meaning that was intended for this capability at all, and this
implementation came about purely because, without filesystem capabilities,
there was no way to use capabilities without one process bestowing them on
another.
Since we now have a filesystem support for capabilities we can fix the
implementation of CAP_SETPCAP.
The most significant thing about this change is that, with it in effect, no
process can set the capabilities of another process.
The capabilities of a program are set via the capability convolution
rules:
pI(post-exec) = pI(pre-exec)
pP(post-exec) = (X(aka cap_bset) & fP) | (pI(post-exec) & fI)
pE(post-exec) = fE ? pP(post-exec) : 0
at exec() time. As such, the only influence the pre-exec() program can
have on the post-exec() program's capabilities are through the pI
capability set.
The correct implementation for CAP_SETPCAP (and that enabled by this patch)
is that it can be used to add extra pI capabilities to the current process
- to be picked up by subsequent exec()s when the above convolution rules
are applied.
Here is how it works:
Let's say we have a process, p. It has capability sets, pE, pP and pI.
Generally, p, can change the value of its own pI to pI' where
(pI' & ~pI) & ~pP = 0.
That is, the only new things in pI' that were not present in pI need to
be present in pP.
The role of CAP_SETPCAP is basically to permit changes to pI beyond
the above:
if (pE & CAP_SETPCAP) {
pI' = anything; /* ie., even (pI' & ~pI) & ~pP != 0 */
}
This capability is useful for things like login, which (say, via
pam_cap) might want to raise certain inheritable capabilities for use
by the children of the logged-in user's shell, but those capabilities
are not useful to or needed by the login program itself.
One such use might be to limit who can run ping. You set the
capabilities of the 'ping' program to be "= cap_net_raw+i", and then
only shells that have (pI & CAP_NET_RAW) will be able to run
it. Without CAP_SETPCAP implemented as described above, login(pam_cap)
would have to also have (pP & CAP_NET_RAW) in order to raise this
capability and pass it on through the inheritable set.
Signed-off-by: Andrew Morgan <morgan@kernel.org>
Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
Cc: Stephen Smalley <sds@tycho.nsa.gov>
Cc: James Morris <jmorris@namei.org>
Cc: Casey Schaufler <casey@schaufler-ca.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-18 14:05:59 +04:00
# endif /* def CONFIG_SECURITY_CAPABILITIES */
2005-04-17 02:20:36 +04:00
2007-02-10 12:45:24 +03:00
/*
* Taint values can only be increased
*/
2007-10-18 14:05:22 +04:00
static int proc_dointvec_taint ( struct ctl_table * table , int write , struct file * filp ,
2007-02-10 12:45:24 +03:00
void __user * buffer , size_t * lenp , loff_t * ppos )
{
int op ;
2007-04-24 01:41:14 +04:00
if ( write & & ! capable ( CAP_SYS_ADMIN ) )
2007-02-10 12:45:24 +03:00
return - EPERM ;
op = OP_OR ;
return do_proc_dointvec ( table , write , filp , buffer , lenp , ppos ,
do_proc_dointvec_bset_conv , & op ) ;
}
2005-04-17 02:20:36 +04:00
/* Optional bounds for do_proc_dointvec_minmax_conv(); NULL means no bound. */
struct do_proc_dointvec_minmax_conv_param {
	int *min;
	int *max;
};

/*
 * Conversion callback that range-checks written values.
 *
 * @negp:  sign flag; read on write, set on read (-1 for negative, else 0)
 * @lvalp: magnitude; read on write, set on read
 * @valp:  the stored integer
 * @write: non-zero to store into *@valp, zero to load from it
 * @data:  points to a struct do_proc_dointvec_minmax_conv_param
 *
 * Returns 0 on success, or -EINVAL (leaving *@valp unchanged) when a
 * written value lies outside the bounds that are present.
 */
static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp,
					int *valp,
					int write, void *data)
{
	struct do_proc_dointvec_minmax_conv_param *param = data;

	if (write) {
		int val = *negp ? -*lvalp : *lvalp;

		if ((param->min && *param->min > val) ||
		    (param->max && *param->max < val))
			return -EINVAL;
		*valp = val;
	} else {
		int val = *valp;

		if (val < 0) {
			*negp = -1;
			/*
			 * Negate in unsigned arithmetic: -val on the int
			 * would overflow when val is INT_MIN.
			 */
			*lvalp = -(unsigned long)val;
		} else {
			*negp = 0;
			*lvalp = (unsigned long)val;
		}
	}
	return 0;
}
/**
* proc_dointvec_minmax - read a vector of integers with min / max values
* @ table : the sysctl table
* @ write : % TRUE if this is a write to the sysctl file
* @ filp : the file structure
* @ buffer : the user buffer
* @ lenp : the size of the user buffer
* @ ppos : file position
*
* Reads / writes up to table - > maxlen / sizeof ( unsigned int ) integer
* values from / to the user buffer , treated as an ASCII string .
*
* This routine will ensure the values are within the range specified by
* table - > extra1 ( min ) and table - > extra2 ( max ) .
*
* Returns 0 on success .
*/
2007-10-18 14:05:22 +04:00
int proc_dointvec_minmax ( struct ctl_table * table , int write , struct file * filp ,
2005-04-17 02:20:36 +04:00
void __user * buffer , size_t * lenp , loff_t * ppos )
{
struct do_proc_dointvec_minmax_conv_param param = {
. min = ( int * ) table - > extra1 ,
. max = ( int * ) table - > extra2 ,
} ;
return do_proc_dointvec ( table , write , filp , buffer , lenp , ppos ,
do_proc_dointvec_minmax_conv , & param ) ;
}
2007-10-18 14:05:22 +04:00
/*
 * Read or write a vector of unsigned longs stored at @data as ASCII text
 * in the user buffer, with optional per-element bounds and unit scaling.
 *
 * @data:    kernel array; table->maxlen / sizeof(unsigned long) elements
 * @table:   sysctl entry; extra1/extra2 are optional arrays of per-element
 *           minimum and maximum values (NULL for no bound)
 * @write:   non-zero to parse user text into the array, zero to format
 *           the array out to the user buffer
 * @filp:    unused
 * @buffer:  user buffer
 * @lenp:    in: size of the user buffer; out: bytes consumed/produced
 * @ppos:    file position; a read at a non-zero position returns nothing
 * @convmul: written values are multiplied by this, read values divided
 * @convdiv: written values are divided by this, read values multiplied
 *
 * Written values that are negative or out of range are skipped without
 * being stored and without failing the call.
 *
 * Returns 0 on success, -EFAULT on a failed user access, or -EINVAL when
 * a write did not complete the parsing of a single value.
 */
static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write,
				     struct file *filp,
				     void __user *buffer,
				     size_t *lenp, loff_t *ppos,
				     unsigned long convmul,
				     unsigned long convdiv)
{
#define TMPBUFLEN 21
	unsigned long *i, *min, *max, val;
	int vleft, first = 1, neg;
	size_t len, left;
	size_t idx = 0;
	char buf[TMPBUFLEN], *p;
	char __user *s = buffer;

	if (!data || !table->maxlen || !*lenp ||
	    (*ppos && !write)) {
		*lenp = 0;
		return 0;
	}

	i = (unsigned long *) data;
	min = (unsigned long *) table->extra1;
	max = (unsigned long *) table->extra2;
	vleft = table->maxlen / sizeof(unsigned long);
	left = *lenp;

	/*
	 * min and max stay at their base so a NULL bound stays NULL for
	 * every element; idx selects the bound that matches *i.
	 */
	for (; left && vleft--; i++, idx++, first = 0) {
		if (write) {
			/* Skip leading whitespace. */
			while (left) {
				char c;
				if (get_user(c, s))
					return -EFAULT;
				if (!isspace(c))
					break;
				left--;
				s++;
			}
			if (!left)
				break;
			neg = 0;
			/* Pull at most one buffer's worth of text in. */
			len = left;
			if (len > TMPBUFLEN - 1)
				len = TMPBUFLEN - 1;
			if (copy_from_user(buf, s, len))
				return -EFAULT;
			buf[len] = 0;
			p = buf;
			if (*p == '-' && left > 1) {
				neg = 1;
				p++;
			}
			if (*p < '0' || *p > '9')
				break;
			val = simple_strtoul(p, &p, 0) * convmul / convdiv;
			/* The number must be followed by whitespace or end. */
			len = p - buf;
			if ((len < left) && *p && !isspace(*p))
				break;
			if (neg)
				val = -val;
			s += len;
			left -= len;

			/* Negative and out-of-range values are skipped. */
			if (neg)
				continue;
			if ((min && val < min[idx]) || (max && val > max[idx]))
				continue;
			*i = val;
		} else {
			p = buf;
			if (!first)
				*p++ = '\t';
			/*
			 * NOTE(review): buf is TMPBUFLEN (21) bytes; a tab
			 * plus a 20-digit value plus the terminator needs 22.
			 * Confirm whether values that large can occur here.
			 */
			sprintf(p, "%lu", convdiv * (*i) / convmul);
			len = strlen(buf);
			if (len > left)
				len = left;
			if (copy_to_user(s, buf, len))
				return -EFAULT;
			left -= len;
			s += len;
		}
	}

	if (!write && !first && left) {
		if (put_user('\n', s))
			return -EFAULT;
		left--, s++;
	}
	if (write) {
		/* Consume trailing whitespace. */
		while (left) {
			char c;
			if (get_user(c, s++))
				return -EFAULT;
			if (!isspace(c))
				break;
			left--;
		}
	}
	/* first is still set when no value completed an iteration. */
	if (write && first)
		return -EINVAL;
	*lenp -= left;
	*ppos += *lenp;
	return 0;
#undef TMPBUFLEN
}
2007-10-18 14:05:22 +04:00
/*
 * Run __do_proc_doulongvec_minmax() on the unsigned long vector that
 * @table->data points to.  All other arguments are passed through.
 */
static int do_proc_doulongvec_minmax(struct ctl_table *table, int write,
				     struct file *filp,
				     void __user *buffer,
				     size_t *lenp, loff_t *ppos,
				     unsigned long convmul,
				     unsigned long convdiv)
{
	void *vec = table->data;

	return __do_proc_doulongvec_minmax(vec, table, write, filp, buffer,
					   lenp, ppos, convmul, convdiv);
}
2005-04-17 02:20:36 +04:00
/**
* proc_doulongvec_minmax - read a vector of long integers with min / max values
* @ table : the sysctl table
* @ write : % TRUE if this is a write to the sysctl file
* @ filp : the file structure
* @ buffer : the user buffer
* @ lenp : the size of the user buffer
* @ ppos : file position
*
* Reads / writes up to table - > maxlen / sizeof ( unsigned long ) unsigned long
* values from / to the user buffer , treated as an ASCII string .
*
* This routine will ensure the values are within the range specified by
* table - > extra1 ( min ) and table - > extra2 ( max ) .
*
* Returns 0 on success .
*/
2007-10-18 14:05:22 +04:00
int proc_doulongvec_minmax ( struct ctl_table * table , int write , struct file * filp ,
2005-04-17 02:20:36 +04:00
void __user * buffer , size_t * lenp , loff_t * ppos )
{
return do_proc_doulongvec_minmax ( table , write , filp , buffer , lenp , ppos , 1l , 1l ) ;
}
/**
* proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min / max values
* @ table : the sysctl table
* @ write : % TRUE if this is a write to the sysctl file
* @ filp : the file structure
* @ buffer : the user buffer
* @ lenp : the size of the user buffer
* @ ppos : file position
*
* Reads / writes up to table - > maxlen / sizeof ( unsigned long ) unsigned long
* values from / to the user buffer , treated as an ASCII string . The values
* are treated as milliseconds , and converted to jiffies when they are stored .
*
* This routine will ensure the values are within the range specified by
* table - > extra1 ( min ) and table - > extra2 ( max ) .
*
* Returns 0 on success .
*/
2007-10-18 14:05:22 +04:00
int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
				      struct file *filp,
				      void __user *buffer,
				      size_t *lenp, loff_t *ppos)
{
	/* Written values are scaled by HZ / 1000, read values by 1000 / HZ. */
	const unsigned long mul = HZ;
	const unsigned long div = 1000l;

	return do_proc_doulongvec_minmax(table, write, filp, buffer,
					 lenp, ppos, mul, div);
}
/*
 * Conversion callback that multiplies written values by HZ and divides
 * read values by HZ.
 *
 * @negp:  sign flag; read on write, set on read (-1 for negative, else 0)
 * @lvalp: magnitude; read on write, set on read
 * @valp:  the stored integer
 * @write: non-zero to store into *@valp, zero to load from it
 * @data:  unused
 *
 * Returns 0 on success, or 1 when a written magnitude exceeds
 * LONG_MAX / HZ.
 */
static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp,
					 int *valp,
					 int write, void *data)
{
	if (write) {
		if (*lvalp > LONG_MAX / HZ)
			return 1;
		*valp = *negp ? -(*lvalp * HZ) : (*lvalp * HZ);
	} else {
		int val = *valp;
		unsigned long lval;

		if (val < 0) {
			*negp = -1;
			/*
			 * Negate in unsigned arithmetic: -val on the int
			 * would overflow when val is INT_MIN.
			 */
			lval = -(unsigned long)val;
		} else {
			*negp = 0;
			lval = (unsigned long)val;
		}
		*lvalp = lval / HZ;
	}
	return 0;
}
static int do_proc_dointvec_userhz_jiffies_conv ( int * negp , unsigned long * lvalp ,
int * valp ,
int write , void * data )
{
if ( write ) {
2006-03-24 14:15:50 +03:00
if ( USER_HZ < HZ & & * lvalp > ( LONG_MAX / HZ ) * USER_HZ )
return 1 ;
2005-04-17 02:20:36 +04:00
* valp = clock_t_to_jiffies ( * negp ? - * lvalp : * lvalp ) ;
} else {
int val = * valp ;
unsigned long lval ;
if ( val < 0 ) {
* negp = - 1 ;
lval = ( unsigned long ) - val ;
} else {
* negp = 0 ;
lval = ( unsigned long ) val ;
}
* lvalp = jiffies_to_clock_t ( lval ) ;
}
return 0 ;
}
/*
 * Conversion callback that passes written values through
 * msecs_to_jiffies() and read values through jiffies_to_msecs().
 *
 * @negp:  sign flag; read on write, set on read (-1 for negative, else 0)
 * @lvalp: magnitude; read on write, set on read
 * @valp:  the stored integer
 * @write: non-zero to store into *@valp, zero to load from it
 * @data:  unused
 *
 * Always returns 0.
 */
static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp,
					    int *valp,
					    int write, void *data)
{
	if (write) {
		*valp = msecs_to_jiffies(*negp ? -*lvalp : *lvalp);
	} else {
		int val = *valp;
		unsigned long lval;

		if (val < 0) {
			*negp = -1;
			/*
			 * Negate in unsigned arithmetic: -val on the int
			 * would overflow when val is INT_MIN.
			 */
			lval = -(unsigned long)val;
		} else {
			*negp = 0;
			lval = (unsigned long)val;
		}
		*lvalp = jiffies_to_msecs(lval);
	}
	return 0;
}
/**
* proc_dointvec_jiffies - read a vector of integers as seconds
* @ table : the sysctl table
* @ write : % TRUE if this is a write to the sysctl file
* @ filp : the file structure
* @ buffer : the user buffer
* @ lenp : the size of the user buffer
* @ ppos : file position
*
* Reads / writes up to table - > maxlen / sizeof ( unsigned int ) integer
* values from / to the user buffer , treated as an ASCII string .
* The values read are assumed to be in seconds , and are converted into
* jiffies .
*
* Returns 0 on success .
*/
2007-10-18 14:05:22 +04:00
int proc_dointvec_jiffies ( struct ctl_table * table , int write , struct file * filp ,
2005-04-17 02:20:36 +04:00
void __user * buffer , size_t * lenp , loff_t * ppos )
{
return do_proc_dointvec ( table , write , filp , buffer , lenp , ppos ,
do_proc_dointvec_jiffies_conv , NULL ) ;
}
/**
* proc_dointvec_userhz_jiffies - read a vector of integers as 1 / USER_HZ seconds
* @ table : the sysctl table
* @ write : % TRUE if this is a write to the sysctl file
* @ filp : the file structure
* @ buffer : the user buffer
* @ lenp : the size of the user buffer
2005-11-07 12:01:06 +03:00
* @ ppos : pointer to the file position
2005-04-17 02:20:36 +04:00
*
* Reads / writes up to table - > maxlen / sizeof ( unsigned int ) integer
* values from / to the user buffer , treated as an ASCII string .
* The values read are assumed to be in 1 / USER_HZ seconds , and
* are converted into jiffies .
*
* Returns 0 on success .
*/
2007-10-18 14:05:22 +04:00
int proc_dointvec_userhz_jiffies ( struct ctl_table * table , int write , struct file * filp ,
2005-04-17 02:20:36 +04:00
void __user * buffer , size_t * lenp , loff_t * ppos )
{
return do_proc_dointvec ( table , write , filp , buffer , lenp , ppos ,
do_proc_dointvec_userhz_jiffies_conv , NULL ) ;
}
/**
* proc_dointvec_ms_jiffies - read a vector of integers as milliseconds
* @ table : the sysctl table
* @ write : % TRUE if this is a write to the sysctl file
* @ filp : the file structure
* @ buffer : the user buffer
* @ lenp : the size of the user buffer
2005-05-01 19:59:26 +04:00
* @ ppos : the current position in the file
2005-04-17 02:20:36 +04:00
*
* Reads / writes up to table - > maxlen / sizeof ( unsigned int ) integer
* values from / to the user buffer , treated as an ASCII string .
* The values read are assumed to be in 1 / 1000 seconds , and
* are converted into jiffies .
*
* Returns 0 on success .
*/
2007-10-18 14:05:22 +04:00
int proc_dointvec_ms_jiffies ( struct ctl_table * table , int write , struct file * filp ,
2005-04-17 02:20:36 +04:00
void __user * buffer , size_t * lenp , loff_t * ppos )
{
return do_proc_dointvec ( table , write , filp , buffer , lenp , ppos ,
do_proc_dointvec_ms_jiffies_conv , NULL ) ;
}
2007-10-18 14:05:22 +04:00
static int proc_do_cad_pid ( struct ctl_table * table , int write , struct file * filp ,
2006-10-02 13:19:00 +04:00
void __user * buffer , size_t * lenp , loff_t * ppos )
{
struct pid * new_pid ;
pid_t tmp ;
int r ;
2007-10-19 10:40:14 +04:00
tmp = pid_nr_ns ( cad_pid , current - > nsproxy - > pid_ns ) ;
2006-10-02 13:19:00 +04:00
r = __do_proc_dointvec ( & tmp , table , write , filp , buffer ,
lenp , ppos , NULL , NULL ) ;
if ( r | | ! write )
return r ;
new_pid = find_get_pid ( tmp ) ;
if ( ! new_pid )
return - ESRCH ;
put_pid ( xchg ( & cad_pid , new_pid ) ) ;
return 0 ;
}
2005-04-17 02:20:36 +04:00
# else /* CONFIG_PROC_FS */
2007-10-18 14:05:22 +04:00
int proc_dostring ( struct ctl_table * table , int write , struct file * filp ,
2005-04-17 02:20:36 +04:00
void __user * buffer , size_t * lenp , loff_t * ppos )
{
return - ENOSYS ;
}
2007-10-18 14:05:22 +04:00
int proc_dointvec ( struct ctl_table * table , int write , struct file * filp ,
2005-04-17 02:20:36 +04:00
void __user * buffer , size_t * lenp , loff_t * ppos )
{
return - ENOSYS ;
}
2007-10-18 14:05:22 +04:00
int proc_dointvec_bset ( struct ctl_table * table , int write , struct file * filp ,
2005-04-17 02:20:36 +04:00
void __user * buffer , size_t * lenp , loff_t * ppos )
{
return - ENOSYS ;
}
2007-10-18 14:05:22 +04:00
int proc_dointvec_minmax ( struct ctl_table * table , int write , struct file * filp ,
2005-04-17 02:20:36 +04:00
void __user * buffer , size_t * lenp , loff_t * ppos )
{
return - ENOSYS ;
}
2007-10-18 14:05:22 +04:00
int proc_dointvec_jiffies ( struct ctl_table * table , int write , struct file * filp ,
2005-04-17 02:20:36 +04:00
void __user * buffer , size_t * lenp , loff_t * ppos )
{
return - ENOSYS ;
}
2007-10-18 14:05:22 +04:00
int proc_dointvec_userhz_jiffies ( struct ctl_table * table , int write , struct file * filp ,
2005-04-17 02:20:36 +04:00
void __user * buffer , size_t * lenp , loff_t * ppos )
{
return - ENOSYS ;
}
2007-10-18 14:05:22 +04:00
int proc_dointvec_ms_jiffies ( struct ctl_table * table , int write , struct file * filp ,
2005-04-17 02:20:36 +04:00
void __user * buffer , size_t * lenp , loff_t * ppos )
{
return - ENOSYS ;
}
2007-10-18 14:05:22 +04:00
int proc_doulongvec_minmax ( struct ctl_table * table , int write , struct file * filp ,
2005-04-17 02:20:36 +04:00
void __user * buffer , size_t * lenp , loff_t * ppos )
{
return - ENOSYS ;
}
2007-10-18 14:05:22 +04:00
/* Stub from the #else branch of CONFIG_PROC_FS: always fails with -ENOSYS. */
int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
				      struct file *filp, void __user *buffer,
				      size_t *lenp, loff_t *ppos)
{
	return -ENOSYS;
}
# endif /* CONFIG_PROC_FS */
2006-09-27 12:51:04 +04:00
# ifdef CONFIG_SYSCTL_SYSCALL
2005-04-17 02:20:36 +04:00
/*
* General sysctl support routines
*/
2007-10-18 14:05:23 +04:00
/*
 * The generic sysctl data routine (used if no strategy routine supplied).
 *
 * Copies up to table->maxlen raw bytes of table->data out to @oldval
 * (updating *@oldlenp with the amount copied) and/or up to table->maxlen
 * bytes from @newval into table->data.
 *
 * Returns 1 once the request has been handled, -ENOTDIR if the table entry
 * has no data pointer or a zero maxlen, and -EFAULT if a user access fails.
 */
int sysctl_data(struct ctl_table *table, int __user *name, int nlen,
		void __user *oldval, size_t __user *oldlenp,
		void __user *newval, size_t newlen)
{
	size_t want;

	/* Get out if I don't have a variable */
	if (!(table->data && table->maxlen))
		return -ENOTDIR;

	if (oldval && oldlenp) {
		if (get_user(want, oldlenp))
			return -EFAULT;

		/* A zero-sized old buffer means "no writeback at all". */
		if (want != 0) {
			if (want > table->maxlen)
				want = table->maxlen;
			if (copy_to_user(oldval, table->data, want))
				return -EFAULT;
			if (put_user(want, oldlenp))
				return -EFAULT;
		}
	}

	if (newval && newlen) {
		/* Never accept more than the variable can hold. */
		if (newlen > table->maxlen)
			newlen = table->maxlen;
		if (copy_from_user(table->data, newval, newlen))
			return -EFAULT;
	}

	return 1;
}
2005-04-17 02:20:36 +04:00
/* The generic string strategy routine: */
2007-10-18 14:05:22 +04:00
int sysctl_string ( struct ctl_table * table , int __user * name , int nlen ,
2005-04-17 02:20:36 +04:00
void __user * oldval , size_t __user * oldlenp ,
2006-12-10 13:19:10 +03:00
void __user * newval , size_t newlen )
2005-04-17 02:20:36 +04:00
{
if ( ! table - > data | | ! table - > maxlen )
return - ENOTDIR ;
if ( oldval & & oldlenp ) {
2006-01-01 04:00:29 +03:00
size_t bufsize ;
if ( get_user ( bufsize , oldlenp ) )
2005-04-17 02:20:36 +04:00
return - EFAULT ;
2006-01-01 04:00:29 +03:00
if ( bufsize ) {
size_t len = strlen ( table - > data ) , copied ;
/* This shouldn't trigger for a well-formed sysctl */
if ( len > table - > maxlen )
2005-04-17 02:20:36 +04:00
len = table - > maxlen ;
2006-01-01 04:00:29 +03:00
/* Copy up to a max of bufsize-1 bytes of the string */
copied = ( len > = bufsize ) ? bufsize - 1 : len ;
if ( copy_to_user ( oldval , table - > data , copied ) | |
put_user ( 0 , ( char __user * ) ( oldval + copied ) ) )
2005-04-17 02:20:36 +04:00
return - EFAULT ;
2006-01-01 04:00:29 +03:00
if ( put_user ( len , oldlenp ) )
2005-04-17 02:20:36 +04:00
return - EFAULT ;
}
}
if ( newval & & newlen ) {
2006-01-01 04:00:29 +03:00
size_t len = newlen ;
2005-04-17 02:20:36 +04:00
if ( len > table - > maxlen )
len = table - > maxlen ;
if ( copy_from_user ( table - > data , newval , len ) )
return - EFAULT ;
if ( len = = table - > maxlen )
len - - ;
( ( char * ) table - > data ) [ len ] = 0 ;
}
2005-12-30 11:37:10 +03:00
return 1 ;
2005-04-17 02:20:36 +04:00
}
/*
* This function makes sure that all of the integers in the vector
* are between the minimum and maximum values given in the arrays
* table - > extra1 and table - > extra2 , respectively .
*/
2007-10-18 14:05:22 +04:00
int sysctl_intvec ( struct ctl_table * table , int __user * name , int nlen ,
2005-04-17 02:20:36 +04:00
void __user * oldval , size_t __user * oldlenp ,
2006-12-10 13:19:10 +03:00
void __user * newval , size_t newlen )
2005-04-17 02:20:36 +04:00
{
if ( newval & & newlen ) {
int __user * vec = ( int __user * ) newval ;
int * min = ( int * ) table - > extra1 ;
int * max = ( int * ) table - > extra2 ;
size_t length ;
int i ;
if ( newlen % sizeof ( int ) ! = 0 )
return - EINVAL ;
if ( ! table - > extra1 & & ! table - > extra2 )
return 0 ;
if ( newlen > table - > maxlen )
newlen = table - > maxlen ;
length = newlen / sizeof ( int ) ;
for ( i = 0 ; i < length ; i + + ) {
int value ;
if ( get_user ( value , vec + i ) )
return - EFAULT ;
if ( min & & value < min [ i ] )
return - EINVAL ;
if ( max & & value > max [ i ] )
return - EINVAL ;
}
}
return 0 ;
}
/* Strategy function to convert jiffies to seconds */
2007-10-18 14:05:22 +04:00
int sysctl_jiffies ( struct ctl_table * table , int __user * name , int nlen ,
2005-04-17 02:20:36 +04:00
void __user * oldval , size_t __user * oldlenp ,
2006-12-10 13:19:10 +03:00
void __user * newval , size_t newlen )
2005-04-17 02:20:36 +04:00
{
[PATCH] sysctl_{,ms_}jiffies: fix oldlen semantics
currently it's
1) if *oldlenp == 0,
don't writeback anything
2) if *oldlenp >= table->maxlen,
don't writeback more than table->maxlen bytes and rewrite *oldlenp
don't look at underlying type granularity
3) if 0 < *oldlenp < table->maxlen,
*cough*
string sysctls don't writeback more than *oldlenp bytes.
OK, that's because sizeof(char) == 1
int sysctls writeback anything in (0, table->maxlen] range
Though accept integers divisible by sizeof(int) for writing.
sysctl_jiffies and sysctl_ms_jiffies don't writeback anything but
sizeof(int), which violates 1) and 2).
So, make sysctl_jiffies and sysctl_ms_jiffies accept
a) *oldlenp == 0, not doing writeback
b) *oldlenp >= sizeof(int), writing one integer.
-EINVAL still returned for *oldlenp == 1, 2, 3.
Signed-off-by: Alexey Dobriyan <adobriyan@openvz.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-02-10 12:44:39 +03:00
if ( oldval & & oldlenp ) {
2005-04-17 02:20:36 +04:00
size_t olen ;
[PATCH] sysctl_{,ms_}jiffies: fix oldlen semantics
currently it's
1) if *oldlenp == 0,
don't writeback anything
2) if *oldlenp >= table->maxlen,
don't writeback more than table->maxlen bytes and rewrite *oldlenp
don't look at underlying type granularity
3) if 0 < *oldlenp < table->maxlen,
*cough*
string sysctls don't writeback more than *oldlenp bytes.
OK, that's because sizeof(char) == 1
int sysctls writeback anything in (0, table->maxlen] range
Though accept integers divisible by sizeof(int) for writing.
sysctl_jiffies and sysctl_ms_jiffies don't writeback anything but
sizeof(int), which violates 1) and 2).
So, make sysctl_jiffies and sysctl_ms_jiffies accept
a) *oldlenp == 0, not doing writeback
b) *oldlenp >= sizeof(int), writing one integer.
-EINVAL still returned for *oldlenp == 1, 2, 3.
Signed-off-by: Alexey Dobriyan <adobriyan@openvz.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-02-10 12:44:39 +03:00
if ( get_user ( olen , oldlenp ) )
return - EFAULT ;
if ( olen ) {
int val ;
if ( olen < sizeof ( int ) )
return - EINVAL ;
val = * ( int * ) ( table - > data ) / HZ ;
if ( put_user ( val , ( int __user * ) oldval ) )
return - EFAULT ;
if ( put_user ( sizeof ( int ) , oldlenp ) )
2005-04-17 02:20:36 +04:00
return - EFAULT ;
}
}
if ( newval & & newlen ) {
int new ;
if ( newlen ! = sizeof ( int ) )
return - EINVAL ;
if ( get_user ( new , ( int __user * ) newval ) )
return - EFAULT ;
* ( int * ) ( table - > data ) = new * HZ ;
}
return 1 ;
}
/* Strategy function to convert jiffies to seconds */
2007-10-18 14:05:22 +04:00
int sysctl_ms_jiffies ( struct ctl_table * table , int __user * name , int nlen ,
2005-04-17 02:20:36 +04:00
void __user * oldval , size_t __user * oldlenp ,
2006-12-10 13:19:10 +03:00
void __user * newval , size_t newlen )
2005-04-17 02:20:36 +04:00
{
[PATCH] sysctl_{,ms_}jiffies: fix oldlen semantics
currently it's
1) if *oldlenp == 0,
don't writeback anything
2) if *oldlenp >= table->maxlen,
don't writeback more than table->maxlen bytes and rewrite *oldlenp
don't look at underlying type granularity
3) if 0 < *oldlenp < table->maxlen,
*cough*
string sysctls don't writeback more than *oldlenp bytes.
OK, that's because sizeof(char) == 1
int sysctls writeback anything in (0, table->maxlen] range
Though accept integers divisible by sizeof(int) for writing.
sysctl_jiffies and sysctl_ms_jiffies don't writeback anything but
sizeof(int), which violates 1) and 2).
So, make sysctl_jiffies and sysctl_ms_jiffies accept
a) *oldlenp == 0, not doing writeback
b) *oldlenp >= sizeof(int), writing one integer.
-EINVAL still returned for *oldlenp == 1, 2, 3.
Signed-off-by: Alexey Dobriyan <adobriyan@openvz.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-02-10 12:44:39 +03:00
if ( oldval & & oldlenp ) {
2005-04-17 02:20:36 +04:00
size_t olen ;
[PATCH] sysctl_{,ms_}jiffies: fix oldlen semantics
currently it's
1) if *oldlenp == 0,
don't writeback anything
2) if *oldlenp >= table->maxlen,
don't writeback more than table->maxlen bytes and rewrite *oldlenp
don't look at underlying type granularity
3) if 0 < *oldlenp < table->maxlen,
*cough*
string sysctls don't writeback more than *oldlenp bytes.
OK, that's because sizeof(char) == 1
int sysctls writeback anything in (0, table->maxlen] range
Though accept integers divisible by sizeof(int) for writing.
sysctl_jiffies and sysctl_ms_jiffies don't writeback anything but
sizeof(int), which violates 1) and 2).
So, make sysctl_jiffies and sysctl_ms_jiffies accept
a) *oldlenp == 0, not doing writeback
b) *oldlenp >= sizeof(int), writing one integer.
-EINVAL still returned for *oldlenp == 1, 2, 3.
Signed-off-by: Alexey Dobriyan <adobriyan@openvz.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-02-10 12:44:39 +03:00
if ( get_user ( olen , oldlenp ) )
return - EFAULT ;
if ( olen ) {
int val ;
if ( olen < sizeof ( int ) )
return - EINVAL ;
val = jiffies_to_msecs ( * ( int * ) ( table - > data ) ) ;
if ( put_user ( val , ( int __user * ) oldval ) )
return - EFAULT ;
if ( put_user ( sizeof ( int ) , oldlenp ) )
2005-04-17 02:20:36 +04:00
return - EFAULT ;
}
}
if ( newval & & newlen ) {
int new ;
if ( newlen ! = sizeof ( int ) )
return - EINVAL ;
if ( get_user ( new , ( int __user * ) newval ) )
return - EFAULT ;
* ( int * ) ( table - > data ) = msecs_to_jiffies ( new ) ;
}
return 1 ;
}
2006-12-08 13:39:55 +03:00
2006-09-27 12:51:04 +04:00
# else /* CONFIG_SYSCTL_SYSCALL */
2005-04-17 02:20:36 +04:00
/*
 * sysctl system call entry used when CONFIG_SYSCTL_SYSCALL is not defined.
 *
 * Copies the argument block in from user space, passes it to
 * deprecated_sysctl_warning(), and then fails: with that helper's error if
 * it reported one, otherwise with -ENOSYS.  Returns -EFAULT if the argument
 * block cannot be copied.
 */
asmlinkage long sys_sysctl(struct __sysctl_args __user *args)
{
	struct __sysctl_args kargs;
	int rc;

	if (copy_from_user(&kargs, args, sizeof(kargs)))
		return -EFAULT;

	rc = deprecated_sysctl_warning(&kargs);

	/* If no error reading the parameters then just -ENOSYS ... */
	return rc ? rc : -ENOSYS;
}
2007-10-18 14:05:23 +04:00
/* Stub used when CONFIG_SYSCTL_SYSCALL is not defined: always -ENOSYS. */
int sysctl_data(struct ctl_table *table, int __user *name, int nlen,
		void __user *oldval, size_t __user *oldlenp,
		void __user *newval, size_t newlen)
{
	return -ENOSYS;
}
2007-10-18 14:05:22 +04:00
/* Stub used when CONFIG_SYSCTL_SYSCALL is not defined: always -ENOSYS. */
int sysctl_string(struct ctl_table *table, int __user *name, int nlen,
		  void __user *oldval, size_t __user *oldlenp,
		  void __user *newval, size_t newlen)
{
	return -ENOSYS;
}
2007-10-18 14:05:22 +04:00
/* Stub used when CONFIG_SYSCTL_SYSCALL is not defined: always -ENOSYS. */
int sysctl_intvec(struct ctl_table *table, int __user *name, int nlen,
		  void __user *oldval, size_t __user *oldlenp,
		  void __user *newval, size_t newlen)
{
	return -ENOSYS;
}
2007-10-18 14:05:22 +04:00
/* Stub used when CONFIG_SYSCTL_SYSCALL is not defined: always -ENOSYS. */
int sysctl_jiffies(struct ctl_table *table, int __user *name, int nlen,
		   void __user *oldval, size_t __user *oldlenp,
		   void __user *newval, size_t newlen)
{
	return -ENOSYS;
}
2007-10-18 14:05:22 +04:00
/* Stub used when CONFIG_SYSCTL_SYSCALL is not defined: always -ENOSYS. */
int sysctl_ms_jiffies(struct ctl_table *table, int __user *name, int nlen,
		      void __user *oldval, size_t __user *oldlenp,
		      void __user *newval, size_t newlen)
{
	return -ENOSYS;
}
2006-09-27 12:51:04 +04:00
# endif /* CONFIG_SYSCTL_SYSCALL */
2005-04-17 02:20:36 +04:00
2007-10-18 14:05:58 +04:00
static int deprecated_sysctl_warning ( struct __sysctl_args * args )
{
static int msg_count ;
int name [ CTL_MAXNAME ] ;
int i ;
2007-11-15 03:58:38 +03:00
/* Check args->nlen. */
if ( args - > nlen < 0 | | args - > nlen > CTL_MAXNAME )
return - ENOTDIR ;
2007-10-18 14:05:58 +04:00
/* Read in the sysctl name for better debug message logging */
for ( i = 0 ; i < args - > nlen ; i + + )
if ( get_user ( name [ i ] , args - > name + i ) )
return - EFAULT ;
/* Ignore accesses to kernel.version */
if ( ( args - > nlen = = 2 ) & & ( name [ 0 ] = = CTL_KERN ) & & ( name [ 1 ] = = KERN_VERSION ) )
return 0 ;
if ( msg_count < 5 ) {
msg_count + + ;
printk ( KERN_INFO
" warning: process `%s' used the deprecated sysctl "
" system call with " , current - > comm ) ;
for ( i = 0 ; i < args - > nlen ; i + + )
printk ( " %d. " , name [ i ] ) ;
printk ( " \n " ) ;
}
return 0 ;
}
2005-04-17 02:20:36 +04:00
/*
 * The exports are collected here in one place: several of these symbols
 * are defined twice (once per configuration branch), so exporting next to
 * each definition would mean writing every export twice.
 */

/* proc handlers */
EXPORT_SYMBOL(proc_dointvec);
EXPORT_SYMBOL(proc_dointvec_jiffies);
EXPORT_SYMBOL(proc_dointvec_minmax);
EXPORT_SYMBOL(proc_dointvec_userhz_jiffies);
EXPORT_SYMBOL(proc_dointvec_ms_jiffies);
EXPORT_SYMBOL(proc_dostring);
EXPORT_SYMBOL(proc_doulongvec_minmax);
EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax);

/* sysctl strategy routines */
EXPORT_SYMBOL(sysctl_data);
EXPORT_SYMBOL(sysctl_intvec);
EXPORT_SYMBOL(sysctl_jiffies);
EXPORT_SYMBOL(sysctl_ms_jiffies);
EXPORT_SYMBOL(sysctl_string);

/* table registration */
EXPORT_SYMBOL(register_sysctl_table);
EXPORT_SYMBOL(unregister_sysctl_table);