2007-02-14 00:34:12 -08:00
/*
 * /proc/sys support
 */
2008-10-17 05:07:44 +04:00
# include <linux/init.h>
2007-02-14 00:34:12 -08:00
# include <linux/sysctl.h>
2011-11-02 13:39:22 -07:00
# include <linux/poll.h>
2007-02-14 00:34:12 -08:00
# include <linux/proc_fs.h>
# include <linux/security.h>
2011-01-07 17:49:57 +11:00
# include <linux/namei.h>
2007-02-14 00:34:12 -08:00
# include "internal.h"
2009-02-20 05:58:47 +00:00
static const struct dentry_operations proc_sys_dentry_operations ;
2007-02-14 00:34:12 -08:00
static const struct file_operations proc_sys_file_operations ;
2008-02-08 04:21:19 -08:00
static const struct inode_operations proc_sys_inode_operations ;
2008-07-15 08:54:06 -04:00
static const struct file_operations proc_sys_dir_file_operations ;
static const struct inode_operations proc_sys_dir_operations ;
2007-02-14 00:34:12 -08:00
2011-11-02 13:39:22 -07:00
void proc_sys_poll_notify ( struct ctl_table_poll * poll )
{
if ( ! poll )
return ;
atomic_inc ( & poll - > event ) ;
wake_up_interruptible ( & poll - > wait ) ;
}
2008-07-15 08:54:06 -04:00
static struct inode * proc_sys_make_inode ( struct super_block * sb ,
struct ctl_table_header * head , struct ctl_table * table )
2007-02-14 00:34:12 -08:00
{
struct inode * inode ;
2008-07-15 08:54:06 -04:00
struct proc_inode * ei ;
2007-02-14 00:34:12 -08:00
2008-07-15 08:54:06 -04:00
inode = new_inode ( sb ) ;
2007-02-14 00:34:12 -08:00
if ( ! inode )
goto out ;
2010-10-23 11:19:54 -04:00
inode - > i_ino = get_next_ino ( ) ;
2008-07-15 08:54:06 -04:00
sysctl_head_get ( head ) ;
2007-02-14 00:34:12 -08:00
ei = PROC_I ( inode ) ;
2008-07-15 08:54:06 -04:00
ei - > sysctl = head ;
ei - > sysctl_entry = table ;
2007-02-14 00:34:12 -08:00
inode - > i_mtime = inode - > i_atime = inode - > i_ctime = CURRENT_TIME ;
2008-07-15 08:54:06 -04:00
inode - > i_mode = table - > mode ;
if ( ! table - > child ) {
inode - > i_mode | = S_IFREG ;
inode - > i_op = & proc_sys_inode_operations ;
inode - > i_fop = & proc_sys_file_operations ;
} else {
inode - > i_mode | = S_IFDIR ;
2011-10-28 14:13:28 +02:00
clear_nlink ( inode ) ;
2008-07-15 08:54:06 -04:00
inode - > i_op = & proc_sys_dir_operations ;
inode - > i_fop = & proc_sys_dir_file_operations ;
}
2007-02-14 00:34:12 -08:00
out :
return inode ;
}
2008-07-15 08:54:06 -04:00
static struct ctl_table * find_in_table ( struct ctl_table * p , struct qstr * name )
2007-02-14 00:34:12 -08:00
{
int len ;
2009-04-03 03:18:02 -07:00
for ( ; p - > procname ; p + + ) {
2007-02-14 00:34:12 -08:00
2008-07-15 08:54:06 -04:00
if ( ! p - > procname )
2007-02-14 00:34:12 -08:00
continue ;
2008-07-15 08:54:06 -04:00
len = strlen ( p - > procname ) ;
2007-02-14 00:34:12 -08:00
if ( len ! = name - > len )
continue ;
2008-07-15 08:54:06 -04:00
if ( memcmp ( p - > procname , name - > name , len ) ! = 0 )
2007-02-14 00:34:12 -08:00
continue ;
/* I have a match */
2008-07-15 08:54:06 -04:00
return p ;
2007-02-14 00:34:12 -08:00
}
return NULL ;
}
2008-10-03 00:33:54 +04:00
static struct ctl_table_header * grab_header ( struct inode * inode )
2007-02-14 00:34:12 -08:00
{
2008-07-15 08:54:06 -04:00
if ( PROC_I ( inode ) - > sysctl )
return sysctl_head_grab ( PROC_I ( inode ) - > sysctl ) ;
else
return sysctl_head_next ( NULL ) ;
}
2007-02-14 00:34:12 -08:00
2008-07-15 08:54:06 -04:00
static struct dentry * proc_sys_lookup ( struct inode * dir , struct dentry * dentry ,
struct nameidata * nd )
{
struct ctl_table_header * head = grab_header ( dir ) ;
struct ctl_table * table = PROC_I ( dir ) - > sysctl_entry ;
struct ctl_table_header * h = NULL ;
struct qstr * name = & dentry - > d_name ;
struct ctl_table * p ;
struct inode * inode ;
struct dentry * err = ERR_PTR ( - ENOENT ) ;
2007-02-14 00:34:12 -08:00
2008-07-15 08:54:06 -04:00
if ( IS_ERR ( head ) )
return ERR_CAST ( head ) ;
2007-02-14 00:34:12 -08:00
2008-07-15 08:54:06 -04:00
if ( table & & ! table - > child ) {
WARN_ON ( 1 ) ;
goto out ;
2007-02-14 00:34:12 -08:00
}
2008-07-15 08:54:06 -04:00
table = table ? table - > child : head - > ctl_table ;
2007-02-14 00:34:12 -08:00
2008-07-15 08:54:06 -04:00
p = find_in_table ( table , name ) ;
if ( ! p ) {
for ( h = sysctl_head_next ( NULL ) ; h ; h = sysctl_head_next ( h ) ) {
if ( h - > attached_to ! = table )
continue ;
p = find_in_table ( h - > attached_by , name ) ;
if ( p )
break ;
}
2007-02-14 00:34:12 -08:00
}
2008-07-15 08:54:06 -04:00
if ( ! p )
2007-02-14 00:34:12 -08:00
goto out ;
err = ERR_PTR ( - ENOMEM ) ;
2008-07-15 08:54:06 -04:00
inode = proc_sys_make_inode ( dir - > i_sb , h ? h : head , p ) ;
if ( h )
sysctl_head_finish ( h ) ;
2007-02-14 00:34:12 -08:00
if ( ! inode )
goto out ;
err = NULL ;
2011-01-07 17:49:55 +11:00
d_set_d_op ( dentry , & proc_sys_dentry_operations ) ;
2007-02-14 00:34:12 -08:00
d_add ( dentry , inode ) ;
out :
sysctl_head_finish ( head ) ;
return err ;
}
sysctl: merge equal proc_sys_read and proc_sys_write
Many (most of) sysctls do not have a per-container sense. E.g.
kernel.print_fatal_signals, vm.panic_on_oom, net.core.netdev_budget and so on
and so forth. Besides, tuning them from inside a container is not even
secure. On the other hand, hiding them completely from the container's tasks
sometimes causes user-space to stop working.
When developing net sysctl, the common practice was to duplicate a table and
drop the write bits in table->mode, but this approach was not very elegant,
led to excessive memory consumption and was not suitable in general.
Here's the alternative solution. To facilitate the per-container sysctls
ctl_table_root-s were introduced. Each root contains a list of
ctl_table_header-s that are visible to different namespaces. The idea of this
set is to add the permissions() callback on the ctl_table_root to allow ctl
root limit permissions to the same ctl_table-s.
The main user of this functionality is the net-namespaces code, but later this
will (should) be used by more and more namespaces, containers and control
groups.
Actually, this idea's core is in a single hunk in the third patch. First two
patches are cleanups for sysctl code, while the third one mostly extends the
arguments set of some sysctl functions.
This patch:
These ->read and ->write callbacks act in a very similar way, so merge these
paths to reduce the number of places to patch later and shrink the .text size
(a bit).
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Acked-by: "David S. Miller" <davem@davemloft.net>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Alexey Dobriyan <adobriyan@sw.ru>
Cc: Denis V. Lunev <den@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-29 01:02:40 -07:00
/*
 * Common implementation of read and write on a sysctl file.
 *
 * @write selects the direction (non-zero for a write).  After a permission
 * check the entry's proc_handler is invoked with @buf, @count and @ppos.
 *
 * Returns the byte count reported back by the handler on success, -EPERM if
 * sysctl_perm() refuses the access, -EINVAL if the entry has no handler,
 * the handler's own error, or the error from grab_header().
 */
static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf,
		size_t count, loff_t *ppos, int write)
{
	struct inode *inode = filp->f_path.dentry->d_inode;
	struct ctl_table_header *head = grab_header(inode);
	struct ctl_table *table = PROC_I(inode)->sysctl_entry;
	size_t len = count;
	ssize_t ret;
	int op;

	if (IS_ERR(head))
		return PTR_ERR(head);

	/*
	 * At this point we know that the sysctl was not unregistered
	 * and won't be until we finish.
	 */
	if (write)
		op = MAY_WRITE;
	else
		op = MAY_READ;

	if (sysctl_perm(head->root, table, op)) {
		ret = -EPERM;
		goto out;
	}

	/* if that can happen at all, it should be -EINVAL, not -EISDIR */
	if (!table->proc_handler) {
		ret = -EINVAL;
		goto out;
	}

	/*
	 * careful: calling conventions are nasty here -- the handler returns
	 * zero on success and hands the byte count back through "len".
	 */
	ret = table->proc_handler(table, write, buf, &len, ppos);
	if (!ret)
		ret = len;

out:
	sysctl_head_finish(head);
	return ret;
}
sysctl: merge equal proc_sys_read and proc_sys_write
Many (most of) sysctls do not have a per-container sense. E.g.
kernel.print_fatal_signals, vm.panic_on_oom, net.core.netdev_budget and so on
and so forth. Besides, tuning them from inside a container is not even
secure. On the other hand, hiding them completely from the container's tasks
sometimes causes user-space to stop working.
When developing net sysctl, the common practice was to duplicate a table and
drop the write bits in table->mode, but this approach was not very elegant,
led to excessive memory consumption and was not suitable in general.
Here's the alternative solution. To facilitate the per-container sysctls
ctl_table_root-s were introduced. Each root contains a list of
ctl_table_header-s that are visible to different namespaces. The idea of this
set is to add the permissions() callback on the ctl_table_root to allow ctl
root limit permissions to the same ctl_table-s.
The main user of this functionality is the net-namespaces code, but later this
will (should) be used by more and more namespaces, containers and control
groups.
Actually, this idea's core is in a single hunk in the third patch. First two
patches are cleanups for sysctl code, while the third one mostly extends the
arguments set of some sysctl functions.
This patch:
These ->read and ->write callbacks act in a very similar way, so merge these
paths to reduce the number of places to patch later and shrink the .text size
(a bit).
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Acked-by: "David S. Miller" <davem@davemloft.net>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Alexey Dobriyan <adobriyan@sw.ru>
Cc: Denis V. Lunev <den@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-29 01:02:40 -07:00
/*
 * ->read for sysctl files: forwards to proc_sys_call_handler() with the
 * write flag cleared and returns its result.
 */
static ssize_t proc_sys_read(struct file *filp, char __user *buf,
				size_t count, loff_t *ppos)
{
	void __user *ubuf = (void __user *)buf;

	return proc_sys_call_handler(filp, ubuf, count, ppos, 0);
}
2007-02-14 00:34:12 -08:00
sysctl: merge equal proc_sys_read and proc_sys_write
Many (most of) sysctls do not have a per-container sense. E.g.
kernel.print_fatal_signals, vm.panic_on_oom, net.core.netdev_budget and so on
and so forth. Besides, tuning them from inside a container is not even
secure. On the other hand, hiding them completely from the container's tasks
sometimes causes user-space to stop working.
When developing net sysctl, the common practice was to duplicate a table and
drop the write bits in table->mode, but this approach was not very elegant,
led to excessive memory consumption and was not suitable in general.
Here's the alternative solution. To facilitate the per-container sysctls
ctl_table_root-s were introduced. Each root contains a list of
ctl_table_header-s that are visible to different namespaces. The idea of this
set is to add the permissions() callback on the ctl_table_root to allow ctl
root limit permissions to the same ctl_table-s.
The main user of this functionality is the net-namespaces code, but later this
will (should) be used by more and more namespaces, containers and control
groups.
Actually, this idea's core is in a single hunk in the third patch. First two
patches are cleanups for sysctl code, while the third one mostly extends the
arguments set of some sysctl functions.
This patch:
These ->read and ->write callbacks act in a very similar way, so merge these
paths to reduce the number of places to patch later and shrink the .text size
(a bit).
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Acked-by: "David S. Miller" <davem@davemloft.net>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Alexey Dobriyan <adobriyan@sw.ru>
Cc: Denis V. Lunev <den@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-29 01:02:40 -07:00
/*
 * ->write for sysctl files: forwards to proc_sys_call_handler() with the
 * write flag set and returns its result.  The const qualifier on @buf is
 * cast away because the common handler takes a plain user pointer.
 */
static ssize_t proc_sys_write(struct file *filp, const char __user *buf,
				size_t count, loff_t *ppos)
{
	void __user *ubuf = (void __user *)buf;

	return proc_sys_call_handler(filp, ubuf, count, ppos, 1);
}
2011-11-02 13:39:22 -07:00
static int proc_sys_open ( struct inode * inode , struct file * filp )
{
struct ctl_table * table = PROC_I ( inode ) - > sysctl_entry ;
if ( table - > poll )
filp - > private_data = proc_sys_poll_event ( table - > poll ) ;
return 0 ;
}
/*
 * ->poll for sysctl files.
 *
 * Entries without a proc_handler or without poll support report
 * DEFAULT_POLLMASK.  Otherwise the caller is registered on the entry's wait
 * queue, and if the event counter differs from the value remembered in
 * filp->private_data, the remembered value is refreshed and
 * POLLIN | POLLRDNORM | POLLERR | POLLPRI is reported.
 *
 * The header is held for the duration of the call so the table entry (and
 * its poll structure) cannot be unregistered while it is being used; if the
 * sysctl is already gone, POLLERR | POLLHUP is returned.
 */
static unsigned int proc_sys_poll(struct file *filp, poll_table *wait)
{
	struct inode *inode = filp->f_path.dentry->d_inode;
	struct ctl_table_header *head = grab_header(inode);
	struct ctl_table *table = PROC_I(inode)->sysctl_entry;
	unsigned int ret = DEFAULT_POLLMASK;
	unsigned long event;

	/* sysctl was unregistered */
	if (IS_ERR(head))
		return POLLERR | POLLHUP;

	if (!table->proc_handler)
		goto out;

	if (!table->poll)
		goto out;

	event = (unsigned long)filp->private_data;
	poll_wait(filp, &table->poll->wait, wait);

	if (event != atomic_read(&table->poll->event)) {
		filp->private_data = proc_sys_poll_event(table->poll);
		ret = POLLIN | POLLRDNORM | POLLERR | POLLPRI;
	}

out:
	sysctl_head_finish(head);

	return ret;
}
2007-02-14 00:34:12 -08:00
static int proc_sys_fill_cache ( struct file * filp , void * dirent ,
2008-07-15 08:54:06 -04:00
filldir_t filldir ,
struct ctl_table_header * head ,
struct ctl_table * table )
2007-02-14 00:34:12 -08:00
{
struct dentry * child , * dir = filp - > f_path . dentry ;
struct inode * inode ;
struct qstr qname ;
ino_t ino = 0 ;
unsigned type = DT_UNKNOWN ;
qname . name = table - > procname ;
qname . len = strlen ( table - > procname ) ;
qname . hash = full_name_hash ( qname . name , qname . len ) ;
child = d_lookup ( dir , & qname ) ;
if ( ! child ) {
2008-07-15 08:54:06 -04:00
child = d_alloc ( dir , & qname ) ;
if ( child ) {
inode = proc_sys_make_inode ( dir - > d_sb , head , table ) ;
if ( ! inode ) {
dput ( child ) ;
return - ENOMEM ;
} else {
2011-01-07 17:49:55 +11:00
d_set_d_op ( child , & proc_sys_dentry_operations ) ;
2008-07-15 08:54:06 -04:00
d_add ( child , inode ) ;
2007-02-14 00:34:12 -08:00
}
2008-07-15 08:54:06 -04:00
} else {
return - ENOMEM ;
2007-02-14 00:34:12 -08:00
}
}
inode = child - > d_inode ;
2008-07-15 08:54:06 -04:00
ino = inode - > i_ino ;
type = inode - > i_mode > > 12 ;
2007-02-14 00:34:12 -08:00
dput ( child ) ;
2008-07-15 08:54:06 -04:00
return ! ! filldir ( dirent , qname . name , qname . len , filp - > f_pos , ino , type ) ;
}
/*
 * Emit directory entries for every entry of @table (which belongs to
 * @head), resuming at the file position.
 *
 * @pos is the running index of the entry being considered; it is advanced
 * once per table entry and is shared across successive calls so several
 * tables can be listed back to back.  Entries whose index is below
 * file->f_pos are skipped, and f_pos is moved past each entry that was
 * emitted successfully.
 *
 * Returns 0 when the whole table was processed, or the first non-zero
 * value returned by proc_sys_fill_cache().
 */
static int scan(struct ctl_table_header *head, ctl_table *table,
		unsigned long *pos, struct file *file,
		void *dirent, filldir_t filldir)
{
	/* The loop condition already stops at the NULL-procname terminator. */
	for (; table->procname; table++, (*pos)++) {
		int res;

		if (*pos < file->f_pos)
			continue;

		res = proc_sys_fill_cache(file, dirent, filldir, head, table);
		if (res)
			return res;

		file->f_pos = *pos + 1;
	}

	return 0;
}
/*
 * ->readdir for /proc/sys directories.
 *
 * Emits "." and "..", then the entries of the directory's own table,
 * then the entries of every other header attached to that table.
 *
 * Returns the error from grab_header(), -EINVAL if the inode's entry has no
 * child table, 0 if emitting "." or ".." was refused by @filldir, the
 * non-zero result of scan() on the directory's own table, and 1 otherwise.
 */
static int proc_sys_readdir(struct file *filp, void *dirent, filldir_t filldir)
{
	struct dentry *dentry = filp->f_path.dentry;
	struct inode *inode = dentry->d_inode;
	struct ctl_table_header *head = grab_header(inode);
	struct ctl_table *table = PROC_I(inode)->sysctl_entry;
	struct ctl_table_header *extra;
	unsigned long pos = 2;	/* index of the first real entry, after "." and ".." */
	int ret;

	if (IS_ERR(head))
		return PTR_ERR(head);

	if (table && !table->child) {
		WARN_ON(1);
		ret = -EINVAL;
		goto out;
	}

	if (table)
		table = table->child;
	else
		table = head->ctl_table;

	ret = 0;

	/* Avoid a switch here: arm builds fail with missing __cmpdi2 */
	if (filp->f_pos == 0) {
		if (filldir(dirent, ".", 1, filp->f_pos,
			    inode->i_ino, DT_DIR) < 0)
			goto out;
		filp->f_pos++;
	}
	if (filp->f_pos == 1) {
		if (filldir(dirent, "..", 2, filp->f_pos,
			    parent_ino(dentry), DT_DIR) < 0)
			goto out;
		filp->f_pos++;
	}

	ret = scan(head, table, &pos, filp, dirent, filldir);
	if (ret)
		goto out;

	for (extra = sysctl_head_next(NULL); extra;
	     extra = sysctl_head_next(extra)) {
		if (extra->attached_to != table)
			continue;
		ret = scan(extra, extra->attached_by, &pos, filp, dirent, filldir);
		if (ret) {
			/* Leaving the walk early: drop the header we still hold. */
			sysctl_head_finish(extra);
			break;
		}
	}
	/*
	 * NOTE(review): this overwrites whatever scan() returned in the loop
	 * above, including -ENOMEM -- confirm that is intended.
	 */
	ret = 1;

out:
	sysctl_head_finish(head);
	return ret;
}
2011-06-20 19:28:19 -04:00
static int proc_sys_permission ( struct inode * inode , int mask )
2007-02-14 00:34:12 -08:00
{
/*
* sysctl entries that are not writeable ,
* are _NOT_ writeable , capabilities or not .
*/
2008-07-31 13:41:58 +02:00
struct ctl_table_header * head ;
struct ctl_table * table ;
2007-02-14 00:34:12 -08:00
int error ;
2008-07-31 13:41:58 +02:00
/* Executable files are not allowed under /proc/sys/ */
if ( ( mask & MAY_EXEC ) & & S_ISREG ( inode - > i_mode ) )
return - EACCES ;
head = grab_header ( inode ) ;
2008-07-15 08:54:06 -04:00
if ( IS_ERR ( head ) )
return PTR_ERR ( head ) ;
2007-02-14 00:34:12 -08:00
2008-07-31 13:41:58 +02:00
table = PROC_I ( inode ) - > sysctl_entry ;
2008-07-15 08:54:06 -04:00
if ( ! table ) /* global root - r-xr-xr-x */
error = mask & MAY_WRITE ? - EACCES : 0 ;
else /* Use the permissions on the sysctl table entry */
2011-06-20 18:59:02 -04:00
error = sysctl_perm ( head - > root , table , mask & ~ MAY_NOT_BLOCK ) ;
2007-02-14 00:34:12 -08:00
sysctl_head_finish ( head ) ;
return error ;
}
static int proc_sys_setattr ( struct dentry * dentry , struct iattr * attr )
{
struct inode * inode = dentry - > d_inode ;
int error ;
if ( attr - > ia_valid & ( ATTR_MODE | ATTR_UID | ATTR_GID ) )
return - EPERM ;
error = inode_change_ok ( inode , attr ) ;
2010-06-04 11:30:02 +02:00
if ( error )
return error ;
if ( ( attr - > ia_valid & ATTR_SIZE ) & &
attr - > ia_size ! = i_size_read ( inode ) ) {
error = vmtruncate ( inode , attr - > ia_size ) ;
if ( error )
return error ;
}
2007-02-14 00:34:12 -08:00
2010-06-04 11:30:02 +02:00
setattr_copy ( inode , attr ) ;
mark_inode_dirty ( inode ) ;
return 0 ;
2007-02-14 00:34:12 -08:00
}
2008-07-15 08:54:06 -04:00
static int proc_sys_getattr ( struct vfsmount * mnt , struct dentry * dentry , struct kstat * stat )
{
struct inode * inode = dentry - > d_inode ;
struct ctl_table_header * head = grab_header ( inode ) ;
struct ctl_table * table = PROC_I ( inode ) - > sysctl_entry ;
if ( IS_ERR ( head ) )
return PTR_ERR ( head ) ;
generic_fillattr ( inode , stat ) ;
if ( table )
stat - > mode = ( stat - > mode & S_IFMT ) | table - > mode ;
sysctl_head_finish ( head ) ;
return 0 ;
}
2007-02-14 00:34:12 -08:00
static const struct file_operations proc_sys_file_operations = {
2011-11-02 13:39:22 -07:00
. open = proc_sys_open ,
. poll = proc_sys_poll ,
2007-02-14 00:34:12 -08:00
. read = proc_sys_read ,
. write = proc_sys_write ,
llseek: automatically add .llseek fop
All file_operations should get a .llseek operation so we can make
nonseekable_open the default for future file operations without a
.llseek pointer.
The three cases that we can automatically detect are no_llseek, seq_lseek
and default_llseek. For cases where we can we can automatically prove that
the file offset is always ignored, we use noop_llseek, which maintains
the current behavior of not returning an error from a seek.
New drivers should normally not use noop_llseek but instead use no_llseek
and call nonseekable_open at open time. Existing drivers can be converted
to do the same when the maintainer knows for certain that no user code
relies on calling seek on the device file.
The generated code is often incorrectly indented and right now contains
comments that clarify for each added line why a specific variant was
chosen. In the version that gets submitted upstream, the comments will
be gone and I will manually fix the indentation, because there does not
seem to be a way to do that using coccinelle.
Some amount of new code is currently sitting in linux-next that should get
the same modifications, which I will do at the end of the merge window.
Many thanks to Julia Lawall for helping me learn to write a semantic
patch that does all this.
===== begin semantic patch =====
// This adds an llseek= method to all file operations,
// as a preparation for making no_llseek the default.
//
// The rules are
// - use no_llseek explicitly if we do nonseekable_open
// - use seq_lseek for sequential files
// - use default_llseek if we know we access f_pos
// - use noop_llseek if we know we don't access f_pos,
// but we still want to allow users to call lseek
//
@ open1 exists @
identifier nested_open;
@@
nested_open(...)
{
<+...
nonseekable_open(...)
...+>
}
@ open exists@
identifier open_f;
identifier i, f;
identifier open1.nested_open;
@@
int open_f(struct inode *i, struct file *f)
{
<+...
(
nonseekable_open(...)
|
nested_open(...)
)
...+>
}
@ read disable optional_qualifier exists @
identifier read_f;
identifier f, p, s, off;
type ssize_t, size_t, loff_t;
expression E;
identifier func;
@@
ssize_t read_f(struct file *f, char *p, size_t s, loff_t *off)
{
<+...
(
*off = E
|
*off += E
|
func(..., off, ...)
|
E = *off
)
...+>
}
@ read_no_fpos disable optional_qualifier exists @
identifier read_f;
identifier f, p, s, off;
type ssize_t, size_t, loff_t;
@@
ssize_t read_f(struct file *f, char *p, size_t s, loff_t *off)
{
... when != off
}
@ write @
identifier write_f;
identifier f, p, s, off;
type ssize_t, size_t, loff_t;
expression E;
identifier func;
@@
ssize_t write_f(struct file *f, const char *p, size_t s, loff_t *off)
{
<+...
(
*off = E
|
*off += E
|
func(..., off, ...)
|
E = *off
)
...+>
}
@ write_no_fpos @
identifier write_f;
identifier f, p, s, off;
type ssize_t, size_t, loff_t;
@@
ssize_t write_f(struct file *f, const char *p, size_t s, loff_t *off)
{
... when != off
}
@ fops0 @
identifier fops;
@@
struct file_operations fops = {
...
};
@ has_llseek depends on fops0 @
identifier fops0.fops;
identifier llseek_f;
@@
struct file_operations fops = {
...
.llseek = llseek_f,
...
};
@ has_read depends on fops0 @
identifier fops0.fops;
identifier read_f;
@@
struct file_operations fops = {
...
.read = read_f,
...
};
@ has_write depends on fops0 @
identifier fops0.fops;
identifier write_f;
@@
struct file_operations fops = {
...
.write = write_f,
...
};
@ has_open depends on fops0 @
identifier fops0.fops;
identifier open_f;
@@
struct file_operations fops = {
...
.open = open_f,
...
};
// use no_llseek if we call nonseekable_open
////////////////////////////////////////////
@ nonseekable1 depends on !has_llseek && has_open @
identifier fops0.fops;
identifier nso ~= "nonseekable_open";
@@
struct file_operations fops = {
... .open = nso, ...
+.llseek = no_llseek, /* nonseekable */
};
@ nonseekable2 depends on !has_llseek @
identifier fops0.fops;
identifier open.open_f;
@@
struct file_operations fops = {
... .open = open_f, ...
+.llseek = no_llseek, /* open uses nonseekable */
};
// use seq_lseek for sequential files
/////////////////////////////////////
@ seq depends on !has_llseek @
identifier fops0.fops;
identifier sr ~= "seq_read";
@@
struct file_operations fops = {
... .read = sr, ...
+.llseek = seq_lseek, /* we have seq_read */
};
// use default_llseek if there is a readdir
///////////////////////////////////////////
@ fops1 depends on !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier readdir_e;
@@
// any other fop is used that changes pos
struct file_operations fops = {
... .readdir = readdir_e, ...
+.llseek = default_llseek, /* readdir is present */
};
// use default_llseek if at least one of read/write touches f_pos
/////////////////////////////////////////////////////////////////
@ fops2 depends on !fops1 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier read.read_f;
@@
// read fops use offset
struct file_operations fops = {
... .read = read_f, ...
+.llseek = default_llseek, /* read accesses f_pos */
};
@ fops3 depends on !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier write.write_f;
@@
// write fops use offset
struct file_operations fops = {
... .write = write_f, ...
+ .llseek = default_llseek, /* write accesses f_pos */
};
// Use noop_llseek if neither read nor write accesses f_pos
///////////////////////////////////////////////////////////
@ fops4 depends on !fops1 && !fops2 && !fops3 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier read_no_fpos.read_f;
identifier write_no_fpos.write_f;
@@
// write fops use offset
struct file_operations fops = {
...
.write = write_f,
.read = read_f,
...
+.llseek = noop_llseek, /* read and write both use no f_pos */
};
@ depends on has_write && !has_read && !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier write_no_fpos.write_f;
@@
struct file_operations fops = {
... .write = write_f, ...
+.llseek = noop_llseek, /* write uses no f_pos */
};
@ depends on has_read && !has_write && !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
identifier read_no_fpos.read_f;
@@
struct file_operations fops = {
... .read = read_f, ...
+.llseek = noop_llseek, /* read uses no f_pos */
};
@ depends on !has_read && !has_write && !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @
identifier fops0.fops;
@@
struct file_operations fops = {
...
+.llseek = noop_llseek, /* no read or write fn */
};
===== End semantic patch =====
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: Julia Lawall <julia@diku.dk>
Cc: Christoph Hellwig <hch@infradead.org>
2010-08-15 18:52:59 +02:00
. llseek = default_llseek ,
2008-07-15 08:54:06 -04:00
} ;
static const struct file_operations proc_sys_dir_file_operations = {
2011-11-02 13:38:42 -07:00
. read = generic_read_dir ,
2007-02-14 00:34:12 -08:00
. readdir = proc_sys_readdir ,
2008-09-03 21:53:01 +02:00
. llseek = generic_file_llseek ,
2007-02-14 00:34:12 -08:00
} ;
2008-02-08 04:21:19 -08:00
static const struct inode_operations proc_sys_inode_operations = {
2008-07-15 08:54:06 -04:00
. permission = proc_sys_permission ,
. setattr = proc_sys_setattr ,
. getattr = proc_sys_getattr ,
} ;
static const struct inode_operations proc_sys_dir_operations = {
2007-02-14 00:34:12 -08:00
. lookup = proc_sys_lookup ,
. permission = proc_sys_permission ,
. setattr = proc_sys_setattr ,
2008-07-15 08:54:06 -04:00
. getattr = proc_sys_getattr ,
2007-02-14 00:34:12 -08:00
} ;
static int proc_sys_revalidate ( struct dentry * dentry , struct nameidata * nd )
{
2011-01-07 17:49:57 +11:00
if ( nd - > flags & LOOKUP_RCU )
return - ECHILD ;
2008-07-15 08:54:06 -04:00
return ! PROC_I ( dentry - > d_inode ) - > sysctl - > unregistering ;
}
2011-01-07 17:49:23 +11:00
static int proc_sys_delete ( const struct dentry * dentry )
2008-07-15 08:54:06 -04:00
{
return ! ! PROC_I ( dentry - > d_inode ) - > sysctl - > unregistering ;
}
2011-01-07 17:49:27 +11:00
static int proc_sys_compare ( const struct dentry * parent ,
const struct inode * pinode ,
const struct dentry * dentry , const struct inode * inode ,
unsigned int len , const char * str , const struct qstr * name )
2008-07-15 08:54:06 -04:00
{
2011-03-08 01:25:28 -05:00
struct ctl_table_header * head ;
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lookup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encountered
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we can drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 17:49:52 +11:00
/* Although proc doesn't have negative dentries, rcu-walk means
* that inode here can be NULL */
2011-03-08 01:25:28 -05:00
/* AV: can it, indeed? */
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the
ancestor dentry elements. This is called rcu-walk, as opposed to the current
algorithm which is a refcount based walk, or ref-walk.
This results in far fewer atomic operations on every path element,
significantly improving path lookup performance. It also avoids cacheline
bouncing on common dentries, significantly improving scalability.
The overall design is like this:
* LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk.
* Take the RCU lock for the entire path walk, starting with the acquiring
of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are
not required for dentry persistence.
* synchronize_rcu is called when unregistering a filesystem, so we can
access d_ops and i_ops during rcu-walk.
* Similarly take the vfsmount lock for the entire path walk. So now mnt
refcounts are not required for persistence. Also we are free to perform mount
lookups, and to assume dentry mount points and mount roots are stable up and
down the path.
* Have a per-dentry seqlock to protect the dentry name, parent, and inode,
so we can load this tuple atomically, and also check whether any of its
members have changed.
* Dentry lookups (based on parent, candidate string tuple) recheck the parent
sequence after the child is found in case anything changed in the parent
during the path walk.
* inode is also RCU protected so we can load d_inode and use the inode for
limited things.
* i_mode, i_uid, i_gid can be tested for exec permissions during path walk.
* i_op can be loaded.
When we reach the destination dentry, we lock it, recheck lookup sequence,
and increment its refcount and mountpoint refcount. RCU and vfsmount locks
are dropped. This is termed "dropping rcu-walk". If the dentry refcount does
not match, we can not drop rcu-walk gracefully at the current point in the
lookup, so instead return -ECHILD (for want of a better errno). This signals the
path walking code to re-do the entire lookup with a ref-walk.
Aside from the final dentry, there are other situations that may be encountered
where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take
a reference on the last good dentry) and continue with a ref-walk. Again, if
we cannot drop rcu-walk gracefully, we return -ECHILD and do the whole lookup
using ref-walk. But it is very important that we can continue with ref-walk
for most cases, particularly to avoid the overhead of double lookups, and to
gain the scalability advantages on common path elements (like cwd and root).
The cases where rcu-walk cannot continue are:
* NULL dentry (ie. any uncached path element)
* parent with d_inode->i_op->permission or ACLs
* dentries with d_revalidate
* Following links
In future patches, permission checks and d_revalidate become rcu-walk aware. It
may be possible eventually to make following links rcu-walk aware.
Uncached path elements will always require dropping to ref-walk mode, at the
very least because i_mutex needs to be grabbed, and objects allocated.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
2011-01-07 17:49:52 +11:00
if ( ! inode )
2011-03-08 01:25:28 -05:00
return 1 ;
2011-01-07 17:49:27 +11:00
if ( name - > len ! = len )
2008-07-15 08:54:06 -04:00
return 1 ;
2011-01-07 17:49:27 +11:00
if ( memcmp ( name - > name , str , len ) )
2008-07-15 08:54:06 -04:00
return 1 ;
2011-03-08 01:25:28 -05:00
head = rcu_dereference ( PROC_I ( inode ) - > sysctl ) ;
return ! head | | ! sysctl_is_seen ( head ) ;
2007-02-14 00:34:12 -08:00
}
2009-02-20 05:58:47 +00:00
static const struct dentry_operations proc_sys_dentry_operations = {
2007-02-14 00:34:12 -08:00
. d_revalidate = proc_sys_revalidate ,
2008-07-15 08:54:06 -04:00
. d_delete = proc_sys_delete ,
. d_compare = proc_sys_compare ,
2007-02-14 00:34:12 -08:00
} ;
2008-10-17 05:07:44 +04:00
/*
 * Create the "sys" directory at the root of procfs and attach the sysctl
 * directory inode and file operations to it.
 *
 * Returns 0 on success, or -ENOMEM if the directory entry could not be
 * created.
 */
int __init proc_sys_init(void)
{
	struct proc_dir_entry *proc_sys_root;

	proc_sys_root = proc_mkdir("sys", NULL);
	/* proc_mkdir() yields NULL on failure; never dereference that. */
	if (!proc_sys_root)
		return -ENOMEM;

	proc_sys_root->proc_iops = &proc_sys_dir_operations;
	proc_sys_root->proc_fops = &proc_sys_dir_file_operations;
	proc_sys_root->nlink = 0;

	return 0;
}