2005-04-16 15:20:36 -07:00
/*
 * linux/ipc/util.h
* Copyright ( C ) 1999 Christoph Rohland
*
2006-01-15 02:43:54 +01:00
 * ipc helper functions (c) 1999 Manfred Spraul <manfred@colorfullife.com>
2006-10-02 02:18:20 -07:00
* namespaces support . 2006 OpenVZ , SWsoft Inc .
 *            Pavel Emelianov <xemul@openvz.org>
2005-04-16 15:20:36 -07:00
*/
# ifndef _IPC_UTIL_H
# define _IPC_UTIL_H
2009-06-20 02:23:29 +02:00
# include <linux/unistd.h>
2007-10-18 23:40:51 -07:00
# include <linux/err.h>
2007-10-18 23:40:48 -07:00
2005-04-16 15:20:36 -07:00
# define SEQ_MULTIPLIER (IPCMNI)
void sem_init ( void ) ;
void msg_init ( void ) ;
void shm_init ( void ) ;
2008-02-08 04:18:22 -08:00
struct ipc_namespace ;
2009-04-06 19:01:08 -07:00
# ifdef CONFIG_POSIX_MQUEUE
namespaces: ipc namespaces: implement support for posix msqueues
Implement multiple mounts of the mqueue file system, and link it to usage
of CLONE_NEWIPC.
Each ipc ns has a corresponding mqueuefs superblock. When a user does
clone(CLONE_NEWIPC) or unshare(CLONE_NEWIPC), the unshare will cause an
internal mount of a new mqueuefs sb linked to the new ipc ns.
When a user does 'mount -t mqueue mqueue /dev/mqueue', he mounts the
mqueuefs superblock.
Posix message queues can be worked with both through the mq_* system calls
(see mq_overview(7)), and through the VFS through the mqueue mount. Any
usage of mq_open() and friends will work with the acting task's ipc
namespace. Any actions through the VFS will work with the mqueuefs in
which the file was created. So if a user doesn't remount mqueuefs after
unshare(CLONE_NEWIPC), mq_open("/ab") will not be reflected in "ls
/dev/mqueue".
If task a mounts mqueue for ipc_ns:1, then clones task b with a new ipcns,
ipcns:2, and then task a is the last task in ipc_ns:1 to exit, then (1)
ipc_ns:1 will be freed, (2) its superblock will live on until task b
umounts the corresponding mqueuefs, and vfs actions will continue to
succeed, but (3) sb->s_fs_info will be NULL for the sb corresponding to
the deceased ipc_ns:1.
To make this happen, we must protect the ipc reference count when
a) a task exits and drops its ipcns->count, since it might be dropping
it to 0 and freeing the ipcns
b) a task accesses the ipcns through its mqueuefs interface, since it
bumps the ipcns refcount and might race with the last task in the ipcns
exiting.
So the kref is changed to an atomic_t so we can use
atomic_dec_and_lock(&ns->count,mq_lock), and every access to the ipcns
through ns = mqueuefs_sb->s_fs_info is protected by the same lock.
Signed-off-by: Cedric Le Goater <clg@fr.ibm.com>
Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-04-06 19:01:10 -07:00
extern void mq_clear_sbinfo ( struct ipc_namespace * ns ) ;
extern void mq_put_mnt ( struct ipc_namespace * ns ) ;
2009-04-06 19:01:08 -07:00
# else
namespaces: ipc namespaces: implement support for posix msqueues
Implement multiple mounts of the mqueue file system, and link it to usage
of CLONE_NEWIPC.
Each ipc ns has a corresponding mqueuefs superblock. When a user does
clone(CLONE_NEWIPC) or unshare(CLONE_NEWIPC), the unshare will cause an
internal mount of a new mqueuefs sb linked to the new ipc ns.
When a user does 'mount -t mqueue mqueue /dev/mqueue', he mounts the
mqueuefs superblock.
Posix message queues can be worked with both through the mq_* system calls
(see mq_overview(7)), and through the VFS through the mqueue mount. Any
usage of mq_open() and friends will work with the acting task's ipc
namespace. Any actions through the VFS will work with the mqueuefs in
which the file was created. So if a user doesn't remount mqueuefs after
unshare(CLONE_NEWIPC), mq_open("/ab") will not be reflected in "ls
/dev/mqueue".
If task a mounts mqueue for ipc_ns:1, then clones task b with a new ipcns,
ipcns:2, and then task a is the last task in ipc_ns:1 to exit, then (1)
ipc_ns:1 will be freed, (2) its superblock will live on until task b
umounts the corresponding mqueuefs, and vfs actions will continue to
succeed, but (3) sb->s_fs_info will be NULL for the sb corresponding to
the deceased ipc_ns:1.
To make this happen, we must protect the ipc reference count when
a) a task exits and drops its ipcns->count, since it might be dropping
it to 0 and freeing the ipcns
b) a task accesses the ipcns through its mqueuefs interface, since it
bumps the ipcns refcount and might race with the last task in the ipcns
exiting.
So the kref is changed to an atomic_t so we can use
atomic_dec_and_lock(&ns->count,mq_lock), and every access to the ipcns
through ns = mqueuefs_sb->s_fs_info is protected by the same lock.
Signed-off-by: Cedric Le Goater <clg@fr.ibm.com>
Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-04-06 19:01:10 -07:00
/* No-op stand-ins, compiled when CONFIG_POSIX_MQUEUE is not set. */
static inline void mq_clear_sbinfo(struct ipc_namespace *ns)
{
}

static inline void mq_put_mnt(struct ipc_namespace *ns)
{
}
2009-04-06 19:01:08 -07:00
# endif
# ifdef CONFIG_SYSVIPC
2008-02-08 04:18:57 -08:00
void sem_init_ns ( struct ipc_namespace * ns ) ;
void msg_init_ns ( struct ipc_namespace * ns ) ;
void shm_init_ns ( struct ipc_namespace * ns ) ;
2006-10-02 02:18:20 -07:00
void sem_exit_ns ( struct ipc_namespace * ns ) ;
void msg_exit_ns ( struct ipc_namespace * ns ) ;
void shm_exit_ns ( struct ipc_namespace * ns ) ;
2009-04-06 19:01:08 -07:00
# else
/*
 * No-op stand-ins, compiled when CONFIG_SYSVIPC is not set, so that
 * callers need no #ifdef of their own.
 */
static inline void sem_init_ns(struct ipc_namespace *ns)
{
}

static inline void msg_init_ns(struct ipc_namespace *ns)
{
}

static inline void shm_init_ns(struct ipc_namespace *ns)
{
}

static inline void sem_exit_ns(struct ipc_namespace *ns)
{
}

static inline void msg_exit_ns(struct ipc_namespace *ns)
{
}

static inline void shm_exit_ns(struct ipc_namespace *ns)
{
}
# endif
2006-10-02 02:18:20 -07:00
2007-10-18 23:40:49 -07:00
/*
* Structure that holds the parameters needed by the ipc operations
* ( see after )
*/
struct ipc_params {
	key_t key;	/* presumably the ipc key being looked up - confirm against callers */
	int flg;	/* presumably the flags argument of the *get() call - confirm */

	/* Holds the one parameter that is specific to the getnew() routine. */
	union {
		size_t size;	/* for shared memories */
		int nsems;	/* for semaphores */
	} u;
};
/*
* Structure that holds some ipc operations . This structure is used to unify
* the calls to sys_msgget ( ) , sys_semget ( ) , sys_shmget ( )
* . routine to call to create a new ipc object . Can be one of newque ,
* newary , newseg
2007-10-18 23:40:53 -07:00
* . routine to call to check permissions for a new ipc object .
2007-10-18 23:40:49 -07:00
* Can be one of security_msg_associate , security_sem_associate ,
* security_shm_associate
* . routine to call for an extra check if needed
*/
struct ipc_ops {
	/*
	 * Creates a new ipc object; one of newque, newary or newseg.
	 */
	int (*getnew)(struct ipc_namespace *, struct ipc_params *);

	/*
	 * Checks permissions for a new ipc object; one of
	 * security_msg_associate, security_sem_associate or
	 * security_shm_associate.
	 */
	int (*associate)(struct kern_ipc_perm *, int);

	/*
	 * Performs an extra check, if one is needed.
	 */
	int (*more_checks)(struct kern_ipc_perm *, struct ipc_params *);
};
2005-09-06 15:17:09 -07:00
struct seq_file ;
2008-02-08 04:18:57 -08:00
struct ipc_ids ;
2007-07-15 23:40:58 -07:00
2007-10-18 23:40:48 -07:00
void ipc_init_ids ( struct ipc_ids * ) ;
2005-09-06 15:17:09 -07:00
# ifdef CONFIG_PROC_FS
void __init ipc_init_proc_interface ( const char * path , const char * header ,
2006-10-02 02:18:20 -07:00
int ids , int ( * show ) ( struct seq_file * , void * ) ) ;
2005-09-06 15:17:09 -07:00
# else
# define ipc_init_proc_interface(path, header, ids, show) do {} while (0)
# endif
2005-04-16 15:20:36 -07:00
2006-10-02 02:18:20 -07:00
# define IPC_SEM_IDS 0
# define IPC_MSG_IDS 1
# define IPC_SHM_IDS 2
2007-10-18 23:40:52 -07:00
/* Slot-index part of an ipc id: the remainder of id modulo SEQ_MULTIPLIER. */
# define ipcid_to_idx(id) ((id) % SEQ_MULTIPLIER)
2013-01-04 15:34:50 -08:00
/* Sequence part of an ipc id: the quotient of id divided by SEQ_MULTIPLIER. */
# define ipcid_to_seqx(id) ((id) / SEQ_MULTIPLIER)
2007-10-18 23:40:52 -07:00
2007-10-18 23:40:54 -07:00
/* must be called with ids->rw_mutex acquired for writing */
2007-10-18 23:40:48 -07:00
int ipc_addid ( struct ipc_ids * , struct kern_ipc_perm * , int ) ;
2007-10-18 23:40:54 -07:00
/* must be called with ids->rw_mutex acquired for reading */
2007-10-18 23:40:48 -07:00
int ipc_get_maxid ( struct ipc_ids * ) ;
2005-04-16 15:20:36 -07:00
/* must be called with both locks acquired. */
2007-10-18 23:40:48 -07:00
void ipc_rmid ( struct ipc_ids * , struct kern_ipc_perm * ) ;
2005-04-16 15:20:36 -07:00
2007-10-18 23:40:53 -07:00
/* must be called with ipcp locked */
2011-03-23 16:43:24 -07:00
int ipcperms ( struct ipc_namespace * ns , struct kern_ipc_perm * ipcp , short flg ) ;
2005-04-16 15:20:36 -07:00
/*
 * For rare, potentially huge allocations.
 * Both functions can sleep.
 */
void * ipc_alloc ( int size ) ;
void ipc_free ( void * ptr , int size ) ;
/*
 * For allocations that need to be freed by RCU.
 * Objects are reference counted; they start with reference count 1.
 * getref increases the refcount; the putref call that reduces the refcount
 * to 0 schedules the RCU destruction. Caller must guarantee locking.
 */
void * ipc_rcu_alloc ( int size ) ;
2013-04-30 19:15:44 -07:00
int ipc_rcu_getref ( void * ptr ) ;
2005-04-16 15:20:36 -07:00
void ipc_rcu_putref ( void * ptr ) ;
2007-10-18 23:40:51 -07:00
struct kern_ipc_perm * ipc_lock ( struct ipc_ids * , int ) ;
2013-04-30 19:15:19 -07:00
struct kern_ipc_perm * ipc_obtain_object ( struct ipc_ids * ids , int id ) ;
2005-04-16 15:20:36 -07:00
void kernel_to_ipc64_perm ( struct kern_ipc_perm * in , struct ipc64_perm * out ) ;
void ipc64_perm_to_ipc_perm ( struct ipc64_perm * in , struct ipc_perm * out ) ;
2012-02-07 16:54:11 -08:00
int ipc_update_perm ( struct ipc64_perm * in , struct kern_ipc_perm * out ) ;
2013-04-30 19:15:24 -07:00
struct kern_ipc_perm * ipcctl_pre_down_nolock ( struct ipc_namespace * ns ,
struct ipc_ids * ids , int id , int cmd ,
struct ipc64_perm * perm , int extra_perm ) ;
2005-04-16 15:20:36 -07:00
2012-07-30 14:42:46 -07:00
# ifndef CONFIG_ARCH_WANT_IPC_PARSE_VERSION
2005-04-16 15:20:36 -07:00
/*
 * Architectures that do not select CONFIG_ARCH_WANT_IPC_PARSE_VERSION
 * (e.g. IA-64) always use the "64-bit version" of the IPC structures.
 */
# define ipc_parse_version(cmd) IPC_64
# else
int ipc_parse_version ( int * cmd ) ;
# endif
extern void free_msg ( struct msg_msg * msg ) ;
extern struct msg_msg * load_msg ( const void __user * src , int len ) ;
2013-01-04 15:34:55 -08:00
extern struct msg_msg * copy_msg ( struct msg_msg * src , struct msg_msg * dst ) ;
2005-04-16 15:20:36 -07:00
extern int store_msg ( void __user * dest , struct msg_msg * msg , int len ) ;
2007-10-18 23:40:49 -07:00
2008-04-29 01:00:42 -07:00
extern void recompute_msgmni ( struct ipc_namespace * ) ;
2007-10-18 23:40:55 -07:00
static inline int ipc_buildid ( int id , int seq )
2007-10-18 23:40:52 -07:00
{
return SEQ_MULTIPLIER * seq + id ;
}
2007-10-18 23:40:55 -07:00
static inline int ipc_checkid ( struct kern_ipc_perm * ipcp , int uid )
2007-10-18 23:40:51 -07:00
{
2013-04-30 19:15:14 -07:00
return uid / SEQ_MULTIPLIER ! = ipcp - > seq ;
2007-10-18 23:40:51 -07:00
}
2013-07-08 16:01:10 -07:00
static inline void ipc_lock_object ( struct kern_ipc_perm * perm )
2007-10-18 23:40:51 -07:00
{
spin_lock ( & perm - > lock ) ;
}
2013-07-08 16:01:10 -07:00
static inline void ipc_unlock_object ( struct kern_ipc_perm * perm )
2007-10-18 23:40:51 -07:00
{
spin_unlock ( & perm - > lock ) ;
}
2013-07-08 16:01:10 -07:00
static inline void ipc_assert_locked_object ( struct kern_ipc_perm * perm )
ipc,sem: do not hold ipc lock more than necessary
Instead of holding the ipc lock for permissions and security checks, among
others, only acquire it when necessary.
Some numbers....
1) With Rik's semop-multi.c microbenchmark we can see the following
results:
Baseline (3.9-rc1):
cpus 4, threads: 256, semaphores: 128, test duration: 30 secs
total operations: 151452270, ops/sec 5048409
+ 59.40% a.out [kernel.kallsyms] [k] _raw_spin_lock
+ 6.14% a.out [kernel.kallsyms] [k] sys_semtimedop
+ 3.84% a.out [kernel.kallsyms] [k] avc_has_perm_flags
+ 3.64% a.out [kernel.kallsyms] [k] __audit_syscall_exit
+ 2.06% a.out [kernel.kallsyms] [k] copy_user_enhanced_fast_string
+ 1.86% a.out [kernel.kallsyms] [k] ipc_lock
With this patchset:
cpus 4, threads: 256, semaphores: 128, test duration: 30 secs
total operations: 273156400, ops/sec 9105213
+ 18.54% a.out [kernel.kallsyms] [k] _raw_spin_lock
+ 11.72% a.out [kernel.kallsyms] [k] sys_semtimedop
+ 7.70% a.out [kernel.kallsyms] [k] ipc_has_perm.isra.21
+ 6.58% a.out [kernel.kallsyms] [k] avc_has_perm_flags
+ 6.54% a.out [kernel.kallsyms] [k] __audit_syscall_exit
+ 4.71% a.out [kernel.kallsyms] [k] ipc_obtain_object_check
2) While on an Oracle swingbench DSS (data mining) workload the
improvements are not as exciting as with Rik's benchmark, we can see
some positive numbers. For an 8 socket machine the following are the
percentages of %sys time incurred in the ipc lock:
Baseline (3.9-rc1):
100 swingbench users: 8,74%
400 swingbench users: 21,86%
800 swingbench users: 84,35%
With this patchset:
100 swingbench users: 8,11%
400 swingbench users: 19,93%
800 swingbench users: 77,69%
[riel@redhat.com: fix two locking bugs]
[sasha.levin@oracle.com: prevent releasing RCU read lock twice in semctl_main]
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Davidlohr Bueso <davidlohr.bueso@hp.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Reviewed-by: Chegu Vinod <chegu_vinod@hp.com>
Acked-by: Michel Lespinasse <walken@google.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Jason Low <jason.low2@hp.com>
Cc: Emmanuel Benisty <benisty.e@gmail.com>
Cc: Peter Hurley <peter@hurleysoftware.com>
Cc: Stanislav Kinsbursky <skinsbursky@parallels.com>
Tested-by: Sedat Dilek <sedat.dilek@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-04-30 19:15:29 -07:00
{
2013-07-08 16:01:10 -07:00
assert_spin_locked ( & perm - > lock ) ;
}
/**
 * ipc_lock_by_ptr - enter an RCU read-side section and lock an ipc object
 * @perm: ipc permission structure to lock
 *
 * Calls rcu_read_lock() first and then takes @perm->lock through
 * ipc_lock_object().  ipc_unlock() undoes both steps in reverse order.
 */
static inline void ipc_lock_by_ptr(struct kern_ipc_perm *perm)
{
	rcu_read_lock();
	ipc_lock_object(perm);
}
2013-07-08 16:01:10 -07:00
/**
 * ipc_unlock - unlock an ipc object and leave the RCU read-side section
 * @perm: ipc permission structure to unlock
 *
 * Drops @perm->lock through ipc_unlock_object() and then calls
 * rcu_read_unlock(), i.e. the reverse of what ipc_lock_by_ptr() does.
 */
static inline void ipc_unlock(struct kern_ipc_perm *perm)
{
	ipc_unlock_object(perm);
	rcu_read_unlock();
}
2008-02-08 04:18:54 -08:00
struct kern_ipc_perm * ipc_lock_check ( struct ipc_ids * ids , int id ) ;
2013-04-30 19:15:19 -07:00
struct kern_ipc_perm * ipc_obtain_object_check ( struct ipc_ids * ids , int id ) ;
2008-02-08 04:18:54 -08:00
int ipcget ( struct ipc_namespace * ns , struct ipc_ids * ids ,
struct ipc_ops * ops , struct ipc_params * params ) ;
2009-06-17 16:27:57 -07:00
void free_ipcs ( struct ipc_namespace * ns , struct ipc_ids * ids ,
void ( * free ) ( struct ipc_namespace * , struct kern_ipc_perm * ) ) ;
2005-04-16 15:20:36 -07:00
# endif