2005-04-17 02:20:36 +04:00
/*
* linux / kernel / signal . c
*
* Copyright ( C ) 1991 , 1992 Linus Torvalds
*
* 1997 - 11 - 02 Modified for POSIX .1 b signals by Richard Henderson
*
* 2003 - 06 - 02 Jim Houston - Concurrent Computer Corp .
* Changes to use preallocated sigqueue structures
* to allow signals to be sent reliably .
*/
# include <linux/slab.h>
# include <linux/module.h>
# include <linux/init.h>
# include <linux/sched.h>
# include <linux/fs.h>
# include <linux/tty.h>
# include <linux/binfmts.h>
# include <linux/security.h>
# include <linux/syscalls.h>
# include <linux/ptrace.h>
2005-05-01 19:59:14 +04:00
# include <linux/signal.h>
signal/timer/event: signalfd core
This patch series implements the new signalfd() system call.
I took part of the original Linus code (and you know how badly it can be
broken :), and I added even more breakage ;) Signals are fetched from the same
signal queue used by the process, so signalfd will compete with standard
kernel delivery in dequeue_signal(). If you want to reliably fetch signals on
the signalfd file, you need to block them with sigprocmask(SIG_BLOCK). This
seems to be working fine on my Dual Opteron machine. I made a quick test
program for it:
http://www.xmailserver.org/signafd-test.c
The signalfd() system call implements signal delivery into a file descriptor
receiver. The signalfd file descriptor is created with the following API:
int signalfd(int ufd, const sigset_t *mask, size_t masksize);
The "ufd" parameter allows to change an existing signalfd sigmask, w/out going
to close/create cycle (Linus idea). Use "ufd" == -1 if you want a brand new
signalfd file.
The "mask" allows to specify the signal mask of signals that we are interested
in. The "masksize" parameter is the size of "mask".
The signalfd fd supports the poll(2) and read(2) system calls. The poll(2)
will return POLLIN when signals are available to be dequeued. As a direct
consequence of supporting the Linux poll subsystem, the signalfd fd can be
used together with epoll(2) too.
The read(2) system call will return a "struct signalfd_siginfo" structure in
the userspace supplied buffer. The return value is the number of bytes copied
in the supplied buffer, or -1 in case of error. The read(2) call can also
return 0, in case the sighand structure to which the signalfd was attached,
has been orphaned. The O_NONBLOCK flag is also supported, and read(2) will
return -EAGAIN in case no signal is available.
If the size of the buffer passed to read(2) is lower than sizeof(struct
signalfd_siginfo), -EINVAL is returned. A read from the signalfd can also
return -ERESTARTSYS in case a signal hits the process. The format of the
struct signalfd_siginfo is, and the valid fields depends of the (->code &
__SI_MASK) value, in the same way a struct siginfo would:
struct signalfd_siginfo {
__u32 signo; /* si_signo */
__s32 err; /* si_errno */
__s32 code; /* si_code */
__u32 pid; /* si_pid */
__u32 uid; /* si_uid */
__s32 fd; /* si_fd */
__u32 tid; /* si_tid */
__u32 band; /* si_band */
__u32 overrun; /* si_overrun */
__u32 trapno; /* si_trapno */
__s32 status; /* si_status */
__s32 svint; /* si_int */
__u64 svptr; /* si_ptr */
__u64 utime; /* si_utime */
__u64 stime; /* si_stime */
__u64 addr; /* si_addr */
};
[akpm@linux-foundation.org: fix signalfd_copyinfo() on i386]
Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-05-11 09:23:13 +04:00
# include <linux/signalfd.h>
2008-07-26 06:45:51 +04:00
# include <linux/tracehook.h>
2006-01-11 23:17:46 +03:00
# include <linux/capability.h>
2006-12-07 07:34:23 +03:00
# include <linux/freezer.h>
2006-12-08 13:38:01 +03:00
# include <linux/pid_namespace.h>
# include <linux/nsproxy.h>
tracing, sched: LTTng instrumentation - scheduler
Instrument the scheduler activity (sched_switch, migration, wakeups,
wait for a task, signal delivery) and process/thread
creation/destruction (fork, exit, kthread stop). Actually, kthread
creation is not instrumented in this patch because it is architecture
dependent. It allows to connect tracers such as ftrace which detects
scheduling latencies, good/bad scheduler decisions. Tools like LTTng can
export this scheduler information along with instrumentation of the rest
of the kernel activity to perform post-mortem analysis on the scheduler
activity.
About the performance impact of tracepoints (which is comparable to
markers), even without immediate values optimizations, tests done by
Hideo Aoki on ia64 show no regression. His test case was using hackbench
on a kernel where scheduler instrumentation (about 5 events in code
scheduler code) was added. See the "Tracepoints" patch header for
performance result detail.
Changelog :
- Change instrumentation location and parameter to match ftrace
instrumentation, previously done with kernel markers.
[ mingo@elte.hu: conflict resolutions ]
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Acked-by: 'Peter Zijlstra' <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-07-18 20:16:17 +04:00
# include <trace/sched.h>
2006-12-08 13:38:01 +03:00
2005-04-17 02:20:36 +04:00
# include <asm/param.h>
# include <asm/uaccess.h>
# include <asm/unistd.h>
# include <asm/siginfo.h>
2006-05-25 18:19:47 +04:00
# include "audit.h" /* audit_signal_info() */
2005-04-17 02:20:36 +04:00
/*
* SLAB caches for signal bits .
*/
2006-12-07 07:33:20 +03:00
static struct kmem_cache * sigqueue_cachep ;
2005-04-17 02:20:36 +04:00
2008-07-26 06:45:51 +04:00
/*
 * Return the handler currently installed for @sig in @t's sighand table.
 * Signal numbers are 1-based while the action[] array is 0-based.
 */
static void __user *sig_handler(struct task_struct *t, int sig)
{
	struct k_sigaction *ka = &t->sighand->action[sig - 1];

	return ka->sa.sa_handler;
}
2008-04-30 11:52:39 +04:00
2008-07-26 06:45:51 +04:00
/*
 * Return 1 if @handler means "ignore @sig", 0 otherwise.
 */
static int sig_handler_ignored(void __user *handler, int sig)
{
	/* Explicitly ignored. */
	if (handler == SIG_IGN)
		return 1;

	/* Anything but the default action means the signal is handled. */
	if (handler != SIG_DFL)
		return 0;

	/* Implicitly ignored: the default action for this signal is "ignore". */
	return sig_kernel_ignore(sig) ? 1 : 0;
}
2005-04-17 02:20:36 +04:00
static int sig_ignored ( struct task_struct * t , int sig )
{
2008-07-26 06:45:51 +04:00
void __user * handler ;
2005-04-17 02:20:36 +04:00
/*
* Blocked signals are never ignored , since the
* signal handler may change by the time it is
* unblocked .
*/
2007-11-13 02:41:55 +03:00
if ( sigismember ( & t - > blocked , sig ) | | sigismember ( & t - > real_blocked , sig ) )
2005-04-17 02:20:36 +04:00
return 0 ;
2008-07-26 06:45:51 +04:00
handler = sig_handler ( t , sig ) ;
if ( ! sig_handler_ignored ( handler , sig ) )
return 0 ;
/*
* Tracers may want to know about even ignored signals .
*/
return ! tracehook_consider_ignored_signal ( t , sig , handler ) ;
2005-04-17 02:20:36 +04:00
}
/*
 * Re-calculate pending state from the set of locally pending
 * signals, globally pending signals, and blocked signals.
 *
 * has_pending_signals() returns 1 if at least one signal in @signal is
 * not masked by @blocked, and 0 otherwise.  Every word of both sets is
 * examined.
 */
static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked)
{
	unsigned long deliverable = 0;
	int word;

	for (word = 0; word < _NSIG_WORDS; word++)
		deliverable |= signal->sig[word] & ~blocked->sig[word];

	return deliverable != 0;
}

/* Same test for a struct sigpending @p against the blocked set @b. */
#define PENDING(p,b) has_pending_signals(&(p)->signal, (b))
2007-05-24 00:57:44 +04:00
static int recalc_sigpending_tsk ( struct task_struct * t )
2005-04-17 02:20:36 +04:00
{
if ( t - > signal - > group_stop_count > 0 | |
PENDING ( & t - > pending , & t - > blocked ) | |
2007-05-24 00:57:44 +04:00
PENDING ( & t - > signal - > shared_pending , & t - > blocked ) ) {
2005-04-17 02:20:36 +04:00
set_tsk_thread_flag ( t , TIF_SIGPENDING ) ;
2007-05-24 00:57:44 +04:00
return 1 ;
}
2007-06-06 14:59:00 +04:00
/*
* We must never clear the flag in another thread , or in current
* when it ' s possible the current syscall is returning - ERESTART * .
* So we don ' t clear it here , and only callers who know they should do .
*/
2007-05-24 00:57:44 +04:00
return 0 ;
}
/*
 * Recalculate TIF_SIGPENDING for @t and, if it ended up set, make sure
 * the task wakes up to notice.  When @t is current the wakeup is
 * superfluous but a harmless no-op.
 */
void recalc_sigpending_and_wake(struct task_struct *t)
{
	if (!recalc_sigpending_tsk(t))
		return;

	signal_wake_up(t, 0);
}
void recalc_sigpending ( void )
{
2008-07-26 06:45:55 +04:00
if ( unlikely ( tracehook_force_sigpending ( ) ) )
set_thread_flag ( TIF_SIGPENDING ) ;
else if ( ! recalc_sigpending_tsk ( current ) & & ! freezing ( current ) )
2007-06-06 14:59:00 +04:00
clear_thread_flag ( TIF_SIGPENDING ) ;
2005-04-17 02:20:36 +04:00
}
/* Given the mask, find the first available signal that should be serviced. */
signal/timer/event: signalfd core
This patch series implements the new signalfd() system call.
I took part of the original Linus code (and you know how badly it can be
broken :), and I added even more breakage ;) Signals are fetched from the same
signal queue used by the process, so signalfd will compete with standard
kernel delivery in dequeue_signal(). If you want to reliably fetch signals on
the signalfd file, you need to block them with sigprocmask(SIG_BLOCK). This
seems to be working fine on my Dual Opteron machine. I made a quick test
program for it:
http://www.xmailserver.org/signafd-test.c
The signalfd() system call implements signal delivery into a file descriptor
receiver. The signalfd file descriptor is created with the following API:
int signalfd(int ufd, const sigset_t *mask, size_t masksize);
The "ufd" parameter allows to change an existing signalfd sigmask, w/out going
to close/create cycle (Linus idea). Use "ufd" == -1 if you want a brand new
signalfd file.
The "mask" allows to specify the signal mask of signals that we are interested
in. The "masksize" parameter is the size of "mask".
The signalfd fd supports the poll(2) and read(2) system calls. The poll(2)
will return POLLIN when signals are available to be dequeued. As a direct
consequence of supporting the Linux poll subsystem, the signalfd fd can be
used together with epoll(2) too.
The read(2) system call will return a "struct signalfd_siginfo" structure in
the userspace supplied buffer. The return value is the number of bytes copied
in the supplied buffer, or -1 in case of error. The read(2) call can also
return 0, in case the sighand structure to which the signalfd was attached,
has been orphaned. The O_NONBLOCK flag is also supported, and read(2) will
return -EAGAIN in case no signal is available.
If the size of the buffer passed to read(2) is lower than sizeof(struct
signalfd_siginfo), -EINVAL is returned. A read from the signalfd can also
return -ERESTARTSYS in case a signal hits the process. The format of the
struct signalfd_siginfo is, and the valid fields depends of the (->code &
__SI_MASK) value, in the same way a struct siginfo would:
struct signalfd_siginfo {
__u32 signo; /* si_signo */
__s32 err; /* si_errno */
__s32 code; /* si_code */
__u32 pid; /* si_pid */
__u32 uid; /* si_uid */
__s32 fd; /* si_fd */
__u32 tid; /* si_tid */
__u32 band; /* si_band */
__u32 overrun; /* si_overrun */
__u32 trapno; /* si_trapno */
__s32 status; /* si_status */
__s32 svint; /* si_int */
__u64 svptr; /* si_ptr */
__u64 utime; /* si_utime */
__u64 stime; /* si_stime */
__u64 addr; /* si_addr */
};
[akpm@linux-foundation.org: fix signalfd_copyinfo() on i386]
Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-05-11 09:23:13 +04:00
/*
 * Scan @pending for the lowest-numbered signal that is not set in @mask.
 *
 * Returns the (1-based) signal number, or 0 if every pending signal is
 * masked.
 */
int next_signal(struct sigpending *pending, sigset_t *mask)
{
	unsigned long *set = pending->signal.sig;
	unsigned long *blk = mask->sig;
	unsigned long ready;
	int word;

	for (word = 0; word < _NSIG_WORDS; word++) {
		ready = set[word] & ~blk[word];
		if (ready != 0)
			/* ffz(~ready) is the index of the lowest set bit. */
			return ffz(~ready) + word * _NSIG_BPW + 1;
	}

	return 0;
}
2005-10-07 10:46:04 +04:00
/*
 * Allocate a sigqueue entry charged to @t's user.
 *
 * The user's sigpending count is bumped first.  The allocation is only
 * attempted if @override_rlimit is set or the count is still within
 * RLIMIT_SIGPENDING.  On any failure the count is dropped again and NULL
 * is returned.  On success the entry holds a reference on the user.
 */
static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
					 int override_rlimit)
{
	struct sigqueue *entry;
	struct user_struct *user;

	/*
	 * In order to avoid problems with "switch_user()", we want to make
	 * sure that the compiler doesn't re-load "t->user"
	 */
	user = t->user;
	barrier();
	atomic_inc(&user->sigpending);

	if (!override_rlimit &&
	    atomic_read(&user->sigpending) >
			t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur)
		goto out_uncharge;

	entry = kmem_cache_alloc(sigqueue_cachep, flags);
	if (unlikely(entry == NULL))
		goto out_uncharge;

	INIT_LIST_HEAD(&entry->list);
	entry->flags = 0;
	entry->user = get_uid(user);
	return entry;

out_uncharge:
	atomic_dec(&user->sigpending);
	return NULL;
}
2006-02-03 14:04:41 +03:00
static void __sigqueue_free ( struct sigqueue * q )
2005-04-17 02:20:36 +04:00
{
if ( q - > flags & SIGQUEUE_PREALLOC )
return ;
atomic_dec ( & q - > user - > sigpending ) ;
free_uid ( q - > user ) ;
kmem_cache_free ( sigqueue_cachep , q ) ;
}
2006-03-29 04:11:18 +04:00
void flush_sigqueue ( struct sigpending * queue )
2005-04-17 02:20:36 +04:00
{
struct sigqueue * q ;
sigemptyset ( & queue - > signal ) ;
while ( ! list_empty ( & queue - > list ) ) {
q = list_entry ( queue - > list . next , struct sigqueue , list ) ;
list_del_init ( & q - > list ) ;
__sigqueue_free ( q ) ;
}
}
/*
* Flush all pending signals for a task .
*/
2006-03-29 04:11:17 +04:00
void flush_signals ( struct task_struct * t )
2005-04-17 02:20:36 +04:00
{
unsigned long flags ;
spin_lock_irqsave ( & t - > sighand - > siglock , flags ) ;
2008-04-22 02:15:06 +04:00
clear_tsk_thread_flag ( t , TIF_SIGPENDING ) ;
2005-04-17 02:20:36 +04:00
flush_sigqueue ( & t - > pending ) ;
flush_sigqueue ( & t - > signal - > shared_pending ) ;
spin_unlock_irqrestore ( & t - > sighand - > siglock , flags ) ;
}
2008-05-26 20:55:42 +04:00
static void __flush_itimer_signals ( struct sigpending * pending )
{
sigset_t signal , retain ;
struct sigqueue * q , * n ;
signal = pending - > signal ;
sigemptyset ( & retain ) ;
list_for_each_entry_safe ( q , n , & pending - > list , list ) {
int sig = q - > info . si_signo ;
if ( likely ( q - > info . si_code ! = SI_TIMER ) ) {
sigaddset ( & retain , sig ) ;
} else {
sigdelset ( & signal , sig ) ;
list_del_init ( & q - > list ) ;
__sigqueue_free ( q ) ;
}
}
sigorsets ( & pending - > signal , & signal , & retain ) ;
}
void flush_itimer_signals ( void )
{
struct task_struct * tsk = current ;
unsigned long flags ;
spin_lock_irqsave ( & tsk - > sighand - > siglock , flags ) ;
__flush_itimer_signals ( & tsk - > pending ) ;
__flush_itimer_signals ( & tsk - > signal - > shared_pending ) ;
spin_unlock_irqrestore ( & tsk - > sighand - > siglock , flags ) ;
}
2007-05-09 13:34:37 +04:00
void ignore_signals ( struct task_struct * t )
{
int i ;
for ( i = 0 ; i < _NSIG ; + + i )
t - > sighand - > action [ i ] . sa . sa_handler = SIG_IGN ;
flush_signals ( t ) ;
}
2005-04-17 02:20:36 +04:00
/*
* Flush all handlers for a task .
*/
void
flush_signal_handlers ( struct task_struct * t , int force_default )
{
int i ;
struct k_sigaction * ka = & t - > sighand - > action [ 0 ] ;
for ( i = _NSIG ; i ! = 0 ; i - - ) {
if ( force_default | | ka - > sa . sa_handler ! = SIG_IGN )
ka - > sa . sa_handler = SIG_DFL ;
ka - > sa . sa_flags = 0 ;
sigemptyset ( & ka - > sa . sa_mask ) ;
ka + + ;
}
}
2007-07-22 13:12:28 +04:00
int unhandled_signal ( struct task_struct * tsk , int sig )
{
2008-07-26 06:45:52 +04:00
void __user * handler = tsk - > sighand - > action [ sig - 1 ] . sa . sa_handler ;
2007-10-19 10:39:52 +04:00
if ( is_global_init ( tsk ) )
2007-07-22 13:12:28 +04:00
return 1 ;
2008-07-26 06:45:52 +04:00
if ( handler ! = SIG_IGN & & handler ! = SIG_DFL )
2007-07-22 13:12:28 +04:00
return 0 ;
2008-07-26 06:45:52 +04:00
return ! tracehook_consider_fatal_signal ( tsk , sig , handler ) ;
2007-07-22 13:12:28 +04:00
}
2005-04-17 02:20:36 +04:00
/*
 * Notify the system that a driver wants to block all signals for this
 * process, and wants to be notified if any signals at all were to be
 * sent/acted upon.  If the notifier routine returns non-zero, then the
 * signal will be acted upon after all.  If the notifier routine returns 0,
 * then the signal will be blocked.  Only one block per process is
 * allowed.  priv is a pointer to private data that the notifier routine
 * can use to determine if the signal should be blocked or not.
 */
void
block_all_signals(int (*notifier)(void *priv), void *priv, sigset_t *mask)
{
	struct task_struct *self = current;
	unsigned long irqflags;

	spin_lock_irqsave(&self->sighand->siglock, irqflags);
	self->notifier_mask = mask;
	self->notifier_data = priv;
	self->notifier = notifier;
	spin_unlock_irqrestore(&self->sighand->siglock, irqflags);
}
/* Notify the system that blocking has ended. */
void
unblock_all_signals ( void )
{
unsigned long flags ;
spin_lock_irqsave ( & current - > sighand - > siglock , flags ) ;
current - > notifier = NULL ;
current - > notifier_data = NULL ;
recalc_sigpending ( ) ;
spin_unlock_irqrestore ( & current - > sighand - > siglock , flags ) ;
}
2008-07-25 12:47:29 +04:00
/*
 * Take one instance of @sig off @list and describe it in @info.
 *
 * The first queued entry for @sig is unlinked, copied to @info and freed.
 * The pending bit for @sig is cleared unless a second entry for the same
 * signal remains queued.  If no entry is queued at all, @info is filled
 * with a zeroed description carrying only the signal number.
 */
static void collect_signal(int sig, struct sigpending *list, siginfo_t *info)
{
	struct sigqueue *q, *first = NULL;
	int more_queued = 0;

	/*
	 * Find the siginfo for this signal, and check whether another
	 * siginfo for the same signal is queued behind it.
	 */
	list_for_each_entry(q, &list->list, list) {
		if (q->info.si_signo != sig)
			continue;
		if (first) {
			more_queued = 1;
			break;
		}
		first = q;
	}

	if (!more_queued)
		sigdelset(&list->signal, sig);

	if (!first) {
		/*
		 * Ok, it wasn't in the queue.  This must be
		 * a fast-pathed signal or we must have been
		 * out of queue space.  So zero out the info.
		 */
		info->si_signo = sig;
		info->si_errno = 0;
		info->si_code = 0;
		info->si_pid = 0;
		info->si_uid = 0;
		return;
	}

	list_del_init(&first->list);
	copy_siginfo(info, &first->info);
	__sigqueue_free(first);
}
/*
 * Dequeue the next signal from @pending that is not set in @mask.
 *
 * Returns the signal number with @info filled in, or 0 if nothing is
 * deliverable.  If the current task has a notifier installed, the signal
 * is in the notifier mask and the notifier returns 0, then the signal is
 * left queued, TIF_SIGPENDING is cleared and 0 is returned.
 */
static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
			    siginfo_t *info)
{
	int sig = next_signal(pending, mask);

	if (!sig)
		return 0;

	if (current->notifier &&
	    sigismember(current->notifier_mask, sig) &&
	    !(current->notifier)(current->notifier_data)) {
		clear_thread_flag(TIF_SIGPENDING);
		return 0;
	}

	collect_signal(sig, pending, info);
	return sig;
}
/*
* Dequeue the next signal for tsk that is not set in mask and describe
* it in *info. The private pending queue is tried first; the shared
* (process-wide) queue is only consulted when the private one yields
* nothing.
*
* Returns the signal number, or 0 if no signal was dequeued.
*
* All callers have to hold the siglock . Note that the lock is dropped
* and re-taken around do_schedule_next_timer() on the timer path below.
*
* NOTE(review): recalc_sigpending() and the notifier check in
* __dequeue_signal() act on current, not on tsk - presumably every
* caller passes tsk == current; confirm.
*/
int dequeue_signal ( struct task_struct * tsk , sigset_t * mask , siginfo_t * info )
{
2008-04-30 11:52:40 +04:00
int signr ;
2007-06-12 02:16:18 +04:00
/* We only dequeue private signals from ourselves, we don't let
* signalfd steal them
*
* NOTE(review): the code below dequeues from tsk->pending
* unconditionally, so this remark may be stale - confirm.
*/
2007-09-20 23:40:16 +04:00
signr = __dequeue_signal ( & tsk - > pending , mask , info ) ;
2007-02-16 12:28:12 +03:00
if ( ! signr ) {
2005-04-17 02:20:36 +04:00
signr = __dequeue_signal ( & tsk - > signal - > shared_pending ,
mask , info ) ;
2007-02-16 12:28:12 +03:00
/*
* itimer signal ?
*
* itimers are process shared and we restart periodic
* itimers in the signal delivery path to prevent DoS
* attacks in the high resolution timer case . This is
* compliant with the old way of self restarting
* itimers , as the SIGALRM is a legacy signal and only
* queued once . Changing the restart behaviour to
* restart the timer in the signal dequeue path is
* reducing the timer noise on heavy loaded ! highres
* systems too .
*/
if ( unlikely ( signr = = SIGALRM ) ) {
struct hrtimer * tmr = & tsk - > signal - > real_timer ;
/* Re-arm only a timer that is idle and has a non-zero interval. */
if ( ! hrtimer_is_queued ( tmr ) & &
tsk - > signal - > it_real_incr . tv64 ! = 0 ) {
hrtimer_forward ( tmr , tmr - > base - > get_time ( ) ,
tsk - > signal - > it_real_incr ) ;
hrtimer_restart ( tmr ) ;
}
}
}
2008-04-30 11:52:40 +04:00
2007-09-20 23:40:16 +04:00
/* Done whether or not a signal was dequeued. */
recalc_sigpending ( ) ;
2008-04-30 11:52:40 +04:00
if ( ! signr )
return 0 ;
if ( unlikely ( sig_kernel_stop ( signr ) ) ) {
2007-02-16 12:28:12 +03:00
/*
* Set a marker that we have dequeued a stop signal . Our
* caller might release the siglock and then the pending
* stop signal it is about to process is no longer in the
* pending bitmasks , but must still be cleared by a SIGCONT
* ( and overruled by a SIGKILL ) . So those cases clear this
* shared flag after we ' ve set it . Note that this flag may
* remain set after the signal we return is ignored or
* handled . That doesn ' t matter because its only purpose
* is to alert stop - signal processing code when another
* processor has come along and cleared the flag .
*/
2008-07-25 12:47:30 +04:00
tsk - > signal - > flags | = SIGNAL_STOP_DEQUEUED ;
2007-02-16 12:28:12 +03:00
}
2008-04-30 11:52:40 +04:00
/* Timer-class siginfo with si_sys_private set: schedule the next timer. */
if ( ( info - > si_code & __SI_MASK ) = = __SI_TIMER & & info - > si_sys_private ) {
2005-04-17 02:20:36 +04:00
/*
* Release the siglock to ensure proper locking order
* of timer locks outside of siglocks . Note , we leave
* irqs disabled here , since the posix - timers code is
* about to disable them again anyway .
*/
spin_unlock ( & tsk - > sighand - > siglock ) ;
do_schedule_next_timer ( info ) ;
spin_lock ( & tsk - > sighand - > siglock ) ;
}
return signr ;
}
/*
* Tell a process that it has a new active signal . .
*
* NOTE ! we rely on the previous spin_lock to
* lock interrupts for us ! We can only be called with
* " siglock " held , and the local interrupt must
* have been disabled when that got acquired !
*
* No need to set need_resched since signal event passing
* goes through - > blocked
*/
void signal_wake_up ( struct task_struct * t , int resume )
{
unsigned int mask ;
set_tsk_thread_flag ( t , TIF_SIGPENDING ) ;
/*
2007-12-06 19:13:16 +03:00
* For SIGKILL , we want to wake it up in the stopped / traced / killable
* case . We don ' t check t - > state here because there is a race with it
2005-04-17 02:20:36 +04:00
* executing another processor and just now entering stopped state .
* By using wake_up_state , we ensure the process will wake up and
* handle its death signal .
*/
mask = TASK_INTERRUPTIBLE ;
if ( resume )
2007-12-06 19:13:16 +03:00
mask | = TASK_WAKEKILL ;
2005-04-17 02:20:36 +04:00
if ( ! wake_up_state ( t , mask ) )
kick_process ( t ) ;
}
2006-01-08 12:02:48 +03:00
/*
 * Remove signals in mask from the pending set and queue.
 * Returns 1 if any signals were found.
 *
 * All callers must be holding the siglock.
 *
 * This version takes a sigset mask and looks at all signals,
 * not just those in the first mask word.
 */
static int rm_from_queue_full(sigset_t *mask, struct sigpending *s)
{
	struct sigqueue *entry, *tmp;
	sigset_t hit;

	/* Nothing to do unless some masked signal is actually pending. */
	sigandsets(&hit, mask, &s->signal);
	if (sigisemptyset(&hit))
		return 0;

	signandsets(&s->signal, &s->signal, mask);

	list_for_each_entry_safe(entry, tmp, &s->list, list) {
		if (!sigismember(mask, entry->info.si_signo))
			continue;
		list_del_init(&entry->list);
		__sigqueue_free(entry);
	}

	return 1;
}
2005-04-17 02:20:36 +04:00
/*
* Remove signals in mask from the pending set and queue .
* Returns 1 if any signals were found .
*
* All callers must be holding the siglock .
*/
static int rm_from_queue ( unsigned long mask , struct sigpending * s )
{
struct sigqueue * q , * n ;
if ( ! sigtestsetmask ( & s - > signal , mask ) )
return 0 ;
sigdelsetmask ( & s - > signal , mask ) ;
list_for_each_entry_safe ( q , n , & s - > list , list ) {
if ( q - > info . si_signo < SIGRTMIN & &
( mask & sigmask ( q - > info . si_signo ) ) ) {
list_del_init ( & q - > list ) ;
__sigqueue_free ( q ) ;
}
}
return 1 ;
}
/*
* Bad permissions for sending the signal
*/
static int check_kill_permission ( int sig , struct siginfo * info ,
struct task_struct * t )
{
2008-04-30 11:53:01 +04:00
struct pid * sid ;
2008-04-30 11:52:42 +04:00
int error ;
2005-05-01 19:59:14 +04:00
if ( ! valid_signal ( sig ) )
2008-04-30 11:52:42 +04:00
return - EINVAL ;
if ( info ! = SEND_SIG_NOINFO & & ( is_si_special ( info ) | | SI_FROMKERNEL ( info ) ) )
return 0 ;
2007-03-30 02:01:04 +04:00
2008-04-30 11:52:42 +04:00
error = audit_signal_info ( sig , t ) ; /* Let audit system see the signal */
if ( error )
2005-04-17 02:20:36 +04:00
return error ;
2008-04-30 11:52:42 +04:00
2008-04-30 11:53:01 +04:00
if ( ( current - > euid ^ t - > suid ) & & ( current - > euid ^ t - > uid ) & &
( current - > uid ^ t - > suid ) & & ( current - > uid ^ t - > uid ) & &
! capable ( CAP_KILL ) ) {
switch ( sig ) {
case SIGCONT :
sid = task_session ( t ) ;
/*
* We don ' t return the error if sid = = NULL . The
* task was unhashed , the caller must notice this .
*/
if ( ! sid | | sid = = task_session ( current ) )
break ;
default :
return - EPERM ;
}
}
2005-05-06 15:38:39 +04:00
2007-03-30 02:01:04 +04:00
return security_task_kill ( t , info , sig , 0 ) ;
2005-04-17 02:20:36 +04:00
}
/*
2008-04-30 11:52:59 +04:00
* Handle magic process - wide effects of stop / continue signals . Unlike
* the signal actions , these happen immediately at signal - generation
2005-04-17 02:20:36 +04:00
* time regardless of blocking , ignoring , or handling . This does the
* actual continuing for SIGCONT , but not the actual stopping for stop
2008-04-30 11:52:59 +04:00
* signals . The process stop is done as a signal action for SIG_DFL .
*
* Returns true if the signal should be actually delivered , otherwise
* it should be dropped .
2005-04-17 02:20:36 +04:00
*/
2008-04-30 11:52:59 +04:00
static int prepare_signal ( int sig , struct task_struct * p )
2005-04-17 02:20:36 +04:00
{
2008-04-30 11:52:46 +04:00
struct signal_struct * signal = p - > signal ;
2005-04-17 02:20:36 +04:00
struct task_struct * t ;
2008-04-30 11:52:59 +04:00
/* Exactly one of the three branches below runs, or none of them. */
if ( unlikely ( signal - > flags & SIGNAL_GROUP_EXIT ) ) {
2005-04-17 02:20:36 +04:00
/*
2008-04-30 11:52:59 +04:00
* The process is in the middle of dying , nothing to do .
2005-04-17 02:20:36 +04:00
*/
2008-04-30 11:52:59 +04:00
} else if ( sig_kernel_stop ( sig ) ) {
2005-04-17 02:20:36 +04:00
/*
* This is a stop signal . Remove SIGCONT from all queues .
*/
2008-04-30 11:52:46 +04:00
rm_from_queue ( sigmask ( SIGCONT ) , & signal - > shared_pending ) ;
2005-04-17 02:20:36 +04:00
t = p ;
do {
rm_from_queue ( sigmask ( SIGCONT ) , & t - > pending ) ;
2008-04-30 11:52:46 +04:00
} while_each_thread ( p , t ) ;
2005-04-17 02:20:36 +04:00
} else if ( sig = = SIGCONT ) {
2008-04-30 11:52:46 +04:00
/* Collects the SIGNAL_CLD_* bit to report, 0 if none. */
unsigned int why ;
2005-04-17 02:20:36 +04:00
/*
* Remove all stop signals from all queues ,
* and wake all threads .
*/
2008-04-30 11:52:46 +04:00
rm_from_queue ( SIG_KERNEL_STOP_MASK , & signal - > shared_pending ) ;
2005-04-17 02:20:36 +04:00
t = p ;
do {
unsigned int state ;
rm_from_queue ( SIG_KERNEL_STOP_MASK , & t - > pending ) ;
/*
* If there is a handler for SIGCONT , we must make
* sure that no thread returns to user mode before
* we post the signal , in case it was the only
* thread eligible to run the signal handler - - then
* it must not do anything between resuming and
* running the handler . With the TIF_SIGPENDING
* flag set , the thread will pause and acquire the
* siglock that we hold now and until we ' ve queued
2008-04-30 11:52:46 +04:00
* the pending signal .
2005-04-17 02:20:36 +04:00
*
* Wake up the stopped thread _after_ setting
* TIF_SIGPENDING
*/
2007-12-06 19:13:16 +03:00
state = __TASK_STOPPED ;
2005-04-17 02:20:36 +04:00
if ( sig_user_defined ( t , SIGCONT ) & & ! sigismember ( & t - > blocked , SIGCONT ) ) {
set_tsk_thread_flag ( t , TIF_SIGPENDING ) ;
state | = TASK_INTERRUPTIBLE ;
}
wake_up_state ( t , state ) ;
2008-04-30 11:52:46 +04:00
} while_each_thread ( p , t ) ;
2005-04-17 02:20:36 +04:00
2008-04-30 11:52:46 +04:00
/*
* Notify the parent with CLD_CONTINUED if we were stopped .
*
* If we were in the middle of a group stop , we pretend it
* was already finished , and then continued . Since SIGCHLD
* doesn ' t queue we report only CLD_STOPPED , as if the next
* CLD_CONTINUED was dropped .
*/
why = 0 ;
2008-04-30 11:52:46 +04:00
if ( signal - > flags & SIGNAL_STOP_STOPPED )
2008-04-30 11:52:46 +04:00
why | = SIGNAL_CLD_CONTINUED ;
2008-04-30 11:52:46 +04:00
else if ( signal - > group_stop_count )
2008-04-30 11:52:46 +04:00
why | = SIGNAL_CLD_STOPPED ;
if ( why ) {
2008-04-30 11:53:00 +04:00
/*
* The first thread which returns from finish_stop ( )
* will take - > siglock , notice SIGNAL_CLD_MASK , and
* notify its parent . See get_signal_to_deliver ( ) .
*/
2008-04-30 11:52:46 +04:00
/* Plain assignment: every other bit in ->flags is discarded. */
signal - > flags = why | SIGNAL_STOP_CONTINUED ;
signal - > group_stop_count = 0 ;
signal - > group_exit_code = 0 ;
2005-04-17 02:20:36 +04:00
} else {
/*
* We are not stopped , but there could be a stop
* signal in the middle of being processed after
* being removed from the queue . Clear that too .
*/
2008-04-30 11:52:46 +04:00
signal - > flags & = ~ SIGNAL_STOP_DEQUEUED ;
2005-04-17 02:20:36 +04:00
}
}
2008-04-30 11:52:59 +04:00
/*
* Non-zero means "go on and deliver"; zero means sig_ignored()
* says the signal can be dropped. The side effects above have
* already happened either way.
*/
return ! sig_ignored ( p , sig ) ;
2005-04-17 02:20:36 +04:00
}
2008-04-30 11:52:53 +04:00
/*
* Test if P wants to take SIG . After we ' ve checked all threads with this ,
* it ' s equivalent to finding no threads not blocking SIG . Any threads not
* blocking SIG were ruled out because they are not running and already
* have pending signals . Such threads will dequeue from the shared queue
* as soon as they ' re available , so putting the signal on the shared queue
* will be equivalent to sending it to one such thread .
*/
static inline int wants_signal ( int sig , struct task_struct * p )
{
if ( sigismember ( & p - > blocked , sig ) )
return 0 ;
if ( p - > flags & PF_EXITING )
return 0 ;
if ( sig = = SIGKILL )
return 1 ;
if ( task_is_stopped_or_traced ( p ) )
return 0 ;
return task_curr ( p ) | | ! signal_pending ( p ) ;
}
2008-04-30 11:52:55 +04:00
/*
* Pick a thread to wake up for sig and wake it. If the signal turns out
* to be fatal and is not a core-dump signal, a group exit is started
* right here and every thread is woken with SIGKILL pending instead.
*
* p is the thread that was targeted. When group is zero only p itself
* is considered; otherwise the other threads are searched, starting at
* signal->curr_target. Returns without waking anyone if no thread
* wants the signal (see wants_signal()).
*/
static void complete_signal ( int sig , struct task_struct * p , int group )
2008-04-30 11:52:53 +04:00
{
struct signal_struct * signal = p - > signal ;
struct task_struct * t ;
/*
* Now find a thread we can wake up to take the signal off the queue .
*
* If the main thread wants the signal , it gets first crack .
* Probably the least surprising to the average bear .
*/
if ( wants_signal ( sig , p ) )
t = p ;
2008-04-30 11:52:55 +04:00
else if ( ! group | | thread_group_empty ( p ) )
2008-04-30 11:52:53 +04:00
/*
* There is just one thread and it does not need to be woken .
* It will dequeue unblocked signals before it runs again .
*/
return ;
else {
/*
* Otherwise try to find a suitable thread .
*/
t = signal - > curr_target ;
while ( ! wants_signal ( sig , t ) ) {
t = next_thread ( t ) ;
if ( t = = signal - > curr_target )
/*
* No thread needs to be woken .
* Any eligible threads will see
* the signal in the queue soon .
*/
return ;
}
/* Remember the chosen thread as the next search start. */
signal - > curr_target = t ;
}
/*
* Found a killable thread . If the signal will be fatal ,
* then start taking the whole group down immediately .
*/
2008-04-30 11:53:03 +04:00
if ( sig_fatal ( p , sig ) & &
! ( signal - > flags & ( SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT ) ) & &
2008-04-30 11:52:53 +04:00
! sigismember ( & t - > real_blocked , sig ) & &
2008-07-26 06:45:52 +04:00
( sig = = SIGKILL | |
! tracehook_consider_fatal_signal ( t , sig , SIG_DFL ) ) ) {
2008-04-30 11:52:53 +04:00
/*
* This signal will be fatal to the whole group .
*/
/*
* Core-dump signals skip the shortcut below and fall through
* to the ordinary wakeup at the end of the function.
*/
if ( ! sig_kernel_coredump ( sig ) ) {
/*
* Start a group exit and wake everybody up .
* This way we don ' t have other threads
* running and doing things after a slower
* thread has the fatal signal pending .
*/
signal - > flags = SIGNAL_GROUP_EXIT ;
signal - > group_exit_code = sig ;
signal - > group_stop_count = 0 ;
t = p ;
do {
sigaddset ( & t - > pending . signal , SIGKILL ) ;
signal_wake_up ( t , 1 ) ;
} while_each_thread ( p , t ) ;
return ;
}
}
/*
* The signal is already in the shared - pending queue .
* Tell the chosen thread to wake up and dequeue it .
*/
/* The second argument is non-zero only for SIGKILL. */
signal_wake_up ( t , sig = = SIGKILL ) ;
return ;
}
2008-04-30 11:52:34 +04:00
static inline int legacy_queue ( struct sigpending * signals , int sig )
{
return ( sig < SIGRTMIN ) & & sigismember ( & signals - > signal , sig ) ;
}
2005-04-17 02:20:36 +04:00
static int send_signal ( int sig , struct siginfo * info , struct task_struct * t ,
2008-04-30 11:52:54 +04:00
int group )
2005-04-17 02:20:36 +04:00
{
2008-04-30 11:52:54 +04:00
struct sigpending * pending ;
2008-04-30 11:52:50 +04:00
struct sigqueue * q ;
2005-04-17 02:20:36 +04:00
tracing, sched: LTTng instrumentation - scheduler
Instrument the scheduler activity (sched_switch, migration, wakeups,
wait for a task, signal delivery) and process/thread
creation/destruction (fork, exit, kthread stop). Actually, kthread
creation is not instrumented in this patch because it is architecture
dependent. It allows to connect tracers such as ftrace which detects
scheduling latencies, good/bad scheduler decisions. Tools like LTTng can
export this scheduler information along with instrumentation of the rest
of the kernel activity to perform post-mortem analysis on the scheduler
activity.
About the performance impact of tracepoints (which is comparable to
markers), even without immediate values optimizations, tests done by
Hideo Aoki on ia64 show no regression. His test case was using hackbench
on a kernel where scheduler instrumentation (about 5 events in code
scheduler code) was added. See the "Tracepoints" patch header for
performance result detail.
Changelog :
- Change instrumentation location and parameter to match ftrace
instrumentation, previously done with kernel markers.
[ mingo@elte.hu: conflict resolutions ]
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Acked-by: 'Peter Zijlstra' <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-07-18 20:16:17 +04:00
trace_sched_signal_send ( sig , t ) ;
2008-04-30 11:52:50 +04:00
assert_spin_locked ( & t - > sighand - > siglock ) ;
2008-04-30 11:52:59 +04:00
if ( ! prepare_signal ( sig , t ) )
return 0 ;
2008-04-30 11:52:54 +04:00
pending = group ? & t - > signal - > shared_pending : & t - > pending ;
2008-04-30 11:52:35 +04:00
/*
* Short - circuit ignored signals and support queuing
* exactly one non - rt signal , so that we can get more
* detailed information about the cause of the signal .
*/
2008-04-30 11:52:59 +04:00
if ( legacy_queue ( pending , sig ) )
2008-04-30 11:52:35 +04:00
return 0 ;
2005-04-17 02:20:36 +04:00
/*
* fast - pathed signals for kernel - internal things like SIGSTOP
* or SIGKILL .
*/
2005-10-31 02:03:44 +03:00
if ( info = = SEND_SIG_FORCED )
2005-04-17 02:20:36 +04:00
goto out_set ;
/* Real-time signals must be queued if sent by sigqueue, or
some other real - time mechanism . It is implementation
defined whether kill ( ) does so . We attempt to do so , on
the principle of least surprise , but since kill is not
allowed to fail with EAGAIN when low on memory we just
make sure at least one signal gets delivered and don ' t
pass on the info struct . */
q = __sigqueue_alloc ( t , GFP_ATOMIC , ( sig < SIGRTMIN & &
2005-10-31 02:03:45 +03:00
( is_si_special ( info ) | |
2005-04-17 02:20:36 +04:00
info - > si_code > = 0 ) ) ) ;
if ( q ) {
2008-04-30 11:52:54 +04:00
list_add_tail ( & q - > list , & pending - > list ) ;
2005-04-17 02:20:36 +04:00
switch ( ( unsigned long ) info ) {
2005-10-31 02:03:44 +03:00
case ( unsigned long ) SEND_SIG_NOINFO :
2005-04-17 02:20:36 +04:00
q - > info . si_signo = sig ;
q - > info . si_errno = 0 ;
q - > info . si_code = SI_USER ;
2007-10-19 10:40:14 +04:00
q - > info . si_pid = task_pid_vnr ( current ) ;
2005-04-17 02:20:36 +04:00
q - > info . si_uid = current - > uid ;
break ;
2005-10-31 02:03:44 +03:00
case ( unsigned long ) SEND_SIG_PRIV :
2005-04-17 02:20:36 +04:00
q - > info . si_signo = sig ;
q - > info . si_errno = 0 ;
q - > info . si_code = SI_KERNEL ;
q - > info . si_pid = 0 ;
q - > info . si_uid = 0 ;
break ;
default :
copy_siginfo ( & q - > info , info ) ;
break ;
}
2005-10-31 02:03:45 +03:00
} else if ( ! is_si_special ( info ) ) {
if ( sig > = SIGRTMIN & & info - > si_code ! = SI_USER )
2005-04-17 02:20:36 +04:00
/*
* Queue overflow , abort . We may abort if the signal was rt
* and sent by user using something other than kill ( ) .
*/
return - EAGAIN ;
}
out_set :
2008-04-30 11:53:00 +04:00
signalfd_notify ( t , sig ) ;
2008-04-30 11:52:54 +04:00
sigaddset ( & pending - > signal , sig ) ;
2008-04-30 11:52:55 +04:00
complete_signal ( sig , t , group ) ;
return 0 ;
2005-04-17 02:20:36 +04:00
}
2007-07-16 10:40:10 +04:00
int print_fatal_signals ;
static void print_fatal_signal ( struct pt_regs * regs , int signr )
{
printk ( " %s/%d: potentially unexpected fatal signal %d. \n " ,
2007-10-19 10:40:40 +04:00
current - > comm , task_pid_nr ( current ) , signr ) ;
2007-07-16 10:40:10 +04:00
2007-10-29 07:31:16 +03:00
# if defined(__i386__) && !defined(__arch_um__)
2008-01-30 15:30:56 +03:00
printk ( " code at %08lx: " , regs - > ip ) ;
2007-07-16 10:40:10 +04:00
{
int i ;
for ( i = 0 ; i < 16 ; i + + ) {
unsigned char insn ;
2008-01-30 15:30:56 +03:00
__get_user ( insn , ( unsigned char * ) ( regs - > ip + i ) ) ;
2007-07-16 10:40:10 +04:00
printk ( " %02x " , insn ) ;
}
}
# endif
printk ( " \n " ) ;
show_regs ( regs ) ;
}
/*
 * Early-parameter handler registered by the __setup() line below.
 * Passes the option text to get_option() with print_fatal_signals as the
 * destination and always returns 1.
 * NOTE(review): the result of get_option() is not checked here, so a
 * malformed value is not reported by this function.
 */
static int __init setup_print_fatal_signals ( char * str )
{
get_option ( & str , & print_fatal_signals ) ;
return 1 ;
}
/* Bind the command-line option name to the handler above. */
__setup ( " print-fatal-signals= " , setup_print_fatal_signals ) ;
2005-04-17 02:20:36 +04:00
2008-04-30 11:52:55 +04:00
/*
 * Send SIG to P through send_signal() with the group flag set, i.e. the
 * signal goes onto the thread group's shared pending set.
 * Returns whatever send_signal() returns.
 */
int
__group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
{
	const int group = 1;

	return send_signal(sig, info, p, group);
}
2005-04-17 02:20:36 +04:00
/*
 * Send SIG to the single thread T through send_signal() with the group
 * flag clear, i.e. the signal goes onto T's private pending set.
 * Returns whatever send_signal() returns.
 */
static int
specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t)
{
	const int group = 0;

	return send_signal(sig, info, t, group);
}
/*
* Force a signal that the process can ' t ignore : if necessary
* we unblock the signal and change any SIG_IGN to SIG_DFL .
2006-08-03 07:17:49 +04:00
*
* Note : If we unblock the signal , we always reset it to SIG_DFL ,
* since we do not want to have a signal handler that was blocked
* be invoked when user space had explicitly blocked it .
*
2008-04-30 11:53:05 +04:00
* We don ' t want to have recursive SIGSEGV ' s etc , for example ,
* that is why we also clear SIGNAL_UNKILLABLE .
2005-04-17 02:20:36 +04:00
*/
int
force_sig_info ( int sig , struct siginfo * info , struct task_struct * t )
{
unsigned long int flags ;
2006-08-03 07:17:49 +04:00
int ret , blocked , ignored ;
struct k_sigaction * action ;
2005-04-17 02:20:36 +04:00
spin_lock_irqsave ( & t - > sighand - > siglock , flags ) ;
2006-08-03 07:17:49 +04:00
action = & t - > sighand - > action [ sig - 1 ] ;
ignored = action - > sa . sa_handler = = SIG_IGN ;
blocked = sigismember ( & t - > blocked , sig ) ;
if ( blocked | | ignored ) {
action - > sa . sa_handler = SIG_DFL ;
if ( blocked ) {
sigdelset ( & t - > blocked , sig ) ;
2007-05-24 00:57:44 +04:00
recalc_sigpending_and_wake ( t ) ;
2006-08-03 07:17:49 +04:00
}
2005-04-17 02:20:36 +04:00
}
2008-04-30 11:53:05 +04:00
if ( action - > sa . sa_handler = = SIG_DFL )
t - > signal - > flags & = ~ SIGNAL_UNKILLABLE ;
2005-04-17 02:20:36 +04:00
ret = specific_send_sig_info ( sig , info , t ) ;
spin_unlock_irqrestore ( & t - > sighand - > siglock , flags ) ;
return ret ;
}
void
force_sig_specific ( int sig , struct task_struct * t )
{
2005-10-31 02:03:46 +03:00
force_sig_info ( sig , SEND_SIG_FORCED , t ) ;
2005-04-17 02:20:36 +04:00
}
/*
* Nuke all other threads in the group .
*/
void zap_other_threads ( struct task_struct * p )
{
struct task_struct * t ;
p - > signal - > group_stop_count = 0 ;
for ( t = next_thread ( p ) ; t ! = p ; t = next_thread ( t ) ) {
/*
* Don ' t bother with already dead threads
*/
if ( t - > exit_state )
continue ;
[PATCH] ptrace/coredump/exit_group deadlock
I could seldom reproduce a deadlock with a task not killable in T state
(TASK_STOPPED, not TASK_TRACED) by attaching a NPTL threaded program to
gdb, by segfaulting the task and triggering a core dump while some other
task is executing exit_group and while one task is in ptrace_attached
TASK_STOPPED state (not TASK_TRACED yet). This originated from a gdb
bugreport (the fact gdb was segfaulting the task wasn't a kernel bug), but
I just incidentally noticed the gdb bug triggered a real kernel bug as
well.
Most threads hangs in exit_mm because the core_dumping is still going, the
core dumping hangs because the stopped task doesn't exit, the stopped task
can't wakeup because it has SIGNAL_GROUP_EXIT set, hence the deadlock.
To me it seems that the problem is that the force_sig_specific(SIGKILL) in
zap_threads is a noop if the task has PF_PTRACED set (like in this case
because gdb is attached). The __ptrace_unlink does nothing because the
signal->flags is set to SIGNAL_GROUP_EXIT|SIGNAL_STOP_DEQUEUED (verified).
The above info also shows that the stopped task hit a race and got the stop
signal (presumably by the ptrace_attach, only the attach, state is still
TASK_STOPPED and gdb hangs waiting the core before it can set it to
TASK_TRACED) after one of the thread invoked the core dump (it's the core
dump that sets signal->flags to SIGNAL_GROUP_EXIT).
So beside the fact nobody would wakeup the task in __ptrace_unlink (the
state is _not_ TASK_TRACED), there's a secondary problem in the signal
handling code, where a task should ignore the ptrace-sigstops as long as
SIGNAL_GROUP_EXIT is set (or the wakeup in __ptrace_unlink path wouldn't be
enough).
So I attempted to make this patch that seems to fix the problem. There
were various ways to fix it, perhaps you prefer a different one, I just
opted to the one that looked safer to me.
I also removed the clearing of the stopped bits from the zap_other_threads
(zap_other_threads was safe unlike zap_threads). I don't like useless
code, this whole NPTL signal/ptrace thing is already unreadable enough and
full of corner cases without confusing useless code into it to make it even
less readable. And if this code is really needed, then you may want to
explain why it's not being done in the other paths that sets
SIGNAL_GROUP_EXIT at least.
Even after this patch I still wonder who serializes the read of
p->ptrace in zap_threads.
Patch is called ptrace-core_dump-exit_group-deadlock-1.
This was the trace I've got:
test T ffff81003e8118c0 0 14305 1 14311 14309 (NOTLB)
ffff810058ccdde8 0000000000000082 000001f4000037e1 ffff810000000013
00000000000000f8 ffff81003e811b00 ffff81003e8118c0 ffff810011362100
0000000000000012 ffff810017ca4180
Call Trace:<ffffffff801317ed>{try_to_wake_up+893} <ffffffff80141677>{finish_stop+87}
<ffffffff8014367f>{get_signal_to_deliver+1359} <ffffffff8010d3ad>{do_signal+157}
<ffffffff8013deee>{ptrace_check_attach+222} <ffffffff80111575>{sys_ptrace+2293}
<ffffffff80131810>{default_wake_function+0} <ffffffff80196399>{sys_ioctl+73}
<ffffffff8010dd27>{sysret_signal+28} <ffffffff8010e00f>{ptregscall_common+103}
test D ffff810011362100 0 14309 1 14305 14312 (NOTLB)
ffff810053c81cf8 0000000000000082 0000000000000286 0000000000000001
0000000000000195 ffff810011362340 ffff810011362100 ffff81002e338040
ffff810001e0ca80 0000000000000001
Call Trace:<ffffffff801317ed>{try_to_wake_up+893} <ffffffff8044677d>{wait_for_completion+173}
<ffffffff80131810>{default_wake_function+0} <ffffffff80137435>{exit_mm+149}
<ffffffff801381af>{do_exit+479} <ffffffff80138d0c>{do_group_exit+252}
<ffffffff801436db>{get_signal_to_deliver+1451} <ffffffff8010d3ad>{do_signal+157}
<ffffffff8013deee>{ptrace_check_attach+222} <ffffffff80140850>{specific_send_sig_info+2
<ffffffff8014208a>{force_sig_info+186} <ffffffff804479a0>{do_int3+112}
<ffffffff8010e308>{retint_signal+61}
test D ffff81002e338040 0 14311 1 14716 14305 (NOTLB)
ffff81005ca8dcf8 0000000000000082 0000000000000286 0000000000000001
0000000000000120 ffff81002e338280 ffff81002e338040 ffff8100481cb740
ffff810001e0ca80 0000000000000001
Call Trace:<ffffffff801317ed>{try_to_wake_up+893} <ffffffff8044677d>{wait_for_completion+173}
<ffffffff80131810>{default_wake_function+0} <ffffffff80137435>{exit_mm+149}
<ffffffff801381af>{do_exit+479} <ffffffff80142d0e>{__dequeue_signal+558}
<ffffffff80138d0c>{do_group_exit+252} <ffffffff801436db>{get_signal_to_deliver+1451}
<ffffffff8010d3ad>{do_signal+157} <ffffffff8013deee>{ptrace_check_attach+222}
<ffffffff80140850>{specific_send_sig_info+208} <ffffffff8014208a>{force_sig_info+186}
<ffffffff804479a0>{do_int3+112} <ffffffff8010e308>{retint_signal+61}
test D ffff810017ca4180 0 14312 1 14309 13882 (NOTLB)
ffff81005d15fcb8 0000000000000082 ffff81005d15fc58 ffffffff80130816
0000000000000897 ffff810017ca43c0 ffff810017ca4180 ffff81003e8118c0
0000000000000082 ffffffff801317ed
Call Trace:<ffffffff80130816>{activate_task+150} <ffffffff801317ed>{try_to_wake_up+893}
<ffffffff8044677d>{wait_for_completion+173} <ffffffff80131810>{default_wake_function+0}
<ffffffff8018cdc3>{do_coredump+819} <ffffffff80445f52>{thread_return+82}
<ffffffff801436d4>{get_signal_to_deliver+1444} <ffffffff8010d3ad>{do_signal+157}
<ffffffff8013deee>{ptrace_check_attach+222} <ffffffff80140850>{specific_send_sig_info+2
<ffffffff804472e5>{_spin_unlock_irqrestore+5} <ffffffff8014208a>{force_sig_info+186}
<ffffffff804476ff>{do_general_protection+159} <ffffffff8010e308>{retint_signal+61}
Signed-off-by: Andrea Arcangeli <andrea@suse.de>
Cc: Roland McGrath <roland@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Linus Torvalds <torvalds@osdl.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-31 02:02:38 +03:00
/* SIGKILL will be handled before any pending SIGSTOP */
2005-04-17 02:20:36 +04:00
sigaddset ( & t - > pending . signal , SIGKILL ) ;
signal_wake_up ( t , 1 ) ;
}
}
2008-02-14 02:03:16 +03:00
int __fatal_signal_pending ( struct task_struct * tsk )
2007-12-06 19:15:50 +03:00
{
return sigismember ( & tsk - > pending . signal , SIGKILL ) ;
}
2008-02-01 04:40:29 +03:00
EXPORT_SYMBOL ( __fatal_signal_pending ) ;
2007-12-06 19:15:50 +03:00
2006-03-29 04:11:13 +04:00
struct sighand_struct * lock_task_sighand ( struct task_struct * tsk , unsigned long * flags )
{
struct sighand_struct * sighand ;
2008-04-30 11:52:37 +04:00
rcu_read_lock ( ) ;
2006-03-29 04:11:13 +04:00
for ( ; ; ) {
sighand = rcu_dereference ( tsk - > sighand ) ;
if ( unlikely ( sighand = = NULL ) )
break ;
spin_lock_irqsave ( & sighand - > siglock , * flags ) ;
if ( likely ( sighand = = tsk - > sighand ) )
break ;
spin_unlock_irqrestore ( & sighand - > siglock , * flags ) ;
}
2008-04-30 11:52:37 +04:00
rcu_read_unlock ( ) ;
2006-03-29 04:11:13 +04:00
return sighand ;
}
2005-04-17 02:20:36 +04:00
int group_send_sig_info ( int sig , struct siginfo * info , struct task_struct * p )
{
unsigned long flags ;
int ret ;
ret = check_kill_permission ( sig , info , p ) ;
2006-03-29 04:11:13 +04:00
if ( ! ret & & sig ) {
ret = - ESRCH ;
if ( lock_task_sighand ( p , & flags ) ) {
ret = __group_send_sig_info ( sig , info , p ) ;
unlock_task_sighand ( p , & flags ) ;
2006-01-08 12:01:38 +03:00
}
2005-04-17 02:20:36 +04:00
}
return ret ;
}
/*
2008-02-08 15:19:22 +03:00
* __kill_pgrp_info ( ) sends a signal to a process group : this is what the tty
2005-04-17 02:20:36 +04:00
* control characters do ( ^ C , ^ Z etc )
*/
2006-10-02 13:17:10 +04:00
int __kill_pgrp_info ( int sig , struct siginfo * info , struct pid * pgrp )
2005-04-17 02:20:36 +04:00
{
struct task_struct * p = NULL ;
int retval , success ;
success = 0 ;
retval = - ESRCH ;
2006-10-02 13:17:10 +04:00
do_each_pid_task ( pgrp , PIDTYPE_PGID , p ) {
2005-04-17 02:20:36 +04:00
int err = group_send_sig_info ( sig , info , p ) ;
success | = ! err ;
retval = err ;
2006-10-02 13:17:10 +04:00
} while_each_pid_task ( pgrp , PIDTYPE_PGID , p ) ;
2005-04-17 02:20:36 +04:00
return success ? 0 : retval ;
}
2006-10-02 13:17:10 +04:00
int kill_pid_info ( int sig , struct siginfo * info , struct pid * pid )
2005-04-17 02:20:36 +04:00
{
2008-02-08 15:19:18 +03:00
int error = - ESRCH ;
2005-04-17 02:20:36 +04:00
struct task_struct * p ;
2006-01-08 12:01:37 +03:00
rcu_read_lock ( ) ;
2008-02-08 15:19:18 +03:00
retry :
2006-10-02 13:17:10 +04:00
p = pid_task ( pid , PIDTYPE_PID ) ;
2008-02-08 15:19:18 +03:00
if ( p ) {
2005-04-17 02:20:36 +04:00
error = group_send_sig_info ( sig , info , p ) ;
2008-02-08 15:19:18 +03:00
if ( unlikely ( error = = - ESRCH ) )
/*
* The task was unhashed in between , try again .
* If it is dead , pid_task ( ) will return NULL ,
* if we race with de_thread ( ) it will find the
* new leader .
*/
goto retry ;
}
2006-01-08 12:01:37 +03:00
rcu_read_unlock ( ) ;
2008-04-30 11:52:45 +04:00
2005-04-17 02:20:36 +04:00
return error ;
}
2007-02-09 18:11:47 +03:00
/*
 * Numeric-pid front end to kill_pid_info(): resolves PID with
 * find_vpid() under rcu_read_lock() and forwards the result.
 */
int
kill_proc_info(int sig, struct siginfo *info, pid_t pid)
{
	int rc;

	rcu_read_lock();
	rc = kill_pid_info(sig, info, find_vpid(pid));
	rcu_read_unlock();

	return rc;
}
2006-10-02 13:17:28 +04:00
/*
 * Like kill_pid_info(), but doesn't use uid/euid of "current": the
 * caller supplies the uid, euid and security id to check against.
 *
 * Returns -EINVAL for an invalid signal number, -ESRCH when PID maps to
 * no task, -EPERM when a user-originated signal matches neither the
 * target's uid nor suid, the error from security_task_kill() if that
 * hook refuses, and otherwise the result of __group_send_sig_info()
 * (or 0 when SIG is 0 or the target has no sighand).
 */
int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
		      uid_t uid, uid_t euid, u32 secid)
{
	struct task_struct *p;
	unsigned long flags;
	int ret;

	if (!valid_signal(sig))
		return -EINVAL;

	read_lock(&tasklist_lock);
	p = pid_task(pid, PIDTYPE_PID);
	if (!p) {
		ret = -ESRCH;
	} else if ((info == SEND_SIG_NOINFO ||
		    (!is_si_special(info) && SI_FROMUSER(info))) &&
		   euid != p->suid && euid != p->uid &&
		   uid != p->suid && uid != p->uid) {
		ret = -EPERM;
	} else {
		ret = security_task_kill(p, info, sig, secid);
		if (!ret && sig && p->sighand) {
			spin_lock_irqsave(&p->sighand->siglock, flags);
			ret = __group_send_sig_info(sig, info, p);
			spin_unlock_irqrestore(&p->sighand->siglock, flags);
		}
	}
	read_unlock(&tasklist_lock);

	return ret;
}
EXPORT_SYMBOL_GPL(kill_pid_info_as_uid);
2005-04-17 02:20:36 +04:00
/*
* kill_something_info ( ) interprets pid in interesting ways just like kill ( 2 ) .
*
* POSIX specifies that kill ( - 1 , sig ) is unspecified , but what we have
* is probably wrong . Should make it like BSD or SYSV .
*/
2008-07-25 12:47:33 +04:00
static int kill_something_info ( int sig , struct siginfo * info , pid_t pid )
2005-04-17 02:20:36 +04:00
{
2007-02-12 11:52:55 +03:00
int ret ;
2008-02-08 15:19:22 +03:00
if ( pid > 0 ) {
rcu_read_lock ( ) ;
ret = kill_pid_info ( sig , info , find_vpid ( pid ) ) ;
rcu_read_unlock ( ) ;
return ret ;
}
read_lock ( & tasklist_lock ) ;
if ( pid ! = - 1 ) {
ret = __kill_pgrp_info ( sig , info ,
pid ? find_vpid ( - pid ) : task_pgrp ( current ) ) ;
} else {
2005-04-17 02:20:36 +04:00
int retval = 0 , count = 0 ;
struct task_struct * p ;
for_each_process ( p ) {
2007-10-19 10:40:18 +04:00
if ( p - > pid > 1 & & ! same_thread_group ( p , current ) ) {
2005-04-17 02:20:36 +04:00
int err = group_send_sig_info ( sig , info , p ) ;
+ + count ;
if ( err ! = - EPERM )
retval = err ;
}
}
2007-02-12 11:52:55 +03:00
ret = count ? retval : - ESRCH ;
2005-04-17 02:20:36 +04:00
}
2008-02-08 15:19:22 +03:00
read_unlock ( & tasklist_lock ) ;
2007-02-12 11:52:55 +03:00
return ret ;
2005-04-17 02:20:36 +04:00
}
/*
* These are for backward compatibility with the rest of the kernel source .
*/
/*
2008-04-30 11:52:51 +04:00
* The caller must ensure the task can ' t exit .
2005-04-17 02:20:36 +04:00
*/
int
send_sig_info ( int sig , struct siginfo * info , struct task_struct * p )
{
int ret ;
unsigned long flags ;
/*
* Make sure legacy kernel users don ' t send in bad values
* ( normal paths check this in check_kill_permission ) .
*/
2005-05-01 19:59:14 +04:00
if ( ! valid_signal ( sig ) )
2005-04-17 02:20:36 +04:00
return - EINVAL ;
spin_lock_irqsave ( & p - > sighand - > siglock , flags ) ;
ret = specific_send_sig_info ( sig , info , p ) ;
spin_unlock_irqrestore ( & p - > sighand - > siglock , flags ) ;
return ret ;
}
2005-10-31 02:03:44 +03:00
# define __si_special(priv) \
( ( priv ) ? SEND_SIG_PRIV : SEND_SIG_NOINFO )
2005-04-17 02:20:36 +04:00
int
send_sig ( int sig , struct task_struct * p , int priv )
{
2005-10-31 02:03:44 +03:00
return send_sig_info ( sig , __si_special ( priv ) , p ) ;
2005-04-17 02:20:36 +04:00
}
void
force_sig ( int sig , struct task_struct * p )
{
2005-10-31 02:03:44 +03:00
force_sig_info ( sig , SEND_SIG_PRIV , p ) ;
2005-04-17 02:20:36 +04:00
}
/*
* When things go south during signal handling , we
* will force a SIGSEGV . And if the signal that caused
* the problem was already a SIGSEGV , we ' ll want to
* make sure we don ' t even try to deliver the signal . .
*/
int
force_sigsegv ( int sig , struct task_struct * p )
{
if ( sig = = SIGSEGV ) {
unsigned long flags ;
spin_lock_irqsave ( & p - > sighand - > siglock , flags ) ;
p - > sighand - > action [ sig - 1 ] . sa . sa_handler = SIG_DFL ;
spin_unlock_irqrestore ( & p - > sighand - > siglock , flags ) ;
}
force_sig ( SIGSEGV , p ) ;
return 0 ;
}
2006-10-02 13:17:10 +04:00
/*
 * Send SIG to every task in the process group PID, holding
 * tasklist_lock for reading around __kill_pgrp_info().  A non-zero PRIV
 * selects SEND_SIG_PRIV, zero selects SEND_SIG_NOINFO.
 * Returns the result of __kill_pgrp_info().
 */
int kill_pgrp(struct pid *pid, int sig, int priv)
{
	struct siginfo *info = __si_special(priv);
	int rc;

	read_lock(&tasklist_lock);
	rc = __kill_pgrp_info(sig, info, pid);
	read_unlock(&tasklist_lock);

	return rc;
}
EXPORT_SYMBOL(kill_pgrp);
/*
 * Send SIG to the task identified by PID.  A non-zero PRIV selects
 * SEND_SIG_PRIV, zero selects SEND_SIG_NOINFO.
 * Returns the result of kill_pid_info().
 */
int kill_pid(struct pid *pid, int sig, int priv)
{
	struct siginfo *info = __si_special(priv);

	return kill_pid_info(sig, info, pid);
}
EXPORT_SYMBOL(kill_pid);
2005-04-17 02:20:36 +04:00
/*
* These functions support sending signals using preallocated sigqueue
* structures . This is needed " because realtime applications cannot
* afford to lose notifications of asynchronous events , like timer
* expirations or I / O completions " . In the case of Posix Timers
* we allocate the sigqueue structure from the timer_create . If this
* allocation fails we are able to report the failure to the application
* with an EAGAIN error .
*/
struct sigqueue * sigqueue_alloc ( void )
{
struct sigqueue * q ;
if ( ( q = __sigqueue_alloc ( current , GFP_KERNEL , 0 ) ) )
q - > flags | = SIGQUEUE_PREALLOC ;
return ( q ) ;
}
void sigqueue_free ( struct sigqueue * q )
{
unsigned long flags ;
2007-08-31 10:56:35 +04:00
spinlock_t * lock = & current - > sighand - > siglock ;
2005-04-17 02:20:36 +04:00
BUG_ON ( ! ( q - > flags & SIGQUEUE_PREALLOC ) ) ;
/*
2008-05-26 20:55:42 +04:00
* We must hold - > siglock while testing q - > list
* to serialize with collect_signal ( ) or with
2008-05-24 00:04:41 +04:00
* __exit_signal ( ) - > flush_sigqueue ( ) .
2005-04-17 02:20:36 +04:00
*/
2007-08-31 10:56:35 +04:00
spin_lock_irqsave ( lock , flags ) ;
2008-05-26 20:55:42 +04:00
q - > flags & = ~ SIGQUEUE_PREALLOC ;
/*
* If it is queued it will be freed when dequeued ,
* like the " regular " sigqueue .
*/
2007-08-31 10:56:35 +04:00
if ( ! list_empty ( & q - > list ) )
2008-05-26 20:55:42 +04:00
q = NULL ;
2007-08-31 10:56:35 +04:00
spin_unlock_irqrestore ( lock , flags ) ;
2008-05-26 20:55:42 +04:00
if ( q )
__sigqueue_free ( q ) ;
2005-04-17 02:20:36 +04:00
}
2008-04-30 11:52:57 +04:00
int send_sigqueue ( struct sigqueue * q , struct task_struct * t , int group )
2008-04-30 11:52:41 +04:00
{
2008-04-30 11:52:56 +04:00
int sig = q - > info . si_signo ;
2008-04-30 11:52:54 +04:00
struct sigpending * pending ;
2008-04-30 11:52:56 +04:00
unsigned long flags ;
int ret ;
2008-04-30 11:52:54 +04:00
2008-04-30 11:52:55 +04:00
BUG_ON ( ! ( q - > flags & SIGQUEUE_PREALLOC ) ) ;
2008-04-30 11:52:56 +04:00
ret = - 1 ;
if ( ! likely ( lock_task_sighand ( t , & flags ) ) )
goto ret ;
2008-04-30 11:52:59 +04:00
ret = 1 ; /* the signal is ignored */
if ( ! prepare_signal ( sig , t ) )
2008-04-30 11:52:56 +04:00
goto out ;
ret = 0 ;
2008-04-30 11:52:41 +04:00
if ( unlikely ( ! list_empty ( & q - > list ) ) ) {
/*
* If an SI_TIMER entry is already queue just increment
* the overrun count .
*/
BUG_ON ( q - > info . si_code ! = SI_TIMER ) ;
q - > info . si_overrun + + ;
2008-04-30 11:52:56 +04:00
goto out ;
2008-04-30 11:52:41 +04:00
}
posix-timers: fix posix_timer_event() vs dequeue_signal() race
The bug was reported and analysed by Mark McLoughlin <markmc@redhat.com>,
the patch is based on his and Roland's suggestions.
posix_timer_event() always rewrites the pre-allocated siginfo before sending
the signal. Most of the written info is the same all the time, but memset(0)
is very wrong. If ->sigq is queued we can race with collect_signal() which
can fail to find this siginfo looking at .si_signo, or copy_siginfo() can
copy the wrong .si_code/si_tid/etc.
In short, sys_timer_settime() can in fact stop the active timer, or the user
can receive the siginfo with the wrong .si_xxx values.
Move "memset(->info, 0)" from posix_timer_event() to alloc_posix_timer(),
change send_sigqueue() to set .si_overrun = 0 when ->sigq is not queued.
It would be nice to move the whole sigq->info initialization from send to
create path, but this is not easy to do without uglifying timer_create()
further.
As Roland rightly pointed out, we need more cleanups/fixes here, see the
"FIXME" comment in the patch. Hopefully this patch makes sense anyway, and
it can mask the most bad implications.
Reported-by: Mark McLoughlin <markmc@redhat.com>
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Mark McLoughlin <markmc@redhat.com>
Cc: Oliver Pinter <oliver.pntr@gmail.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: stable@kernel.org
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
kernel/posix-timers.c | 17 +++++++++++++----
kernel/signal.c | 1 +
2 files changed, 14 insertions(+), 4 deletions(-)
2008-07-23 20:52:05 +04:00
q - > info . si_overrun = 0 ;
2008-04-30 11:52:41 +04:00
signalfd_notify ( t , sig ) ;
2008-04-30 11:52:54 +04:00
pending = group ? & t - > signal - > shared_pending : & t - > pending ;
2008-04-30 11:52:41 +04:00
list_add_tail ( & q - > list , & pending - > list ) ;
sigaddset ( & pending - > signal , sig ) ;
2008-04-30 11:52:55 +04:00
complete_signal ( sig , t , group ) ;
2008-04-30 11:52:56 +04:00
out :
unlock_task_sighand ( t , & flags ) ;
ret :
return ret ;
2008-04-30 11:52:41 +04:00
}
2005-04-17 02:20:36 +04:00
/*
* Wake up any threads in the parent blocked in wait * syscalls .
*/
static inline void __wake_up_parent ( struct task_struct * p ,
struct task_struct * parent )
{
wake_up_interruptible_sync ( & parent - > signal - > wait_chldexit ) ;
}
/*
* Let a parent know about the death of a child .
* For a stopped / continued status change , use do_notify_parent_cldstop instead .
2008-07-26 06:45:54 +04:00
*
* Returns - 1 if our parent ignored us and so we ' ve switched to
* self - reaping , or else @ sig .
2005-04-17 02:20:36 +04:00
*/
2008-07-26 06:45:54 +04:00
int do_notify_parent ( struct task_struct * tsk , int sig )
2005-04-17 02:20:36 +04:00
{
struct siginfo info ;
unsigned long flags ;
struct sighand_struct * psig ;
timers: fix itimer/many thread hang
Overview
This patch reworks the handling of POSIX CPU timers, including the
ITIMER_PROF, ITIMER_VIRT timers and rlimit handling. It was put together
with the help of Roland McGrath, the owner and original writer of this code.
The problem we ran into, and the reason for this rework, has to do with using
a profiling timer in a process with a large number of threads. It appears
that the performance of the old implementation of run_posix_cpu_timers() was
at least O(n*3) (where "n" is the number of threads in a process) or worse.
Everything is fine with an increasing number of threads until the time taken
for that routine to run becomes the same as or greater than the tick time, at
which point things degrade rather quickly.
This patch fixes bug 9906, "Weird hang with NPTL and SIGPROF."
Code Changes
This rework corrects the implementation of run_posix_cpu_timers() to make it
run in constant time for a particular machine. (Performance may vary between
one machine and another depending upon whether the kernel is built as single-
or multiprocessor and, in the latter case, depending upon the number of
running processors.) To do this, at each tick we now update fields in
signal_struct as well as task_struct. The run_posix_cpu_timers() function
uses those fields to make its decisions.
We define a new structure, "task_cputime," to contain user, system and
scheduler times and use these in appropriate places:
struct task_cputime {
cputime_t utime;
cputime_t stime;
unsigned long long sum_exec_runtime;
};
This is included in the structure "thread_group_cputime," which is a new
substructure of signal_struct and which varies for uniprocessor versus
multiprocessor kernels. For uniprocessor kernels, it uses "task_cputime" as
a simple substructure, while for multiprocessor kernels it is a pointer:
struct thread_group_cputime {
struct task_cputime totals;
};
struct thread_group_cputime {
struct task_cputime *totals;
};
We also add a new task_cputime substructure directly to signal_struct, to
cache the earliest expiration of process-wide timers, and task_cputime also
replaces the it_*_expires fields of task_struct (used for earliest expiration
of thread timers). The "thread_group_cputime" structure contains process-wide
timers that are updated via account_user_time() and friends. In the non-SMP
case the structure is a simple aggregator; unfortunately in the SMP case that
simplicity was not achievable due to cache-line contention between CPUs (in
one measured case performance was actually _worse_ on a 16-cpu system than
the same test on a 4-cpu system, due to this contention). For SMP, the
thread_group_cputime counters are maintained as a per-cpu structure allocated
using alloc_percpu(). The timer functions update only the timer field in
the structure corresponding to the running CPU, obtained using per_cpu_ptr().
We define a set of inline functions in sched.h that we use to maintain the
thread_group_cputime structure and hide the differences between UP and SMP
implementations from the rest of the kernel. The thread_group_cputime_init()
function initializes the thread_group_cputime structure for the given task.
The thread_group_cputime_alloc() is a no-op for UP; for SMP it calls the
out-of-line function thread_group_cputime_alloc_smp() to allocate and fill
in the per-cpu structures and fields. The thread_group_cputime_free()
function, also a no-op for UP, in SMP frees the per-cpu structures. The
thread_group_cputime_clone_thread() function (also a UP no-op) for SMP calls
thread_group_cputime_alloc() if the per-cpu structures haven't yet been
allocated. The thread_group_cputime() function fills the task_cputime
structure it is passed with the contents of the thread_group_cputime fields;
in UP it's that simple but in SMP it must also safely check that tsk->signal
is non-NULL (if it is it just uses the appropriate fields of task_struct) and,
if so, sums the per-cpu values for each online CPU. Finally, the three
functions account_group_user_time(), account_group_system_time() and
account_group_exec_runtime() are used by timer functions to update the
respective fields of the thread_group_cputime structure.
Non-SMP operation is trivial and will not be mentioned further.
The per-cpu structure is always allocated when a task creates its first new
thread, via a call to thread_group_cputime_clone_thread() from copy_signal().
It is freed at process exit via a call to thread_group_cputime_free() from
cleanup_signal().
All functions that formerly summed utime/stime/sum_sched_runtime values from
all threads in the thread group now use thread_group_cputime() to
snapshot the values in the thread_group_cputime structure or the values in
the task structure itself if the per-cpu structure hasn't been allocated.
Finally, the code in kernel/posix-cpu-timers.c has changed quite a bit.
The run_posix_cpu_timers() function has been split into a fast path and a
slow path; the former safely checks whether there are any expired thread
timers and, if not, just returns, while the slow path does the heavy lifting.
With the dedicated thread group fields, timers are no longer "rebalanced" and
the process_timer_rebalance() function and related code has gone away. All
summing loops are gone and all code that used them now uses the
thread_group_cputime() inline. When process-wide timers are set, the new
task_cputime structure in signal_struct is used to cache the earliest
expiration; this is checked in the fast path.
Performance
The fix appears not to add significant overhead to existing operations. It
generally performs the same as the current code except in two cases, one in
which it performs slightly worse (Case 5 below) and one in which it performs
very significantly better (Case 2 below). Overall it's a wash except in those
two cases.
I've since done somewhat more involved testing on a dual-core Opteron system.
Case 1: With no itimer running, for a test with 100,000 threads, the fixed
kernel took 1428.5 seconds, 513 seconds more than the unfixed system,
all of which was spent in the system. There were twice as many
voluntary context switches with the fix as without it.
Case 2: With an itimer running at .01 second ticks and 4000 threads (the most
an unmodified kernel can handle), the fixed kernel ran the test in
eight percent of the time (5.8 seconds as opposed to 70 seconds) and
had better tick accuracy (.012 seconds per tick as opposed to .023
seconds per tick).
Case 3: A 4000-thread test with an initial timer tick of .01 second and an
interval of 10,000 seconds (i.e. a timer that ticks only once) had
very nearly the same performance in both cases: 6.3 seconds elapsed
for the fixed kernel versus 5.5 seconds for the unfixed kernel.
With fewer threads (eight in these tests), the Case 1 test ran in essentially
the same time on both the modified and unmodified kernels (5.2 seconds versus
5.8 seconds). The Case 2 test ran in about the same time as well, 5.9 seconds
versus 5.4 seconds but again with much better tick accuracy, .013 seconds per
tick versus .025 seconds per tick for the unmodified kernel.
Since the fix affected the rlimit code, I also tested soft and hard CPU limits.
Case 4: With a hard CPU limit of 20 seconds and eight threads (and an itimer
running), the modified kernel was very slightly favored in that while
it killed the process in 19.997 seconds of CPU time (5.002 seconds of
wall time), only .003 seconds of that was system time, the rest was
user time. The unmodified kernel killed the process in 20.001 seconds
of CPU (5.014 seconds of wall time) of which .016 seconds was system
time. Really, though, the results were too close to call. The results
were essentially the same with no itimer running.
Case 5: With a soft limit of 20 seconds and a hard limit of 2000 seconds
(where the hard limit would never be reached) and an itimer running,
the modified kernel exhibited worse tick accuracy than the unmodified
kernel: .050 seconds/tick versus .028 seconds/tick. Otherwise,
performance was almost indistinguishable. With no itimer running this
test exhibited virtually identical behavior and times in both cases.
In times past I did some limited performance testing. Those results are below.
On a four-cpu Opteron system without this fix, a sixteen-thread test executed
in 3569.991 seconds, of which user was 3568.435s and system was 1.556s. On
the same system with the fix, user and elapsed time were about the same, but
system time dropped to 0.007 seconds. Performance with eight, four and one
thread were comparable. Interestingly, the timer ticks with the fix seemed
more accurate: The sixteen-thread test with the fix received 149543 ticks
for 0.024 seconds per tick, while the same test without the fix received 58720
for 0.061 seconds per tick. Both cases were configured for an interval of
0.01 seconds. Again, the other tests were comparable. Each thread in this
test computed the primes up to 25,000,000.
I also did a test with a large number of threads, 100,000 threads, which is
impossible without the fix. In this case each thread computed the primes only
up to 10,000 (to make the runtime manageable). System time dominated, at
1546.968 seconds out of a total 2176.906 seconds (giving a user time of
629.938s). It received 147651 ticks for 0.015 seconds per tick, still quite
accurate. There is obviously no comparable test without the fix.
Signed-off-by: Frank Mayhar <fmayhar@google.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-12 20:54:39 +04:00
struct task_cputime cputime ;
2008-08-20 07:37:07 +04:00
int ret = sig ;
2005-04-17 02:20:36 +04:00
BUG_ON ( sig = = - 1 ) ;
/* do_notify_parent_cldstop should have been called instead. */
2007-12-06 19:07:35 +03:00
BUG_ON ( task_is_stopped_or_traced ( tsk ) ) ;
2005-04-17 02:20:36 +04:00
BUG_ON ( ! tsk - > ptrace & &
( tsk - > group_leader ! = tsk | | ! thread_group_empty ( tsk ) ) ) ;
info . si_signo = sig ;
info . si_errno = 0 ;
2007-10-19 10:40:14 +04:00
/*
* we are under tasklist_lock here so our parent is tied to
* us and cannot exit and release its namespace .
*
* the only thing it can do is to switch its nsproxy with sys_unshare ,
* but unsharing pid namespaces is not allowed , so we ' ll always
* see relevant namespace
*
* write_lock ( ) currently calls preempt_disable ( ) which is the
* same as rcu_read_lock ( ) , but according to Oleg , this is not
* correct to rely on this
*/
rcu_read_lock ( ) ;
info . si_pid = task_pid_nr_ns ( tsk , tsk - > parent - > nsproxy - > pid_ns ) ;
rcu_read_unlock ( ) ;
2005-04-17 02:20:36 +04:00
info . si_uid = tsk - > uid ;
timers: fix itimer/many thread hang
Overview
This patch reworks the handling of POSIX CPU timers, including the
ITIMER_PROF, ITIMER_VIRT timers and rlimit handling. It was put together
with the help of Roland McGrath, the owner and original writer of this code.
The problem we ran into, and the reason for this rework, has to do with using
a profiling timer in a process with a large number of threads. It appears
that the performance of the old implementation of run_posix_cpu_timers() was
at least O(n*3) (where "n" is the number of threads in a process) or worse.
Everything is fine with an increasing number of threads until the time taken
for that routine to run becomes the same as or greater than the tick time, at
which point things degrade rather quickly.
This patch fixes bug 9906, "Weird hang with NPTL and SIGPROF."
Code Changes
This rework corrects the implementation of run_posix_cpu_timers() to make it
run in constant time for a particular machine. (Performance may vary between
one machine and another depending upon whether the kernel is built as single-
or multiprocessor and, in the latter case, depending upon the number of
running processors.) To do this, at each tick we now update fields in
signal_struct as well as task_struct. The run_posix_cpu_timers() function
uses those fields to make its decisions.
We define a new structure, "task_cputime," to contain user, system and
scheduler times and use these in appropriate places:
struct task_cputime {
cputime_t utime;
cputime_t stime;
unsigned long long sum_exec_runtime;
};
This is included in the structure "thread_group_cputime," which is a new
substructure of signal_struct and which varies for uniprocessor versus
multiprocessor kernels. For uniprocessor kernels, it uses "task_cputime" as
a simple substructure, while for multiprocessor kernels it is a pointer:
struct thread_group_cputime {
struct task_cputime totals;
};
struct thread_group_cputime {
struct task_cputime *totals;
};
We also add a new task_cputime substructure directly to signal_struct, to
cache the earliest expiration of process-wide timers, and task_cputime also
replaces the it_*_expires fields of task_struct (used for earliest expiration
of thread timers). The "thread_group_cputime" structure contains process-wide
timers that are updated via account_user_time() and friends. In the non-SMP
case the structure is a simple aggregator; unfortunately in the SMP case that
simplicity was not achievable due to cache-line contention between CPUs (in
one measured case performance was actually _worse_ on a 16-cpu system than
the same test on a 4-cpu system, due to this contention). For SMP, the
thread_group_cputime counters are maintained as a per-cpu structure allocated
using alloc_percpu(). The timer functions update only the timer field in
the structure corresponding to the running CPU, obtained using per_cpu_ptr().
We define a set of inline functions in sched.h that we use to maintain the
thread_group_cputime structure and hide the differences between UP and SMP
implementations from the rest of the kernel. The thread_group_cputime_init()
function initializes the thread_group_cputime structure for the given task.
The thread_group_cputime_alloc() is a no-op for UP; for SMP it calls the
out-of-line function thread_group_cputime_alloc_smp() to allocate and fill
in the per-cpu structures and fields. The thread_group_cputime_free()
function, also a no-op for UP, in SMP frees the per-cpu structures. The
thread_group_cputime_clone_thread() function (also a UP no-op) for SMP calls
thread_group_cputime_alloc() if the per-cpu structures haven't yet been
allocated. The thread_group_cputime() function fills the task_cputime
structure it is passed with the contents of the thread_group_cputime fields;
in UP it's that simple but in SMP it must also safely check that tsk->signal
is non-NULL (if it is it just uses the appropriate fields of task_struct) and,
if so, sums the per-cpu values for each online CPU. Finally, the three
functions account_group_user_time(), account_group_system_time() and
account_group_exec_runtime() are used by timer functions to update the
respective fields of the thread_group_cputime structure.
Non-SMP operation is trivial and will not be mentioned further.
The per-cpu structure is always allocated when a task creates its first new
thread, via a call to thread_group_cputime_clone_thread() from copy_signal().
It is freed at process exit via a call to thread_group_cputime_free() from
cleanup_signal().
All functions that formerly summed utime/stime/sum_sched_runtime values from
all threads in the thread group now use thread_group_cputime() to
snapshot the values in the thread_group_cputime structure or the values in
the task structure itself if the per-cpu structure hasn't been allocated.
Finally, the code in kernel/posix-cpu-timers.c has changed quite a bit.
The run_posix_cpu_timers() function has been split into a fast path and a
slow path; the former safely checks whether there are any expired thread
timers and, if not, just returns, while the slow path does the heavy lifting.
With the dedicated thread group fields, timers are no longer "rebalanced" and
the process_timer_rebalance() function and related code has gone away. All
summing loops are gone and all code that used them now uses the
thread_group_cputime() inline. When process-wide timers are set, the new
task_cputime structure in signal_struct is used to cache the earliest
expiration; this is checked in the fast path.
Performance
The fix appears not to add significant overhead to existing operations. It
generally performs the same as the current code except in two cases, one in
which it performs slightly worse (Case 5 below) and one in which it performs
very significantly better (Case 2 below). Overall it's a wash except in those
two cases.
I've since done somewhat more involved testing on a dual-core Opteron system.
Case 1: With no itimer running, for a test with 100,000 threads, the fixed
kernel took 1428.5 seconds, 513 seconds more than the unfixed system,
all of which was spent in the system. There were twice as many
voluntary context switches with the fix as without it.
Case 2: With an itimer running at .01 second ticks and 4000 threads (the most
an unmodified kernel can handle), the fixed kernel ran the test in
eight percent of the time (5.8 seconds as opposed to 70 seconds) and
had better tick accuracy (.012 seconds per tick as opposed to .023
seconds per tick).
Case 3: A 4000-thread test with an initial timer tick of .01 second and an
interval of 10,000 seconds (i.e. a timer that ticks only once) had
very nearly the same performance in both cases: 6.3 seconds elapsed
for the fixed kernel versus 5.5 seconds for the unfixed kernel.
With fewer threads (eight in these tests), the Case 1 test ran in essentially
the same time on both the modified and unmodified kernels (5.2 seconds versus
5.8 seconds). The Case 2 test ran in about the same time as well, 5.9 seconds
versus 5.4 seconds but again with much better tick accuracy, .013 seconds per
tick versus .025 seconds per tick for the unmodified kernel.
Since the fix affected the rlimit code, I also tested soft and hard CPU limits.
Case 4: With a hard CPU limit of 20 seconds and eight threads (and an itimer
running), the modified kernel was very slightly favored in that while
it killed the process in 19.997 seconds of CPU time (5.002 seconds of
wall time), only .003 seconds of that was system time, the rest was
user time. The unmodified kernel killed the process in 20.001 seconds
of CPU (5.014 seconds of wall time) of which .016 seconds was system
time. Really, though, the results were too close to call. The results
were essentially the same with no itimer running.
Case 5: With a soft limit of 20 seconds and a hard limit of 2000 seconds
(where the hard limit would never be reached) and an itimer running,
the modified kernel exhibited worse tick accuracy than the unmodified
kernel: .050 seconds/tick versus .028 seconds/tick. Otherwise,
performance was almost indistinguishable. With no itimer running this
test exhibited virtually identical behavior and times in both cases.
In times past I did some limited performance testing. Those results are below.
On a four-cpu Opteron system without this fix, a sixteen-thread test executed
in 3569.991 seconds, of which user was 3568.435s and system was 1.556s. On
the same system with the fix, user and elapsed time were about the same, but
system time dropped to 0.007 seconds. Performance with eight, four and one
thread were comparable. Interestingly, the timer ticks with the fix seemed
more accurate: The sixteen-thread test with the fix received 149543 ticks
for 0.024 seconds per tick, while the same test without the fix received 58720
for 0.061 seconds per tick. Both cases were configured for an interval of
0.01 seconds. Again, the other tests were comparable. Each thread in this
test computed the primes up to 25,000,000.
I also did a test with a large number of threads, 100,000 threads, which is
impossible without the fix. In this case each thread computed the primes only
up to 10,000 (to make the runtime manageable). System time dominated, at
1546.968 seconds out of a total 2176.906 seconds (giving a user time of
629.938s). It received 147651 ticks for 0.015 seconds per tick, still quite
accurate. There is obviously no comparable test without the fix.
Signed-off-by: Frank Mayhar <fmayhar@google.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-12 20:54:39 +04:00
thread_group_cputime ( tsk , & cputime ) ;
info . si_utime = cputime_to_jiffies ( cputime . utime ) ;
info . si_stime = cputime_to_jiffies ( cputime . stime ) ;
2005-04-17 02:20:36 +04:00
info . si_status = tsk - > exit_code & 0x7f ;
if ( tsk - > exit_code & 0x80 )
info . si_code = CLD_DUMPED ;
else if ( tsk - > exit_code & 0x7f )
info . si_code = CLD_KILLED ;
else {
info . si_code = CLD_EXITED ;
info . si_status = tsk - > exit_code > > 8 ;
}
psig = tsk - > parent - > sighand ;
spin_lock_irqsave ( & psig - > siglock , flags ) ;
2005-11-10 17:22:18 +03:00
if ( ! tsk - > ptrace & & sig = = SIGCHLD & &
2005-04-17 02:20:36 +04:00
( psig - > action [ SIGCHLD - 1 ] . sa . sa_handler = = SIG_IGN | |
( psig - > action [ SIGCHLD - 1 ] . sa . sa_flags & SA_NOCLDWAIT ) ) ) {
/*
* We are exiting and our parent doesn ' t care . POSIX .1
* defines special semantics for setting SIGCHLD to SIG_IGN
* or setting the SA_NOCLDWAIT flag : we should be reaped
* automatically and not left for our parent ' s wait4 call .
* Rather than having the parent do it as a magic kind of
* signal handler , we just set this to tell do_exit that we
* can be cleaned up without becoming a zombie . Note that
* we still call __wake_up_parent in this case , because a
* blocked sys_wait4 might now return - ECHILD .
*
* Whether we send SIGCHLD or not for SA_NOCLDWAIT
* is implementation - defined : we do ( if you don ' t want
* it , just use SIG_IGN instead ) .
*/
2008-08-20 07:37:07 +04:00
ret = tsk - > exit_signal = - 1 ;
2005-04-17 02:20:36 +04:00
if ( psig - > action [ SIGCHLD - 1 ] . sa . sa_handler = = SIG_IGN )
2008-07-26 06:45:54 +04:00
sig = - 1 ;
2005-04-17 02:20:36 +04:00
}
2005-05-01 19:59:14 +04:00
if ( valid_signal ( sig ) & & sig > 0 )
2005-04-17 02:20:36 +04:00
__group_send_sig_info ( sig , & info , tsk - > parent ) ;
__wake_up_parent ( tsk , tsk - > parent ) ;
spin_unlock_irqrestore ( & psig - > siglock , flags ) ;
2008-07-26 06:45:54 +04:00
2008-08-20 07:37:07 +04:00
return ret ;
2005-04-17 02:20:36 +04:00
}
2006-03-29 04:11:29 +04:00
static void do_notify_parent_cldstop ( struct task_struct * tsk , int why )
2005-04-17 02:20:36 +04:00
{
struct siginfo info ;
unsigned long flags ;
2005-09-07 02:17:32 +04:00
struct task_struct * parent ;
2005-04-17 02:20:36 +04:00
struct sighand_struct * sighand ;
2006-03-29 04:11:29 +04:00
if ( tsk - > ptrace & PT_PTRACED )
2005-09-07 02:17:32 +04:00
parent = tsk - > parent ;
else {
tsk = tsk - > group_leader ;
parent = tsk - > real_parent ;
}
2005-04-17 02:20:36 +04:00
info . si_signo = SIGCHLD ;
info . si_errno = 0 ;
2007-10-19 10:40:14 +04:00
/*
* see comment in do_notify_parent ( ) abot the following 3 lines
*/
rcu_read_lock ( ) ;
info . si_pid = task_pid_nr_ns ( tsk , tsk - > parent - > nsproxy - > pid_ns ) ;
rcu_read_unlock ( ) ;
2005-04-17 02:20:36 +04:00
info . si_uid = tsk - > uid ;
2008-07-25 12:47:32 +04:00
info . si_utime = cputime_to_clock_t ( tsk - > utime ) ;
info . si_stime = cputime_to_clock_t ( tsk - > stime ) ;
2005-04-17 02:20:36 +04:00
info . si_code = why ;
switch ( why ) {
case CLD_CONTINUED :
info . si_status = SIGCONT ;
break ;
case CLD_STOPPED :
info . si_status = tsk - > signal - > group_exit_code & 0x7f ;
break ;
case CLD_TRAPPED :
info . si_status = tsk - > exit_code & 0x7f ;
break ;
default :
BUG ( ) ;
}
sighand = parent - > sighand ;
spin_lock_irqsave ( & sighand - > siglock , flags ) ;
if ( sighand - > action [ SIGCHLD - 1 ] . sa . sa_handler ! = SIG_IGN & &
! ( sighand - > action [ SIGCHLD - 1 ] . sa . sa_flags & SA_NOCLDSTOP ) )
__group_send_sig_info ( SIGCHLD , & info , parent ) ;
/*
* Even if SIGCHLD is not generated , we must wake up wait4 calls .
*/
__wake_up_parent ( tsk , parent ) ;
spin_unlock_irqrestore ( & sighand - > siglock , flags ) ;
}
2006-06-26 11:26:07 +04:00
static inline int may_ptrace_stop ( void )
{
if ( ! likely ( current - > ptrace & PT_PTRACED ) )
return 0 ;
/*
* Are we in the middle of do_coredump ?
* If so and our tracer is also part of the coredump stopping
* is a deadlock situation , and pointless because our tracer
* is dead so don ' t allow us to stop .
* If SIGKILL was already sent before the caller unlocked
2008-07-25 12:47:41 +04:00
* - > siglock we must see - > core_state ! = NULL . Otherwise it
2006-06-26 11:26:07 +04:00
* is safe to enter schedule ( ) .
*/
2008-07-25 12:47:41 +04:00
if ( unlikely ( current - > mm - > core_state ) & &
2006-06-26 11:26:07 +04:00
unlikely ( current - > mm = = current - > parent - > mm ) )
return 0 ;
return 1 ;
}
2008-02-06 12:37:37 +03:00
/*
* Return nonzero if there is a SIGKILL that should be waking us up .
* Called with the siglock held .
*/
static int sigkill_pending ( struct task_struct * tsk )
{
2008-07-25 12:47:37 +04:00
return sigismember ( & tsk - > pending . signal , SIGKILL ) | |
sigismember ( & tsk - > signal - > shared_pending . signal , SIGKILL ) ;
2008-02-06 12:37:37 +03:00
}
2005-04-17 02:20:36 +04:00
/*
* This must be called with current - > sighand - > siglock held .
*
* This should be the path for all ptrace stops .
* We always set current - > last_siginfo while stopped here .
* That makes it a way to test a stopped process for
* being ptrace - stopped vs being job - control - stopped .
*
2008-02-08 15:19:03 +03:00
* If we actually decide not to stop at all because the tracer
* is gone , we keep current - > exit_code unless clear_code .
2005-04-17 02:20:36 +04:00
*/
2008-02-08 15:19:03 +03:00
/*
* ptrace_stop - put current into TASK_TRACED and report the trap.
* @exit_code: value stored in current->exit_code for the stop.
* @clear_code: if nonzero, current->exit_code is reset to 0 when
* may_ptrace_stop() refuses the stop.
* @info: siginfo published through current->last_siginfo while stopped.
*
* The siglock is dropped and retaken inside, so it is held again on
* every return path. Returns early, without stopping, when a SIGKILL
* became pending while arch_ptrace_stop() ran without the lock.
*/
static void ptrace_stop ( int exit_code , int clear_code , siginfo_t * info )
2005-04-17 02:20:36 +04:00
{
2008-02-06 12:37:37 +03:00
if ( arch_ptrace_stop_needed ( exit_code , info ) ) {
/*
* The arch code has something special to do before a
* ptrace stop . This is allowed to block , e . g . for faults
* on user stack pages . We can ' t keep the siglock while
* calling arch_ptrace_stop , so we must release it now .
* To preserve proper semantics , we must do this before
* any signal bookkeeping like checking group_stop_count .
* Meanwhile , a SIGKILL could come in before we retake the
* siglock . That must prevent us from sleeping in TASK_TRACED .
* So after regaining the lock , we must check for SIGKILL .
*/
spin_unlock_irq ( & current - > sighand - > siglock ) ;
arch_ptrace_stop ( exit_code , info ) ;
spin_lock_irq ( & current - > sighand - > siglock ) ;
2008-07-25 12:47:37 +04:00
if ( sigkill_pending ( current ) )
return ;
2008-02-06 12:37:37 +03:00
}
2005-04-17 02:20:36 +04:00
/*
* If there is a group stop in progress ,
* we must participate in the bookkeeping .
*/
if ( current - > signal - > group_stop_count > 0 )
- - current - > signal - > group_stop_count ;
/*
* Recorded for the duration of the stop; last_siginfo is cleared
* again once we are back and hold the siglock.
*/
current - > last_siginfo = info ;
current - > exit_code = exit_code ;
/* Let the debugger run. */
2008-02-06 12:36:13 +03:00
__set_current_state ( TASK_TRACED ) ;
2005-04-17 02:20:36 +04:00
spin_unlock_irq ( & current - > sighand - > siglock ) ;
/*
* tasklist_lock is read-held across the may_ptrace_stop() check and
* the parent notification, and released before schedule().
*/
read_lock ( & tasklist_lock ) ;
2008-07-25 12:47:37 +04:00
if ( may_ptrace_stop ( ) ) {
2006-03-29 04:11:29 +04:00
do_notify_parent_cldstop ( current , CLD_TRAPPED ) ;
2005-04-17 02:20:36 +04:00
read_unlock ( & tasklist_lock ) ;
schedule ( ) ;
} else {
/*
* By the time we got the lock , our tracer went away .
2008-02-08 15:19:00 +03:00
* Don ' t drop the lock yet , another tracer may come .
2005-04-17 02:20:36 +04:00
*/
2008-02-08 15:19:00 +03:00
__set_current_state ( TASK_RUNNING ) ;
2008-02-08 15:19:03 +03:00
if ( clear_code )
current - > exit_code = 0 ;
2008-02-08 15:19:00 +03:00
read_unlock ( & tasklist_lock ) ;
2005-04-17 02:20:36 +04:00
}
2008-03-04 07:22:05 +03:00
/*
* While in TASK_TRACED , we were considered " frozen enough " .
* Now that we woke up , it ' s crucial if we ' re supposed to be
* frozen that we freeze now before running anything substantial .
*/
try_to_freeze ( ) ;
2005-04-17 02:20:36 +04:00
/*
* We are back . Now reacquire the siglock before touching
* last_siginfo , so that we are sure to have synchronized with
* any signal - sending on another CPU that wants to examine it .
*/
spin_lock_irq ( & current - > sighand - > siglock ) ;
current - > last_siginfo = NULL ;
/*
* Queued signals ignored us while we were stopped for tracing .
* So check for any that we should take before resuming user mode .
2007-06-06 14:59:00 +04:00
* This sets TIF_SIGPENDING , but never clears it .
2005-04-17 02:20:36 +04:00
*/
2007-06-06 14:59:00 +04:00
recalc_sigpending_tsk ( current ) ;
2005-04-17 02:20:36 +04:00
}
void ptrace_notify ( int exit_code )
{
siginfo_t info ;
BUG_ON ( ( exit_code & ( 0x7f | ~ 0xffff ) ) ! = SIGTRAP ) ;
memset ( & info , 0 , sizeof info ) ;
info . si_signo = SIGTRAP ;
info . si_code = exit_code ;
2007-10-19 10:40:14 +04:00
info . si_pid = task_pid_vnr ( current ) ;
2005-04-17 02:20:36 +04:00
info . si_uid = current - > uid ;
/* Let the debugger run. */
spin_lock_irq ( & current - > sighand - > siglock ) ;
2008-02-08 15:19:03 +03:00
ptrace_stop ( exit_code , 1 , & info ) ;
2005-04-17 02:20:36 +04:00
spin_unlock_irq ( & current - > sighand - > siglock ) ;
}
static void
finish_stop ( int stop_count )
{
/*
* If there are no other threads in the group , or if there is
* a group stop in progress and we are the last to stop ,
* report to the parent . When ptraced , every thread reports itself .
*/
2008-07-26 06:45:54 +04:00
if ( tracehook_notify_jctl ( stop_count = = 0 , CLD_STOPPED ) ) {
2006-03-29 04:11:29 +04:00
read_lock ( & tasklist_lock ) ;
do_notify_parent_cldstop ( current , CLD_STOPPED ) ;
read_unlock ( & tasklist_lock ) ;
}
2005-09-07 02:17:32 +04:00
2006-12-13 11:34:28 +03:00
do {
schedule ( ) ;
} while ( try_to_freeze ( ) ) ;
2005-04-17 02:20:36 +04:00
/*
* Now we don ' t run again until continued .
*/
current - > exit_code = 0 ;
}
/*
* This performs the stopping for SIGSTOP and other stop signals .
* We have to stop all threads in the thread group .
* Returns nonzero if we ' ve actually stopped and released the siglock .
* Returns zero if we didn ' t stop and still hold the siglock .
*/
2006-03-29 04:11:22 +04:00
/*
* do_signal_stop - take part in, or start, a thread group stop.
* @signr: stored in sig->group_exit_code when a new group stop is started;
* not used when joining a stop that is already in progress.
*
* Returns 0 without stopping (siglock untouched) when no stop is in
* progress and either SIGNAL_STOP_DEQUEUED is not set or the group is
* exiting. Otherwise sets TASK_STOPPED, drops the siglock, sleeps in
* finish_stop() and returns 1.
*/
static int do_signal_stop ( int signr )
2005-04-17 02:20:36 +04:00
{
struct signal_struct * sig = current - > signal ;
2006-03-29 04:11:28 +04:00
int stop_count ;
2005-04-17 02:20:36 +04:00
if ( sig - > group_stop_count > 0 ) {
/*
* There is a group stop in progress . We don ' t need to
* start another one .
*/
stop_count = - - sig - > group_stop_count ;
2006-03-29 04:11:28 +04:00
} else {
2008-02-05 09:27:24 +03:00
struct task_struct * t ;
2008-07-25 12:47:31 +04:00
if ( ! likely ( sig - > flags & SIGNAL_STOP_DEQUEUED ) | |
2008-04-30 11:52:36 +04:00
unlikely ( signal_group_exit ( sig ) ) )
2008-02-05 09:27:24 +03:00
return 0 ;
2005-04-17 02:20:36 +04:00
/*
* There is no group stop already in progress .
2006-03-29 04:11:22 +04:00
* We must initiate one now .
2005-04-17 02:20:36 +04:00
*/
2006-03-29 04:11:22 +04:00
sig - > group_exit_code = signr ;
2005-04-17 02:20:36 +04:00
2006-03-29 04:11:22 +04:00
stop_count = 0 ;
/*
* Walk every other thread in the group; each one that is neither
* exiting nor already stopped/traced is counted and woken so that
* it can join the stop.
*/
for ( t = next_thread ( current ) ; t ! = current ; t = next_thread ( t ) )
2005-04-17 02:20:36 +04:00
/*
2006-03-29 04:11:22 +04:00
* Setting state to TASK_STOPPED for a group
* stop is always done with the siglock held ,
* so this check has no races .
2005-04-17 02:20:36 +04:00
*/
2008-02-08 15:19:12 +03:00
if ( ! ( t - > flags & PF_EXITING ) & &
2007-12-06 19:07:35 +03:00
! task_is_stopped_or_traced ( t ) ) {
2006-03-29 04:11:22 +04:00
stop_count + + ;
signal_wake_up ( t , 0 ) ;
}
sig - > group_stop_count = stop_count ;
2005-04-17 02:20:36 +04:00
}
2006-03-29 04:11:28 +04:00
/*
* Nobody else is left to stop, so the group as a whole is stopped.
* NOTE(review): this is a plain assignment, so every other bit in
* sig->flags is overwritten here - confirm that this is intended.
*/
if ( stop_count = = 0 )
sig - > flags = SIGNAL_STOP_STOPPED ;
current - > exit_code = sig - > group_exit_code ;
__set_current_state ( TASK_STOPPED ) ;
spin_unlock_irq ( & current - > sighand - > siglock ) ;
2005-04-17 02:20:36 +04:00
finish_stop ( stop_count ) ;
return 1 ;
}
2008-04-18 05:44:38 +04:00
/*
* ptrace_signal - let the tracer inspect, replace or cancel a signal.
* @signr: signal that is about to be handled.
* @info: siginfo for @signr; rewritten here if the signal number changed.
* @regs, @cookie: only passed through to ptrace_signal_deliver().
*
* Returns @signr untouched when current does not have PT_PTRACED set.
* Otherwise returns the signal number found in current->exit_code after
* the ptrace stop, or 0 when that value is 0 or when the resulting
* signal is blocked and has been requeued.
*/
static int ptrace_signal ( int signr , siginfo_t * info ,
struct pt_regs * regs , void * cookie )
{
if ( ! ( current - > ptrace & PT_PTRACED ) )
return signr ;
ptrace_signal_deliver ( regs , cookie ) ;
/* Let the debugger run. */
ptrace_stop ( signr , 0 , info ) ;
/* We're back. Did the debugger cancel the sig? */
signr = current - > exit_code ;
if ( signr = = 0 )
return signr ;
/* The value has been consumed; reset it. */
current - > exit_code = 0 ;
/* Update the siginfo structure if the signal has
changed . If the debugger wanted something
specific in the siginfo structure then it should
have updated * info via PTRACE_SETSIGINFO . */
if ( signr ! = info - > si_signo ) {
/* Describe the new signal as SI_USER, with current->parent as sender. */
info - > si_signo = signr ;
info - > si_errno = 0 ;
info - > si_code = SI_USER ;
info - > si_pid = task_pid_vnr ( current - > parent ) ;
info - > si_uid = current - > parent - > uid ;
}
/* If the (new) signal is now blocked, requeue it. */
if ( sigismember ( & current - > blocked , signr ) ) {
specific_send_sig_info ( signr , info , current ) ;
signr = 0 ;
}
return signr ;
}
2005-04-17 02:20:36 +04:00
/*
 * get_signal_to_deliver - pick the next signal the current task must act on.
 * @info:      filled in with the siginfo of the selected signal
 * @return_ka: receives a copy of the k_sigaction when a user handler must run
 * @regs:      register state, passed on to the tracing, ptrace and
 *             core-dump helpers
 * @cookie:    opaque value handed to ptrace_signal()
 *
 * Loops, under sighand->siglock, over signals obtained either from
 * tracehook_get_signal() or from dequeue_signal().  Ignored signals are
 * skipped, stop signals go through do_signal_stop(), and signals whose
 * default action is fatal end in do_coredump()/do_group_exit().
 *
 * Returns the signal number for which *return_ka was filled in, or 0 when
 * dequeue_signal() has nothing left to deliver.  siglock is released on
 * every return path.
 */
int get_signal_to_deliver ( siginfo_t * info , struct k_sigaction * return_ka ,
struct pt_regs * regs , void * cookie )
{
2008-04-30 11:52:47 +04:00
struct sighand_struct * sighand = current - > sighand ;
struct signal_struct * signal = current - > signal ;
int signr ;
2005-04-17 02:20:36 +04:00
2008-03-04 07:22:05 +03:00
relock :
/*
* We ' ll jump back here after any time we were stopped in TASK_STOPPED .
* While in TASK_STOPPED , we were considered " frozen enough " .
* Now that we woke up , it ' s crucial if we ' re supposed to be
* frozen that we freeze now before running anything substantial .
*/
2006-03-23 14:00:05 +03:00
try_to_freeze ( ) ;
2008-04-30 11:52:47 +04:00
spin_lock_irq ( & sighand - > siglock ) ;
2008-04-30 11:53:00 +04:00
/*
* Every stopped thread goes here after wakeup . Check to see if
* we should notify the parent , prepare_signal ( SIGCONT ) encodes
* the CLD_ si_code into SIGNAL_CLD_MASK bits .
*/
2008-04-30 11:52:47 +04:00
if ( unlikely ( signal - > flags & SIGNAL_CLD_MASK ) ) {
int why = ( signal - > flags & SIGNAL_STOP_CONTINUED )
2008-04-30 11:52:44 +04:00
? CLD_CONTINUED : CLD_STOPPED ;
2008-04-30 11:52:47 +04:00
signal - > flags & = ~ SIGNAL_CLD_MASK ;
/* siglock is dropped before tasklist_lock is taken for the notification. */
spin_unlock_irq ( & sighand - > siglock ) ;
2008-04-30 11:52:44 +04:00
2008-07-26 06:45:54 +04:00
if ( unlikely ( ! tracehook_notify_jctl ( 1 , why ) ) )
goto relock ;
2008-04-30 11:52:44 +04:00
read_lock ( & tasklist_lock ) ;
do_notify_parent_cldstop ( current - > group_leader , why ) ;
read_unlock ( & tasklist_lock ) ;
goto relock ;
}
2005-04-17 02:20:36 +04:00
for ( ; ; ) {
struct k_sigaction * ka ;
2008-04-30 11:52:47 +04:00
/* A group stop is in progress: take part in it, then start over. */
if ( unlikely ( signal - > group_stop_count > 0 ) & &
2008-02-05 09:27:24 +03:00
do_signal_stop ( 0 ) )
2005-04-17 02:20:36 +04:00
goto relock ;
2008-07-26 06:45:53 +04:00
/*
* Tracing can induce an artificial signal and choose sigaction .
* The return value in @ signr determines the default action ,
* but @ info - > si_signo is the signal number we will report .
*/
signr = tracehook_get_signal ( current , regs , info , return_ka ) ;
if ( unlikely ( signr < 0 ) )
goto relock ;
if ( unlikely ( signr ! = 0 ) )
ka = return_ka ;
else {
signr = dequeue_signal ( current , & current - > blocked ,
info ) ;
2005-04-17 02:20:36 +04:00
2008-04-18 05:44:38 +04:00
if ( ! signr )
2008-07-26 06:45:53 +04:00
break ; /* will return 0 */
/* SIGKILL is never passed through ptrace_signal ( ) . */
if ( signr ! = SIGKILL ) {
signr = ptrace_signal ( signr , info ,
regs , cookie ) ;
if ( ! signr )
continue ;
}
ka = & sighand - > action [ signr - 1 ] ;
2005-04-17 02:20:36 +04:00
}
if ( ka - > sa . sa_handler = = SIG_IGN ) /* Do nothing. */
continue ;
if ( ka - > sa . sa_handler ! = SIG_DFL ) {
/* Run the handler. */
* return_ka = * ka ;
/* SA_ONESHOT: the handler is reset to SIG_DFL after being copied out. */
if ( ka - > sa . sa_flags & SA_ONESHOT )
ka - > sa . sa_handler = SIG_DFL ;
break ; /* will return non-zero "signr" value */
}
/*
* Now we are doing the default action for this signal .
*/
if ( sig_kernel_ignore ( signr ) ) /* Default is nothing. */
continue ;
2006-12-08 13:38:01 +03:00
/*
2007-10-19 10:40:13 +04:00
* Global init gets no signals it doesn ' t want .
2006-12-08 13:38:01 +03:00
*/
2008-04-30 11:53:03 +04:00
if ( unlikely ( signal - > flags & SIGNAL_UNKILLABLE ) & &
! signal_group_exit ( signal ) )
2005-04-17 02:20:36 +04:00
continue ;
if ( sig_kernel_stop ( signr ) ) {
/*
* The default action is to stop all threads in
* the thread group . The job control signals
* do nothing in an orphaned pgrp , but SIGSTOP
* always works . Note that siglock needs to be
* dropped during the call to is_orphaned_pgrp ( )
* because of lock ordering with tasklist_lock .
* This allows an intervening SIGCONT to be posted .
* We need to check for that and bail out if necessary .
*/
if ( signr ! = SIGSTOP ) {
2008-04-30 11:52:47 +04:00
spin_unlock_irq ( & sighand - > siglock ) ;
2005-04-17 02:20:36 +04:00
/* signals can be posted during this window */
2007-02-12 11:52:58 +03:00
if ( is_current_pgrp_orphaned ( ) )
2005-04-17 02:20:36 +04:00
goto relock ;
2008-04-30 11:52:47 +04:00
spin_lock_irq ( & sighand - > siglock ) ;
2005-04-17 02:20:36 +04:00
}
2008-07-26 06:45:53 +04:00
if ( likely ( do_signal_stop ( info - > si_signo ) ) ) {
2005-04-17 02:20:36 +04:00
/* It released the siglock. */
goto relock ;
}
/*
* We didn ' t actually stop , due to a race
* with SIGCONT or something like that .
*/
continue ;
}
2008-04-30 11:52:47 +04:00
/* Fatal default action: siglock is released before dumping core / exiting. */
spin_unlock_irq ( & sighand - > siglock ) ;
2005-04-17 02:20:36 +04:00
/*
* Anything else is fatal , maybe with a core dump .
*/
current - > flags | = PF_SIGNALED ;
2008-04-30 11:52:58 +04:00
2005-04-17 02:20:36 +04:00
if ( sig_kernel_coredump ( signr ) ) {
2008-04-30 11:52:58 +04:00
if ( print_fatal_signals )
2008-07-26 06:45:53 +04:00
print_fatal_signal ( regs , info - > si_signo ) ;
2005-04-17 02:20:36 +04:00
/*
* If it was able to dump core , this kills all
* other threads in the group and synchronizes with
* their demise . If we lost the race with another
* thread getting here , it set group_exit_code
* first and our do_group_exit call below will use
* that value and ignore the one we pass it .
*/
2008-07-26 06:45:53 +04:00
do_coredump ( info - > si_signo , info - > si_signo , regs ) ;
2005-04-17 02:20:36 +04:00
}
/*
* Death signals , no core dump .
*/
2008-07-26 06:45:53 +04:00
do_group_exit ( info - > si_signo ) ;
2005-04-17 02:20:36 +04:00
/* NOTREACHED */
}
2008-04-30 11:52:47 +04:00
/* Reached via "break" only: siglock is still held here. */
spin_unlock_irq ( & sighand - > siglock ) ;
2005-04-17 02:20:36 +04:00
return signr ;
}
2008-02-08 15:19:12 +03:00
void exit_signals ( struct task_struct * tsk )
{
int group_stop = 0 ;
2008-02-08 15:19:13 +03:00
struct task_struct * t ;
2008-02-08 15:19:12 +03:00
2008-02-08 15:19:13 +03:00
if ( thread_group_empty ( tsk ) | | signal_group_exit ( tsk - > signal ) ) {
tsk - > flags | = PF_EXITING ;
return ;
2008-02-08 15:19:12 +03:00
}
2008-02-08 15:19:13 +03:00
spin_lock_irq ( & tsk - > sighand - > siglock ) ;
2008-02-08 15:19:12 +03:00
/*
* From now this task is not visible for group - wide signals ,
* see wants_signal ( ) , do_signal_stop ( ) .
*/
tsk - > flags | = PF_EXITING ;
2008-02-08 15:19:13 +03:00
if ( ! signal_pending ( tsk ) )
goto out ;
/* It could be that __group_complete_signal() choose us to
* notify about group - wide signal . Another thread should be
* woken now to take the signal since we will not .
*/
for ( t = tsk ; ( t = next_thread ( t ) ) ! = tsk ; )
if ( ! signal_pending ( t ) & & ! ( t - > flags & PF_EXITING ) )
recalc_sigpending_and_wake ( t ) ;
if ( unlikely ( tsk - > signal - > group_stop_count ) & &
! - - tsk - > signal - > group_stop_count ) {
tsk - > signal - > flags = SIGNAL_STOP_STOPPED ;
group_stop = 1 ;
}
out :
2008-02-08 15:19:12 +03:00
spin_unlock_irq ( & tsk - > sighand - > siglock ) ;
2008-07-26 06:45:54 +04:00
if ( unlikely ( group_stop ) & & tracehook_notify_jctl ( 1 , CLD_STOPPED ) ) {
2008-02-08 15:19:12 +03:00
read_lock ( & tasklist_lock ) ;
do_notify_parent_cldstop ( tsk , CLD_STOPPED ) ;
read_unlock ( & tasklist_lock ) ;
}
}
2005-04-17 02:20:36 +04:00
/*
 * Signal helpers exported from this file.  Note that dequeue_signal is
 * the only one exported with EXPORT_SYMBOL_GPL.
 */
EXPORT_SYMBOL ( recalc_sigpending ) ;
EXPORT_SYMBOL_GPL ( dequeue_signal ) ;
EXPORT_SYMBOL ( flush_signals ) ;
EXPORT_SYMBOL ( force_sig ) ;
EXPORT_SYMBOL ( send_sig ) ;
EXPORT_SYMBOL ( send_sig_info ) ;
EXPORT_SYMBOL ( sigprocmask ) ;
EXPORT_SYMBOL ( block_all_signals ) ;
EXPORT_SYMBOL ( unblock_all_signals ) ;
/*
* System call entry points .
*/
asmlinkage long sys_restart_syscall ( void )
{
struct restart_block * restart = & current_thread_info ( ) - > restart_block ;
return restart - > fn ( restart ) ;
}
/*
 * do_no_restart_syscall - restart callback that refuses to restart.
 * @param: restart block (unused)
 *
 * Always returns -EINTR.
 */
long do_no_restart_syscall(struct restart_block *param)
{
	(void)param;

	return -EINTR;
}
/*
* We don ' t need to get the kernel lock - this is all local to this
* particular thread . . ( and that ' s good , because this is _heavily_
* used by various programs )
*/
/*
* This is also useful for kernel threads that want to temporarily
* ( or permanently ) block certain signals .
*
* NOTE ! Unlike the user - mode sys_sigprocmask ( ) , the kernel
* interface happily blocks " unblockable " signals like SIGKILL
* and friends .
*/
/*
 * @how:    SIG_BLOCK, SIG_UNBLOCK or SIG_SETMASK
 * @set:    signals to add, remove or install, depending on @how
 * @oldset: if non-NULL, receives the mask as it was before the change
 *
 * Returns 0, or -EINVAL for an unknown @how (the mask is then left
 * untouched, but @oldset is still filled in).
 */
int sigprocmask(int how, sigset_t *set, sigset_t *oldset)
{
	int error = 0;

	spin_lock_irq(&current->sighand->siglock);

	if (oldset)
		*oldset = current->blocked;

	if (how == SIG_BLOCK)
		sigorsets(&current->blocked, &current->blocked, set);
	else if (how == SIG_UNBLOCK)
		signandsets(&current->blocked, &current->blocked, set);
	else if (how == SIG_SETMASK)
		current->blocked = *set;
	else
		error = -EINVAL;

	recalc_sigpending();
	spin_unlock_irq(&current->sighand->siglock);

	return error;
}
/*
 * sys_rt_sigprocmask - examine and/or change the caller's blocked signals.
 * @how:        SIG_BLOCK, SIG_UNBLOCK or SIG_SETMASK (only used if @set)
 * @set:        user pointer to the new set, or NULL to only query
 * @oset:       user pointer receiving the previous mask, or NULL
 * @sigsetsize: must equal sizeof(sigset_t)
 *
 * SIGKILL and SIGSTOP are silently removed from @set.  Returns 0,
 * -EINVAL for a bad size or @how, or -EFAULT on a bad user pointer.
 */
asmlinkage long
sys_rt_sigprocmask(int how, sigset_t __user *set, sigset_t __user *oset, size_t sigsetsize)
{
	sigset_t requested, previous;
	int rc;

	/* XXX: Don't preclude handling different sized sigset_t's. */
	if (sigsetsize != sizeof(sigset_t))
		return -EINVAL;

	if (set) {
		if (copy_from_user(&requested, set, sizeof(*set)))
			return -EFAULT;
		sigdelsetmask(&requested, sigmask(SIGKILL) | sigmask(SIGSTOP));

		/* sigprocmask() also captures the old mask for us. */
		rc = sigprocmask(how, &requested, &previous);
		if (rc)
			return rc;
	} else if (oset) {
		/* Query only: snapshot the mask under siglock. */
		spin_lock_irq(&current->sighand->siglock);
		previous = current->blocked;
		spin_unlock_irq(&current->sighand->siglock);
	}

	if (oset && copy_to_user(oset, &previous, sizeof(*oset)))
		return -EFAULT;

	return 0;
}
/*
 * do_sigpending - report signals that are both pending and blocked.
 * @set:        user buffer receiving the result
 * @sigsetsize: number of bytes to copy out; at most sizeof(sigset_t)
 *
 * The result is the union of the thread-private and the shared pending
 * sets, restricted to the signals currently blocked.  Returns 0,
 * -EINVAL if @sigsetsize is too large, or -EFAULT if the copy fails.
 */
long do_sigpending(void __user *set, unsigned long sigsetsize)
{
	sigset_t pending;

	if (sigsetsize > sizeof(sigset_t))
		return -EINVAL;

	spin_lock_irq(&current->sighand->siglock);
	sigorsets(&pending, &current->pending.signal,
		  &current->signal->shared_pending.signal);
	spin_unlock_irq(&current->sighand->siglock);

	/* Outside the lock because only this thread touches it. */
	sigandsets(&pending, &current->blocked, &pending);

	if (copy_to_user(set, &pending, sigsetsize))
		return -EFAULT;

	return 0;
}
/*
 * sys_rt_sigpending - system call wrapper around do_sigpending().
 * @set:        user buffer receiving the pending set
 * @sigsetsize: size of that buffer in bytes
 */
asmlinkage long
sys_rt_sigpending(sigset_t __user *set, size_t sigsetsize)
{
	long rc = do_sigpending(set, sigsetsize);

	return rc;
}
# ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER
/*
 * copy_siginfo_to_user - copy a kernel siginfo_t out to user space.
 * @to:   destination in user space
 * @from: kernel siginfo to copy
 *
 * A negative si_code causes the whole structure to be copied verbatim.
 * Otherwise only the three generic ints and the union member selected by
 * (si_code & __SI_MASK) are written.
 *
 * Returns 0 on success; non-zero (-EFAULT) if any user access failed.
 */
int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from)
{
	int err;

	if (!access_ok(VERIFY_WRITE, to, sizeof(siginfo_t)))
		return -EFAULT;

	if (from->si_code < 0) {
		if (__copy_to_user(to, from, sizeof(siginfo_t)))
			return -EFAULT;
		return 0;
	}

	/*
	 * If you change siginfo_t structure, please be sure
	 * this code is fixed accordingly.
	 * Please remember to update the signalfd_copyinfo() function
	 * inside fs/signalfd.c too, in case siginfo_t changes.
	 * It should never copy any pad contained in the structure
	 * to avoid security leaks, but must copy the generic
	 * 3 ints plus the relevant union member.
	 */
	err = __put_user(from->si_signo, &to->si_signo);
	err |= __put_user(from->si_errno, &to->si_errno);
	err |= __put_user((short)from->si_code, &to->si_code);

	switch (from->si_code & __SI_MASK) {
	case __SI_TIMER:
		err |= __put_user(from->si_tid, &to->si_tid);
		err |= __put_user(from->si_overrun, &to->si_overrun);
		err |= __put_user(from->si_ptr, &to->si_ptr);
		break;
	case __SI_POLL:
		err |= __put_user(from->si_band, &to->si_band);
		err |= __put_user(from->si_fd, &to->si_fd);
		break;
	case __SI_FAULT:
		err |= __put_user(from->si_addr, &to->si_addr);
#ifdef __ARCH_SI_TRAPNO
		err |= __put_user(from->si_trapno, &to->si_trapno);
#endif
		break;
	case __SI_CHLD:
		err |= __put_user(from->si_pid, &to->si_pid);
		err |= __put_user(from->si_uid, &to->si_uid);
		err |= __put_user(from->si_status, &to->si_status);
		err |= __put_user(from->si_utime, &to->si_utime);
		err |= __put_user(from->si_stime, &to->si_stime);
		break;
	case __SI_RT: /* This is not generated by the kernel as of now. */
	case __SI_MESGQ: /* But this is */
		err |= __put_user(from->si_pid, &to->si_pid);
		err |= __put_user(from->si_uid, &to->si_uid);
		err |= __put_user(from->si_ptr, &to->si_ptr);
		break;
	case __SI_KILL:
	default: /* this is just in case for now ... */
		err |= __put_user(from->si_pid, &to->si_pid);
		err |= __put_user(from->si_uid, &to->si_uid);
		break;
	}

	return err;
}
# endif
/*
 * sys_rt_sigtimedwait - synchronously wait for one of a set of signals.
 * @uthese:     user pointer to the set of signals to wait for
 * @uinfo:      if non-NULL, receives the siginfo of the dequeued signal
 * @uts:        optional timeout; NULL means wait without a time limit
 * @sigsetsize: must equal sizeof(sigset_t)
 *
 * Returns the dequeued signal number, -EAGAIN if nothing arrived and no
 * time was left, -EINTR if the sleep ended early without a wanted signal,
 * -EINVAL for a bad size or timespec, or -EFAULT on a bad user pointer.
 */
asmlinkage long
sys_rt_sigtimedwait(const sigset_t __user *uthese,
		    siginfo_t __user *uinfo,
		    const struct timespec __user *uts,
		    size_t sigsetsize)
{
	sigset_t mask;
	struct timespec ts;
	siginfo_t info;
	long timeout = 0;
	int sig;

	/* XXX: Don't preclude handling different sized sigset_t's. */
	if (sigsetsize != sizeof(sigset_t))
		return -EINVAL;

	if (copy_from_user(&mask, uthese, sizeof(mask)))
		return -EFAULT;

	/*
	 * Invert the set of allowed signals to get those we
	 * want to block.
	 */
	sigdelsetmask(&mask, sigmask(SIGKILL) | sigmask(SIGSTOP));
	signotset(&mask);

	if (uts) {
		if (copy_from_user(&ts, uts, sizeof(ts)))
			return -EFAULT;
		if (ts.tv_nsec >= 1000000000L || ts.tv_nsec < 0 ||
		    ts.tv_sec < 0)
			return -EINVAL;
	}

	spin_lock_irq(&current->sighand->siglock);
	sig = dequeue_signal(current, &mask, &info);
	if (!sig) {
		if (uts)
			timeout = (timespec_to_jiffies(&ts)
				   + (ts.tv_sec || ts.tv_nsec));
		else
			timeout = MAX_SCHEDULE_TIMEOUT;

		if (timeout) {
			/*
			 * None ready -- temporarily unblock those we're
			 * interested in while we are sleeping so that
			 * we'll be awakened when they arrive.
			 */
			current->real_blocked = current->blocked;
			sigandsets(&current->blocked, &current->blocked, &mask);
			recalc_sigpending();
			spin_unlock_irq(&current->sighand->siglock);

			timeout = schedule_timeout_interruptible(timeout);

			spin_lock_irq(&current->sighand->siglock);
			sig = dequeue_signal(current, &mask, &info);
			current->blocked = current->real_blocked;
			siginitset(&current->real_blocked, 0);
			recalc_sigpending();
		}
	}
	spin_unlock_irq(&current->sighand->siglock);

	if (!sig)
		return timeout ? -EINTR : -EAGAIN;

	if (uinfo && copy_siginfo_to_user(uinfo, &info))
		return -EFAULT;

	return sig;
}
asmlinkage long
2008-07-25 12:47:33 +04:00
sys_kill ( pid_t pid , int sig )
2005-04-17 02:20:36 +04:00
{
struct siginfo info ;
info . si_signo = sig ;
info . si_errno = 0 ;
info . si_code = SI_USER ;
2007-10-19 10:40:14 +04:00
info . si_pid = task_tgid_vnr ( current ) ;
2005-04-17 02:20:36 +04:00
info . si_uid = current - > uid ;
return kill_something_info ( sig , & info , pid ) ;
}
2008-07-25 12:47:33 +04:00
/*
 * do_tkill - send @sig to the single thread @pid.
 * @tgid: thread group the target must belong to; <= 0 disables the check
 * @pid:  id of the target thread
 * @sig:  signal number; 0 only probes for existence and permission
 *
 * Returns -ESRCH when no matching thread exists, otherwise the result of
 * check_kill_permission() or specific_send_sig_info().
 */
static int do_tkill(pid_t tgid, pid_t pid, int sig)
{
	int error;
	/*
	 * Start from an all-zero siginfo.  si_code is SI_TKILL here, and
	 * copy_siginfo_to_user() copies the whole structure out when
	 * si_code < 0, so any field left unset would expose stale kernel
	 * stack contents to the receiver.
	 */
	struct siginfo info = {};
	struct task_struct *p;
	unsigned long flags;

	error = -ESRCH;
	info.si_signo = sig;
	info.si_errno = 0;
	info.si_code = SI_TKILL;
	info.si_pid = task_tgid_vnr(current);
	info.si_uid = current->uid;

	rcu_read_lock();
	p = find_task_by_vpid(pid);
	if (p && (tgid <= 0 || task_tgid_vnr(p) == tgid)) {
		error = check_kill_permission(sig, &info, p);
		/*
		 * The null signal is a permissions and process existence
		 * probe.  No signal is actually delivered.
		 *
		 * If lock_task_sighand() fails we pretend the task dies
		 * after receiving the signal.  The window is tiny, and the
		 * signal is private anyway.
		 */
		if (!error && sig && lock_task_sighand(p, &flags)) {
			error = specific_send_sig_info(sig, &info, p);
			unlock_task_sighand(p, &flags);
		}
	}
	rcu_read_unlock();

	return error;
}
2005-10-31 02:02:18 +03:00
/**
* sys_tgkill - send signal to one specific thread
* @ tgid : the thread group ID of the thread
* @ pid : the PID of the thread
* @ sig : signal to be sent
*
2007-02-10 12:45:59 +03:00
* This syscall also checks the @ tgid and returns - ESRCH even if the PID
2005-10-31 02:02:18 +03:00
* exists but it ' s not belonging to the target process anymore . This
* method solves the problem of threads exiting and PIDs getting reused .
*/
2008-07-25 12:47:33 +04:00
asmlinkage long sys_tgkill ( pid_t tgid , pid_t pid , int sig )
2005-10-31 02:02:18 +03:00
{
/* This is only valid for single tasks */
if ( pid < = 0 | | tgid < = 0 )
return - EINVAL ;
return do_tkill ( tgid , pid , sig ) ;
}
2005-04-17 02:20:36 +04:00
/*
* Send a signal to only one task , even if it ' s a CLONE_THREAD task .
*/
asmlinkage long
2008-07-25 12:47:33 +04:00
sys_tkill ( pid_t pid , int sig )
2005-04-17 02:20:36 +04:00
{
/* This is only valid for single tasks */
if ( pid < = 0 )
return - EINVAL ;
2005-10-31 02:02:18 +03:00
return do_tkill ( 0 , pid , sig ) ;
2005-04-17 02:20:36 +04:00
}
asmlinkage long
2008-07-25 12:47:33 +04:00
sys_rt_sigqueueinfo ( pid_t pid , int sig , siginfo_t __user * uinfo )
2005-04-17 02:20:36 +04:00
{
siginfo_t info ;
if ( copy_from_user ( & info , uinfo , sizeof ( siginfo_t ) ) )
return - EFAULT ;
/* Not even root can pretend to send signals from the kernel.
Nor can they impersonate a kill ( ) , which adds source info . */
if ( info . si_code > = 0 )
return - EPERM ;
info . si_signo = sig ;
/* POSIX.1b doesn't mention process groups. */
return kill_proc_info ( sig , & info , pid ) ;
}
2006-03-29 04:11:24 +04:00
int do_sigaction ( int sig , struct k_sigaction * act , struct k_sigaction * oact )
2005-04-17 02:20:36 +04:00
{
2008-04-30 11:52:39 +04:00
struct task_struct * t = current ;
2005-04-17 02:20:36 +04:00
struct k_sigaction * k ;
2006-01-08 12:02:48 +03:00
sigset_t mask ;
2005-04-17 02:20:36 +04:00
2005-05-01 19:59:14 +04:00
if ( ! valid_signal ( sig ) | | sig < 1 | | ( act & & sig_kernel_only ( sig ) ) )
2005-04-17 02:20:36 +04:00
return - EINVAL ;
2008-04-30 11:52:39 +04:00
k = & t - > sighand - > action [ sig - 1 ] ;
2005-04-17 02:20:36 +04:00
spin_lock_irq ( & current - > sighand - > siglock ) ;
if ( oact )
* oact = * k ;
if ( act ) {
2006-02-09 22:41:50 +03:00
sigdelsetmask ( & act - > sa . sa_mask ,
sigmask ( SIGKILL ) | sigmask ( SIGSTOP ) ) ;
2006-03-29 04:11:24 +04:00
* k = * act ;
2005-04-17 02:20:36 +04:00
/*
* POSIX 3.3 .1 .3 :
* " Setting a signal action to SIG_IGN for a signal that is
* pending shall cause the pending signal to be discarded ,
* whether or not it is blocked . "
*
* " Setting a signal action to SIG_DFL for a signal that is
* pending and whose default action is to ignore the signal
* ( for example , SIGCHLD ) , shall cause the pending signal to
* be discarded , whether or not it is blocked "
*/
2008-07-26 06:45:51 +04:00
if ( sig_handler_ignored ( sig_handler ( t , sig ) , sig ) ) {
2006-01-08 12:02:48 +03:00
sigemptyset ( & mask ) ;
sigaddset ( & mask , sig ) ;
rm_from_queue_full ( & mask , & t - > signal - > shared_pending ) ;
2005-04-17 02:20:36 +04:00
do {
2006-01-08 12:02:48 +03:00
rm_from_queue_full ( & mask , & t - > pending ) ;
2005-04-17 02:20:36 +04:00
t = next_thread ( t ) ;
} while ( t ! = current ) ;
}
}
spin_unlock_irq ( & current - > sighand - > siglock ) ;
return 0 ;
}
/*
 * do_sigaltstack - get and/or set the alternate signal stack.
 * @uss:  user pointer to the new stack description, or NULL
 * @uoss: user pointer receiving the previous description, or NULL
 * @sp:   the caller's current user stack pointer
 *
 * The previous settings are sampled before any change is made.  Returns 0,
 * -EFAULT on a bad user pointer, -EPERM when trying to change the stack
 * while running on it, -EINVAL for unknown ss_flags, or -ENOMEM when the
 * requested stack is smaller than MINSIGSTKSZ.
 */
int
do_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, unsigned long sp)
{
	stack_t oss;
	int error;

	if (uoss) {
		oss.ss_sp = (void __user *) current->sas_ss_sp;
		oss.ss_size = current->sas_ss_size;
		oss.ss_flags = sas_ss_flags(sp);
	}

	if (uss) {
		void __user *ss_sp;
		size_t ss_size;
		int ss_flags;

		error = -EFAULT;
		if (!access_ok(VERIFY_READ, uss, sizeof(*uss))
		    || __get_user(ss_sp, &uss->ss_sp)
		    || __get_user(ss_flags, &uss->ss_flags)
		    || __get_user(ss_size, &uss->ss_size))
			goto out;

		error = -EPERM;
		if (on_sig_stack(sp))
			goto out;

		error = -EINVAL;
		/*
		 * Note - this code used to test ss_flags incorrectly;
		 * old code may have been written using ss_flags == 0
		 * to mean ss_flags == SS_ONSTACK (as this was the only
		 * way that worked) - this fix preserves that older
		 * mechanism.
		 */
		if (ss_flags != SS_DISABLE && ss_flags != SS_ONSTACK && ss_flags != 0)
			goto out;

		if (ss_flags == SS_DISABLE) {
			ss_size = 0;
			ss_sp = NULL;
		} else {
			error = -ENOMEM;
			if (ss_size < MINSIGSTKSZ)
				goto out;
		}

		current->sas_ss_sp = (unsigned long) ss_sp;
		current->sas_ss_size = ss_size;
	}

	if (uoss) {
		error = -EFAULT;
		/*
		 * Write the three fields one by one instead of copying the
		 * structure: only those fields of 'oss' were initialised, so
		 * a whole-struct copy would also hand any padding bytes
		 * (uninitialised kernel stack) to user space.
		 */
		if (!access_ok(VERIFY_WRITE, uoss, sizeof(*uoss)))
			goto out;
		if (__put_user(oss.ss_sp, &uoss->ss_sp) |
		    __put_user(oss.ss_size, &uoss->ss_size) |
		    __put_user(oss.ss_flags, &uoss->ss_flags))
			goto out;
	}

	error = 0;
out:
	return error;
}
# ifdef __ARCH_WANT_SYS_SIGPENDING
/*
 * sys_sigpending - old-style sigpending: forwards to do_sigpending() with
 * the size of an old_sigset_t.
 */
asmlinkage long
sys_sigpending(old_sigset_t __user *set)
{
	long rc = do_sigpending(set, sizeof(*set));

	return rc;
}
# endif
# ifdef __ARCH_WANT_SYS_SIGPROCMASK
/* Some platforms have their own version with special arguments others
support only sys_rt_sigprocmask . */
/*
 * sys_sigprocmask - old-style sigprocmask operating on the first mask word.
 * @how:  SIG_BLOCK, SIG_UNBLOCK or SIG_SETMASK (only used if @set)
 * @set:  user pointer to the new mask word, or NULL to only query
 * @oset: user pointer receiving the previous mask word, or NULL
 *
 * SIGKILL and SIGSTOP are silently removed from @set.  Returns 0,
 * -EINVAL for an unknown @how, or -EFAULT on a bad user pointer.
 */
asmlinkage long
sys_sigprocmask(int how, old_sigset_t __user *set, old_sigset_t __user *oset)
{
	old_sigset_t previous, requested;
	int rc = 0;

	if (set) {
		if (copy_from_user(&requested, set, sizeof(*set)))
			return -EFAULT;
		requested &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP));

		spin_lock_irq(&current->sighand->siglock);
		previous = current->blocked.sig[0];

		switch (how) {
		case SIG_BLOCK:
			sigaddsetmask(&current->blocked, requested);
			break;
		case SIG_UNBLOCK:
			sigdelsetmask(&current->blocked, requested);
			break;
		case SIG_SETMASK:
			current->blocked.sig[0] = requested;
			break;
		default:
			rc = -EINVAL;
			break;
		}

		recalc_sigpending();
		spin_unlock_irq(&current->sighand->siglock);

		if (rc)
			return rc;
	} else if (oset) {
		/* Query only. */
		previous = current->blocked.sig[0];
	}

	if (oset && copy_to_user(oset, &previous, sizeof(*oset)))
		return -EFAULT;

	return 0;
}
# endif /* __ARCH_WANT_SYS_SIGPROCMASK */
# ifdef __ARCH_WANT_SYS_RT_SIGACTION
/*
 * sys_rt_sigaction - examine and/or change the action for signal @sig.
 * @sig:        signal number
 * @act:        user pointer to the new action, or NULL
 * @oact:       user pointer receiving the previous action, or NULL
 * @sigsetsize: must equal sizeof(sigset_t)
 *
 * Returns 0, -EINVAL for a bad size, -EFAULT on a bad user pointer, or
 * the error reported by do_sigaction().
 */
asmlinkage long
sys_rt_sigaction(int sig,
		 const struct sigaction __user *act,
		 struct sigaction __user *oact,
		 size_t sigsetsize)
{
	struct k_sigaction new_sa, old_sa;
	int ret;

	/* XXX: Don't preclude handling different sized sigset_t's. */
	if (sigsetsize != sizeof(sigset_t))
		return -EINVAL;

	if (act && copy_from_user(&new_sa.sa, act, sizeof(new_sa.sa)))
		return -EFAULT;

	ret = do_sigaction(sig, act ? &new_sa : NULL, oact ? &old_sa : NULL);
	if (ret)
		return ret;

	if (oact && copy_to_user(oact, &old_sa.sa, sizeof(old_sa.sa)))
		return -EFAULT;

	return 0;
}
# endif /* __ARCH_WANT_SYS_RT_SIGACTION */
# ifdef __ARCH_WANT_SYS_SGETMASK
/*
* For backwards compatibility . Functionality superseded by sigprocmask .
*/
/*
 * sys_sgetmask - return the first word of the caller's blocked-signal mask.
 */
asmlinkage long
sys_sgetmask(void)
{
	/* SMP safe */
	long first_word = current->blocked.sig[0];

	return first_word;
}
asmlinkage long
sys_ssetmask ( int newmask )
{
int old ;
spin_lock_irq ( & current - > sighand - > siglock ) ;
old = current - > blocked . sig [ 0 ] ;
siginitset ( & current - > blocked , newmask & ~ ( sigmask ( SIGKILL ) |
sigmask ( SIGSTOP ) ) ) ;
recalc_sigpending ( ) ;
spin_unlock_irq ( & current - > sighand - > siglock ) ;
return old ;
}
# endif /* __ARCH_WANT_SYS_SGETMASK */
# ifdef __ARCH_WANT_SYS_SIGNAL
/*
* For backwards compatibility . Functionality superseded by sigaction .
*/
/*
 * sys_signal - install @handler for @sig with one-shot, no-mask semantics.
 *
 * Returns the previously installed handler, or the error from
 * do_sigaction() converted to unsigned long.
 */
asmlinkage unsigned long
sys_signal(int sig, __sighandler_t handler)
{
	struct k_sigaction new_sa, old_sa;
	int ret;

	sigemptyset(&new_sa.sa.sa_mask);
	new_sa.sa.sa_flags = SA_ONESHOT | SA_NOMASK;
	new_sa.sa.sa_handler = handler;

	ret = do_sigaction(sig, &new_sa, &old_sa);
	if (ret)
		return ret;

	return (unsigned long)old_sa.sa.sa_handler;
}
# endif /* __ARCH_WANT_SYS_SIGNAL */
# ifdef __ARCH_WANT_SYS_PAUSE
asmlinkage long
sys_pause ( void )
{
current - > state = TASK_INTERRUPTIBLE ;
schedule ( ) ;
return - ERESTARTNOHAND ;
}
# endif
2006-01-19 04:43:57 +03:00
# ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND
/*
 * sys_rt_sigsuspend - temporarily replace the blocked mask and wait.
 * @unewset:    user pointer to the temporary mask
 * @sigsetsize: must equal sizeof(sigset_t)
 *
 * The current mask is saved in ->saved_sigmask and set_restore_sigmask()
 * is called once the wait ends.  SIGKILL and SIGSTOP are removed from the
 * temporary mask.
 *
 * Returns -EINVAL for a bad size, -EFAULT on a bad user pointer, and
 * otherwise -ERESTARTNOHAND once a signal is pending.
 */
asmlinkage long sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize)
{
	sigset_t newset;

	/* XXX: Don't preclude handling different sized sigset_t's. */
	if (sigsetsize != sizeof(sigset_t))
		return -EINVAL;

	if (copy_from_user(&newset, unewset, sizeof(newset)))
		return -EFAULT;
	sigdelsetmask(&newset, sigmask(SIGKILL) | sigmask(SIGSTOP));

	spin_lock_irq(&current->sighand->siglock);
	current->saved_sigmask = current->blocked;
	current->blocked = newset;
	recalc_sigpending();
	spin_unlock_irq(&current->sighand->siglock);

	/*
	 * Sleep until a signal is actually pending; a wakeup for any other
	 * reason must not end the suspend early.
	 */
	while (!signal_pending(current)) {
		current->state = TASK_INTERRUPTIBLE;
		schedule();
	}

	set_restore_sigmask();
	return -ERESTARTNOHAND;
}
# endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */
2006-09-27 12:50:23 +04:00
/*
 * arch_vma_name - weak default that reports no special name for @vma.
 *
 * Declared weak so that another definition can replace it at link time;
 * this version always returns NULL.
 */
__attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma)
{
	const char *name = NULL;

	return name;
}
2005-04-17 02:20:36 +04:00
/*
 * signals_init - create the cache that sigqueue entries are allocated from.
 *
 * The cache is created with SLAB_PANIC and stored in sigqueue_cachep.
 */
void __init signals_init(void)
{
	sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC);
}