2005-04-17 02:20:36 +04:00
/*
* linux / kernel / timer . c
*
2007-05-08 11:27:59 +04:00
* Kernel internal timers , basic process system calls
2005-04-17 02:20:36 +04:00
*
* Copyright ( C ) 1991 , 1992 Linus Torvalds
*
* 1997 - 01 - 28 Modified by Finn Arne Gangstad to make timers scale better .
*
* 1997 - 09 - 10 Updated NTP code according to technical memorandum Jan ' 96
* " A Kernel Model for Precision Timekeeping " by Dave Mills
* 1998 - 12 - 24 Fixed a xtime SMP race ( we need the xtime_lock rw spinlock to
* serialize accesses to xtime / lost_ticks ) .
* Copyright ( C ) 1998 Andrea Arcangeli
* 1999 - 03 - 10 Improved NTP compatibility by Ulrich Windl
* 2002 - 05 - 31 Move sys_sysinfo here and make its locking sane , Robert Love
* 2000 - 10 - 05 Implemented scalable SMP per - CPU timer handling .
* Copyright ( C ) 2000 , 2001 , 2002 Ingo Molnar
* Designed by David S . Miller , Alexey Kuznetsov and Ingo Molnar
*/
# include <linux/kernel_stat.h>
# include <linux/module.h>
# include <linux/interrupt.h>
# include <linux/percpu.h>
# include <linux/init.h>
# include <linux/mm.h>
# include <linux/swap.h>
2007-10-19 10:40:14 +04:00
# include <linux/pid_namespace.h>
2005-04-17 02:20:36 +04:00
# include <linux/notifier.h>
# include <linux/thread_info.h>
# include <linux/time.h>
# include <linux/jiffies.h>
# include <linux/posix-timers.h>
# include <linux/cpu.h>
# include <linux/syscalls.h>
2006-01-08 12:02:17 +03:00
# include <linux/delay.h>
2007-02-16 12:28:03 +03:00
# include <linux/tick.h>
[PATCH] Add debugging feature /proc/timer_stat
Add /proc/timer_stats support: debugging feature to profile timer expiration.
Both the starting site, process/PID and the expiration function is captured.
This allows the quick identification of timer event sources in a system.
Sample output:
# echo 1 > /proc/timer_stats
# cat /proc/timer_stats
Timer Stats Version: v0.1
Sample period: 4.010 s
24, 0 swapper hrtimer_stop_sched_tick (hrtimer_sched_tick)
11, 0 swapper sk_reset_timer (tcp_delack_timer)
6, 0 swapper hrtimer_stop_sched_tick (hrtimer_sched_tick)
2, 1 swapper queue_delayed_work_on (delayed_work_timer_fn)
17, 0 swapper hrtimer_restart_sched_tick (hrtimer_sched_tick)
2, 1 swapper queue_delayed_work_on (delayed_work_timer_fn)
4, 2050 pcscd do_nanosleep (hrtimer_wakeup)
5, 4179 sshd sk_reset_timer (tcp_write_timer)
4, 2248 yum-updatesd schedule_timeout (process_timeout)
18, 0 swapper hrtimer_restart_sched_tick (hrtimer_sched_tick)
3, 0 swapper sk_reset_timer (tcp_delack_timer)
1, 1 swapper neigh_table_init_no_netlink (neigh_periodic_timer)
2, 1 swapper e1000_up (e1000_watchdog)
1, 1 init schedule_timeout (process_timeout)
100 total events, 25.24 events/sec
[ cleanups and hrtimers support from Thomas Gleixner <tglx@linutronix.de> ]
[bunk@stusta.de: nr_entries can become static]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: john stultz <johnstul@us.ibm.com>
Cc: Roman Zippel <zippel@linux-m68k.org>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-02-16 12:28:13 +03:00
# include <linux/kallsyms.h>
2005-04-17 02:20:36 +04:00
# include <asm/uaccess.h>
# include <asm/unistd.h>
# include <asm/div64.h>
# include <asm/timex.h>
# include <asm/io.h>
2005-10-31 02:03:00 +03:00
u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES ;
EXPORT_SYMBOL ( jiffies_64 ) ;
2005-04-17 02:20:36 +04:00
/*
* per - CPU timer vector definitions :
*/
/*
 * Bit widths of the timer vectors: TVR_BITS sizes tvec_root_t and TVN_BITS
 * sizes tvec_t. CONFIG_BASE_SMALL selects the narrower pair.
 */
# define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6)
# define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8)
/* Number of list heads per vector (a power of two by construction). */
# define TVN_SIZE (1 << TVN_BITS)
# define TVR_SIZE (1 << TVR_BITS)
/* Masks that reduce a value to a valid index into the matching vector. */
# define TVN_MASK (TVN_SIZE - 1)
# define TVR_MASK (TVR_SIZE - 1)
/* A timer vector holding TVN_SIZE list heads. */
typedef struct tvec_s {
struct list_head vec [ TVN_SIZE ] ;
} tvec_t ;
/* The root timer vector, holding TVR_SIZE list heads. */
typedef struct tvec_root_s {
struct list_head vec [ TVR_SIZE ] ;
} tvec_root_t ;
/*
 * Timer base: a lock, a pointer to a timer, the base's own jiffies value and
 * the five timer vectors (tv1..tv5) that internal_add_timer() files timers
 * into.
 */
struct tvec_t_base_s {
2006-03-31 14:30:30 +04:00
spinlock_t lock ;
/* stored via set_running_timer(), which only writes it under CONFIG_SMP */
struct timer_list * running_timer ;
2005-04-17 02:20:36 +04:00
/* reference point from which internal_add_timer() measures a timer's expiry */
unsigned long timer_jiffies ;
tvec_root_t tv1 ;
tvec_t tv2 ;
tvec_t tv3 ;
tvec_t tv4 ;
tvec_t tv5 ;
2007-05-08 11:27:44 +04:00
} ____cacheline_aligned ;
2005-04-17 02:20:36 +04:00
typedef struct tvec_t_base_s tvec_base_t ;
2006-04-11 09:53:58 +04:00
2006-03-31 14:30:30 +04:00
/*
 * Statically allocated timer base. Each CPU's tvec_bases pointer is
 * initialised to point at it.
 */
tvec_base_t boot_tvec_bases ;
EXPORT_SYMBOL ( boot_tvec_bases ) ;
2006-07-30 14:04:14 +04:00
static DEFINE_PER_CPU ( tvec_base_t * , tvec_bases ) = & boot_tvec_bases ;
2005-04-17 02:20:36 +04:00
2007-05-08 11:27:44 +04:00
/*
* Note that all tvec_bases are 2-byte aligned, so the lowest bit of the
* base pointer in timer_list is guaranteed to be zero. That LSB is used
* as a flag indicating whether the timer is deferrable.
*/
# define TBASE_DEFERRABLE_FLAG (0x1)
/* Functions below help us manage 'deferrable' flag */
/*
 * Extract the deferrable flag from a timer's base pointer.
 * Returns TBASE_DEFERRABLE_FLAG if that bit is set in @base, 0 otherwise.
 */
static inline unsigned int tbase_get_deferrable ( tvec_base_t * base )
{
2007-05-10 14:16:01 +04:00
return ( ( unsigned int ) ( unsigned long ) base & TBASE_DEFERRABLE_FLAG ) ;
2007-05-08 11:27:44 +04:00
}
/*
 * Return @base with the TBASE_DEFERRABLE_FLAG bit cleared, i.e. the
 * pointer without the flag folded into it.
 */
static inline tvec_base_t * tbase_get_base ( tvec_base_t * base )
{
2007-05-10 14:16:01 +04:00
return ( ( tvec_base_t * ) ( ( unsigned long ) base & ~ TBASE_DEFERRABLE_FLAG ) ) ;
2007-05-08 11:27:44 +04:00
}
/* Mark @timer as deferrable by setting TBASE_DEFERRABLE_FLAG in timer->base. */
static inline void timer_set_deferrable ( struct timer_list * timer )
{
2007-05-10 14:16:01 +04:00
timer - > base = ( ( tvec_base_t * ) ( ( unsigned long ) ( timer - > base ) |
2007-07-19 12:49:16 +04:00
TBASE_DEFERRABLE_FLAG ) ) ;
2007-05-08 11:27:44 +04:00
}
/*
 * Point @timer at @new_base, carrying over whatever deferrable flag is
 * currently stored in timer->base.
 */
static inline void
timer_set_base ( struct timer_list * timer , tvec_base_t * new_base )
{
2007-05-10 14:16:01 +04:00
timer - > base = ( tvec_base_t * ) ( ( unsigned long ) ( new_base ) |
2007-07-19 12:49:16 +04:00
tbase_get_deferrable ( timer - > base ) ) ;
2007-05-08 11:27:44 +04:00
}
2006-12-10 13:21:24 +03:00
/**
* __round_jiffies - function to round jiffies to a full second
* @ j : the time in ( absolute ) jiffies that should be rounded
* @ cpu : the processor number on which the timeout will happen
*
2007-02-10 12:45:59 +03:00
* __round_jiffies ( ) rounds an absolute time in the future ( in jiffies )
2006-12-10 13:21:24 +03:00
* up or down to ( approximately ) full seconds . This is useful for timers
* for which the exact time they fire does not matter too much , as long as
* they fire approximately every X seconds .
*
* By rounding these timers to whole seconds , all such timers will fire
* at the same time , rather than at various times spread out . The goal
* of this is to have the CPU wake up less , which saves power .
*
* The exact rounding is skewed for each processor to avoid all
* processors firing at the exact same time , which could lead
* to lock contention or spurious cache line bouncing .
*
2007-02-10 12:45:59 +03:00
* The return value is the rounded version of the @ j parameter .
2006-12-10 13:21:24 +03:00
*/
unsigned long __round_jiffies(unsigned long j, int cpu)
{
	int rem;
	unsigned long original = j;

	/*
	 * We don't want all cpus firing their timers at once hitting the
	 * same lock or cachelines, so we skew each extra cpu with an extra
	 * 3 jiffies. This 3 jiffies came originally from the mm/ code which
	 * already did this.
	 * The skew is done by adding 3*cpunr, then round, then subtract this
	 * extra offset again.
	 */
	j += cpu * 3;

	rem = j % HZ;

	/*
	 * If the target jiffy is just after a whole second (which can happen
	 * due to delays of the timer irq, long irq off times etc etc) then
	 * we should round down to the whole second, not up. Use 1/4th second
	 * as cutoff for this rounding as an extreme upper bound for this.
	 */
	if (rem < HZ / 4) /* round down */
		j = j - rem;
	else /* round up */
		j = j - rem + HZ;

	/* now that we have rounded, subtract the extra skew again */
	j -= cpu * 3;

	/*
	 * Rounding ate our timeout entirely: fall back to the caller's value.
	 * The comparison must go through time_before_eq() because a plain
	 * "j <= jiffies" gives the wrong answer when one of the two values
	 * has wrapped around and the other has not.
	 */
	if (time_before_eq(j, jiffies))
		return original;
	return j;
}
EXPORT_SYMBOL_GPL ( __round_jiffies ) ;
/**
* __round_jiffies_relative - function to round jiffies to a full second
* @ j : the time in ( relative ) jiffies that should be rounded
* @ cpu : the processor number on which the timeout will happen
*
2007-02-10 12:45:59 +03:00
* __round_jiffies_relative ( ) rounds a time delta in the future ( in jiffies )
2006-12-10 13:21:24 +03:00
* up or down to ( approximately ) full seconds . This is useful for timers
* for which the exact time they fire does not matter too much , as long as
* they fire approximately every X seconds .
*
* By rounding these timers to whole seconds , all such timers will fire
* at the same time , rather than at various times spread out . The goal
* of this is to have the CPU wake up less , which saves power .
*
* The exact rounding is skewed for each processor to avoid all
* processors firing at the exact same time , which could lead
* to lock contention or spurious cache line bouncing .
*
2007-02-10 12:45:59 +03:00
* The return value is the rounded version of the @ j parameter .
2006-12-10 13:21:24 +03:00
*/
unsigned long __round_jiffies_relative(unsigned long j, int cpu)
{
	/*
	 * Sample jiffies exactly once. Converting the delta to an absolute
	 * time and back must use the same reference value; reading jiffies
	 * separately for the addition and the subtraction would skew the
	 * result by however many ticks elapsed in between.
	 */
	unsigned long j0 = jiffies;

	return __round_jiffies(j + j0, cpu) - j0;
}
EXPORT_SYMBOL_GPL ( __round_jiffies_relative ) ;
/**
* round_jiffies - function to round jiffies to a full second
* @ j : the time in ( absolute ) jiffies that should be rounded
*
2007-02-10 12:45:59 +03:00
* round_jiffies ( ) rounds an absolute time in the future ( in jiffies )
2006-12-10 13:21:24 +03:00
* up or down to ( approximately ) full seconds . This is useful for timers
* for which the exact time they fire does not matter too much , as long as
* they fire approximately every X seconds .
*
* By rounding these timers to whole seconds , all such timers will fire
* at the same time , rather than at various times spread out . The goal
* of this is to have the CPU wake up less , which saves power .
*
2007-02-10 12:45:59 +03:00
* The return value is the rounded version of the @ j parameter .
2006-12-10 13:21:24 +03:00
*/
unsigned long round_jiffies(unsigned long j)
{
	/* The CPU number only selects the per-CPU skew applied while rounding. */
	int this_cpu = raw_smp_processor_id();

	return __round_jiffies(j, this_cpu);
}
EXPORT_SYMBOL_GPL ( round_jiffies ) ;
/**
* round_jiffies_relative - function to round jiffies to a full second
* @ j : the time in ( relative ) jiffies that should be rounded
*
2007-02-10 12:45:59 +03:00
* round_jiffies_relative ( ) rounds a time delta in the future ( in jiffies )
2006-12-10 13:21:24 +03:00
* up or down to ( approximately ) full seconds . This is useful for timers
* for which the exact time they fire does not matter too much , as long as
* they fire approximately every X seconds .
*
* By rounding these timers to whole seconds , all such timers will fire
* at the same time , rather than at various times spread out . The goal
* of this is to have the CPU wake up less , which saves power .
*
2007-02-10 12:45:59 +03:00
* The return value is the rounded version of the @ j parameter .
2006-12-10 13:21:24 +03:00
*/
unsigned long round_jiffies_relative(unsigned long j)
{
	/* The CPU number only selects the per-CPU skew applied while rounding. */
	int this_cpu = raw_smp_processor_id();

	return __round_jiffies_relative(j, this_cpu);
}
EXPORT_SYMBOL_GPL ( round_jiffies_relative ) ;
2005-04-17 02:20:36 +04:00
/*
 * Record @timer in base->running_timer.
 * The store is only compiled in when CONFIG_SMP is defined; otherwise this
 * function has an empty body.
 */
static inline void set_running_timer ( tvec_base_t * base ,
struct timer_list * timer )
{
# ifdef CONFIG_SMP
2006-03-31 14:30:30 +04:00
base - > running_timer = timer ;
2005-04-17 02:20:36 +04:00
# endif
}
/*
 * Queue @timer on the list of @base that matches its expiry.
 *
 * The distance between timer->expires and base->timer_jiffies decides which
 * of tv1..tv5 is used; the slot inside that vector is taken from the
 * corresponding bit field of the expiry value itself.
 */
static void internal_add_timer(tvec_base_t *base, struct timer_list *timer)
{
	unsigned long when = timer->expires;
	unsigned long delta = when - base->timer_jiffies;
	struct list_head *bucket;

	if (delta < TVR_SIZE) {
		bucket = &base->tv1.vec[when & TVR_MASK];
	} else if (delta < 1 << (TVR_BITS + TVN_BITS)) {
		bucket = &base->tv2.vec[(when >> TVR_BITS) & TVN_MASK];
	} else if (delta < 1 << (TVR_BITS + 2 * TVN_BITS)) {
		bucket = &base->tv3.vec[(when >> (TVR_BITS + TVN_BITS)) & TVN_MASK];
	} else if (delta < 1 << (TVR_BITS + 3 * TVN_BITS)) {
		bucket = &base->tv4.vec[(when >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK];
	} else if ((signed long)delta < 0) {
		/*
		 * The expiry lies behind base->timer_jiffies (a timer set to
		 * go off in the past): file it in the tv1 slot that
		 * corresponds to base->timer_jiffies.
		 */
		bucket = &base->tv1.vec[base->timer_jiffies & TVR_MASK];
	} else {
		/*
		 * On 64-bit architectures the distance can exceed 0xffffffff;
		 * clamp it to that maximum and recompute the expiry used for
		 * slot selection.
		 */
		if (delta > 0xffffffffUL) {
			delta = 0xffffffffUL;
			when = delta + base->timer_jiffies;
		}
		bucket = &base->tv5.vec[(when >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK];
	}

	/* Appending at the tail keeps timers within one slot in FIFO order. */
	list_add_tail(&timer->entry, bucket);
}
[PATCH] Add debugging feature /proc/timer_stat
Add /proc/timer_stats support: debugging feature to profile timer expiration.
Both the starting site, process/PID and the expiration function is captured.
This allows the quick identification of timer event sources in a system.
Sample output:
# echo 1 > /proc/timer_stats
# cat /proc/timer_stats
Timer Stats Version: v0.1
Sample period: 4.010 s
24, 0 swapper hrtimer_stop_sched_tick (hrtimer_sched_tick)
11, 0 swapper sk_reset_timer (tcp_delack_timer)
6, 0 swapper hrtimer_stop_sched_tick (hrtimer_sched_tick)
2, 1 swapper queue_delayed_work_on (delayed_work_timer_fn)
17, 0 swapper hrtimer_restart_sched_tick (hrtimer_sched_tick)
2, 1 swapper queue_delayed_work_on (delayed_work_timer_fn)
4, 2050 pcscd do_nanosleep (hrtimer_wakeup)
5, 4179 sshd sk_reset_timer (tcp_write_timer)
4, 2248 yum-updatesd schedule_timeout (process_timeout)
18, 0 swapper hrtimer_restart_sched_tick (hrtimer_sched_tick)
3, 0 swapper sk_reset_timer (tcp_delack_timer)
1, 1 swapper neigh_table_init_no_netlink (neigh_periodic_timer)
2, 1 swapper e1000_up (e1000_watchdog)
1, 1 init schedule_timeout (process_timeout)
100 total events, 25.24 events/sec
[ cleanups and hrtimers support from Thomas Gleixner <tglx@linutronix.de> ]
[bunk@stusta.de: nr_entries can become static]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: john stultz <johnstul@us.ibm.com>
Cc: Roman Zippel <zippel@linux-m68k.org>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-02-16 12:28:13 +03:00
# ifdef CONFIG_TIMER_STATS
void __timer_stats_timer_set_start_info ( struct timer_list * timer , void * addr )
{
if ( timer - > start_site )
return ;
timer - > start_site = addr ;
memcpy ( timer - > start_comm , current - > comm , TASK_COMM_LEN ) ;
timer - > start_pid = current - > pid ;
}
2007-07-16 10:40:30 +04:00
static void timer_stats_account_timer ( struct timer_list * timer )
{
unsigned int flag = 0 ;
if ( unlikely ( tbase_get_deferrable ( timer - > base ) ) )
flag | = TIMER_STATS_FLAG_DEFERRABLE ;
timer_stats_update_stats ( timer , timer - > start_pid , timer - > start_site ,
timer - > function , timer - > start_comm , flag ) ;
}
# else
/* Stub used when CONFIG_TIMER_STATS is not defined: nothing to account. */
static void timer_stats_account_timer(struct timer_list *timer)
{
}
[PATCH] Add debugging feature /proc/timer_stat
Add /proc/timer_stats support: debugging feature to profile timer expiration.
Both the starting site, process/PID and the expiration function is captured.
This allows the quick identification of timer event sources in a system.
Sample output:
# echo 1 > /proc/timer_stats
# cat /proc/timer_stats
Timer Stats Version: v0.1
Sample period: 4.010 s
24, 0 swapper hrtimer_stop_sched_tick (hrtimer_sched_tick)
11, 0 swapper sk_reset_timer (tcp_delack_timer)
6, 0 swapper hrtimer_stop_sched_tick (hrtimer_sched_tick)
2, 1 swapper queue_delayed_work_on (delayed_work_timer_fn)
17, 0 swapper hrtimer_restart_sched_tick (hrtimer_sched_tick)
2, 1 swapper queue_delayed_work_on (delayed_work_timer_fn)
4, 2050 pcscd do_nanosleep (hrtimer_wakeup)
5, 4179 sshd sk_reset_timer (tcp_write_timer)
4, 2248 yum-updatesd schedule_timeout (process_timeout)
18, 0 swapper hrtimer_restart_sched_tick (hrtimer_sched_tick)
3, 0 swapper sk_reset_timer (tcp_delack_timer)
1, 1 swapper neigh_table_init_no_netlink (neigh_periodic_timer)
2, 1 swapper e1000_up (e1000_watchdog)
1, 1 init schedule_timeout (process_timeout)
100 total events, 25.24 events/sec
[ cleanups and hrtimers support from Thomas Gleixner <tglx@linutronix.de> ]
[bunk@stusta.de: nr_entries can become static]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: john stultz <johnstul@us.ibm.com>
Cc: Roman Zippel <zippel@linux-m68k.org>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-02-16 12:28:13 +03:00
# endif
2006-09-29 12:59:46 +04:00
/**
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync have checked all cpus and before it will recheck
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes it's base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
* init_timer - initialize a timer .
* @ timer : the timer to be initialized
*
* init_timer() must be called on a timer prior to calling *any* of the
* other timer functions.
*/
void fastcall init_timer ( struct timer_list * timer )
{
/* a NULL entry.next is what marks the timer as not pending */
timer - > entry . next = NULL ;
[PATCH] Define __raw_get_cpu_var and use it
There are several instances of per_cpu(foo, raw_smp_processor_id()), which
is semantically equivalent to __get_cpu_var(foo) but without the warning
that smp_processor_id() can give if CONFIG_DEBUG_PREEMPT is enabled. For
those architectures with optimized per-cpu implementations, namely ia64,
powerpc, s390, sparc64 and x86_64, per_cpu() turns into more and slower
code than __get_cpu_var(), so it would be preferable to use __get_cpu_var
on those platforms.
This defines a __raw_get_cpu_var(x) macro which turns into per_cpu(x,
raw_smp_processor_id()) on architectures that use the generic per-cpu
implementation, and turns into __get_cpu_var(x) on the architectures that
have an optimized per-cpu implementation.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Acked-by: David S. Miller <davem@davemloft.net>
Acked-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-25 16:47:14 +04:00
/* tie the timer to the tvec_bases pointer of the CPU executing this call */
timer - > base = __raw_get_cpu_var ( tvec_bases ) ;
[PATCH] Add debugging feature /proc/timer_stat
Add /proc/timer_stats support: debugging feature to profile timer expiration.
Both the starting site, process/PID and the expiration function is captured.
This allows the quick identification of timer event sources in a system.
Sample output:
# echo 1 > /proc/timer_stats
# cat /proc/timer_stats
Timer Stats Version: v0.1
Sample period: 4.010 s
24, 0 swapper hrtimer_stop_sched_tick (hrtimer_sched_tick)
11, 0 swapper sk_reset_timer (tcp_delack_timer)
6, 0 swapper hrtimer_stop_sched_tick (hrtimer_sched_tick)
2, 1 swapper queue_delayed_work_on (delayed_work_timer_fn)
17, 0 swapper hrtimer_restart_sched_tick (hrtimer_sched_tick)
2, 1 swapper queue_delayed_work_on (delayed_work_timer_fn)
4, 2050 pcscd do_nanosleep (hrtimer_wakeup)
5, 4179 sshd sk_reset_timer (tcp_write_timer)
4, 2248 yum-updatesd schedule_timeout (process_timeout)
18, 0 swapper hrtimer_restart_sched_tick (hrtimer_sched_tick)
3, 0 swapper sk_reset_timer (tcp_delack_timer)
1, 1 swapper neigh_table_init_no_netlink (neigh_periodic_timer)
2, 1 swapper e1000_up (e1000_watchdog)
1, 1 init schedule_timeout (process_timeout)
100 total events, 25.24 events/sec
[ cleanups and hrtimers support from Thomas Gleixner <tglx@linutronix.de> ]
[bunk@stusta.de: nr_entries can become static]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: john stultz <johnstul@us.ibm.com>
Cc: Roman Zippel <zippel@linux-m68k.org>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-02-16 12:28:13 +03:00
# ifdef CONFIG_TIMER_STATS
/* clear the recorded start site, pid and command name */
timer - > start_site = NULL ;
timer - > start_pid = - 1 ;
memset ( timer - > start_comm , 0 , TASK_COMM_LEN ) ;
# endif
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync have checked all cpus and before it will recheck
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes it's base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
}
EXPORT_SYMBOL ( init_timer ) ;
2007-05-08 11:27:44 +04:00
/*
 * Initialize @timer exactly like init_timer() and then mark it deferrable.
 *
 * The order matters: init_timer() assigns timer->base, and the deferrable
 * flag is kept in that pointer, so the flag has to be set afterwards.
 */
void fastcall init_timer_deferrable(struct timer_list *timer)
{
	init_timer(timer);
	timer_set_deferrable(timer);
}
EXPORT_SYMBOL ( init_timer_deferrable ) ;
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync have checked all cpus and before it will recheck
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes it's base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
static inline void detach_timer ( struct timer_list * timer ,
[PATCH] Add debugging feature /proc/timer_stat
Add /proc/timer_stats support: debugging feature to profile timer expiration.
Both the starting site, process/PID and the expiration function is captured.
This allows the quick identification of timer event sources in a system.
Sample output:
# echo 1 > /proc/timer_stats
# cat /proc/timer_stats
Timer Stats Version: v0.1
Sample period: 4.010 s
24, 0 swapper hrtimer_stop_sched_tick (hrtimer_sched_tick)
11, 0 swapper sk_reset_timer (tcp_delack_timer)
6, 0 swapper hrtimer_stop_sched_tick (hrtimer_sched_tick)
2, 1 swapper queue_delayed_work_on (delayed_work_timer_fn)
17, 0 swapper hrtimer_restart_sched_tick (hrtimer_sched_tick)
2, 1 swapper queue_delayed_work_on (delayed_work_timer_fn)
4, 2050 pcscd do_nanosleep (hrtimer_wakeup)
5, 4179 sshd sk_reset_timer (tcp_write_timer)
4, 2248 yum-updatesd schedule_timeout (process_timeout)
18, 0 swapper hrtimer_restart_sched_tick (hrtimer_sched_tick)
3, 0 swapper sk_reset_timer (tcp_delack_timer)
1, 1 swapper neigh_table_init_no_netlink (neigh_periodic_timer)
2, 1 swapper e1000_up (e1000_watchdog)
1, 1 init schedule_timeout (process_timeout)
100 total events, 25.24 events/sec
[ cleanups and hrtimers support from Thomas Gleixner <tglx@linutronix.de> ]
[bunk@stusta.de: nr_entries can become static]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: john stultz <johnstul@us.ibm.com>
Cc: Roman Zippel <zippel@linux-m68k.org>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-02-16 12:28:13 +03:00
int clear_pending )
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync have checked all cpus and before it will recheck
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes it's base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
{
struct list_head * entry = & timer - > entry ;
__list_del ( entry - > prev , entry - > next ) ;
if ( clear_pending )
entry - > next = NULL ;
entry - > prev = LIST_POISON2 ;
}
/*
2006-03-31 14:30:30 +04:00
* We are using hashed locking: holding per_cpu(tvec_bases).lock
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync() has checked all cpus and before it rechecks
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. A timer is not serialized with respect to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that __run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes its base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
* means that all timers which are tied to this base via timer->base are
* locked, and the base itself is locked too.
*
* So __run_timers/migrate_timers can safely modify all timers which could
* be found on ->tvX lists.
*
* When the timer's base is locked, and the timer removed from list, it is
* possible to set timer->base = NULL and drop the lock: the timer remains
* locked.
*/
2006-03-31 14:30:30 +04:00
static tvec_base_t * lock_timer_base ( struct timer_list * timer ,
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync have checked all cpus and before it will recheck
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes it's base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
unsigned long * flags )
2006-09-29 12:59:36 +04:00
__acquires ( timer - > base - > lock )
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync have checked all cpus and before it will recheck
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes it's base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
{
2006-03-31 14:30:30 +04:00
tvec_base_t * base ;
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync have checked all cpus and before it will recheck
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes it's base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
for ( ; ; ) {
2007-05-08 11:27:44 +04:00
tvec_base_t * prelock_base = timer - > base ;
base = tbase_get_base ( prelock_base ) ;
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync have checked all cpus and before it will recheck
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes it's base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
if ( likely ( base ! = NULL ) ) {
spin_lock_irqsave ( & base - > lock , * flags ) ;
2007-05-08 11:27:44 +04:00
if ( likely ( prelock_base = = timer - > base ) )
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync have checked all cpus and before it will recheck
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes it's base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
return base ;
/* The timer has migrated to another CPU */
spin_unlock_irqrestore ( & base - > lock , * flags ) ;
}
cpu_relax ( ) ;
}
}
2005-04-17 02:20:36 +04:00
/*
 * __mod_timer - queue a timer with a new expiry, detaching it first if it
 * is already pending.
 *
 * Returns 1 if the timer was pending on entry, 0 otherwise.
 */
int __mod_timer ( struct timer_list * timer , unsigned long expires )
{
2006-03-31 14:30:30 +04:00
tvec_base_t * base , * new_base ;
2005-04-17 02:20:36 +04:00
unsigned long flags ;
int ret = 0 ;
[PATCH] Add debugging feature /proc/timer_stat
Add /proc/timer_stats support: debugging feature to profile timer expiration.
Both the starting site, process/PID and the expiration function is captured.
This allows the quick identification of timer event sources in a system.
Sample output:
# echo 1 > /proc/timer_stats
# cat /proc/timer_stats
Timer Stats Version: v0.1
Sample period: 4.010 s
24, 0 swapper hrtimer_stop_sched_tick (hrtimer_sched_tick)
11, 0 swapper sk_reset_timer (tcp_delack_timer)
6, 0 swapper hrtimer_stop_sched_tick (hrtimer_sched_tick)
2, 1 swapper queue_delayed_work_on (delayed_work_timer_fn)
17, 0 swapper hrtimer_restart_sched_tick (hrtimer_sched_tick)
2, 1 swapper queue_delayed_work_on (delayed_work_timer_fn)
4, 2050 pcscd do_nanosleep (hrtimer_wakeup)
5, 4179 sshd sk_reset_timer (tcp_write_timer)
4, 2248 yum-updatesd schedule_timeout (process_timeout)
18, 0 swapper hrtimer_restart_sched_tick (hrtimer_sched_tick)
3, 0 swapper sk_reset_timer (tcp_delack_timer)
1, 1 swapper neigh_table_init_no_netlink (neigh_periodic_timer)
2, 1 swapper e1000_up (e1000_watchdog)
1, 1 init schedule_timeout (process_timeout)
100 total events, 25.24 events/sec
[ cleanups and hrtimers support from Thomas Gleixner <tglx@linutronix.de> ]
[bunk@stusta.de: nr_entries can become static]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: john stultz <johnstul@us.ibm.com>
Cc: Roman Zippel <zippel@linux-m68k.org>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-02-16 12:28:13 +03:00
/* Record start info for the timer_stats facility. */
timer_stats_timer_set_start_info ( timer ) ;
2005-04-17 02:20:36 +04:00
/* A timer with no handler function is treated as a caller bug. */
BUG_ON ( ! timer - > function ) ;
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync have checked all cpus and before it will recheck
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes it's base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
/* Lock the base the timer currently belongs to; flags receives the saved irq state. */
base = lock_timer_base ( timer , & flags ) ;
/* Already queued: detach it and report that through the return value. */
if ( timer_pending ( timer ) ) {
detach_timer ( timer , 0 ) ;
ret = 1 ;
}
2006-03-24 14:15:54 +03:00
/* Base of the CPU this code is currently running on. */
new_base = __get_cpu_var ( tvec_bases ) ;
2005-04-17 02:20:36 +04:00
2006-03-31 14:30:30 +04:00
if ( base ! = new_base ) {
2005-04-17 02:20:36 +04:00
/*
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync have checked all cpus and before it will recheck
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes it's base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
* We are trying to schedule the timer on the local CPU .
* However we can ' t change timer ' s base while it is running ,
* otherwise del_timer_sync ( ) can ' t detect that the timer ' s
* handler yet has not finished . This also guarantees that
* the timer is serialized wrt itself .
2005-04-17 02:20:36 +04:00
*/
2006-03-31 14:30:31 +04:00
if ( likely ( base - > running_timer ! = timer ) ) {
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync have checked all cpus and before it will recheck
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes it's base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
/* See the comment in lock_timer_base() */
2007-05-08 11:27:44 +04:00
timer_set_base ( timer , NULL ) ;
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync have checked all cpus and before it will recheck
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes it's base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
/* Only the lock is swapped here; the irq state in flags is restored once, at the final unlock. */
spin_unlock ( & base - > lock ) ;
2006-03-31 14:30:31 +04:00
base = new_base ;
spin_lock ( & base - > lock ) ;
2007-05-08 11:27:44 +04:00
timer_set_base ( timer , base ) ;
2005-04-17 02:20:36 +04:00
}
}
/* base (old or new) is locked here: set the expiry and queue the timer on it. */
timer - > expires = expires ;
2006-03-31 14:30:31 +04:00
internal_add_timer ( base , timer ) ;
spin_unlock_irqrestore ( & base - > lock , flags ) ;
2005-04-17 02:20:36 +04:00
return ret ;
}
EXPORT_SYMBOL ( __mod_timer ) ;
2006-09-29 12:59:46 +04:00
/**
2005-04-17 02:20:36 +04:00
* add_timer_on - start a timer on a particular CPU
* @ timer : the timer to be added
* @ cpu : the CPU to start it on
*
* This is not very scalable on SMP . Double adds are not possible .
*/
void add_timer_on ( struct timer_list * timer , int cpu )
{
2006-03-24 14:15:54 +03:00
/* Timer base belonging to the requested CPU. */
tvec_base_t * base = per_cpu ( tvec_bases , cpu ) ;
2007-07-19 12:49:16 +04:00
unsigned long flags ;
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync have checked all cpus and before it will recheck
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes it's base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
[PATCH] Add debugging feature /proc/timer_stat
Add /proc/timer_stats support: debugging feature to profile timer expiration.
Both the starting site, process/PID and the expiration function is captured.
This allows the quick identification of timer event sources in a system.
Sample output:
# echo 1 > /proc/timer_stats
# cat /proc/timer_stats
Timer Stats Version: v0.1
Sample period: 4.010 s
24, 0 swapper hrtimer_stop_sched_tick (hrtimer_sched_tick)
11, 0 swapper sk_reset_timer (tcp_delack_timer)
6, 0 swapper hrtimer_stop_sched_tick (hrtimer_sched_tick)
2, 1 swapper queue_delayed_work_on (delayed_work_timer_fn)
17, 0 swapper hrtimer_restart_sched_tick (hrtimer_sched_tick)
2, 1 swapper queue_delayed_work_on (delayed_work_timer_fn)
4, 2050 pcscd do_nanosleep (hrtimer_wakeup)
5, 4179 sshd sk_reset_timer (tcp_write_timer)
4, 2248 yum-updatesd schedule_timeout (process_timeout)
18, 0 swapper hrtimer_restart_sched_tick (hrtimer_sched_tick)
3, 0 swapper sk_reset_timer (tcp_delack_timer)
1, 1 swapper neigh_table_init_no_netlink (neigh_periodic_timer)
2, 1 swapper e1000_up (e1000_watchdog)
1, 1 init schedule_timeout (process_timeout)
100 total events, 25.24 events/sec
[ cleanups and hrtimers support from Thomas Gleixner <tglx@linutronix.de> ]
[bunk@stusta.de: nr_entries can become static]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: john stultz <johnstul@us.ibm.com>
Cc: Roman Zippel <zippel@linux-m68k.org>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-02-16 12:28:13 +03:00
/* Record start info for the timer_stats facility. */
timer_stats_timer_set_start_info ( timer ) ;
2007-07-19 12:49:16 +04:00
/* The timer must not already be pending and must have a handler function. */
BUG_ON ( timer_pending ( timer ) | | ! timer - > function ) ;
2006-03-31 14:30:30 +04:00
/*
 * Attach the timer to the chosen base and queue it while holding that
 * base lock. NOTE: timer->expires is not modified in this function.
 */
spin_lock_irqsave ( & base - > lock , flags ) ;
2007-05-08 11:27:44 +04:00
timer_set_base ( timer , base ) ;
2005-04-17 02:20:36 +04:00
internal_add_timer ( base , timer ) ;
2006-03-31 14:30:30 +04:00
spin_unlock_irqrestore ( & base - > lock , flags ) ;
2005-04-17 02:20:36 +04:00
}
2006-09-29 12:59:46 +04:00
/**
2005-04-17 02:20:36 +04:00
* mod_timer - modify a timer ' s timeout
* @ timer : the timer to be modified
2006-09-29 12:59:46 +04:00
* @ expires : new timeout in jiffies
2005-04-17 02:20:36 +04:00
*
2007-02-10 12:45:59 +03:00
* mod_timer ( ) is a more efficient way to update the expire field of an
2005-04-17 02:20:36 +04:00
* active timer ( if the timer is inactive it will be activated )
*
* mod_timer ( timer , expires ) is equivalent to :
*
* del_timer ( timer ) ; timer - > expires = expires ; add_timer ( timer ) ;
*
* Note that if there are multiple unserialized concurrent users of the
* same timer , then mod_timer ( ) is the only safe way to modify the timeout ,
* since add_timer ( ) cannot modify an already running timer .
*
* The function returns whether it has modified a pending timer or not .
* ( ie . mod_timer ( ) of an inactive timer returns 0 , mod_timer ( ) of an
* active timer returns 1. )
*/
int mod_timer ( struct timer_list * timer , unsigned long expires )
{
/* A timer with no handler function is treated as a caller bug. */
BUG_ON ( ! timer - > function ) ;
[PATCH] Add debugging feature /proc/timer_stat
Add /proc/timer_stats support: debugging feature to profile timer expiration.
Both the starting site, process/PID and the expiration function is captured.
This allows the quick identification of timer event sources in a system.
Sample output:
# echo 1 > /proc/timer_stats
# cat /proc/timer_stats
Timer Stats Version: v0.1
Sample period: 4.010 s
24, 0 swapper hrtimer_stop_sched_tick (hrtimer_sched_tick)
11, 0 swapper sk_reset_timer (tcp_delack_timer)
6, 0 swapper hrtimer_stop_sched_tick (hrtimer_sched_tick)
2, 1 swapper queue_delayed_work_on (delayed_work_timer_fn)
17, 0 swapper hrtimer_restart_sched_tick (hrtimer_sched_tick)
2, 1 swapper queue_delayed_work_on (delayed_work_timer_fn)
4, 2050 pcscd do_nanosleep (hrtimer_wakeup)
5, 4179 sshd sk_reset_timer (tcp_write_timer)
4, 2248 yum-updatesd schedule_timeout (process_timeout)
18, 0 swapper hrtimer_restart_sched_tick (hrtimer_sched_tick)
3, 0 swapper sk_reset_timer (tcp_delack_timer)
1, 1 swapper neigh_table_init_no_netlink (neigh_periodic_timer)
2, 1 swapper e1000_up (e1000_watchdog)
1, 1 init schedule_timeout (process_timeout)
100 total events, 25.24 events/sec
[ cleanups and hrtimers support from Thomas Gleixner <tglx@linutronix.de> ]
[bunk@stusta.de: nr_entries can become static]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: john stultz <johnstul@us.ibm.com>
Cc: Roman Zippel <zippel@linux-m68k.org>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-02-16 12:28:13 +03:00
/* Record start info for the timer_stats facility. */
timer_stats_timer_set_start_info ( timer ) ;
2005-04-17 02:20:36 +04:00
/*
* This is a common optimization triggered by the
* networking code - if the timer is re - modified
* to be the same thing then just return :
*/
/* NOTE: this check is made without taking the timer base lock. */
if ( timer - > expires = = expires & & timer_pending ( timer ) )
return 1 ;
/* Otherwise take the locked path in __mod_timer(). */
return __mod_timer ( timer , expires ) ;
}
EXPORT_SYMBOL ( mod_timer ) ;
2006-09-29 12:59:46 +04:00
/**
2005-04-17 02:20:36 +04:00
* del_timer - deactivate a timer .
* @ timer : the timer to be deactivated
*
* del_timer ( ) deactivates a timer - this works on both active and inactive
* timers .
*
* The function returns whether it has deactivated a pending timer or not .
* ( ie . del_timer ( ) of an inactive timer returns 0 , del_timer ( ) of an
* active timer returns 1. )
*/
int del_timer ( struct timer_list * timer )
{
2006-03-31 14:30:30 +04:00
tvec_base_t * base ;
2005-04-17 02:20:36 +04:00
unsigned long flags ;
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync have checked all cpus and before it will recheck
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes it's base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
int ret = 0 ;
2005-04-17 02:20:36 +04:00
[PATCH] Add debugging feature /proc/timer_stat
Add /proc/timer_stats support: debugging feature to profile timer expiration.
Both the starting site, process/PID and the expiration function is captured.
This allows the quick identification of timer event sources in a system.
Sample output:
# echo 1 > /proc/timer_stats
# cat /proc/timer_stats
Timer Stats Version: v0.1
Sample period: 4.010 s
24, 0 swapper hrtimer_stop_sched_tick (hrtimer_sched_tick)
11, 0 swapper sk_reset_timer (tcp_delack_timer)
6, 0 swapper hrtimer_stop_sched_tick (hrtimer_sched_tick)
2, 1 swapper queue_delayed_work_on (delayed_work_timer_fn)
17, 0 swapper hrtimer_restart_sched_tick (hrtimer_sched_tick)
2, 1 swapper queue_delayed_work_on (delayed_work_timer_fn)
4, 2050 pcscd do_nanosleep (hrtimer_wakeup)
5, 4179 sshd sk_reset_timer (tcp_write_timer)
4, 2248 yum-updatesd schedule_timeout (process_timeout)
18, 0 swapper hrtimer_restart_sched_tick (hrtimer_sched_tick)
3, 0 swapper sk_reset_timer (tcp_delack_timer)
1, 1 swapper neigh_table_init_no_netlink (neigh_periodic_timer)
2, 1 swapper e1000_up (e1000_watchdog)
1, 1 init schedule_timeout (process_timeout)
100 total events, 25.24 events/sec
[ cleanups and hrtimers support from Thomas Gleixner <tglx@linutronix.de> ]
[bunk@stusta.de: nr_entries can become static]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: john stultz <johnstul@us.ibm.com>
Cc: Roman Zippel <zippel@linux-m68k.org>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-02-16 12:28:13 +03:00
timer_stats_timer_clear_start_info ( timer ) ;
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync() has checked all CPUs and before it rechecks
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes its base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
if ( timer_pending ( timer ) ) {
base = lock_timer_base ( timer , & flags ) ;
if ( timer_pending ( timer ) ) {
detach_timer ( timer , 1 ) ;
ret = 1 ;
}
2005-04-17 02:20:36 +04:00
spin_unlock_irqrestore ( & base - > lock , flags ) ;
}
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync() has checked all CPUs and before it rechecks
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes its base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
return ret ;
2005-04-17 02:20:36 +04:00
}
EXPORT_SYMBOL ( del_timer ) ;
# ifdef CONFIG_SMP
2006-09-29 12:59:46 +04:00
/**
* try_to_del_timer_sync - Try to deactivate a timer
* @timer: timer to delete
*
2005-06-23 11:08:59 +04:00
* This function tries to deactivate a timer . Upon successful ( ret > = 0 )
* exit the timer is not queued and the handler is not running on any CPU .
*
* It must not be called from interrupt contexts .
*
* Return: -1 if the timer is the one currently recorded as running on its
* base (nothing is changed in that case), 0 if the timer was not pending,
* 1 if a pending timer was detached .
*/
int try_to_del_timer_sync ( struct timer_list * timer )
{
2006-03-31 14:30:30 +04:00
tvec_base_t * base ;
2005-06-23 11:08:59 +04:00
unsigned long flags ;
/* Pessimistic default: reported when the handler is still running. */
int ret = - 1 ;
/* Everything below runs with the timer's base lock held. */
base = lock_timer_base ( timer , & flags ) ;
/* The handler is executing right now: give up and let the caller retry. */
if ( base - > running_timer = = timer )
goto out ;
ret = 0 ;
/* Only a queued timer needs detaching; report that via ret == 1. */
if ( timer_pending ( timer ) ) {
detach_timer ( timer , 1 ) ;
ret = 1 ;
}
out :
spin_unlock_irqrestore ( & base - > lock , flags ) ;
return ret ;
}
2007-04-27 02:46:56 +04:00
EXPORT_SYMBOL ( try_to_del_timer_sync ) ;
2006-09-29 12:59:46 +04:00
/**
2005-04-17 02:20:36 +04:00
* del_timer_sync - deactivate a timer and wait for the handler to finish .
* @ timer : the timer to be deactivated
*
* This function only differs from del_timer ( ) on SMP : besides deactivating
* the timer it also makes sure the handler has finished executing on other
* CPUs .
*
2007-02-10 12:45:59 +03:00
* Synchronization rules : Callers must prevent restarting of the timer ,
2005-04-17 02:20:36 +04:00
* otherwise this function is meaningless . It must not be called from
* interrupt contexts . The caller must not hold locks which would prevent
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync() has checked all CPUs and before it rechecks
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes its base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
* completion of the timer ' s handler . The timer ' s handler must not call
* add_timer_on ( ) . Upon exit the timer is not queued and the handler is
* not running on any CPU .
2005-04-17 02:20:36 +04:00
*
* The function returns whether it has deactivated a pending timer or not .
*/
int del_timer_sync ( struct timer_list * timer )
{
2005-06-23 11:08:59 +04:00
/*
 * Retry until try_to_del_timer_sync() stops reporting a negative value,
 * i.e. until the timer is no longer the running timer of its base.
 */
for ( ; ; ) {
int ret = try_to_del_timer_sync ( timer ) ;
/* 0: timer was not pending, 1: a pending timer was detached. */
if ( ret > = 0 )
return ret ;
2006-07-14 11:24:06 +04:00
/* Handler still running: pause briefly before trying again. */
cpu_relax ( ) ;
2005-06-23 11:08:59 +04:00
}
2005-04-17 02:20:36 +04:00
}
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync() has checked all CPUs and before it rechecks
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes its base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
EXPORT_SYMBOL ( del_timer_sync ) ;
2005-04-17 02:20:36 +04:00
# endif
/*
 * cascade - re-queue every timer held in one slot of a timer vector.
 * @base:  timer base the vector belongs to; each timer found must be tied
 *         to this base (enforced with BUG_ON below).
 * @tv:    vector whose slot is emptied.
 * @index: slot within tv->vec to empty.
 *
 * All timers in tv->vec[index] are taken off the slot and handed back to
 * internal_add_timer() one by one.
 *
 * Returns @index unchanged; __run_timers() tests the result for zero to
 * decide whether the next vector has to be cascaded as well.
 */
static int cascade ( tvec_base_t * base , tvec_t * tv , int index )
{
/* cascade all the timers from tv up one level */
[PATCH] When CONFIG_BASE_SMALL=1, cascade() may enter an infinite loop
When CONFIG_BASE_SAMLL=1, cascade() in may enter the infinite loop.
Because of CONFIG_BASE_SMALL=1(TVR_BITS=6 and TVN_BITS=4), the list
base->tv5 may cascade into base->tv5. So, the kernel enters the infinite
loop in the function cascade().
I created a test module to verify this bug, and a patch to fix it.
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/timer.h>
#if 0
#include <linux/kdb.h>
#else
#define kdb_printf printk
#endif
#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6)
#define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8)
#define TVN_SIZE (1 << TVN_BITS)
#define TVR_SIZE (1 << TVR_BITS)
#define TVN_MASK (TVN_SIZE - 1)
#define TVR_MASK (TVR_SIZE - 1)
#define TV_SIZE(N) (N*TVN_BITS + TVR_BITS)
struct timer_list timer0;
struct timer_list dummy_timer1;
struct timer_list dummy_timer2;
void dummy_timer_fun(unsigned long data) {
}
unsigned long j=0;
void check_timer_base(unsigned long data)
{
kdb_printf("check_timer_base %08x\n",jiffies);
mod_timer(&timer0,(jiffies & (~0xFFF)) + 0x1FFF);
}
int init_module(void)
{
init_timer(&timer0);
timer0.data = (unsigned long)0;
timer0.function = check_timer_base;
mod_timer(&timer0,jiffies+1);
init_timer(&dummy_timer1);
dummy_timer1.data = (unsigned long)0;
dummy_timer1.function = dummy_timer_fun;
init_timer(&dummy_timer2);
dummy_timer2.data = (unsigned long)0;
dummy_timer2.function = dummy_timer_fun;
j=jiffies;
j&=(~((1<<TV_SIZE(3))-1));
j+=(1<<TV_SIZE(3));
j+=(1<<TV_SIZE(4));
kdb_printf("mod_timer %08x\n",j);
mod_timer(&dummy_timer1, j );
mod_timer(&dummy_timer2, j );
return 0;
}
void cleanup_module()
{
del_timer_sync(&timer0);
del_timer_sync(&dummy_timer1);
del_timer_sync(&dummy_timer2);
}
(Cleanups from Oleg)
[oleg@tv-sign.ru: use list_replace_init()]
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Matt Mackall <mpm@selenic.com>
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-23 13:05:56 +04:00
struct timer_list * timer , * tmp ;
struct list_head tv_list ;
/*
 * Walk a private list (tv_list) rather than the slot itself.
 * NOTE(review): presumably this is what keeps the loop below finite when a
 * timer is re-added to the very slot being cascaded - confirm against the
 * list_replace_init() documentation.
 */
list_replace_init ( tv - > vec + index , & tv_list ) ;
2005-04-17 02:20:36 +04:00
/*
[PATCH] When CONFIG_BASE_SMALL=1, cascade() may enter an infinite loop
When CONFIG_BASE_SAMLL=1, cascade() in may enter the infinite loop.
Because of CONFIG_BASE_SMALL=1(TVR_BITS=6 and TVN_BITS=4), the list
base->tv5 may cascade into base->tv5. So, the kernel enters the infinite
loop in the function cascade().
I created a test module to verify this bug, and a patch to fix it.
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/timer.h>
#if 0
#include <linux/kdb.h>
#else
#define kdb_printf printk
#endif
#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6)
#define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8)
#define TVN_SIZE (1 << TVN_BITS)
#define TVR_SIZE (1 << TVR_BITS)
#define TVN_MASK (TVN_SIZE - 1)
#define TVR_MASK (TVR_SIZE - 1)
#define TV_SIZE(N) (N*TVN_BITS + TVR_BITS)
struct timer_list timer0;
struct timer_list dummy_timer1;
struct timer_list dummy_timer2;
void dummy_timer_fun(unsigned long data) {
}
unsigned long j=0;
void check_timer_base(unsigned long data)
{
kdb_printf("check_timer_base %08x\n",jiffies);
mod_timer(&timer0,(jiffies & (~0xFFF)) + 0x1FFF);
}
int init_module(void)
{
init_timer(&timer0);
timer0.data = (unsigned long)0;
timer0.function = check_timer_base;
mod_timer(&timer0,jiffies+1);
init_timer(&dummy_timer1);
dummy_timer1.data = (unsigned long)0;
dummy_timer1.function = dummy_timer_fun;
init_timer(&dummy_timer2);
dummy_timer2.data = (unsigned long)0;
dummy_timer2.function = dummy_timer_fun;
j=jiffies;
j&=(~((1<<TV_SIZE(3))-1));
j+=(1<<TV_SIZE(3));
j+=(1<<TV_SIZE(4));
kdb_printf("mod_timer %08x\n",j);
mod_timer(&dummy_timer1, j );
mod_timer(&dummy_timer2, j );
return 0;
}
void cleanup_module()
{
del_timer_sync(&timer0);
del_timer_sync(&dummy_timer1);
del_timer_sync(&dummy_timer2);
}
(Cleanups from Oleg)
[oleg@tv-sign.ru: use list_replace_init()]
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Matt Mackall <mpm@selenic.com>
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-23 13:05:56 +04:00
* We are removing _all_ timers from the list , so we
* don ' t have to detach them individually .
2005-04-17 02:20:36 +04:00
*/
[PATCH] When CONFIG_BASE_SMALL=1, cascade() may enter an infinite loop
When CONFIG_BASE_SAMLL=1, cascade() in may enter the infinite loop.
Because of CONFIG_BASE_SMALL=1(TVR_BITS=6 and TVN_BITS=4), the list
base->tv5 may cascade into base->tv5. So, the kernel enters the infinite
loop in the function cascade().
I created a test module to verify this bug, and a patch to fix it.
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/timer.h>
#if 0
#include <linux/kdb.h>
#else
#define kdb_printf printk
#endif
#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6)
#define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8)
#define TVN_SIZE (1 << TVN_BITS)
#define TVR_SIZE (1 << TVR_BITS)
#define TVN_MASK (TVN_SIZE - 1)
#define TVR_MASK (TVR_SIZE - 1)
#define TV_SIZE(N) (N*TVN_BITS + TVR_BITS)
struct timer_list timer0;
struct timer_list dummy_timer1;
struct timer_list dummy_timer2;
void dummy_timer_fun(unsigned long data) {
}
unsigned long j=0;
void check_timer_base(unsigned long data)
{
kdb_printf("check_timer_base %08x\n",jiffies);
mod_timer(&timer0,(jiffies & (~0xFFF)) + 0x1FFF);
}
int init_module(void)
{
init_timer(&timer0);
timer0.data = (unsigned long)0;
timer0.function = check_timer_base;
mod_timer(&timer0,jiffies+1);
init_timer(&dummy_timer1);
dummy_timer1.data = (unsigned long)0;
dummy_timer1.function = dummy_timer_fun;
init_timer(&dummy_timer2);
dummy_timer2.data = (unsigned long)0;
dummy_timer2.function = dummy_timer_fun;
j=jiffies;
j&=(~((1<<TV_SIZE(3))-1));
j+=(1<<TV_SIZE(3));
j+=(1<<TV_SIZE(4));
kdb_printf("mod_timer %08x\n",j);
mod_timer(&dummy_timer1, j );
mod_timer(&dummy_timer2, j );
return 0;
}
void cleanup_module()
{
del_timer_sync(&timer0);
del_timer_sync(&dummy_timer1);
del_timer_sync(&dummy_timer2);
}
(Cleanups from Oleg)
[oleg@tv-sign.ru: use list_replace_init()]
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Matt Mackall <mpm@selenic.com>
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-23 13:05:56 +04:00
list_for_each_entry_safe ( timer , tmp , & tv_list , entry ) {
2007-05-08 11:27:44 +04:00
/* A timer found on this base's lists must be tied to this base. */
BUG_ON ( tbase_get_base ( timer - > base ) ! = base ) ;
[PATCH] When CONFIG_BASE_SMALL=1, cascade() may enter an infinite loop
When CONFIG_BASE_SAMLL=1, cascade() in may enter the infinite loop.
Because of CONFIG_BASE_SMALL=1(TVR_BITS=6 and TVN_BITS=4), the list
base->tv5 may cascade into base->tv5. So, the kernel enters the infinite
loop in the function cascade().
I created a test module to verify this bug, and a patch to fix it.
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/timer.h>
#if 0
#include <linux/kdb.h>
#else
#define kdb_printf printk
#endif
#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6)
#define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8)
#define TVN_SIZE (1 << TVN_BITS)
#define TVR_SIZE (1 << TVR_BITS)
#define TVN_MASK (TVN_SIZE - 1)
#define TVR_MASK (TVR_SIZE - 1)
#define TV_SIZE(N) (N*TVN_BITS + TVR_BITS)
struct timer_list timer0;
struct timer_list dummy_timer1;
struct timer_list dummy_timer2;
void dummy_timer_fun(unsigned long data) {
}
unsigned long j=0;
void check_timer_base(unsigned long data)
{
kdb_printf("check_timer_base %08x\n",jiffies);
mod_timer(&timer0,(jiffies & (~0xFFF)) + 0x1FFF);
}
int init_module(void)
{
init_timer(&timer0);
timer0.data = (unsigned long)0;
timer0.function = check_timer_base;
mod_timer(&timer0,jiffies+1);
init_timer(&dummy_timer1);
dummy_timer1.data = (unsigned long)0;
dummy_timer1.function = dummy_timer_fun;
init_timer(&dummy_timer2);
dummy_timer2.data = (unsigned long)0;
dummy_timer2.function = dummy_timer_fun;
j=jiffies;
j&=(~((1<<TV_SIZE(3))-1));
j+=(1<<TV_SIZE(3));
j+=(1<<TV_SIZE(4));
kdb_printf("mod_timer %08x\n",j);
mod_timer(&dummy_timer1, j );
mod_timer(&dummy_timer2, j );
return 0;
}
void cleanup_module()
{
del_timer_sync(&timer0);
del_timer_sync(&dummy_timer1);
del_timer_sync(&dummy_timer2);
}
(Cleanups from Oleg)
[oleg@tv-sign.ru: use list_replace_init()]
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Matt Mackall <mpm@selenic.com>
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-23 13:05:56 +04:00
/* NOTE(review): presumably re-files the timer by its expiry - confirm in internal_add_timer(). */
internal_add_timer ( base , timer ) ;
2005-04-17 02:20:36 +04:00
}
/* Hand the slot number back so the caller can test it for zero. */
return index ;
}
2006-09-29 12:59:46 +04:00
/*
 * INDEX(N) - slot number for cascade level N, taken from TVN_MASK-wide bits
 * of base->timer_jiffies above the low TVR_BITS. Expands to a reference to
 * a variable named `base`, so it is only usable where one is in scope.
 * __run_timers() pairs INDEX(0)..INDEX(3) with tv2..tv5.
 */
# define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK)
/**
2005-04-17 02:20:36 +04:00
* __run_timers - run all expired timers ( if any ) on this CPU .
* @ base : the timer vector to be processed .
*
* This function cascades all vectors and executes all expired timer
* vectors .
*
* base->lock is taken with spin_lock_irq() on entry and released on exit.
* It is dropped around every timer callback and re-taken afterwards, so
* callbacks run without the base lock held. The outer loop handles one
* value of base->timer_jiffies per iteration until it has moved past jiffies.
*/
static inline void __run_timers ( tvec_base_t * base )
{
struct timer_list * timer ;
2006-03-31 14:30:30 +04:00
spin_lock_irq ( & base - > lock ) ;
2005-04-17 02:20:36 +04:00
while ( time_after_eq ( jiffies , base - > timer_jiffies ) ) {
2006-06-23 13:05:55 +04:00
struct list_head work_list ;
2005-04-17 02:20:36 +04:00
struct list_head * head = & work_list ;
2007-07-19 12:49:16 +04:00
/* Slot of tv1 that belongs to the tick being processed. */
int index = base - > timer_jiffies & TVR_MASK ;
2006-06-23 13:05:55 +04:00
2005-04-17 02:20:36 +04:00
/*
* Cascade timers: only when the tv1 index is 0. tv2 is always cascaded
* in that case; each higher vector is cascaded only if cascade() returned
* 0 for the vector below it.
*/
if ( ! index & &
( ! cascade ( base , & base - > tv2 , INDEX ( 0 ) ) ) & &
( ! cascade ( base , & base - > tv3 , INDEX ( 1 ) ) ) & &
! cascade ( base , & base - > tv4 , INDEX ( 2 ) ) )
cascade ( base , & base - > tv5 , INDEX ( 3 ) ) ;
2006-06-23 13:05:55 +04:00
/*
* Advance the base clock, then move the whole tv1 slot onto the local
* work_list; list_replace_init() leaves the slot itself empty.
*/
+ + base - > timer_jiffies ;
list_replace_init ( base - > tv1 . vec + index , & work_list ) ;
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync have checked all cpus and before it will recheck
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes it's base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
while ( ! list_empty ( head ) ) {
2005-04-17 02:20:36 +04:00
void ( * fn ) ( unsigned long ) ;
unsigned long data ;
Introduce a handy list_first_entry macro
There are many places in the kernel where the construction like
foo = list_entry(head->next, struct foo_struct, list);
are used.
The code might look more descriptive and neat if using the macro
list_first_entry(head, type, member) \
list_entry((head)->next, type, member)
Here is the macro itself and the examples of its usage in the generic code.
If it will turn out to be useful, I can prepare the set of patches to
inject in into arch-specific code, drivers, networking, etc.
Signed-off-by: Pavel Emelianov <xemul@openvz.org>
Signed-off-by: Kirill Korotaev <dev@openvz.org>
Cc: Randy Dunlap <randy.dunlap@oracle.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Zach Brown <zach.brown@oracle.com>
Cc: Davide Libenzi <davidel@xmailserver.org>
Cc: John McCutchan <ttb@tentacle.dhs.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: john stultz <johnstul@us.ibm.com>
Cc: Ram Pai <linuxram@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-05-08 11:30:19 +04:00
timer = list_first_entry ( head , struct timer_list , entry ) ;
2007-07-19 12:49:16 +04:00
/* Copy callback and argument while the base lock is still held. */
fn = timer - > function ;
data = timer - > data ;
2005-04-17 02:20:36 +04:00
[PATCH] Add debugging feature /proc/timer_stat
Add /proc/timer_stats support: debugging feature to profile timer expiration.
Both the starting site, process/PID and the expiration function is captured.
This allows the quick identification of timer event sources in a system.
Sample output:
# echo 1 > /proc/timer_stats
# cat /proc/timer_stats
Timer Stats Version: v0.1
Sample period: 4.010 s
24, 0 swapper hrtimer_stop_sched_tick (hrtimer_sched_tick)
11, 0 swapper sk_reset_timer (tcp_delack_timer)
6, 0 swapper hrtimer_stop_sched_tick (hrtimer_sched_tick)
2, 1 swapper queue_delayed_work_on (delayed_work_timer_fn)
17, 0 swapper hrtimer_restart_sched_tick (hrtimer_sched_tick)
2, 1 swapper queue_delayed_work_on (delayed_work_timer_fn)
4, 2050 pcscd do_nanosleep (hrtimer_wakeup)
5, 4179 sshd sk_reset_timer (tcp_write_timer)
4, 2248 yum-updatesd schedule_timeout (process_timeout)
18, 0 swapper hrtimer_restart_sched_tick (hrtimer_sched_tick)
3, 0 swapper sk_reset_timer (tcp_delack_timer)
1, 1 swapper neigh_table_init_no_netlink (neigh_periodic_timer)
2, 1 swapper e1000_up (e1000_watchdog)
1, 1 init schedule_timeout (process_timeout)
100 total events, 25.24 events/sec
[ cleanups and hrtimers support from Thomas Gleixner <tglx@linutronix.de> ]
[bunk@stusta.de: nr_entries can become static]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: john stultz <johnstul@us.ibm.com>
Cc: Roman Zippel <zippel@linux-m68k.org>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-02-16 12:28:13 +03:00
timer_stats_account_timer ( timer ) ;
2005-04-17 02:20:36 +04:00
/* Record the timer as the base's running timer before it is detached. */
set_running_timer ( base , timer ) ;
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync have checked all cpus and before it will recheck
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes it's base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
detach_timer ( timer , 1 ) ;
2006-03-31 14:30:30 +04:00
/* The callback itself runs with the base lock released. */
spin_unlock_irq ( & base - > lock ) ;
2005-04-17 02:20:36 +04:00
{
2005-06-23 11:09:09 +04:00
/*
* Remember the preempt count so that a callback returning with a
* different count is reported via printk() and then hits BUG().
*/
int preempt_count = preempt_count ( ) ;
2005-04-17 02:20:36 +04:00
fn ( data ) ;
if ( preempt_count ! = preempt_count ( ) ) {
2005-06-23 11:09:09 +04:00
printk ( KERN_WARNING " huh, entered %p "
" with preempt_count %08x, exited "
" with %08x? \n " ,
fn , preempt_count ,
preempt_count ( ) ) ;
2005-04-17 02:20:36 +04:00
BUG ( ) ;
}
}
2006-03-31 14:30:30 +04:00
spin_lock_irq ( & base - > lock ) ;
2005-04-17 02:20:36 +04:00
}
}
/* Nothing is running any more once the base has caught up with jiffies. */
set_running_timer ( base , NULL ) ;
2006-03-31 14:30:30 +04:00
spin_unlock_irq ( & base - > lock ) ;
2005-04-17 02:20:36 +04:00
}
2007-02-16 12:27:47 +03:00
# if defined(CONFIG_NO_IDLE_HZ) || defined(CONFIG_NO_HZ)
2005-04-17 02:20:36 +04:00
/*
* Find out when the next timer event is due to happen. This
* is used on S/390 to stop all activity when a CPU is idle.
* This function needs to be called disabled (presumably: with interrupts
* disabled - NOTE(review): confirm against the callers).
*
* Returns the expiry (in jiffies) of the earliest timer found in @base, or
* base->timer_jiffies + NEXT_TIMER_MAX_DELTA if no timer was found.
* Deferrable timers are skipped while scanning tv1.
*/
2007-02-16 12:27:46 +03:00
static unsigned long __next_timer_interrupt ( tvec_base_t * base )
2005-04-17 02:20:36 +04:00
{
2007-02-16 12:27:46 +03:00
unsigned long timer_jiffies = base - > timer_jiffies ;
2007-05-30 01:47:39 +04:00
/* Default answer when no timer is queued at all. */
unsigned long expires = timer_jiffies + NEXT_TIMER_MAX_DELTA ;
2007-02-16 12:27:46 +03:00
int index , slot , array , found = 0 ;
2005-04-17 02:20:36 +04:00
struct timer_list * nte ;
tvec_t * varray [ 4 ] ;
/* Look for timer events in tv1. */
2007-02-16 12:27:46 +03:00
/* Start at the slot of the current base time and wrap around once. */
index = slot = timer_jiffies & TVR_MASK ;
2005-04-17 02:20:36 +04:00
do {
2007-02-16 12:27:46 +03:00
list_for_each_entry ( nte , base - > tv1 . vec + slot , entry ) {
2007-07-19 12:49:16 +04:00
if ( tbase_get_deferrable ( nte - > base ) )
continue ;
2007-05-08 11:27:44 +04:00
2007-02-16 12:27:46 +03:00
found = 1 ;
2005-04-17 02:20:36 +04:00
expires = nte - > expires ;
2007-02-16 12:27:46 +03:00
/* Look at the cascade bucket(s)? */
/*
* The scan started at slot 0 or has wrapped past the end of tv1, so
* the higher vectors are examined as well before answering.
*/
if ( ! index | | slot < index )
goto cascade ;
return expires ;
2005-04-17 02:20:36 +04:00
}
2007-02-16 12:27:46 +03:00
slot = ( slot + 1 ) & TVR_MASK ;
} while ( slot ! = index ) ;
cascade :
/* Calculate the next cascade event */
if ( index )
timer_jiffies + = TVR_SIZE - index ;
timer_jiffies > > = TVR_BITS ;
2005-04-17 02:20:36 +04:00
/* Check tv2-tv5. */
varray [ 0 ] = & base - > tv2 ;
varray [ 1 ] = & base - > tv3 ;
varray [ 2 ] = & base - > tv4 ;
varray [ 3 ] = & base - > tv5 ;
2007-02-16 12:27:46 +03:00
for ( array = 0 ; array < 4 ; array + + ) {
tvec_t * varp = varray [ array ] ;
index = slot = timer_jiffies & TVN_MASK ;
2005-04-17 02:20:36 +04:00
do {
2007-02-16 12:27:46 +03:00
/* Every timer in the slot is inspected; the earliest expiry wins. */
list_for_each_entry ( nte , varp - > vec + slot , entry ) {
found = 1 ;
2005-04-17 02:20:36 +04:00
if ( time_before ( nte - > expires , expires ) )
expires = nte - > expires ;
2007-02-16 12:27:46 +03:00
}
/*
* Do we still search for the first timer or are
* we looking up the cascade buckets ?
*/
if ( found ) {
/* Look at the cascade bucket(s)? */
if ( ! index | | slot < index )
break ;
return expires ;
}
slot = ( slot + 1 ) & TVN_MASK ;
} while ( slot ! = index ) ;
/* Move on to the next cascade boundary of the next higher vector. */
if ( index )
timer_jiffies + = TVN_SIZE - index ;
timer_jiffies > > = TVN_BITS ;
2005-04-17 02:20:36 +04:00
}
2007-02-16 12:27:46 +03:00
return expires ;
}
2006-03-07 02:42:45 +03:00
2007-02-16 12:27:46 +03:00
/*
* Check , if the next hrtimer event is before the next timer wheel
* event :
*
* @now is the current time and @expires the next timer wheel expiry, both
* in jiffies. Returns whichever comes first: @expires, or @now plus the
* delta to the next hrtimer event (converted to jiffies and clamped to the
* range 1 .. NEXT_TIMER_MAX_DELTA).
*/
static unsigned long cmp_next_hrtimer_event ( unsigned long now ,
unsigned long expires )
{
ktime_t hr_delta = hrtimer_get_next_event ( ) ;
struct timespec tsdelta ;
2007-03-25 16:31:17 +04:00
unsigned long delta ;
2007-02-16 12:27:46 +03:00
/* KTIME_MAX: nothing to compare against, the wheel expiry stands. */
if ( hr_delta . tv64 = = KTIME_MAX )
return expires ;
2006-05-21 02:00:24 +04:00
2007-03-25 16:31:17 +04:00
/*
* Expired timer available , let it expire in the next tick
*/
if ( hr_delta . tv64 < = 0 )
return now + 1 ;
2006-03-07 02:42:45 +03:00
2007-02-16 12:27:46 +03:00
/* Convert the ktime delta to jiffies via a timespec. */
tsdelta = ktime_to_timespec ( hr_delta ) ;
2007-03-25 16:31:17 +04:00
delta = timespec_to_jiffies ( & tsdelta ) ;
2007-05-30 01:47:39 +04:00
/*
* Limit the delta to the max value , which is checked in
* tick_nohz_stop_sched_tick ( ) :
*/
if ( delta > NEXT_TIMER_MAX_DELTA )
delta = NEXT_TIMER_MAX_DELTA ;
2007-03-25 16:31:17 +04:00
/*
* Take rounding errors in to account and make sure , that it
* expires in the next tick . Otherwise we go into an endless
* ping pong due to tick_nohz_stop_sched_tick ( ) retriggering
* the timer softirq
*/
if ( delta < 1 )
delta = 1 ;
now + = delta ;
2007-02-16 12:27:46 +03:00
if ( time_before ( now , expires ) )
return now ;
2005-04-17 02:20:36 +04:00
return expires ;
}
2007-02-16 12:27:46 +03:00
/**
2007-11-06 01:51:10 +03:00
* get_next_timer_interrupt - return the jiffy of the next pending timer
2007-03-01 07:12:13 +03:00
* @ now : current time ( in jiffies )
2007-02-16 12:27:46 +03:00
*
* Looks at the timer base of the executing CPU under base->lock. If the
* next timer wheel expiry is not after @now, @now is returned; otherwise the
* result is the earlier of that expiry and the next hrtimer event.
*/
2007-02-16 12:27:47 +03:00
unsigned long get_next_timer_interrupt ( unsigned long now )
2007-02-16 12:27:46 +03:00
{
tvec_base_t * base = __get_cpu_var ( tvec_bases ) ;
2007-02-16 12:27:47 +03:00
unsigned long expires ;
2007-02-16 12:27:46 +03:00
/* Plain spin_lock(): interrupt state is left as the caller set it. */
spin_lock ( & base - > lock ) ;
expires = __next_timer_interrupt ( base ) ;
spin_unlock ( & base - > lock ) ;
/* A timer is already due: no point in consulting the hrtimers. */
if ( time_before_eq ( expires , now ) )
return now ;
return cmp_next_hrtimer_event ( now , expires ) ;
}
2007-02-16 12:27:47 +03:00
# ifdef CONFIG_NO_IDLE_HZ
/*
 * Convenience wrapper: ask get_next_timer_interrupt() for the next timer
 * event relative to the current value of jiffies.
 */
unsigned long next_timer_interrupt(void)
{
	unsigned long now = jiffies;

	return get_next_timer_interrupt(now);
}
# endif
2005-04-17 02:20:36 +04:00
# endif
2007-11-10 00:39:38 +03:00
# ifndef CONFIG_VIRT_CPU_ACCOUNTING
/*
 * Charge one jiffy of CPU time to task @p.
 *
 * @p:         task to charge
 * @user_tick: non-zero to account the tick as user time, zero to account it
 *             as system time (with HARDIRQ_OFFSET)
 *
 * Both the plain and the "scaled" accounting hooks are invoked.
 */
void account_process_tick(struct task_struct *p, int user_tick)
{
	if (!user_tick) {
		account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1));
		account_system_time_scaled(p, jiffies_to_cputime(1));
		return;
	}

	account_user_time(p, jiffies_to_cputime(1));
	account_user_time_scaled(p, jiffies_to_cputime(1));
}
# endif
2005-04-17 02:20:36 +04:00
/*
2007-10-18 14:06:11 +04:00
* Called from the timer interrupt handler to charge one tick to the current
2005-04-17 02:20:36 +04:00
* process . user_tick is 1 if the tick is user time , 0 for system .
*
* After the accounting, in this order: the local timers are kicked, RCU
* callbacks are checked if rcu_pending() reports work for this CPU, the
* scheduler tick runs, and the POSIX CPU timers of the current task are run.
*/
void update_process_times ( int user_tick )
{
struct task_struct * p = current ;
int cpu = smp_processor_id ( ) ;
/* Note: this timer irq context must be accounted for as well. */
2007-11-10 00:39:38 +03:00
account_process_tick ( p , user_tick ) ;
2005-04-17 02:20:36 +04:00
run_local_timers ( ) ;
if ( rcu_pending ( cpu ) )
rcu_check_callbacks ( cpu , user_tick ) ;
scheduler_tick ( ) ;
2007-07-19 12:49:16 +04:00
run_posix_cpu_timers ( p ) ;
2005-04-17 02:20:36 +04:00
}
/*
* Nr of active tasks - counted in fixed - point numbers
*
* Returns nr_active() scaled by FIXED_1.
*/
static unsigned long count_active_tasks ( void )
{
2006-03-31 14:31:21 +04:00
return nr_active ( ) * FIXED_1 ;
2005-04-17 02:20:36 +04:00
}
/*
* Hmm . . Changed this , as the GNU make sources ( load . c ) seems to
* imply that avenrun [ ] is the standard name for this kind of thing .
* Nothing else seems to be standardized : the fractional size etc
* all seem to differ on different machines .
*
* Requires xtime_lock to access .
*/
unsigned long avenrun [ 3 ] ;
EXPORT_SYMBOL ( avenrun ) ;
/*
* calc_load - given tick count , update the avenrun load estimates .
* This is called while holding a write_lock on xtime_lock .
*
* A static countdown starts at LOAD_FREQ and is reduced by @ticks on every
* call. Once it drops below zero the active task count is sampled once and
* the three avenrun[] entries are updated, repeating the update (with the
* same sample) for every LOAD_FREQ interval that has elapsed.
*/
static inline void calc_load ( unsigned long ticks )
{
unsigned long active_tasks ; /* fixed-point */
static int count = LOAD_FREQ ;
2006-12-13 11:35:45 +03:00
count - = ticks ;
if ( unlikely ( count < 0 ) ) {
active_tasks = count_active_tasks ( ) ;
/* One pass per elapsed LOAD_FREQ interval until the countdown is >= 0. */
do {
CALC_LOAD ( avenrun [ 0 ] , EXP_1 , active_tasks ) ;
CALC_LOAD ( avenrun [ 1 ] , EXP_5 , active_tasks ) ;
CALC_LOAD ( avenrun [ 2 ] , EXP_15 , active_tasks ) ;
count + = LOAD_FREQ ;
} while ( count < 0 ) ;
2005-04-17 02:20:36 +04:00
}
}
/*
* This function runs timers and the timer - tq in bottom half context .
*
* It first runs the hrtimer queues and then, if jiffies has reached the
* timer_jiffies of this CPU's base, processes the timer wheel via
* __run_timers(). @h is not used.
*/
static void run_timer_softirq ( struct softirq_action * h )
{
2006-03-24 14:15:54 +03:00
tvec_base_t * base = __get_cpu_var ( tvec_bases ) ;
2005-04-17 02:20:36 +04:00
[PATCH] Add debugging feature /proc/timer_stat
Add /proc/timer_stats support: debugging feature to profile timer expiration.
Both the starting site, process/PID and the expiration function is captured.
This allows the quick identification of timer event sources in a system.
Sample output:
# echo 1 > /proc/timer_stats
# cat /proc/timer_stats
Timer Stats Version: v0.1
Sample period: 4.010 s
24, 0 swapper hrtimer_stop_sched_tick (hrtimer_sched_tick)
11, 0 swapper sk_reset_timer (tcp_delack_timer)
6, 0 swapper hrtimer_stop_sched_tick (hrtimer_sched_tick)
2, 1 swapper queue_delayed_work_on (delayed_work_timer_fn)
17, 0 swapper hrtimer_restart_sched_tick (hrtimer_sched_tick)
2, 1 swapper queue_delayed_work_on (delayed_work_timer_fn)
4, 2050 pcscd do_nanosleep (hrtimer_wakeup)
5, 4179 sshd sk_reset_timer (tcp_write_timer)
4, 2248 yum-updatesd schedule_timeout (process_timeout)
18, 0 swapper hrtimer_restart_sched_tick (hrtimer_sched_tick)
3, 0 swapper sk_reset_timer (tcp_delack_timer)
1, 1 swapper neigh_table_init_no_netlink (neigh_periodic_timer)
2, 1 swapper e1000_up (e1000_watchdog)
1, 1 init schedule_timeout (process_timeout)
100 total events, 25.24 events/sec
[ cleanups and hrtimers support from Thomas Gleixner <tglx@linutronix.de> ]
[bunk@stusta.de: nr_entries can become static]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: john stultz <johnstul@us.ibm.com>
Cc: Roman Zippel <zippel@linux-m68k.org>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-02-16 12:28:13 +03:00
hrtimer_run_queues ( ) ;
2005-04-17 02:20:36 +04:00
/* Unlocked pre-check; __run_timers() repeats the test under base->lock. */
if ( time_after_eq ( jiffies , base - > timer_jiffies ) )
__run_timers ( base ) ;
}
/*
* Called by the local , per - CPU timer interrupt on SMP .
*
* Raises TIMER_SOFTIRQ so that the timers are processed in softirq context,
* then calls softlockup_tick().
*/
void run_local_timers ( void )
{
raise_softirq ( TIMER_SOFTIRQ ) ;
2006-03-24 14:18:41 +03:00
softlockup_tick ( ) ;
2005-04-17 02:20:36 +04:00
}
/*
* Called by the timer interrupt . xtime_lock must already be taken
* by the timer IRQ !
*
* Updates the wall time and feeds @ticks into the load average calculation.
*/
2006-09-29 13:00:32 +04:00
static inline void update_times ( unsigned long ticks )
2005-04-17 02:20:36 +04:00
{
2006-06-26 11:25:06 +04:00
update_wall_time ( ) ;
2005-04-17 02:20:36 +04:00
calc_load ( ticks ) ;
}
2007-07-19 12:49:16 +04:00
2005-04-17 02:20:36 +04:00
/*
* The 64 - bit jiffies value is not atomic - you MUST NOT read it
* without sampling the sequence number in xtime_lock .
* jiffies is defined in the linker script . . .
*
* do_timer() advances jiffies_64 by @ticks and then updates wall time and
* load averages via update_times().
*/
2006-09-29 13:00:32 +04:00
void do_timer ( unsigned long ticks )
2005-04-17 02:20:36 +04:00
{
2006-09-29 13:00:32 +04:00
jiffies_64 + = ticks ;
update_times ( ticks ) ;
2005-04-17 02:20:36 +04:00
}
# ifdef __ARCH_WANT_SYS_ALARM
/*
* For backwards compatibility ? This can be done in libc so Alpha
* and all newer ports shouldn ' t need it .
*
* Thin wrapper: passes @seconds to alarm_setitimer() and returns its result.
*/
asmlinkage unsigned long sys_alarm ( unsigned int seconds )
{
2006-03-25 14:06:33 +03:00
return alarm_setitimer ( seconds ) ;
2005-04-17 02:20:36 +04:00
}
# endif
# ifndef __alpha__
/*
* The Alpha uses getxpid , getxuid , and getxgid instead . Maybe this
* should be moved into arch / i386 instead ?
*/
/**
* sys_getpid - return the thread group id of the current process
*
* Note , despite the name , this returns the tgid not the pid . The tgid and
* the pid are identical unless CLONE_THREAD was specified on clone ( ) in
* which case the tgid is the same in all threads of the same group .
*
* This is SMP safe as current - > tgid does not change .
*
* The value is obtained through task_tgid_vnr(current).
*/
asmlinkage long sys_getpid ( void )
{
2007-10-19 10:40:14 +04:00
return task_tgid_vnr ( current ) ;
2005-04-17 02:20:36 +04:00
}
/*
2006-08-14 10:24:23 +04:00
* Accessing - > real_parent is not SMP - safe , it could
* change from under us . However , we can use a stale
* value of - > real_parent under rcu_read_lock ( ) , see
* release_task ( ) - > call_rcu ( delayed_put_task_struct ) .
2005-04-17 02:20:36 +04:00
*
* Returns the tgid of current->real_parent as seen from the pid namespace
* in current->nsproxy->pid_ns.
*/
asmlinkage long sys_getppid ( void )
{
int pid ;
2006-08-14 10:24:23 +04:00
/* The lookup of ->real_parent happens inside the RCU read-side section. */
rcu_read_lock ( ) ;
2008-01-10 23:52:04 +03:00
pid = task_tgid_nr_ns ( current - > real_parent , current - > nsproxy - > pid_ns ) ;
2006-08-14 10:24:23 +04:00
rcu_read_unlock ( ) ;
2005-04-17 02:20:36 +04:00
return pid ;
}
/* Return the uid field of the calling task. */
asmlinkage long sys_getuid(void)
{
	/* Only the calling task changes this field, so the unlocked read is SMP safe. */
	long uid = current->uid;

	return uid;
}
/* Return the euid field of the calling task. */
asmlinkage long sys_geteuid(void)
{
	/* Only the calling task changes this field, so the unlocked read is SMP safe. */
	long euid = current->euid;

	return euid;
}
/* Return the gid field of the calling task. */
asmlinkage long sys_getgid(void)
{
	/* Only the calling task changes this field, so the unlocked read is SMP safe. */
	long gid = current->gid;

	return gid;
}
/* Return the egid field of the calling task. */
asmlinkage long sys_getegid(void)
{
	/* Only the calling task changes this field, so the unlocked read is SMP safe. */
	long egid = current->egid;

	return egid;
}
# endif
/*
* Timer callback installed by schedule_timeout(): @__data carries a
* struct task_struct pointer cast to unsigned long, and that task is woken.
*/
static void process_timeout ( unsigned long __data )
{
2006-07-03 11:25:41 +04:00
wake_up_process ( ( struct task_struct * ) __data ) ;
2005-04-17 02:20:36 +04:00
}
/**
* schedule_timeout - sleep until timeout
* @ timeout : timeout value in jiffies
*
* Make the current task sleep until @ timeout jiffies have
* elapsed . The routine will return immediately unless
* the current task state has been set ( see set_current_state ( ) ) .
*
* You can set the task state as follows -
*
* % TASK_UNINTERRUPTIBLE - at least @ timeout jiffies are guaranteed to
* pass before the routine returns . The routine will return 0
*
* % TASK_INTERRUPTIBLE - the routine may return early if a signal is
* delivered to the current task . In this case the remaining time
* in jiffies will be returned , or 0 if the timer expired in time
*
* The current task state is guaranteed to be TASK_RUNNING when this
* routine returns .
*
* Specifying a @ timeout value of % MAX_SCHEDULE_TIMEOUT will schedule
* the CPU away without a bound on the timeout . In this case the return
* value will be % MAX_SCHEDULE_TIMEOUT .
*
* In all cases the return value is guaranteed to be non - negative .
*/
fastcall signed long __sched schedule_timeout ( signed long timeout )
{
struct timer_list timer ;
unsigned long expire ;
switch ( timeout )
{
case MAX_SCHEDULE_TIMEOUT :
/*
* This special case exists purely for the convenience of the
* caller. MAX_SCHEDULE_TIMEOUT could have been taken from one
* of the negative values, but a valid offset (>= 0) is returned
* instead so that the caller can do anything it wants with the
* return value.
*/
schedule ( ) ;
goto out ;
default :
/*
* Another bit of paranoia. Note that the return value will be
* 0, since no piece of kernel code is supposed to check for a
* negative return value of schedule_timeout() (it should never
* happen anyway). The printk() and stack dump below tell you
* that something has gone wrong, and where.
*/
2006-12-22 12:10:14 +03:00
if ( timeout < 0 ) {
2005-04-17 02:20:36 +04:00
printk ( KERN_ERR " schedule_timeout: wrong timeout "
2006-12-22 12:10:14 +03:00
" value %lx \n " , timeout ) ;
dump_stack ( ) ;
2005-04-17 02:20:36 +04:00
current - > state = TASK_RUNNING ;
goto out ;
}
}
/* Absolute expiry time, in jiffies. */
expire = timeout + jiffies ;
2005-10-31 02:01:38 +03:00
/* On-stack timer whose callback wakes the current task. */
setup_timer ( & timer , process_timeout , ( unsigned long ) current ) ;
__mod_timer ( & timer , expire ) ;
2005-04-17 02:20:36 +04:00
schedule ( ) ;
/* The timer lives on this stack frame, so it is deleted before returning. */
del_singleshot_timer_sync ( & timer ) ;
/* Remaining time; negative when the expiry has already passed. */
timeout = expire - jiffies ;
out :
/* Never hand a negative value back to the caller. */
return timeout < 0 ? 0 : timeout ;
}
EXPORT_SYMBOL ( schedule_timeout ) ;
2005-09-13 12:25:15 +04:00
/*
* We can use __set_current_state ( ) here because schedule_timeout ( ) calls
* schedule ( ) unconditionally .
*
* Sets the current task state to TASK_INTERRUPTIBLE and sleeps via
* schedule_timeout(@timeout), returning its result.
*/
2005-09-10 11:27:21 +04:00
signed long __sched schedule_timeout_interruptible ( signed long timeout )
{
2005-10-31 02:01:42 +03:00
__set_current_state ( TASK_INTERRUPTIBLE ) ;
return schedule_timeout ( timeout ) ;
2005-09-10 11:27:21 +04:00
}
EXPORT_SYMBOL ( schedule_timeout_interruptible ) ;
/*
* Sets the current task state to TASK_UNINTERRUPTIBLE and sleeps via
* schedule_timeout(@timeout), returning its result.
*/
signed long __sched schedule_timeout_uninterruptible ( signed long timeout )
{
2005-10-31 02:01:42 +03:00
__set_current_state ( TASK_UNINTERRUPTIBLE ) ;
return schedule_timeout ( timeout ) ;
2005-09-10 11:27:21 +04:00
}
EXPORT_SYMBOL ( schedule_timeout_uninterruptible ) ;
2005-04-17 02:20:36 +04:00
/*
* Thread ID - the internal kernel "pid"
*
* Returns task_pid_vnr(current).
*/
asmlinkage long sys_gettid ( void )
{
2007-10-19 10:40:14 +04:00
return task_pid_vnr ( current ) ;
2005-04-17 02:20:36 +04:00
}
2006-09-29 12:59:46 +04:00
/**
2007-02-10 12:46:00 +03:00
* do_sysinfo - fill in sysinfo struct
2006-09-29 12:59:46 +04:00
* @ info : pointer to buffer to fill
2007-07-19 12:49:16 +04:00
*
* Zeroes @info, then fills in uptime, the three load averages and the
* thread count under the xtime_lock seqlock, followed by the memory and
* swap figures. If the memory totals can be expressed in bytes without
* overflowing an unsigned long, all memory fields are converted to bytes
* and mem_unit is set to 1. Always returns 0.
*/
2007-02-10 12:46:00 +03:00
int do_sysinfo ( struct sysinfo * info )
2005-04-17 02:20:36 +04:00
{
unsigned long mem_total , sav_total ;
unsigned int mem_unit , bitcount ;
unsigned long seq ;
2007-02-10 12:46:00 +03:00
memset ( info , 0 , sizeof ( struct sysinfo ) ) ;
2005-04-17 02:20:36 +04:00
/* Seqlock read side: retried until a consistent snapshot was read. */
do {
struct timespec tp ;
seq = read_seqbegin ( & xtime_lock ) ;
/*
* This is annoying . The below is the same thing
* posix_get_clock_monotonic ( ) does , but it wants to
* take the lock which we want to cover the loads stuff
* too .
*/
getnstimeofday ( & tp ) ;
tp . tv_sec + = wall_to_monotonic . tv_sec ;
tp . tv_nsec + = wall_to_monotonic . tv_nsec ;
2007-07-16 10:39:42 +04:00
monotonic_to_bootbased ( & tp ) ;
2005-04-17 02:20:36 +04:00
/* Carry a nanosecond overflow into the seconds field. */
if ( tp . tv_nsec - NSEC_PER_SEC > = 0 ) {
tp . tv_nsec = tp . tv_nsec - NSEC_PER_SEC ;
tp . tv_sec + + ;
}
2007-02-10 12:46:00 +03:00
/* Uptime in whole seconds, rounded up if any nanoseconds remain. */
info - > uptime = tp . tv_sec + ( tp . tv_nsec ? 1 : 0 ) ;
2005-04-17 02:20:36 +04:00
2007-02-10 12:46:00 +03:00
/* Rescale from FSHIFT to SI_LOAD_SHIFT fractional bits. */
info - > loads [ 0 ] = avenrun [ 0 ] < < ( SI_LOAD_SHIFT - FSHIFT ) ;
info - > loads [ 1 ] = avenrun [ 1 ] < < ( SI_LOAD_SHIFT - FSHIFT ) ;
info - > loads [ 2 ] = avenrun [ 2 ] < < ( SI_LOAD_SHIFT - FSHIFT ) ;
2005-04-17 02:20:36 +04:00
2007-02-10 12:46:00 +03:00
info - > procs = nr_threads ;
2005-04-17 02:20:36 +04:00
} while ( read_seqretry ( & xtime_lock , seq ) ) ;
2007-02-10 12:46:00 +03:00
si_meminfo ( info ) ;
si_swapinfo ( info ) ;
2005-04-17 02:20:36 +04:00
/*
* If the sum of all the available memory ( i . e . ram + swap )
* is less than can be stored in a 32 bit unsigned long then
* we can be binary compatible with 2.2 . x kernels . If not ,
* well , in that case 2.2 . x was broken anyways . . .
*
* - Erik Andersen < andersee @ debian . org >
*/
2007-02-10 12:46:00 +03:00
mem_total = info - > totalram + info - > totalswap ;
/* The sum wrapped around: leave the values in mem_unit units. */
if ( mem_total < info - > totalram | | mem_total < info - > totalswap )
2005-04-17 02:20:36 +04:00
goto out ;
bitcount = 0 ;
2007-02-10 12:46:00 +03:00
mem_unit = info - > mem_unit ;
2005-04-17 02:20:36 +04:00
/*
* Count how many times mem_unit can be halved before reaching 1 while
* doubling mem_total in step; bail out as soon as the doubling overflows.
*/
while ( mem_unit > 1 ) {
bitcount + + ;
mem_unit > > = 1 ;
sav_total = mem_total ;
mem_total < < = 1 ;
if ( mem_total < sav_total )
goto out ;
}
/*
* If mem_total did not overflow , multiply all memory values by
2007-02-10 12:46:00 +03:00
* info - > mem_unit and set it to 1. This leaves things compatible
2005-04-17 02:20:36 +04:00
* with 2.2 . x , and also retains compatibility with earlier 2.4 . x
* kernels . . .
*/
2007-02-10 12:46:00 +03:00
info - > mem_unit = 1 ;
info - > totalram < < = bitcount ;
info - > freeram < < = bitcount ;
info - > sharedram < < = bitcount ;
info - > bufferram < < = bitcount ;
info - > totalswap < < = bitcount ;
info - > freeswap < < = bitcount ;
info - > totalhigh < < = bitcount ;
info - > freehigh < < = bitcount ;
out :
return 0 ;
}
/*
* sysinfo system call: fill a kernel-side struct sysinfo via do_sysinfo()
* and copy it to the user buffer @info. Returns 0 on success or -EFAULT if
* the copy to user space fails.
*/
asmlinkage long sys_sysinfo ( struct sysinfo __user * info )
{
struct sysinfo val ;
do_sysinfo ( & val ) ;
2005-04-17 02:20:36 +04:00
if ( copy_to_user ( info , & val , sizeof ( struct sysinfo ) ) )
return - EFAULT ;
return 0 ;
}
2006-07-03 11:25:10 +04:00
/*
* lockdep : we want to track each per - CPU base as a separate lock - class ,
* but timer - bases are kmalloc ( ) - ed , so we need to attach separate
* keys to them :
*/
static struct lock_class_key base_lock_keys [ NR_CPUS ] ;
2007-12-18 20:05:58 +03:00
/*
 * init_timers_cpu - set up the timer vector base for one CPU.
 * @cpu: index of the CPU whose base is being prepared.
 *
 * The first time a given @cpu is seen (tvec_base_done[cpu] is still 0)
 * a base is chosen:
 *   - on the very first call overall (boot_done still 0) the statically
 *     defined boot_tvec_bases is used and boot_done is set;
 *   - on later calls a zeroed base is allocated with kmalloc_node() on
 *     the node of @cpu and stored in per_cpu(tvec_bases, cpu).
 * On any further call for the same @cpu the base already stored in
 * per_cpu(tvec_bases, cpu) is reused.
 *
 * In every case the base lock is then (re)initialised and given the
 * lockdep key base_lock_keys + cpu, all list heads of tv1..tv5 are
 * reinitialised and timer_jiffies is set to the current jiffies.
 *
 * NOTE(review): the boot path does not assign per_cpu(tvec_bases, cpu)
 * here; presumably it is initialised statically elsewhere - confirm.
 *
 * Returns 0 on success, or -ENOMEM if the allocation fails or if
 * tbase_get_deferrable() is non-zero for the freshly allocated base
 * (the alignment check below); in the latter case the base is freed.
 */
static int __cpuinit init_timers_cpu ( int cpu )
2005-04-17 02:20:36 +04:00
{
int j ;
tvec_base_t * base ;
2007-12-18 20:05:58 +03:00
static char __cpuinitdata tvec_base_done [ NR_CPUS ] ;
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync have checked all cpus and before it will recheck
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes it's base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
2006-04-11 09:53:58 +04:00
if ( ! tvec_base_done [ cpu ] ) {
2006-03-24 14:15:54 +03:00
static char boot_done ;
if ( boot_done ) {
2006-04-11 09:53:58 +04:00
/*
* The APs use this path later in boot
*/
2007-07-17 15:03:29 +04:00
base = kmalloc_node ( sizeof ( * base ) ,
GFP_KERNEL | __GFP_ZERO ,
2006-03-24 14:15:54 +03:00
cpu_to_node ( cpu ) ) ;
if ( ! base )
return - ENOMEM ;
2007-05-08 11:27:44 +04:00
/* Make sure that tvec_base is 2 byte aligned */
if ( tbase_get_deferrable ( base ) ) {
WARN_ON ( 1 ) ;
kfree ( base ) ;
return - ENOMEM ;
}
2006-04-11 09:53:58 +04:00
per_cpu ( tvec_bases , cpu ) = base ;
2006-03-24 14:15:54 +03:00
} else {
2006-04-11 09:53:58 +04:00
/*
* This is for the boot CPU - we use compile - time
* static initialisation because per - cpu memory isn ' t
* ready yet and because the memory allocators are not
* initialised either .
*/
2006-03-24 14:15:54 +03:00
boot_done = 1 ;
2006-04-11 09:53:58 +04:00
base = & boot_tvec_bases ;
2006-03-24 14:15:54 +03:00
}
2006-04-11 09:53:58 +04:00
tvec_base_done [ cpu ] = 1 ;
} else {
base = per_cpu ( tvec_bases , cpu ) ;
2006-03-24 14:15:54 +03:00
}
2006-04-11 09:53:58 +04:00
2006-03-31 14:30:30 +04:00
spin_lock_init ( & base - > lock ) ;
2006-07-03 11:25:10 +04:00
lockdep_set_class ( & base - > lock , base_lock_keys + cpu ) ;
2005-04-17 02:20:36 +04:00
for ( j = 0 ; j < TVN_SIZE ; j + + ) {
INIT_LIST_HEAD ( base - > tv5 . vec + j ) ;
INIT_LIST_HEAD ( base - > tv4 . vec + j ) ;
INIT_LIST_HEAD ( base - > tv3 . vec + j ) ;
INIT_LIST_HEAD ( base - > tv2 . vec + j ) ;
}
for ( j = 0 ; j < TVR_SIZE ; j + + )
INIT_LIST_HEAD ( base - > tv1 . vec + j ) ;
base - > timer_jiffies = jiffies ;
2006-03-24 14:15:54 +03:00
return 0 ;
2005-04-17 02:20:36 +04:00
}
# ifdef CONFIG_HOTPLUG_CPU
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync have checked all cpus and before it will recheck
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. Timers are not serialized with respect to themselves.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes its base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
/*
 * migrate_timer_list - move every timer queued on @head to @new_base.
 * @new_base: timer base that receives the timers.
 * @head:     list of timers to drain.
 *
 * Loops until @head is empty. Each pass takes the first entry of the
 * list, removes it with detach_timer(timer, 0), points it at @new_base
 * with timer_set_base() and queues it again through
 * internal_add_timer().
 *
 * No locking is done here; the caller visible in this file,
 * migrate_timers(), holds both base locks with interrupts disabled
 * around its calls.
 */
static void migrate_timer_list ( tvec_base_t * new_base , struct list_head * head )
2005-04-17 02:20:36 +04:00
{
struct timer_list * timer ;
while ( ! list_empty ( head ) ) {
Introduce a handy list_first_entry macro
There are many places in the kernel where the construction like
foo = list_entry(head->next, struct foo_struct, list);
are used.
The code might look more descriptive and neat if using the macro
list_first_entry(head, type, member) \
list_entry((head)->next, type, member)
Here is the macro itself and the examples of its usage in the generic code.
If it will turn out to be useful, I can prepare the set of patches to
inject in into arch-specific code, drivers, networking, etc.
Signed-off-by: Pavel Emelianov <xemul@openvz.org>
Signed-off-by: Kirill Korotaev <dev@openvz.org>
Cc: Randy Dunlap <randy.dunlap@oracle.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Zach Brown <zach.brown@oracle.com>
Cc: Davide Libenzi <davidel@xmailserver.org>
Cc: John McCutchan <ttb@tentacle.dhs.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: john stultz <johnstul@us.ibm.com>
Cc: Ram Pai <linuxram@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-05-08 11:30:19 +04:00
timer = list_first_entry ( head , struct timer_list , entry ) ;
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync have checked all cpus and before it will recheck
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes it's base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
detach_timer ( timer , 0 ) ;
2007-05-08 11:27:44 +04:00
timer_set_base ( timer , new_base ) ;
2005-04-17 02:20:36 +04:00
internal_add_timer ( new_base , timer ) ;
}
}
2008-01-22 04:18:25 +03:00
/*
 * migrate_timers - move all timers of CPU @cpu to the current CPU.
 * @cpu: CPU whose timer base is drained; BUG_ON() fires if it is still
 *       reported online by cpu_online().
 *
 * old_base is per_cpu(tvec_bases, cpu); new_base is the base of the
 * CPU running this code, obtained with get_cpu_var() and released with
 * put_cpu_var() at the end.
 *
 * With local interrupts disabled, both base locks are taken through
 * double_spin_lock(); the third argument (smp_processor_id() < cpu) is
 * passed identically to the matching double_spin_unlock().
 * NOTE(review): presumably that flag selects the acquisition order so
 * the two locks are always taken in a consistent order - confirm
 * against the double_spin_lock() definition.
 *
 * BUG_ON() fires if old_base still has a running_timer. Every list in
 * tv1 (TVR_SIZE entries) and tv2..tv5 (TVN_SIZE entries each) is then
 * handed to migrate_timer_list().
 */
static void __cpuinit migrate_timers ( int cpu )
2005-04-17 02:20:36 +04:00
{
tvec_base_t * old_base ;
tvec_base_t * new_base ;
int i ;
BUG_ON ( cpu_online ( cpu ) ) ;
2006-03-24 14:15:54 +03:00
old_base = per_cpu ( tvec_bases , cpu ) ;
new_base = get_cpu_var ( tvec_bases ) ;
2005-04-17 02:20:36 +04:00
local_irq_disable ( ) ;
2007-03-05 11:30:51 +03:00
double_spin_lock ( & new_base - > lock , & old_base - > lock ,
smp_processor_id ( ) < cpu ) ;
2006-03-31 14:30:30 +04:00
BUG_ON ( old_base - > running_timer ) ;
2005-04-17 02:20:36 +04:00
for ( i = 0 ; i < TVR_SIZE ; i + + )
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync have checked all cpus and before it will recheck
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes it's base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
migrate_timer_list ( new_base , old_base - > tv1 . vec + i ) ;
for ( i = 0 ; i < TVN_SIZE ; i + + ) {
migrate_timer_list ( new_base , old_base - > tv2 . vec + i ) ;
migrate_timer_list ( new_base , old_base - > tv3 . vec + i ) ;
migrate_timer_list ( new_base , old_base - > tv4 . vec + i ) ;
migrate_timer_list ( new_base , old_base - > tv5 . vec + i ) ;
}
2007-03-05 11:30:51 +03:00
double_spin_unlock ( & new_base - > lock , & old_base - > lock ,
smp_processor_id ( ) < cpu ) ;
2005-04-17 02:20:36 +04:00
local_irq_enable ( ) ;
put_cpu_var ( tvec_bases ) ;
}
# endif /* CONFIG_HOTPLUG_CPU */
2006-07-30 14:03:35 +04:00
/*
 * CPU notifier callback for the timer subsystem.
 *
 * @self:   notifier block this callback belongs to (not used here).
 * @action: CPU event being reported.
 * @hcpu:   CPU number carried as a pointer-sized value.
 *
 * CPU_UP_PREPARE and CPU_UP_PREPARE_FROZEN initialise the timer base of
 * the CPU through init_timers_cpu(); a negative result is reported as
 * NOTIFY_BAD. With CONFIG_HOTPLUG_CPU, CPU_DEAD and CPU_DEAD_FROZEN
 * hand the CPU to migrate_timers(). Every other case returns NOTIFY_OK.
 */
static int __cpuinit timer_cpu_notify(struct notifier_block *self,
				unsigned long action, void *hcpu)
{
	long cpu = (long)hcpu;

	if (action == CPU_UP_PREPARE || action == CPU_UP_PREPARE_FROZEN) {
		if (init_timers_cpu(cpu) < 0)
			return NOTIFY_BAD;
		return NOTIFY_OK;
	}

#ifdef CONFIG_HOTPLUG_CPU
	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
		migrate_timers(cpu);
#endif

	return NOTIFY_OK;
}
2006-07-30 14:03:35 +04:00
/*
 * CPU notifier block of the timer code: its callback is
 * timer_cpu_notify(). init_timers() registers it with
 * register_cpu_notifier().
 */
static struct notifier_block __cpuinitdata timers_nb = {
2005-04-17 02:20:36 +04:00
. notifier_call = timer_cpu_notify ,
} ;
/*
 * init_timers - boot-time initialisation of the timer code.
 *
 * Steps, in order:
 *   1. call timer_cpu_notify() directly with CPU_UP_PREPARE for the
 *      CPU returned by smp_processor_id(), which sets up the timer
 *      base of that CPU;
 *   2. call init_timer_stats();
 *   3. BUG_ON() if step 1 returned NOTIFY_BAD;
 *   4. register timers_nb with register_cpu_notifier();
 *   5. install run_timer_softirq as the TIMER_SOFTIRQ handler through
 *      open_softirq().
 */
void __init init_timers ( void )
{
2006-09-29 13:00:22 +04:00
int err = timer_cpu_notify ( & timers_nb , ( unsigned long ) CPU_UP_PREPARE ,
2005-04-17 02:20:36 +04:00
( void * ) ( long ) smp_processor_id ( ) ) ;
2006-09-29 13:00:22 +04:00
[PATCH] Add debugging feature /proc/timer_stat
Add /proc/timer_stats support: debugging feature to profile timer expiration.
Both the starting site, process/PID and the expiration function is captured.
This allows the quick identification of timer event sources in a system.
Sample output:
# echo 1 > /proc/timer_stats
# cat /proc/timer_stats
Timer Stats Version: v0.1
Sample period: 4.010 s
24, 0 swapper hrtimer_stop_sched_tick (hrtimer_sched_tick)
11, 0 swapper sk_reset_timer (tcp_delack_timer)
6, 0 swapper hrtimer_stop_sched_tick (hrtimer_sched_tick)
2, 1 swapper queue_delayed_work_on (delayed_work_timer_fn)
17, 0 swapper hrtimer_restart_sched_tick (hrtimer_sched_tick)
2, 1 swapper queue_delayed_work_on (delayed_work_timer_fn)
4, 2050 pcscd do_nanosleep (hrtimer_wakeup)
5, 4179 sshd sk_reset_timer (tcp_write_timer)
4, 2248 yum-updatesd schedule_timeout (process_timeout)
18, 0 swapper hrtimer_restart_sched_tick (hrtimer_sched_tick)
3, 0 swapper sk_reset_timer (tcp_delack_timer)
1, 1 swapper neigh_table_init_no_netlink (neigh_periodic_timer)
2, 1 swapper e1000_up (e1000_watchdog)
1, 1 init schedule_timeout (process_timeout)
100 total events, 25.24 events/sec
[ cleanups and hrtimers support from Thomas Gleixner <tglx@linutronix.de> ]
[bunk@stusta.de: nr_entries can become static]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: john stultz <johnstul@us.ibm.com>
Cc: Roman Zippel <zippel@linux-m68k.org>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-02-16 12:28:13 +03:00
init_timer_stats ( ) ;
2006-09-29 13:00:22 +04:00
BUG_ON ( err = = NOTIFY_BAD ) ;
2005-04-17 02:20:36 +04:00
register_cpu_notifier ( & timers_nb ) ;
open_softirq ( TIMER_SOFTIRQ , run_timer_softirq , NULL ) ;
}
/**
 * msleep - sleep safely even with waitqueue interruptions
 * @msecs: Time in milliseconds to sleep for
 *
 * The request is converted to jiffies with msecs_to_jiffies() and one
 * extra jiffy is added. The task then keeps calling
 * schedule_timeout_uninterruptible() with whatever timeout that call
 * reports as remaining, until the remainder reaches zero.
 */
void msleep(unsigned int msecs)
{
	unsigned long remaining;

	for (remaining = msecs_to_jiffies(msecs) + 1; remaining != 0; )
		remaining = schedule_timeout_uninterruptible(remaining);
}
EXPORT_SYMBOL(msleep);
/**
2005-06-26 01:58:43 +04:00
* msleep_interruptible - sleep waiting for signals
2005-04-17 02:20:36 +04:00
* @ msecs : Time in milliseconds to sleep for
*/
unsigned long msleep_interruptible ( unsigned int msecs )
{
unsigned long timeout = msecs_to_jiffies ( msecs ) + 1 ;
2005-09-10 11:27:24 +04:00
while ( timeout & & ! signal_pending ( current ) )
timeout = schedule_timeout_interruptible ( timeout ) ;
2005-04-17 02:20:36 +04:00
return jiffies_to_msecs ( timeout ) ;
}
EXPORT_SYMBOL ( msleep_interruptible ) ;