2005-04-17 02:20:36 +04:00
/*
* linux / kernel / timer . c
*
* Kernel internal timers , kernel timekeeping , basic process system calls
*
* Copyright ( C ) 1991 , 1992 Linus Torvalds
*
* 1997 - 01 - 28 Modified by Finn Arne Gangstad to make timers scale better .
*
* 1997 - 09 - 10 Updated NTP code according to technical memorandum Jan ' 96
* " A Kernel Model for Precision Timekeeping " by Dave Mills
* 1998 - 12 - 24 Fixed a xtime SMP race ( we need the xtime_lock rw spinlock to
* serialize accesses to xtime / lost_ticks ) .
* Copyright ( C ) 1998 Andrea Arcangeli
* 1999 - 03 - 10 Improved NTP compatibility by Ulrich Windl
* 2002 - 05 - 31 Move sys_sysinfo here and make its locking sane , Robert Love
* 2000 - 10 - 05 Implemented scalable SMP per - CPU timer handling .
* Copyright ( C ) 2000 , 2001 , 2002 Ingo Molnar
* Designed by David S . Miller , Alexey Kuznetsov and Ingo Molnar
*/
# include <linux/kernel_stat.h>
# include <linux/module.h>
# include <linux/interrupt.h>
# include <linux/percpu.h>
# include <linux/init.h>
# include <linux/mm.h>
# include <linux/swap.h>
# include <linux/notifier.h>
# include <linux/thread_info.h>
# include <linux/time.h>
# include <linux/jiffies.h>
# include <linux/posix-timers.h>
# include <linux/cpu.h>
# include <linux/syscalls.h>
2006-01-08 12:02:17 +03:00
# include <linux/delay.h>
2005-04-17 02:20:36 +04:00
# include <asm/uaccess.h>
# include <asm/unistd.h>
# include <asm/div64.h>
# include <asm/timex.h>
# include <asm/io.h>
2005-10-31 02:03:00 +03:00
u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES ;
EXPORT_SYMBOL ( jiffies_64 ) ;
2005-04-17 02:20:36 +04:00
/*
* per - CPU timer vector definitions :
*/
# define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6)
# define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8)
# define TVN_SIZE (1 << TVN_BITS)
# define TVR_SIZE (1 << TVR_BITS)
# define TVN_MASK (TVN_SIZE - 1)
# define TVR_MASK (TVR_SIZE - 1)
typedef struct tvec_s {
struct list_head vec [ TVN_SIZE ] ;
} tvec_t ;
typedef struct tvec_root_s {
struct list_head vec [ TVR_SIZE ] ;
} tvec_root_t ;
struct tvec_t_base_s {
2006-03-31 14:30:30 +04:00
spinlock_t lock ;
struct timer_list * running_timer ;
2005-04-17 02:20:36 +04:00
unsigned long timer_jiffies ;
tvec_root_t tv1 ;
tvec_t tv2 ;
tvec_t tv3 ;
tvec_t tv4 ;
tvec_t tv5 ;
} ____cacheline_aligned_in_smp ;
typedef struct tvec_t_base_s tvec_base_t ;
2006-04-11 09:53:58 +04:00
2006-03-31 14:30:30 +04:00
tvec_base_t boot_tvec_bases ;
EXPORT_SYMBOL ( boot_tvec_bases ) ;
2006-07-30 14:04:14 +04:00
static DEFINE_PER_CPU ( tvec_base_t * , tvec_bases ) = & boot_tvec_bases ;
2005-04-17 02:20:36 +04:00
2006-12-10 13:21:24 +03:00
/**
* __round_jiffies - function to round jiffies to a full second
* @ j : the time in ( absolute ) jiffies that should be rounded
* @ cpu : the processor number on which the timeout will happen
*
2007-02-10 12:45:59 +03:00
* __round_jiffies ( ) rounds an absolute time in the future ( in jiffies )
2006-12-10 13:21:24 +03:00
* up or down to ( approximately ) full seconds . This is useful for timers
* for which the exact time they fire does not matter too much , as long as
* they fire approximately every X seconds .
*
* By rounding these timers to whole seconds , all such timers will fire
* at the same time , rather than at various times spread out . The goal
* of this is to have the CPU wake up less , which saves power .
*
* The exact rounding is skewed for each processor to avoid all
* processors firing at the exact same time , which could lead
* to lock contention or spurious cache line bouncing .
*
2007-02-10 12:45:59 +03:00
* The return value is the rounded version of the @ j parameter .
2006-12-10 13:21:24 +03:00
*/
unsigned long __round_jiffies ( unsigned long j , int cpu )
{
int rem ;
unsigned long original = j ;
/*
* We don ' t want all cpus firing their timers at once hitting the
* same lock or cachelines , so we skew each extra cpu with an extra
* 3 jiffies . This 3 jiffies came originally from the mm / code which
* already did this .
* The skew is done by adding 3 * cpunr , then round , then subtract this
* extra offset again .
*/
j + = cpu * 3 ;
rem = j % HZ ;
/*
* If the target jiffie is just after a whole second ( which can happen
* due to delays of the timer irq , long irq off times etc etc ) then
* we should round down to the whole second , not up . Use 1 / 4 th second
* as cutoff for this rounding as an extreme upper bound for this .
*/
if ( rem < HZ / 4 ) /* round down */
j = j - rem ;
else /* round up */
j = j - rem + HZ ;
/* now that we have rounded, subtract the extra skew again */
j - = cpu * 3 ;
if ( j < = jiffies ) /* rounding ate our timeout entirely; */
return original ;
return j ;
}
EXPORT_SYMBOL_GPL ( __round_jiffies ) ;
/**
* __round_jiffies_relative - function to round jiffies to a full second
* @ j : the time in ( relative ) jiffies that should be rounded
* @ cpu : the processor number on which the timeout will happen
*
2007-02-10 12:45:59 +03:00
* __round_jiffies_relative ( ) rounds a time delta in the future ( in jiffies )
2006-12-10 13:21:24 +03:00
* up or down to ( approximately ) full seconds . This is useful for timers
* for which the exact time they fire does not matter too much , as long as
* they fire approximately every X seconds .
*
* By rounding these timers to whole seconds , all such timers will fire
* at the same time , rather than at various times spread out . The goal
* of this is to have the CPU wake up less , which saves power .
*
* The exact rounding is skewed for each processor to avoid all
* processors firing at the exact same time , which could lead
* to lock contention or spurious cache line bouncing .
*
2007-02-10 12:45:59 +03:00
* The return value is the rounded version of the @ j parameter .
2006-12-10 13:21:24 +03:00
*/
unsigned long __round_jiffies_relative ( unsigned long j , int cpu )
{
/*
* In theory the following code can skip a jiffy in case jiffies
* increments right between the addition and the later subtraction .
* However since the entire point of this function is to use approximate
* timeouts , it ' s entirely ok to not handle that .
*/
return __round_jiffies ( j + jiffies , cpu ) - jiffies ;
}
EXPORT_SYMBOL_GPL ( __round_jiffies_relative ) ;
/**
* round_jiffies - function to round jiffies to a full second
* @ j : the time in ( absolute ) jiffies that should be rounded
*
2007-02-10 12:45:59 +03:00
* round_jiffies ( ) rounds an absolute time in the future ( in jiffies )
2006-12-10 13:21:24 +03:00
* up or down to ( approximately ) full seconds . This is useful for timers
* for which the exact time they fire does not matter too much , as long as
* they fire approximately every X seconds .
*
* By rounding these timers to whole seconds , all such timers will fire
* at the same time , rather than at various times spread out . The goal
* of this is to have the CPU wake up less , which saves power .
*
2007-02-10 12:45:59 +03:00
* The return value is the rounded version of the @ j parameter .
2006-12-10 13:21:24 +03:00
*/
unsigned long round_jiffies ( unsigned long j )
{
return __round_jiffies ( j , raw_smp_processor_id ( ) ) ;
}
EXPORT_SYMBOL_GPL ( round_jiffies ) ;
/**
* round_jiffies_relative - function to round jiffies to a full second
* @ j : the time in ( relative ) jiffies that should be rounded
*
2007-02-10 12:45:59 +03:00
* round_jiffies_relative ( ) rounds a time delta in the future ( in jiffies )
2006-12-10 13:21:24 +03:00
* up or down to ( approximately ) full seconds . This is useful for timers
* for which the exact time they fire does not matter too much , as long as
* they fire approximately every X seconds .
*
* By rounding these timers to whole seconds , all such timers will fire
* at the same time , rather than at various times spread out . The goal
* of this is to have the CPU wake up less , which saves power .
*
2007-02-10 12:45:59 +03:00
* The return value is the rounded version of the @ j parameter .
2006-12-10 13:21:24 +03:00
*/
unsigned long round_jiffies_relative ( unsigned long j )
{
return __round_jiffies_relative ( j , raw_smp_processor_id ( ) ) ;
}
EXPORT_SYMBOL_GPL ( round_jiffies_relative ) ;
2005-04-17 02:20:36 +04:00
static inline void set_running_timer ( tvec_base_t * base ,
struct timer_list * timer )
{
# ifdef CONFIG_SMP
2006-03-31 14:30:30 +04:00
base - > running_timer = timer ;
2005-04-17 02:20:36 +04:00
# endif
}
static void internal_add_timer ( tvec_base_t * base , struct timer_list * timer )
{
unsigned long expires = timer - > expires ;
unsigned long idx = expires - base - > timer_jiffies ;
struct list_head * vec ;
if ( idx < TVR_SIZE ) {
int i = expires & TVR_MASK ;
vec = base - > tv1 . vec + i ;
} else if ( idx < 1 < < ( TVR_BITS + TVN_BITS ) ) {
int i = ( expires > > TVR_BITS ) & TVN_MASK ;
vec = base - > tv2 . vec + i ;
} else if ( idx < 1 < < ( TVR_BITS + 2 * TVN_BITS ) ) {
int i = ( expires > > ( TVR_BITS + TVN_BITS ) ) & TVN_MASK ;
vec = base - > tv3 . vec + i ;
} else if ( idx < 1 < < ( TVR_BITS + 3 * TVN_BITS ) ) {
int i = ( expires > > ( TVR_BITS + 2 * TVN_BITS ) ) & TVN_MASK ;
vec = base - > tv4 . vec + i ;
} else if ( ( signed long ) idx < 0 ) {
/*
* Can happen if you add a timer with expires = = jiffies ,
* or you set a timer to go off in the past
*/
vec = base - > tv1 . vec + ( base - > timer_jiffies & TVR_MASK ) ;
} else {
int i ;
/* If the timeout is larger than 0xffffffff on 64-bit
* architectures then we use the maximum timeout :
*/
if ( idx > 0xffffffffUL ) {
idx = 0xffffffffUL ;
expires = idx + base - > timer_jiffies ;
}
i = ( expires > > ( TVR_BITS + 3 * TVN_BITS ) ) & TVN_MASK ;
vec = base - > tv5 . vec + i ;
}
/*
* Timers are FIFO :
*/
list_add_tail ( & timer - > entry , vec ) ;
}
2006-09-29 12:59:46 +04:00
/**
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync have checked all cpus and before it will recheck
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes it's base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
* init_timer - initialize a timer .
* @ timer : the timer to be initialized
*
* init_timer ( ) must be done to a timer prior calling * any * of the
* other timer functions .
*/
void fastcall init_timer ( struct timer_list * timer )
{
timer - > entry . next = NULL ;
[PATCH] Define __raw_get_cpu_var and use it
There are several instances of per_cpu(foo, raw_smp_processor_id()), which
is semantically equivalent to __get_cpu_var(foo) but without the warning
that smp_processor_id() can give if CONFIG_DEBUG_PREEMPT is enabled. For
those architectures with optimized per-cpu implementations, namely ia64,
powerpc, s390, sparc64 and x86_64, per_cpu() turns into more and slower
code than __get_cpu_var(), so it would be preferable to use __get_cpu_var
on those platforms.
This defines a __raw_get_cpu_var(x) macro which turns into per_cpu(x,
raw_smp_processor_id()) on architectures that use the generic per-cpu
implementation, and turns into __get_cpu_var(x) on the architectures that
have an optimized per-cpu implementation.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Acked-by: David S. Miller <davem@davemloft.net>
Acked-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-25 16:47:14 +04:00
timer - > base = __raw_get_cpu_var ( tvec_bases ) ;
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync have checked all cpus and before it will recheck
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes it's base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
}
EXPORT_SYMBOL ( init_timer ) ;
static inline void detach_timer ( struct timer_list * timer ,
int clear_pending )
{
struct list_head * entry = & timer - > entry ;
__list_del ( entry - > prev , entry - > next ) ;
if ( clear_pending )
entry - > next = NULL ;
entry - > prev = LIST_POISON2 ;
}
/*
2006-03-31 14:30:30 +04:00
* We are using hashed locking : holding per_cpu ( tvec_bases ) . lock
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync have checked all cpus and before it will recheck
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes it's base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
* means that all timers which are tied to this base via timer - > base are
* locked , and the base itself is locked too .
*
* So __run_timers / migrate_timers can safely modify all timers which could
* be found on - > tvX lists .
*
* When the timer ' s base is locked , and the timer removed from list , it is
* possible to set timer - > base = NULL and drop the lock : the timer remains
* locked .
*/
2006-03-31 14:30:30 +04:00
static tvec_base_t * lock_timer_base ( struct timer_list * timer ,
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync have checked all cpus and before it will recheck
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes it's base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
unsigned long * flags )
2006-09-29 12:59:36 +04:00
__acquires ( timer - > base - > lock )
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync have checked all cpus and before it will recheck
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes it's base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
{
2006-03-31 14:30:30 +04:00
tvec_base_t * base ;
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync have checked all cpus and before it will recheck
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes it's base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
for ( ; ; ) {
base = timer - > base ;
if ( likely ( base ! = NULL ) ) {
spin_lock_irqsave ( & base - > lock , * flags ) ;
if ( likely ( base = = timer - > base ) )
return base ;
/* The timer has migrated to another CPU */
spin_unlock_irqrestore ( & base - > lock , * flags ) ;
}
cpu_relax ( ) ;
}
}
2005-04-17 02:20:36 +04:00
int __mod_timer ( struct timer_list * timer , unsigned long expires )
{
2006-03-31 14:30:30 +04:00
tvec_base_t * base , * new_base ;
2005-04-17 02:20:36 +04:00
unsigned long flags ;
int ret = 0 ;
BUG_ON ( ! timer - > function ) ;
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync have checked all cpus and before it will recheck
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes it's base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
base = lock_timer_base ( timer , & flags ) ;
if ( timer_pending ( timer ) ) {
detach_timer ( timer , 0 ) ;
ret = 1 ;
}
2006-03-24 14:15:54 +03:00
new_base = __get_cpu_var ( tvec_bases ) ;
2005-04-17 02:20:36 +04:00
2006-03-31 14:30:30 +04:00
if ( base ! = new_base ) {
2005-04-17 02:20:36 +04:00
/*
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync have checked all cpus and before it will recheck
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes it's base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
* We are trying to schedule the timer on the local CPU .
* However we can ' t change timer ' s base while it is running ,
* otherwise del_timer_sync ( ) can ' t detect that the timer ' s
* handler yet has not finished . This also guarantees that
* the timer is serialized wrt itself .
2005-04-17 02:20:36 +04:00
*/
2006-03-31 14:30:31 +04:00
if ( likely ( base - > running_timer ! = timer ) ) {
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync have checked all cpus and before it will recheck
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes it's base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
/* See the comment in lock_timer_base() */
timer - > base = NULL ;
spin_unlock ( & base - > lock ) ;
2006-03-31 14:30:31 +04:00
base = new_base ;
spin_lock ( & base - > lock ) ;
timer - > base = base ;
2005-04-17 02:20:36 +04:00
}
}
timer - > expires = expires ;
2006-03-31 14:30:31 +04:00
internal_add_timer ( base , timer ) ;
spin_unlock_irqrestore ( & base - > lock , flags ) ;
2005-04-17 02:20:36 +04:00
return ret ;
}
EXPORT_SYMBOL ( __mod_timer ) ;
2006-09-29 12:59:46 +04:00
/**
2005-04-17 02:20:36 +04:00
* add_timer_on - start a timer on a particular CPU
* @ timer : the timer to be added
* @ cpu : the CPU to start it on
*
* This is not very scalable on SMP . Double adds are not possible .
*/
void add_timer_on ( struct timer_list * timer , int cpu )
{
2006-03-24 14:15:54 +03:00
tvec_base_t * base = per_cpu ( tvec_bases , cpu ) ;
2005-04-17 02:20:36 +04:00
unsigned long flags ;
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync have checked all cpus and before it will recheck
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes it's base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
2005-04-17 02:20:36 +04:00
BUG_ON ( timer_pending ( timer ) | | ! timer - > function ) ;
2006-03-31 14:30:30 +04:00
spin_lock_irqsave ( & base - > lock , flags ) ;
timer - > base = base ;
2005-04-17 02:20:36 +04:00
internal_add_timer ( base , timer ) ;
2006-03-31 14:30:30 +04:00
spin_unlock_irqrestore ( & base - > lock , flags ) ;
2005-04-17 02:20:36 +04:00
}
2006-09-29 12:59:46 +04:00
/**
2005-04-17 02:20:36 +04:00
* mod_timer - modify a timer ' s timeout
* @ timer : the timer to be modified
2006-09-29 12:59:46 +04:00
* @ expires : new timeout in jiffies
2005-04-17 02:20:36 +04:00
*
2007-02-10 12:45:59 +03:00
* mod_timer ( ) is a more efficient way to update the expire field of an
2005-04-17 02:20:36 +04:00
* active timer ( if the timer is inactive it will be activated )
*
* mod_timer ( timer , expires ) is equivalent to :
*
* del_timer ( timer ) ; timer - > expires = expires ; add_timer ( timer ) ;
*
* Note that if there are multiple unserialized concurrent users of the
* same timer , then mod_timer ( ) is the only safe way to modify the timeout ,
* since add_timer ( ) cannot modify an already running timer .
*
* The function returns whether it has modified a pending timer or not .
* ( ie . mod_timer ( ) of an inactive timer returns 0 , mod_timer ( ) of an
* active timer returns 1. )
*/
int mod_timer ( struct timer_list * timer , unsigned long expires )
{
BUG_ON ( ! timer - > function ) ;
/*
* This is a common optimization triggered by the
* networking code - if the timer is re - modified
* to be the same thing then just return :
*/
if ( timer - > expires = = expires & & timer_pending ( timer ) )
return 1 ;
return __mod_timer ( timer , expires ) ;
}
EXPORT_SYMBOL ( mod_timer ) ;
2006-09-29 12:59:46 +04:00
/**
2005-04-17 02:20:36 +04:00
* del_timer - deactive a timer .
* @ timer : the timer to be deactivated
*
* del_timer ( ) deactivates a timer - this works on both active and inactive
* timers .
*
* The function returns whether it has deactivated a pending timer or not .
* ( ie . del_timer ( ) of an inactive timer returns 0 , del_timer ( ) of an
* active timer returns 1. )
*/
int del_timer ( struct timer_list * timer )
{
2006-03-31 14:30:30 +04:00
tvec_base_t * base ;
2005-04-17 02:20:36 +04:00
unsigned long flags ;
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync have checked all cpus and before it will recheck
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes it's base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
int ret = 0 ;
2005-04-17 02:20:36 +04:00
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync have checked all cpus and before it will recheck
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes it's base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
if ( timer_pending ( timer ) ) {
base = lock_timer_base ( timer , & flags ) ;
if ( timer_pending ( timer ) ) {
detach_timer ( timer , 1 ) ;
ret = 1 ;
}
2005-04-17 02:20:36 +04:00
spin_unlock_irqrestore ( & base - > lock , flags ) ;
}
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync have checked all cpus and before it will recheck
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes it's base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
return ret ;
2005-04-17 02:20:36 +04:00
}
EXPORT_SYMBOL ( del_timer ) ;
# ifdef CONFIG_SMP
2006-09-29 12:59:46 +04:00
/**
* try_to_del_timer_sync - Try to deactivate a timer
* @ timer : timer do del
*
2005-06-23 11:08:59 +04:00
* This function tries to deactivate a timer . Upon successful ( ret > = 0 )
* exit the timer is not queued and the handler is not running on any CPU .
*
* It must not be called from interrupt contexts .
*/
int try_to_del_timer_sync ( struct timer_list * timer )
{
2006-03-31 14:30:30 +04:00
tvec_base_t * base ;
2005-06-23 11:08:59 +04:00
unsigned long flags ;
int ret = - 1 ;
base = lock_timer_base ( timer , & flags ) ;
if ( base - > running_timer = = timer )
goto out ;
ret = 0 ;
if ( timer_pending ( timer ) ) {
detach_timer ( timer , 1 ) ;
ret = 1 ;
}
out :
spin_unlock_irqrestore ( & base - > lock , flags ) ;
return ret ;
}
2006-09-29 12:59:46 +04:00
/**
2005-04-17 02:20:36 +04:00
* del_timer_sync - deactivate a timer and wait for the handler to finish .
* @ timer : the timer to be deactivated
*
* This function only differs from del_timer ( ) on SMP : besides deactivating
* the timer it also makes sure the handler has finished executing on other
* CPUs .
*
2007-02-10 12:45:59 +03:00
* Synchronization rules : Callers must prevent restarting of the timer ,
2005-04-17 02:20:36 +04:00
* otherwise this function is meaningless . It must not be called from
* interrupt contexts . The caller must not hold locks which would prevent
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync have checked all cpus and before it will recheck
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes it's base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
* completion of the timer ' s handler . The timer ' s handler must not call
* add_timer_on ( ) . Upon exit the timer is not queued and the handler is
* not running on any CPU .
2005-04-17 02:20:36 +04:00
*
* The function returns whether it has deactivated a pending timer or not .
*/
int del_timer_sync ( struct timer_list * timer )
{
2005-06-23 11:08:59 +04:00
for ( ; ; ) {
int ret = try_to_del_timer_sync ( timer ) ;
if ( ret > = 0 )
return ret ;
2006-07-14 11:24:06 +04:00
cpu_relax ( ) ;
2005-06-23 11:08:59 +04:00
}
2005-04-17 02:20:36 +04:00
}
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync have checked all cpus and before it will recheck
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes it's base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
EXPORT_SYMBOL ( del_timer_sync ) ;
2005-04-17 02:20:36 +04:00
# endif
static int cascade ( tvec_base_t * base , tvec_t * tv , int index )
{
/* cascade all the timers from tv up one level */
[PATCH] When CONFIG_BASE_SMALL=1, cascade() may enter an infinite loop
When CONFIG_BASE_SAMLL=1, cascade() in may enter the infinite loop.
Because of CONFIG_BASE_SMALL=1(TVR_BITS=6 and TVN_BITS=4), the list
base->tv5 may cascade into base->tv5. So, the kernel enters the infinite
loop in the function cascade().
I created a test module to verify this bug, and a patch to fix it.
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/timer.h>
#if 0
#include <linux/kdb.h>
#else
#define kdb_printf printk
#endif
#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6)
#define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8)
#define TVN_SIZE (1 << TVN_BITS)
#define TVR_SIZE (1 << TVR_BITS)
#define TVN_MASK (TVN_SIZE - 1)
#define TVR_MASK (TVR_SIZE - 1)
#define TV_SIZE(N) (N*TVN_BITS + TVR_BITS)
struct timer_list timer0;
struct timer_list dummy_timer1;
struct timer_list dummy_timer2;
void dummy_timer_fun(unsigned long data) {
}
unsigned long j=0;
void check_timer_base(unsigned long data)
{
kdb_printf("check_timer_base %08x\n",jiffies);
mod_timer(&timer0,(jiffies & (~0xFFF)) + 0x1FFF);
}
int init_module(void)
{
init_timer(&timer0);
timer0.data = (unsigned long)0;
timer0.function = check_timer_base;
mod_timer(&timer0,jiffies+1);
init_timer(&dummy_timer1);
dummy_timer1.data = (unsigned long)0;
dummy_timer1.function = dummy_timer_fun;
init_timer(&dummy_timer2);
dummy_timer2.data = (unsigned long)0;
dummy_timer2.function = dummy_timer_fun;
j=jiffies;
j&=(~((1<<TV_SIZE(3))-1));
j+=(1<<TV_SIZE(3));
j+=(1<<TV_SIZE(4));
kdb_printf("mod_timer %08x\n",j);
mod_timer(&dummy_timer1, j );
mod_timer(&dummy_timer2, j );
return 0;
}
void cleanup_module()
{
del_timer_sync(&timer0);
del_timer_sync(&dummy_timer1);
del_timer_sync(&dummy_timer2);
}
(Cleanups from Oleg)
[oleg@tv-sign.ru: use list_replace_init()]
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Matt Mackall <mpm@selenic.com>
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-23 13:05:56 +04:00
struct timer_list * timer , * tmp ;
struct list_head tv_list ;
list_replace_init ( tv - > vec + index , & tv_list ) ;
2005-04-17 02:20:36 +04:00
/*
[PATCH] When CONFIG_BASE_SMALL=1, cascade() may enter an infinite loop
When CONFIG_BASE_SAMLL=1, cascade() in may enter the infinite loop.
Because of CONFIG_BASE_SMALL=1(TVR_BITS=6 and TVN_BITS=4), the list
base->tv5 may cascade into base->tv5. So, the kernel enters the infinite
loop in the function cascade().
I created a test module to verify this bug, and a patch to fix it.
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/timer.h>
#if 0
#include <linux/kdb.h>
#else
#define kdb_printf printk
#endif
#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6)
#define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8)
#define TVN_SIZE (1 << TVN_BITS)
#define TVR_SIZE (1 << TVR_BITS)
#define TVN_MASK (TVN_SIZE - 1)
#define TVR_MASK (TVR_SIZE - 1)
#define TV_SIZE(N) (N*TVN_BITS + TVR_BITS)
struct timer_list timer0;
struct timer_list dummy_timer1;
struct timer_list dummy_timer2;
void dummy_timer_fun(unsigned long data) {
}
unsigned long j=0;
void check_timer_base(unsigned long data)
{
kdb_printf("check_timer_base %08x\n",jiffies);
mod_timer(&timer0,(jiffies & (~0xFFF)) + 0x1FFF);
}
int init_module(void)
{
init_timer(&timer0);
timer0.data = (unsigned long)0;
timer0.function = check_timer_base;
mod_timer(&timer0,jiffies+1);
init_timer(&dummy_timer1);
dummy_timer1.data = (unsigned long)0;
dummy_timer1.function = dummy_timer_fun;
init_timer(&dummy_timer2);
dummy_timer2.data = (unsigned long)0;
dummy_timer2.function = dummy_timer_fun;
j=jiffies;
j&=(~((1<<TV_SIZE(3))-1));
j+=(1<<TV_SIZE(3));
j+=(1<<TV_SIZE(4));
kdb_printf("mod_timer %08x\n",j);
mod_timer(&dummy_timer1, j );
mod_timer(&dummy_timer2, j );
return 0;
}
void cleanup_module()
{
del_timer_sync(&timer0);
del_timer_sync(&dummy_timer1);
del_timer_sync(&dummy_timer2);
}
(Cleanups from Oleg)
[oleg@tv-sign.ru: use list_replace_init()]
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Matt Mackall <mpm@selenic.com>
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-23 13:05:56 +04:00
* We are removing _all_ timers from the list , so we
* don ' t have to detach them individually .
2005-04-17 02:20:36 +04:00
*/
[PATCH] When CONFIG_BASE_SMALL=1, cascade() may enter an infinite loop
When CONFIG_BASE_SAMLL=1, cascade() in may enter the infinite loop.
Because of CONFIG_BASE_SMALL=1(TVR_BITS=6 and TVN_BITS=4), the list
base->tv5 may cascade into base->tv5. So, the kernel enters the infinite
loop in the function cascade().
I created a test module to verify this bug, and a patch to fix it.
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/timer.h>
#if 0
#include <linux/kdb.h>
#else
#define kdb_printf printk
#endif
#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6)
#define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8)
#define TVN_SIZE (1 << TVN_BITS)
#define TVR_SIZE (1 << TVR_BITS)
#define TVN_MASK (TVN_SIZE - 1)
#define TVR_MASK (TVR_SIZE - 1)
#define TV_SIZE(N) (N*TVN_BITS + TVR_BITS)
struct timer_list timer0;
struct timer_list dummy_timer1;
struct timer_list dummy_timer2;
void dummy_timer_fun(unsigned long data) {
}
unsigned long j=0;
void check_timer_base(unsigned long data)
{
kdb_printf("check_timer_base %08x\n",jiffies);
mod_timer(&timer0,(jiffies & (~0xFFF)) + 0x1FFF);
}
int init_module(void)
{
init_timer(&timer0);
timer0.data = (unsigned long)0;
timer0.function = check_timer_base;
mod_timer(&timer0,jiffies+1);
init_timer(&dummy_timer1);
dummy_timer1.data = (unsigned long)0;
dummy_timer1.function = dummy_timer_fun;
init_timer(&dummy_timer2);
dummy_timer2.data = (unsigned long)0;
dummy_timer2.function = dummy_timer_fun;
j=jiffies;
j&=(~((1<<TV_SIZE(3))-1));
j+=(1<<TV_SIZE(3));
j+=(1<<TV_SIZE(4));
kdb_printf("mod_timer %08x\n",j);
mod_timer(&dummy_timer1, j );
mod_timer(&dummy_timer2, j );
return 0;
}
void cleanup_module()
{
del_timer_sync(&timer0);
del_timer_sync(&dummy_timer1);
del_timer_sync(&dummy_timer2);
}
(Cleanups from Oleg)
[oleg@tv-sign.ru: use list_replace_init()]
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Matt Mackall <mpm@selenic.com>
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-23 13:05:56 +04:00
list_for_each_entry_safe ( timer , tmp , & tv_list , entry ) {
BUG_ON ( timer - > base ! = base ) ;
internal_add_timer ( base , timer ) ;
2005-04-17 02:20:36 +04:00
}
return index ;
}
2006-09-29 12:59:46 +04:00
# define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK)
/**
2005-04-17 02:20:36 +04:00
* __run_timers - run all expired timers ( if any ) on this CPU .
* @ base : the timer vector to be processed .
*
* This function cascades all vectors and executes all expired timer
* vectors .
*/
static inline void __run_timers ( tvec_base_t * base )
{
struct timer_list * timer ;
2006-03-31 14:30:30 +04:00
spin_lock_irq ( & base - > lock ) ;
2005-04-17 02:20:36 +04:00
while ( time_after_eq ( jiffies , base - > timer_jiffies ) ) {
2006-06-23 13:05:55 +04:00
struct list_head work_list ;
2005-04-17 02:20:36 +04:00
struct list_head * head = & work_list ;
int index = base - > timer_jiffies & TVR_MASK ;
2006-06-23 13:05:55 +04:00
2005-04-17 02:20:36 +04:00
/*
* Cascade timers :
*/
if ( ! index & &
( ! cascade ( base , & base - > tv2 , INDEX ( 0 ) ) ) & &
( ! cascade ( base , & base - > tv3 , INDEX ( 1 ) ) ) & &
! cascade ( base , & base - > tv4 , INDEX ( 2 ) ) )
cascade ( base , & base - > tv5 , INDEX ( 3 ) ) ;
2006-06-23 13:05:55 +04:00
+ + base - > timer_jiffies ;
list_replace_init ( base - > tv1 . vec + index , & work_list ) ;
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync have checked all cpus and before it will recheck
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes it's base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
while ( ! list_empty ( head ) ) {
2005-04-17 02:20:36 +04:00
void ( * fn ) ( unsigned long ) ;
unsigned long data ;
timer = list_entry ( head - > next , struct timer_list , entry ) ;
fn = timer - > function ;
data = timer - > data ;
set_running_timer ( base , timer ) ;
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync have checked all cpus and before it will recheck
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes it's base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
detach_timer ( timer , 1 ) ;
2006-03-31 14:30:30 +04:00
spin_unlock_irq ( & base - > lock ) ;
2005-04-17 02:20:36 +04:00
{
2005-06-23 11:09:09 +04:00
int preempt_count = preempt_count ( ) ;
2005-04-17 02:20:36 +04:00
fn ( data ) ;
if ( preempt_count ! = preempt_count ( ) ) {
2005-06-23 11:09:09 +04:00
printk ( KERN_WARNING " huh, entered %p "
" with preempt_count %08x, exited "
" with %08x? \n " ,
fn , preempt_count ,
preempt_count ( ) ) ;
2005-04-17 02:20:36 +04:00
BUG ( ) ;
}
}
2006-03-31 14:30:30 +04:00
spin_lock_irq ( & base - > lock ) ;
2005-04-17 02:20:36 +04:00
}
}
set_running_timer ( base , NULL ) ;
2006-03-31 14:30:30 +04:00
spin_unlock_irq ( & base - > lock ) ;
2005-04-17 02:20:36 +04:00
}
# ifdef CONFIG_NO_IDLE_HZ
/*
* Find out when the next timer event is due to happen . This
* is used on S / 390 to stop all activity when a cpus is idle .
* This functions needs to be called disabled .
*/
2007-02-16 12:27:46 +03:00
static unsigned long __next_timer_interrupt ( tvec_base_t * base )
2005-04-17 02:20:36 +04:00
{
2007-02-16 12:27:46 +03:00
unsigned long timer_jiffies = base - > timer_jiffies ;
unsigned long expires = timer_jiffies + ( LONG_MAX > > 1 ) ;
int index , slot , array , found = 0 ;
2005-04-17 02:20:36 +04:00
struct timer_list * nte ;
tvec_t * varray [ 4 ] ;
/* Look for timer events in tv1. */
2007-02-16 12:27:46 +03:00
index = slot = timer_jiffies & TVR_MASK ;
2005-04-17 02:20:36 +04:00
do {
2007-02-16 12:27:46 +03:00
list_for_each_entry ( nte , base - > tv1 . vec + slot , entry ) {
found = 1 ;
2005-04-17 02:20:36 +04:00
expires = nte - > expires ;
2007-02-16 12:27:46 +03:00
/* Look at the cascade bucket(s)? */
if ( ! index | | slot < index )
goto cascade ;
return expires ;
2005-04-17 02:20:36 +04:00
}
2007-02-16 12:27:46 +03:00
slot = ( slot + 1 ) & TVR_MASK ;
} while ( slot ! = index ) ;
cascade :
/* Calculate the next cascade event */
if ( index )
timer_jiffies + = TVR_SIZE - index ;
timer_jiffies > > = TVR_BITS ;
2005-04-17 02:20:36 +04:00
/* Check tv2-tv5. */
varray [ 0 ] = & base - > tv2 ;
varray [ 1 ] = & base - > tv3 ;
varray [ 2 ] = & base - > tv4 ;
varray [ 3 ] = & base - > tv5 ;
2007-02-16 12:27:46 +03:00
for ( array = 0 ; array < 4 ; array + + ) {
tvec_t * varp = varray [ array ] ;
index = slot = timer_jiffies & TVN_MASK ;
2005-04-17 02:20:36 +04:00
do {
2007-02-16 12:27:46 +03:00
list_for_each_entry ( nte , varp - > vec + slot , entry ) {
found = 1 ;
2005-04-17 02:20:36 +04:00
if ( time_before ( nte - > expires , expires ) )
expires = nte - > expires ;
2007-02-16 12:27:46 +03:00
}
/*
* Do we still search for the first timer or are
* we looking up the cascade buckets ?
*/
if ( found ) {
/* Look at the cascade bucket(s)? */
if ( ! index | | slot < index )
break ;
return expires ;
}
slot = ( slot + 1 ) & TVN_MASK ;
} while ( slot ! = index ) ;
if ( index )
timer_jiffies + = TVN_SIZE - index ;
timer_jiffies > > = TVN_BITS ;
2005-04-17 02:20:36 +04:00
}
2007-02-16 12:27:46 +03:00
return expires ;
}
2006-03-07 02:42:45 +03:00
2007-02-16 12:27:46 +03:00
/*
* Check , if the next hrtimer event is before the next timer wheel
* event :
*/
static unsigned long cmp_next_hrtimer_event ( unsigned long now ,
unsigned long expires )
{
ktime_t hr_delta = hrtimer_get_next_event ( ) ;
struct timespec tsdelta ;
if ( hr_delta . tv64 = = KTIME_MAX )
return expires ;
2006-05-21 02:00:24 +04:00
2007-02-16 12:27:46 +03:00
if ( hr_delta . tv64 < = TICK_NSEC )
return now ;
2006-03-07 02:42:45 +03:00
2007-02-16 12:27:46 +03:00
tsdelta = ktime_to_timespec ( hr_delta ) ;
now + = timespec_to_jiffies ( & tsdelta ) ;
if ( time_before ( now , expires ) )
return now ;
2005-04-17 02:20:36 +04:00
return expires ;
}
2007-02-16 12:27:46 +03:00
/**
* next_timer_interrupt - return the jiffy of the next pending timer
*/
unsigned long next_timer_interrupt ( void )
{
tvec_base_t * base = __get_cpu_var ( tvec_bases ) ;
unsigned long expires , now = jiffies ;
spin_lock ( & base - > lock ) ;
expires = __next_timer_interrupt ( base ) ;
spin_unlock ( & base - > lock ) ;
if ( time_before_eq ( expires , now ) )
return now ;
return cmp_next_hrtimer_event ( now , expires ) ;
}
2005-04-17 02:20:36 +04:00
# endif
/******************************************************************/
/*
* The current time
* wall_to_monotonic is what we need to add to xtime ( or xtime corrected
* for sub jiffie times ) to get to monotonic time . Monotonic is pegged
* at zero at system boot time , so wall_to_monotonic will be negative ,
* however , we will ALWAYS keep the tv_nsec part positive so we can use
* the usual normalization .
*/
struct timespec xtime __attribute__ ( ( aligned ( 16 ) ) ) ;
struct timespec wall_to_monotonic __attribute__ ( ( aligned ( 16 ) ) ) ;
EXPORT_SYMBOL ( xtime ) ;
2006-02-17 02:30:23 +03:00
2006-06-26 11:25:06 +04:00
/* XXX - all of this timekeeping code should be later moved to time.c */
# include <linux/clocksource.h>
static struct clocksource * clock ; /* pointer to current clocksource */
2006-06-26 11:25:08 +04:00
# ifdef CONFIG_GENERIC_TIME
/**
* __get_nsec_offset - Returns nanoseconds since last call to periodic_hook
*
* private function , must hold xtime_lock lock when being
* called . Returns the number of nanoseconds since the
* last call to update_wall_time ( ) ( adjusted by NTP scaling )
*/
static inline s64 __get_nsec_offset ( void )
{
cycle_t cycle_now , cycle_delta ;
s64 ns_offset ;
/* read clocksource: */
2006-06-26 11:25:14 +04:00
cycle_now = clocksource_read ( clock ) ;
2006-06-26 11:25:08 +04:00
/* calculate the delta since the last update_wall_time: */
2006-06-26 11:25:18 +04:00
cycle_delta = ( cycle_now - clock - > cycle_last ) & clock - > mask ;
2006-06-26 11:25:08 +04:00
/* convert to nanoseconds: */
ns_offset = cyc2ns ( clock , cycle_delta ) ;
return ns_offset ;
}
/**
* __get_realtime_clock_ts - Returns the time of day in a timespec
* @ ts : pointer to the timespec to be set
*
* Returns the time of day in a timespec . Used by
* do_gettimeofday ( ) and get_realtime_clock_ts ( ) .
*/
static inline void __get_realtime_clock_ts ( struct timespec * ts )
{
unsigned long seq ;
s64 nsecs ;
do {
seq = read_seqbegin ( & xtime_lock ) ;
* ts = xtime ;
nsecs = __get_nsec_offset ( ) ;
} while ( read_seqretry ( & xtime_lock , seq ) ) ;
timespec_add_ns ( ts , nsecs ) ;
}
/**
2006-06-26 11:25:14 +04:00
* getnstimeofday - Returns the time of day in a timespec
2006-06-26 11:25:08 +04:00
* @ ts : pointer to the timespec to be set
*
* Returns the time of day in a timespec .
*/
void getnstimeofday ( struct timespec * ts )
{
__get_realtime_clock_ts ( ts ) ;
}
EXPORT_SYMBOL ( getnstimeofday ) ;
/**
* do_gettimeofday - Returns the time of day in a timeval
* @ tv : pointer to the timeval to be set
*
* NOTE : Users should be converted to using get_realtime_clock_ts ( )
*/
void do_gettimeofday ( struct timeval * tv )
{
struct timespec now ;
__get_realtime_clock_ts ( & now ) ;
tv - > tv_sec = now . tv_sec ;
tv - > tv_usec = now . tv_nsec / 1000 ;
}
EXPORT_SYMBOL ( do_gettimeofday ) ;
/**
* do_settimeofday - Sets the time of day
* @ tv : pointer to the timespec variable containing the new time
*
* Sets the time of day to the new time and update NTP and notify hrtimers
*/
int do_settimeofday ( struct timespec * tv )
{
unsigned long flags ;
time_t wtm_sec , sec = tv - > tv_sec ;
long wtm_nsec , nsec = tv - > tv_nsec ;
if ( ( unsigned long ) tv - > tv_nsec > = NSEC_PER_SEC )
return - EINVAL ;
write_seqlock_irqsave ( & xtime_lock , flags ) ;
nsec - = __get_nsec_offset ( ) ;
wtm_sec = wall_to_monotonic . tv_sec + ( xtime . tv_sec - sec ) ;
wtm_nsec = wall_to_monotonic . tv_nsec + ( xtime . tv_nsec - nsec ) ;
set_normalized_timespec ( & xtime , sec , nsec ) ;
set_normalized_timespec ( & wall_to_monotonic , wtm_sec , wtm_nsec ) ;
2006-07-10 15:44:32 +04:00
clock - > error = 0 ;
2006-06-26 11:25:08 +04:00
ntp_clear ( ) ;
write_sequnlock_irqrestore ( & xtime_lock , flags ) ;
/* signal hrtimers about time change */
clock_was_set ( ) ;
return 0 ;
}
EXPORT_SYMBOL ( do_settimeofday ) ;
/**
* change_clocksource - Swaps clocksources if a new one is available
*
* Accumulates current time interval and initializes new clocksource
*/
2007-02-16 12:27:43 +03:00
static void change_clocksource ( void )
2006-06-26 11:25:08 +04:00
{
struct clocksource * new ;
cycle_t now ;
u64 nsec ;
2007-02-16 12:27:43 +03:00
2006-06-26 11:25:14 +04:00
new = clocksource_get_next ( ) ;
2007-02-16 12:27:43 +03:00
if ( clock = = new )
return ;
now = clocksource_read ( new ) ;
nsec = __get_nsec_offset ( ) ;
timespec_add_ns ( & xtime , nsec ) ;
clock = new ;
clock - > cycle_last = now ;
clock - > error = 0 ;
clock - > xtime_nsec = 0 ;
clocksource_calculate_interval ( clock , NTP_INTERVAL_LENGTH ) ;
printk ( KERN_INFO " Time: %s clocksource has been installed. \n " ,
clock - > name ) ;
2006-06-26 11:25:08 +04:00
}
# else
2007-02-16 12:27:43 +03:00
static inline void change_clocksource ( void ) { }
2006-06-26 11:25:08 +04:00
# endif
/**
* timeofday_is_continuous - check to see if timekeeping is free running
*/
int timekeeping_is_continuous ( void )
{
unsigned long seq ;
int ret ;
do {
seq = read_seqbegin ( & xtime_lock ) ;
2007-02-16 12:27:43 +03:00
ret = clock - > flags & CLOCK_SOURCE_VALID_FOR_HRES ;
2006-06-26 11:25:08 +04:00
} while ( read_seqretry ( & xtime_lock , seq ) ) ;
return ret ;
}
2007-02-16 12:27:30 +03:00
/**
* read_persistent_clock - Return time in seconds from the persistent clock .
*
* Weak dummy function for arches that do not yet support it .
* Returns seconds from epoch using the battery backed persistent clock .
* Returns zero if unsupported .
*
* XXX - Do be sure to remove it once all arches implement it .
*/
unsigned long __attribute__ ( ( weak ) ) read_persistent_clock ( void )
{
return 0 ;
}
2005-04-17 02:20:36 +04:00
/*
2006-06-26 11:25:06 +04:00
* timekeeping_init - Initializes the clocksource and common timekeeping values
2005-04-17 02:20:36 +04:00
*/
2006-06-26 11:25:06 +04:00
void __init timekeeping_init ( void )
2005-04-17 02:20:36 +04:00
{
2006-06-26 11:25:06 +04:00
unsigned long flags ;
2007-02-16 12:27:30 +03:00
unsigned long sec = read_persistent_clock ( ) ;
2006-06-26 11:25:06 +04:00
write_seqlock_irqsave ( & xtime_lock , flags ) ;
2006-10-01 10:28:22 +04:00
ntp_clear ( ) ;
2006-06-26 11:25:14 +04:00
clock = clocksource_get_next ( ) ;
2007-02-16 12:27:26 +03:00
clocksource_calculate_interval ( clock , NTP_INTERVAL_LENGTH ) ;
2006-06-26 11:25:18 +04:00
clock - > cycle_last = clocksource_read ( clock ) ;
2006-10-01 10:28:22 +04:00
2007-02-16 12:27:30 +03:00
xtime . tv_sec = sec ;
xtime . tv_nsec = 0 ;
set_normalized_timespec ( & wall_to_monotonic ,
- xtime . tv_sec , - xtime . tv_nsec ) ;
2006-06-26 11:25:06 +04:00
write_sequnlock_irqrestore ( & xtime_lock , flags ) ;
}
2007-02-16 12:27:30 +03:00
/* flag for if timekeeping is suspended */
2006-07-14 11:24:17 +04:00
static int timekeeping_suspended ;
2007-02-16 12:27:30 +03:00
/* time in seconds when suspend began */
static unsigned long timekeeping_suspend_time ;
2006-09-29 12:59:46 +04:00
/**
2006-06-26 11:25:06 +04:00
* timekeeping_resume - Resumes the generic timekeeping subsystem .
* @ dev : unused
*
* This is for the generic clocksource timekeeping .
2006-10-01 10:28:31 +04:00
* xtime / wall_to_monotonic / jiffies / etc are
2006-06-26 11:25:06 +04:00
* still managed by arch specific suspend / resume code .
*/
static int timekeeping_resume ( struct sys_device * dev )
{
unsigned long flags ;
2007-02-16 12:27:30 +03:00
unsigned long now = read_persistent_clock ( ) ;
2006-06-26 11:25:06 +04:00
write_seqlock_irqsave ( & xtime_lock , flags ) ;
2007-02-16 12:27:30 +03:00
if ( now & & ( now > timekeeping_suspend_time ) ) {
unsigned long sleep_length = now - timekeeping_suspend_time ;
xtime . tv_sec + = sleep_length ;
wall_to_monotonic . tv_sec - = sleep_length ;
}
/* re-base the last cycle value */
2006-06-26 11:25:18 +04:00
clock - > cycle_last = clocksource_read ( clock ) ;
2006-07-14 11:24:17 +04:00
clock - > error = 0 ;
timekeeping_suspended = 0 ;
write_sequnlock_irqrestore ( & xtime_lock , flags ) ;
2007-02-16 12:27:30 +03:00
touch_softlockup_watchdog ( ) ;
hrtimer_notify_resume ( ) ;
2006-07-14 11:24:17 +04:00
return 0 ;
}
static int timekeeping_suspend ( struct sys_device * dev , pm_message_t state )
{
unsigned long flags ;
write_seqlock_irqsave ( & xtime_lock , flags ) ;
timekeeping_suspended = 1 ;
2007-02-16 12:27:30 +03:00
timekeeping_suspend_time = read_persistent_clock ( ) ;
2006-06-26 11:25:06 +04:00
write_sequnlock_irqrestore ( & xtime_lock , flags ) ;
return 0 ;
}
/* sysfs resume/suspend bits for timekeeping */
static struct sysdev_class timekeeping_sysclass = {
. resume = timekeeping_resume ,
2006-07-14 11:24:17 +04:00
. suspend = timekeeping_suspend ,
2006-06-26 11:25:06 +04:00
set_kset_name ( " timekeeping " ) ,
} ;
static struct sys_device device_timer = {
. id = 0 ,
. cls = & timekeeping_sysclass ,
} ;
static int __init timekeeping_init_device ( void )
{
int error = sysdev_class_register ( & timekeeping_sysclass ) ;
if ( ! error )
error = sysdev_register ( & device_timer ) ;
return error ;
}
device_initcall ( timekeeping_init_device ) ;
2006-06-26 11:25:18 +04:00
/*
2006-07-10 15:44:32 +04:00
* If the error is already larger , we look ahead even further
2006-06-26 11:25:18 +04:00
* to compensate for late or lost adjustments .
*/
2006-12-10 13:21:33 +03:00
static __always_inline int clocksource_bigadjust ( s64 error , s64 * interval ,
s64 * offset )
2006-06-26 11:25:18 +04:00
{
2006-07-10 15:44:32 +04:00
s64 tick_error , i ;
u32 look_ahead , adj ;
s32 error2 , mult ;
2006-06-26 11:25:18 +04:00
/*
2006-07-10 15:44:32 +04:00
* Use the current error value to determine how much to look ahead .
* The larger the error the slower we adjust for it to avoid problems
* with losing too many ticks , otherwise we would overadjust and
* produce an even larger error . The smaller the adjustment the
* faster we try to adjust for it , as lost ticks can do less harm
* here . This is tuned so that an error of about 1 msec is adusted
* within about 1 sec ( or 2 ^ 20 nsec in 2 ^ SHIFT_HZ ticks ) .
2006-06-26 11:25:18 +04:00
*/
2006-07-10 15:44:32 +04:00
error2 = clock - > error > > ( TICK_LENGTH_SHIFT + 22 - 2 * SHIFT_HZ ) ;
error2 = abs ( error2 ) ;
for ( look_ahead = 0 ; error2 > 0 ; look_ahead + + )
error2 > > = 2 ;
2006-06-26 11:25:18 +04:00
/*
2006-07-10 15:44:32 +04:00
* Now calculate the error in ( 1 < < look_ahead ) ticks , but first
* remove the single look ahead already included in the error .
2006-06-26 11:25:18 +04:00
*/
2006-12-10 13:21:33 +03:00
tick_error = current_tick_length ( ) > >
( TICK_LENGTH_SHIFT - clock - > shift + 1 ) ;
2006-07-10 15:44:32 +04:00
tick_error - = clock - > xtime_interval > > 1 ;
error = ( ( error - tick_error ) > > look_ahead ) + tick_error ;
/* Finally calculate the adjustment shift value. */
i = * interval ;
mult = 1 ;
if ( error < 0 ) {
error = - error ;
* interval = - * interval ;
* offset = - * offset ;
mult = - 1 ;
2006-06-26 11:25:18 +04:00
}
2006-07-10 15:44:32 +04:00
for ( adj = 0 ; error > i ; adj + + )
error > > = 1 ;
2006-06-26 11:25:18 +04:00
* interval < < = adj ;
* offset < < = adj ;
2006-07-10 15:44:32 +04:00
return mult < < adj ;
2006-06-26 11:25:18 +04:00
}
/*
* Adjust the multiplier to reduce the error value ,
* this is optimized for the most common adjustments of - 1 , 0 , 1 ,
* for other values we can do a bit more work .
*/
static void clocksource_adjust ( struct clocksource * clock , s64 offset )
{
s64 error , interval = clock - > cycle_interval ;
int adj ;
error = clock - > error > > ( TICK_LENGTH_SHIFT - clock - > shift - 1 ) ;
if ( error > interval ) {
2006-07-10 15:44:32 +04:00
error > > = 2 ;
if ( likely ( error < = interval ) )
adj = 1 ;
else
adj = clocksource_bigadjust ( error , & interval , & offset ) ;
2006-06-26 11:25:18 +04:00
} else if ( error < - interval ) {
2006-07-10 15:44:32 +04:00
error > > = 2 ;
if ( likely ( error > = - interval ) ) {
adj = - 1 ;
interval = - interval ;
offset = - offset ;
} else
adj = clocksource_bigadjust ( error , & interval , & offset ) ;
2006-06-26 11:25:18 +04:00
} else
return ;
clock - > mult + = adj ;
clock - > xtime_interval + = interval ;
clock - > xtime_nsec - = offset ;
2006-12-10 13:21:33 +03:00
clock - > error - = ( interval - offset ) < <
( TICK_LENGTH_SHIFT - clock - > shift ) ;
2006-06-26 11:25:18 +04:00
}
2006-09-29 12:59:46 +04:00
/**
2006-06-26 11:25:06 +04:00
* update_wall_time - Uses the current clocksource to increment the wall time
*
* Called from the timer interrupt , must hold a write on xtime_lock .
*/
static void update_wall_time ( void )
{
2006-06-26 11:25:18 +04:00
cycle_t offset ;
2006-06-26 11:25:06 +04:00
2006-07-14 11:24:17 +04:00
/* Make sure we're fully resumed: */
if ( unlikely ( timekeeping_suspended ) )
return ;
2006-06-26 11:25:07 +04:00
2006-06-26 11:25:18 +04:00
# ifdef CONFIG_GENERIC_TIME
offset = ( clocksource_read ( clock ) - clock - > cycle_last ) & clock - > mask ;
# else
offset = clock - > cycle_interval ;
# endif
2006-07-14 11:24:17 +04:00
clock - > xtime_nsec + = ( s64 ) xtime . tv_nsec < < clock - > shift ;
2006-06-26 11:25:06 +04:00
/* normally this loop will run just once, however in the
* case of lost or late ticks , it will accumulate correctly .
*/
2006-06-26 11:25:18 +04:00
while ( offset > = clock - > cycle_interval ) {
2006-06-26 11:25:06 +04:00
/* accumulate one interval */
2006-06-26 11:25:18 +04:00
clock - > xtime_nsec + = clock - > xtime_interval ;
clock - > cycle_last + = clock - > cycle_interval ;
offset - = clock - > cycle_interval ;
if ( clock - > xtime_nsec > = ( u64 ) NSEC_PER_SEC < < clock - > shift ) {
clock - > xtime_nsec - = ( u64 ) NSEC_PER_SEC < < clock - > shift ;
xtime . tv_sec + + ;
second_overflow ( ) ;
}
2006-06-26 11:25:06 +04:00
2006-06-26 11:25:07 +04:00
/* interpolator bits */
2006-06-26 11:25:18 +04:00
time_interpolator_update ( clock - > xtime_interval
2006-06-26 11:25:07 +04:00
> > clock - > shift ) ;
/* accumulate error between NTP and clock interval */
2006-06-26 11:25:18 +04:00
clock - > error + = current_tick_length ( ) ;
clock - > error - = clock - > xtime_interval < < ( TICK_LENGTH_SHIFT - clock - > shift ) ;
}
2006-06-26 11:25:07 +04:00
2006-06-26 11:25:18 +04:00
/* correct the clock when NTP error is too big */
clocksource_adjust ( clock , offset ) ;
2006-06-26 11:25:07 +04:00
/* store full nanoseconds into xtime */
2006-07-10 15:44:32 +04:00
xtime . tv_nsec = ( s64 ) clock - > xtime_nsec > > clock - > shift ;
2006-06-26 11:25:18 +04:00
clock - > xtime_nsec - = ( s64 ) xtime . tv_nsec < < clock - > shift ;
2006-06-26 11:25:08 +04:00
/* check to see if there is a new clocksource to use */
2007-02-16 12:27:43 +03:00
change_clocksource ( ) ;
2005-04-17 02:20:36 +04:00
}
/*
* Called from the timer interrupt handler to charge one tick to the current
* process . user_tick is 1 if the tick is user time , 0 for system .
*/
void update_process_times ( int user_tick )
{
struct task_struct * p = current ;
int cpu = smp_processor_id ( ) ;
/* Note: this timer irq context must be accounted for as well. */
if ( user_tick )
account_user_time ( p , jiffies_to_cputime ( 1 ) ) ;
else
account_system_time ( p , HARDIRQ_OFFSET , jiffies_to_cputime ( 1 ) ) ;
run_local_timers ( ) ;
if ( rcu_pending ( cpu ) )
rcu_check_callbacks ( cpu , user_tick ) ;
scheduler_tick ( ) ;
run_posix_cpu_timers ( p ) ;
}
/*
* Nr of active tasks - counted in fixed - point numbers
*/
static unsigned long count_active_tasks ( void )
{
2006-03-31 14:31:21 +04:00
return nr_active ( ) * FIXED_1 ;
2005-04-17 02:20:36 +04:00
}
/*
* Hmm . . Changed this , as the GNU make sources ( load . c ) seems to
* imply that avenrun [ ] is the standard name for this kind of thing .
* Nothing else seems to be standardized : the fractional size etc
* all seem to differ on different machines .
*
* Requires xtime_lock to access .
*/
unsigned long avenrun [ 3 ] ;
EXPORT_SYMBOL ( avenrun ) ;
/*
* calc_load - given tick count , update the avenrun load estimates .
* This is called while holding a write_lock on xtime_lock .
*/
static inline void calc_load ( unsigned long ticks )
{
unsigned long active_tasks ; /* fixed-point */
static int count = LOAD_FREQ ;
2006-12-13 11:35:45 +03:00
count - = ticks ;
if ( unlikely ( count < 0 ) ) {
active_tasks = count_active_tasks ( ) ;
do {
CALC_LOAD ( avenrun [ 0 ] , EXP_1 , active_tasks ) ;
CALC_LOAD ( avenrun [ 1 ] , EXP_5 , active_tasks ) ;
CALC_LOAD ( avenrun [ 2 ] , EXP_15 , active_tasks ) ;
count + = LOAD_FREQ ;
} while ( count < 0 ) ;
2005-04-17 02:20:36 +04:00
}
}
/*
* This read - write spinlock protects us from races in SMP while
* playing with xtime and avenrun .
*/
2007-02-13 15:26:21 +03:00
__attribute__ ( ( weak ) ) __cacheline_aligned_in_smp DEFINE_SEQLOCK ( xtime_lock ) ;
2005-04-17 02:20:36 +04:00
EXPORT_SYMBOL ( xtime_lock ) ;
/*
* This function runs timers and the timer - tq in bottom half context .
*/
static void run_timer_softirq ( struct softirq_action * h )
{
2006-03-24 14:15:54 +03:00
tvec_base_t * base = __get_cpu_var ( tvec_bases ) ;
2005-04-17 02:20:36 +04:00
2006-01-10 07:52:32 +03:00
hrtimer_run_queues ( ) ;
2005-04-17 02:20:36 +04:00
if ( time_after_eq ( jiffies , base - > timer_jiffies ) )
__run_timers ( base ) ;
}
/*
* Called by the local , per - CPU timer interrupt on SMP .
*/
void run_local_timers ( void )
{
raise_softirq ( TIMER_SOFTIRQ ) ;
2006-03-24 14:18:41 +03:00
softlockup_tick ( ) ;
2005-04-17 02:20:36 +04:00
}
/*
* Called by the timer interrupt . xtime_lock must already be taken
* by the timer IRQ !
*/
2006-09-29 13:00:32 +04:00
static inline void update_times ( unsigned long ticks )
2005-04-17 02:20:36 +04:00
{
2006-06-26 11:25:06 +04:00
update_wall_time ( ) ;
2005-04-17 02:20:36 +04:00
calc_load ( ticks ) ;
}
/*
* The 64 - bit jiffies value is not atomic - you MUST NOT read it
* without sampling the sequence number in xtime_lock .
* jiffies is defined in the linker script . . .
*/
2006-09-29 13:00:32 +04:00
void do_timer ( unsigned long ticks )
2005-04-17 02:20:36 +04:00
{
2006-09-29 13:00:32 +04:00
jiffies_64 + = ticks ;
update_times ( ticks ) ;
2005-04-17 02:20:36 +04:00
}
# ifdef __ARCH_WANT_SYS_ALARM
/*
* For backwards compatibility ? This can be done in libc so Alpha
* and all newer ports shouldn ' t need it .
*/
asmlinkage unsigned long sys_alarm ( unsigned int seconds )
{
2006-03-25 14:06:33 +03:00
return alarm_setitimer ( seconds ) ;
2005-04-17 02:20:36 +04:00
}
# endif
# ifndef __alpha__
/*
* The Alpha uses getxpid , getxuid , and getxgid instead . Maybe this
* should be moved into arch / i386 instead ?
*/
/**
* sys_getpid - return the thread group id of the current process
*
* Note , despite the name , this returns the tgid not the pid . The tgid and
* the pid are identical unless CLONE_THREAD was specified on clone ( ) in
* which case the tgid is the same in all threads of the same group .
*
* This is SMP safe as current - > tgid does not change .
*/
asmlinkage long sys_getpid ( void )
{
return current - > tgid ;
}
/*
2006-08-14 10:24:23 +04:00
* Accessing - > real_parent is not SMP - safe , it could
* change from under us . However , we can use a stale
* value of - > real_parent under rcu_read_lock ( ) , see
* release_task ( ) - > call_rcu ( delayed_put_task_struct ) .
2005-04-17 02:20:36 +04:00
*/
asmlinkage long sys_getppid ( void )
{
int pid ;
2006-08-14 10:24:23 +04:00
rcu_read_lock ( ) ;
pid = rcu_dereference ( current - > real_parent ) - > tgid ;
rcu_read_unlock ( ) ;
2005-04-17 02:20:36 +04:00
return pid ;
}
asmlinkage long sys_getuid ( void )
{
/* Only we change this so SMP safe */
return current - > uid ;
}
asmlinkage long sys_geteuid ( void )
{
/* Only we change this so SMP safe */
return current - > euid ;
}
asmlinkage long sys_getgid ( void )
{
/* Only we change this so SMP safe */
return current - > gid ;
}
asmlinkage long sys_getegid ( void )
{
/* Only we change this so SMP safe */
return current - > egid ;
}
# endif
static void process_timeout ( unsigned long __data )
{
2006-07-03 11:25:41 +04:00
wake_up_process ( ( struct task_struct * ) __data ) ;
2005-04-17 02:20:36 +04:00
}
/**
* schedule_timeout - sleep until timeout
* @ timeout : timeout value in jiffies
*
* Make the current task sleep until @ timeout jiffies have
* elapsed . The routine will return immediately unless
* the current task state has been set ( see set_current_state ( ) ) .
*
* You can set the task state as follows -
*
* % TASK_UNINTERRUPTIBLE - at least @ timeout jiffies are guaranteed to
* pass before the routine returns . The routine will return 0
*
* % TASK_INTERRUPTIBLE - the routine may return early if a signal is
* delivered to the current task . In this case the remaining time
* in jiffies will be returned , or 0 if the timer expired in time
*
* The current task state is guaranteed to be TASK_RUNNING when this
* routine returns .
*
* Specifying a @ timeout value of % MAX_SCHEDULE_TIMEOUT will schedule
* the CPU away without a bound on the timeout . In this case the return
* value will be % MAX_SCHEDULE_TIMEOUT .
*
* In all cases the return value is guaranteed to be non - negative .
*/
fastcall signed long __sched schedule_timeout ( signed long timeout )
{
struct timer_list timer ;
unsigned long expire ;
switch ( timeout )
{
case MAX_SCHEDULE_TIMEOUT :
/*
* These two special cases are useful to be comfortable
* in the caller . Nothing more . We could take
* MAX_SCHEDULE_TIMEOUT from one of the negative value
* but I ' d like to return a valid offset ( > = 0 ) to allow
* the caller to do everything it want with the retval .
*/
schedule ( ) ;
goto out ;
default :
/*
* Another bit of PARANOID . Note that the retval will be
* 0 since no piece of kernel is supposed to do a check
* for a negative retval of schedule_timeout ( ) ( since it
* should never happens anyway ) . You just have the printk ( )
* that will tell you if something is gone wrong and where .
*/
2006-12-22 12:10:14 +03:00
if ( timeout < 0 ) {
2005-04-17 02:20:36 +04:00
printk ( KERN_ERR " schedule_timeout: wrong timeout "
2006-12-22 12:10:14 +03:00
" value %lx \n " , timeout ) ;
dump_stack ( ) ;
2005-04-17 02:20:36 +04:00
current - > state = TASK_RUNNING ;
goto out ;
}
}
expire = timeout + jiffies ;
2005-10-31 02:01:38 +03:00
setup_timer ( & timer , process_timeout , ( unsigned long ) current ) ;
__mod_timer ( & timer , expire ) ;
2005-04-17 02:20:36 +04:00
schedule ( ) ;
del_singleshot_timer_sync ( & timer ) ;
timeout = expire - jiffies ;
out :
return timeout < 0 ? 0 : timeout ;
}
EXPORT_SYMBOL ( schedule_timeout ) ;
2005-09-13 12:25:15 +04:00
/*
* We can use __set_current_state ( ) here because schedule_timeout ( ) calls
* schedule ( ) unconditionally .
*/
2005-09-10 11:27:21 +04:00
signed long __sched schedule_timeout_interruptible ( signed long timeout )
{
2005-10-31 02:01:42 +03:00
__set_current_state ( TASK_INTERRUPTIBLE ) ;
return schedule_timeout ( timeout ) ;
2005-09-10 11:27:21 +04:00
}
EXPORT_SYMBOL ( schedule_timeout_interruptible ) ;
signed long __sched schedule_timeout_uninterruptible ( signed long timeout )
{
2005-10-31 02:01:42 +03:00
__set_current_state ( TASK_UNINTERRUPTIBLE ) ;
return schedule_timeout ( timeout ) ;
2005-09-10 11:27:21 +04:00
}
EXPORT_SYMBOL ( schedule_timeout_uninterruptible ) ;
2005-04-17 02:20:36 +04:00
/* Thread ID - the internal kernel "pid" */
asmlinkage long sys_gettid ( void )
{
return current - > pid ;
}
2006-09-29 12:59:46 +04:00
/**
2007-02-10 12:46:00 +03:00
* do_sysinfo - fill in sysinfo struct
2006-09-29 12:59:46 +04:00
* @ info : pointer to buffer to fill
2005-04-17 02:20:36 +04:00
*/
2007-02-10 12:46:00 +03:00
int do_sysinfo ( struct sysinfo * info )
2005-04-17 02:20:36 +04:00
{
unsigned long mem_total , sav_total ;
unsigned int mem_unit , bitcount ;
unsigned long seq ;
2007-02-10 12:46:00 +03:00
memset ( info , 0 , sizeof ( struct sysinfo ) ) ;
2005-04-17 02:20:36 +04:00
do {
struct timespec tp ;
seq = read_seqbegin ( & xtime_lock ) ;
/*
* This is annoying . The below is the same thing
* posix_get_clock_monotonic ( ) does , but it wants to
* take the lock which we want to cover the loads stuff
* too .
*/
getnstimeofday ( & tp ) ;
tp . tv_sec + = wall_to_monotonic . tv_sec ;
tp . tv_nsec + = wall_to_monotonic . tv_nsec ;
if ( tp . tv_nsec - NSEC_PER_SEC > = 0 ) {
tp . tv_nsec = tp . tv_nsec - NSEC_PER_SEC ;
tp . tv_sec + + ;
}
2007-02-10 12:46:00 +03:00
info - > uptime = tp . tv_sec + ( tp . tv_nsec ? 1 : 0 ) ;
2005-04-17 02:20:36 +04:00
2007-02-10 12:46:00 +03:00
info - > loads [ 0 ] = avenrun [ 0 ] < < ( SI_LOAD_SHIFT - FSHIFT ) ;
info - > loads [ 1 ] = avenrun [ 1 ] < < ( SI_LOAD_SHIFT - FSHIFT ) ;
info - > loads [ 2 ] = avenrun [ 2 ] < < ( SI_LOAD_SHIFT - FSHIFT ) ;
2005-04-17 02:20:36 +04:00
2007-02-10 12:46:00 +03:00
info - > procs = nr_threads ;
2005-04-17 02:20:36 +04:00
} while ( read_seqretry ( & xtime_lock , seq ) ) ;
2007-02-10 12:46:00 +03:00
si_meminfo ( info ) ;
si_swapinfo ( info ) ;
2005-04-17 02:20:36 +04:00
/*
* If the sum of all the available memory ( i . e . ram + swap )
* is less than can be stored in a 32 bit unsigned long then
* we can be binary compatible with 2.2 . x kernels . If not ,
* well , in that case 2.2 . x was broken anyways . . .
*
* - Erik Andersen < andersee @ debian . org >
*/
2007-02-10 12:46:00 +03:00
mem_total = info - > totalram + info - > totalswap ;
if ( mem_total < info - > totalram | | mem_total < info - > totalswap )
2005-04-17 02:20:36 +04:00
goto out ;
bitcount = 0 ;
2007-02-10 12:46:00 +03:00
mem_unit = info - > mem_unit ;
2005-04-17 02:20:36 +04:00
while ( mem_unit > 1 ) {
bitcount + + ;
mem_unit > > = 1 ;
sav_total = mem_total ;
mem_total < < = 1 ;
if ( mem_total < sav_total )
goto out ;
}
/*
* If mem_total did not overflow , multiply all memory values by
2007-02-10 12:46:00 +03:00
* info - > mem_unit and set it to 1. This leaves things compatible
2005-04-17 02:20:36 +04:00
* with 2.2 . x , and also retains compatibility with earlier 2.4 . x
* kernels . . .
*/
2007-02-10 12:46:00 +03:00
info - > mem_unit = 1 ;
info - > totalram < < = bitcount ;
info - > freeram < < = bitcount ;
info - > sharedram < < = bitcount ;
info - > bufferram < < = bitcount ;
info - > totalswap < < = bitcount ;
info - > freeswap < < = bitcount ;
info - > totalhigh < < = bitcount ;
info - > freehigh < < = bitcount ;
out :
return 0 ;
}
asmlinkage long sys_sysinfo ( struct sysinfo __user * info )
{
struct sysinfo val ;
do_sysinfo ( & val ) ;
2005-04-17 02:20:36 +04:00
if ( copy_to_user ( info , & val , sizeof ( struct sysinfo ) ) )
return - EFAULT ;
return 0 ;
}
2006-07-03 11:25:10 +04:00
/*
* lockdep : we want to track each per - CPU base as a separate lock - class ,
* but timer - bases are kmalloc ( ) - ed , so we need to attach separate
* keys to them :
*/
static struct lock_class_key base_lock_keys [ NR_CPUS ] ;
2006-03-24 14:15:54 +03:00
static int __devinit init_timers_cpu ( int cpu )
2005-04-17 02:20:36 +04:00
{
int j ;
tvec_base_t * base ;
2006-04-11 09:53:58 +04:00
static char __devinitdata tvec_base_done [ NR_CPUS ] ;
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync have checked all cpus and before it will recheck
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes it's base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
2006-04-11 09:53:58 +04:00
if ( ! tvec_base_done [ cpu ] ) {
2006-03-24 14:15:54 +03:00
static char boot_done ;
if ( boot_done ) {
2006-04-11 09:53:58 +04:00
/*
* The APs use this path later in boot
*/
2006-03-24 14:15:54 +03:00
base = kmalloc_node ( sizeof ( * base ) , GFP_KERNEL ,
cpu_to_node ( cpu ) ) ;
if ( ! base )
return - ENOMEM ;
memset ( base , 0 , sizeof ( * base ) ) ;
2006-04-11 09:53:58 +04:00
per_cpu ( tvec_bases , cpu ) = base ;
2006-03-24 14:15:54 +03:00
} else {
2006-04-11 09:53:58 +04:00
/*
* This is for the boot CPU - we use compile - time
* static initialisation because per - cpu memory isn ' t
* ready yet and because the memory allocators are not
* initialised either .
*/
2006-03-24 14:15:54 +03:00
boot_done = 1 ;
2006-04-11 09:53:58 +04:00
base = & boot_tvec_bases ;
2006-03-24 14:15:54 +03:00
}
2006-04-11 09:53:58 +04:00
tvec_base_done [ cpu ] = 1 ;
} else {
base = per_cpu ( tvec_bases , cpu ) ;
2006-03-24 14:15:54 +03:00
}
2006-04-11 09:53:58 +04:00
2006-03-31 14:30:30 +04:00
spin_lock_init ( & base - > lock ) ;
2006-07-03 11:25:10 +04:00
lockdep_set_class ( & base - > lock , base_lock_keys + cpu ) ;
2005-04-17 02:20:36 +04:00
for ( j = 0 ; j < TVN_SIZE ; j + + ) {
INIT_LIST_HEAD ( base - > tv5 . vec + j ) ;
INIT_LIST_HEAD ( base - > tv4 . vec + j ) ;
INIT_LIST_HEAD ( base - > tv3 . vec + j ) ;
INIT_LIST_HEAD ( base - > tv2 . vec + j ) ;
}
for ( j = 0 ; j < TVR_SIZE ; j + + )
INIT_LIST_HEAD ( base - > tv1 . vec + j ) ;
base - > timer_jiffies = jiffies ;
2006-03-24 14:15:54 +03:00
return 0 ;
2005-04-17 02:20:36 +04:00
}
# ifdef CONFIG_HOTPLUG_CPU
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync have checked all cpus and before it will recheck
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes it's base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
static void migrate_timer_list ( tvec_base_t * new_base , struct list_head * head )
2005-04-17 02:20:36 +04:00
{
struct timer_list * timer ;
while ( ! list_empty ( head ) ) {
timer = list_entry ( head - > next , struct timer_list , entry ) ;
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync have checked all cpus and before it will recheck
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes it's base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
detach_timer ( timer , 0 ) ;
2006-03-31 14:30:30 +04:00
timer - > base = new_base ;
2005-04-17 02:20:36 +04:00
internal_add_timer ( new_base , timer ) ;
}
}
static void __devinit migrate_timers ( int cpu )
{
tvec_base_t * old_base ;
tvec_base_t * new_base ;
int i ;
BUG_ON ( cpu_online ( cpu ) ) ;
2006-03-24 14:15:54 +03:00
old_base = per_cpu ( tvec_bases , cpu ) ;
new_base = get_cpu_var ( tvec_bases ) ;
2005-04-17 02:20:36 +04:00
local_irq_disable ( ) ;
2006-03-31 14:30:30 +04:00
spin_lock ( & new_base - > lock ) ;
spin_lock ( & old_base - > lock ) ;
BUG_ON ( old_base - > running_timer ) ;
2005-04-17 02:20:36 +04:00
for ( i = 0 ; i < TVR_SIZE ; i + + )
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync have checked all cpus and before it will recheck
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes it's base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
migrate_timer_list ( new_base , old_base - > tv1 . vec + i ) ;
for ( i = 0 ; i < TVN_SIZE ; i + + ) {
migrate_timer_list ( new_base , old_base - > tv2 . vec + i ) ;
migrate_timer_list ( new_base , old_base - > tv3 . vec + i ) ;
migrate_timer_list ( new_base , old_base - > tv4 . vec + i ) ;
migrate_timer_list ( new_base , old_base - > tv5 . vec + i ) ;
}
2006-03-31 14:30:30 +04:00
spin_unlock ( & old_base - > lock ) ;
spin_unlock ( & new_base - > lock ) ;
2005-04-17 02:20:36 +04:00
local_irq_enable ( ) ;
put_cpu_var ( tvec_bases ) ;
}
# endif /* CONFIG_HOTPLUG_CPU */
2006-07-30 14:03:35 +04:00
static int __cpuinit timer_cpu_notify ( struct notifier_block * self ,
2005-04-17 02:20:36 +04:00
unsigned long action , void * hcpu )
{
long cpu = ( long ) hcpu ;
switch ( action ) {
case CPU_UP_PREPARE :
2006-03-24 14:15:54 +03:00
if ( init_timers_cpu ( cpu ) < 0 )
return NOTIFY_BAD ;
2005-04-17 02:20:36 +04:00
break ;
# ifdef CONFIG_HOTPLUG_CPU
case CPU_DEAD :
migrate_timers ( cpu ) ;
break ;
# endif
default :
break ;
}
return NOTIFY_OK ;
}
2006-07-30 14:03:35 +04:00
static struct notifier_block __cpuinitdata timers_nb = {
2005-04-17 02:20:36 +04:00
. notifier_call = timer_cpu_notify ,
} ;
void __init init_timers ( void )
{
2006-09-29 13:00:22 +04:00
int err = timer_cpu_notify ( & timers_nb , ( unsigned long ) CPU_UP_PREPARE ,
2005-04-17 02:20:36 +04:00
( void * ) ( long ) smp_processor_id ( ) ) ;
2006-09-29 13:00:22 +04:00
BUG_ON ( err = = NOTIFY_BAD ) ;
2005-04-17 02:20:36 +04:00
register_cpu_notifier ( & timers_nb ) ;
open_softirq ( TIMER_SOFTIRQ , run_timer_softirq , NULL ) ;
}
# ifdef CONFIG_TIME_INTERPOLATION
2006-03-17 10:04:00 +03:00
struct time_interpolator * time_interpolator __read_mostly ;
static struct time_interpolator * time_interpolator_list __read_mostly ;
2005-04-17 02:20:36 +04:00
static DEFINE_SPINLOCK ( time_interpolator_lock ) ;
2007-02-10 12:45:40 +03:00
static inline cycles_t time_interpolator_get_cycles ( unsigned int src )
2005-04-17 02:20:36 +04:00
{
unsigned long ( * x ) ( void ) ;
switch ( src )
{
case TIME_SOURCE_FUNCTION :
x = time_interpolator - > addr ;
return x ( ) ;
case TIME_SOURCE_MMIO64 :
2006-03-02 13:54:35 +03:00
return readq_relaxed ( ( void __iomem * ) time_interpolator - > addr ) ;
2005-04-17 02:20:36 +04:00
case TIME_SOURCE_MMIO32 :
2006-03-02 13:54:35 +03:00
return readl_relaxed ( ( void __iomem * ) time_interpolator - > addr ) ;
2005-04-17 02:20:36 +04:00
default : return get_cycles ( ) ;
}
}
2005-09-07 02:17:04 +04:00
static inline u64 time_interpolator_get_counter ( int writelock )
2005-04-17 02:20:36 +04:00
{
unsigned int src = time_interpolator - > source ;
if ( time_interpolator - > jitter )
{
2007-02-10 12:45:40 +03:00
cycles_t lcycle ;
cycles_t now ;
2005-04-17 02:20:36 +04:00
do {
lcycle = time_interpolator - > last_cycle ;
now = time_interpolator_get_cycles ( src ) ;
if ( lcycle & & time_after ( lcycle , now ) )
return lcycle ;
2005-09-07 02:17:04 +04:00
/* When holding the xtime write lock, there's no need
* to add the overhead of the cmpxchg . Readers are
* force to retry until the write lock is released .
*/
if ( writelock ) {
time_interpolator - > last_cycle = now ;
return now ;
}
2005-04-17 02:20:36 +04:00
/* Keep track of the last timer value returned. The use of cmpxchg here
* will cause contention in an SMP environment .
*/
} while ( unlikely ( cmpxchg ( & time_interpolator - > last_cycle , lcycle , now ) ! = lcycle ) ) ;
return now ;
}
else
return time_interpolator_get_cycles ( src ) ;
}
void time_interpolator_reset ( void )
{
time_interpolator - > offset = 0 ;
2005-09-07 02:17:04 +04:00
time_interpolator - > last_counter = time_interpolator_get_counter ( 1 ) ;
2005-04-17 02:20:36 +04:00
}
# define GET_TI_NSECS(count,i) (((((count) - i->last_counter) & (i)->mask) * (i)->nsec_per_cyc) >> (i)->shift)
unsigned long time_interpolator_get_offset ( void )
{
/* If we do not have a time interpolator set up then just return zero */
if ( ! time_interpolator )
return 0 ;
return time_interpolator - > offset +
2005-09-07 02:17:04 +04:00
GET_TI_NSECS ( time_interpolator_get_counter ( 0 ) , time_interpolator ) ;
2005-04-17 02:20:36 +04:00
}
# define INTERPOLATOR_ADJUST 65536
# define INTERPOLATOR_MAX_SKIP 10*INTERPOLATOR_ADJUST
2006-10-01 10:28:22 +04:00
void time_interpolator_update ( long delta_nsec )
2005-04-17 02:20:36 +04:00
{
u64 counter ;
unsigned long offset ;
/* If there is no time interpolator set up then do nothing */
if ( ! time_interpolator )
return ;
2005-10-31 02:01:42 +03:00
/*
* The interpolator compensates for late ticks by accumulating the late
* time in time_interpolator - > offset . A tick earlier than expected will
* lead to a reset of the offset and a corresponding jump of the clock
* forward . Again this only works if the interpolator clock is running
* slightly slower than the regular clock and the tuning logic insures
* that .
*/
2005-04-17 02:20:36 +04:00
2005-09-07 02:17:04 +04:00
counter = time_interpolator_get_counter ( 1 ) ;
2005-10-31 02:01:42 +03:00
offset = time_interpolator - > offset +
GET_TI_NSECS ( counter , time_interpolator ) ;
2005-04-17 02:20:36 +04:00
if ( delta_nsec < 0 | | ( unsigned long ) delta_nsec < offset )
time_interpolator - > offset = offset - delta_nsec ;
else {
time_interpolator - > skips + + ;
time_interpolator - > ns_skipped + = delta_nsec - offset ;
time_interpolator - > offset = 0 ;
}
time_interpolator - > last_counter = counter ;
/* Tuning logic for time interpolator invoked every minute or so.
* Decrease interpolator clock speed if no skips occurred and an offset is carried .
* Increase interpolator clock speed if we skip too much time .
*/
if ( jiffies % INTERPOLATOR_ADJUST = = 0 )
{
2006-04-07 21:50:18 +04:00
if ( time_interpolator - > skips = = 0 & & time_interpolator - > offset > tick_nsec )
2005-04-17 02:20:36 +04:00
time_interpolator - > nsec_per_cyc - - ;
if ( time_interpolator - > ns_skipped > INTERPOLATOR_MAX_SKIP & & time_interpolator - > offset = = 0 )
time_interpolator - > nsec_per_cyc + + ;
time_interpolator - > skips = 0 ;
time_interpolator - > ns_skipped = 0 ;
}
}
static inline int
is_better_time_interpolator ( struct time_interpolator * new )
{
if ( ! time_interpolator )
return 1 ;
return new - > frequency > 2 * time_interpolator - > frequency | |
( unsigned long ) new - > drift < ( unsigned long ) time_interpolator - > drift ;
}
void
register_time_interpolator ( struct time_interpolator * ti )
{
unsigned long flags ;
/* Sanity check */
2006-04-02 15:45:55 +04:00
BUG_ON ( ti - > frequency = = 0 | | ti - > mask = = 0 ) ;
2005-04-17 02:20:36 +04:00
ti - > nsec_per_cyc = ( ( u64 ) NSEC_PER_SEC < < ti - > shift ) / ti - > frequency ;
spin_lock ( & time_interpolator_lock ) ;
write_seqlock_irqsave ( & xtime_lock , flags ) ;
if ( is_better_time_interpolator ( ti ) ) {
time_interpolator = ti ;
time_interpolator_reset ( ) ;
}
write_sequnlock_irqrestore ( & xtime_lock , flags ) ;
ti - > next = time_interpolator_list ;
time_interpolator_list = ti ;
spin_unlock ( & time_interpolator_lock ) ;
}
void
unregister_time_interpolator ( struct time_interpolator * ti )
{
struct time_interpolator * curr , * * prev ;
unsigned long flags ;
spin_lock ( & time_interpolator_lock ) ;
prev = & time_interpolator_list ;
for ( curr = * prev ; curr ; curr = curr - > next ) {
if ( curr = = ti ) {
* prev = curr - > next ;
break ;
}
prev = & curr - > next ;
}
write_seqlock_irqsave ( & xtime_lock , flags ) ;
if ( ti = = time_interpolator ) {
/* we lost the best time-interpolator: */
time_interpolator = NULL ;
/* find the next-best interpolator */
for ( curr = time_interpolator_list ; curr ; curr = curr - > next )
if ( is_better_time_interpolator ( curr ) )
time_interpolator = curr ;
time_interpolator_reset ( ) ;
}
write_sequnlock_irqrestore ( & xtime_lock , flags ) ;
spin_unlock ( & time_interpolator_lock ) ;
}
# endif /* CONFIG_TIME_INTERPOLATION */
/**
* msleep - sleep safely even with waitqueue interruptions
* @ msecs : Time in milliseconds to sleep for
*/
void msleep ( unsigned int msecs )
{
unsigned long timeout = msecs_to_jiffies ( msecs ) + 1 ;
2005-09-10 11:27:24 +04:00
while ( timeout )
timeout = schedule_timeout_uninterruptible ( timeout ) ;
2005-04-17 02:20:36 +04:00
}
EXPORT_SYMBOL ( msleep ) ;
/**
2005-06-26 01:58:43 +04:00
* msleep_interruptible - sleep waiting for signals
2005-04-17 02:20:36 +04:00
* @ msecs : Time in milliseconds to sleep for
*/
unsigned long msleep_interruptible ( unsigned int msecs )
{
unsigned long timeout = msecs_to_jiffies ( msecs ) + 1 ;
2005-09-10 11:27:24 +04:00
while ( timeout & & ! signal_pending ( current ) )
timeout = schedule_timeout_interruptible ( timeout ) ;
2005-04-17 02:20:36 +04:00
return jiffies_to_msecs ( timeout ) ;
}
EXPORT_SYMBOL ( msleep_interruptible ) ;