2005-04-17 02:20:36 +04:00
/*
* linux / kernel / timer . c
*
* Kernel internal timers , kernel timekeeping , basic process system calls
*
* Copyright ( C ) 1991 , 1992 Linus Torvalds
*
* 1997 - 01 - 28 Modified by Finn Arne Gangstad to make timers scale better .
*
* 1997 - 09 - 10 Updated NTP code according to technical memorandum Jan ' 96
* " A Kernel Model for Precision Timekeeping " by Dave Mills
* 1998 - 12 - 24 Fixed a xtime SMP race ( we need the xtime_lock rw spinlock to
* serialize accesses to xtime / lost_ticks ) .
* Copyright ( C ) 1998 Andrea Arcangeli
* 1999 - 03 - 10 Improved NTP compatibility by Ulrich Windl
* 2002 - 05 - 31 Move sys_sysinfo here and make its locking sane , Robert Love
* 2000 - 10 - 05 Implemented scalable SMP per - CPU timer handling .
* Copyright ( C ) 2000 , 2001 , 2002 Ingo Molnar
* Designed by David S . Miller , Alexey Kuznetsov and Ingo Molnar
*/
# include <linux/kernel_stat.h>
# include <linux/module.h>
# include <linux/interrupt.h>
# include <linux/percpu.h>
# include <linux/init.h>
# include <linux/mm.h>
# include <linux/swap.h>
# include <linux/notifier.h>
# include <linux/thread_info.h>
# include <linux/time.h>
# include <linux/jiffies.h>
# include <linux/posix-timers.h>
# include <linux/cpu.h>
# include <linux/syscalls.h>
2006-01-08 12:02:17 +03:00
# include <linux/delay.h>
2005-04-17 02:20:36 +04:00
# include <asm/uaccess.h>
# include <asm/unistd.h>
# include <asm/div64.h>
# include <asm/timex.h>
# include <asm/io.h>
# ifdef CONFIG_TIME_INTERPOLATION
static void time_interpolator_update ( long delta_nsec ) ;
# else
# define time_interpolator_update(x)
# endif
2005-10-31 02:03:00 +03:00
u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES ;
EXPORT_SYMBOL ( jiffies_64 ) ;
2005-04-17 02:20:36 +04:00
/*
* per - CPU timer vector definitions :
*/
# define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6)
# define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8)
# define TVN_SIZE (1 << TVN_BITS)
# define TVR_SIZE (1 << TVR_BITS)
# define TVN_MASK (TVN_SIZE - 1)
# define TVR_MASK (TVR_SIZE - 1)
typedef struct tvec_s {
struct list_head vec [ TVN_SIZE ] ;
} tvec_t ;
typedef struct tvec_root_s {
struct list_head vec [ TVR_SIZE ] ;
} tvec_root_t ;
struct tvec_t_base_s {
2006-03-31 14:30:30 +04:00
spinlock_t lock ;
struct timer_list * running_timer ;
2005-04-17 02:20:36 +04:00
unsigned long timer_jiffies ;
tvec_root_t tv1 ;
tvec_t tv2 ;
tvec_t tv3 ;
tvec_t tv4 ;
tvec_t tv5 ;
} ____cacheline_aligned_in_smp ;
typedef struct tvec_t_base_s tvec_base_t ;
2006-04-11 09:53:58 +04:00
2006-03-31 14:30:30 +04:00
tvec_base_t boot_tvec_bases ;
EXPORT_SYMBOL ( boot_tvec_bases ) ;
2006-07-30 14:04:14 +04:00
static DEFINE_PER_CPU ( tvec_base_t * , tvec_bases ) = & boot_tvec_bases ;
2005-04-17 02:20:36 +04:00
static inline void set_running_timer ( tvec_base_t * base ,
struct timer_list * timer )
{
# ifdef CONFIG_SMP
2006-03-31 14:30:30 +04:00
base - > running_timer = timer ;
2005-04-17 02:20:36 +04:00
# endif
}
static void internal_add_timer ( tvec_base_t * base , struct timer_list * timer )
{
unsigned long expires = timer - > expires ;
unsigned long idx = expires - base - > timer_jiffies ;
struct list_head * vec ;
if ( idx < TVR_SIZE ) {
int i = expires & TVR_MASK ;
vec = base - > tv1 . vec + i ;
} else if ( idx < 1 < < ( TVR_BITS + TVN_BITS ) ) {
int i = ( expires > > TVR_BITS ) & TVN_MASK ;
vec = base - > tv2 . vec + i ;
} else if ( idx < 1 < < ( TVR_BITS + 2 * TVN_BITS ) ) {
int i = ( expires > > ( TVR_BITS + TVN_BITS ) ) & TVN_MASK ;
vec = base - > tv3 . vec + i ;
} else if ( idx < 1 < < ( TVR_BITS + 3 * TVN_BITS ) ) {
int i = ( expires > > ( TVR_BITS + 2 * TVN_BITS ) ) & TVN_MASK ;
vec = base - > tv4 . vec + i ;
} else if ( ( signed long ) idx < 0 ) {
/*
* Can happen if you add a timer with expires = = jiffies ,
* or you set a timer to go off in the past
*/
vec = base - > tv1 . vec + ( base - > timer_jiffies & TVR_MASK ) ;
} else {
int i ;
/* If the timeout is larger than 0xffffffff on 64-bit
* architectures then we use the maximum timeout :
*/
if ( idx > 0xffffffffUL ) {
idx = 0xffffffffUL ;
expires = idx + base - > timer_jiffies ;
}
i = ( expires > > ( TVR_BITS + 3 * TVN_BITS ) ) & TVN_MASK ;
vec = base - > tv5 . vec + i ;
}
/*
* Timers are FIFO :
*/
list_add_tail ( & timer - > entry , vec ) ;
}
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync have checked all cpus and before it will recheck
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes it's base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
/***
* init_timer - initialize a timer .
* @ timer : the timer to be initialized
*
* init_timer ( ) must be done to a timer prior calling * any * of the
* other timer functions .
*/
void fastcall init_timer ( struct timer_list * timer )
{
timer - > entry . next = NULL ;
[PATCH] Define __raw_get_cpu_var and use it
There are several instances of per_cpu(foo, raw_smp_processor_id()), which
is semantically equivalent to __get_cpu_var(foo) but without the warning
that smp_processor_id() can give if CONFIG_DEBUG_PREEMPT is enabled. For
those architectures with optimized per-cpu implementations, namely ia64,
powerpc, s390, sparc64 and x86_64, per_cpu() turns into more and slower
code than __get_cpu_var(), so it would be preferable to use __get_cpu_var
on those platforms.
This defines a __raw_get_cpu_var(x) macro which turns into per_cpu(x,
raw_smp_processor_id()) on architectures that use the generic per-cpu
implementation, and turns into __get_cpu_var(x) on the architectures that
have an optimized per-cpu implementation.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Acked-by: David S. Miller <davem@davemloft.net>
Acked-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-25 16:47:14 +04:00
timer - > base = __raw_get_cpu_var ( tvec_bases ) ;
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync have checked all cpus and before it will recheck
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes it's base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
}
EXPORT_SYMBOL ( init_timer ) ;
static inline void detach_timer ( struct timer_list * timer ,
int clear_pending )
{
struct list_head * entry = & timer - > entry ;
__list_del ( entry - > prev , entry - > next ) ;
if ( clear_pending )
entry - > next = NULL ;
entry - > prev = LIST_POISON2 ;
}
/*
2006-03-31 14:30:30 +04:00
* We are using hashed locking : holding per_cpu ( tvec_bases ) . lock
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync have checked all cpus and before it will recheck
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes it's base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
* means that all timers which are tied to this base via timer - > base are
* locked , and the base itself is locked too .
*
* So __run_timers / migrate_timers can safely modify all timers which could
* be found on - > tvX lists .
*
* When the timer ' s base is locked , and the timer removed from list , it is
* possible to set timer - > base = NULL and drop the lock : the timer remains
* locked .
*/
2006-03-31 14:30:30 +04:00
static tvec_base_t * lock_timer_base ( struct timer_list * timer ,
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync have checked all cpus and before it will recheck
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes it's base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
unsigned long * flags )
{
2006-03-31 14:30:30 +04:00
tvec_base_t * base ;
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync have checked all cpus and before it will recheck
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes it's base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
for ( ; ; ) {
base = timer - > base ;
if ( likely ( base ! = NULL ) ) {
spin_lock_irqsave ( & base - > lock , * flags ) ;
if ( likely ( base = = timer - > base ) )
return base ;
/* The timer has migrated to another CPU */
spin_unlock_irqrestore ( & base - > lock , * flags ) ;
}
cpu_relax ( ) ;
}
}
2005-04-17 02:20:36 +04:00
int __mod_timer ( struct timer_list * timer , unsigned long expires )
{
2006-03-31 14:30:30 +04:00
tvec_base_t * base , * new_base ;
2005-04-17 02:20:36 +04:00
unsigned long flags ;
int ret = 0 ;
BUG_ON ( ! timer - > function ) ;
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync have checked all cpus and before it will recheck
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes it's base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
base = lock_timer_base ( timer , & flags ) ;
if ( timer_pending ( timer ) ) {
detach_timer ( timer , 0 ) ;
ret = 1 ;
}
2006-03-24 14:15:54 +03:00
new_base = __get_cpu_var ( tvec_bases ) ;
2005-04-17 02:20:36 +04:00
2006-03-31 14:30:30 +04:00
if ( base ! = new_base ) {
2005-04-17 02:20:36 +04:00
/*
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync have checked all cpus and before it will recheck
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes it's base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
* We are trying to schedule the timer on the local CPU .
* However we can ' t change timer ' s base while it is running ,
* otherwise del_timer_sync ( ) can ' t detect that the timer ' s
* handler yet has not finished . This also guarantees that
* the timer is serialized wrt itself .
2005-04-17 02:20:36 +04:00
*/
2006-03-31 14:30:31 +04:00
if ( likely ( base - > running_timer ! = timer ) ) {
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync have checked all cpus and before it will recheck
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes it's base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
/* See the comment in lock_timer_base() */
timer - > base = NULL ;
spin_unlock ( & base - > lock ) ;
2006-03-31 14:30:31 +04:00
base = new_base ;
spin_lock ( & base - > lock ) ;
timer - > base = base ;
2005-04-17 02:20:36 +04:00
}
}
timer - > expires = expires ;
2006-03-31 14:30:31 +04:00
internal_add_timer ( base , timer ) ;
spin_unlock_irqrestore ( & base - > lock , flags ) ;
2005-04-17 02:20:36 +04:00
return ret ;
}
EXPORT_SYMBOL ( __mod_timer ) ;
/***
* add_timer_on - start a timer on a particular CPU
* @ timer : the timer to be added
* @ cpu : the CPU to start it on
*
* This is not very scalable on SMP . Double adds are not possible .
*/
void add_timer_on ( struct timer_list * timer , int cpu )
{
2006-03-24 14:15:54 +03:00
tvec_base_t * base = per_cpu ( tvec_bases , cpu ) ;
2005-04-17 02:20:36 +04:00
unsigned long flags ;
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync have checked all cpus and before it will recheck
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes it's base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
2005-04-17 02:20:36 +04:00
BUG_ON ( timer_pending ( timer ) | | ! timer - > function ) ;
2006-03-31 14:30:30 +04:00
spin_lock_irqsave ( & base - > lock , flags ) ;
timer - > base = base ;
2005-04-17 02:20:36 +04:00
internal_add_timer ( base , timer ) ;
2006-03-31 14:30:30 +04:00
spin_unlock_irqrestore ( & base - > lock , flags ) ;
2005-04-17 02:20:36 +04:00
}
/***
* mod_timer - modify a timer ' s timeout
* @ timer : the timer to be modified
*
* mod_timer is a more efficient way to update the expire field of an
* active timer ( if the timer is inactive it will be activated )
*
* mod_timer ( timer , expires ) is equivalent to :
*
* del_timer ( timer ) ; timer - > expires = expires ; add_timer ( timer ) ;
*
* Note that if there are multiple unserialized concurrent users of the
* same timer , then mod_timer ( ) is the only safe way to modify the timeout ,
* since add_timer ( ) cannot modify an already running timer .
*
* The function returns whether it has modified a pending timer or not .
* ( ie . mod_timer ( ) of an inactive timer returns 0 , mod_timer ( ) of an
* active timer returns 1. )
*/
int mod_timer ( struct timer_list * timer , unsigned long expires )
{
BUG_ON ( ! timer - > function ) ;
/*
* This is a common optimization triggered by the
* networking code - if the timer is re - modified
* to be the same thing then just return :
*/
if ( timer - > expires = = expires & & timer_pending ( timer ) )
return 1 ;
return __mod_timer ( timer , expires ) ;
}
EXPORT_SYMBOL ( mod_timer ) ;
/***
* del_timer - deactive a timer .
* @ timer : the timer to be deactivated
*
* del_timer ( ) deactivates a timer - this works on both active and inactive
* timers .
*
* The function returns whether it has deactivated a pending timer or not .
* ( ie . del_timer ( ) of an inactive timer returns 0 , del_timer ( ) of an
* active timer returns 1. )
*/
int del_timer ( struct timer_list * timer )
{
2006-03-31 14:30:30 +04:00
tvec_base_t * base ;
2005-04-17 02:20:36 +04:00
unsigned long flags ;
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync have checked all cpus and before it will recheck
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes it's base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
int ret = 0 ;
2005-04-17 02:20:36 +04:00
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync have checked all cpus and before it will recheck
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes it's base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
if ( timer_pending ( timer ) ) {
base = lock_timer_base ( timer , & flags ) ;
if ( timer_pending ( timer ) ) {
detach_timer ( timer , 1 ) ;
ret = 1 ;
}
2005-04-17 02:20:36 +04:00
spin_unlock_irqrestore ( & base - > lock , flags ) ;
}
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync have checked all cpus and before it will recheck
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes it's base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
return ret ;
2005-04-17 02:20:36 +04:00
}
EXPORT_SYMBOL ( del_timer ) ;
# ifdef CONFIG_SMP
2005-06-23 11:08:59 +04:00
/*
* This function tries to deactivate a timer . Upon successful ( ret > = 0 )
* exit the timer is not queued and the handler is not running on any CPU .
*
* It must not be called from interrupt contexts .
*/
int try_to_del_timer_sync ( struct timer_list * timer )
{
2006-03-31 14:30:30 +04:00
tvec_base_t * base ;
2005-06-23 11:08:59 +04:00
unsigned long flags ;
int ret = - 1 ;
base = lock_timer_base ( timer , & flags ) ;
if ( base - > running_timer = = timer )
goto out ;
ret = 0 ;
if ( timer_pending ( timer ) ) {
detach_timer ( timer , 1 ) ;
ret = 1 ;
}
out :
spin_unlock_irqrestore ( & base - > lock , flags ) ;
return ret ;
}
2005-04-17 02:20:36 +04:00
/***
* del_timer_sync - deactivate a timer and wait for the handler to finish .
* @ timer : the timer to be deactivated
*
* This function only differs from del_timer ( ) on SMP : besides deactivating
* the timer it also makes sure the handler has finished executing on other
* CPUs .
*
* Synchronization rules : callers must prevent restarting of the timer ,
* otherwise this function is meaningless . It must not be called from
* interrupt contexts . The caller must not hold locks which would prevent
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync have checked all cpus and before it will recheck
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes it's base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
* completion of the timer ' s handler . The timer ' s handler must not call
* add_timer_on ( ) . Upon exit the timer is not queued and the handler is
* not running on any CPU .
2005-04-17 02:20:36 +04:00
*
* The function returns whether it has deactivated a pending timer or not .
*/
int del_timer_sync ( struct timer_list * timer )
{
2005-06-23 11:08:59 +04:00
for ( ; ; ) {
int ret = try_to_del_timer_sync ( timer ) ;
if ( ret > = 0 )
return ret ;
2006-07-14 11:24:06 +04:00
cpu_relax ( ) ;
2005-06-23 11:08:59 +04:00
}
2005-04-17 02:20:36 +04:00
}
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync have checked all cpus and before it will recheck
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes it's base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
EXPORT_SYMBOL ( del_timer_sync ) ;
2005-04-17 02:20:36 +04:00
# endif
static int cascade ( tvec_base_t * base , tvec_t * tv , int index )
{
/* cascade all the timers from tv up one level */
[PATCH] When CONFIG_BASE_SMALL=1, cascade() may enter an infinite loop
When CONFIG_BASE_SAMLL=1, cascade() in may enter the infinite loop.
Because of CONFIG_BASE_SMALL=1(TVR_BITS=6 and TVN_BITS=4), the list
base->tv5 may cascade into base->tv5. So, the kernel enters the infinite
loop in the function cascade().
I created a test module to verify this bug, and a patch to fix it.
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/timer.h>
#if 0
#include <linux/kdb.h>
#else
#define kdb_printf printk
#endif
#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6)
#define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8)
#define TVN_SIZE (1 << TVN_BITS)
#define TVR_SIZE (1 << TVR_BITS)
#define TVN_MASK (TVN_SIZE - 1)
#define TVR_MASK (TVR_SIZE - 1)
#define TV_SIZE(N) (N*TVN_BITS + TVR_BITS)
struct timer_list timer0;
struct timer_list dummy_timer1;
struct timer_list dummy_timer2;
void dummy_timer_fun(unsigned long data) {
}
unsigned long j=0;
void check_timer_base(unsigned long data)
{
kdb_printf("check_timer_base %08x\n",jiffies);
mod_timer(&timer0,(jiffies & (~0xFFF)) + 0x1FFF);
}
int init_module(void)
{
init_timer(&timer0);
timer0.data = (unsigned long)0;
timer0.function = check_timer_base;
mod_timer(&timer0,jiffies+1);
init_timer(&dummy_timer1);
dummy_timer1.data = (unsigned long)0;
dummy_timer1.function = dummy_timer_fun;
init_timer(&dummy_timer2);
dummy_timer2.data = (unsigned long)0;
dummy_timer2.function = dummy_timer_fun;
j=jiffies;
j&=(~((1<<TV_SIZE(3))-1));
j+=(1<<TV_SIZE(3));
j+=(1<<TV_SIZE(4));
kdb_printf("mod_timer %08x\n",j);
mod_timer(&dummy_timer1, j );
mod_timer(&dummy_timer2, j );
return 0;
}
void cleanup_module()
{
del_timer_sync(&timer0);
del_timer_sync(&dummy_timer1);
del_timer_sync(&dummy_timer2);
}
(Cleanups from Oleg)
[oleg@tv-sign.ru: use list_replace_init()]
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Matt Mackall <mpm@selenic.com>
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-23 13:05:56 +04:00
struct timer_list * timer , * tmp ;
struct list_head tv_list ;
list_replace_init ( tv - > vec + index , & tv_list ) ;
2005-04-17 02:20:36 +04:00
/*
[PATCH] When CONFIG_BASE_SMALL=1, cascade() may enter an infinite loop
When CONFIG_BASE_SAMLL=1, cascade() in may enter the infinite loop.
Because of CONFIG_BASE_SMALL=1(TVR_BITS=6 and TVN_BITS=4), the list
base->tv5 may cascade into base->tv5. So, the kernel enters the infinite
loop in the function cascade().
I created a test module to verify this bug, and a patch to fix it.
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/timer.h>
#if 0
#include <linux/kdb.h>
#else
#define kdb_printf printk
#endif
#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6)
#define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8)
#define TVN_SIZE (1 << TVN_BITS)
#define TVR_SIZE (1 << TVR_BITS)
#define TVN_MASK (TVN_SIZE - 1)
#define TVR_MASK (TVR_SIZE - 1)
#define TV_SIZE(N) (N*TVN_BITS + TVR_BITS)
struct timer_list timer0;
struct timer_list dummy_timer1;
struct timer_list dummy_timer2;
void dummy_timer_fun(unsigned long data) {
}
unsigned long j=0;
void check_timer_base(unsigned long data)
{
kdb_printf("check_timer_base %08x\n",jiffies);
mod_timer(&timer0,(jiffies & (~0xFFF)) + 0x1FFF);
}
int init_module(void)
{
init_timer(&timer0);
timer0.data = (unsigned long)0;
timer0.function = check_timer_base;
mod_timer(&timer0,jiffies+1);
init_timer(&dummy_timer1);
dummy_timer1.data = (unsigned long)0;
dummy_timer1.function = dummy_timer_fun;
init_timer(&dummy_timer2);
dummy_timer2.data = (unsigned long)0;
dummy_timer2.function = dummy_timer_fun;
j=jiffies;
j&=(~((1<<TV_SIZE(3))-1));
j+=(1<<TV_SIZE(3));
j+=(1<<TV_SIZE(4));
kdb_printf("mod_timer %08x\n",j);
mod_timer(&dummy_timer1, j );
mod_timer(&dummy_timer2, j );
return 0;
}
void cleanup_module()
{
del_timer_sync(&timer0);
del_timer_sync(&dummy_timer1);
del_timer_sync(&dummy_timer2);
}
(Cleanups from Oleg)
[oleg@tv-sign.ru: use list_replace_init()]
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Matt Mackall <mpm@selenic.com>
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-23 13:05:56 +04:00
* We are removing _all_ timers from the list , so we
* don ' t have to detach them individually .
2005-04-17 02:20:36 +04:00
*/
[PATCH] When CONFIG_BASE_SMALL=1, cascade() may enter an infinite loop
When CONFIG_BASE_SAMLL=1, cascade() in may enter the infinite loop.
Because of CONFIG_BASE_SMALL=1(TVR_BITS=6 and TVN_BITS=4), the list
base->tv5 may cascade into base->tv5. So, the kernel enters the infinite
loop in the function cascade().
I created a test module to verify this bug, and a patch to fix it.
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/timer.h>
#if 0
#include <linux/kdb.h>
#else
#define kdb_printf printk
#endif
#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6)
#define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8)
#define TVN_SIZE (1 << TVN_BITS)
#define TVR_SIZE (1 << TVR_BITS)
#define TVN_MASK (TVN_SIZE - 1)
#define TVR_MASK (TVR_SIZE - 1)
#define TV_SIZE(N) (N*TVN_BITS + TVR_BITS)
struct timer_list timer0;
struct timer_list dummy_timer1;
struct timer_list dummy_timer2;
void dummy_timer_fun(unsigned long data) {
}
unsigned long j=0;
void check_timer_base(unsigned long data)
{
kdb_printf("check_timer_base %08x\n",jiffies);
mod_timer(&timer0,(jiffies & (~0xFFF)) + 0x1FFF);
}
int init_module(void)
{
init_timer(&timer0);
timer0.data = (unsigned long)0;
timer0.function = check_timer_base;
mod_timer(&timer0,jiffies+1);
init_timer(&dummy_timer1);
dummy_timer1.data = (unsigned long)0;
dummy_timer1.function = dummy_timer_fun;
init_timer(&dummy_timer2);
dummy_timer2.data = (unsigned long)0;
dummy_timer2.function = dummy_timer_fun;
j=jiffies;
j&=(~((1<<TV_SIZE(3))-1));
j+=(1<<TV_SIZE(3));
j+=(1<<TV_SIZE(4));
kdb_printf("mod_timer %08x\n",j);
mod_timer(&dummy_timer1, j );
mod_timer(&dummy_timer2, j );
return 0;
}
void cleanup_module()
{
del_timer_sync(&timer0);
del_timer_sync(&dummy_timer1);
del_timer_sync(&dummy_timer2);
}
(Cleanups from Oleg)
[oleg@tv-sign.ru: use list_replace_init()]
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Matt Mackall <mpm@selenic.com>
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-23 13:05:56 +04:00
list_for_each_entry_safe ( timer , tmp , & tv_list , entry ) {
BUG_ON ( timer - > base ! = base ) ;
internal_add_timer ( base , timer ) ;
2005-04-17 02:20:36 +04:00
}
return index ;
}
/***
* __run_timers - run all expired timers ( if any ) on this CPU .
* @ base : the timer vector to be processed .
*
* This function cascades all vectors and executes all expired timer
* vectors .
*/
2006-07-30 14:03:38 +04:00
# define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK)
2005-04-17 02:20:36 +04:00
static inline void __run_timers ( tvec_base_t * base )
{
struct timer_list * timer ;
2006-03-31 14:30:30 +04:00
spin_lock_irq ( & base - > lock ) ;
2005-04-17 02:20:36 +04:00
while ( time_after_eq ( jiffies , base - > timer_jiffies ) ) {
2006-06-23 13:05:55 +04:00
struct list_head work_list ;
2005-04-17 02:20:36 +04:00
struct list_head * head = & work_list ;
int index = base - > timer_jiffies & TVR_MASK ;
2006-06-23 13:05:55 +04:00
2005-04-17 02:20:36 +04:00
/*
* Cascade timers :
*/
if ( ! index & &
( ! cascade ( base , & base - > tv2 , INDEX ( 0 ) ) ) & &
( ! cascade ( base , & base - > tv3 , INDEX ( 1 ) ) ) & &
! cascade ( base , & base - > tv4 , INDEX ( 2 ) ) )
cascade ( base , & base - > tv5 , INDEX ( 3 ) ) ;
2006-06-23 13:05:55 +04:00
+ + base - > timer_jiffies ;
list_replace_init ( base - > tv1 . vec + index , & work_list ) ;
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync have checked all cpus and before it will recheck
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes it's base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
while ( ! list_empty ( head ) ) {
2005-04-17 02:20:36 +04:00
void ( * fn ) ( unsigned long ) ;
unsigned long data ;
timer = list_entry ( head - > next , struct timer_list , entry ) ;
fn = timer - > function ;
data = timer - > data ;
set_running_timer ( base , timer ) ;
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync have checked all cpus and before it will recheck
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes it's base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
detach_timer ( timer , 1 ) ;
2006-03-31 14:30:30 +04:00
spin_unlock_irq ( & base - > lock ) ;
2005-04-17 02:20:36 +04:00
{
2005-06-23 11:09:09 +04:00
int preempt_count = preempt_count ( ) ;
2005-04-17 02:20:36 +04:00
fn ( data ) ;
if ( preempt_count ! = preempt_count ( ) ) {
2005-06-23 11:09:09 +04:00
printk ( KERN_WARNING " huh, entered %p "
" with preempt_count %08x, exited "
" with %08x? \n " ,
fn , preempt_count ,
preempt_count ( ) ) ;
2005-04-17 02:20:36 +04:00
BUG ( ) ;
}
}
2006-03-31 14:30:30 +04:00
spin_lock_irq ( & base - > lock ) ;
2005-04-17 02:20:36 +04:00
}
}
set_running_timer ( base , NULL ) ;
2006-03-31 14:30:30 +04:00
spin_unlock_irq ( & base - > lock ) ;
2005-04-17 02:20:36 +04:00
}
# ifdef CONFIG_NO_IDLE_HZ
/*
* Find out when the next timer event is due to happen . This
* is used on S / 390 to stop all activity when a cpus is idle .
* This functions needs to be called disabled .
*/
unsigned long next_timer_interrupt ( void )
{
tvec_base_t * base ;
struct list_head * list ;
struct timer_list * nte ;
unsigned long expires ;
2006-03-07 02:42:45 +03:00
unsigned long hr_expires = MAX_JIFFY_OFFSET ;
ktime_t hr_delta ;
2005-04-17 02:20:36 +04:00
tvec_t * varray [ 4 ] ;
int i , j ;
2006-03-07 02:42:45 +03:00
hr_delta = hrtimer_get_next_event ( ) ;
if ( hr_delta . tv64 ! = KTIME_MAX ) {
struct timespec tsdelta ;
tsdelta = ktime_to_timespec ( hr_delta ) ;
hr_expires = timespec_to_jiffies ( & tsdelta ) ;
if ( hr_expires < 3 )
return hr_expires + jiffies ;
}
hr_expires + = jiffies ;
2006-03-24 14:15:54 +03:00
base = __get_cpu_var ( tvec_bases ) ;
2006-03-31 14:30:30 +04:00
spin_lock ( & base - > lock ) ;
2005-04-17 02:20:36 +04:00
expires = base - > timer_jiffies + ( LONG_MAX > > 1 ) ;
2006-02-01 13:56:41 +03:00
list = NULL ;
2005-04-17 02:20:36 +04:00
/* Look for timer events in tv1. */
j = base - > timer_jiffies & TVR_MASK ;
do {
list_for_each_entry ( nte , base - > tv1 . vec + j , entry ) {
expires = nte - > expires ;
if ( j < ( base - > timer_jiffies & TVR_MASK ) )
list = base - > tv2 . vec + ( INDEX ( 0 ) ) ;
goto found ;
}
j = ( j + 1 ) & TVR_MASK ;
} while ( j ! = ( base - > timer_jiffies & TVR_MASK ) ) ;
/* Check tv2-tv5. */
varray [ 0 ] = & base - > tv2 ;
varray [ 1 ] = & base - > tv3 ;
varray [ 2 ] = & base - > tv4 ;
varray [ 3 ] = & base - > tv5 ;
for ( i = 0 ; i < 4 ; i + + ) {
j = INDEX ( i ) ;
do {
if ( list_empty ( varray [ i ] - > vec + j ) ) {
j = ( j + 1 ) & TVN_MASK ;
continue ;
}
list_for_each_entry ( nte , varray [ i ] - > vec + j , entry )
if ( time_before ( nte - > expires , expires ) )
expires = nte - > expires ;
if ( j < ( INDEX ( i ) ) & & i < 3 )
list = varray [ i + 1 ] - > vec + ( INDEX ( i + 1 ) ) ;
goto found ;
} while ( j ! = ( INDEX ( i ) ) ) ;
}
found :
if ( list ) {
/*
* The search wrapped . We need to look at the next list
* from next tv element that would cascade into tv element
* where we found the timer element .
*/
list_for_each_entry ( nte , list , entry ) {
if ( time_before ( nte - > expires , expires ) )
expires = nte - > expires ;
}
}
2006-03-31 14:30:30 +04:00
spin_unlock ( & base - > lock ) ;
2006-03-07 02:42:45 +03:00
2006-05-21 02:00:24 +04:00
/*
* It can happen that other CPUs service timer IRQs and increment
* jiffies , but we have not yet got a local timer tick to process
* the timer wheels . In that case , the expiry time can be before
* jiffies , but since the high - resolution timer here is relative to
* jiffies , the default expression when high - resolution timers are
* not active ,
*
* time_before ( MAX_JIFFY_OFFSET + jiffies , expires )
*
* would falsely evaluate to true . If that is the case , just
* return jiffies so that we can immediately fire the local timer
*/
if ( time_before ( expires , jiffies ) )
return jiffies ;
2006-03-07 02:42:45 +03:00
if ( time_before ( hr_expires , expires ) )
return hr_expires ;
2005-04-17 02:20:36 +04:00
return expires ;
}
# endif
/******************************************************************/
/*
* Timekeeping variables
*/
unsigned long tick_usec = TICK_USEC ; /* USER_HZ period (usec) */
unsigned long tick_nsec = TICK_NSEC ; /* ACTHZ period (nsec) */
/*
* The current time
* wall_to_monotonic is what we need to add to xtime ( or xtime corrected
* for sub jiffie times ) to get to monotonic time . Monotonic is pegged
* at zero at system boot time , so wall_to_monotonic will be negative ,
* however , we will ALWAYS keep the tv_nsec part positive so we can use
* the usual normalization .
*/
struct timespec xtime __attribute__ ( ( aligned ( 16 ) ) ) ;
struct timespec wall_to_monotonic __attribute__ ( ( aligned ( 16 ) ) ) ;
EXPORT_SYMBOL ( xtime ) ;
/* Don't completely fail for HZ > 500. */
int tickadj = 500 / HZ ? : 1 ; /* microsecs */
/*
* phase - lock loop variables
*/
/* TIME_ERROR prevents overwriting the CMOS clock */
int time_state = TIME_OK ; /* clock synchronization status */
int time_status = STA_UNSYNC ; /* clock status bits */
long time_offset ; /* time adjustment (us) */
long time_constant = 2 ; /* pll time constant */
long time_tolerance = MAXFREQ ; /* frequency tolerance (ppm) */
long time_precision = 1 ; /* clock precision (us) */
long time_maxerror = NTP_PHASE_LIMIT ; /* maximum error (us) */
long time_esterror = NTP_PHASE_LIMIT ; /* estimated error (us) */
long time_freq = ( ( ( NSEC_PER_SEC + HZ / 2 ) % HZ - HZ / 2 ) < < SHIFT_USEC ) / NSEC_PER_USEC ;
/* frequency offset (scaled ppm)*/
static long time_adj ; /* tick adjust (scaled 1 / HZ) */
long time_reftime ; /* time at last adjustment (s) */
long time_adjust ;
long time_next_adjust ;
/*
* this routine handles the overflow of the microsecond field
*
* The tricky bits of code to handle the accurate clock support
* were provided by Dave Mills ( Mills @ UDEL . EDU ) of NTP fame .
* They were originally developed for SUN and DEC kernels .
* All the kudos should go to Dave for this stuff .
*
*/
static void second_overflow ( void )
{
2005-10-31 02:01:42 +03:00
long ltemp ;
/* Bump the maxerror field */
time_maxerror + = time_tolerance > > SHIFT_USEC ;
if ( time_maxerror > NTP_PHASE_LIMIT ) {
time_maxerror = NTP_PHASE_LIMIT ;
time_status | = STA_UNSYNC ;
2005-04-17 02:20:36 +04:00
}
2005-10-31 02:01:42 +03:00
/*
* Leap second processing . If in leap - insert state at the end of the
* day , the system clock is set back one second ; if in leap - delete
* state , the system clock is set ahead one second . The microtime ( )
* routine or external clock driver will insure that reported time is
* always monotonic . The ugly divides should be replaced .
*/
switch ( time_state ) {
case TIME_OK :
if ( time_status & STA_INS )
time_state = TIME_INS ;
else if ( time_status & STA_DEL )
time_state = TIME_DEL ;
break ;
case TIME_INS :
if ( xtime . tv_sec % 86400 = = 0 ) {
xtime . tv_sec - - ;
wall_to_monotonic . tv_sec + + ;
/*
* The timer interpolator will make time change
* gradually instead of an immediate jump by one second
*/
time_interpolator_update ( - NSEC_PER_SEC ) ;
time_state = TIME_OOP ;
clock_was_set ( ) ;
printk ( KERN_NOTICE " Clock: inserting leap second "
" 23:59:60 UTC \n " ) ;
}
break ;
case TIME_DEL :
if ( ( xtime . tv_sec + 1 ) % 86400 = = 0 ) {
xtime . tv_sec + + ;
wall_to_monotonic . tv_sec - - ;
/*
* Use of time interpolator for a gradual change of
* time
*/
time_interpolator_update ( NSEC_PER_SEC ) ;
time_state = TIME_WAIT ;
clock_was_set ( ) ;
printk ( KERN_NOTICE " Clock: deleting leap second "
" 23:59:59 UTC \n " ) ;
}
break ;
case TIME_OOP :
time_state = TIME_WAIT ;
break ;
case TIME_WAIT :
if ( ! ( time_status & ( STA_INS | STA_DEL ) ) )
time_state = TIME_OK ;
2005-04-17 02:20:36 +04:00
}
2005-10-31 02:01:42 +03:00
/*
* Compute the phase adjustment for the next second . In PLL mode , the
* offset is reduced by a fixed factor times the time constant . In FLL
* mode the offset is used directly . In either mode , the maximum phase
* adjustment for each second is clamped so as to spread the adjustment
* over not more than the number of seconds between updates .
*/
2005-04-17 02:20:36 +04:00
ltemp = time_offset ;
if ( ! ( time_status & STA_FLL ) )
2005-10-31 02:01:42 +03:00
ltemp = shift_right ( ltemp , SHIFT_KG + time_constant ) ;
ltemp = min ( ltemp , ( MAXPHASE / MINSEC ) < < SHIFT_UPDATE ) ;
ltemp = max ( ltemp , - ( MAXPHASE / MINSEC ) < < SHIFT_UPDATE ) ;
2005-04-17 02:20:36 +04:00
time_offset - = ltemp ;
time_adj = ltemp < < ( SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE ) ;
2005-10-31 02:01:42 +03:00
/*
* Compute the frequency estimate and additional phase adjustment due
2006-03-25 14:08:28 +03:00
* to frequency error for the next second .
2005-10-31 02:01:42 +03:00
*/
2006-03-25 14:08:28 +03:00
ltemp = time_freq ;
2005-10-31 02:01:42 +03:00
time_adj + = shift_right ( ltemp , ( SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE ) ) ;
2005-04-17 02:20:36 +04:00
# if HZ == 100
2005-10-31 02:01:42 +03:00
/*
* Compensate for ( HZ = = 100 ) ! = ( 1 < < SHIFT_HZ ) . Add 25 % and 3.125 % to
* get 128.125 ; = > only 0.125 % error ( p . 14 )
*/
time_adj + = shift_right ( time_adj , 2 ) + shift_right ( time_adj , 5 ) ;
2005-04-17 02:20:36 +04:00
# endif
2005-10-30 04:15:42 +03:00
# if HZ == 250
2005-10-31 02:01:42 +03:00
/*
* Compensate for ( HZ = = 250 ) ! = ( 1 < < SHIFT_HZ ) . Add 1.5625 % and
* 0.78125 % to get 255.85938 ; = > only 0.05 % error ( p . 14 )
*/
time_adj + = shift_right ( time_adj , 6 ) + shift_right ( time_adj , 7 ) ;
2005-10-30 04:15:42 +03:00
# endif
2005-04-17 02:20:36 +04:00
# if HZ == 1000
2005-10-31 02:01:42 +03:00
/*
* Compensate for ( HZ = = 1000 ) ! = ( 1 < < SHIFT_HZ ) . Add 1.5625 % and
* 0.78125 % to get 1023.4375 ; = > only 0.05 % error ( p . 14 )
*/
time_adj + = shift_right ( time_adj , 6 ) + shift_right ( time_adj , 7 ) ;
2005-04-17 02:20:36 +04:00
# endif
}
2006-02-17 02:30:23 +03:00
/*
* Returns how many microseconds we need to add to xtime this tick
* in doing an adjustment requested with adjtime .
*/
static long adjtime_adjustment ( void )
2005-04-17 02:20:36 +04:00
{
2006-02-17 02:30:23 +03:00
long time_adjust_step ;
2005-04-17 02:20:36 +04:00
2006-02-17 02:30:23 +03:00
time_adjust_step = time_adjust ;
if ( time_adjust_step ) {
2005-10-31 02:01:42 +03:00
/*
* We are doing an adjtime thing . Prepare time_adjust_step to
* be within bounds . Note that a positive time_adjust means we
* want the clock to run faster .
*
* Limit the amount of the step to be in the range
* - tickadj . . + tickadj
*/
time_adjust_step = min ( time_adjust_step , ( long ) tickadj ) ;
time_adjust_step = max ( time_adjust_step , ( long ) - tickadj ) ;
2006-02-17 02:30:23 +03:00
}
return time_adjust_step ;
}
2005-10-31 02:01:42 +03:00
2006-02-17 02:30:23 +03:00
/* in the NTP reference this is called "hardclock()" */
2006-06-26 11:25:07 +04:00
static void update_ntp_one_tick ( void )
2006-02-17 02:30:23 +03:00
{
2006-06-26 11:25:07 +04:00
long time_adjust_step ;
2006-02-17 02:30:23 +03:00
time_adjust_step = adjtime_adjustment ( ) ;
if ( time_adjust_step )
2005-10-31 02:01:42 +03:00
/* Reduce by this step the amount of time left */
time_adjust - = time_adjust_step ;
2005-04-17 02:20:36 +04:00
/* Changes by adjtime() do not take effect till next tick. */
if ( time_next_adjust ! = 0 ) {
time_adjust = time_next_adjust ;
time_next_adjust = 0 ;
}
}
2006-02-17 02:30:23 +03:00
/*
* Return how long ticks are at the moment , that is , how much time
* update_wall_time_one_tick will add to xtime next time we call it
* ( assuming no calls to do_adjtimex in the meantime ) .
2006-06-26 11:25:07 +04:00
* The return value is in fixed - point nanoseconds shifted by the
* specified number of bits to the right of the binary point .
2006-02-17 02:30:23 +03:00
* This function has no side - effects .
*/
2006-06-26 11:25:18 +04:00
u64 current_tick_length ( void )
2006-02-17 02:30:23 +03:00
{
long delta_nsec ;
2006-06-26 11:25:07 +04:00
u64 ret ;
2006-02-17 02:30:23 +03:00
2006-06-26 11:25:07 +04:00
/* calculate the finest interval NTP will allow.
* ie : nanosecond value shifted by ( SHIFT_SCALE - 10 )
*/
2006-02-17 02:30:23 +03:00
delta_nsec = tick_nsec + adjtime_adjustment ( ) * 1000 ;
2006-06-26 11:25:18 +04:00
ret = ( u64 ) delta_nsec < < TICK_LENGTH_SHIFT ;
ret + = ( s64 ) time_adj < < ( TICK_LENGTH_SHIFT - ( SHIFT_SCALE - 10 ) ) ;
2006-06-26 11:25:07 +04:00
return ret ;
2006-02-17 02:30:23 +03:00
}
2006-06-26 11:25:06 +04:00
/* XXX - all of this timekeeping code should be later moved to time.c */
# include <linux/clocksource.h>
static struct clocksource * clock ; /* pointer to current clocksource */
2006-06-26 11:25:08 +04:00
# ifdef CONFIG_GENERIC_TIME
/**
* __get_nsec_offset - Returns nanoseconds since last call to periodic_hook
*
* private function , must hold xtime_lock lock when being
* called . Returns the number of nanoseconds since the
* last call to update_wall_time ( ) ( adjusted by NTP scaling )
*/
static inline s64 __get_nsec_offset ( void )
{
cycle_t cycle_now , cycle_delta ;
s64 ns_offset ;
/* read clocksource: */
2006-06-26 11:25:14 +04:00
cycle_now = clocksource_read ( clock ) ;
2006-06-26 11:25:08 +04:00
/* calculate the delta since the last update_wall_time: */
2006-06-26 11:25:18 +04:00
cycle_delta = ( cycle_now - clock - > cycle_last ) & clock - > mask ;
2006-06-26 11:25:08 +04:00
/* convert to nanoseconds: */
ns_offset = cyc2ns ( clock , cycle_delta ) ;
return ns_offset ;
}
/**
* __get_realtime_clock_ts - Returns the time of day in a timespec
* @ ts : pointer to the timespec to be set
*
* Returns the time of day in a timespec . Used by
* do_gettimeofday ( ) and get_realtime_clock_ts ( ) .
*/
static inline void __get_realtime_clock_ts ( struct timespec * ts )
{
unsigned long seq ;
s64 nsecs ;
do {
seq = read_seqbegin ( & xtime_lock ) ;
* ts = xtime ;
nsecs = __get_nsec_offset ( ) ;
} while ( read_seqretry ( & xtime_lock , seq ) ) ;
timespec_add_ns ( ts , nsecs ) ;
}
/**
2006-06-26 11:25:14 +04:00
* getnstimeofday - Returns the time of day in a timespec
2006-06-26 11:25:08 +04:00
* @ ts : pointer to the timespec to be set
*
* Returns the time of day in a timespec .
*/
void getnstimeofday ( struct timespec * ts )
{
__get_realtime_clock_ts ( ts ) ;
}
EXPORT_SYMBOL ( getnstimeofday ) ;
/**
* do_gettimeofday - Returns the time of day in a timeval
* @ tv : pointer to the timeval to be set
*
* NOTE : Users should be converted to using get_realtime_clock_ts ( )
*/
void do_gettimeofday ( struct timeval * tv )
{
struct timespec now ;
__get_realtime_clock_ts ( & now ) ;
tv - > tv_sec = now . tv_sec ;
tv - > tv_usec = now . tv_nsec / 1000 ;
}
EXPORT_SYMBOL ( do_gettimeofday ) ;
/**
* do_settimeofday - Sets the time of day
* @ tv : pointer to the timespec variable containing the new time
*
* Sets the time of day to the new time and update NTP and notify hrtimers
*/
int do_settimeofday ( struct timespec * tv )
{
unsigned long flags ;
time_t wtm_sec , sec = tv - > tv_sec ;
long wtm_nsec , nsec = tv - > tv_nsec ;
if ( ( unsigned long ) tv - > tv_nsec > = NSEC_PER_SEC )
return - EINVAL ;
write_seqlock_irqsave ( & xtime_lock , flags ) ;
nsec - = __get_nsec_offset ( ) ;
wtm_sec = wall_to_monotonic . tv_sec + ( xtime . tv_sec - sec ) ;
wtm_nsec = wall_to_monotonic . tv_nsec + ( xtime . tv_nsec - nsec ) ;
set_normalized_timespec ( & xtime , sec , nsec ) ;
set_normalized_timespec ( & wall_to_monotonic , wtm_sec , wtm_nsec ) ;
2006-07-10 15:44:32 +04:00
clock - > error = 0 ;
2006-06-26 11:25:08 +04:00
ntp_clear ( ) ;
write_sequnlock_irqrestore ( & xtime_lock , flags ) ;
/* signal hrtimers about time change */
clock_was_set ( ) ;
return 0 ;
}
EXPORT_SYMBOL ( do_settimeofday ) ;
/**
* change_clocksource - Swaps clocksources if a new one is available
*
* Accumulates current time interval and initializes new clocksource
*/
static int change_clocksource ( void )
{
struct clocksource * new ;
cycle_t now ;
u64 nsec ;
2006-06-26 11:25:14 +04:00
new = clocksource_get_next ( ) ;
2006-06-26 11:25:08 +04:00
if ( clock ! = new ) {
2006-06-26 11:25:14 +04:00
now = clocksource_read ( new ) ;
2006-06-26 11:25:08 +04:00
nsec = __get_nsec_offset ( ) ;
timespec_add_ns ( & xtime , nsec ) ;
clock = new ;
2006-06-26 11:25:18 +04:00
clock - > cycle_last = now ;
2006-06-26 11:25:08 +04:00
printk ( KERN_INFO " Time: %s clocksource has been installed. \n " ,
clock - > name ) ;
return 1 ;
} else if ( clock - > update_callback ) {
return clock - > update_callback ( ) ;
}
return 0 ;
}
# else
# define change_clocksource() (0)
# endif
/**
* timeofday_is_continuous - check to see if timekeeping is free running
*/
int timekeeping_is_continuous ( void )
{
unsigned long seq ;
int ret ;
do {
seq = read_seqbegin ( & xtime_lock ) ;
ret = clock - > is_continuous ;
} while ( read_seqretry ( & xtime_lock , seq ) ) ;
return ret ;
}
2005-04-17 02:20:36 +04:00
/*
2006-06-26 11:25:06 +04:00
* timekeeping_init - Initializes the clocksource and common timekeeping values
2005-04-17 02:20:36 +04:00
*/
2006-06-26 11:25:06 +04:00
void __init timekeeping_init ( void )
2005-04-17 02:20:36 +04:00
{
2006-06-26 11:25:06 +04:00
unsigned long flags ;
write_seqlock_irqsave ( & xtime_lock , flags ) ;
2006-06-26 11:25:14 +04:00
clock = clocksource_get_next ( ) ;
clocksource_calculate_interval ( clock , tick_nsec ) ;
2006-06-26 11:25:18 +04:00
clock - > cycle_last = clocksource_read ( clock ) ;
2006-06-26 11:25:06 +04:00
ntp_clear ( ) ;
write_sequnlock_irqrestore ( & xtime_lock , flags ) ;
}
2006-07-14 11:24:17 +04:00
static int timekeeping_suspended ;
2006-06-26 11:25:06 +04:00
/*
* timekeeping_resume - Resumes the generic timekeeping subsystem .
* @ dev : unused
*
* This is for the generic clocksource timekeeping .
* xtime / wall_to_monotonic / jiffies / wall_jiffies / etc are
* still managed by arch specific suspend / resume code .
*/
static int timekeeping_resume ( struct sys_device * dev )
{
unsigned long flags ;
write_seqlock_irqsave ( & xtime_lock , flags ) ;
/* restart the last cycle value */
2006-06-26 11:25:18 +04:00
clock - > cycle_last = clocksource_read ( clock ) ;
2006-07-14 11:24:17 +04:00
clock - > error = 0 ;
timekeeping_suspended = 0 ;
write_sequnlock_irqrestore ( & xtime_lock , flags ) ;
return 0 ;
}
static int timekeeping_suspend ( struct sys_device * dev , pm_message_t state )
{
unsigned long flags ;
write_seqlock_irqsave ( & xtime_lock , flags ) ;
timekeeping_suspended = 1 ;
2006-06-26 11:25:06 +04:00
write_sequnlock_irqrestore ( & xtime_lock , flags ) ;
return 0 ;
}
/* sysfs resume/suspend bits for timekeeping */
static struct sysdev_class timekeeping_sysclass = {
. resume = timekeeping_resume ,
2006-07-14 11:24:17 +04:00
. suspend = timekeeping_suspend ,
2006-06-26 11:25:06 +04:00
set_kset_name ( " timekeeping " ) ,
} ;
static struct sys_device device_timer = {
. id = 0 ,
. cls = & timekeeping_sysclass ,
} ;
static int __init timekeeping_init_device ( void )
{
int error = sysdev_class_register ( & timekeeping_sysclass ) ;
if ( ! error )
error = sysdev_register ( & device_timer ) ;
return error ;
}
device_initcall ( timekeeping_init_device ) ;
2006-06-26 11:25:18 +04:00
/*
2006-07-10 15:44:32 +04:00
* If the error is already larger , we look ahead even further
2006-06-26 11:25:18 +04:00
* to compensate for late or lost adjustments .
*/
2006-07-10 15:44:32 +04:00
static __always_inline int clocksource_bigadjust ( s64 error , s64 * interval , s64 * offset )
2006-06-26 11:25:18 +04:00
{
2006-07-10 15:44:32 +04:00
s64 tick_error , i ;
u32 look_ahead , adj ;
s32 error2 , mult ;
2006-06-26 11:25:18 +04:00
/*
2006-07-10 15:44:32 +04:00
* Use the current error value to determine how much to look ahead .
* The larger the error the slower we adjust for it to avoid problems
* with losing too many ticks , otherwise we would overadjust and
* produce an even larger error . The smaller the adjustment the
* faster we try to adjust for it , as lost ticks can do less harm
* here . This is tuned so that an error of about 1 msec is adusted
* within about 1 sec ( or 2 ^ 20 nsec in 2 ^ SHIFT_HZ ticks ) .
2006-06-26 11:25:18 +04:00
*/
2006-07-10 15:44:32 +04:00
error2 = clock - > error > > ( TICK_LENGTH_SHIFT + 22 - 2 * SHIFT_HZ ) ;
error2 = abs ( error2 ) ;
for ( look_ahead = 0 ; error2 > 0 ; look_ahead + + )
error2 > > = 2 ;
2006-06-26 11:25:18 +04:00
/*
2006-07-10 15:44:32 +04:00
* Now calculate the error in ( 1 < < look_ahead ) ticks , but first
* remove the single look ahead already included in the error .
2006-06-26 11:25:18 +04:00
*/
2006-07-10 15:44:32 +04:00
tick_error = current_tick_length ( ) > > ( TICK_LENGTH_SHIFT - clock - > shift + 1 ) ;
tick_error - = clock - > xtime_interval > > 1 ;
error = ( ( error - tick_error ) > > look_ahead ) + tick_error ;
/* Finally calculate the adjustment shift value. */
i = * interval ;
mult = 1 ;
if ( error < 0 ) {
error = - error ;
* interval = - * interval ;
* offset = - * offset ;
mult = - 1 ;
2006-06-26 11:25:18 +04:00
}
2006-07-10 15:44:32 +04:00
for ( adj = 0 ; error > i ; adj + + )
error > > = 1 ;
2006-06-26 11:25:18 +04:00
* interval < < = adj ;
* offset < < = adj ;
2006-07-10 15:44:32 +04:00
return mult < < adj ;
2006-06-26 11:25:18 +04:00
}
/*
* Adjust the multiplier to reduce the error value ,
* this is optimized for the most common adjustments of - 1 , 0 , 1 ,
* for other values we can do a bit more work .
*/
static void clocksource_adjust ( struct clocksource * clock , s64 offset )
{
s64 error , interval = clock - > cycle_interval ;
int adj ;
error = clock - > error > > ( TICK_LENGTH_SHIFT - clock - > shift - 1 ) ;
if ( error > interval ) {
2006-07-10 15:44:32 +04:00
error > > = 2 ;
if ( likely ( error < = interval ) )
adj = 1 ;
else
adj = clocksource_bigadjust ( error , & interval , & offset ) ;
2006-06-26 11:25:18 +04:00
} else if ( error < - interval ) {
2006-07-10 15:44:32 +04:00
error > > = 2 ;
if ( likely ( error > = - interval ) ) {
adj = - 1 ;
interval = - interval ;
offset = - offset ;
} else
adj = clocksource_bigadjust ( error , & interval , & offset ) ;
2006-06-26 11:25:18 +04:00
} else
return ;
clock - > mult + = adj ;
clock - > xtime_interval + = interval ;
clock - > xtime_nsec - = offset ;
clock - > error - = ( interval - offset ) < < ( TICK_LENGTH_SHIFT - clock - > shift ) ;
}
2006-06-26 11:25:06 +04:00
/*
* update_wall_time - Uses the current clocksource to increment the wall time
*
* Called from the timer interrupt , must hold a write on xtime_lock .
*/
static void update_wall_time ( void )
{
2006-06-26 11:25:18 +04:00
cycle_t offset ;
2006-06-26 11:25:06 +04:00
2006-07-14 11:24:17 +04:00
/* Make sure we're fully resumed: */
if ( unlikely ( timekeeping_suspended ) )
return ;
2006-06-26 11:25:07 +04:00
2006-06-26 11:25:18 +04:00
# ifdef CONFIG_GENERIC_TIME
offset = ( clocksource_read ( clock ) - clock - > cycle_last ) & clock - > mask ;
# else
offset = clock - > cycle_interval ;
# endif
2006-07-14 11:24:17 +04:00
clock - > xtime_nsec + = ( s64 ) xtime . tv_nsec < < clock - > shift ;
2006-06-26 11:25:06 +04:00
/* normally this loop will run just once, however in the
* case of lost or late ticks , it will accumulate correctly .
*/
2006-06-26 11:25:18 +04:00
while ( offset > = clock - > cycle_interval ) {
2006-06-26 11:25:06 +04:00
/* accumulate one interval */
2006-06-26 11:25:18 +04:00
clock - > xtime_nsec + = clock - > xtime_interval ;
clock - > cycle_last + = clock - > cycle_interval ;
offset - = clock - > cycle_interval ;
if ( clock - > xtime_nsec > = ( u64 ) NSEC_PER_SEC < < clock - > shift ) {
clock - > xtime_nsec - = ( u64 ) NSEC_PER_SEC < < clock - > shift ;
xtime . tv_sec + + ;
second_overflow ( ) ;
}
2006-06-26 11:25:06 +04:00
2006-06-26 11:25:07 +04:00
/* interpolator bits */
2006-06-26 11:25:18 +04:00
time_interpolator_update ( clock - > xtime_interval
2006-06-26 11:25:07 +04:00
> > clock - > shift ) ;
/* increment the NTP state machine */
update_ntp_one_tick ( ) ;
/* accumulate error between NTP and clock interval */
2006-06-26 11:25:18 +04:00
clock - > error + = current_tick_length ( ) ;
clock - > error - = clock - > xtime_interval < < ( TICK_LENGTH_SHIFT - clock - > shift ) ;
}
2006-06-26 11:25:07 +04:00
2006-06-26 11:25:18 +04:00
/* correct the clock when NTP error is too big */
clocksource_adjust ( clock , offset ) ;
2006-06-26 11:25:07 +04:00
/* store full nanoseconds into xtime */
2006-07-10 15:44:32 +04:00
xtime . tv_nsec = ( s64 ) clock - > xtime_nsec > > clock - > shift ;
2006-06-26 11:25:18 +04:00
clock - > xtime_nsec - = ( s64 ) xtime . tv_nsec < < clock - > shift ;
2006-06-26 11:25:08 +04:00
/* check to see if there is a new clocksource to use */
if ( change_clocksource ( ) ) {
2006-06-26 11:25:18 +04:00
clock - > error = 0 ;
clock - > xtime_nsec = 0 ;
2006-06-26 11:25:14 +04:00
clocksource_calculate_interval ( clock , tick_nsec ) ;
2006-06-26 11:25:08 +04:00
}
2005-04-17 02:20:36 +04:00
}
/*
* Called from the timer interrupt handler to charge one tick to the current
* process . user_tick is 1 if the tick is user time , 0 for system .
*/
void update_process_times ( int user_tick )
{
struct task_struct * p = current ;
int cpu = smp_processor_id ( ) ;
/* Note: this timer irq context must be accounted for as well. */
if ( user_tick )
account_user_time ( p , jiffies_to_cputime ( 1 ) ) ;
else
account_system_time ( p , HARDIRQ_OFFSET , jiffies_to_cputime ( 1 ) ) ;
run_local_timers ( ) ;
if ( rcu_pending ( cpu ) )
rcu_check_callbacks ( cpu , user_tick ) ;
scheduler_tick ( ) ;
run_posix_cpu_timers ( p ) ;
}
/*
* Nr of active tasks - counted in fixed - point numbers
*/
static unsigned long count_active_tasks ( void )
{
2006-03-31 14:31:21 +04:00
return nr_active ( ) * FIXED_1 ;
2005-04-17 02:20:36 +04:00
}
/*
* Hmm . . Changed this , as the GNU make sources ( load . c ) seems to
* imply that avenrun [ ] is the standard name for this kind of thing .
* Nothing else seems to be standardized : the fractional size etc
* all seem to differ on different machines .
*
* Requires xtime_lock to access .
*/
unsigned long avenrun [ 3 ] ;
EXPORT_SYMBOL ( avenrun ) ;
/*
* calc_load - given tick count , update the avenrun load estimates .
* This is called while holding a write_lock on xtime_lock .
*/
static inline void calc_load ( unsigned long ticks )
{
unsigned long active_tasks ; /* fixed-point */
static int count = LOAD_FREQ ;
count - = ticks ;
if ( count < 0 ) {
count + = LOAD_FREQ ;
active_tasks = count_active_tasks ( ) ;
CALC_LOAD ( avenrun [ 0 ] , EXP_1 , active_tasks ) ;
CALC_LOAD ( avenrun [ 1 ] , EXP_5 , active_tasks ) ;
CALC_LOAD ( avenrun [ 2 ] , EXP_15 , active_tasks ) ;
}
}
/* jiffies at the most recent update of wall time */
unsigned long wall_jiffies = INITIAL_JIFFIES ;
/*
* This read - write spinlock protects us from races in SMP while
* playing with xtime and avenrun .
*/
# ifndef ARCH_HAVE_XTIME_LOCK
2006-07-03 11:24:34 +04:00
__cacheline_aligned_in_smp DEFINE_SEQLOCK ( xtime_lock ) ;
2005-04-17 02:20:36 +04:00
EXPORT_SYMBOL ( xtime_lock ) ;
# endif
/*
* This function runs timers and the timer - tq in bottom half context .
*/
static void run_timer_softirq ( struct softirq_action * h )
{
2006-03-24 14:15:54 +03:00
tvec_base_t * base = __get_cpu_var ( tvec_bases ) ;
2005-04-17 02:20:36 +04:00
2006-01-10 07:52:32 +03:00
hrtimer_run_queues ( ) ;
2005-04-17 02:20:36 +04:00
if ( time_after_eq ( jiffies , base - > timer_jiffies ) )
__run_timers ( base ) ;
}
/*
* Called by the local , per - CPU timer interrupt on SMP .
*/
void run_local_timers ( void )
{
raise_softirq ( TIMER_SOFTIRQ ) ;
2006-03-24 14:18:41 +03:00
softlockup_tick ( ) ;
2005-04-17 02:20:36 +04:00
}
/*
* Called by the timer interrupt . xtime_lock must already be taken
* by the timer IRQ !
*/
static inline void update_times ( void )
{
unsigned long ticks ;
ticks = jiffies - wall_jiffies ;
2006-06-26 11:25:06 +04:00
wall_jiffies + = ticks ;
update_wall_time ( ) ;
2005-04-17 02:20:36 +04:00
calc_load ( ticks ) ;
}
/*
* The 64 - bit jiffies value is not atomic - you MUST NOT read it
* without sampling the sequence number in xtime_lock .
* jiffies is defined in the linker script . . .
*/
void do_timer ( struct pt_regs * regs )
{
jiffies_64 + + ;
2006-03-07 02:42:51 +03:00
/* prevent loading jiffies before storing new jiffies_64 value. */
barrier ( ) ;
2005-04-17 02:20:36 +04:00
update_times ( ) ;
}
# ifdef __ARCH_WANT_SYS_ALARM
/*
* For backwards compatibility ? This can be done in libc so Alpha
* and all newer ports shouldn ' t need it .
*/
asmlinkage unsigned long sys_alarm ( unsigned int seconds )
{
2006-03-25 14:06:33 +03:00
return alarm_setitimer ( seconds ) ;
2005-04-17 02:20:36 +04:00
}
# endif
# ifndef __alpha__
/*
* The Alpha uses getxpid , getxuid , and getxgid instead . Maybe this
* should be moved into arch / i386 instead ?
*/
/**
* sys_getpid - return the thread group id of the current process
*
* Note , despite the name , this returns the tgid not the pid . The tgid and
* the pid are identical unless CLONE_THREAD was specified on clone ( ) in
* which case the tgid is the same in all threads of the same group .
*
* This is SMP safe as current - > tgid does not change .
*/
asmlinkage long sys_getpid ( void )
{
return current - > tgid ;
}
/*
2006-08-14 10:24:23 +04:00
* Accessing - > real_parent is not SMP - safe , it could
* change from under us . However , we can use a stale
* value of - > real_parent under rcu_read_lock ( ) , see
* release_task ( ) - > call_rcu ( delayed_put_task_struct ) .
2005-04-17 02:20:36 +04:00
*/
asmlinkage long sys_getppid ( void )
{
int pid ;
2006-08-14 10:24:23 +04:00
rcu_read_lock ( ) ;
pid = rcu_dereference ( current - > real_parent ) - > tgid ;
rcu_read_unlock ( ) ;
2005-04-17 02:20:36 +04:00
return pid ;
}
asmlinkage long sys_getuid ( void )
{
/* Only we change this so SMP safe */
return current - > uid ;
}
asmlinkage long sys_geteuid ( void )
{
/* Only we change this so SMP safe */
return current - > euid ;
}
asmlinkage long sys_getgid ( void )
{
/* Only we change this so SMP safe */
return current - > gid ;
}
asmlinkage long sys_getegid ( void )
{
/* Only we change this so SMP safe */
return current - > egid ;
}
# endif
static void process_timeout ( unsigned long __data )
{
2006-07-03 11:25:41 +04:00
wake_up_process ( ( struct task_struct * ) __data ) ;
2005-04-17 02:20:36 +04:00
}
/**
* schedule_timeout - sleep until timeout
* @ timeout : timeout value in jiffies
*
* Make the current task sleep until @ timeout jiffies have
* elapsed . The routine will return immediately unless
* the current task state has been set ( see set_current_state ( ) ) .
*
* You can set the task state as follows -
*
* % TASK_UNINTERRUPTIBLE - at least @ timeout jiffies are guaranteed to
* pass before the routine returns . The routine will return 0
*
* % TASK_INTERRUPTIBLE - the routine may return early if a signal is
* delivered to the current task . In this case the remaining time
* in jiffies will be returned , or 0 if the timer expired in time
*
* The current task state is guaranteed to be TASK_RUNNING when this
* routine returns .
*
* Specifying a @ timeout value of % MAX_SCHEDULE_TIMEOUT will schedule
* the CPU away without a bound on the timeout . In this case the return
* value will be % MAX_SCHEDULE_TIMEOUT .
*
* In all cases the return value is guaranteed to be non - negative .
*/
fastcall signed long __sched schedule_timeout ( signed long timeout )
{
struct timer_list timer ;
unsigned long expire ;
switch ( timeout )
{
case MAX_SCHEDULE_TIMEOUT :
/*
* These two special cases are useful to be comfortable
* in the caller . Nothing more . We could take
* MAX_SCHEDULE_TIMEOUT from one of the negative value
* but I ' d like to return a valid offset ( > = 0 ) to allow
* the caller to do everything it want with the retval .
*/
schedule ( ) ;
goto out ;
default :
/*
* Another bit of PARANOID . Note that the retval will be
* 0 since no piece of kernel is supposed to do a check
* for a negative retval of schedule_timeout ( ) ( since it
* should never happens anyway ) . You just have the printk ( )
* that will tell you if something is gone wrong and where .
*/
if ( timeout < 0 )
{
printk ( KERN_ERR " schedule_timeout: wrong timeout "
2005-10-31 02:01:42 +03:00
" value %lx from %p \n " , timeout ,
__builtin_return_address ( 0 ) ) ;
2005-04-17 02:20:36 +04:00
current - > state = TASK_RUNNING ;
goto out ;
}
}
expire = timeout + jiffies ;
2005-10-31 02:01:38 +03:00
setup_timer ( & timer , process_timeout , ( unsigned long ) current ) ;
__mod_timer ( & timer , expire ) ;
2005-04-17 02:20:36 +04:00
schedule ( ) ;
del_singleshot_timer_sync ( & timer ) ;
timeout = expire - jiffies ;
out :
return timeout < 0 ? 0 : timeout ;
}
EXPORT_SYMBOL ( schedule_timeout ) ;
2005-09-13 12:25:15 +04:00
/*
* We can use __set_current_state ( ) here because schedule_timeout ( ) calls
* schedule ( ) unconditionally .
*/
2005-09-10 11:27:21 +04:00
signed long __sched schedule_timeout_interruptible ( signed long timeout )
{
2005-10-31 02:01:42 +03:00
__set_current_state ( TASK_INTERRUPTIBLE ) ;
return schedule_timeout ( timeout ) ;
2005-09-10 11:27:21 +04:00
}
EXPORT_SYMBOL ( schedule_timeout_interruptible ) ;
signed long __sched schedule_timeout_uninterruptible ( signed long timeout )
{
2005-10-31 02:01:42 +03:00
__set_current_state ( TASK_UNINTERRUPTIBLE ) ;
return schedule_timeout ( timeout ) ;
2005-09-10 11:27:21 +04:00
}
EXPORT_SYMBOL ( schedule_timeout_uninterruptible ) ;
2005-04-17 02:20:36 +04:00
/* Thread ID - the internal kernel "pid" */
asmlinkage long sys_gettid ( void )
{
return current - > pid ;
}
/*
* sys_sysinfo - fill in sysinfo struct
*/
asmlinkage long sys_sysinfo ( struct sysinfo __user * info )
{
struct sysinfo val ;
unsigned long mem_total , sav_total ;
unsigned int mem_unit , bitcount ;
unsigned long seq ;
memset ( ( char * ) & val , 0 , sizeof ( struct sysinfo ) ) ;
do {
struct timespec tp ;
seq = read_seqbegin ( & xtime_lock ) ;
/*
* This is annoying . The below is the same thing
* posix_get_clock_monotonic ( ) does , but it wants to
* take the lock which we want to cover the loads stuff
* too .
*/
getnstimeofday ( & tp ) ;
tp . tv_sec + = wall_to_monotonic . tv_sec ;
tp . tv_nsec + = wall_to_monotonic . tv_nsec ;
if ( tp . tv_nsec - NSEC_PER_SEC > = 0 ) {
tp . tv_nsec = tp . tv_nsec - NSEC_PER_SEC ;
tp . tv_sec + + ;
}
val . uptime = tp . tv_sec + ( tp . tv_nsec ? 1 : 0 ) ;
val . loads [ 0 ] = avenrun [ 0 ] < < ( SI_LOAD_SHIFT - FSHIFT ) ;
val . loads [ 1 ] = avenrun [ 1 ] < < ( SI_LOAD_SHIFT - FSHIFT ) ;
val . loads [ 2 ] = avenrun [ 2 ] < < ( SI_LOAD_SHIFT - FSHIFT ) ;
val . procs = nr_threads ;
} while ( read_seqretry ( & xtime_lock , seq ) ) ;
si_meminfo ( & val ) ;
si_swapinfo ( & val ) ;
/*
* If the sum of all the available memory ( i . e . ram + swap )
* is less than can be stored in a 32 bit unsigned long then
* we can be binary compatible with 2.2 . x kernels . If not ,
* well , in that case 2.2 . x was broken anyways . . .
*
* - Erik Andersen < andersee @ debian . org >
*/
mem_total = val . totalram + val . totalswap ;
if ( mem_total < val . totalram | | mem_total < val . totalswap )
goto out ;
bitcount = 0 ;
mem_unit = val . mem_unit ;
while ( mem_unit > 1 ) {
bitcount + + ;
mem_unit > > = 1 ;
sav_total = mem_total ;
mem_total < < = 1 ;
if ( mem_total < sav_total )
goto out ;
}
/*
* If mem_total did not overflow , multiply all memory values by
* val . mem_unit and set it to 1. This leaves things compatible
* with 2.2 . x , and also retains compatibility with earlier 2.4 . x
* kernels . . .
*/
val . mem_unit = 1 ;
val . totalram < < = bitcount ;
val . freeram < < = bitcount ;
val . sharedram < < = bitcount ;
val . bufferram < < = bitcount ;
val . totalswap < < = bitcount ;
val . freeswap < < = bitcount ;
val . totalhigh < < = bitcount ;
val . freehigh < < = bitcount ;
out :
if ( copy_to_user ( info , & val , sizeof ( struct sysinfo ) ) )
return - EFAULT ;
return 0 ;
}
2006-07-03 11:25:10 +04:00
/*
* lockdep : we want to track each per - CPU base as a separate lock - class ,
* but timer - bases are kmalloc ( ) - ed , so we need to attach separate
* keys to them :
*/
static struct lock_class_key base_lock_keys [ NR_CPUS ] ;
2006-03-24 14:15:54 +03:00
static int __devinit init_timers_cpu ( int cpu )
2005-04-17 02:20:36 +04:00
{
int j ;
tvec_base_t * base ;
2006-04-11 09:53:58 +04:00
static char __devinitdata tvec_base_done [ NR_CPUS ] ;
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync have checked all cpus and before it will recheck
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes it's base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
2006-04-11 09:53:58 +04:00
if ( ! tvec_base_done [ cpu ] ) {
2006-03-24 14:15:54 +03:00
static char boot_done ;
if ( boot_done ) {
2006-04-11 09:53:58 +04:00
/*
* The APs use this path later in boot
*/
2006-03-24 14:15:54 +03:00
base = kmalloc_node ( sizeof ( * base ) , GFP_KERNEL ,
cpu_to_node ( cpu ) ) ;
if ( ! base )
return - ENOMEM ;
memset ( base , 0 , sizeof ( * base ) ) ;
2006-04-11 09:53:58 +04:00
per_cpu ( tvec_bases , cpu ) = base ;
2006-03-24 14:15:54 +03:00
} else {
2006-04-11 09:53:58 +04:00
/*
* This is for the boot CPU - we use compile - time
* static initialisation because per - cpu memory isn ' t
* ready yet and because the memory allocators are not
* initialised either .
*/
2006-03-24 14:15:54 +03:00
boot_done = 1 ;
2006-04-11 09:53:58 +04:00
base = & boot_tvec_bases ;
2006-03-24 14:15:54 +03:00
}
2006-04-11 09:53:58 +04:00
tvec_base_done [ cpu ] = 1 ;
} else {
base = per_cpu ( tvec_bases , cpu ) ;
2006-03-24 14:15:54 +03:00
}
2006-04-11 09:53:58 +04:00
2006-03-31 14:30:30 +04:00
spin_lock_init ( & base - > lock ) ;
2006-07-03 11:25:10 +04:00
lockdep_set_class ( & base - > lock , base_lock_keys + cpu ) ;
2005-04-17 02:20:36 +04:00
for ( j = 0 ; j < TVN_SIZE ; j + + ) {
INIT_LIST_HEAD ( base - > tv5 . vec + j ) ;
INIT_LIST_HEAD ( base - > tv4 . vec + j ) ;
INIT_LIST_HEAD ( base - > tv3 . vec + j ) ;
INIT_LIST_HEAD ( base - > tv2 . vec + j ) ;
}
for ( j = 0 ; j < TVR_SIZE ; j + + )
INIT_LIST_HEAD ( base - > tv1 . vec + j ) ;
base - > timer_jiffies = jiffies ;
2006-03-24 14:15:54 +03:00
return 0 ;
2005-04-17 02:20:36 +04:00
}
# ifdef CONFIG_HOTPLUG_CPU
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync have checked all cpus and before it will recheck
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes it's base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
static void migrate_timer_list ( tvec_base_t * new_base , struct list_head * head )
2005-04-17 02:20:36 +04:00
{
struct timer_list * timer ;
while ( ! list_empty ( head ) ) {
timer = list_entry ( head - > next , struct timer_list , entry ) ;
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync have checked all cpus and before it will recheck
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes it's base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
detach_timer ( timer , 0 ) ;
2006-03-31 14:30:30 +04:00
timer - > base = new_base ;
2005-04-17 02:20:36 +04:00
internal_add_timer ( new_base , timer ) ;
}
}
static void __devinit migrate_timers ( int cpu )
{
tvec_base_t * old_base ;
tvec_base_t * new_base ;
int i ;
BUG_ON ( cpu_online ( cpu ) ) ;
2006-03-24 14:15:54 +03:00
old_base = per_cpu ( tvec_bases , cpu ) ;
new_base = get_cpu_var ( tvec_bases ) ;
2005-04-17 02:20:36 +04:00
local_irq_disable ( ) ;
2006-03-31 14:30:30 +04:00
spin_lock ( & new_base - > lock ) ;
spin_lock ( & old_base - > lock ) ;
BUG_ON ( old_base - > running_timer ) ;
2005-04-17 02:20:36 +04:00
for ( i = 0 ; i < TVR_SIZE ; i + + )
[PATCH] timers fixes/improvements
This patch tries to solve following problems:
1. del_timer_sync() is racy. The timer can be fired again after
del_timer_sync have checked all cpus and before it will recheck
timer_pending().
2. It has scalability problems. All cpus are scanned to determine
if the timer is running on that cpu.
With this patch del_timer_sync is O(1) and no slower than plain
del_timer(pending_timer), unless it has to actually wait for
completion of the currently running timer.
The only restriction is that the recurring timer should not use
add_timer_on().
3. The timers are not serialized wrt to itself.
If CPU_0 does mod_timer(jiffies+1) while the timer is currently
running on CPU 1, it is quite possible that local interrupt on
CPU_0 will start that timer before it finished on CPU_1.
4. The timers locking is suboptimal. __mod_timer() takes 3 locks
at once and still requires wmb() in del_timer/run_timers.
The new implementation takes 2 locks sequentially and does not
need memory barriers.
Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.
This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.
The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.
So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).
When the timer's base is locked, and the timer removed from ->entry list
(which means that _run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.
This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.
__mod_timer() schedules the timer on the local CPU and changes it's base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.
__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.
So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.
We don't need timer_list->lock anymore, this patch kills it.
We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.
One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global
struct timer_base_s {
spinlock_t lock;
struct timer_list *running_timer;
} __init_timer_base;
which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.
It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:08:56 +04:00
migrate_timer_list ( new_base , old_base - > tv1 . vec + i ) ;
for ( i = 0 ; i < TVN_SIZE ; i + + ) {
migrate_timer_list ( new_base , old_base - > tv2 . vec + i ) ;
migrate_timer_list ( new_base , old_base - > tv3 . vec + i ) ;
migrate_timer_list ( new_base , old_base - > tv4 . vec + i ) ;
migrate_timer_list ( new_base , old_base - > tv5 . vec + i ) ;
}
2006-03-31 14:30:30 +04:00
spin_unlock ( & old_base - > lock ) ;
spin_unlock ( & new_base - > lock ) ;
2005-04-17 02:20:36 +04:00
local_irq_enable ( ) ;
put_cpu_var ( tvec_bases ) ;
}
# endif /* CONFIG_HOTPLUG_CPU */
2006-07-30 14:03:35 +04:00
static int __cpuinit timer_cpu_notify ( struct notifier_block * self ,
2005-04-17 02:20:36 +04:00
unsigned long action , void * hcpu )
{
long cpu = ( long ) hcpu ;
switch ( action ) {
case CPU_UP_PREPARE :
2006-03-24 14:15:54 +03:00
if ( init_timers_cpu ( cpu ) < 0 )
return NOTIFY_BAD ;
2005-04-17 02:20:36 +04:00
break ;
# ifdef CONFIG_HOTPLUG_CPU
case CPU_DEAD :
migrate_timers ( cpu ) ;
break ;
# endif
default :
break ;
}
return NOTIFY_OK ;
}
2006-07-30 14:03:35 +04:00
static struct notifier_block __cpuinitdata timers_nb = {
2005-04-17 02:20:36 +04:00
. notifier_call = timer_cpu_notify ,
} ;
void __init init_timers ( void )
{
timer_cpu_notify ( & timers_nb , ( unsigned long ) CPU_UP_PREPARE ,
( void * ) ( long ) smp_processor_id ( ) ) ;
register_cpu_notifier ( & timers_nb ) ;
open_softirq ( TIMER_SOFTIRQ , run_timer_softirq , NULL ) ;
}
# ifdef CONFIG_TIME_INTERPOLATION
2006-03-17 10:04:00 +03:00
struct time_interpolator * time_interpolator __read_mostly ;
static struct time_interpolator * time_interpolator_list __read_mostly ;
2005-04-17 02:20:36 +04:00
static DEFINE_SPINLOCK ( time_interpolator_lock ) ;
static inline u64 time_interpolator_get_cycles ( unsigned int src )
{
unsigned long ( * x ) ( void ) ;
switch ( src )
{
case TIME_SOURCE_FUNCTION :
x = time_interpolator - > addr ;
return x ( ) ;
case TIME_SOURCE_MMIO64 :
2006-03-02 13:54:35 +03:00
return readq_relaxed ( ( void __iomem * ) time_interpolator - > addr ) ;
2005-04-17 02:20:36 +04:00
case TIME_SOURCE_MMIO32 :
2006-03-02 13:54:35 +03:00
return readl_relaxed ( ( void __iomem * ) time_interpolator - > addr ) ;
2005-04-17 02:20:36 +04:00
default : return get_cycles ( ) ;
}
}
2005-09-07 02:17:04 +04:00
static inline u64 time_interpolator_get_counter ( int writelock )
2005-04-17 02:20:36 +04:00
{
unsigned int src = time_interpolator - > source ;
if ( time_interpolator - > jitter )
{
u64 lcycle ;
u64 now ;
do {
lcycle = time_interpolator - > last_cycle ;
now = time_interpolator_get_cycles ( src ) ;
if ( lcycle & & time_after ( lcycle , now ) )
return lcycle ;
2005-09-07 02:17:04 +04:00
/* When holding the xtime write lock, there's no need
* to add the overhead of the cmpxchg . Readers are
* force to retry until the write lock is released .
*/
if ( writelock ) {
time_interpolator - > last_cycle = now ;
return now ;
}
2005-04-17 02:20:36 +04:00
/* Keep track of the last timer value returned. The use of cmpxchg here
* will cause contention in an SMP environment .
*/
} while ( unlikely ( cmpxchg ( & time_interpolator - > last_cycle , lcycle , now ) ! = lcycle ) ) ;
return now ;
}
else
return time_interpolator_get_cycles ( src ) ;
}
void time_interpolator_reset ( void )
{
time_interpolator - > offset = 0 ;
2005-09-07 02:17:04 +04:00
time_interpolator - > last_counter = time_interpolator_get_counter ( 1 ) ;
2005-04-17 02:20:36 +04:00
}
# define GET_TI_NSECS(count,i) (((((count) - i->last_counter) & (i)->mask) * (i)->nsec_per_cyc) >> (i)->shift)
unsigned long time_interpolator_get_offset ( void )
{
/* If we do not have a time interpolator set up then just return zero */
if ( ! time_interpolator )
return 0 ;
return time_interpolator - > offset +
2005-09-07 02:17:04 +04:00
GET_TI_NSECS ( time_interpolator_get_counter ( 0 ) , time_interpolator ) ;
2005-04-17 02:20:36 +04:00
}
# define INTERPOLATOR_ADJUST 65536
# define INTERPOLATOR_MAX_SKIP 10*INTERPOLATOR_ADJUST
static void time_interpolator_update ( long delta_nsec )
{
u64 counter ;
unsigned long offset ;
/* If there is no time interpolator set up then do nothing */
if ( ! time_interpolator )
return ;
2005-10-31 02:01:42 +03:00
/*
* The interpolator compensates for late ticks by accumulating the late
* time in time_interpolator - > offset . A tick earlier than expected will
* lead to a reset of the offset and a corresponding jump of the clock
* forward . Again this only works if the interpolator clock is running
* slightly slower than the regular clock and the tuning logic insures
* that .
*/
2005-04-17 02:20:36 +04:00
2005-09-07 02:17:04 +04:00
counter = time_interpolator_get_counter ( 1 ) ;
2005-10-31 02:01:42 +03:00
offset = time_interpolator - > offset +
GET_TI_NSECS ( counter , time_interpolator ) ;
2005-04-17 02:20:36 +04:00
if ( delta_nsec < 0 | | ( unsigned long ) delta_nsec < offset )
time_interpolator - > offset = offset - delta_nsec ;
else {
time_interpolator - > skips + + ;
time_interpolator - > ns_skipped + = delta_nsec - offset ;
time_interpolator - > offset = 0 ;
}
time_interpolator - > last_counter = counter ;
/* Tuning logic for time interpolator invoked every minute or so.
* Decrease interpolator clock speed if no skips occurred and an offset is carried .
* Increase interpolator clock speed if we skip too much time .
*/
if ( jiffies % INTERPOLATOR_ADJUST = = 0 )
{
2006-04-07 21:50:18 +04:00
if ( time_interpolator - > skips = = 0 & & time_interpolator - > offset > tick_nsec )
2005-04-17 02:20:36 +04:00
time_interpolator - > nsec_per_cyc - - ;
if ( time_interpolator - > ns_skipped > INTERPOLATOR_MAX_SKIP & & time_interpolator - > offset = = 0 )
time_interpolator - > nsec_per_cyc + + ;
time_interpolator - > skips = 0 ;
time_interpolator - > ns_skipped = 0 ;
}
}
static inline int
is_better_time_interpolator ( struct time_interpolator * new )
{
if ( ! time_interpolator )
return 1 ;
return new - > frequency > 2 * time_interpolator - > frequency | |
( unsigned long ) new - > drift < ( unsigned long ) time_interpolator - > drift ;
}
void
register_time_interpolator ( struct time_interpolator * ti )
{
unsigned long flags ;
/* Sanity check */
2006-04-02 15:45:55 +04:00
BUG_ON ( ti - > frequency = = 0 | | ti - > mask = = 0 ) ;
2005-04-17 02:20:36 +04:00
ti - > nsec_per_cyc = ( ( u64 ) NSEC_PER_SEC < < ti - > shift ) / ti - > frequency ;
spin_lock ( & time_interpolator_lock ) ;
write_seqlock_irqsave ( & xtime_lock , flags ) ;
if ( is_better_time_interpolator ( ti ) ) {
time_interpolator = ti ;
time_interpolator_reset ( ) ;
}
write_sequnlock_irqrestore ( & xtime_lock , flags ) ;
ti - > next = time_interpolator_list ;
time_interpolator_list = ti ;
spin_unlock ( & time_interpolator_lock ) ;
}
void
unregister_time_interpolator ( struct time_interpolator * ti )
{
struct time_interpolator * curr , * * prev ;
unsigned long flags ;
spin_lock ( & time_interpolator_lock ) ;
prev = & time_interpolator_list ;
for ( curr = * prev ; curr ; curr = curr - > next ) {
if ( curr = = ti ) {
* prev = curr - > next ;
break ;
}
prev = & curr - > next ;
}
write_seqlock_irqsave ( & xtime_lock , flags ) ;
if ( ti = = time_interpolator ) {
/* we lost the best time-interpolator: */
time_interpolator = NULL ;
/* find the next-best interpolator */
for ( curr = time_interpolator_list ; curr ; curr = curr - > next )
if ( is_better_time_interpolator ( curr ) )
time_interpolator = curr ;
time_interpolator_reset ( ) ;
}
write_sequnlock_irqrestore ( & xtime_lock , flags ) ;
spin_unlock ( & time_interpolator_lock ) ;
}
# endif /* CONFIG_TIME_INTERPOLATION */
/**
* msleep - sleep safely even with waitqueue interruptions
* @ msecs : Time in milliseconds to sleep for
*/
void msleep ( unsigned int msecs )
{
unsigned long timeout = msecs_to_jiffies ( msecs ) + 1 ;
2005-09-10 11:27:24 +04:00
while ( timeout )
timeout = schedule_timeout_uninterruptible ( timeout ) ;
2005-04-17 02:20:36 +04:00
}
EXPORT_SYMBOL ( msleep ) ;
/**
2005-06-26 01:58:43 +04:00
* msleep_interruptible - sleep waiting for signals
2005-04-17 02:20:36 +04:00
* @ msecs : Time in milliseconds to sleep for
*/
unsigned long msleep_interruptible ( unsigned int msecs )
{
unsigned long timeout = msecs_to_jiffies ( msecs ) + 1 ;
2005-09-10 11:27:24 +04:00
while ( timeout & & ! signal_pending ( current ) )
timeout = schedule_timeout_interruptible ( timeout ) ;
2005-04-17 02:20:36 +04:00
return jiffies_to_msecs ( timeout ) ;
}
EXPORT_SYMBOL ( msleep_interruptible ) ;