From 1cc4fff0b360aeffeedb7d6db5089d88dd861700 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 22 Dec 2008 02:24:48 +0100 Subject: [PATCH 01/26] hrtimers: increase clock min delta threshold while interrupt hanging Impact: avoid timer IRQ hanging slow systems While using the function graph tracer on a virtualized system, the hrtimer_interrupt can hang the system on an infinite loop. This can be caused in several situations: - the hardware is very slow and HZ is set too high - something intrusive is slowing the system down (tracing under emulation) ... and the next clock events to program are always before the current time. This patch implements a reasonable compromise: if such a situation is detected, we share the CPUs time in 1/4 to process the hrtimer interrupts. This is enough to let the system running without serious starvation. It has been successfully tested under VirtualBox with 1000 HZ and 100 HZ with function graph tracer launched. On both cases, the clock events were increased until about 25 ms periodic ticks, which means 40 HZ. So we change a hard to debug hang into a warning message and a system that still manages to limp along. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/hrtimer.c | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index bda9cb924276..c2a69b89ac61 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1171,6 +1171,29 @@ static void __run_hrtimer(struct hrtimer *timer) #ifdef CONFIG_HIGH_RES_TIMERS +static int force_clock_reprogram; + +/* + * After 5 iteration's attempts, we consider that hrtimer_interrupt() + * is hanging, which could happen with something that slows the interrupt + * such as the tracing. Then we force the clock reprogramming for each future + * hrtimer interrupts to avoid infinite loops and use the min_delta_ns + * threshold that we will overwrite. + * The next tick event will be scheduled to 3 times we currently spend on + * hrtimer_interrupt(). This gives a good compromise, the cpus will spend + * 1/4 of their time to process the hrtimer interrupts. This is enough to + * let it running without serious starvation. + */ + +static inline void +hrtimer_interrupt_hanging(struct clock_event_device *dev, + ktime_t try_time) +{ + force_clock_reprogram = 1; + dev->min_delta_ns = (unsigned long)try_time.tv64 * 3; + printk(KERN_WARNING "hrtimer: interrupt too slow, " + "forcing clock min delta to %lu ns\n", dev->min_delta_ns); +} /* * High resolution timer interrupt * Called with interrupts disabled @@ -1180,6 +1203,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); struct hrtimer_clock_base *base; ktime_t expires_next, now; + int nr_retries = 0; int i; BUG_ON(!cpu_base->hres_active); @@ -1187,6 +1211,10 @@ void hrtimer_interrupt(struct clock_event_device *dev) dev->next_event.tv64 = KTIME_MAX; retry: + /* 5 retries is enough to notice a hang */ + if (!(++nr_retries % 5)) + hrtimer_interrupt_hanging(dev, ktime_sub(ktime_get(), now)); + now = ktime_get(); expires_next.tv64 = KTIME_MAX; @@ -1239,7 +1267,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) /* Reprogramming necessary ? */ if (expires_next.tv64 != KTIME_MAX) { - if (tick_program_event(expires_next, 0)) + if (tick_program_event(expires_next, force_clock_reprogram)) goto retry; } } From 5762ba1873b0bb9faa631aaa02f533c2b9837f82 Mon Sep 17 00:00:00 2001 From: Sebastien Dugue Date: Mon, 1 Dec 2008 14:09:07 +0100 Subject: [PATCH 02/26] hrtimers: allow the hot-unplugging of all cpus Impact: fix CPU hotplug hang on Power6 testbox On architectures that support offlining all cpus (at least powerpc/pseries), hot-unpluging the tick_do_timer_cpu can result in a system hang. This comes from the fact that if the cpu going down happens to be the cpu doing the tick, then as the tick_do_timer_cpu handover happens after the cpu is dead (via the CPU_DEAD notification), we're left without ticks, jiffies are frozen and any task relying on timers (msleep, ...) is stuck. That's particularly the case for the cpu looping in __cpu_die() waiting for the dying cpu to be dead. This patch addresses this by having the tick_do_timer_cpu handover happen earlier during the CPU_DYING notification. For this, a new clockevent notification type is introduced (CLOCK_EVT_NOTIFY_CPU_DYING) which is triggered in hrtimer_cpu_notify(). Signed-off-by: Sebastien Dugue Cc: Signed-off-by: Ingo Molnar --- include/linux/clockchips.h | 1 + kernel/hrtimer.c | 4 ++++ kernel/time/tick-common.c | 26 +++++++++++++++++++------- 3 files changed, 24 insertions(+), 7 deletions(-) diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index ed3a5d473e52..c6de413c5dd1 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -36,6 +36,7 @@ enum clock_event_nofitiers { CLOCK_EVT_NOTIFY_BROADCAST_EXIT, CLOCK_EVT_NOTIFY_SUSPEND, CLOCK_EVT_NOTIFY_RESUME, + CLOCK_EVT_NOTIFY_CPU_DYING, CLOCK_EVT_NOTIFY_CPU_DEAD, }; diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index c2a69b89ac61..61cb933395ba 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1609,6 +1609,10 @@ static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self, break; #ifdef CONFIG_HOTPLUG_CPU + case CPU_DYING: + case CPU_DYING_FROZEN: + clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DYING, &scpu); + break; case CPU_DEAD: case CPU_DEAD_FROZEN: { diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index df12434b43ca..457d281258ee 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -273,6 +273,21 @@ out_bc: return ret; } +/* + * Transfer the do_timer job away from a dying cpu. + * + * Called with interrupts disabled. + */ +static void tick_handover_do_timer(int *cpup) +{ + if (*cpup == tick_do_timer_cpu) { + int cpu = first_cpu(cpu_online_map); + + tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu : + TICK_DO_TIMER_NONE; + } +} + /* * Shutdown an event device on a given cpu: * @@ -297,13 +312,6 @@ static void tick_shutdown(unsigned int *cpup) clockevents_exchange_device(dev, NULL); td->evtdev = NULL; } - /* Transfer the do_timer job away from this cpu */ - if (*cpup == tick_do_timer_cpu) { - int cpu = first_cpu(cpu_online_map); - - tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu : - TICK_DO_TIMER_NONE; - } spin_unlock_irqrestore(&tick_device_lock, flags); } @@ -357,6 +365,10 @@ static int tick_notify(struct notifier_block *nb, unsigned long reason, tick_broadcast_oneshot_control(reason); break; + case CLOCK_EVT_NOTIFY_CPU_DYING: + tick_handover_do_timer(dev); + break; + case CLOCK_EVT_NOTIFY_CPU_DEAD: tick_shutdown_broadcast_oneshot(dev); tick_shutdown_broadcast(dev); From 42bb8cc5e81028e217105299001070d57eb84ad7 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 9 Jan 2009 12:17:40 -0800 Subject: [PATCH 03/26] x86: hpet: allow force enable on ICH10 HPET Intel "Smackover" x58 BIOS don't have HPET enabled in the BIOS, so allow to force enable it at least. The register layout is the same as in other recent ICHs, so all the code can be reused. Using numerical PCI-ID because it's unlikely the PCI-ID will be used anywhere else. Signed-off-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- arch/x86/kernel/quirks.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 309949e9e1c1..697d1b78cfbf 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -172,7 +172,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_4, ich_force_enable_hpet); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7, ich_force_enable_hpet); - +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x3a16, /* ICH10 */ + ich_force_enable_hpet); static struct pci_dev *cached_dev; From 2d68259db26ad57fd9643f1c69b5181ec9836ca9 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Fri, 16 Jan 2009 17:14:38 +0900 Subject: [PATCH 04/26] clockevents: let set_mode() setup delta information Allow the set_mode() clockevent callback to decide and fill in delta details such as shift, mult, max_delta_ns and min_delta_ns. With this change the clockevent can be registered without delta details which allows us to keep the parent clock disabled until the clockevent gets setup using set_mode(). Letting set_mode() fill in or update delta details allows us to save power by disabling the parent clock while the clockevent is unused. This may however make the parent clock rate change, so next time the clockevent gets enabled we need let set_mode() to update the detla details accordingly. Doing it at registration time is not enough. Furthermore, the delta details seem unused in the case of periodic-only clockevent drivers, so this change also allows registration of such drivers without the delta details filled in. Signed-off-by: Magnus Damm Signed-off-by: Thomas Gleixner --- kernel/time/clockevents.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index ea2f48af83cf..d13be216a790 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -68,6 +68,17 @@ void clockevents_set_mode(struct clock_event_device *dev, if (dev->mode != mode) { dev->set_mode(mode, dev); dev->mode = mode; + + /* + * A nsec2cyc multiplicator of 0 is invalid and we'd crash + * on it, so fix it up and emit a warning: + */ + if (mode == CLOCK_EVT_MODE_ONESHOT) { + if (unlikely(!dev->mult)) { + dev->mult = 1; + WARN_ON(1); + } + } } } @@ -168,15 +179,6 @@ void clockevents_register_device(struct clock_event_device *dev) BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); BUG_ON(!dev->cpumask); - /* - * A nsec2cyc multiplicator of 0 is invalid and we'd crash - * on it, so fix it up and emit a warning: - */ - if (unlikely(!dev->mult)) { - dev->mult = 1; - WARN_ON(1); - } - spin_lock(&clockevents_lock); list_add(&dev->list, &clockevent_devices); From 6626bff24578753808c8b5bd4f1619e14e980f0f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 25 Jan 2009 11:31:36 +0100 Subject: [PATCH 05/26] hrtimer: prevent negative expiry value after clock_was_set() Impact: prevent false positive WARN_ON() in clockevents_program_event() clock_was_set() changes the base->offset of CLOCK_REALTIME and enforces the reprogramming of the clockevent device to expire timers which are based on CLOCK_REALTIME. If the clock change is large enough then the subtraction of the timer expiry value and base->offset can become negative which triggers the warning in clockevents_program_event(). Check the subtraction result and set a negative value to 0. Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 2c40ee8f44bd..d71cef25954b 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -501,6 +501,13 @@ static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base) continue; timer = rb_entry(base->first, struct hrtimer, node); expires = ktime_sub(hrtimer_get_expires(timer), base->offset); + /* + * clock_was_set() has changed base->offset so the + * result might be negative. Fix it up to prevent a + * false positive in clockevents_program_event() + */ + if (expires.tv64 < 0) + expires.tv64 = 0; if (expires.tv64 < cpu_base->expires_next.tv64) cpu_base->expires_next = expires; } From 74019224ac34b044b44a31dd89a54e3477db4896 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 18 Feb 2009 12:23:29 +0100 Subject: [PATCH 06/26] timers: add mod_timer_pending() Impact: new timer API Based on an idea from Martin Josefsson with the help of Patrick McHardy and Stephen Hemminger: introduce the mod_timer_pending() API which is a mod_timer() offspring that is an invariant on already removed timers. (regular mod_timer() re-activates non-pending timers.) This is useful for the networking code in that it can allow unserialized mod_timer_pending() timer-forwarding calls, but a single del_timer*() will stop the timer from being reactivated again. Also while at it: - optimize the regular mod_timer() path some more, the timer-stat and a debug check was needlessly duplicated in __mod_timer(). - make the exports come straight after the function, as most other exports in timer.c already did. - eliminate __mod_timer() as an external API, change the users to mod_timer(). The regular mod_timer() code path is not impacted significantly, due to inlining optimizations and due to the simplifications. Based-on-patch-from: Stephen Hemminger Acked-by: Stephen Hemminger Cc: "David S. Miller" Cc: Patrick McHardy Cc: netdev@vger.kernel.org Cc: Oleg Nesterov Cc: Andrew Morton Signed-off-by: Ingo Molnar --- arch/powerpc/platforms/cell/spufs/sched.c | 2 +- drivers/infiniband/hw/ipath/ipath_driver.c | 6 +- include/linux/timer.h | 22 +--- kernel/relay.c | 2 +- kernel/timer.c | 126 +++++++++++++-------- 5 files changed, 88 insertions(+), 70 deletions(-) diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c index 6a0ad196aeb3..f085369301b1 100644 --- a/arch/powerpc/platforms/cell/spufs/sched.c +++ b/arch/powerpc/platforms/cell/spufs/sched.c @@ -508,7 +508,7 @@ static void __spu_add_to_rq(struct spu_context *ctx) list_add_tail(&ctx->rq, &spu_prio->runq[ctx->prio]); set_bit(ctx->prio, spu_prio->bitmap); if (!spu_prio->nr_waiting++) - __mod_timer(&spusched_timer, jiffies + SPUSCHED_TICK); + mod_timer(&spusched_timer, jiffies + SPUSCHED_TICK); } } diff --git a/drivers/infiniband/hw/ipath/ipath_driver.c b/drivers/infiniband/hw/ipath/ipath_driver.c index 69c0ce321b4e..cb9daa6ac029 100644 --- a/drivers/infiniband/hw/ipath/ipath_driver.c +++ b/drivers/infiniband/hw/ipath/ipath_driver.c @@ -2715,7 +2715,7 @@ static void ipath_hol_signal_up(struct ipath_devdata *dd) * to prevent HoL blocking, then start the HoL timer that * periodically continues, then stop procs, so they can detect * link down if they want, and do something about it. - * Timer may already be running, so use __mod_timer, not add_timer. + * Timer may already be running, so use mod_timer, not add_timer. */ void ipath_hol_down(struct ipath_devdata *dd) { @@ -2724,7 +2724,7 @@ void ipath_hol_down(struct ipath_devdata *dd) dd->ipath_hol_next = IPATH_HOL_DOWNCONT; dd->ipath_hol_timer.expires = jiffies + msecs_to_jiffies(ipath_hol_timeout_ms); - __mod_timer(&dd->ipath_hol_timer, dd->ipath_hol_timer.expires); + mod_timer(&dd->ipath_hol_timer, dd->ipath_hol_timer.expires); } /* @@ -2763,7 +2763,7 @@ void ipath_hol_event(unsigned long opaque) else { dd->ipath_hol_timer.expires = jiffies + msecs_to_jiffies(ipath_hol_timeout_ms); - __mod_timer(&dd->ipath_hol_timer, + mod_timer(&dd->ipath_hol_timer, dd->ipath_hol_timer.expires); } } diff --git a/include/linux/timer.h b/include/linux/timer.h index daf9685b861c..e2d662e3416e 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -86,8 +86,8 @@ static inline int timer_pending(const struct timer_list * timer) extern void add_timer_on(struct timer_list *timer, int cpu); extern int del_timer(struct timer_list * timer); -extern int __mod_timer(struct timer_list *timer, unsigned long expires); extern int mod_timer(struct timer_list *timer, unsigned long expires); +extern int mod_timer_pending(struct timer_list *timer, unsigned long expires); /* * The jiffies value which is added to now, when there is no timer @@ -146,25 +146,7 @@ static inline void timer_stats_timer_clear_start_info(struct timer_list *timer) } #endif -/** - * add_timer - start a timer - * @timer: the timer to be added - * - * The kernel will do a ->function(->data) callback from the - * timer interrupt at the ->expires point in the future. The - * current time is 'jiffies'. - * - * The timer's ->expires, ->function (and if the handler uses it, ->data) - * fields must be set prior calling this function. - * - * Timers with an ->expires field in the past will be executed in the next - * timer tick. - */ -static inline void add_timer(struct timer_list *timer) -{ - BUG_ON(timer_pending(timer)); - __mod_timer(timer, timer->expires); -} +extern void add_timer(struct timer_list *timer); #ifdef CONFIG_SMP extern int try_to_del_timer_sync(struct timer_list *timer); diff --git a/kernel/relay.c b/kernel/relay.c index 9d79b7854fa6..8f2179c8056f 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -750,7 +750,7 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length) * from the scheduler (trying to re-grab * rq->lock), so defer it. */ - __mod_timer(&buf->timer, jiffies + 1); + mod_timer(&buf->timer, jiffies + 1); } old = buf->data; diff --git a/kernel/timer.c b/kernel/timer.c index 13dd64fe143d..9b77fc9a9ac8 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -589,11 +589,14 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer, } } -int __mod_timer(struct timer_list *timer, unsigned long expires) +static inline int +__mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) { struct tvec_base *base, *new_base; unsigned long flags; - int ret = 0; + int ret; + + ret = 0; timer_stats_timer_set_start_info(timer); BUG_ON(!timer->function); @@ -603,6 +606,9 @@ int __mod_timer(struct timer_list *timer, unsigned long expires) if (timer_pending(timer)) { detach_timer(timer, 0); ret = 1; + } else { + if (pending_only) + goto out_unlock; } debug_timer_activate(timer); @@ -629,12 +635,83 @@ int __mod_timer(struct timer_list *timer, unsigned long expires) timer->expires = expires; internal_add_timer(base, timer); + +out_unlock: spin_unlock_irqrestore(&base->lock, flags); return ret; } -EXPORT_SYMBOL(__mod_timer); +/** + * mod_timer_pending - modify a pending timer's timeout + * @timer: the pending timer to be modified + * @expires: new timeout in jiffies + * + * mod_timer_pending() is the same for pending timers as mod_timer(), + * but will not re-activate and modify already deleted timers. + * + * It is useful for unserialized use of timers. + */ +int mod_timer_pending(struct timer_list *timer, unsigned long expires) +{ + return __mod_timer(timer, expires, true); +} +EXPORT_SYMBOL(mod_timer_pending); + +/** + * mod_timer - modify a timer's timeout + * @timer: the timer to be modified + * @expires: new timeout in jiffies + * + * mod_timer() is a more efficient way to update the expire field of an + * active timer (if the timer is inactive it will be activated) + * + * mod_timer(timer, expires) is equivalent to: + * + * del_timer(timer); timer->expires = expires; add_timer(timer); + * + * Note that if there are multiple unserialized concurrent users of the + * same timer, then mod_timer() is the only safe way to modify the timeout, + * since add_timer() cannot modify an already running timer. + * + * The function returns whether it has modified a pending timer or not. + * (ie. mod_timer() of an inactive timer returns 0, mod_timer() of an + * active timer returns 1.) + */ +int mod_timer(struct timer_list *timer, unsigned long expires) +{ + /* + * This is a common optimization triggered by the + * networking code - if the timer is re-modified + * to be the same thing then just return: + */ + if (timer->expires == expires && timer_pending(timer)) + return 1; + + return __mod_timer(timer, expires, false); +} +EXPORT_SYMBOL(mod_timer); + +/** + * add_timer - start a timer + * @timer: the timer to be added + * + * The kernel will do a ->function(->data) callback from the + * timer interrupt at the ->expires point in the future. The + * current time is 'jiffies'. + * + * The timer's ->expires, ->function (and if the handler uses it, ->data) + * fields must be set prior calling this function. + * + * Timers with an ->expires field in the past will be executed in the next + * timer tick. + */ +void add_timer(struct timer_list *timer) +{ + BUG_ON(timer_pending(timer)); + mod_timer(timer, timer->expires); +} +EXPORT_SYMBOL(add_timer); /** * add_timer_on - start a timer on a particular CPU @@ -666,44 +743,6 @@ void add_timer_on(struct timer_list *timer, int cpu) spin_unlock_irqrestore(&base->lock, flags); } -/** - * mod_timer - modify a timer's timeout - * @timer: the timer to be modified - * @expires: new timeout in jiffies - * - * mod_timer() is a more efficient way to update the expire field of an - * active timer (if the timer is inactive it will be activated) - * - * mod_timer(timer, expires) is equivalent to: - * - * del_timer(timer); timer->expires = expires; add_timer(timer); - * - * Note that if there are multiple unserialized concurrent users of the - * same timer, then mod_timer() is the only safe way to modify the timeout, - * since add_timer() cannot modify an already running timer. - * - * The function returns whether it has modified a pending timer or not. - * (ie. mod_timer() of an inactive timer returns 0, mod_timer() of an - * active timer returns 1.) - */ -int mod_timer(struct timer_list *timer, unsigned long expires) -{ - BUG_ON(!timer->function); - - timer_stats_timer_set_start_info(timer); - /* - * This is a common optimization triggered by the - * networking code - if the timer is re-modified - * to be the same thing then just return: - */ - if (timer->expires == expires && timer_pending(timer)) - return 1; - - return __mod_timer(timer, expires); -} - -EXPORT_SYMBOL(mod_timer); - /** * del_timer - deactive a timer. * @timer: the timer to be deactivated @@ -733,7 +772,6 @@ int del_timer(struct timer_list *timer) return ret; } - EXPORT_SYMBOL(del_timer); #ifdef CONFIG_SMP @@ -767,7 +805,6 @@ out: return ret; } - EXPORT_SYMBOL(try_to_del_timer_sync); /** @@ -796,7 +833,6 @@ int del_timer_sync(struct timer_list *timer) cpu_relax(); } } - EXPORT_SYMBOL(del_timer_sync); #endif @@ -1268,7 +1304,7 @@ signed long __sched schedule_timeout(signed long timeout) expire = timeout + jiffies; setup_timer_on_stack(&timer, process_timeout, (unsigned long)current); - __mod_timer(&timer, expire); + __mod_timer(&timer, expire, false); schedule(); del_singleshot_timer_sync(&timer); From fdcedf7b75808dd72c3cc0b931be11b04d75c60a Mon Sep 17 00:00:00 2001 From: john stultz Date: Wed, 18 Feb 2009 16:02:22 -0800 Subject: [PATCH 07/26] time: apply NTP frequency/tick changes immediately Since the GENERIC_TIME changes landed, the adjtimex behavior changed for struct timex.tick and .freq changed. When the tick or freq value is set, we adjust the tick_length_base in ntp_update_frequency(). However, this new value doesn't get applied to tick_length until the next second (via second_overflow). This means some applications that do quick time tweaking do not see the requested change made as quickly as expected. I've run a few tests with this change, and ntpd still functions fine. Signed-off-by: John Stultz Signed-off-by: Ingo Molnar --- kernel/time/ntp.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index f5f793d92415..e1fa3689a903 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -51,6 +51,7 @@ static long ntp_tick_adj; static void ntp_update_frequency(void) { + u64 old_tick_length_base = tick_length_base; u64 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) << NTP_SCALE_SHIFT; second_length += (s64)ntp_tick_adj << NTP_SCALE_SHIFT; @@ -60,6 +61,12 @@ static void ntp_update_frequency(void) tick_nsec = div_u64(second_length, HZ) >> NTP_SCALE_SHIFT; tick_length_base = div_u64(tick_length_base, NTP_INTERVAL_FREQ); + + /* + * Don't wait for the next second_overflow, apply + * the change to the tick length immediately + */ + tick_length += tick_length_base - old_tick_length_base; } static void ntp_update_offset(long offset) From b98103a5597b87211a1c74077b06faeac554bedc Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Sat, 21 Feb 2009 00:09:47 +0100 Subject: [PATCH 08/26] x86: hpet: print HPET registers during setup (if hpet=verbose is used) Signed-off-by: Andreas Herrmann Cc: Mark Hounschell Cc: Borislav Petkov Signed-off-by: Ingo Molnar --- Documentation/kernel-parameters.txt | 4 ++- arch/x86/kernel/hpet.c | 45 +++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 1 deletion(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index b182626739ea..01379a822646 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -492,10 +492,12 @@ and is between 256 and 4096 characters. It is defined in the file Default: 64 hpet= [X86-32,HPET] option to control HPET usage - Format: { enable (default) | disable | force } + Format: { enable (default) | disable | force | + verbose } disable: disable HPET and use PIT instead force: allow force enabled of undocumented chips (ICH4, VIA, nVidia) + verbose: show contents of HPET registers during setup com20020= [HW,NET] ARCnet - COM20020 chipset Format: diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index a00545fe5cdd..1d86ca3c1f97 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -80,6 +80,7 @@ static inline void hpet_clear_mapping(void) */ static int boot_hpet_disable; int hpet_force_user; +static int hpet_verbose; static int __init hpet_setup(char *str) { @@ -88,6 +89,8 @@ static int __init hpet_setup(char *str) boot_hpet_disable = 1; if (!strncmp("force", str, 5)) hpet_force_user = 1; + if (!strncmp("verbose", str, 7)) + hpet_verbose = 1; } return 1; } @@ -119,6 +122,43 @@ int is_hpet_enabled(void) } EXPORT_SYMBOL_GPL(is_hpet_enabled); +static void _hpet_print_config(const char *function, int line) +{ + u32 i, timers, l, h; + printk(KERN_INFO "hpet: %s(%d):\n", function, line); + l = hpet_readl(HPET_ID); + h = hpet_readl(HPET_PERIOD); + timers = ((l & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1; + printk(KERN_INFO "hpet: ID: 0x%x, PERIOD: 0x%x\n", l, h); + l = hpet_readl(HPET_CFG); + h = hpet_readl(HPET_STATUS); + printk(KERN_INFO "hpet: CFG: 0x%x, STATUS: 0x%x\n", l, h); + l = hpet_readl(HPET_COUNTER); + h = hpet_readl(HPET_COUNTER+4); + printk(KERN_INFO "hpet: COUNTER_l: 0x%x, COUNTER_h: 0x%x\n", l, h); + + for (i = 0; i < timers; i++) { + l = hpet_readl(HPET_Tn_CFG(i)); + h = hpet_readl(HPET_Tn_CFG(i)+4); + printk(KERN_INFO "hpet: T%d: CFG_l: 0x%x, CFG_h: 0x%x\n", + i, l, h); + l = hpet_readl(HPET_Tn_CMP(i)); + h = hpet_readl(HPET_Tn_CMP(i)+4); + printk(KERN_INFO "hpet: T%d: CMP_l: 0x%x, CMP_h: 0x%x\n", + i, l, h); + l = hpet_readl(HPET_Tn_ROUTE(i)); + h = hpet_readl(HPET_Tn_ROUTE(i)+4); + printk(KERN_INFO "hpet: T%d ROUTE_l: 0x%x, ROUTE_h: 0x%x\n", + i, l, h); + } +} + +#define hpet_print_config() \ +do { \ + if (hpet_verbose) \ + _hpet_print_config(__FUNCTION__, __LINE__); \ +} while (0) + /* * When the hpet driver (/dev/hpet) is enabled, we need to reserve * timer 0 and timer 1 in case of RTC emulation. @@ -282,6 +322,7 @@ static void hpet_set_mode(enum clock_event_mode mode, hpet_writel(cmp, HPET_Tn_CMP(timer)); udelay(1); hpet_writel((unsigned long) delta, HPET_Tn_CMP(timer)); + hpet_print_config(); break; case CLOCK_EVT_MODE_ONESHOT: @@ -308,6 +349,7 @@ static void hpet_set_mode(enum clock_event_mode mode, irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu)); enable_irq(hdev->irq); } + hpet_print_config(); break; } } @@ -526,6 +568,7 @@ static void hpet_msi_capability_lookup(unsigned int start_timer) num_timers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT); num_timers++; /* Value read out starts from 0 */ + hpet_print_config(); hpet_devs = kzalloc(sizeof(struct hpet_dev) * num_timers, GFP_KERNEL); if (!hpet_devs) @@ -793,6 +836,7 @@ int __init hpet_enable(void) * information and the number of channels */ id = hpet_readl(HPET_ID); + hpet_print_config(); #ifdef CONFIG_HPET_EMULATE_RTC /* @@ -845,6 +889,7 @@ static __init int hpet_late_init(void) return -ENODEV; hpet_reserve_platform_timers(hpet_readl(HPET_ID)); + hpet_print_config(); for_each_online_cpu(cpu) { hpet_cpuhp_notify(NULL, CPU_ONLINE, (void *)(long)cpu); From 8d6f0c8214928f7c5083dd54ecb69c5d615b516e Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Sat, 21 Feb 2009 00:10:44 +0100 Subject: [PATCH 09/26] x86: hpet: provide separate functions to stop and start the counter By splitting up existing hpet_start_counter function. Signed-off-by: Andreas Herrmann Cc: Mark Hounschell Cc: Borislav Petkov Signed-off-by: Ingo Molnar --- arch/x86/kernel/hpet.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 1d86ca3c1f97..7ae1f1e8b9b7 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -231,27 +231,37 @@ static struct clock_event_device hpet_clockevent = { .rating = 50, }; -static void hpet_start_counter(void) +static void hpet_stop_counter(void) { unsigned long cfg = hpet_readl(HPET_CFG); - cfg &= ~HPET_CFG_ENABLE; hpet_writel(cfg, HPET_CFG); hpet_writel(0, HPET_COUNTER); hpet_writel(0, HPET_COUNTER + 4); +} + +static void hpet_start_counter(void) +{ + unsigned long cfg = hpet_readl(HPET_CFG); cfg |= HPET_CFG_ENABLE; hpet_writel(cfg, HPET_CFG); } +static void hpet_restart_counter(void) +{ + hpet_stop_counter(); + hpet_start_counter(); +} + static void hpet_resume_device(void) { force_hpet_resume(); } -static void hpet_restart_counter(void) +static void hpet_resume_counter(void) { hpet_resume_device(); - hpet_start_counter(); + hpet_restart_counter(); } static void hpet_enable_legacy_int(void) @@ -738,7 +748,7 @@ static struct clocksource clocksource_hpet = { .mask = HPET_MASK, .shift = HPET_SHIFT, .flags = CLOCK_SOURCE_IS_CONTINUOUS, - .resume = hpet_restart_counter, + .resume = hpet_resume_counter, #ifdef CONFIG_X86_64 .vread = vread_hpet, #endif @@ -750,7 +760,7 @@ static int hpet_clocksource_register(void) cycle_t t1; /* Start the counter */ - hpet_start_counter(); + hpet_restart_counter(); /* Verify whether hpet counter works */ t1 = read_hpet(); From c23e253e67c9d8a91a0ffa33c1f571a17f0a2403 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Sat, 21 Feb 2009 00:16:35 +0100 Subject: [PATCH 10/26] x86: hpet: stop HPET_COUNTER when programming periodic mode Impact: fix system hang on some systems operating with HZ_1000 On a system that stalled with HZ_1000, the first value written to T0_CMP (when the main counter was not stopped) did not trigger an interrupt. Instead after the main counter wrapped around (after several minutes) an interrupt was triggered and afterwards the periodic interrupt took effect. This can be fixed by implementing HPET spec recommendation for programming the periodic mode (i.e. stopping the main counter). Signed-off-by: Andreas Herrmann Cc: Mark Hounschell Cc: Borislav Petkov Signed-off-by: Ingo Molnar --- arch/x86/kernel/hpet.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 7ae1f1e8b9b7..648b3a2a3a44 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -309,29 +309,22 @@ static int hpet_setup_msi_irq(unsigned int irq); static void hpet_set_mode(enum clock_event_mode mode, struct clock_event_device *evt, int timer) { - unsigned long cfg, cmp, now; + unsigned long cfg; uint64_t delta; switch (mode) { case CLOCK_EVT_MODE_PERIODIC: + hpet_stop_counter(); delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * evt->mult; delta >>= evt->shift; - now = hpet_readl(HPET_COUNTER); - cmp = now + (unsigned long) delta; cfg = hpet_readl(HPET_Tn_CFG(timer)); /* Make sure we use edge triggered interrupts */ cfg &= ~HPET_TN_LEVEL; cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL | HPET_TN_32BIT; hpet_writel(cfg, HPET_Tn_CFG(timer)); - /* - * The first write after writing TN_SETVAL to the - * config register sets the counter value, the second - * write sets the period. - */ - hpet_writel(cmp, HPET_Tn_CMP(timer)); - udelay(1); hpet_writel((unsigned long) delta, HPET_Tn_CMP(timer)); + hpet_start_counter(); hpet_print_config(); break; From 53bbfa9e9437e70b322368e82c723112d690e304 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 20 Feb 2008 07:58:42 +0100 Subject: [PATCH 11/26] time: ntp: clean up kernel/time/ntp.c Impact: cleanup, no functionality changed Make this file a bit more readable by applying a consistent coding style. No code changed: kernel/time/ntp.o: text data bss dec hex filename 2552 170 168 2890 b4a ntp.o.before 2552 170 168 2890 b4a ntp.o.after md5: eae1275df0b7d6290c13f6f6f8f05c8c ntp.o.before.asm eae1275df0b7d6290c13f6f6f8f05c8c ntp.o.after.asm Signed-off-by: Ingo Molnar --- kernel/time/ntp.c | 129 +++++++++++++++++++++++++++++----------------- 1 file changed, 81 insertions(+), 48 deletions(-) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index e1fa3689a903..3479ec48e604 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -1,53 +1,81 @@ /* - * linux/kernel/time/ntp.c - * * NTP state machine interfaces and logic. * * This code was mainly moved from kernel/timer.c and kernel/time.c * Please see those files for relevant copyright info and historical * changelogs. */ - -#include -#include -#include -#include -#include #include -#include #include #include -#include +#include +#include +#include +#include +#include +#include /* - * Timekeeping variables + * NTP timekeeping variables: */ -unsigned long tick_usec = TICK_USEC; /* USER_HZ period (usec) */ -unsigned long tick_nsec; /* ACTHZ period (nsec) */ -u64 tick_length; -static u64 tick_length_base; -static struct hrtimer leap_timer; +/* USER_HZ period (usecs): */ +unsigned long tick_usec = TICK_USEC; -#define MAX_TICKADJ 500 /* microsecs */ -#define MAX_TICKADJ_SCALED (((u64)(MAX_TICKADJ * NSEC_PER_USEC) << \ - NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ) +/* ACTHZ period (nsecs): */ +unsigned long tick_nsec; + +u64 tick_length; +static u64 tick_length_base; + +static struct hrtimer leap_timer; + +#define MAX_TICKADJ 500 /* usecs */ +#define MAX_TICKADJ_SCALED \ + (((u64)(MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ) /* * phase-lock loop variables */ -/* TIME_ERROR prevents overwriting the CMOS clock */ -static int time_state = TIME_OK; /* clock synchronization status */ -int time_status = STA_UNSYNC; /* clock status bits */ -static long time_tai; /* TAI offset (s) */ -static s64 time_offset; /* time adjustment (ns) */ -static long time_constant = 2; /* pll time constant */ -long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */ -long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ -static s64 time_freq; /* frequency offset (scaled ns/s)*/ -static long time_reftime; /* time at last adjustment (s) */ -long time_adjust; -static long ntp_tick_adj; + +/* + * clock synchronization status + * + * (TIME_ERROR prevents overwriting the CMOS clock) + */ +static int time_state = TIME_OK; + +/* clock status bits: */ +int time_status = STA_UNSYNC; + +/* TAI offset (secs): */ +static long time_tai; + +/* time adjustment (nsecs): */ +static s64 time_offset; + +/* pll time constant: */ +static long time_constant = 2; + +/* maximum error (usecs): */ +long time_maxerror = NTP_PHASE_LIMIT; + +/* estimated error (usecs): */ +long time_esterror = NTP_PHASE_LIMIT; + +/* frequency offset (scaled nsecs/secs): */ +static s64 time_freq; + +/* time at last adjustment (secs): */ +static long time_reftime; + +long time_adjust; + +static long ntp_tick_adj; + +/* + * NTP methods: + */ static void ntp_update_frequency(void) { @@ -118,15 +146,15 @@ static void ntp_update_offset(long offset) */ void ntp_clear(void) { - time_adjust = 0; /* stop active adjtime() */ - time_status |= STA_UNSYNC; - time_maxerror = NTP_PHASE_LIMIT; - time_esterror = NTP_PHASE_LIMIT; + time_adjust = 0; /* stop active adjtime() */ + time_status |= STA_UNSYNC; + time_maxerror = NTP_PHASE_LIMIT; + time_esterror = NTP_PHASE_LIMIT; ntp_update_frequency(); - tick_length = tick_length_base; - time_offset = 0; + tick_length = tick_length_base; + time_offset = 0; } /* @@ -147,8 +175,8 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer) xtime.tv_sec--; wall_to_monotonic.tv_sec++; time_state = TIME_OOP; - printk(KERN_NOTICE "Clock: " - "inserting leap second 23:59:60 UTC\n"); + printk(KERN_NOTICE + "Clock: inserting leap second 23:59:60 UTC\n"); hrtimer_add_expires_ns(&leap_timer, NSEC_PER_SEC); res = HRTIMER_RESTART; break; @@ -157,8 +185,8 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer) time_tai--; wall_to_monotonic.tv_sec--; time_state = TIME_WAIT; - printk(KERN_NOTICE "Clock: " - "deleting leap second 23:59:59 UTC\n"); + printk(KERN_NOTICE + "Clock: deleting leap second 23:59:59 UTC\n"); break; case TIME_OOP: time_tai++; @@ -199,10 +227,10 @@ void second_overflow(void) * Compute the phase adjustment for the next second. The offset is * reduced by a fixed factor times the time constant. */ - tick_length = tick_length_base; - time_adj = shift_right(time_offset, SHIFT_PLL + time_constant); - time_offset -= time_adj; - tick_length += time_adj; + tick_length = tick_length_base; + time_adj = shift_right(time_offset, SHIFT_PLL + time_constant); + time_offset -= time_adj; + tick_length += time_adj; if (unlikely(time_adjust)) { if (time_adjust > MAX_TICKADJ) { @@ -240,12 +268,13 @@ static void sync_cmos_clock(struct work_struct *work) * This code is run on a timer. If the clock is set, that timer * may not expire at the correct time. Thus, we adjust... */ - if (!ntp_synced()) + if (!ntp_synced()) { /* * Not synced, exit, do not restart a timer (if one is * running, let it run out). */ return; + } getnstimeofday(&now); if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) @@ -277,7 +306,8 @@ static void notify_cmos_timer(void) static inline void notify_cmos_timer(void) { } #endif -/* adjtimex mainly allows reading (and writing, if superuser) of +/* + * adjtimex mainly allows reading (and writing, if superuser) of * kernel time-keeping variables. used by xntpd. */ int do_adjtimex(struct timex *txc) @@ -298,7 +328,10 @@ int do_adjtimex(struct timex *txc) if (txc->modes && !capable(CAP_SYS_TIME)) return -EPERM; - /* if the quartz is off by more than 10% something is VERY wrong! */ + /* + * if the quartz is off by more than 10% then + * something is VERY wrong! + */ if (txc->modes & ADJ_TICK && (txc->tick < 900000/USER_HZ || txc->tick > 1100000/USER_HZ)) From 3c972c2444dcb7088999c32b8c5a7ab3b8a6c0b6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 22 Feb 2009 12:06:57 +0100 Subject: [PATCH 12/26] time: ntp: simplify the second_overflow() code flow Impact: cleanup, no functionality changed Instead of a hierarchy of conditions, transform them to clean gradual conditions and return's. This makes the flow easier to read and makes the purpose of the function easier to understand. kernel/time/ntp.o: text data bss dec hex filename 2552 170 168 2890 b4a ntp.o.before 2552 170 168 2890 b4a ntp.o.after md5: eae1275df0b7d6290c13f6f6f8f05c8c ntp.o.before.asm eae1275df0b7d6290c13f6f6f8f05c8c ntp.o.after.asm Signed-off-by: Ingo Molnar --- kernel/time/ntp.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 3479ec48e604..1fa6615b317a 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -232,19 +232,24 @@ void second_overflow(void) time_offset -= time_adj; tick_length += time_adj; - if (unlikely(time_adjust)) { - if (time_adjust > MAX_TICKADJ) { - time_adjust -= MAX_TICKADJ; - tick_length += MAX_TICKADJ_SCALED; - } else if (time_adjust < -MAX_TICKADJ) { - time_adjust += MAX_TICKADJ; - tick_length -= MAX_TICKADJ_SCALED; - } else { - tick_length += (s64)(time_adjust * NSEC_PER_USEC / - NTP_INTERVAL_FREQ) << NTP_SCALE_SHIFT; - time_adjust = 0; - } + if (!time_adjust) + return; + + if (time_adjust > MAX_TICKADJ) { + time_adjust -= MAX_TICKADJ; + tick_length += MAX_TICKADJ_SCALED; + return; } + + if (time_adjust < -MAX_TICKADJ) { + time_adjust += MAX_TICKADJ; + tick_length -= MAX_TICKADJ_SCALED; + return; + } + + tick_length += (s64)(time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ) + << NTP_SCALE_SHIFT; + time_adjust = 0; } #ifdef CONFIG_GENERIC_CMOS_UPDATE From bbd1267690bb6940d0722dd33e929442c0409c01 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 22 Feb 2009 12:11:11 +0100 Subject: [PATCH 13/26] time: ntp: simplify the MAX_TICKADJ_SCALED definition Impact: cleanup, no functionality changed There's an ugly u64 typecase in the MAX_TICKADJ_SCALED definition, this can be eliminated by making the MAX_TICKADJ constant's type 64-bit (signed). kernel/time/ntp.o: text data bss dec hex filename 2504 114 136 2754 ac2 ntp.o.before 2504 114 136 2754 ac2 ntp.o.after md5: 41f3009debc9b397d7394dd77d912f0a ntp.o.before.asm 41f3009debc9b397d7394dd77d912f0a ntp.o.after.asm Signed-off-by: Ingo Molnar --- kernel/time/ntp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 1fa6615b317a..2b758c935c65 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -30,9 +30,9 @@ static u64 tick_length_base; static struct hrtimer leap_timer; -#define MAX_TICKADJ 500 /* usecs */ +#define MAX_TICKADJ 500LL /* usecs */ #define MAX_TICKADJ_SCALED \ - (((u64)(MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ) + (((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ) /* * phase-lock loop variables From 9ce616aaefcb9309cb9c49a36310ebda6061b98b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 22 Feb 2009 12:42:59 +0100 Subject: [PATCH 14/26] time: ntp: clean up ntp_update_frequency() Impact: cleanup, no functionality changed Prepare a refactoring of ntp_update_frequency(). kernel/time/ntp.o: text data bss dec hex filename 2504 114 136 2754 ac2 ntp.o.before 2504 114 136 2754 ac2 ntp.o.after md5: 41f3009debc9b397d7394dd77d912f0a ntp.o.before.asm 41f3009debc9b397d7394dd77d912f0a ntp.o.after.asm Signed-off-by: Ingo Molnar --- kernel/time/ntp.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 2b758c935c65..7d281d9fbe30 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -77,24 +77,33 @@ static long ntp_tick_adj; * NTP methods: */ +/* + * Update (tick_length, tick_length_base, tick_nsec), based + * on (tick_usec, ntp_tick_adj, time_freq): + */ static void ntp_update_frequency(void) { - u64 old_tick_length_base = tick_length_base; - u64 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) - << NTP_SCALE_SHIFT; - second_length += (s64)ntp_tick_adj << NTP_SCALE_SHIFT; - second_length += time_freq; + u64 prev_base; + u64 second_length; - tick_length_base = second_length; + prev_base = tick_length_base; - tick_nsec = div_u64(second_length, HZ) >> NTP_SCALE_SHIFT; - tick_length_base = div_u64(tick_length_base, NTP_INTERVAL_FREQ); + second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) + << NTP_SCALE_SHIFT; + + second_length += (s64)ntp_tick_adj << NTP_SCALE_SHIFT; + second_length += time_freq; + + tick_length_base = second_length; + + tick_nsec = div_u64(second_length, HZ) >> NTP_SCALE_SHIFT; + tick_length_base = div_u64(tick_length_base, NTP_INTERVAL_FREQ); /* * Don't wait for the next second_overflow, apply * the change to the tick length immediately */ - tick_length += tick_length_base - old_tick_length_base; + tick_length += tick_length_base - prev_base; } static void ntp_update_offset(long offset) From bc26c31d446bc9c24cd6f7003777a05fe268ae48 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 22 Feb 2009 12:17:36 +0100 Subject: [PATCH 15/26] time: ntp: refactor up ntp_update_frequency() Impact: cleanup, no functionality changed Change ntp_update_frequency() from a hard to follow code flow that uses global variables as temporaries, to a clean input+output flow. Signed-off-by: Ingo Molnar --- kernel/time/ntp.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 7d281d9fbe30..f1abad738579 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -83,10 +83,8 @@ static long ntp_tick_adj; */ static void ntp_update_frequency(void) { - u64 prev_base; u64 second_length; - - prev_base = tick_length_base; + u64 new_base; second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) << NTP_SCALE_SHIFT; @@ -94,16 +92,15 @@ static void ntp_update_frequency(void) second_length += (s64)ntp_tick_adj << NTP_SCALE_SHIFT; second_length += time_freq; - tick_length_base = second_length; - tick_nsec = div_u64(second_length, HZ) >> NTP_SCALE_SHIFT; - tick_length_base = div_u64(tick_length_base, NTP_INTERVAL_FREQ); + new_base = div_u64(second_length, NTP_INTERVAL_FREQ); /* * Don't wait for the next second_overflow, apply - * the change to the tick length immediately + * the change to the tick length immediately: */ - tick_length += tick_length_base - prev_base; + tick_length += new_base - tick_length_base; + tick_length_base = new_base; } static void ntp_update_offset(long offset) From f939890b6687e05c42361655fb6610fa08f5a601 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 22 Feb 2009 12:57:49 +0100 Subject: [PATCH 16/26] time: ntp: refactor and clean up ntp_update_offset() Impact: cleanup, no functionality changed - introduce the ntp_update_offset_fll() helper - clean up the flow and variable naming kernel/time/ntp.o: text data bss dec hex filename 2504 114 136 2754 ac2 ntp.o.before 2504 114 136 2754 ac2 ntp.o.after md5: 01f7b8e1a5472a3056f9e4ae84d46315 ntp.o.before.asm 01f7b8e1a5472a3056f9e4ae84d46315 ntp.o.after.asm Signed-off-by: Ingo Molnar --- kernel/time/ntp.c | 44 ++++++++++++++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 14 deletions(-) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index f1abad738579..ee437e1445d1 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -103,10 +103,27 @@ static void ntp_update_frequency(void) tick_length_base = new_base; } +static inline s64 ntp_update_offset_fll(s64 freq_adj, s64 offset64, long secs) +{ + time_status &= ~STA_MODE; + + if (secs < MINSEC) + return freq_adj; + + if (!(time_status & STA_FLL) && (secs <= MAXSEC)) + return freq_adj; + + freq_adj += div_s64(offset64 << (NTP_SCALE_SHIFT - SHIFT_FLL), secs); + time_status |= STA_MODE; + + return freq_adj; +} + static void ntp_update_offset(long offset) { - long mtemp; s64 freq_adj; + s64 offset64; + long secs; if (!(time_status & STA_PLL)) return; @@ -127,22 +144,21 @@ static void ntp_update_offset(long offset) */ if (time_status & STA_FREQHOLD || time_reftime == 0) time_reftime = xtime.tv_sec; - mtemp = xtime.tv_sec - time_reftime; + + secs = xtime.tv_sec - time_reftime; time_reftime = xtime.tv_sec; - freq_adj = (s64)offset * mtemp; - freq_adj <<= NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant); - time_status &= ~STA_MODE; - if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC)) { - freq_adj += div_s64((s64)offset << (NTP_SCALE_SHIFT - SHIFT_FLL), - mtemp); - time_status |= STA_MODE; - } - freq_adj += time_freq; - freq_adj = min(freq_adj, MAXFREQ_SCALED); - time_freq = max(freq_adj, -MAXFREQ_SCALED); + offset64 = offset; + freq_adj = (offset64 * secs) << + (NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant)); - time_offset = div_s64((s64)offset << NTP_SCALE_SHIFT, NTP_INTERVAL_FREQ); + freq_adj = ntp_update_offset_fll(freq_adj, offset64, secs); + + freq_adj = min(freq_adj + time_freq, MAXFREQ_SCALED); + + time_freq = max(freq_adj, -MAXFREQ_SCALED); + + time_offset = div_s64(offset64 << NTP_SCALE_SHIFT, NTP_INTERVAL_FREQ); } /** From 478b7aab1682246a3d1e76e27a0aecb2f0013379 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 22 Feb 2009 13:22:23 +0100 Subject: [PATCH 17/26] time: ntp: simplify ntp_update_offset_fll() Impact: cleanup, no functionality changed Change ntp_update_offset_fll() to delta logic instead of absolute value logic. This eliminates 'freq_adj' from the function. Signed-off-by: Ingo Molnar --- kernel/time/ntp.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index ee437e1445d1..5202dde2f0af 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -103,20 +103,19 @@ static void ntp_update_frequency(void) tick_length_base = new_base; } -static inline s64 ntp_update_offset_fll(s64 freq_adj, s64 offset64, long secs) +static inline s64 ntp_update_offset_fll(s64 offset64, long secs) { time_status &= ~STA_MODE; if (secs < MINSEC) - return freq_adj; + return 0; if (!(time_status & STA_FLL) && (secs <= MAXSEC)) - return freq_adj; + return 0; - freq_adj += div_s64(offset64 << (NTP_SCALE_SHIFT - SHIFT_FLL), secs); time_status |= STA_MODE; - return freq_adj; + return div_s64(offset64 << (NTP_SCALE_SHIFT - SHIFT_FLL), secs); } static void ntp_update_offset(long offset) @@ -152,7 +151,7 @@ static void ntp_update_offset(long offset) freq_adj = (offset64 * secs) << (NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant)); - freq_adj = ntp_update_offset_fll(freq_adj, offset64, secs); + freq_adj += ntp_update_offset_fll(offset64, secs); freq_adj = min(freq_adj + time_freq, MAXFREQ_SCALED); From c7986acba211e8285e14c9603fb89e6f4ea0b9f8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 22 Feb 2009 13:29:09 +0100 Subject: [PATCH 18/26] time: ntp: micro-optimize ntp_update_offset() Impact: cleanup, no functionality changed The time_reftime update in ntp_update_offset() to xtime.tv_sec is a convoluted way of saying that we want to freeze the frequency and want the 'secs' delta to be 0. Also make this branch unlikely. This shaves off 8 bytes from the code size: text data bss dec hex filename 2504 114 136 2754 ac2 ntp.o.before 2496 114 136 2746 aba ntp.o.after Signed-off-by: Ingo Molnar --- kernel/time/ntp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 5202dde2f0af..580a35028693 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -141,10 +141,10 @@ static void ntp_update_offset(long offset) * Select how the frequency is to be controlled * and in which mode (PLL or FLL). */ - if (time_status & STA_FREQHOLD || time_reftime == 0) - time_reftime = xtime.tv_sec; - secs = xtime.tv_sec - time_reftime; + if (unlikely(time_status & STA_FREQHOLD || time_reftime == 0)) + secs = 0; + time_reftime = xtime.tv_sec; offset64 = offset; From 10dd31a7a17254d6ba793305fc590455393e610e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 22 Feb 2009 13:38:40 +0100 Subject: [PATCH 19/26] time: ntp: fix bug in ntp_update_offset() & do_adjtimex() Impact: change (fix) the way the NTP PLL seconds offset is initialized/tracked Fix a bug and do a micro-optimization: When PLL is enabled we do not reset time_reftime. If the PLL was off for a long time (for example after bootup), this is arguably the wrong thing to do. We already had a hack for the common boot-time case in ntp_update_offset(), in form of: if (unlikely(time_status & STA_FREQHOLD || time_reftime == 0)) secs = 0; But the update delta should be reset later on too - not just when the PLL is enabled for the first time after bootup. So do it on !STA_PLL -> STA_PLL transitions. This changes behavior, as previously if ntpd was disabled for a long time and we restarted it, we'd run from that last update, with a very large delta. Signed-off-by: Ingo Molnar --- kernel/time/ntp.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 580a35028693..fc08eb10ced4 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -142,7 +142,7 @@ static void ntp_update_offset(long offset) * and in which mode (PLL or FLL). */ secs = xtime.tv_sec - time_reftime; - if (unlikely(time_status & STA_FREQHOLD || time_reftime == 0)) + if (unlikely(time_status & STA_FREQHOLD)) secs = 0; time_reftime = xtime.tv_sec; @@ -394,6 +394,13 @@ int do_adjtimex(struct timex *txc) } /* only set allowed bits */ time_status &= STA_RONLY; + /* + * If we turn on PLL adjustments then reset the + * reference time to current time. + */ + if (!(time_status & STA_PLL) && (txc->status & STA_PLL)) + time_reftime = xtime.tv_sec; + time_status |= txc->status & ~STA_RONLY; switch (time_state) { From 80f2257116474ceed5fccab510b4f7245c0f49d7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 22 Feb 2009 15:15:32 +0100 Subject: [PATCH 20/26] time: ntp: refactor do_adjtimex() Impact: cleanup, no functionality changed do_adjtimex() is currently a monster function with a maze of branches. Refactor the txc->modes setting aspects of it into two new helper functions: process_adj_status() process_adjtimex_modes() kernel/time/ntp.o: text data bss dec hex filename 2512 114 136 2762 aca ntp.o.before 2512 114 136 2762 aca ntp.o.after Signed-off-by: Ingo Molnar --- kernel/time/ntp.c | 182 +++++++++++++++++++++++++--------------------- 1 file changed, 99 insertions(+), 83 deletions(-) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index fc08eb10ced4..aded09be98cc 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -332,6 +332,102 @@ static void notify_cmos_timer(void) static inline void notify_cmos_timer(void) { } #endif + +/* + * Propagate a new txc->status value into the NTP state: + */ +static inline void process_adj_status(struct timex *txc, struct timespec *ts) +{ + long now; + + if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) { + time_state = TIME_OK; + time_status = STA_UNSYNC; + } + /* only set allowed bits */ + time_status &= STA_RONLY; + + /* + * If we turn on PLL adjustments then reset the + * reference time to current time. + */ + if (!(time_status & STA_PLL) && (txc->status & STA_PLL)) + time_reftime = xtime.tv_sec; + + time_status |= txc->status & ~STA_RONLY; + + switch (time_state) { + case TIME_OK: + start_timer: + now = ts->tv_sec; + if (time_status & STA_INS) { + time_state = TIME_INS; + now += 86400 - now % 86400; + hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS); + } else if (time_status & STA_DEL) { + time_state = TIME_DEL; + now += 86400 - (now + 1) % 86400; + hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS); + } + break; + case TIME_INS: + case TIME_DEL: + time_state = TIME_OK; + goto start_timer; + case TIME_WAIT: + if (!(time_status & (STA_INS | STA_DEL))) + time_state = TIME_OK; + break; + case TIME_OOP: + hrtimer_restart(&leap_timer); + break; + } +} +/* + * Called with the xtime lock held, so we can access and modify + * all the global NTP state: + */ +static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts) +{ + if (txc->modes & ADJ_STATUS) + process_adj_status(txc, ts); + + if (txc->modes & ADJ_NANO) + time_status |= STA_NANO; + if (txc->modes & ADJ_MICRO) + time_status &= ~STA_NANO; + + if (txc->modes & ADJ_FREQUENCY) { + time_freq = (s64)txc->freq * PPM_SCALE; + time_freq = min(time_freq, MAXFREQ_SCALED); + time_freq = max(time_freq, -MAXFREQ_SCALED); + } + + if (txc->modes & ADJ_MAXERROR) + time_maxerror = txc->maxerror; + if (txc->modes & ADJ_ESTERROR) + time_esterror = txc->esterror; + + if (txc->modes & ADJ_TIMECONST) { + time_constant = txc->constant; + if (!(time_status & STA_NANO)) + time_constant += 4; + time_constant = min(time_constant, (long)MAXTC); + time_constant = max(time_constant, 0l); + } + + if (txc->modes & ADJ_TAI && txc->constant > 0) + time_tai = txc->constant; + + if (txc->modes & ADJ_OFFSET) + ntp_update_offset(txc->offset); + if (txc->modes & ADJ_TICK) + tick_usec = txc->tick; + + if (txc->modes & (ADJ_TICK|ADJ_FREQUENCY|ADJ_OFFSET)) + ntp_update_frequency(); +} + /* * adjtimex mainly allows reading (and writing, if superuser) of * kernel time-keeping variables. used by xntpd. @@ -383,90 +479,10 @@ int do_adjtimex(struct timex *txc) txc->offset = save_adjust; goto adj_done; } - if (txc->modes) { - long sec; - if (txc->modes & ADJ_STATUS) { - if ((time_status & STA_PLL) && - !(txc->status & STA_PLL)) { - time_state = TIME_OK; - time_status = STA_UNSYNC; - } - /* only set allowed bits */ - time_status &= STA_RONLY; - /* - * If we turn on PLL adjustments then reset the - * reference time to current time. - */ - if (!(time_status & STA_PLL) && (txc->status & STA_PLL)) - time_reftime = xtime.tv_sec; - - time_status |= txc->status & ~STA_RONLY; - - switch (time_state) { - case TIME_OK: - start_timer: - sec = ts.tv_sec; - if (time_status & STA_INS) { - time_state = TIME_INS; - sec += 86400 - sec % 86400; - hrtimer_start(&leap_timer, ktime_set(sec, 0), HRTIMER_MODE_ABS); - } else if (time_status & STA_DEL) { - time_state = TIME_DEL; - sec += 86400 - (sec + 1) % 86400; - hrtimer_start(&leap_timer, ktime_set(sec, 0), HRTIMER_MODE_ABS); - } - break; - case TIME_INS: - case TIME_DEL: - time_state = TIME_OK; - goto start_timer; - break; - case TIME_WAIT: - if (!(time_status & (STA_INS | STA_DEL))) - time_state = TIME_OK; - break; - case TIME_OOP: - hrtimer_restart(&leap_timer); - break; - } - } - - if (txc->modes & ADJ_NANO) - time_status |= STA_NANO; - if (txc->modes & ADJ_MICRO) - time_status &= ~STA_NANO; - - if (txc->modes & ADJ_FREQUENCY) { - time_freq = (s64)txc->freq * PPM_SCALE; - time_freq = min(time_freq, MAXFREQ_SCALED); - time_freq = max(time_freq, -MAXFREQ_SCALED); - } - - if (txc->modes & ADJ_MAXERROR) - time_maxerror = txc->maxerror; - if (txc->modes & ADJ_ESTERROR) - time_esterror = txc->esterror; - - if (txc->modes & ADJ_TIMECONST) { - time_constant = txc->constant; - if (!(time_status & STA_NANO)) - time_constant += 4; - time_constant = min(time_constant, (long)MAXTC); - time_constant = max(time_constant, 0l); - } - - if (txc->modes & ADJ_TAI && txc->constant > 0) - time_tai = txc->constant; - - if (txc->modes & ADJ_OFFSET) - ntp_update_offset(txc->offset); - if (txc->modes & ADJ_TICK) - tick_usec = txc->tick; - - if (txc->modes & (ADJ_TICK|ADJ_FREQUENCY|ADJ_OFFSET)) - ntp_update_frequency(); - } + /* If there are input parameters, then process them: */ + if (txc->modes) + process_adjtimex_modes(txc, &ts); txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, NTP_SCALE_SHIFT); From e96291653b2e4df02f160b574070f6e632868e5e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 22 Feb 2009 15:35:18 +0100 Subject: [PATCH 21/26] time: ntp: refactor do_adjtimex() some more Impact: cleanup, no functionality changed Further simplify do_adjtimex(): - introduce the ntp_start_leap_timer() helper function - eliminate the goto adj_done complication Signed-off-by: Ingo Molnar --- kernel/time/ntp.c | 65 +++++++++++++++++++++++++++-------------------- 1 file changed, 38 insertions(+), 27 deletions(-) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index aded09be98cc..4346ed6e623f 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -332,14 +332,33 @@ static void notify_cmos_timer(void) static inline void notify_cmos_timer(void) { } #endif +/* + * Start the leap seconds timer: + */ +static inline void ntp_start_leap_timer(struct timespec *ts) +{ + long now = ts->tv_sec; + + if (time_status & STA_INS) { + time_state = TIME_INS; + now += 86400 - now % 86400; + hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS); + + return; + } + + if (time_status & STA_DEL) { + time_state = TIME_DEL; + now += 86400 - (now + 1) % 86400; + hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS); + } +} /* * Propagate a new txc->status value into the NTP state: */ static inline void process_adj_status(struct timex *txc, struct timespec *ts) { - long now; - if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) { time_state = TIME_OK; time_status = STA_UNSYNC; @@ -358,22 +377,12 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts) switch (time_state) { case TIME_OK: - start_timer: - now = ts->tv_sec; - if (time_status & STA_INS) { - time_state = TIME_INS; - now += 86400 - now % 86400; - hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS); - } else if (time_status & STA_DEL) { - time_state = TIME_DEL; - now += 86400 - (now + 1) % 86400; - hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS); - } + ntp_start_leap_timer(ts); break; case TIME_INS: case TIME_DEL: time_state = TIME_OK; - goto start_timer; + ntp_start_leap_timer(ts); case TIME_WAIT: if (!(time_status & (STA_INS | STA_DEL))) time_state = TIME_OK; @@ -394,6 +403,7 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts if (txc->modes & ADJ_NANO) time_status |= STA_NANO; + if (txc->modes & ADJ_MICRO) time_status &= ~STA_NANO; @@ -405,6 +415,7 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts if (txc->modes & ADJ_MAXERROR) time_maxerror = txc->maxerror; + if (txc->modes & ADJ_ESTERROR) time_esterror = txc->esterror; @@ -421,6 +432,7 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts if (txc->modes & ADJ_OFFSET) ntp_update_offset(txc->offset); + if (txc->modes & ADJ_TICK) tick_usec = txc->tick; @@ -457,7 +469,7 @@ int do_adjtimex(struct timex *txc) if (txc->modes & ADJ_TICK && (txc->tick < 900000/USER_HZ || txc->tick > 1100000/USER_HZ)) - return -EINVAL; + return -EINVAL; if (txc->modes & ADJ_STATUS && time_state != TIME_OK) hrtimer_cancel(&leap_timer); @@ -467,7 +479,6 @@ int do_adjtimex(struct timex *txc) write_seqlock_irq(&xtime_lock); - /* If there are input parameters, then process them */ if (txc->modes & ADJ_ADJTIME) { long save_adjust = time_adjust; @@ -477,19 +488,18 @@ int do_adjtimex(struct timex *txc) ntp_update_frequency(); } txc->offset = save_adjust; - goto adj_done; + } else { + + /* If there are input parameters, then process them: */ + if (txc->modes) + process_adjtimex_modes(txc, &ts); + + txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, + NTP_SCALE_SHIFT); + if (!(time_status & STA_NANO)) + txc->offset /= NSEC_PER_USEC; } - /* If there are input parameters, then process them: */ - if (txc->modes) - process_adjtimex_modes(txc, &ts); - - txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, - NTP_SCALE_SHIFT); - if (!(time_status & STA_NANO)) - txc->offset /= NSEC_PER_USEC; - -adj_done: result = time_state; /* mostly `TIME_OK' */ if (time_status & (STA_UNSYNC|STA_CLOCKERR)) result = TIME_ERROR; @@ -514,6 +524,7 @@ adj_done: txc->calcnt = 0; txc->errcnt = 0; txc->stbcnt = 0; + write_sequnlock_irq(&xtime_lock); txc->time.tv_sec = ts.tv_sec; From 2b9d1496e7835a603c340e8f0dd81f4b74d5f248 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 22 Feb 2009 15:48:43 +0100 Subject: [PATCH 22/26] time: ntp: make 64-bit constants more robust Impact: cleanup, no functionality changed - make PPM_SCALE an explicit s64 constant, to remove (s64) casts from usage sites. kernel/time/ntp.o: text data bss dec hex filename 2536 114 136 2786 ae2 ntp.o.before 2536 114 136 2786 ae2 ntp.o.after md5: 40a7728d1188aa18e83e21a81fa7b150 ntp.o.before.asm 40a7728d1188aa18e83e21a81fa7b150 ntp.o.after.asm Signed-off-by: Ingo Molnar --- include/linux/timex.h | 2 +- kernel/time/ntp.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/timex.h b/include/linux/timex.h index 998a55d80acf..aa3475fcff64 100644 --- a/include/linux/timex.h +++ b/include/linux/timex.h @@ -190,7 +190,7 @@ struct timex { * offset and maximum frequency tolerance. */ #define SHIFT_USEC 16 /* frequency offset scale (shift) */ -#define PPM_SCALE (NSEC_PER_USEC << (NTP_SCALE_SHIFT - SHIFT_USEC)) +#define PPM_SCALE ((s64)NSEC_PER_USEC << (NTP_SCALE_SHIFT - SHIFT_USEC)) #define PPM_SCALE_INV_SHIFT 19 #define PPM_SCALE_INV ((1ll << (PPM_SCALE_INV_SHIFT + NTP_SCALE_SHIFT)) / \ PPM_SCALE + 1) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 4346ed6e623f..7447d57e021a 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -408,7 +408,7 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts time_status &= ~STA_NANO; if (txc->modes & ADJ_FREQUENCY) { - time_freq = (s64)txc->freq * PPM_SCALE; + time_freq = txc->freq * PPM_SCALE; time_freq = min(time_freq, MAXFREQ_SCALED); time_freq = max(time_freq, -MAXFREQ_SCALED); } @@ -505,7 +505,7 @@ int do_adjtimex(struct timex *txc) result = TIME_ERROR; txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) * - (s64)PPM_SCALE_INV, NTP_SCALE_SHIFT); + PPM_SCALE_INV, NTP_SCALE_SHIFT); txc->maxerror = time_maxerror; txc->esterror = time_esterror; txc->status = time_status; From 069569e025706f27f939785f86a94d5d8ce55dce Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 22 Feb 2009 16:03:37 +0100 Subject: [PATCH 23/26] time: ntp: simplify ntp_tick_adj calculations Impact: micro-optimization Convert the (internal) ntp_tick_adj value we store from unscaled units to scaled units. This is a constant that we never modify, so scaling it up once during bootup is enough - we dont have to do it for every adjustment step. Signed-off-by: Ingo Molnar --- kernel/time/ntp.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 7447d57e021a..a3fe7ef2d83b 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -71,7 +71,8 @@ static long time_reftime; long time_adjust; -static long ntp_tick_adj; +/* constant (boot-param configurable) NTP tick adjustment (upscaled) */ +static s64 ntp_tick_adj; /* * NTP methods: @@ -89,7 +90,7 @@ static void ntp_update_frequency(void) second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) << NTP_SCALE_SHIFT; - second_length += (s64)ntp_tick_adj << NTP_SCALE_SHIFT; + second_length += ntp_tick_adj; second_length += time_freq; tick_nsec = div_u64(second_length, HZ) >> NTP_SCALE_SHIFT; @@ -540,6 +541,8 @@ int do_adjtimex(struct timex *txc) static int __init ntp_tick_adj_setup(char *str) { ntp_tick_adj = simple_strtol(str, NULL, 0); + ntp_tick_adj <<= NTP_SCALE_SHIFT; + return 1; } From 39854fe8c165872d743f6a0c4860ca2de8e45ac9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 22 Feb 2009 16:06:58 +0100 Subject: [PATCH 24/26] time: ntp: clean up second_overflow() Impact: cleanup, no functionality changed The 'time_adj' local variable is named in a very confusing way because it almost shadows the 'time_adjust' global variable - which is used in this same function. Rename it to 'delta' - to make them stand apart more clearly. kernel/time/ntp.o: text data bss dec hex filename 2545 114 144 2803 af3 ntp.o.before 2545 114 144 2803 af3 ntp.o.after md5: 1bf0b3be564512279ba7cee299d1d2be ntp.o.before.asm 1bf0b3be564512279ba7cee299d1d2be ntp.o.after.asm Signed-off-by: Ingo Molnar --- kernel/time/ntp.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index a3fe7ef2d83b..c74eb7d9d854 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -236,7 +236,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer) */ void second_overflow(void) { - s64 time_adj; + s64 delta; /* Bump the maxerror field */ time_maxerror += MAXFREQ / NSEC_PER_USEC; @@ -249,10 +249,11 @@ void second_overflow(void) * Compute the phase adjustment for the next second. The offset is * reduced by a fixed factor times the time constant. */ - tick_length = tick_length_base; - time_adj = shift_right(time_offset, SHIFT_PLL + time_constant); - time_offset -= time_adj; - tick_length += time_adj; + tick_length = tick_length_base; + + delta = shift_right(time_offset, SHIFT_PLL + time_constant); + time_offset -= delta; + tick_length += delta; if (!time_adjust) return; From a2a5ac8650b570bea3cb3614f77739dcd07d6632 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 26 Feb 2009 09:46:14 -0800 Subject: [PATCH 25/26] time: ntp: fix bug in ntp_update_offset() & do_adjtimex(), fix The time_status conditional was accidentally placed right after we clear the checked time_status bits, which causes us to take the conditional every time through. This fixes it by moving the conditional to before we clear the time_status bits. Signed-off-by: John Stultz Cc: Clark Williams Signed-off-by: Ingo Molnar --- kernel/time/ntp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index c74eb7d9d854..7fc64375ff43 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -365,8 +365,6 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts) time_state = TIME_OK; time_status = STA_UNSYNC; } - /* only set allowed bits */ - time_status &= STA_RONLY; /* * If we turn on PLL adjustments then reset the @@ -375,6 +373,8 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts) if (!(time_status & STA_PLL) && (txc->status & STA_PLL)) time_reftime = xtime.tv_sec; + /* only set allowed bits */ + time_status &= STA_RONLY; time_status |= txc->status & ~STA_RONLY; switch (time_state) { From 37bebc70d7ad4144c571d74500db3bb26ec0c0eb Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 23 Mar 2009 20:34:11 +0100 Subject: [PATCH 26/26] posix timers: fix RLIMIT_CPU && fork() See http://bugzilla.kernel.org/show_bug.cgi?id=12911 copy_signal() copies signal->rlim, but RLIMIT_CPU is "lost". Because posix_cpu_timers_init_group() sets cputime_expires.prof_exp = 0 and thus fastpath_timer_check() returns false unless we have other cpu timers. This is the minimal fix for 2.6.29 (tested) and 2.6.28. The patch is not optimal, we need further cleanups here. With this patch update_rlimit_cpu() is not really needed, but I don't think it should be removed. The proper fix (I think) is: - set_process_cpu_timer() should just start the cputimer->running logic (it does), no need to change cputime_expires.xxx_exp - posix_cpu_timers_init_group() should set ->running when needed - fastpath_timer_check() can check ->running instead of task_cputime_zero(signal->cputime_expires) Reported-by: Peter Lojkin Signed-off-by: Oleg Nesterov Cc: Peter Zijlstra Cc: Roland McGrath Cc: [for 2.6.29.x] LKML-Reference: <20090323193411.GA17514@redhat.com> Signed-off-by: Ingo Molnar --- kernel/posix-cpu-timers.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index e976e505648d..8e5d9a68b022 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1370,7 +1370,8 @@ static inline int fastpath_timer_check(struct task_struct *tsk) if (task_cputime_expired(&group_sample, &sig->cputime_expires)) return 1; } - return 0; + + return sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY; } /*