From 7de4e74430139f2484cb16cedf6c281d1a5a696e Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 17 Apr 2015 11:39:18 -0700 Subject: [PATCH 001/106] timer_list: Reduce SEQ_printf footprint This macro can be converted to a static function to reduce object size. (x86-64 defconfig) $ size kernel/time/timer_list.o* text data bss dec hex filename 6583 8 0 6591 19bf kernel/time/timer_list.o.old 4647 8 0 4655 122f kernel/time/timer_list.o.new Signed-off-by: Joe Perches Cc: John Stultz Link: http://lkml.kernel.org/r/1429295958.2850.104.camel@perches.com Signed-off-by: Thomas Gleixner --- kernel/time/timer_list.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index e878c2e0ba45..5960af2196ac 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -35,13 +35,20 @@ DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); * This allows printing both to /proc/timer_list and * to the console (on SysRq-Q): */ -#define SEQ_printf(m, x...) \ - do { \ - if (m) \ - seq_printf(m, x); \ - else \ - printk(x); \ - } while (0) +__printf(2, 3) +static void SEQ_printf(struct seq_file *m, const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + + if (m) + seq_vprintf(m, fmt, args); + else + vprintk(fmt, args); + + va_end(args); +} static void print_name_offset(struct seq_file *m, void *sym) { From 4796cf9b02b5bea141632e21d64556a7eb883a65 Mon Sep 17 00:00:00 2001 From: Yingjoe Chen Date: Fri, 10 Apr 2015 21:55:50 +0800 Subject: [PATCH 002/106] time: Remove nonexistent function prototype The function clocksource_get_next() was removed in commit 75c5158f70 (timekeeping: Update clocksource with stop_machine), but the prototype was not removed with it. Remove the prototype. Signed-off-by: Yingjoe Chen Cc: Cc: Martin Schwidefsky Cc: Cc: John Stultz Link: http://lkml.kernel.org/r/1428674150-1780-1-git-send-email-yingjoe.chen@mediatek.com Signed-off-by: Thomas Gleixner --- include/linux/clocksource.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 135509821c39..a25fc6e873b8 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -181,7 +181,6 @@ static inline s64 clocksource_cyc2ns(cycle_t cycles, u32 mult, u32 shift) extern int clocksource_unregister(struct clocksource*); extern void clocksource_touch_watchdog(void); -extern struct clocksource* clocksource_get_next(void); extern void clocksource_change_rating(struct clocksource *cs, int rating); extern void clocksource_suspend(void); extern void clocksource_resume(void); From 51a03393bac061a4e13fd17214d3ef93a5b296e3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 22 Apr 2015 11:44:15 +0200 Subject: [PATCH 003/106] timekeeping: Remove stale function prototype commit 61edec81d260 "timekeeping: Simplify timekeeping_clocktai()" implemented timekeeping_clocktai() as an inline function, but left the old extern prototype in the header file. Remove it. 
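For readers following the SEQ_printf() conversion in PATCH 001 above: turning a printf-style macro into a real function relies on the standard va_list forwarding pattern. A minimal userspace sketch of the same shape (hypothetical names; vfprintf()/vprintf() stand in for seq_vprintf()/vprintk()):

#include <stdarg.h>
#include <stdio.h>

/* One out-of-line body replaces a macro that expanded the format
 * handling at every call site -- which is where the object size
 * reduction quoted in the changelog comes from. */
static void __attribute__((format(printf, 2, 3)))
seq_printf_demo(FILE *m, const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	if (m)
		vfprintf(m, fmt, args);	/* stand-in for seq_vprintf() */
	else
		vprintf(fmt, args);	/* stand-in for vprintk() */
	va_end(args);
}

int main(void)
{
	seq_printf_demo(stderr, "to a stream: %d\n", 1);
	seq_printf_demo(NULL, "to stdout: %d\n", 2);
	return 0;
}

The format attribute mirrors the __printf(2, 3) annotation in the patch, so the compiler still type-checks the arguments at every call site.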
Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.h | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h index ead8794b9a4e..5b57f6c9ae34 100644 --- a/kernel/time/timekeeping.h +++ b/kernel/time/timekeeping.h @@ -15,7 +15,6 @@ extern u64 timekeeping_max_deferment(void); extern int timekeeping_inject_offset(struct timespec *ts); extern s32 timekeeping_get_tai_offset(void); extern void timekeeping_set_tai_offset(s32 tai_offset); -extern void timekeeping_clocktai(struct timespec *ts); extern int timekeeping_suspend(void); extern void timekeeping_resume(void); From 91e5a2170e795989da9f90c18ba18984f23acc5b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 13 Apr 2015 21:02:22 +0000 Subject: [PATCH 004/106] hrtimer: Document hrtimer_forward[_now]() proper Document the calling context conditions. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20150413210035.178751779@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 17 ++++++++++++++++- kernel/time/hrtimer.c | 8 ++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 05f6df1fdf5b..7770676c387a 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -418,7 +418,22 @@ static inline int hrtimer_callback_running(struct hrtimer *timer) extern u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval); -/* Forward a hrtimer so it expires after the hrtimer's current now */ +/** + * hrtimer_forward_now - forward the timer expiry so it expires after now + * @timer: hrtimer to forward + * @interval: the interval to forward + * + * Forward the timer expiry so it will expire after the current time + * of the hrtimer clock base. Returns the number of overruns. + * + * Can be safely called from the callback function of @timer. If + * called from other contexts @timer must neither be enqueued nor + * running the callback and the caller needs to take care of + * serialization. + * + * Note: This only updates the timer expiry value and does not requeue + * the timer. + */ static inline u64 hrtimer_forward_now(struct hrtimer *timer, ktime_t interval) { diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 76d4bd962b19..b1a74ee3e99a 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -801,6 +801,14 @@ void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) * * Forward the timer expiry so it will expire in the future. * Returns the number of overruns. + * + * Can be safely called from the callback function of @timer. If + * called from other contexts @timer must neither be enqueued nor + * running the callback and the caller needs to take care of + * serialization. + * + * Note: This only updates the timer expiry value and does not requeue + * the timer. */ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) { From d9f0acdeef48570c4e6159d3108f12b64571392e Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 14 Apr 2015 21:08:25 +0000 Subject: [PATCH 005/106] hrtimer: Update active_bases before calling hrtimer_force_reprogram() 'active_bases' indicates which clock bases have active timers. The intention of this bit field was to avoid evaluating inactive bases. It was introduced with the introduction of the BOOTTIME and TAI clock bases, but it was never brought into full use.
We want to use it now, but in __remove_hrtimer() the update happens after the call to hrtimer_force_reprogram(), which has to evaluate all clock bases to find the next expiring timer. So in case the last timer of a clock base got removed, we still see the active bit and therefore evaluate the clock base for no benefit. There are further optimizations possible when active_bases is updated in the right place. Move the update before the call to hrtimer_force_reprogram(). [ tglx: Massaged changelog ] Signed-off-by: Viresh Kumar Reviewed-by: Preeti U Murthy Acked-by: Peter Zijlstra Cc: Marcelo Tosatti Cc: Frederic Weisbecker Cc: linaro-kernel@lists.linaro.org Link: http://lkml.kernel.org/r/20150414203500.533438642@linutronix.de Link: http://lkml.kernel.org/r/c7c8ebcd9ed88bb09d76059c745a1fafb48314e7.1428039899.git.viresh.kumar@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/hrtimer.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index b1a74ee3e99a..9abd50b60e83 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -887,6 +887,9 @@ static void __remove_hrtimer(struct hrtimer *timer, next_timer = timerqueue_getnext(&base->active); timerqueue_del(&base->active, &timer->node); + if (!timerqueue_getnext(&base->active)) + base->cpu_base->active_bases &= ~(1 << base->index); + if (&timer->node == next_timer) { #ifdef CONFIG_HIGH_RES_TIMERS /* Reprogram the clock event device. if enabled */ @@ -900,8 +903,6 @@ static void __remove_hrtimer(struct hrtimer *timer, } #endif } - if (!timerqueue_getnext(&base->active)) - base->cpu_base->active_bases &= ~(1 << base->index); out: timer->state = newstate; } From 398ca17fb54b212cdc9da7ff4a17a35c48dd2103 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:08:27 +0000 Subject: [PATCH 006/106] hrtimer: Get rid of the resolution field in hrtimer_clock_base The field has no value because all clock bases have the same resolution. The resolution only changes when we switch to high resolution timer mode. We can evaluate that from a single static variable as well. In the !HIGHRES case it's simply a constant. Export the variable, so we can simplify the usage sites. Signed-off-by: Thomas Gleixner Reviewed-by: Preeti U Murthy Acked-by: Peter Zijlstra Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150414203500.645454122@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 6 ++++-- kernel/time/hrtimer.c | 26 +++++++++----------------- kernel/time/timer_list.c | 8 ++++---- 3 files changed, 17 insertions(+), 23 deletions(-) diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 7770676c387a..bc6f91b5443b 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -137,7 +137,6 @@ struct hrtimer_sleeper { * timer to a base on another cpu.
* @clockid: clock id for per_cpu support * @active: red black tree root node for the active timers - * @resolution: the resolution of the clock, in nanoseconds * @get_time: function to retrieve the current time of the clock * @softirq_time: the time when running the hrtimer queue in the softirq * @offset: offset of this clock to the monotonic base @@ -147,7 +146,6 @@ struct hrtimer_clock_base { int index; clockid_t clockid; struct timerqueue_head active; - ktime_t resolution; ktime_t (*get_time)(void); ktime_t softirq_time; ktime_t offset; @@ -295,11 +293,15 @@ extern void hrtimer_peek_ahead_timers(void); extern void clock_was_set_delayed(void); +extern unsigned int hrtimer_resolution; + #else # define MONOTONIC_RES_NSEC LOW_RES_NSEC # define KTIME_MONOTONIC_RES KTIME_LOW_RES +#define hrtimer_resolution LOW_RES_NSEC + static inline void hrtimer_peek_ahead_timers(void) { } /* diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 9abd50b60e83..965687a1fce6 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -66,7 +66,6 @@ */ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = { - .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock), .clock_base = { @@ -74,25 +73,21 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = .index = HRTIMER_BASE_MONOTONIC, .clockid = CLOCK_MONOTONIC, .get_time = &ktime_get, - .resolution = KTIME_LOW_RES, }, { .index = HRTIMER_BASE_REALTIME, .clockid = CLOCK_REALTIME, .get_time = &ktime_get_real, - .resolution = KTIME_LOW_RES, }, { .index = HRTIMER_BASE_BOOTTIME, .clockid = CLOCK_BOOTTIME, .get_time = &ktime_get_boottime, - .resolution = KTIME_LOW_RES, }, { .index = HRTIMER_BASE_TAI, .clockid = CLOCK_TAI, .get_time = &ktime_get_clocktai, - .resolution = KTIME_LOW_RES, }, } }; @@ -478,6 +473,8 @@ static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base) * High resolution timer enabled ? */ static int hrtimer_hres_enabled __read_mostly = 1; +unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC; +EXPORT_SYMBOL_GPL(hrtimer_resolution); /* * Enable / Disable high resolution mode @@ -660,7 +657,7 @@ static void retrigger_next_event(void *arg) */ static int hrtimer_switch_to_hres(void) { - int i, cpu = smp_processor_id(); + int cpu = smp_processor_id(); struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu); unsigned long flags; @@ -676,8 +673,7 @@ static int hrtimer_switch_to_hres(void) return 0; } base->hres_active = 1; - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) - base->clock_base[i].resolution = KTIME_HIGH_RES; + hrtimer_resolution = HIGH_RES_NSEC; tick_setup_sched_timer(); /* "Retrigger" the interrupt to get things going */ @@ -820,8 +816,8 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) if (delta.tv64 < 0) return 0; - if (interval.tv64 < timer->base->resolution.tv64) - interval.tv64 = timer->base->resolution.tv64; + if (interval.tv64 < hrtimer_resolution) + interval.tv64 = hrtimer_resolution; if (unlikely(delta.tv64 >= interval.tv64)) { s64 incr = ktime_to_ns(interval); @@ -963,7 +959,7 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, * timeouts. This will go away with the GTOD framework. 
*/ #ifdef CONFIG_TIME_LOW_RES - tim = ktime_add_safe(tim, base->resolution); + tim = ktime_add_safe(tim, ktime_set(0, hrtimer_resolution)); #endif } @@ -1193,12 +1189,8 @@ EXPORT_SYMBOL_GPL(hrtimer_init); */ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) { - struct hrtimer_cpu_base *cpu_base; - int base = hrtimer_clockid_to_base(which_clock); - - cpu_base = raw_cpu_ptr(&hrtimer_bases); - *tp = ktime_to_timespec(cpu_base->clock_base[base].resolution); - + tp->tv_sec = 0; + tp->tv_nsec = hrtimer_resolution; return 0; } EXPORT_SYMBOL_GPL(hrtimer_get_res); diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 5960af2196ac..bdd5e987f115 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -127,10 +127,10 @@ static void print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) { SEQ_printf(m, " .base: %pK\n", base); - SEQ_printf(m, " .index: %d\n", - base->index); - SEQ_printf(m, " .resolution: %Lu nsecs\n", - (unsigned long long)ktime_to_ns(base->resolution)); + SEQ_printf(m, " .index: %d\n", base->index); + + SEQ_printf(m, " .resolution: %u nsecs\n", (unsigned) hrtimer_resolution); + SEQ_printf(m, " .get_time: "); print_name_offset(m, base->get_time); SEQ_printf(m, "\n"); From 1e3176885cce8e0137d6f4072c5910bfa00901ed Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:08:28 +0000 Subject: [PATCH 007/106] net: sched: Use hrtimer_resolution instead of hrtimer_get_res() No point in converting a timespec now that the value is directly accessible. Signed-off-by: Thomas Gleixner Acked-by: David S. Miller Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Cc: Jamal Hadi Salim Link: http://lkml.kernel.org/r/20150414203500.720623028@linutronix.de Signed-off-by: Thomas Gleixner --- net/sched/sch_api.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index ad9eed70bc8f..45bc63ae18e3 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1883,13 +1883,10 @@ EXPORT_SYMBOL(tcf_destroy_chain); #ifdef CONFIG_PROC_FS static int psched_show(struct seq_file *seq, void *v) { - struct timespec ts; - - hrtimer_get_res(CLOCK_MONOTONIC, &ts); seq_printf(seq, "%08x %08x %08x %08x\n", (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1), 1000000, - (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts))); + (u32)NSEC_PER_SEC / hrtimer_resolution); return 0; } From 447fbbdc2cd58cdaf410fefef365a9ce38833157 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:08:30 +0000 Subject: [PATCH 008/106] sound: Use hrtimer_resolution instead of hrtimer_get_res() No point in converting a timespec now that the value is directly accessible. Get rid of the null check while at it. Resolution is guaranteed to be > 0. 
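The userspace-visible counterpart of hrtimer_resolution is what clock_getres() reports. A small sketch to observe it on a live system (assuming only the POSIX clock_getres() interface; link with -lrt on older glibc):

#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec ts;

	/* With high resolution timers active this typically prints
	 * 0.000000001 s; in low resolution mode it is the tick
	 * period, e.g. 0.004000000 s at HZ=250. */
	if (clock_getres(CLOCK_MONOTONIC, &ts)) {
		perror("clock_getres");
		return 1;
	}
	printf("CLOCK_MONOTONIC resolution: %ld.%09ld s\n",
	       (long)ts.tv_sec, ts.tv_nsec);
	return 0;
}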
Signed-off-by: Thomas Gleixner Acked-by: Takashi Iwai Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Cc: Jaroslav Kysela Cc: alsa-devel@alsa-project.org Link: http://lkml.kernel.org/r/20150414203500.799133359@linutronix.de Signed-off-by: Thomas Gleixner --- sound/core/hrtimer.c | 9 +-------- sound/drivers/pcsp/pcsp.c | 15 ++++++--------- 2 files changed, 7 insertions(+), 17 deletions(-) diff --git a/sound/core/hrtimer.c b/sound/core/hrtimer.c index 886be7da989d..f845ecf7e172 100644 --- a/sound/core/hrtimer.c +++ b/sound/core/hrtimer.c @@ -121,16 +121,9 @@ static struct snd_timer *mytimer; static int __init snd_hrtimer_init(void) { struct snd_timer *timer; - struct timespec tp; int err; - hrtimer_get_res(CLOCK_MONOTONIC, &tp); - if (tp.tv_sec > 0 || !tp.tv_nsec) { - pr_err("snd-hrtimer: Invalid resolution %u.%09u", - (unsigned)tp.tv_sec, (unsigned)tp.tv_nsec); - return -EINVAL; - } - resolution = tp.tv_nsec; + resolution = hrtimer_resolution; /* Create a new timer and set up the fields */ err = snd_timer_global_new("hrtimer", SNDRV_TIMER_GLOBAL_HRTIMER, diff --git a/sound/drivers/pcsp/pcsp.c b/sound/drivers/pcsp/pcsp.c index d9647bd84d0f..eb54702037cc 100644 --- a/sound/drivers/pcsp/pcsp.c +++ b/sound/drivers/pcsp/pcsp.c @@ -42,16 +42,13 @@ struct snd_pcsp pcsp_chip; static int snd_pcsp_create(struct snd_card *card) { static struct snd_device_ops ops = { }; - struct timespec tp; - int err; - int div, min_div, order; - - hrtimer_get_res(CLOCK_MONOTONIC, &tp); + unsigned int resolution = hrtimer_resolution; + int err, div, min_div, order; if (!nopcm) { - if (tp.tv_sec || tp.tv_nsec > PCSP_MAX_PERIOD_NS) { + if (resolution > PCSP_MAX_PERIOD_NS) { printk(KERN_ERR "PCSP: Timer resolution is not sufficient " - "(%linS)\n", tp.tv_nsec); + "(%linS)\n", resolution); printk(KERN_ERR "PCSP: Make sure you have HPET and ACPI " "enabled.\n"); printk(KERN_ERR "PCSP: Turned into nopcm mode.\n"); @@ -59,13 +56,13 @@ static int snd_pcsp_create(struct snd_card *card) } } - if (loops_per_jiffy >= PCSP_MIN_LPJ && tp.tv_nsec <= PCSP_MIN_PERIOD_NS) + if (loops_per_jiffy >= PCSP_MIN_LPJ && resolution <= PCSP_MIN_PERIOD_NS) min_div = MIN_DIV; else min_div = MAX_DIV; #if PCSP_DEBUG printk(KERN_DEBUG "PCSP: lpj=%li, min_div=%i, res=%li\n", - loops_per_jiffy, min_div, tp.tv_nsec); + loops_per_jiffy, min_div, resolution); #endif div = MAX_DIV / min_div; From 056a3cacbc46e5aca27b350ce4ecb3b33ebb0700 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:08:32 +0000 Subject: [PATCH 009/106] hrtimer: Get rid of hrtimer_get_res() The resolution is directly accessible now. So it's simpler just to fill in the values of the timespec and be done with it.
Text size reduction (combined with "hrtimer: Get rid of the resolution field in hrtimer_clock_base"): x8664 -61, i386 -221, ARM -60, power64 -48 Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150414203500.879888080@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 1 - kernel/time/alarmtimer.c | 6 +++--- kernel/time/hrtimer.c | 16 ---------------- kernel/time/posix-timers.c | 17 ++++++++++++----- 4 files changed, 15 insertions(+), 25 deletions(-) diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index bc6f91b5443b..8025156c8fa1 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -385,7 +385,6 @@ static inline int hrtimer_restart(struct hrtimer *timer) /* Query timers: */ extern ktime_t hrtimer_get_remaining(const struct hrtimer *timer); -extern int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp); extern ktime_t hrtimer_get_next_event(void); diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 1b001ed1edb9..0b55a7570c90 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -495,12 +495,12 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, */ static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp) { - clockid_t baseid = alarm_bases[clock2alarm(which_clock)].base_clockid; - if (!alarmtimer_get_rtcdev()) return -EINVAL; - return hrtimer_get_res(baseid, tp); + tp->tv_sec = 0; + tp->tv_nsec = hrtimer_resolution; + return 0; } /** diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 965687a1fce6..73131dab787e 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1179,22 +1179,6 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, } EXPORT_SYMBOL_GPL(hrtimer_init); -/** - * hrtimer_get_res - get the timer resolution for a clock - * @which_clock: which clock to query - * @tp: pointer to timespec variable to store the resolution - * - * Store the resolution of the clock selected by @which_clock in the - * variable pointed to by @tp. 
- */ -int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) -{ - tp->tv_sec = 0; - tp->tv_nsec = hrtimer_resolution; - return 0; -} -EXPORT_SYMBOL_GPL(hrtimer_get_res); - static void __run_hrtimer(struct hrtimer *timer, ktime_t *now) { struct hrtimer_clock_base *base = timer->base; diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 31ea01f42e1f..31d11ac9fa47 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -272,13 +272,20 @@ static int posix_get_tai(clockid_t which_clock, struct timespec *tp) return 0; } +static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec *tp) +{ + tp->tv_sec = 0; + tp->tv_nsec = hrtimer_resolution; + return 0; +} + /* * Initialize everything, well, just everything in Posix clocks/timers ;) */ static __init int init_posix_timers(void) { struct k_clock clock_realtime = { - .clock_getres = hrtimer_get_res, + .clock_getres = posix_get_hrtimer_res, .clock_get = posix_clock_realtime_get, .clock_set = posix_clock_realtime_set, .clock_adj = posix_clock_realtime_adj, @@ -290,7 +297,7 @@ static __init int init_posix_timers(void) .timer_del = common_timer_del, }; struct k_clock clock_monotonic = { - .clock_getres = hrtimer_get_res, + .clock_getres = posix_get_hrtimer_res, .clock_get = posix_ktime_get_ts, .nsleep = common_nsleep, .nsleep_restart = hrtimer_nanosleep_restart, @@ -300,7 +307,7 @@ static __init int init_posix_timers(void) .timer_del = common_timer_del, }; struct k_clock clock_monotonic_raw = { - .clock_getres = hrtimer_get_res, + .clock_getres = posix_get_hrtimer_res, .clock_get = posix_get_monotonic_raw, }; struct k_clock clock_realtime_coarse = { @@ -312,7 +319,7 @@ static __init int init_posix_timers(void) .clock_get = posix_get_monotonic_coarse, }; struct k_clock clock_tai = { - .clock_getres = hrtimer_get_res, + .clock_getres = posix_get_hrtimer_res, .clock_get = posix_get_tai, .nsleep = common_nsleep, .nsleep_restart = hrtimer_nanosleep_restart, @@ -322,7 +329,7 @@ static __init int init_posix_timers(void) .timer_del = common_timer_del, }; struct k_clock clock_boottime = { - .clock_getres = hrtimer_get_res, + .clock_getres = posix_get_hrtimer_res, .clock_get = posix_get_boottime, .nsleep = common_nsleep, .nsleep_restart = hrtimer_nanosleep_restart, From a6ffebce7f89f6f97cc22838a5d4383b15d6774f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:08:34 +0000 Subject: [PATCH 010/106] hrtimer: Make the statistics fields smaller No point in having unsigned long for /proc/timer_list statistics. Make them unsigned int.
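A quick way to see the saving is to compare the two layouts in isolation; a minimal sketch with hypothetical stand-in structs (long long standing in for the 64-bit ktime_t):

#include <stdio.h>

/* Stand-ins for the hrtimer_cpu_base statistics block: narrowing
 * the counters from unsigned long to unsigned int saves 4 bytes
 * per field on an LP64 target. */
struct stats_old {
	unsigned long nr_events;
	unsigned long nr_retries;
	unsigned long nr_hangs;
	long long max_hang_time;	/* ktime_t is a 64-bit value */
};

struct stats_new {
	unsigned int nr_events;
	unsigned int nr_retries;
	unsigned int nr_hangs;
	unsigned int max_hang_time;
};

int main(void)
{
	printf("old: %zu bytes, new: %zu bytes\n",
	       sizeof(struct stats_old), sizeof(struct stats_new));
	return 0;
}

On an LP64 target this prints 32 versus 16 bytes, i.e. the statistics block shrinks by half.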
Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150414203500.959773467@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 8 ++++---- kernel/time/hrtimer.c | 4 ++-- kernel/time/timer_list.c | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 8025156c8fa1..d39f2847754c 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -187,10 +187,10 @@ struct hrtimer_cpu_base { int in_hrtirq; int hres_active; int hang_detected; - unsigned long nr_events; - unsigned long nr_retries; - unsigned long nr_hangs; - ktime_t max_hang_time; + unsigned int nr_events; + unsigned int nr_retries; + unsigned int nr_hangs; + unsigned int max_hang_time; #endif struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES]; }; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 73131dab787e..874e0914eb3c 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1327,8 +1327,8 @@ retry: cpu_base->hang_detected = 1; raw_spin_unlock(&cpu_base->lock); delta = ktime_sub(now, entry_time); - if (delta.tv64 > cpu_base->max_hang_time.tv64) - cpu_base->max_hang_time = delta; + if ((unsigned int)delta.tv64 > cpu_base->max_hang_time) + cpu_base->max_hang_time = (unsigned int) delta.tv64; /* * Limit it to a sensible value as we enforce a longer * delay. Give the CPU at least 100ms to catch up. diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index bdd5e987f115..6232fc536185 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -165,7 +165,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) P(nr_events); P(nr_retries); P(nr_hangs); - P_ns(max_hang_time); + P(max_hang_time); #endif #undef P #undef P_ns From 21d6d52a1b7028e6a6840bd82e354aefa9a5e203 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:08:35 +0000 Subject: [PATCH 011/106] hrtimer: Get rid of softirq time The softirq time field in the clock bases is an optimization from the early days of hrtimers. It provides a coarse "jiffies" like time mostly for self-rearming timers. But that comes with a price: - Larger code size - Extra storage space - Duplicated functions with really small differences The benefit of this optimization is marginal for contemporary systems. Consolidate everything on the high resolution timer implementation. This makes further optimizations possible.
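The consolidation works because every clock base's "now" is derivable from a single monotonic read plus a per-base offset, which is what hrtimer_update_base() does in the diff below. A toy model with made-up offset values (ktime_ns is a hypothetical stand-in for ktime_t):

#include <stdio.h>

typedef long long ktime_ns;

int main(void)
{
	/* One clock read ... */
	ktime_ns mono = 123456789;
	/* ... plus per-base offsets (fabricated numbers; the kernel
	 * maintains them in the timekeeper) yields "now" for every
	 * base, so no per-base softirq_time snapshot is needed. */
	ktime_ns offs_real = 1429000000000000000LL;
	ktime_ns offs_boot = 5000000000LL;
	ktime_ns offs_tai = 1429000035000000000LL;

	printf("monotonic %lld\n", mono);
	printf("realtime  %lld\n", mono + offs_real);
	printf("boottime  %lld\n", mono + offs_boot);
	printf("tai       %lld\n", mono + offs_tai);
	return 0;
}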
Text size reduction: x8664 -95, i386 -356, ARM -148, ARM64 -40, power64 -16 Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150414203501.039977424@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 24 ++----- kernel/time/hrtimer.c | 148 +++++++++++++++----------------------- kernel/time/timekeeping.c | 32 --------- kernel/time/timekeeping.h | 3 - 4 files changed, 64 insertions(+), 143 deletions(-) diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index d39f2847754c..e292830b58f0 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -138,7 +138,6 @@ struct hrtimer_sleeper { * @clockid: clock id for per_cpu support * @active: red black tree root node for the active timers * @get_time: function to retrieve the current time of the clock - * @softirq_time: the time when running the hrtimer queue in the softirq * @offset: offset of this clock to the monotonic base */ struct hrtimer_clock_base { @@ -147,7 +146,6 @@ struct hrtimer_clock_base { clockid_t clockid; struct timerqueue_head active; ktime_t (*get_time)(void); - ktime_t softirq_time; ktime_t offset; }; @@ -260,19 +258,16 @@ static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer) return ktime_sub(timer->node.expires, timer->base->get_time()); } -#ifdef CONFIG_HIGH_RES_TIMERS -struct clock_event_device; - -extern void hrtimer_interrupt(struct clock_event_device *dev); - -/* - * In high resolution mode the time reference must be read accurate - */ static inline ktime_t hrtimer_cb_get_time(struct hrtimer *timer) { return timer->base->get_time(); } +#ifdef CONFIG_HIGH_RES_TIMERS +struct clock_event_device; + +extern void hrtimer_interrupt(struct clock_event_device *dev); + static inline int hrtimer_is_hres_active(struct hrtimer *timer) { return timer->base->cpu_base->hres_active; @@ -304,15 +299,6 @@ extern unsigned int hrtimer_resolution; static inline void hrtimer_peek_ahead_timers(void) { } -/* - * In non high resolution mode the time reference is taken from - * the base softirq time variable. - */ -static inline ktime_t hrtimer_cb_get_time(struct hrtimer *timer) -{ - return timer->base->softirq_time; -} - static inline int hrtimer_is_hres_active(struct hrtimer *timer) { return 0; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 874e0914eb3c..9e111dd83ca3 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -104,27 +104,6 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id) return hrtimer_clock_to_base_table[clock_id]; } - -/* - * Get the coarse grained time at the softirq based on xtime and - * wall_to_monotonic. 
- */ -static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) -{ - ktime_t xtim, mono, boot, tai; - ktime_t off_real, off_boot, off_tai; - - mono = ktime_get_update_offsets_tick(&off_real, &off_boot, &off_tai); - boot = ktime_add(mono, off_boot); - xtim = ktime_add(mono, off_real); - tai = ktime_add(mono, off_tai); - - base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim; - base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono; - base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot; - base->clock_base[HRTIMER_BASE_TAI].softirq_time = tai; -} - /* * Functions and macros which are different for UP/SMP systems are kept in a * single place @@ -466,6 +445,15 @@ static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base) } #endif +static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) +{ + ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; + ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; + ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; + + return ktime_get_update_offsets_now(offs_real, offs_boot, offs_tai); +} + /* High resolution timer related functions */ #ifdef CONFIG_HIGH_RES_TIMERS @@ -516,7 +504,12 @@ static inline int hrtimer_hres_active(void) static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) { - ktime_t expires_next = __hrtimer_get_next_event(cpu_base); + ktime_t expires_next; + + if (!cpu_base->hres_active) + return; + + expires_next = __hrtimer_get_next_event(cpu_base); if (skip_equal && expires_next.tv64 == cpu_base->expires_next.tv64) return; @@ -625,15 +618,6 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) base->hres_active = 0; } -static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) -{ - ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; - ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; - ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; - - return ktime_get_update_offsets_now(offs_real, offs_boot, offs_tai); -} - /* * Retrigger next event is called after clock was set * @@ -1179,10 +1163,10 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, } EXPORT_SYMBOL_GPL(hrtimer_init); -static void __run_hrtimer(struct hrtimer *timer, ktime_t *now) +static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, + struct hrtimer_clock_base *base, + struct hrtimer *timer, ktime_t *now) { - struct hrtimer_clock_base *base = timer->base; - struct hrtimer_cpu_base *cpu_base = base->cpu_base; enum hrtimer_restart (*fn)(struct hrtimer *); int restart; @@ -1219,34 +1203,9 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now) timer->state &= ~HRTIMER_STATE_CALLBACK; } -#ifdef CONFIG_HIGH_RES_TIMERS - -/* - * High resolution timer interrupt - * Called with interrupts disabled - */ -void hrtimer_interrupt(struct clock_event_device *dev) +static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now) { - struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); - ktime_t expires_next, now, entry_time, delta; - int i, retries = 0; - - BUG_ON(!cpu_base->hres_active); - cpu_base->nr_events++; - dev->next_event.tv64 = KTIME_MAX; - - raw_spin_lock(&cpu_base->lock); - entry_time = now = hrtimer_update_base(cpu_base); -retry: - cpu_base->in_hrtirq = 1; - /* - * We set expires_next to KTIME_MAX here with cpu_base->lock - * held to prevent that a timer is enqueued in our queue via - * the migration code. 
This does not affect enqueueing of - * timers which run their callback and need to be requeued on - * this CPU. - */ - cpu_base->expires_next.tv64 = KTIME_MAX; + int i; for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { struct hrtimer_clock_base *base; @@ -1279,9 +1238,42 @@ retry: if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer)) break; - __run_hrtimer(timer, &basenow); + __run_hrtimer(cpu_base, base, timer, &basenow); } } +} + +#ifdef CONFIG_HIGH_RES_TIMERS + +/* + * High resolution timer interrupt + * Called with interrupts disabled + */ +void hrtimer_interrupt(struct clock_event_device *dev) +{ + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); + ktime_t expires_next, now, entry_time, delta; + int retries = 0; + + BUG_ON(!cpu_base->hres_active); + cpu_base->nr_events++; + dev->next_event.tv64 = KTIME_MAX; + + raw_spin_lock(&cpu_base->lock); + entry_time = now = hrtimer_update_base(cpu_base); +retry: + cpu_base->in_hrtirq = 1; + /* + * We set expires_next to KTIME_MAX here with cpu_base->lock + * held to prevent that a timer is enqueued in our queue via + * the migration code. This does not affect enqueueing of + * timers which run their callback and need to be requeued on + * this CPU. + */ + cpu_base->expires_next.tv64 = KTIME_MAX; + + __hrtimer_run_queues(cpu_base, now); + /* Reevaluate the clock bases for the next expiry */ expires_next = __hrtimer_get_next_event(cpu_base); /* @@ -1416,38 +1408,16 @@ void hrtimer_run_pending(void) */ void hrtimer_run_queues(void) { - struct timerqueue_node *node; struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); - struct hrtimer_clock_base *base; - int index, gettime = 1; + ktime_t now; if (hrtimer_hres_active()) return; - for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) { - base = &cpu_base->clock_base[index]; - if (!timerqueue_getnext(&base->active)) - continue; - - if (gettime) { - hrtimer_get_softirq_time(cpu_base); - gettime = 0; - } - - raw_spin_lock(&cpu_base->lock); - - while ((node = timerqueue_getnext(&base->active))) { - struct hrtimer *timer; - - timer = container_of(node, struct hrtimer, node); - if (base->softirq_time.tv64 <= - hrtimer_get_expires_tv64(timer)) - break; - - __run_hrtimer(timer, &base->softirq_time); - } - raw_spin_unlock(&cpu_base->lock); - } + raw_spin_lock(&cpu_base->lock); + now = hrtimer_update_base(cpu_base); + __hrtimer_run_queues(cpu_base, now); + raw_spin_unlock(&cpu_base->lock); } /* diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 946acb72179f..dd1efa6a4ea4 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1925,37 +1925,6 @@ void do_timer(unsigned long ticks) calc_global_load(ticks); } -/** - * ktime_get_update_offsets_tick - hrtimer helper - * @offs_real: pointer to storage for monotonic -> realtime offset - * @offs_boot: pointer to storage for monotonic -> boottime offset - * @offs_tai: pointer to storage for monotonic -> clock tai offset - * - * Returns monotonic time at last tick and various offsets - */ -ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real, ktime_t *offs_boot, - ktime_t *offs_tai) -{ - struct timekeeper *tk = &tk_core.timekeeper; - unsigned int seq; - ktime_t base; - u64 nsecs; - - do { - seq = read_seqcount_begin(&tk_core.seq); - - base = tk->tkr_mono.base; - nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; - - *offs_real = tk->offs_real; - *offs_boot = tk->offs_boot; - *offs_tai = tk->offs_tai; - } while (read_seqcount_retry(&tk_core.seq, seq)); - - return ktime_add_ns(base, nsecs); 
-} - -#ifdef CONFIG_HIGH_RES_TIMERS /** * ktime_get_update_offsets_now - hrtimer helper * @offs_real: pointer to storage for monotonic -> realtime offset @@ -1986,7 +1955,6 @@ ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot, return ktime_add_ns(base, nsecs); } -#endif /** * do_adjtimex() - Accessor function to NTP __do_adjtimex function diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h index 5b57f6c9ae34..4d177fce37d5 100644 --- a/kernel/time/timekeeping.h +++ b/kernel/time/timekeeping.h @@ -3,9 +3,6 @@ /* * Internal interfaces for kernel/time/ */ -extern ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real, - ktime_t *offs_boot, - ktime_t *offs_tai); extern ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot, ktime_t *offs_tai); From 868a3e915f7f5eba8f8cb4f7da2276760807c51c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:08:37 +0000 Subject: [PATCH 012/106] hrtimer: Make offset update smarter On every tick/hrtimer interrupt we update the offset variables of the clock bases. That's silly because these offsets change very seldom. Add a sequence counter to the time keeping code which keeps track of the offset updates (clock_was_set()). Have a sequence cache in the hrtimer cpu bases to evaluate whether the offsets must be updated or not. This allows us later to avoid pointless cacheline pollution. Signed-off-by: Thomas Gleixner Reviewed-by: Preeti U Murthy Acked-by: Peter Zijlstra Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Cc: John Stultz Link: http://lkml.kernel.org/r/20150414203501.132820245@linutronix.de Signed-off-by: Thomas Gleixner Cc: John Stultz --- include/linux/hrtimer.h | 4 ++-- include/linux/timekeeper_internal.h | 2 ++ kernel/time/hrtimer.c | 3 ++- kernel/time/timekeeping.c | 23 ++++++++++++++++------- kernel/time/timekeeping.h | 7 ++++--- 5 files changed, 26 insertions(+), 13 deletions(-) diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index e292830b58f0..5e04f8fc26f6 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -163,7 +163,7 @@ enum hrtimer_base_type { * and timers * @cpu: cpu number * @active_bases: Bitfield to mark bases with active timers - * @clock_was_set: Indicates that clock was set from irq context. 
+ * @clock_was_set_seq: Sequence counter of clock was set events * @expires_next: absolute time of the next event which was scheduled * via clock_set_next_event() * @in_hrtirq: hrtimer_interrupt() is currently executing @@ -179,7 +179,7 @@ struct hrtimer_cpu_base { raw_spinlock_t lock; unsigned int cpu; unsigned int active_bases; - unsigned int clock_was_set; + unsigned int clock_was_set_seq; #ifdef CONFIG_HIGH_RES_TIMERS ktime_t expires_next; int in_hrtirq; diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index fb86963859c7..6f8276ae579c 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -49,6 +49,7 @@ struct tk_read_base { * @offs_boot: Offset clock monotonic -> clock boottime * @offs_tai: Offset clock monotonic -> clock tai * @tai_offset: The current UTC to TAI offset in seconds + * @clock_was_set_seq: The sequence number of clock was set events * @raw_time: Monotonic raw base time in timespec64 format * @cycle_interval: Number of clock cycles in one NTP interval * @xtime_interval: Number of clock shifted nano seconds in one NTP @@ -85,6 +86,7 @@ struct timekeeper { ktime_t offs_boot; ktime_t offs_tai; s32 tai_offset; + unsigned int clock_was_set_seq; struct timespec64 raw_time; /* The following members are for timekeeping internal use */ diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 9e111dd83ca3..8ce9b3138017 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -451,7 +451,8 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; - return ktime_get_update_offsets_now(offs_real, offs_boot, offs_tai); + return ktime_get_update_offsets_now(&base->clock_was_set_seq, + offs_real, offs_boot, offs_tai); } /* High resolution timer related functions */ diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index dd1efa6a4ea4..3365e32dc208 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -602,6 +602,9 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono); update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw); + + if (action & TK_CLOCK_WAS_SET) + tk->clock_was_set_seq++; } /** @@ -1927,15 +1930,19 @@ void do_timer(unsigned long ticks) /** * ktime_get_update_offsets_now - hrtimer helper + * @cwsseq: pointer to check and store the clock was set sequence number * @offs_real: pointer to storage for monotonic -> realtime offset * @offs_boot: pointer to storage for monotonic -> boottime offset * @offs_tai: pointer to storage for monotonic -> clock tai offset * - * Returns current monotonic time and updates the offsets + * Returns current monotonic time and updates the offsets if the + * sequence number in @cwsseq and timekeeper.clock_was_set_seq are + * different. 
+ * * Called from hrtimer_interrupt() or retrigger_next_event() */ -ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot, - ktime_t *offs_tai) +ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real, + ktime_t *offs_boot, ktime_t *offs_tai) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; @@ -1947,10 +1954,12 @@ ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot, base = tk->tkr_mono.base; nsecs = timekeeping_get_ns(&tk->tkr_mono); - - *offs_real = tk->offs_real; - *offs_boot = tk->offs_boot; - *offs_tai = tk->offs_tai; + if (*cwsseq != tk->clock_was_set_seq) { + *cwsseq = tk->clock_was_set_seq; + *offs_real = tk->offs_real; + *offs_boot = tk->offs_boot; + *offs_tai = tk->offs_tai; + } } while (read_seqcount_retry(&tk_core.seq, seq)); return ktime_add_ns(base, nsecs); diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h index 4d177fce37d5..704f595ce83f 100644 --- a/kernel/time/timekeeping.h +++ b/kernel/time/timekeeping.h @@ -3,9 +3,10 @@ /* * Internal interfaces for kernel/time/ */ -extern ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, - ktime_t *offs_boot, - ktime_t *offs_tai); +extern ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, + ktime_t *offs_real, + ktime_t *offs_boot, + ktime_t *offs_tai); extern int timekeeping_valid_for_hres(void); extern u64 timekeeping_max_deferment(void); From e19ffe8be2cd0a1f726b235443eba21e64f6be5e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:08:39 +0000 Subject: [PATCH 013/106] hrtimer: Use bits for various boolean indicators No point in wasting 12 bytes of storage space. Generates better code as well. Text size reduction: x8664 -64, i386 -16, ARM -132, ARM64 -0, power64 -48 Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150414203501.227955358@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 6 +++--- kernel/time/hrtimer.c | 24 ++++++++++++++++-------- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 5e04f8fc26f6..17a59ddcc79a 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -181,10 +181,10 @@ struct hrtimer_cpu_base { unsigned int active_bases; unsigned int clock_was_set_seq; #ifdef CONFIG_HIGH_RES_TIMERS + unsigned int in_hrtirq : 1, + hres_active : 1, + hang_detected : 1; ktime_t expires_next; - int in_hrtirq; - int hres_active; - int hang_detected; unsigned int nr_events; unsigned int nr_retries; unsigned int nr_hangs; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 8ce9b3138017..9bbfe33f6575 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -492,9 +492,14 @@ static inline int hrtimer_is_hres_enabled(void) /* * Is the high resolution mode active ?
*/ +static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base) +{ + return cpu_base->hres_active; +} + static inline int hrtimer_hres_active(void) { - return __this_cpu_read(hrtimer_bases.hres_active); + return __hrtimer_hres_active(this_cpu_ptr(&hrtimer_bases)); } /* @@ -625,7 +633,7 @@ static void retrigger_next_event(void *arg) { struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); - if (!hrtimer_hres_active()) + if (!base->hres_active) return; raw_spin_lock(&base->lock); @@ -685,6 +690,7 @@ void clock_was_set_delayed(void) #else +static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *b) { return 0; } static inline int hrtimer_hres_active(void) { return 0; } static inline int hrtimer_is_hres_enabled(void) { return 0; } static inline int hrtimer_switch_to_hres(void) { return 0; } @@ -862,25 +868,27 @@ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, unsigned long newstate, int reprogram) { + struct hrtimer_cpu_base *cpu_base = base->cpu_base; struct timerqueue_node *next_timer; + if (!(timer->state & HRTIMER_STATE_ENQUEUED)) goto out; next_timer = timerqueue_getnext(&base->active); timerqueue_del(&base->active, &timer->node); if (!timerqueue_getnext(&base->active)) - base->cpu_base->active_bases &= ~(1 << base->index); + cpu_base->active_bases &= ~(1 << base->index); if (&timer->node == next_timer) { #ifdef CONFIG_HIGH_RES_TIMERS /* Reprogram the clock event device. if enabled */ - if (reprogram && hrtimer_hres_active()) { + if (reprogram && cpu_base->hres_active) { ktime_t expires; expires = ktime_sub(hrtimer_get_expires(timer), base->offset); - if (base->cpu_base->expires_next.tv64 == expires.tv64) - hrtimer_force_reprogram(base->cpu_base, 1); + if (cpu_base->expires_next.tv64 == expires.tv64) + hrtimer_force_reprogram(cpu_base, 1); } #endif } @@ -1114,7 +1122,7 @@ ktime_t hrtimer_get_next_event(void) raw_spin_lock_irqsave(&cpu_base->lock, flags); - if (!hrtimer_hres_active()) + if (!__hrtimer_hres_active(cpu_base)) mindelta = ktime_sub(__hrtimer_get_next_event(cpu_base), ktime_get()); @@ -1412,7 +1420,7 @@ void hrtimer_run_queues(void) struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); ktime_t now; - if (hrtimer_hres_active()) + if (__hrtimer_hres_active(cpu_base)) return; raw_spin_lock(&cpu_base->lock); From 34aee88a02ba296a8e8c9523cdf77147731903f1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:08:41 +0000 Subject: [PATCH 014/106] hrtimer: Use cpu_base->active_base for hotpath iterators The active_bases field is guaranteed to be in sync with the timerqueue of the corresponding clock base. So we can use it for iterating over the clock bases. This allows us to break out early if no more active clock bases are available and avoids touching the cache lines of inactive clock bases.
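The iteration pattern is plain bit scanning: shift a copy of active_bases right once per base and skip bases whose bit is clear, so the loop ends at the highest set bit. A self-contained sketch (hypothetical base array):

#include <stdio.h>

#define NR_BASES 4

struct demo_base {
	const char *name;
	int nr_timers;
};

int main(void)
{
	struct demo_base bases[NR_BASES] = {
		{ "monotonic", 2 }, { "realtime", 0 },
		{ "boottime", 1 }, { "tai", 0 },
	};
	struct demo_base *base = bases;
	unsigned int active = 0;
	int i;

	/* Maintained at enqueue/remove time, one bit per base */
	for (i = 0; i < NR_BASES; i++)
		if (bases[i].nr_timers)
			active |= 1U << i;

	/* Hot-path iteration: terminates once the remaining mask is
	 * empty and never touches the cache lines of skipped bases. */
	for (; active; base++, active >>= 1) {
		if (!(active & 0x01))
			continue;
		printf("evaluating %s\n", base->name);
	}
	return 0;
}

With the mask 0b0101 above, only "monotonic" and "boottime" are visited and the loop stops before ever reaching "tai".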
Signed-off-by: Thomas Gleixner Reviewed-by: Preeti U Murthy Acked-by: Peter Zijlstra Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150414203501.322887675@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/hrtimer.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 9bbfe33f6575..fce0ccf97b51 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -419,16 +419,16 @@ static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base) { struct hrtimer_clock_base *base = cpu_base->clock_base; ktime_t expires, expires_next = { .tv64 = KTIME_MAX }; - int i; + unsigned int active = cpu_base->active_bases; - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { + for (; active; base++, active >>= 1) { struct timerqueue_node *next; struct hrtimer *timer; - next = timerqueue_getnext(&base->active); - if (!next) + if (!(active & 0x01)) continue; + next = timerqueue_getnext(&base->active); timer = container_of(next, struct hrtimer, node); expires = ktime_sub(hrtimer_get_expires(timer), base->offset); if (expires.tv64 < expires_next.tv64) @@ -1214,17 +1214,16 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now) { - int i; + struct hrtimer_clock_base *base = cpu_base->clock_base; + unsigned int active = cpu_base->active_bases; - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { - struct hrtimer_clock_base *base; + for (; active; base++, active >>= 1) { struct timerqueue_node *node; ktime_t basenow; - if (!(cpu_base->active_bases & (1 << i))) + if (!(active & 0x01)) continue; - base = cpu_base->clock_base + i; basenow = ktime_add(now, base->offset); while ((node = timerqueue_getnext(&base->active))) { From 6d9a1411393d51f17bee3fe163430b21b2cb2de9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:08:42 +0000 Subject: [PATCH 015/106] hrtimer: Cache line align the hrtimer cpu base We really want that data structure to start at a cache line boundary. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150414203501.417597627@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 17a59ddcc79a..0853f52f8ffb 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -191,7 +191,7 @@ struct hrtimer_cpu_base { unsigned int max_hang_time; #endif struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES]; -}; +} ____cacheline_aligned; static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time) { From b8e38413ac2c33c497e72895fcd5da709fd1b908 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:08:44 +0000 Subject: [PATCH 016/106] hrtimer: Align the hrtimer clock bases as well We don't use cacheline_align here because that might waste a lot of space on 32bit machines with 64 byte cachelines and on 64bit machines with 128 byte cachelines. The size of struct hrtimer_clock_base is 64 bytes on 64bit and 32 bytes on 32bit machines. So we utilize the cache lines properly.
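The effect of per-element alignment can be checked in isolation; a minimal C11 sketch with a toy struct (32 mirrors the 32bit value of HRTIMER_CLOCK_BASE_ALIGN; the _Static_assert plays the role of the BUILD_BUG_ON() added to hrtimer_set_expires() in the diff below):

#include <stdio.h>
#include <stddef.h>

/* Toy stand-in for hrtimer_clock_base: the payload is padded up to
 * a 32 byte slot, so array elements never straddle a boundary. */
struct demo_clock_base {
	void *cpu_base;
	int index;
	long long offset;
} __attribute__((__aligned__(32)));

/* Fail the build if the struct outgrows its alignment slot. */
_Static_assert(sizeof(struct demo_clock_base) <= 32,
	       "demo_clock_base larger than its alignment");

int main(void)
{
	struct demo_clock_base bases[4];

	printf("sizeof=%zu stride=%zu\n", sizeof(struct demo_clock_base),
	       (size_t)((char *)&bases[1] - (char *)&bases[0]));
	return 0;
}

On LP64 both values print as 32: the 24 byte payload is padded out so each array element occupies exactly one aligned slot.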
Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150414203501.498165771@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 0853f52f8ffb..e5c22d611850 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -130,6 +130,12 @@ struct hrtimer_sleeper { struct task_struct *task; }; +#ifdef CONFIG_64BIT +# define HRTIMER_CLOCK_BASE_ALIGN 64 +#else +# define HRTIMER_CLOCK_BASE_ALIGN 32 +#endif + /** * struct hrtimer_clock_base - the timer base for a specific clock * @cpu_base: per cpu clock base @@ -147,7 +153,7 @@ struct hrtimer_clock_base { struct timerqueue_head active; ktime_t (*get_time)(void); ktime_t offset; -}; +} __attribute__((__aligned__(HRTIMER_CLOCK_BASE_ALIGN))); enum hrtimer_base_type { HRTIMER_BASE_MONOTONIC, @@ -195,6 +201,8 @@ struct hrtimer_cpu_base { static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time) { + BUILD_BUG_ON(sizeof(struct hrtimer_clock_base) > HRTIMER_CLOCK_BASE_ALIGN); + timer->node.expires = time; timer->_softexpires = time; } From c320642e1ced3b81592610e374894fea995f475b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:08:46 +0000 Subject: [PATCH 017/106] timerqueue: Let timerqueue_add/del return information The hrtimer code is interested in whether the added timer is the first one to expire and whether the removed timer was the last one in the tree. The add/del routines have that information already. So we can return it right away instead of reevaluating it at the call site. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Cc: John Stultz Link: http://lkml.kernel.org/r/20150414203501.579063647@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/timerqueue.h | 8 ++++---- lib/timerqueue.c | 10 +++++++--- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/include/linux/timerqueue.h b/include/linux/timerqueue.h index a520fd70a59f..7eec17ad7fa1 100644 --- a/include/linux/timerqueue.h +++ b/include/linux/timerqueue.h @@ -16,10 +16,10 @@ struct timerqueue_head { }; -extern void timerqueue_add(struct timerqueue_head *head, - struct timerqueue_node *node); -extern void timerqueue_del(struct timerqueue_head *head, - struct timerqueue_node *node); +extern bool timerqueue_add(struct timerqueue_head *head, + struct timerqueue_node *node); +extern bool timerqueue_del(struct timerqueue_head *head, + struct timerqueue_node *node); extern struct timerqueue_node *timerqueue_iterate_next( struct timerqueue_node *node); diff --git a/lib/timerqueue.c b/lib/timerqueue.c index a382e4a32609..782ae8ca2c06 100644 --- a/lib/timerqueue.c +++ b/lib/timerqueue.c @@ -36,7 +36,7 @@ * Adds the timer node to the timerqueue, sorted by the * node's expires value.
*/ -void timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node) +bool timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node) { struct rb_node **p = &head->head.rb_node; struct rb_node *parent = NULL; @@ -56,8 +56,11 @@ void timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node) rb_link_node(&node->node, parent, p); rb_insert_color(&node->node, &head->head); - if (!head->next || node->expires.tv64 < head->next->expires.tv64) + if (!head->next || node->expires.tv64 < head->next->expires.tv64) { head->next = node; + return true; + } + return false; } EXPORT_SYMBOL_GPL(timerqueue_add); @@ -69,7 +72,7 @@ EXPORT_SYMBOL_GPL(timerqueue_add); * * Removes the timer node from the timerqueue. */ -void timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node) +bool timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node) { WARN_ON_ONCE(RB_EMPTY_NODE(&node->node)); @@ -82,6 +85,7 @@ void timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node) } rb_erase(&node->node, &head->head); RB_CLEAR_NODE(&node->node); + return head->next != NULL; } EXPORT_SYMBOL_GPL(timerqueue_del); From b97f44c9b658d52e0139c947ea5519e51ba38d81 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:08:47 +0000 Subject: [PATCH 018/106] hrtimer: Make use of timerqueue_add/del return values Use the return value instead of reevaluating the information. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150414203501.658152945@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/hrtimer.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index fce0ccf97b51..0cd1e0b8099d 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -842,7 +842,6 @@ static int enqueue_hrtimer(struct hrtimer *timer, { debug_activate(timer); - timerqueue_add(&base->active, &timer->node); base->cpu_base->active_bases |= 1 << base->index; /* @@ -851,7 +850,7 @@ static int enqueue_hrtimer(struct hrtimer *timer, */ timer->state |= HRTIMER_STATE_ENQUEUED; - return (&timer->node == base->active.next); + return timerqueue_add(&base->active, &timer->node); } /* @@ -875,8 +874,7 @@ static void __remove_hrtimer(struct hrtimer *timer, goto out; next_timer = timerqueue_getnext(&base->active); - timerqueue_del(&base->active, &timer->node); - if (!timerqueue_getnext(&base->active)) + if (!timerqueue_del(&base->active, &timer->node)) cpu_base->active_bases &= ~(1 << base->index); if (&timer->node == next_timer) { From 895bdfa793f6e912d1a58fc445b3dd4d686f7bd3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:08:49 +0000 Subject: [PATCH 019/106] hrtimer: Keep pointer to first timer and simplify __remove_hrtimer() __remove_hrtimer() needs to evaluate the expiry time to figure out whether the timer being removed is the first expiring timer on the cpu. Keep a pointer to it, which is lazily updated, so we can avoid the evaluation dance and retrieve the information from there. Generates slightly better code.
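The core of the simplification is replacing an expiry recomputation with a pointer comparison against a lazily maintained cache; a minimal sketch with hypothetical demo types:

#include <stdio.h>

struct demo_timer {
	long long expires;
};

struct demo_cpu_base {
	long long expires_next;
	struct demo_timer *next_timer;	/* lazily updated cache */
};

/* Old scheme: recompute the removed timer's effective expiry and
 * compare it with the programmed event. */
static int was_first_old(struct demo_cpu_base *cb, struct demo_timer *t,
			 long long offset)
{
	return t->expires - offset == cb->expires_next;
}

/* New scheme: one pointer compare, no arithmetic. The cache may be
 * stale after a remote removal, which at worst causes one extra
 * reprogram -- it is never dereferenced. */
static int was_first_new(struct demo_cpu_base *cb, struct demo_timer *t)
{
	return t == cb->next_timer;
}

int main(void)
{
	struct demo_timer t = { .expires = 1000 };
	struct demo_cpu_base cb = { .expires_next = 1000, .next_timer = &t };

	printf("old: %d, new: %d\n",
	       was_first_old(&cb, &t, 0), was_first_new(&cb, &t));
	return 0;
}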
Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150414203501.752838019@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 6 ++++++ kernel/time/hrtimer.c | 46 +++++++++++++++++++++++++---------------- 2 files changed, 34 insertions(+), 18 deletions(-) diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index e5c22d611850..d194c1dacdaa 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -172,6 +172,7 @@ enum hrtimer_base_type { * @clock_was_set_seq: Sequence counter of clock was set events * @expires_next: absolute time of the next event which was scheduled * via clock_set_next_event() + * @next_timer: Pointer to the first expiring timer * @in_hrtirq: hrtimer_interrupt() is currently executing * @hres_active: State of high resolution mode * @hang_detected: The last hrtimer interrupt detected a hang @@ -180,6 +181,10 @@ enum hrtimer_base_type { * @nr_hangs: Total number of hrtimer interrupt hangs * @max_hang_time: Maximum time spent in hrtimer_interrupt * @clock_base: array of clock bases for this cpu + * + * Note: next_timer is just an optimization for __remove_hrtimer(). + * Do not dereference the pointer because it is not reliable on + * cross cpu removals. */ struct hrtimer_cpu_base { raw_spinlock_t lock; @@ -191,6 +196,7 @@ struct hrtimer_cpu_base { hres_active : 1, hang_detected : 1; ktime_t expires_next; + struct hrtimer *next_timer; unsigned int nr_events; unsigned int nr_retries; unsigned int nr_hangs; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 0cd1e0b8099d..30178d0656cf 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -415,12 +415,21 @@ static inline void debug_deactivate(struct hrtimer *timer) } #if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS) +static inline void hrtimer_update_next_timer(struct hrtimer_cpu_base *cpu_base, + struct hrtimer *timer) +{ +#ifdef CONFIG_HIGH_RES_TIMERS + cpu_base->next_timer = timer; +#endif +} + static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base) { struct hrtimer_clock_base *base = cpu_base->clock_base; ktime_t expires, expires_next = { .tv64 = KTIME_MAX }; unsigned int active = cpu_base->active_bases; + hrtimer_update_next_timer(cpu_base, NULL); for (; active; base++, active >>= 1) { struct timerqueue_node *next; struct hrtimer *timer; @@ -431,8 +440,10 @@ static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base) next = timerqueue_getnext(&base->active); timer = container_of(next, struct hrtimer, node); expires = ktime_sub(hrtimer_get_expires(timer), base->offset); - if (expires.tv64 < expires_next.tv64) + if (expires.tv64 < expires_next.tv64) { expires_next = expires; + hrtimer_update_next_timer(cpu_base, timer); + } } /* * clock_was_set() might have changed base->offset of any of @@ -597,6 +608,8 @@ static int hrtimer_reprogram(struct hrtimer *timer, if (cpu_base->in_hrtirq) return 0; + cpu_base->next_timer = timer; + /* * If a hang was detected in the last timer interrupt then we * do not schedule a timer which is earlier than the expiry @@ -868,30 +881,27 @@ static void __remove_hrtimer(struct hrtimer *timer, unsigned long newstate, int reprogram) { struct hrtimer_cpu_base *cpu_base = base->cpu_base; - struct timerqueue_node *next_timer; + unsigned int state = timer->state; - if (!(timer->state & HRTIMER_STATE_ENQUEUED)) - goto out; + timer->state = newstate; + if (!(state & 
HRTIMER_STATE_ENQUEUED)) + return; - next_timer = timerqueue_getnext(&base->active); if (!timerqueue_del(&base->active, &timer->node)) cpu_base->active_bases &= ~(1 << base->index); - if (&timer->node == next_timer) { #ifdef CONFIG_HIGH_RES_TIMERS - /* Reprogram the clock event device. if enabled */ - if (reprogram && cpu_base->hres_active) { - ktime_t expires; - - expires = ktime_sub(hrtimer_get_expires(timer), - base->offset); - if (cpu_base->expires_next.tv64 == expires.tv64) - hrtimer_force_reprogram(cpu_base, 1); - } + /* + * Note: If reprogram is false we do not update + * cpu_base->next_timer. This happens when we remove the first + * timer on a remote cpu. No harm as we never dereference + * cpu_base->next_timer. So the worst thing what can happen is + * an superflous call to hrtimer_force_reprogram() on the + * remote cpu later on if the same timer gets enqueued again. + */ + if (reprogram && timer == cpu_base->next_timer) + hrtimer_force_reprogram(cpu_base, 1); #endif - } -out: - timer->state = newstate; } /* From c6eb3f70d4482806dc2d3e1e3c7736f497b1d418 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:08:51 +0000 Subject: [PATCH 020/106] hrtimer: Get rid of hrtimer softirq hrtimer softirq is a leftover from the initial implementation and serves only the purpose to handle the enqueueing of already expired timers in the high resolution timer mode. We discussed whether we change the return value and force all start sites to handle that the timer is already expired, but that would be a Herculean task and I'm not sure whether its a good idea to enforce that handling on everyone. A simpler solution is to enforce a timer interrupt instead of raising and scheduling a softirq. Just use the existing infrastructure to do so and remove all the softirq leftovers. The HRTIMER softirq enum is now unused, but kept around because trace parsers rely on the existing numbering. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150414203501.840834708@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 1 - include/linux/interrupt.h | 3 +- kernel/time/hrtimer.c | 163 ++++++++++---------------------------- kernel/time/tick-common.c | 10 +++ kernel/time/timer.c | 2 - 5 files changed, 55 insertions(+), 124 deletions(-) diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index d194c1dacdaa..048270a27bc5 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -459,7 +459,6 @@ extern int schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode); /* Soft interrupt function to run the hrtimer queues: */ extern void hrtimer_run_queues(void); -extern void hrtimer_run_pending(void); /* Bootup initialization: */ extern void __init hrtimers_init(void); diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 950ae4501826..6bf15a66bce7 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -413,7 +413,8 @@ enum BLOCK_IOPOLL_SOFTIRQ, TASKLET_SOFTIRQ, SCHED_SOFTIRQ, - HRTIMER_SOFTIRQ, + HRTIMER_SOFTIRQ, /* Unused, but kept as tools rely on the + numbering. Sigh! 
*/ RCU_SOFTIRQ, /* Preferable RCU should always be the last softirq */ NR_SOFTIRQS diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 30178d0656cf..fc6b6d25f93d 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -555,59 +555,48 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) } /* - * Shared reprogramming for clock_realtime and clock_monotonic - * * When a timer is enqueued and expires earlier than the already enqueued * timers, we have to check, whether it expires earlier than the timer for * which the clock event device was armed. * - * Note, that in case the state has HRTIMER_STATE_CALLBACK set, no reprogramming - * and no expiry check happens. The timer gets enqueued into the rbtree. The - * reprogramming and expiry check is done in the hrtimer_interrupt or in the - * softirq. - * * Called with interrupts disabled and base->cpu_base.lock held */ -static int hrtimer_reprogram(struct hrtimer *timer, - struct hrtimer_clock_base *base) +static void hrtimer_reprogram(struct hrtimer *timer, + struct hrtimer_clock_base *base) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); - int res; WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0); /* - * When the callback is running, we do not reprogram the clock event - * device. The timer callback is either running on a different CPU or - * the callback is executed in the hrtimer_interrupt context. The - * reprogramming is handled either by the softirq, which called the - * callback or at the end of the hrtimer_interrupt. + * If the timer is not on the current cpu, we cannot reprogram + * the other cpus clock event device. */ - if (hrtimer_callback_running(timer)) - return 0; + if (base->cpu_base != cpu_base) + return; + + /* + * If the hrtimer interrupt is running, then it will + * reevaluate the clock bases and reprogram the clock event + * device. The callbacks are always executed in hard interrupt + * context so we don't need an extra check for a running + * callback. + */ + if (cpu_base->in_hrtirq) + return; /* * CLOCK_REALTIME timer might be requested with an absolute - * expiry time which is less than base->offset. Nothing wrong - * about that, just avoid to call into the tick code, which - * has now objections against negative expiry values. + * expiry time which is less than base->offset. Set it to 0. */ if (expires.tv64 < 0) - return -ETIME; + expires.tv64 = 0; if (expires.tv64 >= cpu_base->expires_next.tv64) - return 0; - - /* - * When the target cpu of the timer is currently executing - * hrtimer_interrupt(), then we do not touch the clock event - * device. hrtimer_interrupt() will reevaluate all clock bases - * before reprogramming the device. - */ - if (cpu_base->in_hrtirq) - return 0; + return; + /* Update the pointer to the next expiring timer */ cpu_base->next_timer = timer; /* @@ -617,15 +606,14 @@ static int hrtimer_reprogram(struct hrtimer *timer, * to make progress. */ if (cpu_base->hang_detected) - return 0; + return; /* - * Clockevents returns -ETIME, when the event was in the past. + * Program the timer hardware. We enforce the expiry for + * events which are already in the past. 
*/ - res = tick_program_event(expires, 0); - if (!IS_ERR_VALUE(res)) - cpu_base->expires_next = expires; - return res; + cpu_base->expires_next = expires; + tick_program_event(expires, 1); } /* @@ -660,19 +648,11 @@ static void retrigger_next_event(void *arg) */ static int hrtimer_switch_to_hres(void) { - int cpu = smp_processor_id(); - struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu); - unsigned long flags; - - if (base->hres_active) - return 1; - - local_irq_save(flags); + struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); if (tick_init_highres()) { - local_irq_restore(flags); printk(KERN_WARNING "Could not switch to high resolution " - "mode on CPU %d\n", cpu); + "mode on CPU %d\n", base->cpu); return 0; } base->hres_active = 1; @@ -681,7 +661,6 @@ static int hrtimer_switch_to_hres(void) tick_setup_sched_timer(); /* "Retrigger" the interrupt to get things going */ retrigger_next_event(NULL); - local_irq_restore(flags); return 1; } @@ -984,26 +963,8 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, * on dynticks target. */ wake_up_nohz_cpu(new_base->cpu_base->cpu); - } else if (new_base->cpu_base == this_cpu_ptr(&hrtimer_bases) && - hrtimer_reprogram(timer, new_base)) { - /* - * Only allow reprogramming if the new base is on this CPU. - * (it might still be on another CPU if the timer was pending) - * - * XXX send_remote_softirq() ? - */ - if (wakeup) { - /* - * We need to drop cpu_base->lock to avoid a - * lock ordering issue vs. rq->lock. - */ - raw_spin_unlock(&new_base->cpu_base->lock); - raise_softirq_irqoff(HRTIMER_SOFTIRQ); - local_irq_restore(flags); - return ret; - } else { - __raise_softirq_irqoff(HRTIMER_SOFTIRQ); - } + } else { + hrtimer_reprogram(timer, new_base); } unlock_hrtimer_base(timer, &flags); @@ -1354,7 +1315,7 @@ retry: * local version of hrtimer_peek_ahead_timers() called with interrupts * disabled. */ -static void __hrtimer_peek_ahead_timers(void) +static inline void __hrtimer_peek_ahead_timers(void) { struct tick_device *td; @@ -1366,29 +1327,6 @@ static void __hrtimer_peek_ahead_timers(void) hrtimer_interrupt(td->evtdev); } -/** - * hrtimer_peek_ahead_timers -- run soft-expired timers now - * - * hrtimer_peek_ahead_timers will peek at the timer queue of - * the current cpu and check if there are any timers for which - * the soft expires time has passed. If any such timers exist, - * they are run immediately and then removed from the timer queue. - * - */ -void hrtimer_peek_ahead_timers(void) -{ - unsigned long flags; - - local_irq_save(flags); - __hrtimer_peek_ahead_timers(); - local_irq_restore(flags); -} - -static void run_hrtimer_softirq(struct softirq_action *h) -{ - hrtimer_peek_ahead_timers(); -} - #else /* CONFIG_HIGH_RES_TIMERS */ static inline void __hrtimer_peek_ahead_timers(void) { } @@ -1396,31 +1334,7 @@ static inline void __hrtimer_peek_ahead_timers(void) { } #endif /* !CONFIG_HIGH_RES_TIMERS */ /* - * Called from timer softirq every jiffy, expire hrtimers: - * - * For HRT its the fall back code to run the softirq in the timer - * softirq context in case the hrtimer initialization failed or has - * not been done yet. - */ -void hrtimer_run_pending(void) -{ - if (hrtimer_hres_active()) - return; - - /* - * This _is_ ugly: We have to check in the softirq context, - * whether we can switch to highres and / or nohz mode. The - * clocksource switch happens in the timer interrupt with - * xtime_lock held. 
Notification from there only sets the - * check bit in the tick_oneshot code, otherwise we might - * deadlock vs. xtime_lock. - */ - if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) - hrtimer_switch_to_hres(); -} - -/* - * Called from hardirq context every jiffy + * Called from run_local_timers in hardirq context every jiffy */ void hrtimer_run_queues(void) { @@ -1430,6 +1344,18 @@ void hrtimer_run_queues(void) if (__hrtimer_hres_active(cpu_base)) return; + /* + * This _is_ ugly: We have to check periodically, whether we + * can switch to highres and / or nohz mode. The clocksource + * switch happens with xtime_lock held. Notification from + * there only sets the check bit in the tick_oneshot code, + * otherwise we might deadlock vs. xtime_lock. + */ + if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) { + hrtimer_switch_to_hres(); + return; + } + raw_spin_lock(&cpu_base->lock); now = hrtimer_update_base(cpu_base); __hrtimer_run_queues(cpu_base, now); @@ -1700,9 +1626,6 @@ void __init hrtimers_init(void) hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); register_cpu_notifier(&hrtimers_nb); -#ifdef CONFIG_HIGH_RES_TIMERS - open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq); -#endif } /** diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 3ae6afa1eb98..ea5f9eae8f74 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -102,6 +102,16 @@ void tick_handle_periodic(struct clock_event_device *dev) tick_periodic(cpu); +#if defined(CONFIG_HIGH_RES_TIMERS) || defined(CONFIG_NO_HZ_COMMON) + /* + * The cpu might have transitioned to HIGHRES or NOHZ mode via + * update_process_times() -> run_local_timers() -> + * hrtimer_run_queues(). + */ + if (dev->event_handler != tick_handle_periodic) + return; +#endif + if (dev->state != CLOCK_EVT_STATE_ONESHOT) return; for (;;) { diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 2ece3aa5069c..b31f13f4fe41 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1409,8 +1409,6 @@ static void run_timer_softirq(struct softirq_action *h) { struct tvec_base *base = __this_cpu_read(tvec_bases); - hrtimer_run_pending(); - if (time_after_eq(jiffies, base->timer_jiffies)) __run_timers(base); } From afc08b15cc2a3d2c48cbd427be8e0eea05698363 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:08:52 +0000 Subject: [PATCH 021/106] tick: sched: Remove hrtimer_active() checks hrtimer_start() enforces a timer interrupt if the timer is already expired. Get rid of the checks and the forward loop. 
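For illustration, the shape of the change at a typical start site (distilled from the hunks below): the old code had to loop because an already expired timer was not queued,

	for (;;) {
		hrtimer_forward(&ts->sched_timer, now, tick_period);
		hrtimer_start_expires(&ts->sched_timer,
				      HRTIMER_MODE_ABS_PINNED);
		/* Check, if the timer was already in the past */
		if (hrtimer_active(&ts->sched_timer))
			break;
		now = ktime_get();
	}

whereas with the enforced timer interrupt a single forward plus start suffices:

	hrtimer_forward(&ts->sched_timer, now, tick_period);
	hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED);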
Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: John Stultz Cc: Marcelo Tosatti Link: http://lkml.kernel.org/r/20150414203501.943658239@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 914259128145..dc586c371687 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -696,11 +696,9 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { hrtimer_start(&ts->sched_timer, expires, HRTIMER_MODE_ABS_PINNED); - /* Check, if the timer was already in the past */ - if (hrtimer_active(&ts->sched_timer)) - goto out; + goto out; } else if (!tick_program_event(expires, 0)) - goto out; + goto out; /* * We are past the event already. So we crossed a * jiffie boundary. Update jiffies and raise the @@ -888,8 +886,6 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED); - /* Check, if the timer was already in the past */ - if (hrtimer_active(&ts->sched_timer)) break; } else { if (!tick_program_event( @@ -1167,15 +1163,8 @@ void tick_setup_sched_timer(void) hrtimer_add_expires_ns(&ts->sched_timer, offset); } - for (;;) { - hrtimer_forward(&ts->sched_timer, now, tick_period); - hrtimer_start_expires(&ts->sched_timer, - HRTIMER_MODE_ABS_PINNED); - /* Check, if the timer was already in the past */ - if (hrtimer_active(&ts->sched_timer)) - break; - now = ktime_get(); - } + hrtimer_forward(&ts->sched_timer, now, tick_period); + hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED); #ifdef CONFIG_NO_HZ_COMMON if (tick_nohz_enabled) { From 0ff53d09642204c648424def0caa9117e7a3caaf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:08:54 +0000 Subject: [PATCH 022/106] tick: sched: Force tick interrupt and get rid of softirq magic We already got rid of the hrtimer reprogramming loops and hoops as hrtimer now enforces an interrupt if the enqueued time is in the past. Do the same for the nohz non highres mode. That gets rid of the need to raise the softirq which only serves the purpose of getting the machine out of the inner idle loop. 
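The low resolution path relies on the force argument of tick_program_event() for the same guarantee (sketch, mirroring the call sites in the diff below):

	hrtimer_forward(&ts->sched_timer, now, tick_period);
	/*
	 * force == 1: program the device even if the expiry is already
	 * in the past, so the tick interrupt fires immediately instead
	 * of the call failing and requiring a retry loop.
	 */
	tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);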
Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Cc: John Stultz Cc: Marcelo Tosatti Link: http://lkml.kernel.org/r/20150414203502.023464878@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 83 ++++++++++++++-------------------------- 1 file changed, 29 insertions(+), 54 deletions(-) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index dc586c371687..0f07ff2ba22b 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -565,6 +565,20 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) } EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); +static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) +{ + hrtimer_cancel(&ts->sched_timer); + hrtimer_set_expires(&ts->sched_timer, ts->last_tick); + + /* Forward the time to expire in the future */ + hrtimer_forward(&ts->sched_timer, now, tick_period); + + if (ts->nohz_mode == NOHZ_MODE_HIGHRES) + hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED); + else + tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); +} + static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, ktime_t now, int cpu) { @@ -691,22 +705,18 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, if (ts->nohz_mode == NOHZ_MODE_HIGHRES) hrtimer_cancel(&ts->sched_timer); goto out; - } + } - if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { - hrtimer_start(&ts->sched_timer, expires, - HRTIMER_MODE_ABS_PINNED); - goto out; - } else if (!tick_program_event(expires, 0)) - goto out; - /* - * We are past the event already. So we crossed a - * jiffie boundary. Update jiffies and raise the - * softirq. - */ - tick_do_update_jiffies64(ktime_get()); + if (ts->nohz_mode == NOHZ_MODE_HIGHRES) + hrtimer_start(&ts->sched_timer, expires, + HRTIMER_MODE_ABS_PINNED); + else + tick_program_event(expires, 1); + } else { + /* Tick is stopped, but required now. 
Enforce it */ + tick_nohz_restart(ts, now); } - raise_softirq_irqoff(TIMER_SOFTIRQ); + out: ts->next_jiffies = next_jiffies; ts->last_jiffies = last_jiffies; @@ -874,30 +884,6 @@ ktime_t tick_nohz_get_sleep_length(void) return ts->sleep_length; } -static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) -{ - hrtimer_cancel(&ts->sched_timer); - hrtimer_set_expires(&ts->sched_timer, ts->last_tick); - - while (1) { - /* Forward the time to expire in the future */ - hrtimer_forward(&ts->sched_timer, now, tick_period); - - if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { - hrtimer_start_expires(&ts->sched_timer, - HRTIMER_MODE_ABS_PINNED); - break; - } else { - if (!tick_program_event( - hrtimer_get_expires(&ts->sched_timer), 0)) - break; - } - /* Reread time and update jiffies */ - now = ktime_get(); - tick_do_update_jiffies64(now); - } -} - static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) { /* Update jiffies first */ @@ -968,12 +954,6 @@ void tick_nohz_idle_exit(void) local_irq_enable(); } -static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now) -{ - hrtimer_forward(&ts->sched_timer, now, tick_period); - return tick_program_event(hrtimer_get_expires(&ts->sched_timer), 0); -} - /* * The nohz low res interrupt handler */ @@ -992,10 +972,8 @@ static void tick_nohz_handler(struct clock_event_device *dev) if (unlikely(ts->tick_stopped)) return; - while (tick_nohz_reprogram(ts, now)) { - now = ktime_get(); - tick_do_update_jiffies64(now); - } + hrtimer_forward(&ts->sched_timer, now, tick_period); + tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); } /** @@ -1025,12 +1003,9 @@ static void tick_nohz_switch_to_nohz(void) /* Get the next period */ next = tick_init_jiffy_update(); - for (;;) { - hrtimer_set_expires(&ts->sched_timer, next); - if (!tick_program_event(next, 0)) - break; - next = ktime_add(next, tick_period); - } + hrtimer_forward_now(&ts->sched_timer, tick_period); + hrtimer_set_expires(&ts->sched_timer, next); + tick_program_event(next, 1); local_irq_enable(); } From 157d29e101c7d032e886df067aeea1b21a366cc5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:08:56 +0000 Subject: [PATCH 023/106] tick: Sched: Restructure code Get rid of one indentation level. Preparatory patch for a major rework. No functional change. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Cc: John Stultz Cc: Marcelo Tosatti Link: http://lkml.kernel.org/r/20150414203502.101563235@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 195 +++++++++++++++++++-------------------- 1 file changed, 93 insertions(+), 102 deletions(-) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 0f07ff2ba22b..4c5f4a9dcc0a 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -611,112 +611,103 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, } } - /* - * Do not stop the tick, if we are only one off (or less) - * or if the cpu is required for RCU: - */ - if (!ts->tick_stopped && delta_jiffies <= 1) - goto out; - - /* Schedule the tick, if we are at least one jiffie off */ - if ((long)delta_jiffies >= 1) { - - /* - * If this cpu is the one which updates jiffies, then - * give up the assignment and let it be taken by the - * cpu which runs the tick timer next, which might be - * this cpu as well. If we don't drop this here the - * jiffies might be stale and do_timer() never - * invoked. 
Keep track of the fact that it was the one - * which had the do_timer() duty last. If this cpu is - * the one which had the do_timer() duty last, we - * limit the sleep time to the timekeeping - * max_deferement value which we retrieved - * above. Otherwise we can sleep as long as we want. - */ - if (cpu == tick_do_timer_cpu) { - tick_do_timer_cpu = TICK_DO_TIMER_NONE; - ts->do_timer_last = 1; - } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) { - time_delta = KTIME_MAX; - ts->do_timer_last = 0; - } else if (!ts->do_timer_last) { - time_delta = KTIME_MAX; - } - -#ifdef CONFIG_NO_HZ_FULL - if (!ts->inidle) { - time_delta = min(time_delta, - scheduler_tick_max_deferment()); - } -#endif - - /* - * calculate the expiry time for the next timer wheel - * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals - * that there is no timer pending or at least extremely - * far into the future (12 days for HZ=1000). In this - * case we set the expiry to the end of time. - */ - if (likely(delta_jiffies < NEXT_TIMER_MAX_DELTA)) { - /* - * Calculate the time delta for the next timer event. - * If the time delta exceeds the maximum time delta - * permitted by the current clocksource then adjust - * the time delta accordingly to ensure the - * clocksource does not wrap. - */ - time_delta = min_t(u64, time_delta, - tick_period.tv64 * delta_jiffies); - } - - if (time_delta < KTIME_MAX) - expires = ktime_add_ns(last_update, time_delta); - else - expires.tv64 = KTIME_MAX; - - /* Skip reprogram of event if its not changed */ - if (ts->tick_stopped && ktime_equal(expires, dev->next_event)) + if ((long)delta_jiffies <= 1) { + if (!ts->tick_stopped) goto out; - - ret = expires; - - /* - * nohz_stop_sched_tick can be called several times before - * the nohz_restart_sched_tick is called. This happens when - * interrupts arrive which do not cause a reschedule. In the - * first call we save the current tick time, so we can restart - * the scheduler tick in nohz_restart_sched_tick. - */ - if (!ts->tick_stopped) { - nohz_balance_enter_idle(cpu); - calc_load_enter_idle(); - - ts->last_tick = hrtimer_get_expires(&ts->sched_timer); - ts->tick_stopped = 1; - trace_tick_stop(1, " "); - } - - /* - * If the expiration time == KTIME_MAX, then - * in this case we simply stop the tick timer. - */ - if (unlikely(expires.tv64 == KTIME_MAX)) { - if (ts->nohz_mode == NOHZ_MODE_HIGHRES) - hrtimer_cancel(&ts->sched_timer); + if (delta_jiffies == 0) { + /* Tick is stopped, but required now. Enforce it */ + tick_nohz_restart(ts, now); goto out; - } - - if (ts->nohz_mode == NOHZ_MODE_HIGHRES) - hrtimer_start(&ts->sched_timer, expires, - HRTIMER_MODE_ABS_PINNED); - else - tick_program_event(expires, 1); - } else { - /* Tick is stopped, but required now. Enforce it */ - tick_nohz_restart(ts, now); + } } + /* + * If this cpu is the one which updates jiffies, then give up + * the assignment and let it be taken by the cpu which runs + * the tick timer next, which might be this cpu as well. If we + * don't drop this here the jiffies might be stale and + * do_timer() never invoked. Keep track of the fact that it + * was the one which had the do_timer() duty last. If this cpu + * is the one which had the do_timer() duty last, we limit the + * sleep time to the timekeeping max_deferement value which we + * retrieved above. Otherwise we can sleep as long as we want. 
+ */ + if (cpu == tick_do_timer_cpu) { + tick_do_timer_cpu = TICK_DO_TIMER_NONE; + ts->do_timer_last = 1; + } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) { + time_delta = KTIME_MAX; + ts->do_timer_last = 0; + } else if (!ts->do_timer_last) { + time_delta = KTIME_MAX; + } + +#ifdef CONFIG_NO_HZ_FULL + if (!ts->inidle) + time_delta = min(time_delta, scheduler_tick_max_deferment()); +#endif + + /* + * calculate the expiry time for the next timer wheel + * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals that + * there is no timer pending or at least extremely far into + * the future (12 days for HZ=1000). In this case we set the + * expiry to the end of time. + */ + if (likely(delta_jiffies < NEXT_TIMER_MAX_DELTA)) { + /* + * Calculate the time delta for the next timer event. + * If the time delta exceeds the maximum time delta + * permitted by the current clocksource then adjust + * the time delta accordingly to ensure the + * clocksource does not wrap. + */ + time_delta = min_t(u64, time_delta, + tick_period.tv64 * delta_jiffies); + } + + if (time_delta < KTIME_MAX) + expires = ktime_add_ns(last_update, time_delta); + else + expires.tv64 = KTIME_MAX; + + /* Skip reprogram of event if its not changed */ + if (ts->tick_stopped && ktime_equal(expires, dev->next_event)) + goto out; + + ret = expires; + + /* + * nohz_stop_sched_tick can be called several times before + * the nohz_restart_sched_tick is called. This happens when + * interrupts arrive which do not cause a reschedule. In the + * first call we save the current tick time, so we can restart + * the scheduler tick in nohz_restart_sched_tick. + */ + if (!ts->tick_stopped) { + nohz_balance_enter_idle(cpu); + calc_load_enter_idle(); + + ts->last_tick = hrtimer_get_expires(&ts->sched_timer); + ts->tick_stopped = 1; + trace_tick_stop(1, " "); + } + + /* + * If the expiration time == KTIME_MAX, then + * in this case we simply stop the tick timer. + */ + if (unlikely(expires.tv64 == KTIME_MAX)) { + if (ts->nohz_mode == NOHZ_MODE_HIGHRES) + hrtimer_cancel(&ts->sched_timer); + goto out; + } + + if (ts->nohz_mode == NOHZ_MODE_HIGHRES) + hrtimer_start(&ts->sched_timer, expires, + HRTIMER_MODE_ABS_PINNED); + else + tick_program_event(expires, 1); out: ts->next_jiffies = next_jiffies; ts->last_jiffies = last_jiffies; From c1ad348b452aacd784fb97403d03d71723c72ee1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:08:58 +0000 Subject: [PATCH 024/106] tick: Nohz: Rework next timer evaluation The evaluation of the next timer in the nohz code is based on jiffies while all the tick internals are nano seconds based. We have also to convert hrtimer nanoseconds to jiffies in the !highres case. That's just wrong and introduces interesting corner cases. Turn it around and convert the next timer wheel timer expiry and the rcu event to clock monotonic and base all calculations on nanoseconds. That identifies the case where no timer is pending clearly with an absolute expiry value of KTIME_MAX. Makes the code more readable and gets rid of the jiffies magic in the nohz code. Signed-off-by: Thomas Gleixner Reviewed-by: Paul E. 
McKenney Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Cc: Josh Triplett Cc: Lai Jiangshan Cc: John Stultz Cc: Marcelo Tosatti Link: http://lkml.kernel.org/r/20150414203502.184198593@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 2 +- include/linux/rcupdate.h | 6 +- include/linux/rcutree.h | 2 +- include/linux/timer.h | 7 --- kernel/rcu/tree_plugin.h | 14 +++-- kernel/time/hrtimer.c | 14 ++--- kernel/time/tick-internal.h | 2 + kernel/time/tick-sched.c | 109 ++++++++++++++++-------------------- kernel/time/tick-sched.h | 2 +- kernel/time/timer.c | 73 ++++++++++++------------ kernel/time/timer_list.c | 4 +- 11 files changed, 108 insertions(+), 127 deletions(-) diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 048270a27bc5..2c68f71ffd24 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -386,7 +386,7 @@ static inline int hrtimer_restart(struct hrtimer *timer) /* Query timers: */ extern ktime_t hrtimer_get_remaining(const struct hrtimer *timer); -extern ktime_t hrtimer_get_next_event(void); +extern u64 hrtimer_get_next_event(void); /* * A timer is active, when it is enqueued into the rbtree or the diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 573a5afd5ed8..0627a447c589 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -44,6 +44,8 @@ #include #include #include +#include + #include extern int rcu_expedited; /* for sysctl */ @@ -1154,9 +1156,9 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) __kfree_rcu(&((ptr)->rcu_head), offsetof(typeof(*(ptr)), rcu_head)) #if defined(CONFIG_TINY_RCU) || defined(CONFIG_RCU_NOCB_CPU_ALL) -static inline int rcu_needs_cpu(unsigned long *delta_jiffies) +static inline int rcu_needs_cpu(u64 basemono, u64 *nextevt) { - *delta_jiffies = ULONG_MAX; + *nextevt = KTIME_MAX; return 0; } #endif /* #if defined(CONFIG_TINY_RCU) || defined(CONFIG_RCU_NOCB_CPU_ALL) */ diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index d2e583a6aaca..db2e31beaae7 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -32,7 +32,7 @@ void rcu_note_context_switch(void); #ifndef CONFIG_RCU_NOCB_CPU_ALL -int rcu_needs_cpu(unsigned long *delta_jiffies); +int rcu_needs_cpu(u64 basem, u64 *nextevt); #endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ void rcu_cpu_stall_reset(void); diff --git a/include/linux/timer.h b/include/linux/timer.h index 8c5a197e1587..fbb80e0030bf 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -187,13 +187,6 @@ extern void set_timer_slack(struct timer_list *time, int slack_hz); */ #define NEXT_TIMER_MAX_DELTA ((1UL << 30) - 1) -/* - * Return when the next timer-wheel timeout occurs (in absolute jiffies), - * locks the timer base and does the comparison against the given - * jiffie. - */ -extern unsigned long get_next_timer_interrupt(unsigned long now); - /* * Timer-statistics info: */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 8c0ec0f5a027..0ef80a0bbabb 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1368,9 +1368,9 @@ static void rcu_prepare_kthreads(int cpu) * any flavor of RCU. 
*/ #ifndef CONFIG_RCU_NOCB_CPU_ALL -int rcu_needs_cpu(unsigned long *delta_jiffies) +int rcu_needs_cpu(u64 basemono, u64 *nextevt) { - *delta_jiffies = ULONG_MAX; + *nextevt = KTIME_MAX; return rcu_cpu_has_callbacks(NULL); } #endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ @@ -1481,16 +1481,17 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void) * The caller must have disabled interrupts. */ #ifndef CONFIG_RCU_NOCB_CPU_ALL -int rcu_needs_cpu(unsigned long *dj) +int rcu_needs_cpu(u64 basemono, u64 *nextevt) { struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + unsigned long dj; /* Snapshot to detect later posting of non-lazy callback. */ rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; /* If no callbacks, RCU doesn't need the CPU. */ if (!rcu_cpu_has_callbacks(&rdtp->all_lazy)) { - *dj = ULONG_MAX; + *nextevt = KTIME_MAX; return 0; } @@ -1504,11 +1505,12 @@ int rcu_needs_cpu(unsigned long *dj) /* Request timer delay depending on laziness, and round. */ if (!rdtp->all_lazy) { - *dj = round_up(rcu_idle_gp_delay + jiffies, + dj = round_up(rcu_idle_gp_delay + jiffies, rcu_idle_gp_delay) - jiffies; } else { - *dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies; + dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies; } + *nextevt = basemono + dj * TICK_NSEC; return 0; } #endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index fc6b6d25f93d..179b991cfdcb 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1080,26 +1080,22 @@ EXPORT_SYMBOL_GPL(hrtimer_get_remaining); /** * hrtimer_get_next_event - get the time until next expiry event * - * Returns the delta to the next expiry event or KTIME_MAX if no timer - * is pending. + * Returns the next expiry time or KTIME_MAX if no timer is pending. 
*/ -ktime_t hrtimer_get_next_event(void) +u64 hrtimer_get_next_event(void) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); - ktime_t mindelta = { .tv64 = KTIME_MAX }; + u64 expires = KTIME_MAX; unsigned long flags; raw_spin_lock_irqsave(&cpu_base->lock, flags); if (!__hrtimer_hres_active(cpu_base)) - mindelta = ktime_sub(__hrtimer_get_next_event(cpu_base), - ktime_get()); + expires = __hrtimer_get_next_event(cpu_base).tv64; raw_spin_unlock_irqrestore(&cpu_base->lock, flags); - if (mindelta.tv64 < 0) - mindelta.tv64 = 0; - return mindelta; + return expires; } #endif diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index b64fdd8054c5..65273f0a11ed 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -137,3 +137,5 @@ extern void tick_nohz_init(void); # else static inline void tick_nohz_init(void) { } #endif + +extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 4c5f4a9dcc0a..753c211f6195 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -582,39 +582,46 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, ktime_t now, int cpu) { - unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; - ktime_t last_update, expires, ret = { .tv64 = 0 }; - unsigned long rcu_delta_jiffies; struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); - u64 time_delta; - - time_delta = timekeeping_max_deferment(); + u64 basemono, next_tick, next_tmr, next_rcu, delta, expires; + unsigned long seq, basejiff; + ktime_t tick; /* Read jiffies and the time when jiffies were updated last */ do { seq = read_seqbegin(&jiffies_lock); - last_update = last_jiffies_update; - last_jiffies = jiffies; + basemono = last_jiffies_update.tv64; + basejiff = jiffies; } while (read_seqretry(&jiffies_lock, seq)); + ts->last_jiffies = basejiff; - if (rcu_needs_cpu(&rcu_delta_jiffies) || + if (rcu_needs_cpu(basemono, &next_rcu) || arch_needs_cpu() || irq_work_needs_cpu()) { - next_jiffies = last_jiffies + 1; - delta_jiffies = 1; + next_tick = basemono + TICK_NSEC; } else { - /* Get the next timer wheel timer */ - next_jiffies = get_next_timer_interrupt(last_jiffies); - delta_jiffies = next_jiffies - last_jiffies; - if (rcu_delta_jiffies < delta_jiffies) { - next_jiffies = last_jiffies + rcu_delta_jiffies; - delta_jiffies = rcu_delta_jiffies; - } + /* + * Get the next pending timer. If high resolution + * timers are enabled this only takes the timer wheel + * timers into account. If high resolution timers are + * disabled this also looks at the next expiring + * hrtimer. + */ + next_tmr = get_next_timer_interrupt(basejiff, basemono); + ts->next_timer = next_tmr; + /* Take the next rcu event into account */ + next_tick = next_rcu < next_tmr ? next_rcu : next_tmr; } - if ((long)delta_jiffies <= 1) { + /* + * If the tick is due in the next period, keep it ticking or + * restart it proper. + */ + delta = next_tick - basemono; + if (delta <= (u64)TICK_NSEC) { + tick.tv64 = 0; if (!ts->tick_stopped) goto out; - if (delta_jiffies == 0) { + if (delta == 0) { /* Tick is stopped, but required now. Enforce it */ tick_nohz_restart(ts, now); goto out; @@ -629,54 +636,39 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, * do_timer() never invoked. Keep track of the fact that it * was the one which had the do_timer() duty last. 
If this cpu * is the one which had the do_timer() duty last, we limit the - * sleep time to the timekeeping max_deferement value which we - * retrieved above. Otherwise we can sleep as long as we want. + * sleep time to the timekeeping max_deferement value. + * Otherwise we can sleep as long as we want. */ + delta = timekeeping_max_deferment(); if (cpu == tick_do_timer_cpu) { tick_do_timer_cpu = TICK_DO_TIMER_NONE; ts->do_timer_last = 1; } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) { - time_delta = KTIME_MAX; + delta = KTIME_MAX; ts->do_timer_last = 0; } else if (!ts->do_timer_last) { - time_delta = KTIME_MAX; + delta = KTIME_MAX; } #ifdef CONFIG_NO_HZ_FULL + /* Limit the tick delta to the maximum scheduler deferment */ if (!ts->inidle) - time_delta = min(time_delta, scheduler_tick_max_deferment()); + delta = min(delta, scheduler_tick_max_deferment()); #endif - /* - * calculate the expiry time for the next timer wheel - * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals that - * there is no timer pending or at least extremely far into - * the future (12 days for HZ=1000). In this case we set the - * expiry to the end of time. - */ - if (likely(delta_jiffies < NEXT_TIMER_MAX_DELTA)) { - /* - * Calculate the time delta for the next timer event. - * If the time delta exceeds the maximum time delta - * permitted by the current clocksource then adjust - * the time delta accordingly to ensure the - * clocksource does not wrap. - */ - time_delta = min_t(u64, time_delta, - tick_period.tv64 * delta_jiffies); - } - - if (time_delta < KTIME_MAX) - expires = ktime_add_ns(last_update, time_delta); + /* Calculate the next expiry time */ + if (delta < (KTIME_MAX - basemono)) + expires = basemono + delta; else - expires.tv64 = KTIME_MAX; + expires = KTIME_MAX; + + expires = min_t(u64, expires, next_tick); + tick.tv64 = expires; /* Skip reprogram of event if its not changed */ - if (ts->tick_stopped && ktime_equal(expires, dev->next_event)) + if (ts->tick_stopped && (expires == dev->next_event.tv64)) goto out; - ret = expires; - /* * nohz_stop_sched_tick can be called several times before * the nohz_restart_sched_tick is called. This happens when @@ -694,26 +686,23 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, } /* - * If the expiration time == KTIME_MAX, then - * in this case we simply stop the tick timer. + * If the expiration time == KTIME_MAX, then we simply stop + * the tick timer. 
*/ - if (unlikely(expires.tv64 == KTIME_MAX)) { + if (unlikely(expires == KTIME_MAX)) { if (ts->nohz_mode == NOHZ_MODE_HIGHRES) hrtimer_cancel(&ts->sched_timer); goto out; } if (ts->nohz_mode == NOHZ_MODE_HIGHRES) - hrtimer_start(&ts->sched_timer, expires, - HRTIMER_MODE_ABS_PINNED); + hrtimer_start(&ts->sched_timer, tick, HRTIMER_MODE_ABS_PINNED); else - tick_program_event(expires, 1); + tick_program_event(tick, 1); out: - ts->next_jiffies = next_jiffies; - ts->last_jiffies = last_jiffies; + /* Update the estimated sleep length */ ts->sleep_length = ktime_sub(dev->next_event, now); - - return ret; + return tick; } static void tick_nohz_full_stop_tick(struct tick_sched *ts) diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h index 28b5da3e1a17..42fdf4958bcc 100644 --- a/kernel/time/tick-sched.h +++ b/kernel/time/tick-sched.h @@ -57,7 +57,7 @@ struct tick_sched { ktime_t iowait_sleeptime; ktime_t sleep_length; unsigned long last_jiffies; - unsigned long next_jiffies; + u64 next_timer; ktime_t idle_expires; int do_timer_last; }; diff --git a/kernel/time/timer.c b/kernel/time/timer.c index b31f13f4fe41..172b83cd2f8e 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -49,6 +49,8 @@ #include #include +#include "tick-internal.h" + #define CREATE_TRACE_POINTS #include @@ -1311,54 +1313,48 @@ cascade: * Check, if the next hrtimer event is before the next timer wheel * event: */ -static unsigned long cmp_next_hrtimer_event(unsigned long now, - unsigned long expires) +static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) { - ktime_t hr_delta = hrtimer_get_next_event(); - struct timespec tsdelta; - unsigned long delta; + u64 nextevt = hrtimer_get_next_event(); - if (hr_delta.tv64 == KTIME_MAX) + /* + * If high resolution timers are enabled + * hrtimer_get_next_event() returns KTIME_MAX. + */ + if (expires <= nextevt) return expires; /* - * Expired timer available, let it expire in the next tick + * If the next timer is already expired, return the tick base + * time so the tick is fired immediately. */ - if (hr_delta.tv64 <= 0) - return now + 1; - - tsdelta = ktime_to_timespec(hr_delta); - delta = timespec_to_jiffies(&tsdelta); + if (nextevt <= basem) + return basem; /* - * Limit the delta to the max value, which is checked in - * tick_nohz_stop_sched_tick(): + * Round up to the next jiffie. High resolution timers are + * off, so the hrtimers are expired in the tick and we need to + * make sure that this tick really expires the timer to avoid + * a ping pong of the nohz stop code. + * + * Use DIV_ROUND_UP_ULL to prevent gcc calling __divdi3 */ - if (delta > NEXT_TIMER_MAX_DELTA) - delta = NEXT_TIMER_MAX_DELTA; - - /* - * Take rounding errors in to account and make sure, that it - * expires in the next tick. Otherwise we go into an endless - * ping pong due to tick_nohz_stop_sched_tick() retriggering - * the timer softirq - */ - if (delta < 1) - delta = 1; - now += delta; - if (time_before(now, expires)) - return now; - return expires; + return DIV_ROUND_UP_ULL(nextevt, TICK_NSEC) * TICK_NSEC; } /** - * get_next_timer_interrupt - return the jiffy of the next pending timer - * @now: current time (in jiffies) + * get_next_timer_interrupt - return the time (clock mono) of the next timer + * @basej: base time jiffies + * @basem: base time clock monotonic + * + * Returns the tick aligned clock monotonic time of the next pending + * timer or KTIME_MAX if no timer is pending. 
*/ -unsigned long get_next_timer_interrupt(unsigned long now) +u64 get_next_timer_interrupt(unsigned long basej, u64 basem) { struct tvec_base *base = __this_cpu_read(tvec_bases); - unsigned long expires = now + NEXT_TIMER_MAX_DELTA; + u64 expires = KTIME_MAX; + unsigned long nextevt; /* * Pretend that there is no timer pending if the cpu is offline. @@ -1371,14 +1367,15 @@ unsigned long get_next_timer_interrupt(unsigned long now) if (base->active_timers) { if (time_before_eq(base->next_timer, base->timer_jiffies)) base->next_timer = __next_timer_interrupt(base); - expires = base->next_timer; + nextevt = base->next_timer; + if (time_before_eq(nextevt, basej)) + expires = basem; + else + expires = basem + (nextevt - basej) * TICK_NSEC; } spin_unlock(&base->lock); - if (time_before_eq(expires, now)) - return now; - - return cmp_next_hrtimer_event(now, expires); + return cmp_next_hrtimer_event(basem, expires); } #endif diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 6232fc536185..66f39bba5353 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -191,7 +191,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) P_ns(idle_sleeptime); P_ns(iowait_sleeptime); P(last_jiffies); - P(next_jiffies); + P(next_timer); P_ns(idle_expires); SEQ_printf(m, "jiffies: %Lu\n", (unsigned long long)jiffies); @@ -289,7 +289,7 @@ static void timer_list_show_tickdevices_header(struct seq_file *m) static inline void timer_list_header(struct seq_file *m, u64 now) { - SEQ_printf(m, "Timer List Version: v0.7\n"); + SEQ_printf(m, "Timer List Version: v0.8\n"); SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); SEQ_printf(m, "\n"); From 514c2304b4574dae28c3d7c0ad6b9cf296994140 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:09:00 +0000 Subject: [PATCH 025/106] x86: perf: Use hrtimer_start() hrtimer_start() does not longer defer already expired timers to the softirq. Get rid of the __hrtimer_start_range_ns() invocation. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Cc: x86@kernel.org Link: http://lkml.kernel.org/r/20150414203502.260487331@linutronix.de Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/perf_event_intel_rapl.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c index 999289b94025..10190c044a7e 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c +++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c @@ -204,9 +204,8 @@ again: static void rapl_start_hrtimer(struct rapl_pmu *pmu) { - __hrtimer_start_range_ns(&pmu->hrtimer, - pmu->timer_interval, 0, - HRTIMER_MODE_REL_PINNED, 0); + hrtimer_start(&pmu->hrtimer, pmu->timer_interval, + HRTIMER_MODE_REL_PINNED); } static void rapl_stop_hrtimer(struct rapl_pmu *pmu) From 576b0704c9def6d54b3ae9e13b0b7567c713f568 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:09:01 +0000 Subject: [PATCH 026/106] x86: perf: uncore: Use hrtimer_start() hrtimer_start() does not longer defer already expired timers to the softirq. Get rid of the __hrtimer_start_range_ns() invocation. 
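The conversion is mechanical (shown here in general form for illustration): the zero slack and the now meaningless wakeup argument simply drop out:

	/* before */
	__hrtimer_start_range_ns(timer, tim, 0, mode, 0);

	/* after */
	hrtimer_start(timer, tim, mode);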
Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Cc: x86@kernel.org Link: http://lkml.kernel.org/r/20150414203502.360555157@linutronix.de Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/perf_event_intel_uncore.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index c635b8b49e93..7c411f0e58fd 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -233,9 +233,8 @@ static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer) void uncore_pmu_start_hrtimer(struct intel_uncore_box *box) { - __hrtimer_start_range_ns(&box->hrtimer, - ns_to_ktime(box->hrtimer_duration), 0, - HRTIMER_MODE_REL_PINNED, 0); + hrtimer_start(&box->hrtimer, ns_to_ktime(box->hrtimer_duration), + HRTIMER_MODE_REL_PINNED); } void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box) From 3497d206c4d9b266d2e56c8b20e51b2f0e6a3c72 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:09:03 +0000 Subject: [PATCH 027/106] perf: core: Use hrtimer_start() hrtimer_start() does not longer defer already expired timers to the softirq. Get rid of the __hrtimer_start_range_ns() invocation. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150414203502.452104213@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/events/core.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 81aa3a4ece9f..05309fdba2a4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -834,9 +834,7 @@ static void perf_cpu_hrtimer_restart(struct perf_cpu_context *cpuctx) if (hrtimer_active(hr)) return; - if (!hrtimer_callback_running(hr)) - __hrtimer_start_range_ns(hr, cpuctx->hrtimer_interval, - 0, HRTIMER_MODE_REL_PINNED, 0); + hrtimer_start(hr, cpuctx->hrtimer_interval, HRTIMER_MODE_REL_PINNED); } void perf_pmu_disable(struct pmu *pmu) @@ -6843,9 +6841,8 @@ static void perf_swevent_start_hrtimer(struct perf_event *event) } else { period = max_t(u64, 10000, hwc->sample_period); } - __hrtimer_start_range_ns(&hwc->hrtimer, - ns_to_ktime(period), 0, - HRTIMER_MODE_REL_PINNED, 0); + hrtimer_start(&hwc->hrtimer, ns_to_ktime(period), + HRTIMER_MODE_REL_PINNED); } static void perf_swevent_cancel_hrtimer(struct perf_event *event) From 4961b6e11825c2b05b516374b1800fc5dfc2cb78 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:09:05 +0000 Subject: [PATCH 028/106] sched: core: Use hrtimer_start[_expires]() hrtimer_start() now enforces a timer interrupt when an already expired timer is enqueued. Get rid of the __hrtimer_start_range_ns() invocations and the loops around it. 
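The resulting idiom, cf. start_bandwidth_timer() in the diff below (sketch): forward the timer past the current time in whole periods, then re-arm it at its own expiry, which keeps a previously set slack range intact:

	if (hrtimer_active(period_timer))
		return;

	hrtimer_forward_now(period_timer, period);
	hrtimer_start_expires(period_timer, HRTIMER_MODE_ABS_PINNED);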
Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150414203502.531131739@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/sched/core.c | 28 ++++++++-------------------- kernel/sched/fair.c | 2 +- 2 files changed, 9 insertions(+), 21 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f9123a82cbb6..3026678113e7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -92,22 +92,11 @@ void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) { - unsigned long delta; - ktime_t soft, hard, now; + if (hrtimer_active(period_timer)) + return; - for (;;) { - if (hrtimer_active(period_timer)) - break; - - now = hrtimer_cb_get_time(period_timer); - hrtimer_forward(period_timer, now, period); - - soft = hrtimer_get_softexpires(period_timer); - hard = hrtimer_get_expires(period_timer); - delta = ktime_to_ns(ktime_sub(hard, soft)); - __hrtimer_start_range_ns(period_timer, soft, delta, - HRTIMER_MODE_ABS_PINNED, 0); - } + hrtimer_forward_now(period_timer, period); + hrtimer_start_expires(period_timer, HRTIMER_MODE_ABS_PINNED); } DEFINE_MUTEX(sched_domains_mutex); @@ -355,12 +344,11 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer) #ifdef CONFIG_SMP -static int __hrtick_restart(struct rq *rq) +static void __hrtick_restart(struct rq *rq) { struct hrtimer *timer = &rq->hrtick_timer; - ktime_t time = hrtimer_get_softexpires(timer); - return __hrtimer_start_range_ns(timer, time, 0, HRTIMER_MODE_ABS_PINNED, 0); + hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED); } /* @@ -440,8 +428,8 @@ void hrtick_start(struct rq *rq, u64 delay) * doesn't make sense. Rely on vruntime for fairness. */ delay = max_t(u64, delay, 10000LL); - __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, - HRTIMER_MODE_REL_PINNED, 0); + hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), + HRTIMER_MODE_REL_PINNED); } static inline void init_hrtick(void) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ffeaa4105e48..854881b2526b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3850,7 +3850,7 @@ static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC; * Are we near the end of the current quota period? * * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the - * hrtimer base being cleared by __hrtimer_start_range_ns. In the case of + * hrtimer base being cleared by hrtimer_start. In the case of * migrate_hrtimers, base is never cleared, so we are fine. */ static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire) From cc9684d3c1188ac5f1cf0ee9f8be7ba456099d7b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:09:06 +0000 Subject: [PATCH 029/106] sched: deadline: Use hrtimer_start() hrtimer_start() does not longer defer already expired timers to the softirq. Get rid of the __hrtimer_start_range_ns() invocation. 
Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150414203502.627353666@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/sched/deadline.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 5e95145088fd..21d6907d2b9f 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -503,8 +503,6 @@ static int start_dl_timer(struct sched_dl_entity *dl_se, bool boosted) struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); ktime_t now, act; - ktime_t soft, hard; - unsigned long range; s64 delta; if (boosted) @@ -527,15 +525,9 @@ static int start_dl_timer(struct sched_dl_entity *dl_se, bool boosted) if (ktime_us_delta(act, now) < 0) return 0; - hrtimer_set_expires(&dl_se->dl_timer, act); + hrtimer_start(&dl_se->dl_timer, act, HRTIMER_MODE_ABS); - soft = hrtimer_get_softexpires(&dl_se->dl_timer); - hard = hrtimer_get_expires(&dl_se->dl_timer); - range = ktime_to_ns(ktime_sub(hard, soft)); - __hrtimer_start_range_ns(&dl_se->dl_timer, soft, - range, HRTIMER_MODE_ABS, 0); - - return hrtimer_active(&dl_se->dl_timer); + return 1; } /* From 58f1f803f1d6ef9ab280de13246d65970a09cb95 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:09:08 +0000 Subject: [PATCH 030/106] hrtimer: Get rid of __hrtimer_start_range_ns() No more callers. Remove the leftovers. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150414203502.707871492@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 4 ---- kernel/time/hrtimer.c | 38 +++++++++++++++----------------------- 2 files changed, 15 insertions(+), 27 deletions(-) diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 2c68f71ffd24..a80baa86bb24 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -359,10 +359,6 @@ extern int hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode); extern int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long range_ns, const enum hrtimer_mode mode); -extern int -__hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, - unsigned long delta_ns, - const enum hrtimer_mode mode, int wakeup); extern int hrtimer_cancel(struct hrtimer *timer); extern int hrtimer_try_to_cancel(struct hrtimer *timer); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 179b991cfdcb..88d6ea25dde4 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -916,9 +916,20 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) return 0; } -int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, - unsigned long delta_ns, const enum hrtimer_mode mode, - int wakeup) +/** + * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU + * @timer: the timer to be added + * @tim: expiry time + * @delta_ns: "slack" range for the timer + * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or + * relative (HRTIMER_MODE_REL) + * + * Returns: + * 0 on success + * 1 when the timer was active + */ +int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, + unsigned long delta_ns, const enum hrtimer_mode mode) { struct hrtimer_clock_base *base, *new_base; unsigned long flags; @@ -971,25 +982,6 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, 
ktime_t tim, return ret; } -EXPORT_SYMBOL_GPL(__hrtimer_start_range_ns); - -/** - * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU - * @timer: the timer to be added - * @tim: expiry time - * @delta_ns: "slack" range for the timer - * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or - * relative (HRTIMER_MODE_REL) - * - * Returns: - * 0 on success - * 1 when the timer was active - */ -int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, - unsigned long delta_ns, const enum hrtimer_mode mode) -{ - return __hrtimer_start_range_ns(timer, tim, delta_ns, mode, 1); -} EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); /** @@ -1006,7 +998,7 @@ EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); int hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) { - return __hrtimer_start_range_ns(timer, tim, 0, mode, 1); + return hrtimer_start_range_ns(timer, tim, 0, mode); } EXPORT_SYMBOL_GPL(hrtimer_start); From 02a171af1a46966dcdb5b38cdc33e4f43e92c778 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:09:10 +0000 Subject: [PATCH 031/106] hrtimer: Make hrtimer_start() a inline wrapper No point for an extra export just to set the extra argument of hrtimer_start_range_ns() to 0. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150414203502.808544539@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 19 +++++++++++++++++-- kernel/time/hrtimer.c | 19 ------------------- 2 files changed, 17 insertions(+), 21 deletions(-) diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index a80baa86bb24..42074ab3d5c3 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -355,11 +355,26 @@ static inline void destroy_hrtimer_on_stack(struct hrtimer *timer) { } #endif /* Basic timer operations: */ -extern int hrtimer_start(struct hrtimer *timer, ktime_t tim, - const enum hrtimer_mode mode); extern int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long range_ns, const enum hrtimer_mode mode); +/** + * hrtimer_start - (re)start an hrtimer on the current CPU + * @timer: the timer to be added + * @tim: expiry time + * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or + * relative (HRTIMER_MODE_REL) + * + * Returns: + * 0 on success + * 1 when the timer was active + */ +static inline int hrtimer_start(struct hrtimer *timer, ktime_t tim, + const enum hrtimer_mode mode) +{ + return hrtimer_start_range_ns(timer, tim, 0, mode); +} + extern int hrtimer_cancel(struct hrtimer *timer); extern int hrtimer_try_to_cancel(struct hrtimer *timer); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 88d6ea25dde4..e5cf71aa6d77 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -984,25 +984,6 @@ int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, } EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); -/** - * hrtimer_start - (re)start an hrtimer on the current CPU - * @timer: the timer to be added - * @tim: expiry time - * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or - * relative (HRTIMER_MODE_REL) - * - * Returns: - * 0 on success - * 1 when the timer was active - */ -int -hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) -{ - return hrtimer_start_range_ns(timer, tim, 0, mode); -} -EXPORT_SYMBOL_GPL(hrtimer_start); - - /** * hrtimer_try_to_cancel - try to deactivate a timer * @timer: hrtimer to stop From 
3f7b349ac14885472a19c46840235114e5ad5e52 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:09:11 +0000 Subject: [PATCH 032/106] hrtimer: Remove bogus hrtimer_active() check The check for hrtimer_active() after starting the timer is pointless. If the timer is inactive it has expired already and therefore the task pointer is already NULL. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150414203502.907149271@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/hrtimer.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index e5cf71aa6d77..c38f0b6024b4 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1361,8 +1361,6 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod do { set_current_state(TASK_INTERRUPTIBLE); hrtimer_start_expires(&t->timer, mode); - if (!hrtimer_active(&t->timer)) - t->task = NULL; if (likely(t->task)) freezable_schedule(); @@ -1633,8 +1631,6 @@ schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta, hrtimer_init_sleeper(&t, current); hrtimer_start_expires(&t.timer, mode); - if (!hrtimer_active(&t.timer)) - t.task = NULL; if (likely(t.task)) schedule(); From 2e4b0d3fe88bc2618fd5d081ace338a70f8c23da Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:09:13 +0000 Subject: [PATCH 033/106] futex: Remove bogus hrtimer_active() check The check for hrtimer_active() after starting the timer is pointless. If the timer is inactive it has expired already and therefore the task pointer is already NULL. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150414203502.985825453@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/futex.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 2579e407ff67..720eacff6b58 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2063,11 +2063,8 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, queue_me(q, hb); /* Arm the timer */ - if (timeout) { + if (timeout) hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS); - if (!hrtimer_active(&timeout->timer)) - timeout->task = NULL; - } /* * If we have been removed from the hash list, then another task From ccdd92c17e144c8494f4c94ab85b48d297545cec Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:09:15 +0000 Subject: [PATCH 034/106] rtmutex: Remove bogus hrtimer_active() check The check for hrtimer_active() after starting the timer is pointless. If the timer is inactive it has expired already and therefore the task pointer is already NULL.
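For illustration, the redundant pattern these three patches remove has the following shape (a sketch of the common idiom; the rtmutex hunk below is the concrete instance):

	hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
	if (!hrtimer_active(&timeout->timer))
		timeout->task = NULL;	/* redundant: the expiry callback
					 * has already cleared ->task */

The hrtimer_sleeper expiry callback clears ->task before the timer goes inactive, so the post-start check can never observe a state the callback has not already handled.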
Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150414203503.081830481@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/locking/rtmutex.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index b73279367087..8626437acf0c 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1180,11 +1180,8 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, set_current_state(state); /* Setup the timer, when timeout != NULL */ - if (unlikely(timeout)) { + if (unlikely(timeout)) hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS); - if (!hrtimer_active(&timeout->timer)) - timeout->task = NULL; - } ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk); From 46ac2f53dfe93a25b251696b946af7b673978ccb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:09:16 +0000 Subject: [PATCH 035/106] net: core: pktgen: Remove bogus hrtimer_active() check The check for hrtimer_active() after starting the timer is pointless. If the timer is inactive it has expired already and therefore the task pointer is already NULL. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Acked-by: David S. Miller Link: http://lkml.kernel.org/r/20150414203503.165258315@linutronix.de Signed-off-by: Thomas Gleixner --- net/core/pktgen.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 508155b283dd..54817d365366 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2212,8 +2212,6 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until) do { set_current_state(TASK_INTERRUPTIBLE); hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS); - if (!hrtimer_active(&t.timer)) - t.task = NULL; if (likely(t.task)) schedule(); From b193217e6dc3f88b599b573b53e0e0f6671d969a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:09:18 +0000 Subject: [PATCH 036/106] alarmtimer: Get rid of unused return value We want to get rid of the hrtimer_start() return value and the alarm timer return value is not used anywhere. Remove it.
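The effect on callers, sketched (hypothetical usage, for illustration only):

	/* before: a return value that no caller ever consumed */
	ret = alarm_start(&alarm, expires);

	/* after: the void signature says there is nothing to check */
	alarm_start(&alarm, expires);

With alarm_start() no longer forwarding the hrtimer_start() result, the later removal of that return value becomes a purely local change.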
Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Cc: John Stultz Link: http://lkml.kernel.org/r/20150414203503.243910615@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/alarmtimer.h | 4 ++-- kernel/time/alarmtimer.c | 11 ++++------- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/include/linux/alarmtimer.h b/include/linux/alarmtimer.h index a899402a5a0e..52f3b7da4f2d 100644 --- a/include/linux/alarmtimer.h +++ b/include/linux/alarmtimer.h @@ -43,8 +43,8 @@ struct alarm { void alarm_init(struct alarm *alarm, enum alarmtimer_type type, enum alarmtimer_restart (*function)(struct alarm *, ktime_t)); -int alarm_start(struct alarm *alarm, ktime_t start); -int alarm_start_relative(struct alarm *alarm, ktime_t start); +void alarm_start(struct alarm *alarm, ktime_t start); +void alarm_start_relative(struct alarm *alarm, ktime_t start); void alarm_restart(struct alarm *alarm); int alarm_try_to_cancel(struct alarm *alarm); int alarm_cancel(struct alarm *alarm); diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 0b55a7570c90..7fbba635a549 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -317,19 +317,16 @@ EXPORT_SYMBOL_GPL(alarm_init); * @alarm: ptr to alarm to set * @start: time to run the alarm */ -int alarm_start(struct alarm *alarm, ktime_t start) +void alarm_start(struct alarm *alarm, ktime_t start) { struct alarm_base *base = &alarm_bases[alarm->type]; unsigned long flags; - int ret; spin_lock_irqsave(&base->lock, flags); alarm->node.expires = start; alarmtimer_enqueue(base, alarm); - ret = hrtimer_start(&alarm->timer, alarm->node.expires, - HRTIMER_MODE_ABS); + hrtimer_start(&alarm->timer, alarm->node.expires, HRTIMER_MODE_ABS); spin_unlock_irqrestore(&base->lock, flags); - return ret; } EXPORT_SYMBOL_GPL(alarm_start); @@ -338,12 +335,12 @@ EXPORT_SYMBOL_GPL(alarm_start); * @alarm: ptr to alarm to set * @start: time relative to now to run the alarm */ -int alarm_start_relative(struct alarm *alarm, ktime_t start) +void alarm_start_relative(struct alarm *alarm, ktime_t start) { struct alarm_base *base = &alarm_bases[alarm->type]; start = ktime_add(start, base->gettime()); - return alarm_start(alarm, start); + alarm_start(alarm, start); } EXPORT_SYMBOL_GPL(alarm_start_relative); From b8a62f1ff0ccb18fdc25c6150d1cd394610f4753 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:09:22 +0000 Subject: [PATCH 037/106] tick: broadcast-hrtimer: Remove overly clever return value abuse The assignment of bc_moved in the conditional construct relies on the fact that in the case of hrtimer_start() invocation the return value is always 0. It took me a while to understand it. We want to get rid of the hrtimer_start() return value. Open code the logic which makes it readable as well. 
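The two forms compared as plain C for readability (the hunk below carries the same change in diff form; the RCU_NONIDLE() wrapping is omitted here for brevity):

	/* before: correctness hinges on hrtimer_start() always returning 0 */
	bc_moved = (hrtimer_try_to_cancel(&bctimer) >= 0) ?
		!hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED) : 0;

	/* after: the intent is explicit */
	bc_moved = hrtimer_try_to_cancel(&bctimer) >= 0;
	if (bc_moved)
		hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED);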
Signed-off-by: Thomas Gleixner Reviewed-by: Preeti U Murthy Acked-by: Peter Zijlstra Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150414203503.404751457@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast-hrtimer.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c index 6aac4beedbbe..96428d706b16 100644 --- a/kernel/time/tick-broadcast-hrtimer.c +++ b/kernel/time/tick-broadcast-hrtimer.c @@ -66,9 +66,11 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc) * hrtimer_{start/cancel} functions call into tracing, * calls to these functions must be bound within RCU_NONIDLE. */ - RCU_NONIDLE(bc_moved = (hrtimer_try_to_cancel(&bctimer) >= 0) ? - !hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED) : - 0); + RCU_NONIDLE({ + bc_moved = hrtimer_try_to_cancel(&bctimer) >= 0; + if (bc_moved) + hrtimer_start(&bctimer, expires, + HRTIMER_MODE_ABS_PINNED);}); if (bc_moved) { /* Bind the "device" to the cpu */ bc->bound_on = smp_processor_id(); From 61699e13072a89880aa584dcc64c6da465fb2ccc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:09:23 +0000 Subject: [PATCH 038/106] hrtimer: Remove hrtimer_start() return value No user was ever interested in whether the timer was active or not when it was started. All abusers of the return value are gone, so get rid of it. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150414203503.483556394@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 22 +++++++++------------- include/linux/interrupt.h | 6 +++--- kernel/time/hrtimer.c | 23 +++++++---------------- 3 files changed, 19 insertions(+), 32 deletions(-) diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 42074ab3d5c3..470d876c2eda 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -355,7 +355,7 @@ static inline void destroy_hrtimer_on_stack(struct hrtimer *timer) { } #endif /* Basic timer operations: */ -extern int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, +extern void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long range_ns, const enum hrtimer_mode mode); /** @@ -364,34 +364,30 @@ extern int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, * @tim: expiry time * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or * relative (HRTIMER_MODE_REL) - * - * Returns: - * 0 on success - * 1 when the timer was active */ -static inline int hrtimer_start(struct hrtimer *timer, ktime_t tim, - const enum hrtimer_mode mode) +static inline void hrtimer_start(struct hrtimer *timer, ktime_t tim, + const enum hrtimer_mode mode) { - return hrtimer_start_range_ns(timer, tim, 0, mode); + hrtimer_start_range_ns(timer, tim, 0, mode); } extern int hrtimer_cancel(struct hrtimer *timer); extern int hrtimer_try_to_cancel(struct hrtimer *timer); -static inline int hrtimer_start_expires(struct hrtimer *timer, - enum hrtimer_mode mode) +static inline void hrtimer_start_expires(struct hrtimer *timer, + enum hrtimer_mode mode) { unsigned long delta; ktime_t soft, hard; soft = hrtimer_get_softexpires(timer); hard = hrtimer_get_expires(timer); delta = ktime_to_ns(ktime_sub(hard, soft)); - return hrtimer_start_range_ns(timer, soft, delta, mode); + hrtimer_start_range_ns(timer, soft, delta, mode); }
-static inline int hrtimer_restart(struct hrtimer *timer) +static inline void hrtimer_restart(struct hrtimer *timer) { - return hrtimer_start_expires(timer, HRTIMER_MODE_ABS); + hrtimer_start_expires(timer, HRTIMER_MODE_ABS); } /* Query timers: */ diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 6bf15a66bce7..be7e75c945e9 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -593,10 +593,10 @@ tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer, clockid_t which_clock, enum hrtimer_mode mode); static inline -int tasklet_hrtimer_start(struct tasklet_hrtimer *ttimer, ktime_t time, - const enum hrtimer_mode mode) +void tasklet_hrtimer_start(struct tasklet_hrtimer *ttimer, ktime_t time, + const enum hrtimer_mode mode) { - return hrtimer_start(&ttimer->timer, time, mode); + hrtimer_start(&ttimer->timer, time, mode); } static inline diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index c38f0b6024b4..beab02d3ff1e 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -923,22 +923,18 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) * @delta_ns: "slack" range for the timer * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or * relative (HRTIMER_MODE_REL) - * - * Returns: - * 0 on success - * 1 when the timer was active */ -int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, - unsigned long delta_ns, const enum hrtimer_mode mode) +void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, + unsigned long delta_ns, const enum hrtimer_mode mode) { struct hrtimer_clock_base *base, *new_base; unsigned long flags; - int ret, leftmost; + int leftmost; base = lock_hrtimer_base(timer, &flags); /* Remove an active timer from the queue: */ - ret = remove_hrtimer(timer, base); + remove_hrtimer(timer, base); if (mode & HRTIMER_MODE_REL) { tim = ktime_add_safe(tim, base->get_time()); @@ -962,11 +958,8 @@ int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, timer_stats_hrtimer_set_start_info(timer); leftmost = enqueue_hrtimer(timer, new_base); - - if (!leftmost) { - unlock_hrtimer_base(timer, &flags); - return ret; - } + if (!leftmost) + goto unlock; if (!hrtimer_is_hres_active(timer)) { /* @@ -977,10 +970,8 @@ int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, } else { hrtimer_reprogram(timer, new_base); } - +unlock: unlock_hrtimer_base(timer, &flags); - - return ret; } EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); From 19d9f4225dd6a47fca430f15eeae345ceb95c301 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:09:25 +0000 Subject: [PATCH 039/106] hrtimer: Avoid locking in hrtimer_cancel() if timer not active We can do a lockless check for hrtimer_active before actually taking the lock in hrtimer[_try_to]_cancel. This is useful for hotpath users like nanosleep as they avoid the lock dance when the timer has expired. This is safe because active is true when the timer is enqueued or the callback is running. Taking the hrtimer base lock does not protect against concurrent hrtimer_start calls, the callsite has to do the proper serialization itself. 
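A sketch of the resulting fast path (the complete hunk follows below):

	int hrtimer_try_to_cancel(struct hrtimer *timer)
	{
		/*
		 * Lockless check: if the timer is neither enqueued nor
		 * running its callback there is nothing to cancel, so
		 * the base lock can be skipped entirely.
		 */
		if (!hrtimer_active(timer))
			return 0;

		/* ... slow path takes the base lock as before ... */
	}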
Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150414203503.580273114@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/hrtimer.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index beab02d3ff1e..3bac94269a98 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -991,6 +991,15 @@ int hrtimer_try_to_cancel(struct hrtimer *timer) unsigned long flags; int ret = -1; + /* + * Check lockless first. If the timer is not active (neither + * enqueued nor running the callback, nothing to do here. The + * base lock does not serialize against a concurrent enqueue, + * so we can avoid taking it. + */ + if (!hrtimer_active(timer)) + return 0; + base = lock_hrtimer_base(timer, &flags); if (!hrtimer_callback_running(timer)) From 6deba083e1de3f92f65c9849254e92a1ef001b73 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:09:28 +0000 Subject: [PATCH 040/106] timer: Remove pointless return value of do_usleep_range() The only user ignores it anyway and rightfully so. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150414203503.756060258@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/timer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 172b83cd2f8e..e9cc7e0642f2 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1692,14 +1692,14 @@ unsigned long msleep_interruptible(unsigned int msecs) EXPORT_SYMBOL(msleep_interruptible); -static int __sched do_usleep_range(unsigned long min, unsigned long max) +static void __sched do_usleep_range(unsigned long min, unsigned long max) { ktime_t kmin; unsigned long delta; kmin = ktime_set(0, min * NSEC_PER_USEC); delta = (max - min) * NSEC_PER_USEC; - return schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL); + schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL); } /** From 2ad5d3272d8e20e24d8242ebac9f3007f1ea56bc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2015 21:09:30 +0000 Subject: [PATCH 041/106] timer: Put usleep_range into the __sched section do_usleep_range() and schedule_hrtimeout_range() are __sched as well. So it makes no sense to have the exported function in a different section. 
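For reference, __sched is (roughly, in this era's <linux/sched.h>) nothing more than a section attribute:

	/* Attach to any functions which should be ignored in wchan output. */
	#define __sched	__attribute__((__section__(".sched.text")))

Functions in .sched.text are skipped by the wchan/stack-walk logic, so with usleep_range() placed alongside do_usleep_range() and schedule_hrtimeout_range(), a sleeping task reports its real caller instead of the sleep helper.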
Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Preeti U Murthy Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20150414203503.833709502@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/time/timer.c b/kernel/time/timer.c index e9cc7e0642f2..03f926c7a8ee 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1707,7 +1707,7 @@ static void __sched do_usleep_range(unsigned long min, unsigned long max) * @min: Minimum time in usecs to sleep * @max: Maximum time in usecs to sleep */ -void usleep_range(unsigned long min, unsigned long max) +void __sched usleep_range(unsigned long min, unsigned long max) { __set_current_state(TASK_UNINTERRUPTIBLE); do_usleep_range(min, max); From 5de2755c8c8b3a6b8414870e2c284914a2b42e4d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 20 May 2014 15:49:48 +0200 Subject: [PATCH 042/106] hrtimer: Allow concurrent hrtimer_start() for self restarting timers Because we drop cpu_base->lock around calling hrtimer::function, it is possible for hrtimer_start() to come in between and enqueue the timer. If hrtimer::function then returns HRTIMER_RESTART we'll hit the BUG_ON because HRTIMER_STATE_ENQUEUED will be set. Since the above is a perfectly valid scenario, remove the BUG_ON and make the enqueue_hrtimer() call conditional on the timer not being enqueued already. NOTE: in that concurrent scenario it's entirely common for both sites to want to modify the hrtimer, since hrtimers don't provide serialization themselves, be sure to provide some such that the hrtimer::function and the hrtimer_start() caller don't both try and fudge the expiration state at the same time. To that effect, add a WARN when someone tries to forward an already enqueued timer, the most common way to change the expiry of self restarting timers. Ideally we'd put the WARN in everything modifying the expiry but most of that is inlines and we don't need the bloat. Fixes: 2d44ae4d7135 ("hrtimer: clean up cpu->base locking tricks") Signed-off-by: Peter Zijlstra (Intel) Cc: Ben Segall Cc: Roman Gushchin Cc: Paul Turner Link: http://lkml.kernel.org/r/20150415113105.GT5029@twins.programming.kicks-ass.net Signed-off-by: Thomas Gleixner --- kernel/time/hrtimer.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 3bac94269a98..4adf32067862 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -799,6 +799,9 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) if (delta.tv64 < 0) return 0; + if (WARN_ON(timer->state & HRTIMER_STATE_ENQUEUED)) + return 0; + if (interval.tv64 < hrtimer_resolution) interval.tv64 = hrtimer_resolution; @@ -1139,11 +1142,14 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, * Note: We clear the CALLBACK bit after enqueue_hrtimer and * we do not reprogramm the event hardware. Happens either in * hrtimer_start_range_ns() or in hrtimer_interrupt() + * + * Note: Because we dropped the cpu_base->lock above, + * hrtimer_start_range_ns() can have popped in and enqueued the timer + * for us already.
*/ - if (restart != HRTIMER_NORESTART) { - BUG_ON(timer->state != HRTIMER_STATE_CALLBACK); + if (restart != HRTIMER_NORESTART && + !(timer->state & HRTIMER_STATE_ENQUEUED)) enqueue_hrtimer(timer, base); - } WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK)); From 77a4d1a1b9a122ca1fa3507bd30aec1520d7a8a4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 15 Apr 2015 11:41:57 +0200 Subject: [PATCH 043/106] sched: Cleanup bandwidth timers Roman reported a 3 cpu lockup scenario involving __start_cfs_bandwidth(). The more I look at that code the more I'm convinced it's crack, that entire __start_cfs_bandwidth() thing is brain melting, we don't need to cancel a timer before starting it, *hrtimer_start*() will happily remove the timer for you if it's still enqueued. Removing that, removes a big part of the problem, no more ugly cancel loop to get stuck in. So now, if I understand things right, the entire reason you have this cfs_b->lock guarded ->timer_active nonsense is to make sure we don't accidentally lose the timer. It appears to me that it should be possible to guarantee that same by unconditionally (re)starting the timer when !queued. Because regardless what hrtimer::function will return, if we beat it to (re)enqueue the timer, it doesn't matter. Now, because hrtimers don't come with any serialization guarantees we must ensure both handler and (re)start loop serialize their access to the hrtimer to avoid both trying to forward the timer at the same time. Update the rt bandwidth timer to match. This effectively reverts: 09dc4ab03936 ("sched/fair: Fix tg_set_cfs_bandwidth() deadlock on rq->lock"). Reported-by: Roman Gushchin Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ben Segall Cc: Paul Turner Link: http://lkml.kernel.org/r/20150415095011.804589208@infradead.org Signed-off-by: Thomas Gleixner --- kernel/sched/core.c | 15 +++++------ kernel/sched/fair.c | 59 +++++++++++--------------------------------- kernel/sched/rt.c | 14 +++++------ kernel/sched/sched.h | 4 +-- 4 files changed, 31 insertions(+), 61 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3026678113e7..d8a6196465d5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -92,10 +92,13 @@ void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) { - if (hrtimer_active(period_timer)) - return; + /* + * Do not forward the expiration time of active timers; + * we do not want to loose an overrun. + */ + if (!hrtimer_active(period_timer)) + hrtimer_forward_now(period_timer, period); - hrtimer_forward_now(period_timer, period); hrtimer_start_expires(period_timer, HRTIMER_MODE_ABS_PINNED); } @@ -8113,10 +8116,8 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) __refill_cfs_bandwidth_runtime(cfs_b); /* restart the period timer (if active) to handle new period expiry */ - if (runtime_enabled && cfs_b->timer_active) { - /* force a reprogram */ - __start_cfs_bandwidth(cfs_b, true); - } + if (runtime_enabled) start_cfs_bandwidth(cfs_b); raw_spin_unlock_irq(&cfs_b->lock); for_each_online_cpu(i) { diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 854881b2526b..e3b32ebfe421 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3476,16 +3476,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) if (cfs_b->quota == RUNTIME_INF) amount = min_amount; else { - /- * If the bandwidth pool has become inactive, then at least one - * period must have elapsed since the last consumption.
- * Refresh the global state and ensure bandwidth timer becomes - * active. - */ - if (!cfs_b->timer_active) { - __refill_cfs_bandwidth_runtime(cfs_b); - __start_cfs_bandwidth(cfs_b, false); - } + start_cfs_bandwidth(cfs_b); if (cfs_b->runtime > 0) { amount = min(cfs_b->runtime, min_amount); @@ -3634,6 +3625,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; long task_delta, dequeue = 1; + bool empty; se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; @@ -3663,13 +3655,21 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->throttled = 1; cfs_rq->throttled_clock = rq_clock(rq); raw_spin_lock(&cfs_b->lock); + empty = list_empty(&cfs_rq->throttled_list); + /* * Add to the _head_ of the list, so that an already-started * distribute_cfs_runtime will not see us */ list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); - if (!cfs_b->timer_active) - __start_cfs_bandwidth(cfs_b, false); + + /* + * If we're the first throttled task, make sure the bandwidth + * timer is running. + */ + if (empty) + start_cfs_bandwidth(cfs_b); + raw_spin_unlock(&cfs_b->lock); } @@ -3784,13 +3784,6 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) if (cfs_b->idle && !throttled) goto out_deactivate; - /* - * if we have relooped after returning idle once, we need to update our - * status as actually running, so that other cpus doing - * __start_cfs_bandwidth will stop trying to cancel us. - */ - cfs_b->timer_active = 1; - __refill_cfs_bandwidth_runtime(cfs_b); if (!throttled) { @@ -3835,7 +3828,6 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) return 0; out_deactivate: - cfs_b->timer_active = 0; return 1; } @@ -3999,6 +3991,7 @@ static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) { struct cfs_bandwidth *cfs_b = container_of(timer, struct cfs_bandwidth, slack_timer); + do_sched_cfs_slack_timer(cfs_b); return HRTIMER_NORESTART; @@ -4008,15 +4001,12 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) { struct cfs_bandwidth *cfs_b = container_of(timer, struct cfs_bandwidth, period_timer); - ktime_t now; int overrun; int idle = 0; raw_spin_lock(&cfs_b->lock); for (;;) { - now = hrtimer_cb_get_time(timer); - overrun = hrtimer_forward(timer, now, cfs_b->period); - + overrun = hrtimer_forward_now(timer, cfs_b->period); if (!overrun) break; @@ -4047,27 +4037,8 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) INIT_LIST_HEAD(&cfs_rq->throttled_list); } -/* requires cfs_b->lock, may release to reprogram timer */ -void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force) +void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) { - /* - * The timer may be active because we're trying to set a new bandwidth - * period or because we're racing with the tear-down path - * (timer_active==0 becomes visible before the hrtimer call-back - * terminates). 
In either case we ensure that it's re-programmed - */ - while (unlikely(hrtimer_active(&cfs_b->period_timer)) && - hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) { - /* bounce the lock to allow do_sched_cfs_period_timer to run */ - raw_spin_unlock(&cfs_b->lock); - cpu_relax(); - raw_spin_lock(&cfs_b->lock); - /* if someone else restarted the timer then we're done */ - if (!force && cfs_b->timer_active) - return; - } - - cfs_b->timer_active = 1; start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period); } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 575da76a3874..b0febf25d8f1 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -18,19 +18,20 @@ static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) { struct rt_bandwidth *rt_b = container_of(timer, struct rt_bandwidth, rt_period_timer); - ktime_t now; - int overrun; int idle = 0; + int overrun; + raw_spin_lock(&rt_b->rt_runtime_lock); for (;;) { - now = hrtimer_cb_get_time(timer); - overrun = hrtimer_forward(timer, now, rt_b->rt_period); - + overrun = hrtimer_forward_now(timer, rt_b->rt_period); if (!overrun) break; + raw_spin_unlock(&rt_b->rt_runtime_lock); idle = do_sched_rt_period_timer(rt_b, overrun); + raw_spin_lock(&rt_b->rt_runtime_lock); } + raw_spin_unlock(&rt_b->rt_runtime_lock); return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; } @@ -52,9 +53,6 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) return; - if (hrtimer_active(&rt_b->rt_period_timer)) - return; - raw_spin_lock(&rt_b->rt_runtime_lock); start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period); raw_spin_unlock(&rt_b->rt_runtime_lock); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e0e129993958..08606a1f8c4d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -215,7 +215,7 @@ struct cfs_bandwidth { s64 hierarchical_quota; u64 runtime_expires; - int idle, timer_active; + int idle; struct hrtimer period_timer, slack_timer; struct list_head throttled_cfs_rq; @@ -306,7 +306,7 @@ extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b); extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b); -extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force); +extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b); extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq); extern void free_rt_sched_group(struct task_group *tg); From 272325c4821f052092c41feac21f4a1a46f0ad48 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 15 Apr 2015 11:41:58 +0200 Subject: [PATCH 044/106] perf: Fix mux_interval hrtimer wreckage Thomas stumbled over the hrtimer_forward_now() in perf_event_mux_interval_ms_store() and noticed its broken-ness. You cannot just change the expiry time of an active timer, it will destroy the red-black tree order and cause havoc. Change it to (re)start the timer instead, (re)starting a timer will dequeue and enqueue a timer and therefore preserve rb-tree order. Since we cannot enqueue remotely, wrap the thing in cpu_function_call(), this however mandates that we restrict ourselves to online cpus. Also serialize the entire setting so we don't get multiple concurrent threads trying to update to different values. 
Also fix a problem in perf_mux_hrtimer_restart(), checking against hrtimer_active() can actually lose us the timer when timer->state == HRTIMER_STATE_CALLBACK and the callback has already decided NORESTART. Furthermore it doesn't make any sense to test hrtimer_callback_running() when we already tested hrtimer_active(), but with the above change, we explicitly must call it when callback_running. Lastly, rename a few functions: s/perf_cpu_hrtimer_/perf_mux_hrtimer_/ -- because I could not find the mux timer function s/\<hr\>/timer/ -- because that's the normal way of calling things. Fixes: 62b856397927 ("perf: Add sysfs entry to adjust multiplexing interval per PMU") Reported-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: http://lkml.kernel.org/r/20150415095011.863052571@infradead.org Signed-off-by: Thomas Gleixner --- kernel/events/core.c | 63 +++++++++++++++++++++++++------------------- 1 file changed, 36 insertions(+), 27 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 05309fdba2a4..e7ed00b4a6ed 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -51,9 +51,11 @@ static struct workqueue_struct *perf_wq; +typedef int (*remote_function_f)(void *); + struct remote_function_call { struct task_struct *p; - int (*func)(void *info); + remote_function_f func; void *info; int ret; }; @@ -86,7 +88,7 @@ static void remote_function(void *data) * -EAGAIN - when the process moved away */ static int -task_function_call(struct task_struct *p, int (*func) (void *info), void *info) +task_function_call(struct task_struct *p, remote_function_f func, void *info) { struct remote_function_call data = { .p = p, @@ -110,7 +112,7 @@ task_function_call(struct task_struct *p, int (*func) (void *info), void *info) * * returns: @func return value or -ENXIO when the cpu is offline */ -static int cpu_function_call(int cpu, int (*func) (void *info), void *info) +static int cpu_function_call(int cpu, remote_function_f func, void *info) { struct remote_function_call data = { .p = NULL, @@ -747,7 +749,7 @@ perf_cgroup_mark_enabled(struct perf_event *event, /* * function must be called with interrupts disbled */ -static enum hrtimer_restart perf_cpu_hrtimer_handler(struct hrtimer *hr) +static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr) { struct perf_cpu_context *cpuctx; enum hrtimer_restart ret = HRTIMER_NORESTART; @@ -771,7 +773,7 @@ static enum hrtimer_restart perf_cpu_hrtimer_handler(struct hrtimer *hr) } /* CPU is going down */ -void perf_cpu_hrtimer_cancel(int cpu) +void perf_mux_hrtimer_cancel(int cpu) { struct perf_cpu_context *cpuctx; struct pmu *pmu; @@ -798,11 +800,11 @@ void perf_cpu_hrtimer_cancel(int cpu) local_irq_restore(flags); } -static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu) +static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu) { - struct hrtimer *hr = &cpuctx->hrtimer; + struct hrtimer *timer = &cpuctx->hrtimer; struct pmu *pmu = cpuctx->ctx.pmu; - int timer; + u64 interval; /* no multiplexing needed for SW PMU */ if (pmu->task_ctx_nr == perf_sw_context) @@ -812,29 +814,30 @@ static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu) * check default is sane, if not set then force to * default interval (1/tick) */ - timer = pmu->hrtimer_interval_ms; - if (timer < 1) - timer = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER; - cpuctx->hrtimer_interval
= ns_to_ktime(NSEC_PER_MSEC * timer); + cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval); - hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); - hr->function = perf_cpu_hrtimer_handler; + hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); + timer->function = perf_mux_hrtimer_handler; } -static void perf_cpu_hrtimer_restart(struct perf_cpu_context *cpuctx) +static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx) { - struct hrtimer *hr = &cpuctx->hrtimer; + struct hrtimer *timer = &cpuctx->hrtimer; struct pmu *pmu = cpuctx->ctx.pmu; /* not for SW PMU */ if (pmu->task_ctx_nr == perf_sw_context) - return; + return 0; - if (hrtimer_active(hr)) - return; + if (hrtimer_is_queued(timer)) + return 0; - hrtimer_start(hr, cpuctx->hrtimer_interval, HRTIMER_MODE_REL_PINNED); + hrtimer_start(timer, cpuctx->hrtimer_interval, HRTIMER_MODE_REL_PINNED); + return 0; } void perf_pmu_disable(struct pmu *pmu) @@ -1913,7 +1916,7 @@ group_sched_in(struct perf_event *group_event, if (event_sched_in(group_event, cpuctx, ctx)) { pmu->cancel_txn(pmu); - perf_cpu_hrtimer_restart(cpuctx); + perf_mux_hrtimer_restart(cpuctx); return -EAGAIN; } @@ -1960,7 +1963,7 @@ group_error: pmu->cancel_txn(pmu); - perf_cpu_hrtimer_restart(cpuctx); + perf_mux_hrtimer_restart(cpuctx); return -EAGAIN; } @@ -2233,7 +2236,7 @@ static int __perf_event_enable(void *info) */ if (leader != event) { group_sched_out(leader, cpuctx, ctx); - perf_cpu_hrtimer_restart(cpuctx); + perf_mux_hrtimer_restart(cpuctx); } if (leader->attr.pinned) { update_group_times(leader); @@ -7143,6 +7146,8 @@ perf_event_mux_interval_ms_show(struct device *dev, return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms); } +static DEFINE_MUTEX(mux_interval_mutex); + static ssize_t perf_event_mux_interval_ms_store(struct device *dev, struct device_attribute *attr, @@ -7162,17 +7167,21 @@ perf_event_mux_interval_ms_store(struct device *dev, if (timer == pmu->hrtimer_interval_ms) return count; + mutex_lock(&mux_interval_mutex); pmu->hrtimer_interval_ms = timer; /* update all cpuctx for this PMU */ - for_each_possible_cpu(cpu) { + get_online_cpus(); + for_each_online_cpu(cpu) { struct perf_cpu_context *cpuctx; cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer); - if (hrtimer_active(&cpuctx->hrtimer)) - hrtimer_forward_now(&cpuctx->hrtimer, cpuctx->hrtimer_interval); + cpu_function_call(cpu, + (remote_function_f)perf_mux_hrtimer_restart, cpuctx); } + put_online_cpus(); + mutex_unlock(&mux_interval_mutex); return count; } @@ -7277,7 +7286,7 @@ skip_type: lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); cpuctx->ctx.pmu = pmu; - __perf_cpu_hrtimer_init(cpuctx, cpu); + __perf_mux_hrtimer_init(cpuctx, cpu); cpuctx->unique_pmu = pmu; } From 9183034879a1196714836c8142340c850c747323 Mon Sep 17 00:00:00 2001 From: kbuild test robot Date: Thu, 23 Apr 2015 04:00:00 +0800 Subject: [PATCH 045/106] perf: perf_mux_hrtimer_cancel() can be static Signed-off-by: Fengguang Wu Cc: kbuild-all@01.org Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20150422200000.GA122603@lkp-sb04 Signed-off-by: Thomas Gleixner --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index e7ed00b4a6ed..598182dcc260 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -773,7 +773,7 @@ static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr) 
} /* CPU is going down */ -void perf_mux_hrtimer_cancel(int cpu) +static void perf_mux_hrtimer_cancel(int cpu) { struct perf_cpu_context *cpuctx; struct pmu *pmu; From b484403b9abe5f444ae2fee6a249759bb3c35bcf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 23 Apr 2015 13:58:09 +0200 Subject: [PATCH 046/106] sched: debug: Remove the cfs bandwidth timer_active printout The struct member is gone. Reported-by: fengguang.wu@intel.com Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra --- kernel/sched/debug.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index a245c1fc6f0a..f94724eda407 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -230,8 +230,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) #endif #endif #ifdef CONFIG_CFS_BANDWIDTH - SEQ_printf(m, " .%-30s: %d\n", "tg->cfs_bandwidth.timer_active", - cfs_rq->tg->cfs_bandwidth.timer_active); SEQ_printf(m, " .%-30s: %d\n", "throttled", cfs_rq->throttled); SEQ_printf(m, " .%-30s: %d\n", "throttle_count", From 4101ecc23db90ae7e9b2a4b1cfe4aaf37405b4a8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 13 Apr 2015 16:43:35 +0200 Subject: [PATCH 047/106] power: reset: ltc2952: Remove bogus hrtimer_start() return value checks The return value of hrtimer_start() tells whether the timer was inactive or active already when hrtimer_start() was called. The code emits a bogus warning if the timer was active already, claiming that the timer could not be started. Remove it along with the bogus comment in the else path. Signed-off-by: Thomas Gleixner Acked-by: Frans Klaver Cc: Sebastian Reichel Cc: Dmitry Eremin-Solenikov Cc: David Woodhouse Cc: Wolfram Sang Cc: linux-pm@vger.kernel.org --- drivers/power/reset/ltc2952-poweroff.c | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/drivers/power/reset/ltc2952-poweroff.c b/drivers/power/reset/ltc2952-poweroff.c index 7ef193b6f7fe..5f855f99bdfc 100644 --- a/drivers/power/reset/ltc2952-poweroff.c +++ b/drivers/power/reset/ltc2952-poweroff.c @@ -120,18 +120,7 @@ static enum hrtimer_restart ltc2952_poweroff_timer_wde(struct hrtimer *timer) static void ltc2952_poweroff_start_wde(struct ltc2952_poweroff *data) { - if (hrtimer_start(&data->timer_wde, data->wde_interval, - HRTIMER_MODE_REL)) { - /* - * The device will not toggle the watchdog reset, - * thus shut down is only safe if the PowerPath controller - * has a long enough time-off before triggering a hardware - * power-off.
- * - * Only sending a warning as the system will power-off anyway - */ - dev_err(data->dev, "unable to start the timer\n"); - } + hrtimer_start(&data->timer_wde, data->wde_interval, HRTIMER_MODE_REL); } static enum hrtimer_restart @@ -165,12 +154,10 @@ static irqreturn_t ltc2952_poweroff_handler(int irq, void *dev_id) } if (gpiod_get_value(data->gpio_trigger)) { - if (hrtimer_start(&data->timer_trigger, data->trigger_delay, - HRTIMER_MODE_REL)) - dev_err(data->dev, "unable to start the wait timer\n"); + hrtimer_start(&data->timer_trigger, data->trigger_delay, + HRTIMER_MODE_REL); } else { hrtimer_cancel(&data->timer_trigger); - /* omitting return value check, timer should have been valid */ } return IRQ_HANDLED; } From 30fbd59057004f97f45467124693f22e8b6f3e16 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 May 2015 13:51:12 +0200 Subject: [PATCH 048/106] perf: Remove unused function perf_mux_hrtimer_cancel() Reported-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/events/core.c | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 598182dcc260..f5288293d667 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -772,34 +772,6 @@ static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr) return ret; } -/* CPU is going down */ -static void perf_mux_hrtimer_cancel(int cpu) -{ - struct perf_cpu_context *cpuctx; - struct pmu *pmu; - unsigned long flags; - - if (WARN_ON(cpu != smp_processor_id())) - return; - - local_irq_save(flags); - - rcu_read_lock(); - - list_for_each_entry_rcu(pmu, &pmus, entry) { - cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - - if (pmu->task_ctx_nr == perf_sw_context) - continue; - - hrtimer_cancel(&cpuctx->hrtimer); - } - - rcu_read_unlock(); - - local_irq_restore(flags); -} - static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu) { struct hrtimer *timer = &cpuctx->hrtimer; From 2951d5c031a3aaefa31b688fbf229e75692f4786 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 5 May 2015 10:00:13 +0200 Subject: [PATCH 049/106] tick: broadcast: Prevent livelock from event handler With the removal of the hrtimer softirq the switch to highres/nohz mode happens in the tick interrupt. That leads to a livelock when the per cpu event handler is directly called from the broadcast handler under broadcast lock because broadcast lock needs to be taken for the highres/nohz switch as well. Solve this by calling the cpu local handler outside the broadcast_lock held region. Fixes: c6eb3f70d448 "hrtimer: Get rid of hrtimer softirq" Reported-and-tested-by: Borislav Petkov Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 55 +++++++++++++++++------------------- 1 file changed, 26 insertions(+), 29 deletions(-) diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 7e8ca4f448a8..5d9e4aab9797 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -255,18 +255,18 @@ int tick_receive_broadcast(void) /* * Broadcast the event to the cpus, which are set in the mask (mangled). 
-static void tick_do_broadcast(struct cpumask *mask) +static bool tick_do_broadcast(struct cpumask *mask) { int cpu = smp_processor_id(); struct tick_device *td; + bool local = false; /* * Check, if the current cpu is in the mask */ if (cpumask_test_cpu(cpu, mask)) { cpumask_clear_cpu(cpu, mask); - td = &per_cpu(tick_cpu_device, cpu); - td->evtdev->event_handler(td->evtdev); + local = true; } if (!cpumask_empty(mask)) { @@ -279,16 +279,17 @@ static void tick_do_broadcast(struct cpumask *mask) td = &per_cpu(tick_cpu_device, cpumask_first(mask)); td->evtdev->broadcast(mask); } + return local; } /* * Periodic broadcast: * - invoke the broadcast handlers */ -static void tick_do_periodic_broadcast(void) +static bool tick_do_periodic_broadcast(void) { cpumask_and(tmpmask, cpu_online_mask, tick_broadcast_mask); - tick_do_broadcast(tmpmask); + return tick_do_broadcast(tmpmask); } /* @@ -296,34 +297,26 @@ static void tick_do_periodic_broadcast(void) */ static void tick_handle_periodic_broadcast(struct clock_event_device *dev) { - ktime_t next; + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); + bool bc_local; raw_spin_lock(&tick_broadcast_lock); + bc_local = tick_do_periodic_broadcast(); - tick_do_periodic_broadcast(); + if (dev->state == CLOCK_EVT_STATE_ONESHOT) { + ktime_t next = ktime_add(dev->next_event, tick_period); - /* - * The device is in periodic mode. No reprogramming necessary: - */ - if (dev->state == CLOCK_EVT_STATE_PERIODIC) - goto unlock; - - /* - * Setup the next period for devices, which do not have - * periodic mode. We read dev->next_event first and add to it - * when the event already expired. clockevents_program_event() - * sets dev->next_event only when the event is really - * programmed to the device. - */ - for (next = dev->next_event; ;) { - next = ktime_add(next, tick_period); - - if (!clockevents_program_event(dev, next, false)) - goto unlock; - tick_do_periodic_broadcast(); + clockevents_program_event(dev, next, true); } -unlock: raw_spin_unlock(&tick_broadcast_lock); + + /* + * We run the handler of the local cpu after dropping + * tick_broadcast_lock because the handler might deadlock when + * trying to switch to oneshot mode. + */ + if (bc_local) td->evtdev->event_handler(td->evtdev); } /** @@ -622,9 +615,13 @@ again: cpumask_and(tmpmask, tmpmask, cpu_online_mask); /* - * Wakeup the cpus which have an expired event. + * Wakeup the cpus which have an expired event and handle the + * broadcast event of the local cpu. */ - tick_do_broadcast(tmpmask); + if (tick_do_broadcast(tmpmask)) { + td = this_cpu_ptr(&tick_cpu_device); + td->evtdev->event_handler(td->evtdev); + } /* * Two reasons for reprogram: From 298dbd1c5cd66f0ac85981b83b7d519a5d88d1b8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 5 May 2015 09:44:24 +0200 Subject: [PATCH 050/106] tick: broadcast: Simplify oneshot logic and shorten lock region Simplify the oneshot logic by avoiding the reprogramming loops. That also allows the cpu local handler to be called outside of the broadcast_lock held region.
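The resulting shape of the oneshot handler, sketched (the full hunk is below):

	raw_spin_lock(&tick_broadcast_lock);
	/* ... collect expired cpus, rearm the broadcast device ... */
	bc_local = tick_do_broadcast(tmpmask);
	raw_spin_unlock(&tick_broadcast_lock);

	/* run the cpu local handler without holding broadcast_lock */
	if (bc_local) {
		td = this_cpu_ptr(&tick_cpu_device);
		td->evtdev->event_handler(td->evtdev);
	}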
Tested-by: Borislav Petkov Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 42 +++++++++++++++--------------------- 1 file changed, 17 insertions(+), 25 deletions(-) diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 5d9e4aab9797..12fcc55d607a 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -525,18 +525,14 @@ static void tick_broadcast_set_affinity(struct clock_event_device *bc, irq_set_affinity(bc->irq, bc->cpumask); } -static int tick_broadcast_set_event(struct clock_event_device *bc, int cpu, - ktime_t expires, int force) +static void tick_broadcast_set_event(struct clock_event_device *bc, int cpu, + ktime_t expires) { - int ret; - if (bc->state != CLOCK_EVT_STATE_ONESHOT) clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT); - ret = clockevents_program_event(bc, expires, force); - if (!ret) - tick_broadcast_set_affinity(bc, cpumask_of(cpu)); - return ret; + clockevents_program_event(bc, expires, 1); + tick_broadcast_set_affinity(bc, cpumask_of(cpu)); } static void tick_resume_broadcast_oneshot(struct clock_event_device *bc) @@ -573,9 +569,9 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) struct tick_device *td; ktime_t now, next_event; int cpu, next_cpu = 0; + bool bc_local; raw_spin_lock(&tick_broadcast_lock); -again: dev->next_event.tv64 = KTIME_MAX; next_event.tv64 = KTIME_MAX; cpumask_clear(tmpmask); @@ -615,13 +611,9 @@ again: cpumask_and(tmpmask, tmpmask, cpu_online_mask); /* - * Wakeup the cpus which have an expired event and handle the - * broadcast event of the local cpu. + * Wakeup the cpus which have an expired event. */ - if (tick_do_broadcast(tmpmask)) { - td = this_cpu_ptr(&tick_cpu_device); - td->evtdev->event_handler(td->evtdev); - } + bc_local = tick_do_broadcast(tmpmask); /* * Two reasons for reprogram: @@ -633,15 +625,15 @@ again: * - There are pending events on sleeping CPUs which were not * in the event mask */ - if (next_event.tv64 != KTIME_MAX) { - /* - * Rearm the broadcast device. If event expired, - * repeat the above - */ - if (tick_broadcast_set_event(dev, next_cpu, next_event, 0)) - goto again; - } + if (next_event.tv64 != KTIME_MAX) tick_broadcast_set_event(dev, next_cpu, next_event); + raw_spin_unlock(&tick_broadcast_lock); + + if (bc_local) { td = this_cpu_ptr(&tick_cpu_device); td->evtdev->event_handler(td->evtdev); } } static int broadcast_needs_cpu(struct clock_event_device *bc, int cpu) @@ -723,7 +715,7 @@ int tick_broadcast_oneshot_control(enum tick_broadcast_state state) */ if (!cpumask_test_cpu(cpu, tick_broadcast_force_mask) && dev->next_event.tv64 < bc->next_event.tv64) - tick_broadcast_set_event(bc, cpu, dev->next_event, 1); + tick_broadcast_set_event(bc, cpu, dev->next_event); } /* * If the current CPU owns the hrtimer broadcast @@ -858,7 +850,7 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT); tick_broadcast_init_next_event(tmpmask, tick_next_period); - tick_broadcast_set_event(bc, cpu, tick_next_period, 1); + tick_broadcast_set_event(bc, cpu, tick_next_period); } else bc->next_event.tv64 = KTIME_MAX; } else { From 1ef09cd713c90781b683a0b4e0a874803c172b1d Mon Sep 17 00:00:00 2001 From: Preeti U Murthy Date: Tue, 28 Apr 2015 14:15:20 +0530 Subject: [PATCH 051/106] tick-broadcast: Fix the printing of broadcast masks Today only the first BITS_PER_LONG bits (a single unsigned long) of the broadcast masks are output into /proc/timer_list.
This means that on machines with a larger number of CPUs, the bitmasks of CPUs beyond this range do not appear. Fix this by using bitmap printing through "%*pb" instead, so as to output the broadcast masks for the range of nr_cpu_ids into /proc/timer_list. Signed-off-by: Preeti U Murthy Cc: peterz@infradead.org Cc: linuxppc-dev@ozlabs.org Cc: john.stultz@linaro.org Link: http://lkml.kernel.org/r/20150428084520.3314.62668.stgit@preeti.in.ibm.com Signed-off-by: Thomas Gleixner --- kernel/time/timer_list.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 66f39bba5353..18b074b215b0 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -276,11 +276,11 @@ static void timer_list_show_tickdevices_header(struct seq_file *m) { #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST print_tickdevice(m, tick_get_broadcast_device(), -1); - SEQ_printf(m, "tick_broadcast_mask: %08lx\n", - cpumask_bits(tick_get_broadcast_mask())[0]); + SEQ_printf(m, "tick_broadcast_mask: %*pb\n", + cpumask_pr_args(tick_get_broadcast_mask())); #ifdef CONFIG_TICK_ONESHOT - SEQ_printf(m, "tick_broadcast_oneshot_mask: %08lx\n", - cpumask_bits(tick_get_broadcast_oneshot_mask())[0]); + SEQ_printf(m, "tick_broadcast_oneshot_mask: %*pb\n", + cpumask_pr_args(tick_get_broadcast_oneshot_mask())); #endif SEQ_printf(m, "\n"); #endif From ff569fcd7123d9a9176d59174a27bab9a4fad328 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Sat, 2 May 2015 17:03:23 +0200 Subject: [PATCH 052/106] clocksource: asm9260: Fix of_io_request_and_map error check of_io_request_and_map() returns an error pointer, but the current code assumes that on error the returned pointer will be NULL. Obviously, that makes the check completely useless. Change the test to actually check for the proper error code. Signed-off-by: Maxime Ripard Cc: Daniel Lezcano Link: http://lkml.kernel.org/r/1430579006-32702-4-git-send-email-maxime.ripard@free-electrons.com Signed-off-by: Thomas Gleixner --- drivers/clocksource/asm9260_timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/clocksource/asm9260_timer.c b/drivers/clocksource/asm9260_timer.c index 2c9c993727c8..4c2ba59897e8 100644 --- a/drivers/clocksource/asm9260_timer.c +++ b/drivers/clocksource/asm9260_timer.c @@ -178,7 +178,7 @@ static void __init asm9260_timer_init(struct device_node *np) unsigned long rate; priv.base = of_io_request_and_map(np, 0, np->name); - if (!priv.base) + if (IS_ERR(priv.base)) panic("%s: unable to map resource", np->name); clk = of_clk_get(np, 0); From bd580e7ed4add8ce9b2c1dd1911f8fb9d444128b Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Sat, 2 May 2015 17:03:24 +0200 Subject: [PATCH 053/106] clocksource: integrator: Fix of_io_request_and_map error check of_io_request_and_map() returns an error pointer, but the current code assumes that on error the returned pointer will be NULL. Obviously, that makes the check completely useless. Change the test to actually check for the proper error code.
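The correct idiom for of_io_request_and_map(), which reports failure through ERR_PTR() encoded values rather than NULL (sketch based on the hunks in this series):

	base = of_io_request_and_map(node, 0, "integrator-timer");
	if (IS_ERR(base))	/* e.g. ERR_PTR(-ENOMEM); not NULL */
		return;

A NULL test accepts the error pointer as if it were a valid mapping and merely defers the crash to the first register access.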
Signed-off-by: Maxime Ripard Cc: Daniel Lezcano Link: http://lkml.kernel.org/r/1430579006-32702-5-git-send-email-maxime.ripard@free-electrons.com Signed-off-by: Thomas Gleixner --- drivers/clocksource/timer-integrator-ap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/clocksource/timer-integrator-ap.c b/drivers/clocksource/timer-integrator-ap.c index b9efd30513d5..c97d1980c0f8 100644 --- a/drivers/clocksource/timer-integrator-ap.c +++ b/drivers/clocksource/timer-integrator-ap.c @@ -166,7 +166,7 @@ static void __init integrator_ap_timer_init_of(struct device_node *node) struct device_node *sec_node; base = of_io_request_and_map(node, 0, "integrator-timer"); - if (!base) + if (IS_ERR(base)) return; clk = of_clk_get(node, 0); From 9fa8cc0a85c774b75218ba5f385792be387c190b Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Sat, 2 May 2015 17:03:25 +0200 Subject: [PATCH 054/106] clocksource: sun5i: Fix of_io_request_and_map error check of_io_request_and_map() returns an error pointer, but the current code assumes that on error the returned pointer will be NULL. Obviously, that makes the check completely useless. Change the test to actually check for the proper error code. Signed-off-by: Maxime Ripard Cc: Daniel Lezcano Link: http://lkml.kernel.org/r/1430579006-32702-6-git-send-email-maxime.ripard@free-electrons.com Signed-off-by: Thomas Gleixner --- drivers/clocksource/timer-sun5i.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/clocksource/timer-sun5i.c b/drivers/clocksource/timer-sun5i.c index 28aa4b7bb602..0ffb4ea7c925 100644 --- a/drivers/clocksource/timer-sun5i.c +++ b/drivers/clocksource/timer-sun5i.c @@ -324,7 +324,7 @@ static void __init sun5i_timer_init(struct device_node *node) int irq; timer_base = of_io_request_and_map(node, 0, of_node_full_name(node)); - if (!timer_base) + if (IS_ERR(timer_base)) panic("Can't map registers"); irq = irq_of_parse_and_map(node, 0); From 781978e6e156101209f62b9ebc8783b70ef248de Mon Sep 17 00:00:00 2001 From: Joonwoo Park Date: Mon, 27 Apr 2015 19:21:49 -0700 Subject: [PATCH 055/106] timer: Use timer->base for flag checks At present, internal_add_timer() examines flags with 'base' which doesn't contain flags. Examine with 'timer->base' to avoid unnecessary waking up of nohz CPU when timer base has TIMER_DEFERRABLE set. Signed-off-by: Joonwoo Park Cc: sboyd@codeaurora.org Cc: skannan@codeaurora.org Cc: John Stultz Link: http://lkml.kernel.org/r/1430187709-21087-1-git-send-email-joonwoop@codeaurora.org Signed-off-by: Thomas Gleixner --- kernel/time/timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 03f926c7a8ee..d4af7c56c95d 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -436,7 +436,7 @@ static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) * require special care against races with idle_cpu(), lets deal * with that later.
*/ - if (!tbase_get_deferrable(base) || tick_nohz_full_cpu(base->cpu)) + if (!tbase_get_deferrable(timer->base) || tick_nohz_full_cpu(base->cpu)) wake_up_nohz_cpu(base->cpu); } From 38d23a6cc16c02f7b0c920266053f340b5601735 Mon Sep 17 00:00:00 2001 From: Andreas Sandberg Date: Fri, 24 Apr 2015 13:06:05 +0000 Subject: [PATCH 056/106] tick: hrtimer-broadcast: Prevent endless restarting when broadcast device is unused The hrtimer callback in the hrtimer's tick broadcast code sometimes incorrectly ends up scheduling events at the current tick causing the kernel to hang servicing the same hrtimer forever. This typically happens when a device is swapped out by tick_install_broadcast_device(), which replaces the event handler with clock_events_handle_noop() and sets the device mode to CLOCK_EVT_MODE_UNUSED. If the timer is scheduled when this happens, the next_event field will not be updated and the hrtimer ends up being restarted at the current tick. To prevent this from happening, only try to restart the hrtimer if the broadcast clock event device is in one of the active modes and try to cancel the timer when entering the CLOCK_EVT_MODE_UNUSED mode. Signed-off-by: Andreas Sandberg Tested-by: Catalin Marinas Acked-by: Mark Rutland Reviewed-by: Preeti U Murthy Link: http://lkml.kernel.org/r/1429880765-5558-1-git-send-email-andreas.sandberg@arm.com Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast-hrtimer.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c index 96428d706b16..3e7db49a2381 100644 --- a/kernel/time/tick-broadcast-hrtimer.c +++ b/kernel/time/tick-broadcast-hrtimer.c @@ -22,6 +22,7 @@ static void bc_set_mode(enum clock_event_mode mode, struct clock_event_device *bc) { switch (mode) { + case CLOCK_EVT_MODE_UNUSED: case CLOCK_EVT_MODE_SHUTDOWN: /* * Note, we cannot cancel the timer here as we might @@ -101,10 +102,13 @@ static enum hrtimer_restart bc_handler(struct hrtimer *t) { ce_broadcast_hrtimer.event_handler(&ce_broadcast_hrtimer); - if (ce_broadcast_hrtimer.next_event.tv64 == KTIME_MAX) + switch (ce_broadcast_hrtimer.mode) { + case CLOCK_EVT_MODE_ONESHOT: + if (ce_broadcast_hrtimer.next_event.tv64 != KTIME_MAX) + return HRTIMER_RESTART; + default: return HRTIMER_NORESTART; - - return HRTIMER_RESTART; + } } void tick_setup_hrtimer_broadcast(void) From 6b442bc81337913eb775965a67ffdb8a36935422 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 7 May 2015 14:35:59 +0200 Subject: [PATCH 057/106] nohz: Fix !HIGH_RES_TIMERS hang Simon Horman reported this crash on a system with high-res timers disabled but nohz enabled: > ------------[ cut here ]------------ > kernel BUG at kernel/irq_work.c:135! BUG_ON(!irqs_disabled()); So something enabled interrupts in the periodic tick handling machinery, and that code path indeed has a local_irq_disable()/enable pair in tick_nohz_switch_to_nohz() which causes havoc. Fix it. This patch also fixes a +nohz -hrtimers hang reported by Ingo Molnar. Reported-by: Simon Horman Reported-by: Ingo Molnar Tested-by: Simon Horman Signed-off-by: Thomas Gleixner Cc: Borislav Petkov Cc: H. 
Peter Anvin Cc: LAK Cc: Magnus Damm Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/alpine.DEB.2.11.1505071425520.4225@nanos Signed-off-by: Ingo Molnar --- kernel/time/tick-sched.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 753c211f6195..812f7a3b9898 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -967,11 +967,9 @@ static void tick_nohz_switch_to_nohz(void) if (!tick_nohz_enabled) return; - local_irq_disable(); - if (tick_switch_to_oneshot(tick_nohz_handler)) { - local_irq_enable(); + if (tick_switch_to_oneshot(tick_nohz_handler)) return; - } + tick_nohz_active = 1; ts->nohz_mode = NOHZ_MODE_LOWRES; @@ -986,7 +984,6 @@ static void tick_nohz_switch_to_nohz(void) hrtimer_forward_now(&ts->sched_timer, tick_period); hrtimer_set_expires(&ts->sched_timer, next); tick_program_event(next, 1); - local_irq_enable(); } /* @@ -1171,7 +1168,7 @@ void tick_oneshot_notify(void) * Called cyclic from the hrtimer softirq (driven by the timer * softirq) allow_nohz signals, that we can switch into low-res nohz * mode, because high resolution timers are disabled (either compile - * or runtime). + * or runtime). Called with interrupts disabled. */ int tick_check_oneshot_change(int allow_nohz) { From c78c881824cc3354c0518c0d21febe04ffc69638 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 May 2015 09:14:51 +0200 Subject: [PATCH 058/106] ALSA: drivers: pcsp: Fix printout of resolution The recent conversion of the hrtimer resolution failed to convert the printk format from %li to %u. Reported-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- sound/drivers/pcsp/pcsp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/drivers/pcsp/pcsp.c b/sound/drivers/pcsp/pcsp.c index eb54702037cc..27e25bb78c97 100644 --- a/sound/drivers/pcsp/pcsp.c +++ b/sound/drivers/pcsp/pcsp.c @@ -48,7 +48,7 @@ static int snd_pcsp_create(struct snd_card *card) if (!nopcm) { if (resolution > PCSP_MAX_PERIOD_NS) { printk(KERN_ERR "PCSP: Timer resolution is not sufficient " - "(%linS)\n", resolution); + "(%unS)\n", resolution); printk(KERN_ERR "PCSP: Make sure you have HPET and ACPI " "enabled.\n"); printk(KERN_ERR "PCSP: Turned into nopcm mode.\n"); @@ -61,7 +61,7 @@ static int snd_pcsp_create(struct snd_card *card) else min_div = MAX_DIV; #if PCSP_DEBUG - printk(KERN_DEBUG "PCSP: lpj=%li, min_div=%i, res=%li\n", + printk(KERN_DEBUG "PCSP: lpj=%li, min_div=%i, res=%u\n", loops_per_jiffy, min_div, resolution); #endif From 4cfafd3082afc707653aeb82e9f8e7b596fbbfd6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 14 May 2015 12:23:11 +0200 Subject: [PATCH 059/106] sched,perf: Fix periodic timers In the below two commits (see Fixes) we have periodic timers that can stop themselves when they're no longer required, but need to be (re)-started when their idle condition changes. A further complication is that we want the timer handler to always do the forward such that it will always correctly deal with the overruns, and we do not want to race such that the handler has already decided to stop, but the (external) restart sees the timer still active and we end up with a 'lost' timer. The problem with the current code is that the re-start can come before the callback does the forward, at which point the forward from the callback will WARN about forwarding an enqueued timer.
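To make the race concrete, here is a condensed sketch of the pre-patch pattern (simplified from the code below; the names and the idle condition are illustrative, not the actual kernel symbols):

#include <linux/hrtimer.h>

static ktime_t interval;	/* the timer period */
static bool work_left;		/* idle condition, illustrative */

static enum hrtimer_restart timer_fn(struct hrtimer *hr)
{
	if (!work_left)
		return HRTIMER_NORESTART;	/* decided to stop, but the timer
						 * stays observably active until
						 * we return */
	hrtimer_forward_now(hr, interval);	/* handler always does the forward */
	return HRTIMER_RESTART;
}

static void timer_restart(struct hrtimer *hr)	/* external (re)start path */
{
	if (hrtimer_is_queued(hr))	/* races with the callback above: sees */
		return;			/* the dying timer still queued, skips */
					/* the start -> 'lost' timer */
	hrtimer_start(hr, interval, HRTIMER_MODE_REL);
}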
Now, conceptually its easy to detect if you're before or after the fwd by comparing the expiration time against the current time. Of course, that's expensive (and racy) because we don't have the current time. Alternatively one could cache this state inside the timer, but then everybody pays the overhead of maintaining this extra state, and that is undesired. The only other option that I could see is the external timer_active variable, which I tried to kill before. I would love a nicer interface for this seemingly simple 'problem' but alas. Fixes: 272325c4821f ("perf: Fix mux_interval hrtimer wreckage") Fixes: 77a4d1a1b9a1 ("sched: Cleanup bandwidth timers") Cc: pjt@google.com Cc: tglx@linutronix.de Cc: klamm@yandex-team.ru Cc: mingo@kernel.org Cc: bsegall@google.com Cc: hpa@zytor.com Cc: Sasha Levin Signed-off-by: Peter Zijlstra (Intel) Link: http://lkml.kernel.org/r/20150514102311.GX21418@twins.programming.kicks-ass.net --- include/linux/perf_event.h | 4 ++++ kernel/events/core.c | 29 ++++++++++++++++------------- kernel/sched/core.c | 12 ------------ kernel/sched/fair.c | 17 +++++++++++++---- kernel/sched/rt.c | 8 +++++++- kernel/sched/sched.h | 5 ++--- 6 files changed, 42 insertions(+), 33 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 61992cf2e977..cf3342a8ad80 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -566,8 +566,12 @@ struct perf_cpu_context { struct perf_event_context *task_ctx; int active_oncpu; int exclusive; + + raw_spinlock_t hrtimer_lock; struct hrtimer hrtimer; ktime_t hrtimer_interval; + unsigned int hrtimer_active; + struct pmu *unique_pmu; struct perf_cgroup *cgrp; }; diff --git a/kernel/events/core.c b/kernel/events/core.c index f5288293d667..d9c93f36e379 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -752,24 +752,21 @@ perf_cgroup_mark_enabled(struct perf_event *event, static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr) { struct perf_cpu_context *cpuctx; - enum hrtimer_restart ret = HRTIMER_NORESTART; int rotations = 0; WARN_ON(!irqs_disabled()); cpuctx = container_of(hr, struct perf_cpu_context, hrtimer); - rotations = perf_rotate_context(cpuctx); - /* - * arm timer if needed - */ - if (rotations) { + raw_spin_lock(&cpuctx->hrtimer_lock); + if (rotations) hrtimer_forward_now(hr, cpuctx->hrtimer_interval); - ret = HRTIMER_RESTART; - } + else + cpuctx->hrtimer_active = 0; + raw_spin_unlock(&cpuctx->hrtimer_lock); - return ret; + return rotations ? 
HRTIMER_RESTART : HRTIMER_NORESTART; } static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu) @@ -792,7 +789,8 @@ static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu) cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval); - hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); + raw_spin_lock_init(&cpuctx->hrtimer_lock); + hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); timer->function = perf_mux_hrtimer_handler; } @@ -800,15 +798,20 @@ static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx) { struct hrtimer *timer = &cpuctx->hrtimer; struct pmu *pmu = cpuctx->ctx.pmu; + unsigned long flags; /* not for SW PMU */ if (pmu->task_ctx_nr == perf_sw_context) return 0; - if (hrtimer_is_queued(timer)) - return 0; + raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags); + if (!cpuctx->hrtimer_active) { + cpuctx->hrtimer_active = 1; + hrtimer_forward_now(timer, cpuctx->hrtimer_interval); + hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED); + } + raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags); - hrtimer_start(timer, cpuctx->hrtimer_interval, HRTIMER_MODE_REL_PINNED); return 0; } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d8a6196465d5..e84aeb280777 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -90,18 +90,6 @@ #define CREATE_TRACE_POINTS #include -void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) -{ - /* - * Do not forward the expiration time of active timers; - * we do not want to loose an overrun. - */ - if (!hrtimer_active(period_timer)) - hrtimer_forward_now(period_timer, period); - - hrtimer_start_expires(period_timer, HRTIMER_MODE_ABS_PINNED); -} - DEFINE_MUTEX(sched_domains_mutex); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e3b32ebfe421..69be2825262d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3870,8 +3870,9 @@ static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b) if (runtime_refresh_within(cfs_b, min_left)) return; - start_bandwidth_timer(&cfs_b->slack_timer, - ns_to_ktime(cfs_bandwidth_slack_period)); + hrtimer_start(&cfs_b->slack_timer, + ns_to_ktime(cfs_bandwidth_slack_period), + HRTIMER_MODE_REL); } /* we know any runtime found here is valid as update_curr() precedes return */ @@ -4012,6 +4013,8 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) idle = do_sched_cfs_period_timer(cfs_b, overrun); } + if (idle) + cfs_b->period_active = 0; raw_spin_unlock(&cfs_b->lock); return idle ? 
HRTIMER_NORESTART : HRTIMER_RESTART; @@ -4025,7 +4028,7 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) cfs_b->period = ns_to_ktime(default_cfs_period()); INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq); - hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); cfs_b->period_timer.function = sched_cfs_period_timer; hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); cfs_b->slack_timer.function = sched_cfs_slack_timer; @@ -4039,7 +4042,13 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) { - start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period); + lockdep_assert_held(&cfs_b->lock); + + if (!cfs_b->period_active) { + cfs_b->period_active = 1; + hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period); + hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED); + } } static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index b0febf25d8f1..e43da5391dcd 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -31,6 +31,8 @@ static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) idle = do_sched_rt_period_timer(rt_b, overrun); raw_spin_lock(&rt_b->rt_runtime_lock); } + if (idle) + rt_b->rt_period_active = 0; raw_spin_unlock(&rt_b->rt_runtime_lock); return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; @@ -54,7 +56,11 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) return; raw_spin_lock(&rt_b->rt_runtime_lock); - start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period); + if (!rt_b->rt_period_active) { + rt_b->rt_period_active = 1; + hrtimer_forward_now(&rt_b->rt_period_timer, rt_b->rt_period); + hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED); + } raw_spin_unlock(&rt_b->rt_runtime_lock); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 08606a1f8c4d..f9a58ef373b4 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -131,6 +131,7 @@ struct rt_bandwidth { ktime_t rt_period; u64 rt_runtime; struct hrtimer rt_period_timer; + unsigned int rt_period_active; }; void __dl_clear_params(struct task_struct *p); @@ -215,7 +216,7 @@ struct cfs_bandwidth { s64 hierarchical_quota; u64 runtime_expires; - int idle; + int idle, period_active; struct hrtimer period_timer, slack_timer; struct list_head throttled_cfs_rq; @@ -1406,8 +1407,6 @@ static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { } static inline void sched_avg_update(struct rq *rq) { } #endif -extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period); - /* * __task_rq_lock - lock the rq @p resides on. */ From 0a227985d4a993a322ff72ecbaeee2611d624216 Mon Sep 17 00:00:00 2001 From: Nicholas Mc Guire Date: Mon, 18 May 2015 14:19:12 +0200 Subject: [PATCH 060/106] time: Move timeconst.h into include/generated kernel/time/timeconst.h is moved to include/generated/ and generated by the top level Kbuild. This allows using timeconst.h in an earlier build stage. 
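For illustration, the consumer-side change boils down to swapping the include (taken from the time.c hunk below); the header itself is now produced by the top-level Kbuild from CONFIG_HZ and the bc script:

/* before: relative include, built by rules in kernel/time/Makefile */
#include "timeconst.h"

/* after: generated early by the top-level Kbuild, roughly via
 * (echo $(CONFIG_HZ) | bc -q kernel/time/timeconst.bc) > include/generated/timeconst.h */
#include <generated/timeconst.h>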
Signed-off-by: Nicholas Mc Guire Cc: Masahiro Yamada Cc: Sam Ravnborg Cc: Joe Perches Cc: John Stultz Cc: Andrew Hunter Cc: Paul Turner Cc: Michal Marek Link: http://lkml.kernel.org/r/1431951554-5563-1-git-send-email-hofrat@osadl.org Signed-off-by: Thomas Gleixner --- Kbuild | 34 +++++++++++++++++++++++++++------- kernel/time/Makefile | 17 +---------------- kernel/time/time.c | 2 +- kernel/time/timeconst.bc | 3 ++- 4 files changed, 31 insertions(+), 25 deletions(-) diff --git a/Kbuild b/Kbuild index 6f0d82a9245d..df99a5f53beb 100644 --- a/Kbuild +++ b/Kbuild @@ -2,8 +2,9 @@ # Kbuild for top-level directory of the kernel # This file takes care of the following: # 1) Generate bounds.h -# 2) Generate asm-offsets.h (may need bounds.h) -# 3) Check for missing system calls +# 2) Generate timeconst.h +# 3) Generate asm-offsets.h (may need bounds.h and timeconst.h) +# 4) Check for missing system calls # Default sed regexp - multiline due to syntax constraints define sed-y @@ -47,7 +48,26 @@ $(obj)/$(bounds-file): kernel/bounds.s FORCE $(call filechk,offsets,__LINUX_BOUNDS_H__) ##### -# 2) Generate asm-offsets.h +# 2) Generate timeconst.h + +timeconst-file := include/generated/timeconst.h + +#always += $(timeconst-file) +targets += $(timeconst-file) + +quiet_cmd_gentimeconst = GEN $@ +define cmd_gentimeconst + (echo $(CONFIG_HZ) | bc -q $< ) > $@ +endef +define filechk_gentimeconst + (echo $(CONFIG_HZ) | bc -q $< ) +endef + +$(obj)/$(timeconst-file): kernel/time/timeconst.bc FORCE + $(call filechk,gentimeconst) + +##### +# 3) Generate asm-offsets.h # offsets-file := include/generated/asm-offsets.h @@ -57,7 +77,7 @@ targets += arch/$(SRCARCH)/kernel/asm-offsets.s # We use internal kbuild rules to avoid the "is up to date" message from make arch/$(SRCARCH)/kernel/asm-offsets.s: arch/$(SRCARCH)/kernel/asm-offsets.c \ - $(obj)/$(bounds-file) FORCE + $(obj)/$(timeconst-file) $(obj)/$(bounds-file) FORCE $(Q)mkdir -p $(dir $@) $(call if_changed_dep,cc_s_c) @@ -65,7 +85,7 @@ $(obj)/$(offsets-file): arch/$(SRCARCH)/kernel/asm-offsets.s FORCE $(call filechk,offsets,__ASM_OFFSETS_H__) ##### -# 3) Check for missing system calls +# 4) Check for missing system calls # always += missing-syscalls @@ -77,5 +97,5 @@ quiet_cmd_syscalls = CALL $< missing-syscalls: scripts/checksyscalls.sh $(offsets-file) FORCE $(call cmd,syscalls) -# Keep these two files during make clean -no-clean-files := $(bounds-file) $(offsets-file) +# Keep these three files during make clean +no-clean-files := $(bounds-file) $(offsets-file) $(timeconst-file) diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 01f0312419b3..ffc4cc3dcd47 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -13,19 +13,4 @@ obj-$(CONFIG_TIMER_STATS) += timer_stats.o obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o obj-$(CONFIG_TEST_UDELAY) += test_udelay.o -$(obj)/time.o: $(obj)/timeconst.h - -quiet_cmd_hzfile = HZFILE $@ - cmd_hzfile = echo "hz=$(CONFIG_HZ)" > $@ - -targets += hz.bc -$(obj)/hz.bc: $(objtree)/include/config/hz.h FORCE - $(call if_changed,hzfile) - -quiet_cmd_bc = BC $@ - cmd_bc = bc -q $(filter-out FORCE,$^) > $@ - -targets += timeconst.h -$(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE - $(call if_changed,bc) - +$(obj)/time.o: $(objtree)/include/config/ diff --git a/kernel/time/time.c b/kernel/time/time.c index 2c85b7724af4..4fa1d26a9843 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -41,7 +41,7 @@ #include #include -#include "timeconst.h" +#include #include "timekeeping.h" /* diff --git 
a/kernel/time/timeconst.bc b/kernel/time/timeconst.bc index 511bdf2cafda..c7388dee8635 100644 --- a/kernel/time/timeconst.bc +++ b/kernel/time/timeconst.bc @@ -50,7 +50,7 @@ define timeconst(hz) { print "#include \n\n" print "#if HZ != ", hz, "\n" - print "#error \qkernel/timeconst.h has the wrong HZ value!\q\n" + print "#error \qinclude/generated/timeconst.h has the wrong HZ value!\q\n" print "#endif\n\n" if (hz < 2) { @@ -105,4 +105,5 @@ define timeconst(hz) { halt } +hz = read(); timeconst(hz) From ca42aaf0c8616cde6161ea4391dff364efeee46a Mon Sep 17 00:00:00 2001 From: Nicholas Mc Guire Date: Mon, 18 May 2015 14:19:13 +0200 Subject: [PATCH 061/106] time: Refactor msecs_to_jiffies Refactor the msecs_to_jiffies conditional code part in time.c and jiffies.h putting it into conditional functions rather than #ifdefs to improve readability. [ tglx: Verified that there is no binary code change ] Signed-off-by: Nicholas Mc Guire Cc: Masahiro Yamada Cc: Sam Ravnborg Cc: Joe Perches Cc: John Stultz Cc: Andrew Hunter Cc: Paul Turner Cc: Michal Marek Link: http://lkml.kernel.org/r/1431951554-5563-2-git-send-email-hofrat@osadl.org Signed-off-by: Thomas Gleixner --- include/linux/jiffies.h | 64 ++++++++++++++++++++++++++++++++++++++++- kernel/time/time.c | 57 ++++++++++++------------------------ 2 files changed, 81 insertions(+), 40 deletions(-) diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index c367cbdf73ab..9527ddbb0f1b 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -7,6 +7,7 @@ #include #include #include /* for HZ */ +#include /* * The following defines establish the engineering parameters of the PLL @@ -288,7 +289,68 @@ static inline u64 jiffies_to_nsecs(const unsigned long j) return (u64)jiffies_to_usecs(j) * NSEC_PER_USEC; } -extern unsigned long msecs_to_jiffies(const unsigned int m); +extern unsigned long __msecs_to_jiffies(const unsigned int m); +#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) +/* + * HZ is equal to or smaller than 1000, and 1000 is a nice round + * multiple of HZ, divide with the factor between them, but round + * upwards: + */ +static inline unsigned long _msecs_to_jiffies(const unsigned int m) +{ + return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ); +} +#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) +/* + * HZ is larger than 1000, and HZ is a nice round multiple of 1000 - + * simply multiply with the factor between them. + * + * But first make sure the multiplication result cannot overflow: + */ +static inline unsigned long _msecs_to_jiffies(const unsigned int m) +{ + if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) + return MAX_JIFFY_OFFSET; + return m * (HZ / MSEC_PER_SEC); +} +#else +/* + * Generic case - multiply, round and divide. But first check that if + * we are doing a net multiplication, that we wouldn't overflow: + */ +static inline unsigned long _msecs_to_jiffies(const unsigned int m) +{ + if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) + return MAX_JIFFY_OFFSET; + + return (MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32) + >> MSEC_TO_HZ_SHR32; +} +#endif +/** + * msecs_to_jiffies: - convert milliseconds to jiffies + * @m: time in milliseconds + * + * conversion is done as follows: + * + * - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET) + * + * - 'too large' values [that would result in larger than + * MAX_JIFFY_OFFSET values] mean 'infinite timeout' too. 
+ * + * - all other values are converted to jiffies by either multiplying + * the input value by a factor or dividing it with a factor and + * handling any 32-bit overflows. + * for the details see __msecs_to_jiffies() + * + * the HZ range specific helpers _msecs_to_jiffies() are called from + * __msecs_to_jiffies(). + */ +static inline unsigned long msecs_to_jiffies(const unsigned int m) +{ + return __msecs_to_jiffies(m); +} + extern unsigned long usecs_to_jiffies(const unsigned int u); extern unsigned long timespec_to_jiffies(const struct timespec *value); extern void jiffies_to_timespec(const unsigned long jiffies, diff --git a/kernel/time/time.c b/kernel/time/time.c index 4fa1d26a9843..c42c2c3214fe 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -483,9 +483,11 @@ struct timespec64 ns_to_timespec64(const s64 nsec) } EXPORT_SYMBOL(ns_to_timespec64); #endif -/* - * When we convert to jiffies then we interpret incoming values - * the following way: +/** + * msecs_to_jiffies: - convert milliseconds to jiffies + * @m: time in milliseconds + * + * conversion is done as follows: * * - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET) * @@ -493,51 +495,28 @@ EXPORT_SYMBOL(ns_to_timespec64); * MAX_JIFFY_OFFSET values] mean 'infinite timeout' too. * * - all other values are converted to jiffies by either multiplying - * the input value by a factor or dividing it with a factor + * the input value by a factor or dividing it with a factor and + * handling any 32-bit overflows. + * for the details see __msecs_to_jiffies() * - * We must also be careful about 32-bit overflows. + * msecs_to_jiffies() checks for the passed in value being a constant + * via __builtin_constant_p() allowing gcc to eliminate most of the + * code, __msecs_to_jiffies() is called if the value passed does not + * allow constant folding and the actual conversion must be done at + * runtime. + * the _msecs_to_jiffies helpers are the HZ dependent conversion + * routines found in include/linux/jiffies.h */ -unsigned long msecs_to_jiffies(const unsigned int m) +unsigned long __msecs_to_jiffies(const unsigned int m) { /* * Negative value, means infinite timeout: */ if ((int)m < 0) return MAX_JIFFY_OFFSET; - -#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) - /* - * HZ is equal to or smaller than 1000, and 1000 is a nice - * round multiple of HZ, divide with the factor between them, - * but round upwards: - */ - return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ); -#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) - /* - * HZ is larger than 1000, and HZ is a nice round multiple of - * 1000 - simply multiply with the factor between them. - * - * But first make sure the multiplication result cannot - * overflow: - */ - if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) - return MAX_JIFFY_OFFSET; - - return m * (HZ / MSEC_PER_SEC); -#else - /* - * Generic case - multiply, round and divide. 
But first - * check that if we are doing a net multiplication, that - * we wouldn't overflow: - */ - if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) - return MAX_JIFFY_OFFSET; - - return (MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32) - >> MSEC_TO_HZ_SHR32; -#endif + return _msecs_to_jiffies(m); } -EXPORT_SYMBOL(msecs_to_jiffies); +EXPORT_SYMBOL(__msecs_to_jiffies); unsigned long usecs_to_jiffies(const unsigned int u) { From daa67b4b70568a07fef3cffacb2055891bf42ddb Mon Sep 17 00:00:00 2001 From: Nicholas Mc Guire Date: Mon, 18 May 2015 14:19:14 +0200 Subject: [PATCH 062/106] time: Allow gcc to fold constants when possible To allow constant folding, msecs_to_jiffies() conditionally calls the HZ-dependent _msecs_to_jiffies() helpers or, when gcc cannot figure out constant folding, __msecs_to_jiffies(), which is the renamed original msecs_to_jiffies() function. Signed-off-by: Nicholas Mc Guire Cc: Masahiro Yamada Cc: Sam Ravnborg Cc: Joe Perches Cc: John Stultz Cc: Andrew Hunter Cc: Paul Turner Cc: Michal Marek Link: http://lkml.kernel.org/r/1431951554-5563-3-git-send-email-hofrat@osadl.org Signed-off-by: Thomas Gleixner --- include/linux/jiffies.h | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index 9527ddbb0f1b..5e75af6cf1bc 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -343,12 +343,24 @@ static inline unsigned long _msecs_to_jiffies(const unsigned int m) * handling any 32-bit overflows. * for the details see __msecs_to_jiffies() * - * the HZ range specific helpers _msecs_to_jiffies() are called from - * __msecs_to_jiffies(). + * msecs_to_jiffies() checks for the passed in value being a constant + * via __builtin_constant_p() allowing gcc to eliminate most of the + * code, __msecs_to_jiffies() is called if the value passed does not + * allow constant folding and the actual conversion must be done at + * runtime. + * the HZ range specific helpers _msecs_to_jiffies() are called both + * directly here and from __msecs_to_jiffies() in the case where + * constant folding is not possible. */ static inline unsigned long msecs_to_jiffies(const unsigned int m) { - return __msecs_to_jiffies(m); + if (__builtin_constant_p(m)) { + if ((int)m < 0) + return MAX_JIFFY_OFFSET; + return _msecs_to_jiffies(m); + } else { + return __msecs_to_jiffies(m); + } } extern unsigned long usecs_to_jiffies(const unsigned int u); From 8fff52fd50934580c5108afed12043a774edf728 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Fri, 3 Apr 2015 09:04:04 +0530 Subject: [PATCH 063/106] clockevents: Introduce CLOCK_EVT_STATE_ONESHOT_STOPPED state When no timers/hrtimers are pending, the expiry time is set to a special value: 'KTIME_MAX'. This normally happens with NO_HZ_{IDLE|FULL} in both LOWRES/HIGHRES modes. When 'expiry == KTIME_MAX', we either cancel the 'tick-sched' hrtimer (NOHZ_MODE_HIGHRES) or skip reprogramming the clockevent device (NOHZ_MODE_LOWRES). But the clockevent device has already been reprogrammed from the tick handler for the next tick. As the clock event device is programmed in ONESHOT mode, it will fire at least one more time (unnecessarily). Timers on a few implementations (like arm_arch_timer, etc.) only support PERIODIC mode and their drivers emulate ONESHOT over that. This means that on these platforms we will get spurious interrupts periodically (at the last programmed interval rate, normally the tick rate).
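To see why the emulation keeps firing, a ONESHOT-over-PERIODIC set_next_event might look roughly like this (hypothetical driver, made-up registers, not taken from arm_arch_timer):

#include <linux/io.h>
#include <linux/clockchips.h>

static void __iomem *xxx_base;		/* illustrative */
#define XXX_LOAD	0x00
#define XXX_CTRL	0x08
#define XXX_CTRL_EN	BIT(0)
#define XXX_CTRL_IRQEN	BIT(1)

static int xxx_set_next_event(unsigned long delta,
			      struct clock_event_device *evt)
{
	/* PERIODIC-only hardware: the counter auto-reloads 'delta', so it
	 * fires again and again at this interval, not just the one time
	 * that was actually requested. */
	writel(delta, xxx_base + XXX_LOAD);
	writel(XXX_CTRL_EN | XXX_CTRL_IRQEN, xxx_base + XXX_CTRL);
	return 0;
}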
In order to avoid spurious interrupts, the clockevent device should be stopped or its interrupts should be masked. A simple (yet hacky) solution to get this fixed could be: update hrtimer_force_reprogram() to always reprogram the clockevent device and update clockevent drivers to STOP generating events (or delay them to the max time) when 'expires' is set to KTIME_MAX. But the drawback here is that every clockevent driver has to be hacked for this particular case and it's very easy for new ones to miss this. However, Thomas suggested adding an optional state, ONESHOT_STOPPED, to solve this problem: lkml.org/lkml/2014/5/9/508. This patch adds support for the ONESHOT_STOPPED state in the clockevents core. It will only be available to drivers that implement the state-specific callbacks instead of the legacy ->set_mode() callback. Signed-off-by: Viresh Kumar Reviewed-by: Preeti U. Murthy Cc: linaro-kernel@lists.linaro.org Cc: Frederic Weisbecker Cc: Kevin Hilman Cc: Daniel Lezcano Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/b8b383a03ac07b13312c16850b5106b82e4245b5.1428031396.git.viresh.kumar@linaro.org Signed-off-by: Thomas Gleixner --- include/linux/clockchips.h | 7 ++++++- kernel/time/clockevents.c | 14 +++++++++++++- kernel/time/timer_list.c | 6 ++++++ 3 files changed, 25 insertions(+), 2 deletions(-) diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index 96c280b2c263..271fa4c8eb29 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -37,12 +37,15 @@ enum clock_event_mode { * reached from DETACHED or SHUTDOWN. * ONESHOT: Device is programmed to generate event only once. Can be reached * from DETACHED or SHUTDOWN. + * ONESHOT_STOPPED: Device was programmed in ONESHOT mode and is temporarily + * stopped. */ enum clock_event_state { CLOCK_EVT_STATE_DETACHED, CLOCK_EVT_STATE_SHUTDOWN, CLOCK_EVT_STATE_PERIODIC, CLOCK_EVT_STATE_ONESHOT, + CLOCK_EVT_STATE_ONESHOT_STOPPED, }; /* @@ -90,6 +93,7 @@ enum clock_event_state { * @set_mode: legacy set mode function, only for modes <= CLOCK_EVT_MODE_RESUME. * @set_state_periodic: switch state to periodic, if !set_mode * @set_state_oneshot: switch state to oneshot, if !set_mode + * @set_state_oneshot_stopped: switch state to oneshot_stopped, if !set_mode * @set_state_shutdown: switch state to shutdown, if !set_mode * @tick_resume: resume clkevt device, if !set_mode * @broadcast: function to broadcast events @@ -121,11 +125,12 @@ struct clock_event_device { * State transition callback(s): Only one of the two groups should be * defined: * - set_mode(), only for modes <= CLOCK_EVT_MODE_RESUME. - * - set_state_{shutdown|periodic|oneshot}(), tick_resume(). + * - set_state_{shutdown|periodic|oneshot|oneshot_stopped}(), tick_resume().
*/ void (*set_mode)(enum clock_event_mode mode, struct clock_event_device *); int (*set_state_periodic)(struct clock_event_device *); int (*set_state_oneshot)(struct clock_event_device *); + int (*set_state_oneshot_stopped)(struct clock_event_device *); int (*set_state_shutdown)(struct clock_event_device *); int (*tick_resume)(struct clock_event_device *); diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 637a09461c1d..dc6afb485027 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -134,6 +134,17 @@ static int __clockevents_set_state(struct clock_event_device *dev, return -ENOSYS; return dev->set_state_oneshot(dev); + case CLOCK_EVT_STATE_ONESHOT_STOPPED: + /* Core internal bug */ + if (WARN_ONCE(dev->state != CLOCK_EVT_STATE_ONESHOT, + "Current state: %d\n", dev->state)) + return -EINVAL; + + if (dev->set_state_oneshot_stopped) + return dev->set_state_oneshot_stopped(dev); + else + return -ENOSYS; + default: return -ENOSYS; } @@ -445,7 +456,8 @@ static int clockevents_sanity_check(struct clock_event_device *dev) if (dev->set_mode) { /* We shouldn't be supporting new modes now */ WARN_ON(dev->set_state_periodic || dev->set_state_oneshot || - dev->set_state_shutdown || dev->tick_resume); + dev->set_state_shutdown || dev->tick_resume || + dev->set_state_oneshot_stopped); BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); return 0; diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 18b074b215b0..1327004429be 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -258,6 +258,12 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) SEQ_printf(m, "\n"); } + if (dev->set_state_oneshot_stopped) { + SEQ_printf(m, " oneshot stopped: "); + print_name_offset(m, dev->set_state_oneshot_stopped); + SEQ_printf(m, "\n"); + } + if (dev->tick_resume) { SEQ_printf(m, " resume: "); print_name_offset(m, dev->tick_resume); From d25408756accbd2171abaa0678f986adae139e6f Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Fri, 3 Apr 2015 09:04:05 +0530 Subject: [PATCH 064/106] clockevents: Stop unused clockevent devices To avoid getting spurious interrupts on a tickless CPU, the clockevent device can now be stopped by switching it to the ONESHOT_STOPPED state. The natural place for handling this transition is tick_program_event(). On 'expires == KTIME_MAX' we used to skip programming the event, so such call sites have to be fixed as well to always call tick_program_event() irrespective of the expires value. Once the clockevent device is required again, check if it was earlier put into the ONESHOT_STOPPED state. If yes, switch its state to ONESHOT before programming its event. To make sure we haven't missed any corner case, add a WARN() for the case where we try to program an event on a device that is not in ONESHOT state.
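For a driver that opts in, the new callback can be as simple as stopping the counter and masking its interrupt, wired up via the .set_state_oneshot_stopped member of struct clock_event_device. A hedged sketch, reusing the made-up xxx_base/XXX_CTRL registers from the earlier example:

static int xxx_set_state_oneshot_stopped(struct clock_event_device *evt)
{
	/* stop the counter and mask its IRQ; no further events until the
	 * core switches us back to ONESHOT and programs a new event */
	writel(0, xxx_base + XXX_CTRL);
	return 0;
}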
Signed-off-by: Viresh Kumar Cc: linaro-kernel@lists.linaro.org Cc: Frederic Weisbecker Cc: Kevin Hilman Cc: Daniel Lezcano Cc: Preeti U Murthy Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/5146b07be7f0bc497e0ebae036590ec2fa73e540.1428031396.git.viresh.kumar@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/clockevents.c | 4 ++++ kernel/time/hrtimer.c | 6 ++---- kernel/time/tick-oneshot.c | 16 ++++++++++++++++ 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index dc6afb485027..4922f1b805ea 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -331,6 +331,10 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, if (dev->state == CLOCK_EVT_STATE_SHUTDOWN) return 0; + /* We must be in ONESHOT state here */ + WARN_ONCE(dev->state != CLOCK_EVT_STATE_ONESHOT, "Current state: %d\n", + dev->state); + /* Shortcut for clockevent devices that can deal with ktime. */ if (dev->features & CLOCK_EVT_FEAT_KTIME) return dev->set_next_ktime(expires, dev); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 4adf32067862..278d4b36fd94 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -550,8 +550,7 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) if (cpu_base->hang_detected) return; - if (cpu_base->expires_next.tv64 != KTIME_MAX) - tick_program_event(cpu_base->expires_next, 1); + tick_program_event(cpu_base->expires_next, 1); } /* @@ -1237,8 +1236,7 @@ retry: raw_spin_unlock(&cpu_base->lock); /* Reprogramming necessary ? */ - if (expires_next.tv64 == KTIME_MAX || - !tick_program_event(expires_next, 0)) { + if (!tick_program_event(expires_next, 0)) { cpu_base->hang_detected = 0; return; } diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 67a64b1670bf..f8de75715c2f 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -28,6 +28,22 @@ int tick_program_event(ktime_t expires, int force) { struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); + if (unlikely(expires.tv64 == KTIME_MAX)) { + /* + * We don't need the clock event device any more, stop it. + */ + clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT_STOPPED); + return 0; + } + + if (unlikely(dev->state == CLOCK_EVT_STATE_ONESHOT_STOPPED)) { + /* + * We need the clock event again, configure it in ONESHOT mode + * before using it. + */ + clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT); + } + return clockevents_program_event(dev, expires, force); } From 4e3d9cb0134fea035e6eb1707e5e7d8aaffa186d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 19 May 2015 17:14:51 +0200 Subject: [PATCH 065/106] jiffies: Remove the extra indentation level Somehow I missed cleaning that up when applying the patches. Fix it up now.
Reported-by: Joe Perches Signed-off-by: Thomas Gleixner Cc: Nicholas Mc Guire --- include/linux/jiffies.h | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index 5e75af6cf1bc..3bde5eb8568b 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -298,7 +298,7 @@ extern unsigned long __msecs_to_jiffies(const unsigned int m); */ static inline unsigned long _msecs_to_jiffies(const unsigned int m) { - return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ); + return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ); } #elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) /* @@ -309,9 +309,9 @@ static inline unsigned long _msecs_to_jiffies(const unsigned int m) */ static inline unsigned long _msecs_to_jiffies(const unsigned int m) { - if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) - return MAX_JIFFY_OFFSET; - return m * (HZ / MSEC_PER_SEC); + if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) + return MAX_JIFFY_OFFSET; + return m * (HZ / MSEC_PER_SEC); } #else /* @@ -320,11 +320,10 @@ static inline unsigned long _msecs_to_jiffies(const unsigned int m) */ static inline unsigned long _msecs_to_jiffies(const unsigned int m) { - if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) - return MAX_JIFFY_OFFSET; + if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) + return MAX_JIFFY_OFFSET; - return (MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32) - >> MSEC_TO_HZ_SHR32; + return (MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32) >> MSEC_TO_HZ_SHR32; } #endif /** From 6f7d79849a00bba82d3139ff91ff2aaabd12841e Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Mon, 1 Dec 2014 23:04:06 -0500 Subject: [PATCH 066/106] time: Make sure tz_minuteswest is set to a valid value when setting time Invalid values may overflow later, leading to undefined behaviour when multiplied by 60 to get the number of seconds. Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Sasha Levin Signed-off-by: John Stultz --- kernel/time/time.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/time/time.c b/kernel/time/time.c index c42c2c3214fe..972e3bbac963 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -173,6 +173,10 @@ int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz) return error; if (tz) { + /* Verify we're witin the +-15 hrs range */ + if (tz->tz_minuteswest > 15*60 || tz->tz_minuteswest < -15*60) + return -EINVAL; + sys_tz = *tz; update_vsyscall_tz(); if (firsttime) { From 6374f9124efea5fae9cba263108583c39e22f86b Mon Sep 17 00:00:00 2001 From: Harald Geyer Date: Tue, 7 Apr 2015 11:12:35 +0000 Subject: [PATCH 067/106] timekeeping: Provide new API to get the current time resolution This patch series introduces a new function u32 ktime_get_resolution_ns(void) which allows cleaning up some driver code. In particular, the IIO subsystem has a function to provide timestamps for events but no means to get their resolution. So currently the dht11 driver tries to guess the resolution in a rather messy and convoluted way. We can do much better with the new code. This API is not designed to be exposed to user space. This has been tested on i386, sunxi and mxs.
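A sketch of the intended driver-side use (illustrative only; the threshold and the message are made up, not taken from the dht11 driver):

/* bail out if event timestamps are too coarse for the protocol timing */
if (ktime_get_resolution_ns() > 1000)
	pr_warn("timer resolution too coarse, readings may be unreliable\n");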
Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Harald Geyer [jstultz: Tweaked to make it build after upstream changes] Signed-off-by: John Stultz --- include/linux/timekeeping.h | 1 + kernel/time/timekeeping.c | 17 +++++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 99176af216af..9af5c1214682 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -163,6 +163,7 @@ extern ktime_t ktime_get(void); extern ktime_t ktime_get_with_offset(enum tk_offsets offs); extern ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs); extern ktime_t ktime_get_raw(void); +extern u32 ktime_get_resolution_ns(void); /** * ktime_get_real - get the real (wall-) time in ktime_t format diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 3365e32dc208..85d376396313 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -702,6 +702,23 @@ ktime_t ktime_get(void) } EXPORT_SYMBOL_GPL(ktime_get); +u32 ktime_get_resolution_ns(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned int seq; + u32 nsecs; + + WARN_ON(timekeeping_suspended); + + do { + seq = read_seqcount_begin(&tk_core.seq); + nsecs = tk->tkr_mono.mult >> tk->tkr_mono.shift; + } while (read_seqcount_retry(&tk_core.seq, seq)); + + return nsecs; +} +EXPORT_SYMBOL_GPL(ktime_get_resolution_ns); + static ktime_t *offsets[TK_OFFS_MAX] = { [TK_OFFS_REAL] = &tk_core.timekeeper.offs_real, [TK_OFFS_BOOT] = &tk_core.timekeeper.offs_boot, From 57d05a93ada77c4f8a6112cbc867a2948dce7991 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 13 May 2015 16:04:47 -0700 Subject: [PATCH 068/106] time: Rework debugging variables so they aren't global Ingo suggested that the timekeeping debugging variables recently added should not be global, and should be tied to the timekeeper's read_base. Thus this patch implements that suggestion. This version is different from the earlier versions as it keeps the variables in the timekeeper structure rather than in the tkr. Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Prarit Bhargava Cc: Richard Cochran Signed-off-by: John Stultz --- include/linux/timekeeper_internal.h | 15 +++++++++++++ kernel/time/timekeeping.c | 33 ++++++++++------------------- 2 files changed, 26 insertions(+), 22 deletions(-) diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index 6f8276ae579c..e1f5a1136554 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -61,6 +61,9 @@ struct tk_read_base { * shifted nano seconds. * @ntp_error_shift: Shift conversion between clock shifted nano seconds and * ntp shifted nano seconds. + * @last_warning: Warning ratelimiter (DEBUG_TIMEKEEPING) + * @underflow_seen: Underflow warning flag (DEBUG_TIMEKEEPING) + * @overflow_seen: Overflow warning flag (DEBUG_TIMEKEEPING) * * Note: For timespec(64) based interfaces wall_to_monotonic is what * we need to add to xtime (or xtime corrected for sub jiffie times) @@ -106,6 +109,18 @@ struct timekeeper { s64 ntp_error; u32 ntp_error_shift; u32 ntp_err_mult; +#ifdef CONFIG_DEBUG_TIMEKEEPING + long last_warning; + /* + * These simple flag variables are managed + * without locks, which is racy, but they are + * ok since we don't really care about being + * super precise about how many events were + * seen, just that a problem was observed.
+ */ + int underflow_seen; + int overflow_seen; +#endif }; #ifdef CONFIG_GENERIC_TIME_VSYSCALL diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 85d376396313..2f10b6557a1c 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -118,18 +118,6 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) #ifdef CONFIG_DEBUG_TIMEKEEPING #define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */ -/* - * These simple flag variables are managed - * without locks, which is racy, but ok since - * we don't really care about being super - * precise about how many events were seen, - * just that a problem was observed. - */ -static int timekeeping_underflow_seen; -static int timekeeping_overflow_seen; - -/* last_warning is only modified under the timekeeping lock */ -static long timekeeping_last_warning; static void timekeeping_check_update(struct timekeeper *tk, cycle_t offset) { @@ -149,29 +137,30 @@ static void timekeeping_check_update(struct timekeeper *tk, cycle_t offset) } } - if (timekeeping_underflow_seen) { - if (jiffies - timekeeping_last_warning > WARNING_FREQ) { + if (tk->underflow_seen) { + if (jiffies - tk->last_warning > WARNING_FREQ) { printk_deferred("WARNING: Underflow in clocksource '%s' observed, time update ignored.\n", name); printk_deferred(" Please report this, consider using a different clocksource, if possible.\n"); printk_deferred(" Your kernel is probably still fine.\n"); - timekeeping_last_warning = jiffies; + tk->last_warning = jiffies; } - timekeeping_underflow_seen = 0; + tk->underflow_seen = 0; } - if (timekeeping_overflow_seen) { - if (jiffies - timekeeping_last_warning > WARNING_FREQ) { + if (tk->overflow_seen) { + if (jiffies - tk->last_warning > WARNING_FREQ) { printk_deferred("WARNING: Overflow in clocksource '%s' observed, time update capped.\n", name); printk_deferred(" Please report this, consider using a different clocksource, if possible.\n"); printk_deferred(" Your kernel is probably still fine.\n"); - timekeeping_last_warning = jiffies; + tk->last_warning = jiffies; } - timekeeping_overflow_seen = 0; + tk->overflow_seen = 0; } } static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr) { + struct timekeeper *tk = &tk_core.timekeeper; cycle_t now, last, mask, max, delta; unsigned int seq; @@ -197,13 +186,13 @@ static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr) * mask-relative negative values. */ if (unlikely((~delta & mask) < (mask >> 3))) { - timekeeping_underflow_seen = 1; + tk->underflow_seen = 1; delta = 0; } /* Cap delta value to the max_cycles values to avoid mult overflows */ if (unlikely(delta > max)) { - timekeeping_overflow_seen = 1; + tk->overflow_seen = 1; delta = tkr->clock->max_cycles; } From 4e413e8526aa53393d0b3d9ecbdb0436203586ee Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Thu, 7 May 2015 16:20:34 -0700 Subject: [PATCH 069/106] tracing: timer: Add deferrable flag to timer_start The timer_start event now shows whether the timer is deferrable in case of a low-res timer. The debug_activate function now includes a deferrable flag while calling the trace_timer_start event. 
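With the flag in place, a timer_start record in the trace output would look roughly like this (pointer and tick values are illustrative; the format follows the TP_printk below):

timer_start: timer=ffff88007c4acb08 function=process_timeout expires=4294739200 [timeout=250] defer=n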
Cc: Thomas Gleixner Cc: Ingo Molnar Acked-by: Steven Rostedt Signed-off-by: Badhri Jagan Sridharan [jstultz: Fixed minor whitespace and grammer tweaks pointed out by Ingo] Signed-off-by: John Stultz --- include/trace/events/timer.h | 13 +++++++++---- kernel/time/timer.c | 2 +- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h index 68c2c2000f02..d7abef1fe6e0 100644 --- a/include/trace/events/timer.h +++ b/include/trace/events/timer.h @@ -43,15 +43,18 @@ DEFINE_EVENT(timer_class, timer_init, */ TRACE_EVENT(timer_start, - TP_PROTO(struct timer_list *timer, unsigned long expires), + TP_PROTO(struct timer_list *timer, + unsigned long expires, + unsigned int deferrable), - TP_ARGS(timer, expires), + TP_ARGS(timer, expires, deferrable), TP_STRUCT__entry( __field( void *, timer ) __field( void *, function ) __field( unsigned long, expires ) __field( unsigned long, now ) + __field( unsigned int, deferrable ) ), TP_fast_assign( @@ -59,11 +62,13 @@ TRACE_EVENT(timer_start, __entry->function = timer->function; __entry->expires = expires; __entry->now = jiffies; + __entry->deferrable = deferrable; ), - TP_printk("timer=%p function=%pf expires=%lu [timeout=%ld]", + TP_printk("timer=%p function=%pf expires=%lu [timeout=%ld] defer=%c", __entry->timer, __entry->function, __entry->expires, - (long)__entry->expires - __entry->now) + (long)__entry->expires - __entry->now, + __entry->deferrable > 0 ? 'y':'n') ); /** diff --git a/kernel/time/timer.c b/kernel/time/timer.c index d4af7c56c95d..7775d454a204 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -650,7 +650,7 @@ static inline void debug_activate(struct timer_list *timer, unsigned long expires) { debug_timer_activate(timer); - trace_timer_start(timer, expires); + trace_timer_start(timer, expires, tbase_get_deferrable(timer->base)); } static inline void debug_deactivate(struct timer_list *timer) From 30f3b3f9836c7c7e1f42ca855bbe8127fff4b99a Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Thu, 9 Apr 2015 09:04:40 +0800 Subject: [PATCH 070/106] time: Include math64.h in time64.h On 32-bit systems, timespec64_add_ns() calls __iter_div_u64_rem() which needs math64.h, and we want to include time64.h in some cases. Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Xunlei Pang Signed-off-by: John Stultz --- include/linux/time64.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/time64.h b/include/linux/time64.h index a3831478d9cf..12d4e82b0276 100644 --- a/include/linux/time64.h +++ b/include/linux/time64.h @@ -2,6 +2,7 @@ #define _LINUX_TIME64_H #include +#include typedef __s64 time64_t; From 689911c734ca77d682b6507354c704e06a64b1b6 Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Thu, 9 Apr 2015 09:04:41 +0800 Subject: [PATCH 071/106] s390: time: Provide read_boot_clock64() and read_persistent_clock64() As part of addressing the "y2038 problem" for in-kernel uses, this patch converts read_boot_clock() to read_boot_clock64() and read_persistent_clock() to read_persistent_clock64() using timespec64. 
Rename some instances of 'timespec' to 'timespec64' in time.c and related references Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: linux390@de.ibm.com Signed-off-by: Xunlei Pang [jstultz: Fixed minor style and grammer tweaks pointed out by Ingo] Signed-off-by: John Stultz --- arch/s390/include/asm/timex.h | 5 +++-- arch/s390/kernel/debug.c | 11 ++++++----- arch/s390/kernel/time.c | 6 +++--- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/arch/s390/include/asm/timex.h b/arch/s390/include/asm/timex.h index 98eb2a579223..dcb6312a0b91 100644 --- a/arch/s390/include/asm/timex.h +++ b/arch/s390/include/asm/timex.h @@ -10,6 +10,7 @@ #define _ASM_S390_TIMEX_H #include +#include /* The value of the TOD clock for 1.1.1970. */ #define TOD_UNIX_EPOCH 0x7d91048bca000000ULL @@ -108,10 +109,10 @@ int get_sync_clock(unsigned long long *clock); void init_cpu_timer(void); unsigned long long monotonic_clock(void); -void tod_to_timeval(__u64, struct timespec *); +void tod_to_timeval(__u64 todval, struct timespec64 *xt); static inline -void stck_to_timespec(unsigned long long stck, struct timespec *ts) +void stck_to_timespec64(unsigned long long stck, struct timespec64 *ts) { tod_to_timeval(stck - TOD_UNIX_EPOCH, ts); } diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c index c1f21aca76e7..6fca0e46464e 100644 --- a/arch/s390/kernel/debug.c +++ b/arch/s390/kernel/debug.c @@ -1457,23 +1457,24 @@ int debug_dflt_header_fn(debug_info_t * id, struct debug_view *view, int area, debug_entry_t * entry, char *out_buf) { - struct timespec time_spec; + struct timespec64 time_spec; char *except_str; unsigned long caller; int rc = 0; unsigned int level; level = entry->id.fields.level; - stck_to_timespec(entry->id.stck, &time_spec); + stck_to_timespec64(entry->id.stck, &time_spec); if (entry->id.fields.exception) except_str = "*"; else except_str = "-"; caller = ((unsigned long) entry->caller) & PSW_ADDR_INSN; - rc += sprintf(out_buf, "%02i %011lu:%06lu %1u %1s %02i %p ", - area, time_spec.tv_sec, time_spec.tv_nsec / 1000, level, - except_str, entry->id.fields.cpuid, (void *) caller); + rc += sprintf(out_buf, "%02i %011lld:%06lu %1u %1s %02i %p ", + area, (long long)time_spec.tv_sec, + time_spec.tv_nsec / 1000, level, except_str, + entry->id.fields.cpuid, (void *)caller); return rc; } EXPORT_SYMBOL(debug_dflt_header_fn); diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index 170ddd2018b3..9e733d965e08 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -76,7 +76,7 @@ unsigned long long monotonic_clock(void) } EXPORT_SYMBOL(monotonic_clock); -void tod_to_timeval(__u64 todval, struct timespec *xt) +void tod_to_timeval(__u64 todval, struct timespec64 *xt) { unsigned long long sec; @@ -181,12 +181,12 @@ static void timing_alert_interrupt(struct ext_code ext_code, static void etr_reset(void); static void stp_reset(void); -void read_persistent_clock(struct timespec *ts) +void read_persistent_clock64(struct timespec64 *ts) { tod_to_timeval(get_tod_clock() - TOD_UNIX_EPOCH, ts); } -void read_boot_clock(struct timespec *ts) +void read_boot_clock64(struct timespec64 *ts) { tod_to_timeval(sched_clock_base_cc - TOD_UNIX_EPOCH, ts); } From e83d0a4106d81dd08b70318f078f3bad6acdc110 Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Thu, 9 Apr 2015 09:04:42 +0800 Subject: [PATCH 072/106] time: Remove read_boot_clock() Now that we have a read_boot_clock64() function available on every architecture, and converted all the users to it, it's time to 
remove the (now unused) read_boot_clock() completely from the kernel. Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Xunlei Pang [jstultz: Minor commit message tweak suggested by Ingo] Signed-off-by: John Stultz --- include/linux/timekeeping.h | 1 - kernel/time/timekeeping.c | 14 +++----------- 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 9af5c1214682..3aa72e648650 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -267,7 +267,6 @@ extern int persistent_clock_is_local; extern void read_persistent_clock(struct timespec *ts); extern void read_persistent_clock64(struct timespec64 *ts); -extern void read_boot_clock(struct timespec *ts); extern void read_boot_clock64(struct timespec64 *ts); extern int update_persistent_clock(struct timespec now); extern int update_persistent_clock64(struct timespec64 now); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 2f10b6557a1c..90ed5db67c1d 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1188,28 +1188,20 @@ void __weak read_persistent_clock64(struct timespec64 *ts64) } /** - * read_boot_clock - Return time of the system start. + * read_boot_clock64 - Return time of the system start. * * Weak dummy function for arches that do not yet support it. * Function to read the exact time the system has been started. - * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported. + * Returns a timespec64 with tv_sec=0 and tv_nsec=0 if unsupported. * * XXX - Do be sure to remove it once all arches implement it. */ -void __weak read_boot_clock(struct timespec *ts) +void __weak read_boot_clock64(struct timespec64 *ts) { ts->tv_sec = 0; ts->tv_nsec = 0; } -void __weak read_boot_clock64(struct timespec64 *ts64) -{ - struct timespec ts; - - read_boot_clock(&ts); - *ts64 = timespec_to_timespec64(ts); -} - /* Flag for if timekeeping_resume() has injected sleeptime */ static bool sleeptime_injected; From ac34ad27fc160b5bd31c731cdaaf6e1d1890ccb2 Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Fri, 16 Jan 2015 10:05:51 +0100 Subject: [PATCH 073/106] clockevents: Do not suspend/resume if unused There is no point in calling suspend/resume for unused clockevents as they are already stopped and disabled. This is really important for AT91 as the hardware is a trainwreck and takes ages to synchronize. 
Reported-by: Sylvain Rochet Signed-off-by: Alexandre Belloni Cc: Daniel Lezcano Cc: Nicolas Ferre Cc: Boris Brezillon Cc: Maxime Ripard Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/1421399151-26800-1-git-send-email-alexandre.belloni@free-electrons.com Signed-off-by: Thomas Gleixner --- kernel/time/clockevents.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 4922f1b805ea..2a5c369e50ab 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -638,7 +638,7 @@ void clockevents_suspend(void) struct clock_event_device *dev; list_for_each_entry_reverse(dev, &clockevent_devices, list) - if (dev->suspend) + if (dev->suspend && dev->mode != CLOCK_EVT_MODE_UNUSED) dev->suspend(dev); } @@ -650,7 +650,7 @@ void clockevents_resume(void) struct clock_event_device *dev; list_for_each_entry(dev, &clockevent_devices, list) - if (dev->resume) + if (dev->resume && dev->mode != CLOCK_EVT_MODE_UNUSED) dev->resume(dev); } From 4ba15d1d41ba9a51da2dc986c145b7514cc394be Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Fri, 10 Apr 2015 16:11:02 -0700 Subject: [PATCH 074/106] clocksource/drivers/qcom: Remove dead code This code is no longer used now that mach-msm has been removed. Delete it. Cc: David Brown Cc: Bryan Huntsman Cc: Daniel Walker Signed-off-by: Stephen Boyd Signed-off-by: Daniel Lezcano --- drivers/clocksource/qcom-timer.c | 59 -------------------------------- 1 file changed, 59 deletions(-) diff --git a/drivers/clocksource/qcom-timer.c b/drivers/clocksource/qcom-timer.c index 098c542e5c53..cba2d015564c 100644 --- a/drivers/clocksource/qcom-timer.c +++ b/drivers/clocksource/qcom-timer.c @@ -40,8 +40,6 @@ #define GPT_HZ 32768 -#define MSM_DGT_SHIFT 5 - static void __iomem *event_base; static void __iomem *sts_base; @@ -232,7 +230,6 @@ err: register_current_timer_delay(&msm_delay_timer); } -#ifdef CONFIG_ARCH_QCOM static void __init msm_dt_timer_init(struct device_node *np) { u32 freq; @@ -285,59 +282,3 @@ static void __init msm_dt_timer_init(struct device_node *np) } CLOCKSOURCE_OF_DECLARE(kpss_timer, "qcom,kpss-timer", msm_dt_timer_init); CLOCKSOURCE_OF_DECLARE(scss_timer, "qcom,scss-timer", msm_dt_timer_init); -#else - -static int __init msm_timer_map(phys_addr_t addr, u32 event, u32 source, - u32 sts) -{ - void __iomem *base; - - base = ioremap(addr, SZ_256); - if (!base) { - pr_err("Failed to map timer base\n"); - return -ENOMEM; - } - event_base = base + event; - source_base = base + source; - if (sts) - sts_base = base + sts; - - return 0; -} - -static notrace cycle_t msm_read_timer_count_shift(struct clocksource *cs) -{ - /* - * Shift timer count down by a constant due to unreliable lower bits - * on some targets. 
- */ - return msm_read_timer_count(cs) >> MSM_DGT_SHIFT; -} - -void __init msm7x01_timer_init(void) -{ - struct clocksource *cs = &msm_clocksource; - - if (msm_timer_map(0xc0100000, 0x0, 0x10, 0x0)) - return; - cs->read = msm_read_timer_count_shift; - cs->mask = CLOCKSOURCE_MASK((32 - MSM_DGT_SHIFT)); - /* 600 KHz */ - msm_timer_init(19200000 >> MSM_DGT_SHIFT, 32 - MSM_DGT_SHIFT, 7, - false); -} - -void __init msm7x30_timer_init(void) -{ - if (msm_timer_map(0xc0100000, 0x4, 0x24, 0x80)) - return; - msm_timer_init(24576000 / 4, 32, 1, false); -} - -void __init qsd8x50_timer_init(void) -{ - if (msm_timer_map(0xAC100000, 0x0, 0x10, 0x34)) - return; - msm_timer_init(19200000 / 4, 32, 7, false); -} -#endif From 37285674f3f14d9f0a7e26565b48e68e3edc635f Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 30 Apr 2015 13:42:51 +0900 Subject: [PATCH 075/106] clocksource/drivers/exynos_mct: Change exynos4_mct_tick_clear return type to void Return value of exynos4_mct_tick_clear() was never checked so it can be safely changed to void. Signed-off-by: Krzysztof Kozlowski Signed-off-by: Daniel Lezcano --- drivers/clocksource/exynos_mct.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/clocksource/exynos_mct.c b/drivers/clocksource/exynos_mct.c index 83564c9cfdbe..87c2e558c465 100644 --- a/drivers/clocksource/exynos_mct.c +++ b/drivers/clocksource/exynos_mct.c @@ -413,7 +413,7 @@ static inline void exynos4_tick_set_mode(enum clock_event_mode mode, } } -static int exynos4_mct_tick_clear(struct mct_clock_event_device *mevt) +static void exynos4_mct_tick_clear(struct mct_clock_event_device *mevt) { struct clock_event_device *evt = &mevt->evt; @@ -426,12 +426,8 @@ static int exynos4_mct_tick_clear(struct mct_clock_event_device *mevt) exynos4_mct_tick_stop(mevt); /* Clear the MCT tick interrupt */ - if (readl_relaxed(reg_base + mevt->base + MCT_L_INT_CSTAT_OFFSET) & 1) { + if (readl_relaxed(reg_base + mevt->base + MCT_L_INT_CSTAT_OFFSET) & 1) exynos4_mct_write(0x1, mevt->base + MCT_L_INT_CSTAT_OFFSET); - return 1; - } else { - return 0; - } } static irqreturn_t exynos4_mct_tick_isr(int irq, void *dev_id) From 6c10bf637207818f3caaea965d4b72007c36fc92 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 30 Apr 2015 13:42:52 +0900 Subject: [PATCH 076/106] clocksource/drivers/exynos_mct: Staticize struct clocksource The struct clocksource 'mct_frc' is not exported and used outside so make it static. Signed-off-by: Krzysztof Kozlowski Signed-off-by: Daniel Lezcano --- drivers/clocksource/exynos_mct.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/clocksource/exynos_mct.c b/drivers/clocksource/exynos_mct.c index 87c2e558c465..8b2a9fc66dcc 100644 --- a/drivers/clocksource/exynos_mct.c +++ b/drivers/clocksource/exynos_mct.c @@ -209,7 +209,7 @@ static void exynos4_frc_resume(struct clocksource *cs) exynos4_mct_frc_start(); } -struct clocksource mct_frc = { +static struct clocksource mct_frc = { .name = "mct-frc", .rating = 400, .read = exynos4_frc_read, From 65ec7b2718719cef0c9e9db853a014db5a102def Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 30 Apr 2015 13:42:53 +0900 Subject: [PATCH 077/106] clocksource/drivers/exynos_mct: Remove old platform mct_init() Since commit 228e3023eb04 ("Merge tag 'mct-exynos-for-v3.10' of ...") the mct_init() was superseded by mct_init_dt() and is not referenced anywhere. Remove it. 
Signed-off-by: Krzysztof Kozlowski Signed-off-by: Daniel Lezcano --- drivers/clocksource/exynos_mct.c | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/drivers/clocksource/exynos_mct.c b/drivers/clocksource/exynos_mct.c index 8b2a9fc66dcc..935b05936dbd 100644 --- a/drivers/clocksource/exynos_mct.c +++ b/drivers/clocksource/exynos_mct.c @@ -560,18 +560,6 @@ out_irq: free_percpu_irq(mct_irqs[MCT_L0_IRQ], &percpu_mct_tick); } -void __init mct_init(void __iomem *base, int irq_g0, int irq_l0, int irq_l1) -{ - mct_irqs[MCT_G0_IRQ] = irq_g0; - mct_irqs[MCT_L0_IRQ] = irq_l0; - mct_irqs[MCT_L1_IRQ] = irq_l1; - mct_int_type = MCT_INT_SPI; - - exynos4_timer_resources(NULL, base); - exynos4_clocksource_init(); - exynos4_clockevent_init(); -} - static void __init mct_init_dt(struct device_node *np, unsigned int int_type) { u32 nr_irqs, i; From 050dd3222ba9fe8cadcb5146de052139ffb30b61 Mon Sep 17 00:00:00 2001 From: Joachim Eastwood Date: Tue, 12 May 2015 00:00:48 +0200 Subject: [PATCH 078/106] clocksource/drivers/lpc32xx: Add the lpc32xx timer driver Add support for using the NXP LPC timer as clocksource and clock event. These timers are present on many NXP devices including LPC32xx, LPC17xx, LPC18xx and LPC43xx. The timer has a 32-bit timer counter register with a programmable 32-bit prescaler. It supports up to 4 compare match values with interrupt generation and reset/stop timer counter action. Signed-off-by: Joachim Eastwood Signed-off-by: Daniel Lezcano Reviewed-by: Ezequiel Garcia Acked-by: Arnd Bergmann --- drivers/clocksource/Kconfig | 5 + drivers/clocksource/Makefile | 1 + drivers/clocksource/time-lpc32xx.c | 272 +++++++++++++++++++++++++++++ 3 files changed, 278 insertions(+) create mode 100644 drivers/clocksource/time-lpc32xx.c diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig index 51d7865fdddb..47f9f22cd516 100644 --- a/drivers/clocksource/Kconfig +++ b/drivers/clocksource/Kconfig @@ -106,6 +106,11 @@ config CLKSRC_EFM32 Support to use the timers of EFM32 SoCs as clock source and clock event device. +config CLKSRC_LPC32XX + bool + select CLKSRC_MMIO + select CLKSRC_OF + config ARM_ARCH_TIMER bool select CLKSRC_OF if OF diff --git a/drivers/clocksource/Makefile b/drivers/clocksource/Makefile index 5b85f6adb258..5928e35cb64d 100644 --- a/drivers/clocksource/Makefile +++ b/drivers/clocksource/Makefile @@ -37,6 +37,7 @@ obj-$(CONFIG_ARCH_BCM_MOBILE) += bcm_kona_timer.o obj-$(CONFIG_CADENCE_TTC_TIMER) += cadence_ttc_timer.o obj-$(CONFIG_CLKSRC_EFM32) += time-efm32.o obj-$(CONFIG_CLKSRC_EXYNOS_MCT) += exynos_mct.o +obj-$(CONFIG_CLKSRC_LPC32XX) += time-lpc32xx.o obj-$(CONFIG_CLKSRC_SAMSUNG_PWM) += samsung_pwm_timer.o obj-$(CONFIG_FSL_FTM_TIMER) += fsl_ftm_timer.o obj-$(CONFIG_VF_PIT_TIMER) += vf_pit_timer.o diff --git a/drivers/clocksource/time-lpc32xx.c b/drivers/clocksource/time-lpc32xx.c new file mode 100644 index 000000000000..a1c06a2bc77c --- /dev/null +++ b/drivers/clocksource/time-lpc32xx.c @@ -0,0 +1,272 @@ +/* + * Clocksource driver for NXP LPC32xx/18xx/43xx timer + * + * Copyright (C) 2015 Joachim Eastwood + * + * Based on: + * time-efm32 Copyright (C) 2013 Pengutronix + * mach-lpc32xx/timer.c Copyright (C) 2009 - 2010 NXP Semiconductors + * + * This file is licensed under the terms of the GNU General Public + * License version 2. This program is licensed "as is" without any + * warranty of any kind, whether express or implied. 
+ * + */ + +#define pr_fmt(fmt) "%s: " fmt, __func__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define LPC32XX_TIMER_IR 0x000 +#define LPC32XX_TIMER_IR_MR0INT BIT(0) +#define LPC32XX_TIMER_TCR 0x004 +#define LPC32XX_TIMER_TCR_CEN BIT(0) +#define LPC32XX_TIMER_TCR_CRST BIT(1) +#define LPC32XX_TIMER_TC 0x008 +#define LPC32XX_TIMER_PR 0x00c +#define LPC32XX_TIMER_MCR 0x014 +#define LPC32XX_TIMER_MCR_MR0I BIT(0) +#define LPC32XX_TIMER_MCR_MR0R BIT(1) +#define LPC32XX_TIMER_MCR_MR0S BIT(2) +#define LPC32XX_TIMER_MR0 0x018 +#define LPC32XX_TIMER_CTCR 0x070 + +struct lpc32xx_clock_event_ddata { + struct clock_event_device evtdev; + void __iomem *base; +}; + +/* Needed for the sched clock */ +static void __iomem *clocksource_timer_counter; + +static u64 notrace lpc32xx_read_sched_clock(void) +{ + return readl(clocksource_timer_counter); +} + +static int lpc32xx_clkevt_next_event(unsigned long delta, + struct clock_event_device *evtdev) +{ + struct lpc32xx_clock_event_ddata *ddata = + container_of(evtdev, struct lpc32xx_clock_event_ddata, evtdev); + + /* + * Place timer in reset and program the delta in the prescale + * register (PR). When the prescale counter matches the value + * in PR the counter register is incremented and the compare + * match will trigger. After setup the timer is released from + * reset and enabled. + */ + writel_relaxed(LPC32XX_TIMER_TCR_CRST, ddata->base + LPC32XX_TIMER_TCR); + writel_relaxed(delta, ddata->base + LPC32XX_TIMER_PR); + writel_relaxed(LPC32XX_TIMER_TCR_CEN, ddata->base + LPC32XX_TIMER_TCR); + + return 0; +} + +static int lpc32xx_clkevt_shutdown(struct clock_event_device *evtdev) +{ + struct lpc32xx_clock_event_ddata *ddata = + container_of(evtdev, struct lpc32xx_clock_event_ddata, evtdev); + + /* Disable the timer */ + writel_relaxed(0, ddata->base + LPC32XX_TIMER_TCR); + + return 0; +} + +static int lpc32xx_clkevt_oneshot(struct clock_event_device *evtdev) +{ + /* + * When using oneshot, we must also disable the timer + * to wait for the first call to set_next_event(). + */ + return lpc32xx_clkevt_shutdown(evtdev); +} + +static irqreturn_t lpc32xx_clock_event_handler(int irq, void *dev_id) +{ + struct lpc32xx_clock_event_ddata *ddata = dev_id; + + /* Clear match on channel 0 */ + writel_relaxed(LPC32XX_TIMER_IR_MR0INT, ddata->base + LPC32XX_TIMER_IR); + + ddata->evtdev.event_handler(&ddata->evtdev); + + return IRQ_HANDLED; +} + +static struct lpc32xx_clock_event_ddata lpc32xx_clk_event_ddata = { + .evtdev = { + .name = "lpc3220 clockevent", + .features = CLOCK_EVT_FEAT_ONESHOT, + .rating = 300, + .set_next_event = lpc32xx_clkevt_next_event, + .set_state_shutdown = lpc32xx_clkevt_shutdown, + .set_state_oneshot = lpc32xx_clkevt_oneshot, + }, +}; + +static int __init lpc32xx_clocksource_init(struct device_node *np) +{ + void __iomem *base; + unsigned long rate; + struct clk *clk; + int ret; + + clk = of_clk_get_by_name(np, "timerclk"); + if (IS_ERR(clk)) { + pr_err("clock get failed (%lu)\n", PTR_ERR(clk)); + return PTR_ERR(clk); + } + + ret = clk_prepare_enable(clk); + if (ret) { + pr_err("clock enable failed (%d)\n", ret); + goto err_clk_enable; + } + + base = of_iomap(np, 0); + if (!base) { + pr_err("unable to map registers\n"); + ret = -EADDRNOTAVAIL; + goto err_iomap; + } + + /* + * Disable and reset timer then set it to free running timer + * mode (CTCR) with no prescaler (PR) or match operations (MCR). + * After setup the timer is released from reset and enabled. 
+ */ + writel_relaxed(LPC32XX_TIMER_TCR_CRST, base + LPC32XX_TIMER_TCR); + writel_relaxed(0, base + LPC32XX_TIMER_PR); + writel_relaxed(0, base + LPC32XX_TIMER_MCR); + writel_relaxed(0, base + LPC32XX_TIMER_CTCR); + writel_relaxed(LPC32XX_TIMER_TCR_CEN, base + LPC32XX_TIMER_TCR); + + rate = clk_get_rate(clk); + ret = clocksource_mmio_init(base + LPC32XX_TIMER_TC, "lpc3220 timer", + rate, 300, 32, clocksource_mmio_readl_up); + if (ret) { + pr_err("failed to init clocksource (%d)\n", ret); + goto err_clocksource_init; + } + + clocksource_timer_counter = base + LPC32XX_TIMER_TC; + sched_clock_register(lpc32xx_read_sched_clock, 32, rate); + + return 0; + +err_clocksource_init: + iounmap(base); +err_iomap: + clk_disable_unprepare(clk); +err_clk_enable: + clk_put(clk); + return ret; +} + +static int __init lpc32xx_clockevent_init(struct device_node *np) +{ + void __iomem *base; + unsigned long rate; + struct clk *clk; + int ret, irq; + + clk = of_clk_get_by_name(np, "timerclk"); + if (IS_ERR(clk)) { + pr_err("clock get failed (%lu)\n", PTR_ERR(clk)); + return PTR_ERR(clk); + } + + ret = clk_prepare_enable(clk); + if (ret) { + pr_err("clock enable failed (%d)\n", ret); + goto err_clk_enable; + } + + base = of_iomap(np, 0); + if (!base) { + pr_err("unable to map registers\n"); + ret = -EADDRNOTAVAIL; + goto err_iomap; + } + + irq = irq_of_parse_and_map(np, 0); + if (!irq) { + pr_err("get irq failed\n"); + ret = -ENOENT; + goto err_irq; + } + + /* + * Disable timer and clear any pending interrupt (IR) on match + * channel 0 (MR0). Configure a compare match value of 1 on MR0 + * and enable interrupt, reset on match and stop on match (MCR). + */ + writel_relaxed(0, base + LPC32XX_TIMER_TCR); + writel_relaxed(0, base + LPC32XX_TIMER_CTCR); + writel_relaxed(LPC32XX_TIMER_IR_MR0INT, base + LPC32XX_TIMER_IR); + writel_relaxed(1, base + LPC32XX_TIMER_MR0); + writel_relaxed(LPC32XX_TIMER_MCR_MR0I | LPC32XX_TIMER_MCR_MR0R | + LPC32XX_TIMER_MCR_MR0S, base + LPC32XX_TIMER_MCR); + + rate = clk_get_rate(clk); + lpc32xx_clk_event_ddata.base = base; + clockevents_config_and_register(&lpc32xx_clk_event_ddata.evtdev, + rate, 1, -1); + + ret = request_irq(irq, lpc32xx_clock_event_handler, + IRQF_TIMER | IRQF_IRQPOLL, "lpc3220 clockevent", + &lpc32xx_clk_event_ddata); + if (ret) { + pr_err("request irq failed\n"); + goto err_irq; + } + + return 0; + +err_irq: + iounmap(base); +err_iomap: + clk_disable_unprepare(clk); +err_clk_enable: + clk_put(clk); + return ret; +} + +/* + * This function asserts that we have exactly one clocksource and one + * clock_event_device in the end. + */ +static void __init lpc32xx_timer_init(struct device_node *np) +{ + static int has_clocksource, has_clockevent; + int ret; + + if (!has_clocksource) { + ret = lpc32xx_clocksource_init(np); + if (!ret) { + has_clocksource = 1; + return; + } + } + + if (!has_clockevent) { + ret = lpc32xx_clockevent_init(np); + if (!ret) { + has_clockevent = 1; + return; + } + } +} +CLOCKSOURCE_OF_DECLARE(lpc32xx_timer, "nxp,lpc3220-timer", lpc32xx_timer_init); From 5fc9b49deadc16d088e8fc6ca4fee85644380497 Mon Sep 17 00:00:00 2001 From: Joachim Eastwood Date: Tue, 12 May 2015 00:00:49 +0200 Subject: [PATCH 079/106] doc: dt: Add documentation for lpc3220-timer Add DT bindings documentation for lpc3220-timer. This timer is used as clocksource on many NXP platforms. 
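As a worked example of the prescaler-based one-shot programming implemented by the driver this binding describes (the clock rate is an assumption for illustration only): with a 12 MHz timerclk, a 1 ms event is delta = 12000 ticks, and since MR0 stays fixed at 1, the delta goes into the prescale register:

/* Sketch mirroring lpc32xx_clkevt_next_event() for delta = 12000
 * (1 ms at an assumed 12 MHz timerclk); MR0 was set to 1 at init
 * time and TC advances once every PR + 1 timer clocks. */
writel_relaxed(LPC32XX_TIMER_TCR_CRST, base + LPC32XX_TIMER_TCR); /* hold in reset */
writel_relaxed(12000, base + LPC32XX_TIMER_PR);                   /* prescale = delta */
writel_relaxed(LPC32XX_TIMER_TCR_CEN, base + LPC32XX_TIMER_TCR);  /* run; MR0 match fires ~1 ms later */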
Signed-off-by: Joachim Eastwood Signed-off-by: Daniel Lezcano Acked-by: Arnd Bergmann --- .../bindings/timer/nxp,lpc3220-timer.txt | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 Documentation/devicetree/bindings/timer/nxp,lpc3220-timer.txt diff --git a/Documentation/devicetree/bindings/timer/nxp,lpc3220-timer.txt b/Documentation/devicetree/bindings/timer/nxp,lpc3220-timer.txt new file mode 100644 index 000000000000..51b05a0e70d1 --- /dev/null +++ b/Documentation/devicetree/bindings/timer/nxp,lpc3220-timer.txt @@ -0,0 +1,26 @@ +* NXP LPC3220 timer + +The NXP LPC3220 timer is used on a wide range of NXP SoCs. This +includes LPC32xx, LPC178x, LPC18xx and LPC43xx parts. + +Required properties: +- compatible: + Should be "nxp,lpc3220-timer". +- reg: + Address and length of the register set. +- interrupts: + Reference to the timer interrupt. +- clocks: + Should contain a reference to the timer clock. +- clock-names: + Should contain "timerclk". + +Example: + +timer1: timer@40085000 { + compatible = "nxp,lpc3220-timer"; + reg = <0x40085000 0x1000>; + interrupts = <13>; + clocks = <&ccu1 CLK_CPU_TIMER1>; + clock-names = "timerclk"; +}; From 571fc8e836a1537b87804358c7baa9882d25b754 Mon Sep 17 00:00:00 2001 From: Maxime Coquelin Date: Sat, 9 May 2015 09:53:45 +0200 Subject: [PATCH 080/106] dt-bindings: Document the ARM System timer bindings This adds documentation of device tree bindings for the ARM System timer. Tested-by: Chanwoo Choi Acked-by: Rob Herring Signed-off-by: Maxime Coquelin Signed-off-by: Daniel Lezcano --- .../bindings/arm/armv7m_systick.txt | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 Documentation/devicetree/bindings/arm/armv7m_systick.txt diff --git a/Documentation/devicetree/bindings/arm/armv7m_systick.txt b/Documentation/devicetree/bindings/arm/armv7m_systick.txt new file mode 100644 index 000000000000..7cf4a24601eb --- /dev/null +++ b/Documentation/devicetree/bindings/arm/armv7m_systick.txt @@ -0,0 +1,26 @@ +* ARMv7M System Timer + +ARMv7-M includes a system timer, known as SysTick. The current driver only +implements the clocksource feature. + +Required properties: +- compatible : Should be "arm,armv7m-systick" +- reg : The address range of the timer + +Required clocking property, must be one of: +- clocks : The input clock of the timer +- clock-frequency : The input clock rate of the ARM SysTick, in Hz + +Examples: + +systick: timer@e000e010 { + compatible = "arm,armv7m-systick"; + reg = <0xe000e010 0x10>; + clocks = <&clk_systick>; +}; + +systick: timer@e000e010 { + compatible = "arm,armv7m-systick"; + reg = <0xe000e010 0x10>; + clock-frequency = <90000000>; +}; From 4958ebb3d027886c46b936453745dba59b09c578 Mon Sep 17 00:00:00 2001 From: Maxime Coquelin Date: Sat, 9 May 2015 09:53:46 +0200 Subject: [PATCH 081/106] clocksource/drivers/armv7m_systick: Add ARM System timer driver This patch adds clocksource support for ARMv7-M's System timer, also known as SysTick.
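SysTick is a 24-bit down-counter reloaded from SYST_RVR, which is why the driver below registers it with clocksource_mmio_readl_down and a 24-bit mask. As a back-of-envelope figure (not taken from the driver itself): at the 90 MHz rate used in the binding example, the counter wraps every 2^24 / 90000000 Hz, roughly 186 ms, and the timekeeping core bounds its idle deferment accordingly from the 24-bit mask.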
Tested-by: Chanwoo Choi Acked-by: Daniel Lezcano Signed-off-by: Maxime Coquelin Signed-off-by: Daniel Lezcano --- drivers/clocksource/Kconfig | 7 +++ drivers/clocksource/Makefile | 1 + drivers/clocksource/armv7m_systick.c | 79 ++++++++++++++++++++++++++++ 3 files changed, 87 insertions(+) create mode 100644 drivers/clocksource/armv7m_systick.c diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig index 47f9f22cd516..2c67ca94ed3b 100644 --- a/drivers/clocksource/Kconfig +++ b/drivers/clocksource/Kconfig @@ -144,6 +144,13 @@ config CLKSRC_ARM_GLOBAL_TIMER_SCHED_CLOCK help Use ARM global timer clock source as sched_clock +config ARMV7M_SYSTICK + bool + select CLKSRC_OF if OF + select CLKSRC_MMIO + help + This options enables support for the ARMv7M system timer unit + config ATMEL_PIT select CLKSRC_OF if OF def_bool SOC_AT91SAM9 || SOC_SAMA5 diff --git a/drivers/clocksource/Makefile b/drivers/clocksource/Makefile index 5928e35cb64d..ebfb5dc1995c 100644 --- a/drivers/clocksource/Makefile +++ b/drivers/clocksource/Makefile @@ -46,6 +46,7 @@ obj-$(CONFIG_MTK_TIMER) += mtk_timer.o obj-$(CONFIG_ARM_ARCH_TIMER) += arm_arch_timer.o obj-$(CONFIG_ARM_GLOBAL_TIMER) += arm_global_timer.o +obj-$(CONFIG_ARMV7M_SYSTICK) += armv7m_systick.o obj-$(CONFIG_CLKSRC_METAG_GENERIC) += metag_generic.o obj-$(CONFIG_ARCH_HAS_TICK_BROADCAST) += dummy_timer.o obj-$(CONFIG_ARCH_KEYSTONE) += timer-keystone.o diff --git a/drivers/clocksource/armv7m_systick.c b/drivers/clocksource/armv7m_systick.c new file mode 100644 index 000000000000..addfd2c64f54 --- /dev/null +++ b/drivers/clocksource/armv7m_systick.c @@ -0,0 +1,79 @@ +/* + * Copyright (C) Maxime Coquelin 2015 + * Author: Maxime Coquelin + * License terms: GNU General Public License (GPL), version 2 + */ + +#include +#include +#include +#include +#include +#include +#include + +#define SYST_CSR 0x00 +#define SYST_RVR 0x04 +#define SYST_CVR 0x08 +#define SYST_CALIB 0x0c + +#define SYST_CSR_ENABLE BIT(0) + +#define SYSTICK_LOAD_RELOAD_MASK 0x00FFFFFF + +static void __init system_timer_of_register(struct device_node *np) +{ + struct clk *clk = NULL; + void __iomem *base; + u32 rate; + int ret; + + base = of_iomap(np, 0); + if (!base) { + pr_warn("system-timer: invalid base address\n"); + return; + } + + ret = of_property_read_u32(np, "clock-frequency", &rate); + if (ret) { + clk = of_clk_get(np, 0); + if (IS_ERR(clk)) + goto out_unmap; + + ret = clk_prepare_enable(clk); + if (ret) + goto out_clk_put; + + rate = clk_get_rate(clk); + if (!rate) + goto out_clk_disable; + } + + writel_relaxed(SYSTICK_LOAD_RELOAD_MASK, base + SYST_RVR); + writel_relaxed(SYST_CSR_ENABLE, base + SYST_CSR); + + ret = clocksource_mmio_init(base + SYST_CVR, "arm_system_timer", rate, + 200, 24, clocksource_mmio_readl_down); + if (ret) { + pr_err("failed to init clocksource (%d)\n", ret); + if (clk) + goto out_clk_disable; + else + goto out_unmap; + } + + pr_info("ARM System timer initialized as clocksource\n"); + + return; + +out_clk_disable: + clk_disable_unprepare(clk); +out_clk_put: + clk_put(clk); +out_unmap: + iounmap(base); + pr_warn("ARM System timer register failed (%d)\n", ret); +} + +CLOCKSOURCE_OF_DECLARE(arm_systick, "arm,armv7m-systick", + system_timer_of_register); From 4853914ffcea925ded0ac2be205f55688f302caa Mon Sep 17 00:00:00 2001 From: Maxime Coquelin Date: Fri, 22 May 2015 23:03:32 +0200 Subject: [PATCH 082/106] dt-bindings: Document the STM32 timer bindings This adds documentation of device tree bindings for the STM32 timer. 
Tested-by: Chanwoo Choi Acked-by: Rob Herring Signed-off-by: Maxime Coquelin Signed-off-by: Daniel Lezcano --- .../bindings/timer/st,stm32-timer.txt | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 Documentation/devicetree/bindings/timer/st,stm32-timer.txt diff --git a/Documentation/devicetree/bindings/timer/st,stm32-timer.txt b/Documentation/devicetree/bindings/timer/st,stm32-timer.txt new file mode 100644 index 000000000000..8ef28e70d6e8 --- /dev/null +++ b/Documentation/devicetree/bindings/timer/st,stm32-timer.txt @@ -0,0 +1,22 @@ +* STMicroelectronics STM32 timer + +The STM32 MCU family has several general-purpose 16-bit and 32-bit timers. + +Required properties: +- compatible : Should be "st,stm32-timer" +- reg : Address and length of the register set +- clocks : Reference to the timer input clock +- interrupts : Reference to the timer interrupt + +Optional properties: +- resets: Reference to a reset controller asserting the timer + +Example: + +timer5: timer@40000c00 { + compatible = "st,stm32-timer"; + reg = <0x40000c00 0x400>; + interrupts = <50>; + resets = <&rrc 259>; + clocks = <&clk_pmtr1>; +}; From e37e45934afed32f00e16db745c410d5c675456d Mon Sep 17 00:00:00 2001 From: Maxime Coquelin Date: Fri, 22 May 2015 23:03:33 +0200 Subject: [PATCH 083/106] clockevents/drivers: Add STM32 Timer driver STM32 MCUs feature 16-bit and 32-bit general-purpose timers with prescalers. The driver detects whether the timer is 16 or 32 bits, and applies a 1024 prescaler value if it is 16 bits. Reviewed-by: Linus Walleij Tested-by: Chanwoo Choi Signed-off-by: Maxime Coquelin Signed-off-by: Daniel Lezcano --- drivers/clocksource/Kconfig | 5 + drivers/clocksource/Makefile | 1 + drivers/clocksource/timer-stm32.c | 184 ++++++++++++++++++++++++++ 3 files changed, 190 insertions(+) create mode 100644 drivers/clocksource/timer-stm32.c diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig index 2c67ca94ed3b..bec25b39ecf2 100644 --- a/drivers/clocksource/Kconfig +++ b/drivers/clocksource/Kconfig @@ -111,6 +111,11 @@ config CLKSRC_LPC32XX select CLKSRC_MMIO select CLKSRC_OF +config CLKSRC_STM32 + bool "Clocksource for STM32 SoCs" if COMPILE_TEST + depends on OF + select CLKSRC_MMIO + config ARM_ARCH_TIMER bool select CLKSRC_OF if OF diff --git a/drivers/clocksource/Makefile b/drivers/clocksource/Makefile index ebfb5dc1995c..1831a588b988 100644 --- a/drivers/clocksource/Makefile +++ b/drivers/clocksource/Makefile @@ -36,6 +36,7 @@ obj-$(CONFIG_ARCH_NSPIRE) += zevio-timer.o obj-$(CONFIG_ARCH_BCM_MOBILE) += bcm_kona_timer.o obj-$(CONFIG_CADENCE_TTC_TIMER) += cadence_ttc_timer.o obj-$(CONFIG_CLKSRC_EFM32) += time-efm32.o +obj-$(CONFIG_CLKSRC_STM32) += timer-stm32.o obj-$(CONFIG_CLKSRC_EXYNOS_MCT) += exynos_mct.o obj-$(CONFIG_CLKSRC_LPC32XX) += time-lpc32xx.o obj-$(CONFIG_CLKSRC_SAMSUNG_PWM) += samsung_pwm_timer.o diff --git a/drivers/clocksource/timer-stm32.c b/drivers/clocksource/timer-stm32.c new file mode 100644 index 000000000000..fad2e2e7a0b8 --- /dev/null +++ b/drivers/clocksource/timer-stm32.c @@ -0,0 +1,184 @@ +/* + * Copyright (C) Maxime Coquelin 2015 + * Author: Maxime Coquelin + * License terms: GNU General Public License (GPL), version 2 + * + * Inspired by time-efm32.c from Uwe Kleine-Koenig + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define TIM_CR1 0x00 +#define TIM_DIER 0x0c +#define TIM_SR 0x10 +#define TIM_EGR 0x14 +#define TIM_PSC 0x28 +#define TIM_ARR 0x2c + +#define TIM_CR1_CEN BIT(0)
+#define TIM_CR1_OPM BIT(3) +#define TIM_CR1_ARPE BIT(7) + +#define TIM_DIER_UIE BIT(0) + +#define TIM_SR_UIF BIT(0) + +#define TIM_EGR_UG BIT(0) + +struct stm32_clock_event_ddata { + struct clock_event_device evtdev; + unsigned periodic_top; + void __iomem *base; +}; + +static void stm32_clock_event_set_mode(enum clock_event_mode mode, + struct clock_event_device *evtdev) +{ + struct stm32_clock_event_ddata *data = + container_of(evtdev, struct stm32_clock_event_ddata, evtdev); + void *base = data->base; + + switch (mode) { + case CLOCK_EVT_MODE_PERIODIC: + writel_relaxed(data->periodic_top, base + TIM_ARR); + writel_relaxed(TIM_CR1_ARPE | TIM_CR1_CEN, base + TIM_CR1); + break; + + case CLOCK_EVT_MODE_ONESHOT: + default: + writel_relaxed(0, base + TIM_CR1); + break; + } +} + +static int stm32_clock_event_set_next_event(unsigned long evt, + struct clock_event_device *evtdev) +{ + struct stm32_clock_event_ddata *data = + container_of(evtdev, struct stm32_clock_event_ddata, evtdev); + + writel_relaxed(evt, data->base + TIM_ARR); + writel_relaxed(TIM_CR1_ARPE | TIM_CR1_OPM | TIM_CR1_CEN, + data->base + TIM_CR1); + + return 0; +} + +static irqreturn_t stm32_clock_event_handler(int irq, void *dev_id) +{ + struct stm32_clock_event_ddata *data = dev_id; + + writel_relaxed(0, data->base + TIM_SR); + + data->evtdev.event_handler(&data->evtdev); + + return IRQ_HANDLED; +} + +static struct stm32_clock_event_ddata clock_event_ddata = { + .evtdev = { + .name = "stm32 clockevent", + .features = CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_PERIODIC, + .set_mode = stm32_clock_event_set_mode, + .set_next_event = stm32_clock_event_set_next_event, + .rating = 200, + }, +}; + +static void __init stm32_clockevent_init(struct device_node *np) +{ + struct stm32_clock_event_ddata *data = &clock_event_ddata; + struct clk *clk; + struct reset_control *rstc; + unsigned long rate, max_delta; + int irq, ret, bits, prescaler = 1; + + clk = of_clk_get(np, 0); + if (IS_ERR(clk)) { + ret = PTR_ERR(clk); + pr_err("failed to get clock for clockevent (%d)\n", ret); + goto err_clk_get; + } + + ret = clk_prepare_enable(clk); + if (ret) { + pr_err("failed to enable timer clock for clockevent (%d)\n", + ret); + goto err_clk_enable; + } + + rate = clk_get_rate(clk); + + rstc = of_reset_control_get(np, NULL); + if (!IS_ERR(rstc)) { + reset_control_assert(rstc); + reset_control_deassert(rstc); + } + + data->base = of_iomap(np, 0); + if (!data->base) { + pr_err("failed to map registers for clockevent\n"); + goto err_iomap; + } + + irq = irq_of_parse_and_map(np, 0); + if (!irq) { + pr_err("%s: failed to get irq.\n", np->full_name); + goto err_get_irq; + } + + /* Detect whether the timer is 16 or 32 bits */ + writel_relaxed(~0UL, data->base + TIM_ARR); + max_delta = readl_relaxed(data->base + TIM_ARR); + if (max_delta == ~0UL) { + prescaler = 1; + bits = 32; + } else { + prescaler = 1024; + bits = 16; + } + writel_relaxed(0, data->base + TIM_ARR); + + writel_relaxed(prescaler - 1, data->base + TIM_PSC); + writel_relaxed(TIM_EGR_UG, data->base + TIM_EGR); + writel_relaxed(TIM_DIER_UIE, data->base + TIM_DIER); + writel_relaxed(0, data->base + TIM_SR); + + data->periodic_top = DIV_ROUND_CLOSEST(rate, prescaler * HZ); + + clockevents_config_and_register(&data->evtdev, + DIV_ROUND_CLOSEST(rate, prescaler), + 0x1, max_delta); + + ret = request_irq(irq, stm32_clock_event_handler, IRQF_TIMER, + "stm32 clockevent", data); + if (ret) { + pr_err("%s: failed to request irq.\n", np->full_name); + goto err_get_irq; + } + + pr_info("%s: STM32 clockevent 
 driver initialized (%d bits)\n", + np->full_name, bits); + + return; + +err_get_irq: + iounmap(data->base); +err_iomap: + clk_disable_unprepare(clk); +err_clk_enable: + clk_put(clk); +err_clk_get: + return; +} + +CLOCKSOURCE_OF_DECLARE(stm32, "st,stm32-timer", stm32_clockevent_init); From d4688bdc6335e9faaf3f0173f96932cd520cee1a Mon Sep 17 00:00:00 2001 From: Maxime Coquelin Date: Thu, 28 May 2015 07:05:53 +0200 Subject: [PATCH 084/106] clockevents/drivers/timer-stm32: Fix build warning spotted by kbuild test robot This patch fixes the warning below, spotted by the kbuild test robot when building with ARCH=powerpc: drivers/clocksource/timer-stm32.c: In function 'stm32_clockevent_init': >> drivers/clocksource/timer-stm32.c:140:9: warning: large integer implicitly truncated to unsigned type [-Woverflow] writel_relaxed(~0UL, data->base + TIM_ARR); The fix consists of using ~0U instead of ~0UL. Reported-by: kbuild test robot Signed-off-by: Maxime Coquelin Signed-off-by: Daniel Lezcano --- drivers/clocksource/timer-stm32.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/clocksource/timer-stm32.c b/drivers/clocksource/timer-stm32.c index fad2e2e7a0b8..a97e8b50701c 100644 --- a/drivers/clocksource/timer-stm32.c +++ b/drivers/clocksource/timer-stm32.c @@ -137,9 +137,9 @@ static void __init stm32_clockevent_init(struct device_node *np) } /* Detect whether the timer is 16 or 32 bits */ - writel_relaxed(~0UL, data->base + TIM_ARR); + writel_relaxed(~0U, data->base + TIM_ARR); max_delta = readl_relaxed(data->base + TIM_ARR); - if (max_delta == ~0UL) { + if (max_delta == ~0U) { prescaler = 1; bits = 32; } else { From 3434d23b694e5cb6e44e966914563406c31c4053 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 21 May 2015 13:33:45 +0530 Subject: [PATCH 085/106] clockevents: Add helpers to check the state of a clockevent device Some clockevent drivers, once migrated to use per-state callbacks, need to check the state of the clockevent device in their callbacks or interrupt handler. Add accessor functions clockevent_state_*() to get this information.
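As a usage illustration, the interrupt handler of a hypothetical driver might look like this (a sketch only; foo_timer_rearm() and the acknowledge step stand in for whatever the hardware needs):

static void foo_timer_rearm(void); /* hypothetical hardware re-arm helper */

static irqreturn_t foo_timer_interrupt(int irq, void *dev_id)
{
	struct clock_event_device *evt = dev_id;

	/* acknowledge the hardware interrupt here */

	/* Re-arm only while still periodic; in oneshot mode the core
	 * reprograms the timer through ->set_next_event(). */
	if (clockevent_state_periodic(evt))
		foo_timer_rearm();

	evt->event_handler(evt);
	return IRQ_HANDLED;
}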
Signed-off-by: Viresh Kumar Cc: linaro-kernel@lists.linaro.org Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/04a717d490335c688dd7af899fbcede97e1bb8ee.1432192527.git.viresh.kumar@linaro.org Signed-off-by: Thomas Gleixner --- include/linux/clockchips.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index 271fa4c8eb29..64214ad85af9 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -149,6 +149,32 @@ struct clock_event_device { struct module *owner; } ____cacheline_aligned; +/* Helpers to verify state of a clockevent device */ +static inline bool clockevent_state_detached(struct clock_event_device *dev) +{ + return dev->state == CLOCK_EVT_STATE_DETACHED; +} + +static inline bool clockevent_state_shutdown(struct clock_event_device *dev) +{ + return dev->state == CLOCK_EVT_STATE_SHUTDOWN; +} + +static inline bool clockevent_state_periodic(struct clock_event_device *dev) +{ + return dev->state == CLOCK_EVT_STATE_PERIODIC; +} + +static inline bool clockevent_state_oneshot(struct clock_event_device *dev) +{ + return dev->state == CLOCK_EVT_STATE_ONESHOT; +} + +static inline bool clockevent_state_oneshot_stopped(struct clock_event_device *dev) +{ + return dev->state == CLOCK_EVT_STATE_ONESHOT_STOPPED; +} + /* * Calculate a multiplication factor for scaled math, which is used to convert * nanoseconds based values to clock ticks: From 472c4a9437d3c6a0b1e59df7c5aa14075946aa70 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 21 May 2015 13:33:46 +0530 Subject: [PATCH 086/106] clockevents: Use helpers to check the state of a clockevent device Use accessor functions to check the state of clockevent devices in core code. Signed-off-by: Viresh Kumar Cc: linaro-kernel@lists.linaro.org Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/fa2b9869fd17f210eaa156ec2b594efd0230b6c7.1432192527.git.viresh.kumar@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/clockevents.c | 24 ++++++++++++------------ kernel/time/tick-broadcast.c | 6 +++--- kernel/time/tick-common.c | 2 +- kernel/time/tick-oneshot.c | 2 +- 4 files changed, 17 insertions(+), 17 deletions(-) diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 2a5c369e50ab..e568ec8c320b 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -136,7 +136,7 @@ static int __clockevents_set_state(struct clock_event_device *dev, case CLOCK_EVT_STATE_ONESHOT_STOPPED: /* Core internal bug */ - if (WARN_ONCE(dev->state != CLOCK_EVT_STATE_ONESHOT, + if (WARN_ONCE(!clockevent_state_oneshot(dev), "Current state: %d\n", dev->state)) return -EINVAL; @@ -170,7 +170,7 @@ void clockevents_set_state(struct clock_event_device *dev, * A nsec2cyc multiplicator of 0 is invalid and we'd crash * on it, so fix it up and emit a warning: */ - if (state == CLOCK_EVT_STATE_ONESHOT) { + if (clockevent_state_oneshot(dev)) { if (unlikely(!dev->mult)) { dev->mult = 1; WARN_ON(1); @@ -259,7 +259,7 @@ static int clockevents_program_min_delta(struct clock_event_device *dev) delta = dev->min_delta_ns; dev->next_event = ktime_add_ns(ktime_get(), delta); - if (dev->state == CLOCK_EVT_STATE_SHUTDOWN) + if (clockevent_state_shutdown(dev)) return 0; dev->retries++; @@ -296,7 +296,7 @@ static int clockevents_program_min_delta(struct clock_event_device *dev) delta = dev->min_delta_ns; dev->next_event = ktime_add_ns(ktime_get(), delta); - if (dev->state == CLOCK_EVT_STATE_SHUTDOWN) + if (clockevent_state_shutdown(dev)) return 0; dev->retries++; 
@@ -328,11 +328,11 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, dev->next_event = expires; - if (dev->state == CLOCK_EVT_STATE_SHUTDOWN) + if (clockevent_state_shutdown(dev)) return 0; /* We must be in ONESHOT state here */ - WARN_ONCE(dev->state != CLOCK_EVT_STATE_ONESHOT, "Current state: %d\n", + WARN_ONCE(!clockevent_state_oneshot(dev), "Current state: %d\n", dev->state); /* Shortcut for clockevent devices that can deal with ktime. */ @@ -377,7 +377,7 @@ static int clockevents_replace(struct clock_event_device *ced) struct clock_event_device *dev, *newdev = NULL; list_for_each_entry(dev, &clockevent_devices, list) { - if (dev == ced || dev->state != CLOCK_EVT_STATE_DETACHED) + if (dev == ced || !clockevent_state_detached(dev)) continue; if (!tick_check_replacement(newdev, dev)) @@ -403,7 +403,7 @@ static int clockevents_replace(struct clock_event_device *ced) static int __clockevents_try_unbind(struct clock_event_device *ced, int cpu) { /* Fast track. Device is unused */ - if (ced->state == CLOCK_EVT_STATE_DETACHED) { + if (clockevent_state_detached(ced)) { list_del_init(&ced->list); return 0; } @@ -561,10 +561,10 @@ int __clockevents_update_freq(struct clock_event_device *dev, u32 freq) { clockevents_config(dev, freq); - if (dev->state == CLOCK_EVT_STATE_ONESHOT) + if (clockevent_state_oneshot(dev)) return clockevents_program_event(dev, dev->next_event, false); - if (dev->state == CLOCK_EVT_STATE_PERIODIC) + if (clockevent_state_periodic(dev)) return __clockevents_set_state(dev, CLOCK_EVT_STATE_PERIODIC); return 0; @@ -625,7 +625,7 @@ void clockevents_exchange_device(struct clock_event_device *old, } if (new) { - BUG_ON(new->state != CLOCK_EVT_STATE_DETACHED); + BUG_ON(!clockevent_state_detached(new)); clockevents_shutdown(new); } } @@ -681,7 +681,7 @@ void tick_cleanup_dead_cpu(int cpu) if (cpumask_test_cpu(cpu, dev->cpumask) && cpumask_weight(dev->cpumask) == 1 && !tick_is_broadcast_device(dev)) { - BUG_ON(dev->state != CLOCK_EVT_STATE_DETACHED); + BUG_ON(!clockevent_state_detached(dev)); list_del(&dev->list); } } diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 12fcc55d607a..132f819fdcdf 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -303,7 +303,7 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev) raw_spin_lock(&tick_broadcast_lock); bc_local = tick_do_periodic_broadcast(); - if (dev->state == CLOCK_EVT_STATE_ONESHOT) { + if (clockevent_state_oneshot(dev)) { ktime_t next = ktime_add(dev->next_event, tick_period); clockevents_program_event(dev, next, true); @@ -528,7 +528,7 @@ static void tick_broadcast_set_affinity(struct clock_event_device *bc, static void tick_broadcast_set_event(struct clock_event_device *bc, int cpu, ktime_t expires) { - if (bc->state != CLOCK_EVT_STATE_ONESHOT) + if (!clockevent_state_oneshot(bc)) clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT); clockevents_program_event(bc, expires, 1); @@ -831,7 +831,7 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) /* Set it up only once ! 
*/ if (bc->event_handler != tick_handle_oneshot_broadcast) { - int was_periodic = bc->state == CLOCK_EVT_STATE_PERIODIC; + int was_periodic = clockevent_state_periodic(bc); bc->event_handler = tick_handle_oneshot_broadcast; diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index ea5f9eae8f74..cf881c62c3c5 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -112,7 +112,7 @@ void tick_handle_periodic(struct clock_event_device *dev) return; #endif - if (dev->state != CLOCK_EVT_STATE_ONESHOT) + if (!clockevent_state_oneshot(dev)) return; for (;;) { /* diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index f8de75715c2f..3f9715bec291 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -36,7 +36,7 @@ int tick_program_event(ktime_t expires, int force) return 0; } - if (unlikely(dev->state == CLOCK_EVT_STATE_ONESHOT_STOPPED)) { + if (unlikely(clockevent_state_oneshot_stopped(dev))) { /* * We need the clock event again, configure it in ONESHOT mode * before using it. From d7eb231c71420bc34ac3d35403115600f920cfc2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 2 Jun 2015 14:08:46 +0200 Subject: [PATCH 087/106] clockevents: Provide functions to set and get the state We want to rename dev->state, so provide proper get and set functions. Rename clockevents_set_state() to clockevents_switch_state() to avoid confusion. Signed-off-by: Thomas Gleixner Cc: Viresh Kumar Cc: Peter Zijlstra --- kernel/time/clockevents.c | 18 +++++++++--------- kernel/time/tick-broadcast.c | 12 ++++++------ kernel/time/tick-common.c | 4 ++-- kernel/time/tick-internal.h | 15 +++++++++++++-- kernel/time/tick-oneshot.c | 10 +++++----- 5 files changed, 35 insertions(+), 24 deletions(-) diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index e568ec8c320b..a45f90c4b2d5 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -94,8 +94,8 @@ u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt) } EXPORT_SYMBOL_GPL(clockevent_delta2ns); -static int __clockevents_set_state(struct clock_event_device *dev, - enum clock_event_state state) +static int __clockevents_switch_state(struct clock_event_device *dev, + enum clock_event_state state) { /* Transition with legacy set_mode() callback */ if (dev->set_mode) { @@ -151,17 +151,17 @@ static int __clockevents_set_state(struct clock_event_device *dev, } /** - * clockevents_set_state - set the operating state of a clock event device + * clockevents_switch_state - set the operating state of a clock event device * @dev: device to modify * @state: new state * * Must be called with interrupts disabled ! 
*/ -void clockevents_set_state(struct clock_event_device *dev, - enum clock_event_state state) +void clockevents_switch_state(struct clock_event_device *dev, + enum clock_event_state state) { if (dev->state != state) { - if (__clockevents_set_state(dev, state)) + if (__clockevents_switch_state(dev, state)) return; dev->state = state; @@ -185,7 +185,7 @@ void clockevents_set_state(struct clock_event_device *dev, */ void clockevents_shutdown(struct clock_event_device *dev) { - clockevents_set_state(dev, CLOCK_EVT_STATE_SHUTDOWN); + clockevents_switch_state(dev, CLOCK_EVT_STATE_SHUTDOWN); dev->next_event.tv64 = KTIME_MAX; } @@ -565,7 +565,7 @@ int __clockevents_update_freq(struct clock_event_device *dev, u32 freq) return clockevents_program_event(dev, dev->next_event, false); if (clockevent_state_periodic(dev)) - return __clockevents_set_state(dev, CLOCK_EVT_STATE_PERIODIC); + return __clockevents_switch_state(dev, CLOCK_EVT_STATE_PERIODIC); return 0; } @@ -619,7 +619,7 @@ void clockevents_exchange_device(struct clock_event_device *old, */ if (old) { module_put(old->owner); - clockevents_set_state(old, CLOCK_EVT_STATE_DETACHED); + clockevents_switch_state(old, CLOCK_EVT_STATE_DETACHED); list_del(&old->list); list_add(&old->list, &clockevents_released); } diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 132f819fdcdf..d39f32cdd1b5 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -529,7 +529,7 @@ static void tick_broadcast_set_event(struct clock_event_device *bc, int cpu, ktime_t expires) { if (!clockevent_state_oneshot(bc)) - clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT); + clockevents_switch_state(bc, CLOCK_EVT_STATE_ONESHOT); clockevents_program_event(bc, expires, 1); tick_broadcast_set_affinity(bc, cpumask_of(cpu)); @@ -537,7 +537,7 @@ static void tick_broadcast_set_event(struct clock_event_device *bc, int cpu, static void tick_resume_broadcast_oneshot(struct clock_event_device *bc) { - clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT); + clockevents_switch_state(bc, CLOCK_EVT_STATE_ONESHOT); } /* @@ -555,7 +555,7 @@ void tick_check_oneshot_broadcast_this_cpu(void) * switched over, leave the device alone. 
*/ if (td->mode == TICKDEV_MODE_ONESHOT) { - clockevents_set_state(td->evtdev, + clockevents_switch_state(td->evtdev, CLOCK_EVT_STATE_ONESHOT); } } @@ -659,7 +659,7 @@ static void broadcast_shutdown_local(struct clock_event_device *bc, if (dev->next_event.tv64 < bc->next_event.tv64) return; } - clockevents_set_state(dev, CLOCK_EVT_STATE_SHUTDOWN); + clockevents_switch_state(dev, CLOCK_EVT_STATE_SHUTDOWN); } /** @@ -729,7 +729,7 @@ int tick_broadcast_oneshot_control(enum tick_broadcast_state state) cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); } else { if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) { - clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT); + clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT); /* * The cpu which was handling the broadcast * timer marked this cpu in the broadcast @@ -847,7 +847,7 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) tick_broadcast_oneshot_mask, tmpmask); if (was_periodic && !cpumask_empty(tmpmask)) { - clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT); + clockevents_switch_state(bc, CLOCK_EVT_STATE_ONESHOT); tick_broadcast_init_next_event(tmpmask, tick_next_period); tick_broadcast_set_event(bc, cpu, tick_next_period); diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index cf881c62c3c5..311e2e133517 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -150,7 +150,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) && !tick_broadcast_oneshot_active()) { - clockevents_set_state(dev, CLOCK_EVT_STATE_PERIODIC); + clockevents_switch_state(dev, CLOCK_EVT_STATE_PERIODIC); } else { unsigned long seq; ktime_t next; @@ -160,7 +160,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) next = tick_next_period; } while (read_seqretry(&jiffies_lock, seq)); - clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT); + clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT); for (;;) { if (!clockevents_program_event(dev, next, false)) diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 65273f0a11ed..4461de9bb4b8 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -36,11 +36,22 @@ static inline int tick_device_is_functional(struct clock_event_device *dev) return !(dev->features & CLOCK_EVT_FEAT_DUMMY); } +static inline enum clock_event_state clockevent_get_state(struct clock_event_device *dev) +{ + return dev->state; +} + +static inline void clockevent_set_state(struct clock_event_device *dev, + enum clock_event_state state) +{ + dev->state = state; +} + extern void clockevents_shutdown(struct clock_event_device *dev); extern void clockevents_exchange_device(struct clock_event_device *old, struct clock_event_device *new); -extern void clockevents_set_state(struct clock_event_device *dev, - enum clock_event_state state); +extern void clockevents_switch_state(struct clock_event_device *dev, + enum clock_event_state state); extern int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, bool force); extern void clockevents_handle_noop(struct clock_event_device *dev); diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 3f9715bec291..b51344652330 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -32,7 +32,7 @@ int tick_program_event(ktime_t expires, int force) /* * We don't need the clock event device any more, stop it. 
*/ - clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT_STOPPED); + clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT_STOPPED); return 0; } @@ -41,7 +41,7 @@ int tick_program_event(ktime_t expires, int force) * We need the clock event again, configure it in ONESHOT mode * before using it. */ - clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT); + clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT); } return clockevents_program_event(dev, expires, force); @@ -54,7 +54,7 @@ void tick_resume_oneshot(void) { struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); - clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT); + clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT); clockevents_program_event(dev, ktime_get(), true); } @@ -66,7 +66,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev, ktime_t next_event) { newdev->event_handler = handler; - clockevents_set_state(newdev, CLOCK_EVT_STATE_ONESHOT); + clockevents_switch_state(newdev, CLOCK_EVT_STATE_ONESHOT); clockevents_program_event(newdev, next_event, true); } @@ -97,7 +97,7 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) td->mode = TICKDEV_MODE_ONESHOT; dev->event_handler = handler; - clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT); + clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT); tick_broadcast_switch_to_oneshot(); return 0; } From 051ebd101b05c09d9b5b673e19fb0586e9bfec56 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 2 Jun 2015 14:13:46 +0200 Subject: [PATCH 088/106] clockevents: Use set/get state helper functions Signed-off-by: Thomas Gleixner Cc: Viresh Kumar Cc: Peter Zijlstra --- kernel/time/clockevents.c | 11 ++++++----- kernel/time/tick-common.c | 2 +- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index a45f90c4b2d5..2397b97320d8 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -137,7 +137,8 @@ static int __clockevents_switch_state(struct clock_event_device *dev, case CLOCK_EVT_STATE_ONESHOT_STOPPED: /* Core internal bug */ if (WARN_ONCE(!clockevent_state_oneshot(dev), - "Current state: %d\n", dev->state)) + "Current state: %d\n", + clockevent_get_state(dev))) return -EINVAL; if (dev->set_state_oneshot_stopped) @@ -160,11 +161,11 @@ static int __clockevents_switch_state(struct clock_event_device *dev, void clockevents_switch_state(struct clock_event_device *dev, enum clock_event_state state) { - if (dev->state != state) { + if (clockevent_get_state(dev) != state) { if (__clockevents_switch_state(dev, state)) return; - dev->state = state; + clockevent_set_state(dev, state); /* * A nsec2cyc multiplicator of 0 is invalid and we'd crash @@ -333,7 +334,7 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, /* We must be in ONESHOT state here */ WARN_ONCE(!clockevent_state_oneshot(dev), "Current state: %d\n", - dev->state); + clockevent_get_state(dev)); /* Shortcut for clockevent devices that can deal with ktime. 
*/ if (dev->features & CLOCK_EVT_FEAT_KTIME) @@ -496,7 +497,7 @@ void clockevents_register_device(struct clock_event_device *dev) BUG_ON(clockevents_sanity_check(dev)); /* Initialize state to DETACHED */ - dev->state = CLOCK_EVT_STATE_DETACHED; + clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED); if (!dev->cpumask) { WARN_ON(num_possible_cpus() > 1); diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 311e2e133517..17f144450050 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -377,7 +377,7 @@ void tick_shutdown(unsigned int cpu) * Prevent that the clock events layer tries to call * the set mode function! */ - dev->state = CLOCK_EVT_STATE_DETACHED; + clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED); dev->mode = CLOCK_EVT_MODE_UNUSED; clockevents_exchange_device(dev, NULL); dev->event_handler = clockevents_handle_noop; From be3ef76e9d9b97962c70bd6351787d29071ae481 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 2 Jun 2015 14:30:11 +0200 Subject: [PATCH 089/106] clockevents: Rename state to state_use_accessors The only sensible way to make abuse of core internal fields obvious and easy to grep for. Signed-off-by: Thomas Gleixner Cc: Viresh Kumar Cc: Peter Zijlstra --- include/linux/clockchips.h | 14 +++++++------- kernel/time/tick-internal.h | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index 64214ad85af9..597a1e836f22 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -87,7 +87,7 @@ enum clock_event_state { * @mult: nanosecond to cycles multiplier * @shift: nanoseconds to cycles divisor (power of two) * @mode: operating mode, relevant only to ->set_mode(), OBSOLETE - * @state: current state of the device, assigned by the core code + * @state_use_accessors:current state of the device, assigned by the core code * @features: features * @retries: number of forced programming retries * @set_mode: legacy set mode function, only for modes <= CLOCK_EVT_MODE_RESUME. 
@@ -117,7 +117,7 @@ struct clock_event_device { u32 mult; u32 shift; enum clock_event_mode mode; - enum clock_event_state state; + enum clock_event_state state_use_accessors; unsigned int features; unsigned long retries; @@ -152,27 +152,27 @@ struct clock_event_device { /* Helpers to verify state of a clockevent device */ static inline bool clockevent_state_detached(struct clock_event_device *dev) { - return dev->state == CLOCK_EVT_STATE_DETACHED; + return dev->state_use_accessors == CLOCK_EVT_STATE_DETACHED; } static inline bool clockevent_state_shutdown(struct clock_event_device *dev) { - return dev->state == CLOCK_EVT_STATE_SHUTDOWN; + return dev->state_use_accessors == CLOCK_EVT_STATE_SHUTDOWN; } static inline bool clockevent_state_periodic(struct clock_event_device *dev) { - return dev->state == CLOCK_EVT_STATE_PERIODIC; + return dev->state_use_accessors == CLOCK_EVT_STATE_PERIODIC; } static inline bool clockevent_state_oneshot(struct clock_event_device *dev) { - return dev->state == CLOCK_EVT_STATE_ONESHOT; + return dev->state_use_accessors == CLOCK_EVT_STATE_ONESHOT; } static inline bool clockevent_state_oneshot_stopped(struct clock_event_device *dev) { - return dev->state == CLOCK_EVT_STATE_ONESHOT_STOPPED; + return dev->state_use_accessors == CLOCK_EVT_STATE_ONESHOT_STOPPED; } /* diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 4461de9bb4b8..ec2208aabdd1 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -38,13 +38,13 @@ static inline int tick_device_is_functional(struct clock_event_device *dev) static inline enum clock_event_state clockevent_get_state(struct clock_event_device *dev) { - return dev->state; + return dev->state_use_accessors; } static inline void clockevent_set_state(struct clock_event_device *dev, enum clock_event_state state) { - dev->state = state; + dev->state_use_accessors = state; } extern void clockevents_shutdown(struct clock_event_device *dev); From d711b8b30c803b1b2aedf6a3474758798078f9e1 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sat, 6 Jun 2015 11:30:00 +0200 Subject: [PATCH 090/106] hrtimers: Make sure hrtimer_resolution is unsigned int MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ... in the !CONFIG_HIGH_RES_TIMERS case too. 
And thus fix warnings like this one: net/sched/sch_api.c: In function ‘psched_show’: net/sched/sch_api.c:1891:6: warning: format ‘%x’ expects argument of type ‘unsigned int’, but argument 6 has type ‘long int’ [-Wformat=] (u32)NSEC_PER_SEC / hrtimer_resolution); Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1433583000-32090-1-git-send-email-bp@alien8.de Signed-off-by: Thomas Gleixner Cc: Thomas Gleixner --- include/linux/hrtimer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 470d876c2eda..3f82a7edc03d 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -309,7 +309,7 @@ extern unsigned int hrtimer_resolution; # define MONOTONIC_RES_NSEC LOW_RES_NSEC # define KTIME_MONOTONIC_RES KTIME_LOW_RES -#define hrtimer_resolution LOW_RES_NSEC +#define hrtimer_resolution (unsigned int)LOW_RES_NSEC static inline void hrtimer_peek_ahead_timers(void) { } From ae60d6a0e3a9197d37f8c8c4584a8ecd18518cd6 Mon Sep 17 00:00:00 2001 From: Nicholas Mc Guire Date: Thu, 28 May 2015 19:09:55 +0200 Subject: [PATCH 091/106] time: Refactor usecs_to_jiffies Refactor the usecs_to_jiffies conditional code part in time.c and jiffies.h, putting it into conditional functions rather than #ifdefs to improve readability. This is analogous to the msecs_to_jiffies() cleanup in commit ca42aaf0c861 ("time: Refactor msecs_to_jiffies"). Signed-off-by: Nicholas Mc Guire Cc: Masahiro Yamada Cc: Sam Ravnborg Cc: Joe Perches Cc: John Stultz Cc: Andrew Hunter Cc: Paul Turner Cc: Michal Marek Link: http://lkml.kernel.org/r/1432832996-12129-1-git-send-email-hofrat@osadl.org Signed-off-by: Thomas Gleixner --- include/linux/jiffies.h | 27 ++++++++++++++++++++++++++- kernel/time/time.c | 13 +++---------- 2 files changed, 29 insertions(+), 11 deletions(-) diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index 3bde5eb8568b..a316ebea0a89 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -362,7 +362,32 @@ static inline unsigned long msecs_to_jiffies(const unsigned int m) } } -extern unsigned long usecs_to_jiffies(const unsigned int u); +extern unsigned long __usecs_to_jiffies(const unsigned int u); +#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) +static inline unsigned long _usecs_to_jiffies(const unsigned int u) +{ + return (u + (USEC_PER_SEC / HZ) - 1) / (USEC_PER_SEC / HZ); +} +#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) +static inline unsigned long _usecs_to_jiffies(const unsigned int u) +{ + return u * (HZ / USEC_PER_SEC); +} +#else +static inline unsigned long _usecs_to_jiffies(const unsigned int u) +{ + return (USEC_TO_HZ_MUL32 * u + USEC_TO_HZ_ADJ32) + >> USEC_TO_HZ_SHR32; +} +#endif + +static inline unsigned long usecs_to_jiffies(const unsigned int u) +{ + return __usecs_to_jiffies(u); +} + extern unsigned long timespec_to_jiffies(const struct timespec *value); extern void jiffies_to_timespec(const unsigned long jiffies, struct timespec *value); diff --git a/kernel/time/time.c b/kernel/time/time.c index 972e3bbac963..85d5bb1d67eb 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -522,20 +522,13 @@ unsigned long __msecs_to_jiffies(const unsigned int m) } EXPORT_SYMBOL(__msecs_to_jiffies); -unsigned long usecs_to_jiffies(const unsigned int u) +unsigned long __usecs_to_jiffies(const unsigned int u) { if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET)) return MAX_JIFFY_OFFSET; -#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) -
return (u + (USEC_PER_SEC / HZ) - 1) / (USEC_PER_SEC / HZ); -#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) - return u * (HZ / USEC_PER_SEC); -#else - return (USEC_TO_HZ_MUL32 * u + USEC_TO_HZ_ADJ32) - >> USEC_TO_HZ_SHR32; -#endif + return _usecs_to_jiffies(u); } -EXPORT_SYMBOL(usecs_to_jiffies); +EXPORT_SYMBOL(__usecs_to_jiffies); /* * The TICK_NSEC - 1 rounds up the value to the next resolution. Note From c569a23d65ac2900d9998d3fe04044fe95be6b2f Mon Sep 17 00:00:00 2001 From: Nicholas Mc Guire Date: Thu, 28 May 2015 19:09:56 +0200 Subject: [PATCH 092/106] time: Allow gcc to fold usecs_to_jiffies(constant) To allow constant folding, usecs_to_jiffies() conditionally calls the HZ-dependent _usecs_to_jiffies() helpers or, when gcc cannot do the constant folding, __usecs_to_jiffies(), which is the renamed original usecs_to_jiffies() function. Signed-off-by: Nicholas Mc Guire Cc: Masahiro Yamada Cc: Sam Ravnborg Cc: Joe Perches Cc: John Stultz Cc: Andrew Hunter Cc: Paul Turner Cc: Michal Marek Link: http://lkml.kernel.org/r/1432832996-12129-2-git-send-email-hofrat@osadl.org Signed-off-by: Thomas Gleixner --- include/linux/jiffies.h | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index a316ebea0a89..535fd3bb1ba8 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -383,9 +383,37 @@ static inline unsigned long _usecs_to_jiffies(const unsigned int u) } #endif +/** + * usecs_to_jiffies: - convert microseconds to jiffies + * @u: time in microseconds + * + * conversion is done as follows: + * + * - 'too large' values [that would result in larger than + * MAX_JIFFY_OFFSET values] mean 'infinite timeout' too. + * + * - all other values are converted to jiffies by either multiplying + * the input value by a factor or dividing it with a factor and + * handling any 32-bit overflows as for msecs_to_jiffies. + * + * usecs_to_jiffies() checks for the passed in value being a constant + * via __builtin_constant_p() allowing gcc to eliminate most of the + * code, __usecs_to_jiffies() is called if the value passed does not + * allow constant folding and the actual conversion must be done at + * runtime. + * the HZ range specific helpers _usecs_to_jiffies() are called both + * directly here and from __usecs_to_jiffies() in the case where + * constant folding is not possible. + */ static inline unsigned long usecs_to_jiffies(const unsigned int u) { - return __usecs_to_jiffies(u); + if (__builtin_constant_p(u)) { + if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET)) + return MAX_JIFFY_OFFSET; + return _usecs_to_jiffies(u); + } else { + return __usecs_to_jiffies(u); + } } extern unsigned long timespec_to_jiffies(const struct timespec *value); From 45bbfe64ea564a69e56ab6754006eee506224f46 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 25 May 2015 11:49:55 -0700 Subject: [PATCH 093/106] clocksource: Use current logging style clocksource messages aren't prefixed in dmesg so it's a bit unclear what subsystem emits the messages. Use pr_fmt and pr_<level> to auto-prefix the messages appropriately.
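For illustration, the mechanism at work, using the define and one of the messages from this patch (the resulting dmesg line assumes KBUILD_MODNAME expands to "clocksource"):

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt /* must precede the includes */

pr_warn("clock= boot option is deprecated - use clocksource=xyz\n");
/* dmesg: clocksource: clock= boot option is deprecated - use clocksource=xyz */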
Miscellanea: o Remove "Warning" from KERN_WARNING level messages o Align "timekeeping watchdog: " messages o Coalesce formats o Align multiline arguments Signed-off-by: Joe Perches Cc: John Stultz Link: http://lkml.kernel.org/r/1432579795.2846.75.camel@perches.com Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 15facb1b9c60..841b72f720e8 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -23,6 +23,8 @@ * o Allow clocksource drivers to be unregistered */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -216,10 +218,11 @@ static void clocksource_watchdog(unsigned long data) /* Check the deviation from the watchdog clocksource. */ if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) { - pr_warn("timekeeping watchdog: Marking clocksource '%s' as unstable, because the skew is too large:\n", cs->name); - pr_warn(" '%s' wd_now: %llx wd_last: %llx mask: %llx\n", + pr_warn("timekeeping watchdog: Marking clocksource '%s' as unstable because the skew is too large:\n", + cs->name); + pr_warn(" '%s' wd_now: %llx wd_last: %llx mask: %llx\n", watchdog->name, wdnow, wdlast, watchdog->mask); - pr_warn(" '%s' cs_now: %llx cs_last: %llx mask: %llx\n", + pr_warn(" '%s' cs_now: %llx cs_last: %llx mask: %llx\n", cs->name, csnow, cslast, cs->mask); __clocksource_unstable(cs); continue; @@ -567,9 +570,8 @@ static void __clocksource_select(bool skipcur) */ if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && oneshot) { /* Override clocksource cannot be used. */ - printk(KERN_WARNING "Override clocksource %s is not " - "HRT compatible. Cannot switch while in " - "HRT/NOHZ mode\n", cs->name); + pr_warn("Override clocksource %s is not HRT compatible - cannot switch while in HRT/NOHZ mode\n", + cs->name); override_name[0] = 0; } else /* Override clocksource can be used. */ @@ -708,8 +710,8 @@ void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq clocksource_update_max_deferment(cs); - pr_info("clocksource %s: mask: 0x%llx max_cycles: 0x%llx, max_idle_ns: %lld ns\n", - cs->name, cs->mask, cs->max_cycles, cs->max_idle_ns); + pr_info("%s: mask: 0x%llx max_cycles: 0x%llx, max_idle_ns: %lld ns\n", + cs->name, cs->mask, cs->max_cycles, cs->max_idle_ns); } EXPORT_SYMBOL_GPL(__clocksource_update_freq_scale); @@ -1008,12 +1010,10 @@ __setup("clocksource=", boot_override_clocksource); static int __init boot_override_clock(char* str) { if (!strcmp(str, "pmtmr")) { - printk("Warning: clock=pmtmr is deprecated. " - "Use clocksource=acpi_pm.\n"); + pr_warn("clock=pmtmr is deprecated - use clocksource=acpi_pm\n"); return boot_override_clocksource("acpi_pm"); } - printk("Warning! clock= boot option is deprecated. " - "Use clocksource=xyz\n"); + pr_warn("clock= boot option is deprecated - use clocksource=xyz\n"); return boot_override_clocksource(str); } From d151832650ed98961a5650e73e85c349ad7839cb Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 11 Jun 2015 15:54:53 -0700 Subject: [PATCH 094/106] time: Move clock_was_set_seq update before updating shadow-timekeeper It was reported that 868a3e915f7f5eba (hrtimer: Make offset update smarter) was causing timer problems after suspend/resume. The problem with that change is the modification to clock_was_set_seq in timekeeping_update is done prior to mirroring the time state to the shadow-timekeeper. 
Thus the next time we do update_wall_time() the updated sequence is overwritten by what's in the shadow copy. This patch moves the shadow-timekeeper mirroring to the end of the function, after all updates have been made, so all data is kept in sync. (This patch also affects the update_fast_timekeeper() calls, which were likewise problematically done prior to the mirroring.)

Reported-and-tested-by: Jeremiah Mahler Signed-off-by: John Stultz Cc: Preeti U Murthy Cc: Peter Zijlstra Cc: Viresh Kumar Cc: Marcelo Tosatti Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/1434063297-28657-2-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 90ed5db67c1d..849b93265904 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -585,15 +585,19 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) update_vsyscall(tk); update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); - if (action & TK_MIRROR) - memcpy(&shadow_timekeeper, &tk_core.timekeeper, - sizeof(tk_core.timekeeper)); - update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono); update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw); if (action & TK_CLOCK_WAS_SET) tk->clock_was_set_seq++; + /* + * The mirroring of the data to the shadow-timekeeper needs + * to happen last here to ensure we don't over-write the + * timekeeper structure on the next update with stale data + */ + if (action & TK_MIRROR) + memcpy(&shadow_timekeeper, &tk_core.timekeeper, + sizeof(tk_core.timekeeper)); } /**

From 90bf361ceae28dee50a584c3dd4c1a96178d982c Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 11 Jun 2015 15:54:54 -0700 Subject: [PATCH 095/106] ntp: Introduce and use SECS_PER_DAY macro instead of 86400

Currently the leapsecond logic uses what looks like magic values. Improve this by defining SECS_PER_DAY and using that macro to make the logic clearer.
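As a worked example (a user-space sketch, not from the patch) of the day-boundary arithmetic this macro feeds into (see the TIME_INS handling in the following patches):

    #include <stdio.h>

    #define SECS_PER_DAY 86400

    int main(void)
    {
            /* Example timestamp: 2015-06-30 12:00:00 UTC */
            long secs = 1435665600;

            /* Round up to the next midnight, as the TIME_INS path does */
            long next_midnight = secs + SECS_PER_DAY - (secs % SECS_PER_DAY);

            /* Prints 1435708800 = 2015-07-01 00:00:00 UTC */
            printf("next midnight at %ld\n", next_midnight);
            return 0;
    }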
Signed-off-by: John Stultz Cc: Prarit Bhargava Cc: Daniel Bristot de Oliveira Cc: Richard Cochran Cc: Jan Kara Cc: Jiri Bohac Cc: Ingo Molnar Link: http://lkml.kernel.org/r/1434063297-28657-3-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/ntp.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 7a681003001c..7aa216188450 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -35,6 +35,7 @@ unsigned long tick_nsec; static u64 tick_length; static u64 tick_length_base; +#define SECS_PER_DAY 86400 #define MAX_TICKADJ 500LL /* usecs */ #define MAX_TICKADJ_SCALED \ (((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ) @@ -390,7 +391,7 @@ int second_overflow(unsigned long secs) case TIME_INS: if (!(time_status & STA_INS)) time_state = TIME_OK; - else if (secs % 86400 == 0) { + else if (secs % SECS_PER_DAY == 0) { leap = -1; time_state = TIME_OOP; printk(KERN_NOTICE @@ -400,7 +401,7 @@ int second_overflow(unsigned long secs) case TIME_DEL: if (!(time_status & STA_DEL)) time_state = TIME_OK; - else if ((secs + 1) % 86400 == 0) { + else if ((secs + 1) % SECS_PER_DAY == 0) { leap = 1; time_state = TIME_WAIT; printk(KERN_NOTICE

From 833f32d763028c1bb371c64f457788b933773b3e Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 11 Jun 2015 15:54:55 -0700 Subject: [PATCH 096/106] time: Prevent early expiry of hrtimers[CLOCK_REALTIME] at the leap second edge

Currently, leapsecond adjustments are done at tick time. As a result, the leapsecond was applied at the first timer tick *after* the leapsecond (~1-10ms late depending on HZ), rather than exactly on the second edge. This was in part historical from back when we were always tick based, but correcting it has since been avoided because it adds extra conditional checks in the gettime fastpath, which has performance overhead.

However, it was recently pointed out that ABS_TIME CLOCK_REALTIME timers set for right after the leapsecond could fire a second early, since some timers may be expired before we trigger the timekeeping timer, which then applies the leapsecond. This isn't quite as bad as it sounds, since behaviorally it is similar to what is possible when ntpd makes leapsecond adjustments without using the kernel discipline; there, due to latencies, timers may fire just prior to the settimeofday() call. (Also, one should note that all applications using CLOCK_REALTIME timers should always be careful, since they are prone to quirks from settimeofday() disturbances.) However, the purpose of having the kernel do the leap adjustment is to avoid such latencies, so I think this is worth fixing.

So in order to properly keep those timers from firing a second early, this patch modifies the ntp and timekeeping logic so that we keep enough state for the update_base_offsets_now accessor, which provides the hrtimer core with the current time, to check and apply the leapsecond adjustment on the second edge. This prevents the hrtimer core from expiring timers too early. This patch does not modify any other time read path, so no additional overhead is incurred. However, this also means that the leap-second continues to be applied at tick time for all other read-paths.

Apologies to Richard Cochran, who pushed for similar changes years ago, which I resisted due to concerns about the performance overhead. While I suspect this isn't extremely critical, folks who care about strict leap-second correctness will likely want to watch this.
Potentially a -stable candidate eventually. Originally-suggested-by: Richard Cochran Reported-by: Daniel Bristot de Oliveira Reported-by: Prarit Bhargava Signed-off-by: John Stultz Cc: Richard Cochran Cc: Jan Kara Cc: Jiri Bohac Cc: Shuah Khan Cc: Ingo Molnar Link: http://lkml.kernel.org/r/1434063297-28657-4-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- include/linux/time64.h | 1 + include/linux/timekeeper_internal.h | 2 ++ kernel/time/ntp.c | 42 ++++++++++++++++++++++++----- kernel/time/ntp_internal.h | 1 + kernel/time/timekeeping.c | 23 +++++++++++++++- 5 files changed, 61 insertions(+), 8 deletions(-) diff --git a/include/linux/time64.h b/include/linux/time64.h index 12d4e82b0276..77b5df2acd2a 100644 --- a/include/linux/time64.h +++ b/include/linux/time64.h @@ -29,6 +29,7 @@ struct timespec64 { #define FSEC_PER_SEC 1000000000000000LL /* Located here for timespec[64]_valid_strict */ +#define TIME64_MAX ((s64)~((u64)1 << 63)) #define KTIME_MAX ((s64)~((u64)1 << 63)) #define KTIME_SEC_MAX (KTIME_MAX / NSEC_PER_SEC) diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index e1f5a1136554..25247220b4b7 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -50,6 +50,7 @@ struct tk_read_base { * @offs_tai: Offset clock monotonic -> clock tai * @tai_offset: The current UTC to TAI offset in seconds * @clock_was_set_seq: The sequence number of clock was set events + * @next_leap_ktime: CLOCK_MONOTONIC time value of a pending leap-second * @raw_time: Monotonic raw base time in timespec64 format * @cycle_interval: Number of clock cycles in one NTP interval * @xtime_interval: Number of clock shifted nano seconds in one NTP @@ -90,6 +91,7 @@ struct timekeeper { ktime_t offs_tai; s32 tai_offset; unsigned int clock_was_set_seq; + ktime_t next_leap_ktime; struct timespec64 raw_time; /* The following members are for timekeeping internal use */ diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 7aa216188450..033743e3647a 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -77,6 +77,9 @@ static long time_adjust; /* constant (boot-param configurable) NTP tick adjustment (upscaled) */ static s64 ntp_tick_adj; +/* second value of the next pending leapsecond, or TIME64_MAX if no leap */ +static time64_t ntp_next_leap_sec = TIME64_MAX; + #ifdef CONFIG_NTP_PPS /* @@ -350,6 +353,7 @@ void ntp_clear(void) tick_length = tick_length_base; time_offset = 0; + ntp_next_leap_sec = TIME64_MAX; /* Clear PPS state variables */ pps_clear(); } @@ -360,6 +364,21 @@ u64 ntp_tick_length(void) return tick_length; } +/** + * ntp_get_next_leap - Returns the next leapsecond in CLOCK_REALTIME ktime_t + * + * Provides the time of the next leapsecond against CLOCK_REALTIME in + * a ktime_t format. Returns KTIME_MAX if no leapsecond is pending. 
+ */ +ktime_t ntp_get_next_leap(void) +{ + ktime_t ret; + + if ((time_state == TIME_INS) && (time_status & STA_INS)) + return ktime_set(ntp_next_leap_sec, 0); + ret.tv64 = KTIME_MAX; + return ret; +} /* * this routine handles the overflow of the microsecond field @@ -383,15 +402,21 @@ int second_overflow(unsigned long secs) */ switch (time_state) { case TIME_OK: - if (time_status & STA_INS) + if (time_status & STA_INS) { time_state = TIME_INS; - else if (time_status & STA_DEL) + ntp_next_leap_sec = secs + SECS_PER_DAY - + (secs % SECS_PER_DAY); + } else if (time_status & STA_DEL) { time_state = TIME_DEL; + ntp_next_leap_sec = secs + SECS_PER_DAY - + ((secs+1) % SECS_PER_DAY); + } break; case TIME_INS: - if (!(time_status & STA_INS)) + if (!(time_status & STA_INS)) { + ntp_next_leap_sec = TIME64_MAX; time_state = TIME_OK; - else if (secs % SECS_PER_DAY == 0) { + } else if (secs % SECS_PER_DAY == 0) { leap = -1; time_state = TIME_OOP; printk(KERN_NOTICE @@ -399,19 +424,21 @@ int second_overflow(unsigned long secs) } break; case TIME_DEL: - if (!(time_status & STA_DEL)) + if (!(time_status & STA_DEL)) { + ntp_next_leap_sec = TIME64_MAX; time_state = TIME_OK; - else if ((secs + 1) % SECS_PER_DAY == 0) { + } else if ((secs + 1) % SECS_PER_DAY == 0) { leap = 1; + ntp_next_leap_sec = TIME64_MAX; time_state = TIME_WAIT; printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n"); } break; case TIME_OOP: + ntp_next_leap_sec = TIME64_MAX; time_state = TIME_WAIT; break; - case TIME_WAIT: if (!(time_status & (STA_INS | STA_DEL))) time_state = TIME_OK; @@ -548,6 +575,7 @@ static inline void process_adj_status(struct timex *txc, struct timespec64 *ts) if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) { time_state = TIME_OK; time_status = STA_UNSYNC; + ntp_next_leap_sec = TIME64_MAX; /* restart PPS frequency calibration */ pps_reset_freq_interval(); } diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h index bbd102ad9df7..65430504ca26 100644 --- a/kernel/time/ntp_internal.h +++ b/kernel/time/ntp_internal.h @@ -5,6 +5,7 @@ extern void ntp_init(void); extern void ntp_clear(void); /* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. 
*/ extern u64 ntp_tick_length(void); +extern ktime_t ntp_get_next_leap(void); extern int second_overflow(unsigned long secs); extern int ntp_validate_timex(struct timex *); extern int __do_adjtimex(struct timex *, struct timespec64 *, s32 *);

diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 849b93265904..5d67ffb7e317 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -539,6 +539,17 @@ int pvclock_gtod_unregister_notifier(struct notifier_block *nb) } EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); +/* + * tk_update_leap_state - helper to update the next_leap_ktime + */ +static inline void tk_update_leap_state(struct timekeeper *tk) +{ + tk->next_leap_ktime = ntp_get_next_leap(); + if (tk->next_leap_ktime.tv64 != KTIME_MAX) + /* Convert to monotonic time */ + tk->next_leap_ktime = ktime_sub(tk->next_leap_ktime, tk->offs_real); +} + /* * Update the ktime_t based scalar nsec members of the timekeeper */ @@ -580,6 +591,7 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) ntp_clear(); } + tk_update_leap_state(tk); tk_update_ktime_data(tk); update_vsyscall(tk); @@ -1956,15 +1968,22 @@ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real, base = tk->tkr_mono.base; nsecs = timekeeping_get_ns(&tk->tkr_mono); + base = ktime_add_ns(base, nsecs); + if (*cwsseq != tk->clock_was_set_seq) { *cwsseq = tk->clock_was_set_seq; *offs_real = tk->offs_real; *offs_boot = tk->offs_boot; *offs_tai = tk->offs_tai; } + + /* Handle leapsecond insertion adjustments */ + if (unlikely(base.tv64 >= tk->next_leap_ktime.tv64)) + *offs_real = ktime_sub(tk->offs_real, ktime_set(1, 0)); + } while (read_seqcount_retry(&tk_core.seq, seq)); - return ktime_add_ns(base, nsecs); + return base; } /** @@ -2006,6 +2025,8 @@ int do_adjtimex(struct timex *txc) __timekeeping_set_tai_offset(tk, tai); timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); } + tk_update_leap_state(tk); + write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags);

From 96efdcf2d080687e041b0353c604b708546689fd Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 11 Jun 2015 15:54:56 -0700 Subject: [PATCH 097/106] ntp: Do leapsecond adjustment in adjtimex read path

Since the leapsecond is applied at tick-time, this means there is a small window of time at the start of a leap-second where we cross into the next second before applying the leap. This patch modifies adjtimex so that the leap-second is applied on the second edge, providing more correct leapsecond behavior.

This does make it so that adjtimex()'s returned time values can be inconsistent with time values read from gettimeofday() or clock_gettime(CLOCK_REALTIME,...) for a brief period of one tick at the leapsecond. However, those other interfaces do not provide the TIME_OOP time_state return that adjtimex() provides, which allows the leapsecond to be properly represented. They instead only see a time discontinuity, and cannot tell the first 23:59:59 from the repeated 23:59:59 leap second.

This seems like a reasonable tradeoff given clock_gettime() / gettimeofday() cannot properly represent a leapsecond, and users likely care more about performance, while folks who are using adjtimex() more likely care about leap-second correctness.
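For illustration only (not part of the patch), a minimal user-space reader of the adjtimex() path this changes; error handling is omitted:

    #include <stdio.h>
    #include <sys/timex.h>

    int main(void)
    {
            struct timex tx = { .modes = 0 };       /* read-only query */
            int state = adjtimex(&tx);

            /*
             * state is TIME_OK/TIME_INS/TIME_OOP/TIME_WAIT/...; with this
             * patch, tv_sec and tai are adjusted on the leap edge, so
             * TIME_OOP is reported together with the repeated 23:59:59.
             */
            printf("state=%d sec=%ld tai=%d\n", state, tx.time.tv_sec, tx.tai);
            return 0;
    }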
Signed-off-by: John Stultz Cc: Prarit Bhargava Cc: Daniel Bristot de Oliveira Cc: Richard Cochran Cc: Jan Kara Cc: Jiri Bohac Cc: Ingo Molnar Link: http://lkml.kernel.org/r/1434063297-28657-5-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/ntp.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 033743e3647a..fb4d98c7fd43 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -740,6 +740,24 @@ int __do_adjtimex(struct timex *txc, struct timespec64 *ts, s32 *time_tai) if (!(time_status & STA_NANO)) txc->time.tv_usec /= NSEC_PER_USEC; + /* Handle leapsec adjustments */ + if (unlikely(ts->tv_sec >= ntp_next_leap_sec)) { + if ((time_state == TIME_INS) && (time_status & STA_INS)) { + result = TIME_OOP; + txc->tai++; + txc->time.tv_sec--; + } + if ((time_state == TIME_DEL) && (time_status & STA_DEL)) { + result = TIME_WAIT; + txc->tai--; + txc->time.tv_sec++; + } + if ((time_state == TIME_OOP) && + (ts->tv_sec == ntp_next_leap_sec)) { + result = TIME_WAIT; + } + } + return result; } From 0c4a5fc95b1df42651a9b4c1f72d348b3d196ea0 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 11 Jun 2015 15:54:57 -0700 Subject: [PATCH 098/106] selftests: timers: Add leap-second timer edge testing to leap-a-day.c Prarit reported an issue w/ timers around the leapsecond, where a timer set for Midnight UTC (00:00:00) might fire a second early right before the leapsecond (23:59:60 - though it appears as a repeated 23:59:59) is applied. So I've updated the leap-a-day.c test to integrate a similar test, where we set a timer and check if it triggers at the right time, and if the ntp state transition is managed properly. Reported-by: Daniel Bristot de Oliveira Reported-by: Prarit Bhargava Signed-off-by: John Stultz Cc: Richard Cochran Cc: Jan Kara Cc: Jiri Bohac Cc: Shuah Khan Cc: Ingo Molnar Link: http://lkml.kernel.org/r/1434063297-28657-6-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- tools/testing/selftests/timers/leap-a-day.c | 76 +++++++++++++++++++-- 1 file changed, 72 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/timers/leap-a-day.c b/tools/testing/selftests/timers/leap-a-day.c index b8272e6c4b3b..331c4f70d8a1 100644 --- a/tools/testing/selftests/timers/leap-a-day.c +++ b/tools/testing/selftests/timers/leap-a-day.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include #include @@ -63,6 +64,9 @@ static inline int ksft_exit_fail(void) #define NSEC_PER_SEC 1000000000ULL #define CLOCK_TAI 11 +time_t next_leap; +int error_found; + /* returns 1 if a <= b, 0 otherwise */ static inline int in_order(struct timespec a, struct timespec b) { @@ -134,6 +138,34 @@ void handler(int unused) exit(0); } +void sigalarm(int signo) +{ + struct timex tx; + char buf[26]; + int ret; + + tx.modes = 0; + ret = adjtimex(&tx); + + ctime_r(&tx.time.tv_sec, buf); + buf[strlen(buf)-1] = 0; /*remove trailing\n */ + printf("%s + %6ld us (%i)\t%s - TIMER FIRED\n", + buf, + tx.time.tv_usec, + tx.tai, + time_state_str(ret)); + + if (tx.time.tv_sec < next_leap) { + printf("Error: Early timer expiration!\n"); + error_found = 1; + } + if (ret != TIME_WAIT) { + printf("Error: Incorrect NTP state?\n"); + error_found = 1; + } +} + + /* Test for known hrtimer failure */ void test_hrtimer_failure(void) { @@ -144,12 +176,19 @@ void test_hrtimer_failure(void) clock_nanosleep(CLOCK_REALTIME, TIMER_ABSTIME, &target, NULL); clock_gettime(CLOCK_REALTIME, &now); - if (!in_order(target, now)) 
+ if (!in_order(target, now)) { printf("ERROR: hrtimer early expiration failure observed.\n"); + error_found = 1; + } } int main(int argc, char **argv) { + timer_t tm1; + struct itimerspec its1; + struct sigevent se; + struct sigaction act; + int signum = SIGRTMAX; int settime = 0; int tai_time = 0; int insert = 1; @@ -191,6 +230,12 @@ int main(int argc, char **argv) signal(SIGINT, handler); signal(SIGKILL, handler); + /* Set up timer signal handler: */ + sigfillset(&act.sa_mask); + act.sa_flags = 0; + act.sa_handler = sigalarm; + sigaction(signum, &act, NULL); + if (iterations < 0) printf("This runs continuously. Press ctrl-c to stop\n"); else @@ -201,7 +246,7 @@ int main(int argc, char **argv) int ret; struct timespec ts; struct timex tx; - time_t now, next_leap; + time_t now; /* Get the current time */ clock_gettime(CLOCK_REALTIME, &ts); @@ -251,10 +296,27 @@ int main(int argc, char **argv) printf("Scheduling leap second for %s", ctime(&next_leap)); + /* Set up timer */ + printf("Setting timer for %s", ctime(&next_leap)); + memset(&se, 0, sizeof(se)); + se.sigev_notify = SIGEV_SIGNAL; + se.sigev_signo = signum; + se.sigev_value.sival_int = 0; + if (timer_create(CLOCK_REALTIME, &se, &tm1) == -1) { + printf("Error: timer_create failed\n"); + return ksft_exit_fail(); + } + its1.it_value.tv_sec = next_leap; + its1.it_value.tv_nsec = 0; + its1.it_interval.tv_sec = 0; + its1.it_interval.tv_nsec = 0; + timer_settime(tm1, TIMER_ABSTIME, &its1, NULL); + /* Wake up 3 seconds before leap */ ts.tv_sec = next_leap - 3; ts.tv_nsec = 0; + while (clock_nanosleep(CLOCK_REALTIME, TIMER_ABSTIME, &ts, NULL)) printf("Something woke us up, returning to sleep\n"); @@ -276,6 +338,7 @@ int main(int argc, char **argv) while (now < next_leap + 2) { char buf[26]; struct timespec tai; + int ret; tx.modes = 0; ret = adjtimex(&tx); @@ -308,8 +371,13 @@ int main(int argc, char **argv) /* Note if kernel has known hrtimer failure */ test_hrtimer_failure(); - printf("Leap complete\n\n"); - + printf("Leap complete\n"); + if (error_found) { + printf("Errors observed\n"); + clear_time_state(); + return ksft_exit_fail(); + } + printf("\n"); if ((iterations != -1) && !(--iterations)) break; }

From a9d20988ac7db47fec4510cefc966e876a4ce674 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 17 Jun 2015 16:04:46 +0530 Subject: [PATCH 099/106] clockevents: Check state instead of mode in suspend/resume path

CLOCK_EVT_MODE_* macros are present for backward compatibility (as most of the drivers are still using the old ->set_mode() interface). These macros shouldn't be used anymore in code that is common to both driver interfaces, i.e. ->set_mode() and ->set_state_*().

Drivers implementing the ->set_state_*() interface, which have their clkevt->mode set to 0 (clkevt device structures are normally globally defined), will not participate in suspend/resume, as they will always be marked as UNUSED.

Fix this by checking the state of the clockevent device instead of the mode, as the state is updated for both interfaces.
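To make the failure mode concrete, a stand-alone sketch (the enum here is a hypothetical mirror of the era's definitions, assuming UNUSED is the zero-valued first enumerator):

    #include <stdio.h>

    enum clock_event_mode { CLOCK_EVT_MODE_UNUSED, CLOCK_EVT_MODE_SHUTDOWN };

    struct clock_event_device { enum clock_event_mode mode; };

    /* Globally defined device: every field not set is zero-initialized,
     * so a ->set_state_*() driver that never touches ->mode leaves it 0. */
    static struct clock_event_device clkevt;

    int main(void)
    {
            /* The old suspend/resume check: */
            if (clkevt.mode != CLOCK_EVT_MODE_UNUSED)
                    puts("suspend callback would run");
            else
                    puts("skipped as UNUSED - the bug described above");
            return 0;
    }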
Fixes: ac34ad27fc16 ("clockevents: Do not suspend/resume if unused") Signed-off-by: Viresh Kumar Cc: linaro-kernel@lists.linaro.org Cc: alexandre.belloni@free-electrons.com Cc: sylvain.rochet@finsecur.com Link: http://lkml.kernel.org/r/a1964eef6e8a47d02b1ff9083c6c91f73f0ff643.1434537215.git.viresh.kumar@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/clockevents.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 2397b97320d8..08ccc3da3ca0 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -639,7 +639,7 @@ void clockevents_suspend(void) struct clock_event_device *dev; list_for_each_entry_reverse(dev, &clockevent_devices, list) - if (dev->suspend && dev->mode != CLOCK_EVT_MODE_UNUSED) + if (dev->suspend && !clockevent_state_detached(dev)) dev->suspend(dev); } @@ -651,7 +651,7 @@ void clockevents_resume(void) struct clock_event_device *dev; list_for_each_entry(dev, &clockevent_devices, list) - if (dev->resume && dev->mode != CLOCK_EVT_MODE_UNUSED) + if (dev->resume && !clockevent_state_detached(dev)) dev->resume(dev); }

From 906c55579a6360dd9ef5a3101bb2e3ae396dfb97 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 17 Jun 2015 10:05:53 -0700 Subject: [PATCH 100/106] timekeeping: Copy the shadow-timekeeper over the real timekeeper last

The fix in d151832650ed9 (time: Move clock_was_set_seq update before updating shadow-timekeeper) was unfortunately incomplete. The main gist of that change was to do the shadow-copy update last, so that any state changes were properly duplicated, and we wouldn't accidentally have stale data in the shadow.

Unfortunately, in the main update_wall_time() logic, we use the shadow-timekeeper to calculate the next update values, then, while holding the lock, copy the shadow-timekeeper over, then call timekeeping_update() to do some additional bookkeeping (skipping the shadow mirror). The bug with this is that the additional bookkeeping isn't all read-only; some of it changes timekeeper state. Thus we might then overwrite this state change on the next update.

To avoid this problem, do the timekeeping_update() on the shadow-timekeeper prior to copying the full state over to the real-timekeeper. This avoids problems with both the clock_was_set_seq and next_leap_ktime being overwritten and possibly the fast-timekeepers as well.

Many thanks to Prarit for his rigorous testing, which discovered this problem, along with Prarit and Daniel's work validating this fix.

Reported-by: Prarit Bhargava Tested-by: Prarit Bhargava Tested-by: Daniel Bristot de Oliveira Signed-off-by: John Stultz Cc: Richard Cochran Cc: Jan Kara Cc: Jiri Bohac Cc: Ingo Molnar Link: http://lkml.kernel.org/r/1434560753-7441-1-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 5d67ffb7e317..30b7a409bf1e 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1853,8 +1853,9 @@ void update_wall_time(void) * memcpy under the tk_core.seq against one before we start * updating. */ + timekeeping_update(tk, clock_set); memcpy(real_tk, tk, sizeof(*tk)); - timekeeping_update(real_tk, clock_set); + /* The memcpy must come last. Do not put anything here!
*/ write_seqcount_end(&tk_core.seq); out: raw_spin_unlock_irqrestore(&timekeeper_lock, flags);

From 51a16c1e887a5975ada27a3ae935a4f2783005da Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 17 Jun 2015 11:16:43 -0700 Subject: [PATCH 101/106] selftest: Timers: Avoid signal deadlock in leap-a-day

In 0c4a5fc95b1df (Add leap-second timer edge testing to leap-a-day.c), we added a timer to the test which checks to make sure timers near the leapsecond edge behave correctly. However, the output generated from the timer uses ctime_r, which isn't async-signal safe, and should that signal land while the main test is using ctime_r to print its output, it's possible for the test to deadlock on glibc internal locks. Thus this patch reworks the output to avoid using ctime_r in the signal handler.

Signed-off-by: John Stultz Cc: Prarit Bhargava Cc: Daniel Bristot de Oliveira Cc: Richard Cochran Cc: Jan Kara Cc: Jiri Bohac Cc: Shuah Khan Cc: Ingo Molnar Link: http://lkml.kernel.org/r/1434565003-3386-1-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- tools/testing/selftests/timers/leap-a-day.c | 23 +++++++++++---------- 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/tools/testing/selftests/timers/leap-a-day.c b/tools/testing/selftests/timers/leap-a-day.c index 331c4f70d8a1..fb46ad6ac92c 100644 --- a/tools/testing/selftests/timers/leap-a-day.c +++ b/tools/testing/selftests/timers/leap-a-day.c @@ -141,27 +141,28 @@ void handler(int unused) void sigalarm(int signo) { struct timex tx; - char buf[26]; int ret; tx.modes = 0; ret = adjtimex(&tx); - ctime_r(&tx.time.tv_sec, buf); - buf[strlen(buf)-1] = 0; /*remove trailing\n */ - printf("%s + %6ld us (%i)\t%s - TIMER FIRED\n", - buf, + if (tx.time.tv_sec < next_leap) { + printf("Error: Early timer expiration! (Should be %ld)\n", next_leap); + error_found = 1; + printf("adjtimex: %10ld sec + %6ld us (%i)\t%s\n", + tx.time.tv_sec, tx.time.tv_usec, tx.tai, time_state_str(ret)); } if (ret != TIME_WAIT) { - printf("Error: Timer seeing incorrect NTP state? (Should be TIME_WAIT)\n"); error_found = 1; + printf("adjtimex: %10ld sec + %6ld us (%i)\t%s\n", + tx.time.tv_sec, + tx.time.tv_usec, + tx.tai, + time_state_str(ret)); } } @@ -297,7 +298,7 @@ int main(int argc, char **argv) printf("Scheduling leap second for %s", ctime(&next_leap)); /* Set up timer */ - printf("Setting timer for %s", ctime(&next_leap)); + printf("Setting timer for %ld - %s", next_leap, ctime(&next_leap)); memset(&se, 0, sizeof(se)); se.sigev_notify = SIGEV_SIGNAL; se.sigev_signo = signum;

From c04dca02bc73096435a5c36efd5ccb2171edcbe1 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 11 Jun 2015 14:46:44 +0200 Subject: [PATCH 102/106] hrtimer: Remove HRTIMER_STATE_MIGRATE

I do not understand HRTIMER_STATE_MIGRATE. Unless I am totally confused it looks buggy and simply unneeded. migrate_hrtimer_list() sets it to keep hrtimer_active() == T, but this is not enough: this can fool, say, hrtimer_is_queued() in dequeue_signal(). Can't migrate_hrtimer_list() simply use HRTIMER_STATE_ENQUEUED? This fixes the race and we can kill STATE_MIGRATE.
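A small sketch of the race being described (the state values are taken from the hunk below; the bit test mirrors the assumption that hrtimer_is_queued() checked the ENQUEUED bit at the time):

    #include <stdio.h>

    #define HRTIMER_STATE_INACTIVE 0x00
    #define HRTIMER_STATE_ENQUEUED 0x01
    #define HRTIMER_STATE_MIGRATE  0x04

    int main(void)
    {
            unsigned long state = HRTIMER_STATE_MIGRATE; /* mid-migration */

            /* hrtimer_active(): state != INACTIVE, so the timer looks active */
            printf("active: %d\n", state != HRTIMER_STATE_INACTIVE);

            /* but the ENQUEUED bit is clear, so hrtimer_is_queued() style
             * checks see the timer as unqueued - the hole being closed */
            printf("queued: %d\n", !!(state & HRTIMER_STATE_ENQUEUED));
            return 0;
    }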
Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: ktkhai@parallels.com Cc: rostedt@goodmis.org Cc: juri.lelli@gmail.com Cc: pang.xunlei@linaro.org Cc: wanpeng.li@linux.intel.com Cc: umgwanakikbuti@gmail.com Link: http://lkml.kernel.org/r/20150611124743.072387650@infradead.org Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 6 +----- kernel/time/hrtimer.c | 7 ++----- 2 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 3f82a7edc03d..2f9e57d3d126 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -70,17 +70,13 @@ enum hrtimer_restart { * the handling of the timer. * * The HRTIMER_STATE_ENQUEUED bit is always or'ed to the current state - * to preserve the HRTIMER_STATE_CALLBACK in the above scenario. This - * also affects HRTIMER_STATE_MIGRATE where the preservation is not - * necessary. HRTIMER_STATE_MIGRATE is cleared after the timer is - * enqueued on the new cpu. + * to preserve the HRTIMER_STATE_CALLBACK in the above scenario. * * All state transitions are protected by cpu_base->lock. */ #define HRTIMER_STATE_INACTIVE 0x00 #define HRTIMER_STATE_ENQUEUED 0x01 #define HRTIMER_STATE_CALLBACK 0x02 -#define HRTIMER_STATE_MIGRATE 0x04 /** * struct hrtimer - the basic hrtimer structure

diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 278d4b36fd94..b1b795e5e0b1 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1508,11 +1508,11 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, debug_deactivate(timer); /* - * Mark it as STATE_MIGRATE not INACTIVE otherwise the + * Mark it as ENQUEUED not INACTIVE otherwise the * timer could be seen as !active and just vanish away * under us on another CPU */ - __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0); + __remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0); timer->base = new_base; /* * Enqueue the timers on the new cpu. This does not @@ -1523,9 +1523,6 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, * event device. */ enqueue_hrtimer(timer, new_base); - - /* Clear the migration state bit */ - timer->state &= ~HRTIMER_STATE_MIGRATE; } }

From 8edfb0362e8e52dec2de08fa163af01c9da2c9d0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Jun 2015 14:46:45 +0200 Subject: [PATCH 103/106] hrtimer: Fix hrtimer_is_queued() hole

A queued hrtimer that gets restarted (hrtimer_start*() while hrtimer_is_queued()) will briefly appear as unqueued/inactive, even though the timer has always been active; we just moved it. Close this hole by preserving timer->state in hrtimer_start_range_ns()'s remove_hrtimer() call.
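Schematically (a user-space sketch of the window, not kernel code):

    #include <stdio.h>

    #define HRTIMER_STATE_INACTIVE 0x00
    #define HRTIMER_STATE_ENQUEUED 0x01

    int main(void)
    {
            unsigned long state = HRTIMER_STATE_ENQUEUED; /* timer queued */

            /* Old hrtimer_start() behaviour on a queued timer: */
            state = HRTIMER_STATE_INACTIVE;     /* remove_hrtimer() */
            /* ... a concurrent observer here briefly sees ... */
            printf("active during restart: %d\n",
                   state != HRTIMER_STATE_INACTIVE);    /* 0 - the hole */
            state = HRTIMER_STATE_ENQUEUED;     /* enqueue_hrtimer() */

            /* With the fix, remove_hrtimer(..., restart=true) preserves
             * timer->state, so the observer never sees INACTIVE here. */
            return 0;
    }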
Reported-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: ktkhai@parallels.com Cc: rostedt@goodmis.org Cc: juri.lelli@gmail.com Cc: pang.xunlei@linaro.org Cc: wanpeng.li@linux.intel.com Cc: umgwanakikbuti@gmail.com Link: http://lkml.kernel.org/r/20150611124743.175989138@infradead.org Signed-off-by: Thomas Gleixner --- kernel/time/hrtimer.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index b1b795e5e0b1..1604157374d7 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -889,10 +889,10 @@ static void __remove_hrtimer(struct hrtimer *timer, * remove hrtimer, called with base lock held */ static inline int -remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) +remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart) { if (hrtimer_is_queued(timer)) { - unsigned long state; + unsigned long state = timer->state; int reprogram; /* @@ -906,12 +906,15 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) debug_deactivate(timer); timer_stats_hrtimer_clear_start_info(timer); reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases); - /* - * We must preserve the CALLBACK state flag here, - * otherwise we could move the timer base in - * switch_hrtimer_base. - */ - state = timer->state & HRTIMER_STATE_CALLBACK; + + if (!restart) { + /* + * We must preserve the CALLBACK state flag here, + * otherwise we could move the timer base in + * switch_hrtimer_base. + */ + state &= HRTIMER_STATE_CALLBACK; + } __remove_hrtimer(timer, base, state, reprogram); return 1; } @@ -936,7 +939,7 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, base = lock_hrtimer_base(timer, &flags); /* Remove an active timer from the queue: */ - remove_hrtimer(timer, base); + remove_hrtimer(timer, base, true); if (mode & HRTIMER_MODE_REL) { tim = ktime_add_safe(tim, base->get_time()); @@ -1005,7 +1008,7 @@ int hrtimer_try_to_cancel(struct hrtimer *timer) base = lock_hrtimer_base(timer, &flags); if (!hrtimer_callback_running(timer)) - ret = remove_hrtimer(timer, base); + ret = remove_hrtimer(timer, base, false); unlock_hrtimer_base(timer, &flags);

From a7c6f571ff51cc77d90dd54968f7c5c938c43998 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Jun 2015 14:46:46 +0200 Subject: [PATCH 104/106] seqcount: Rename write_seqcount_barrier()

I'll shortly be introducing another seqcount primitive that's useful to provide ordering semantics and would like to use the write_seqcount_barrier() name for that. Seeing how there's only one user of the current primitive, let's rename it to invalidate, as that appears to be what it's doing. While there, employ lockdep_assert_held() instead of assert_spin_locked() to not generate debug code for regular kernels.
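As a side note on the lockdep change, a hedged kernel-side sketch (the function name is hypothetical; the usage mirrors the dcache hunk that follows):

    #include <linux/dcache.h>
    #include <linux/lockdep.h>

    static void touch_dentry_seq(struct dentry *dentry)
    {
            /* Expands to nothing unless CONFIG_LOCKDEP is enabled, so
             * production kernels pay no cost for the assertion: */
            lockdep_assert_held(&dentry->d_lock);

            /* The old assert_spin_locked() form, by contrast, always
             * compiles to a runtime test of the lock (on SMP builds). */

            write_seqcount_invalidate(&dentry->d_seq);
    }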
Signed-off-by: Peter Zijlstra (Intel) Cc: ktkhai@parallels.com Cc: rostedt@goodmis.org Cc: juri.lelli@gmail.com Cc: pang.xunlei@linaro.org Cc: Oleg Nesterov Cc: wanpeng.li@linux.intel.com Cc: Paul McKenney Cc: Al Viro Cc: Linus Torvalds Cc: umgwanakikbuti@gmail.com Link: http://lkml.kernel.org/r/20150611124743.279926217@infradead.org Signed-off-by: Thomas Gleixner --- fs/dcache.c | 16 ++++++++-------- include/linux/seqlock.h | 6 +++--- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 656ce522a218..b43a1694d2ca 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -322,17 +322,17 @@ static void dentry_free(struct dentry *dentry) } /** - * dentry_rcuwalk_barrier - invalidate in-progress rcu-walk lookups + * dentry_rcuwalk_invalidate - invalidate in-progress rcu-walk lookups * @dentry: the target dentry * After this call, in-progress rcu-walk path lookup will fail. This * should be called after unhashing, and after changing d_inode (if * the dentry has not already been unhashed). */ -static inline void dentry_rcuwalk_barrier(struct dentry *dentry) +static inline void dentry_rcuwalk_invalidate(struct dentry *dentry) { - assert_spin_locked(&dentry->d_lock); - /* Go through a barrier */ - write_seqcount_barrier(&dentry->d_seq); + lockdep_assert_held(&dentry->d_lock); + /* Go through am invalidation barrier */ + write_seqcount_invalidate(&dentry->d_seq); } /* @@ -372,7 +372,7 @@ static void dentry_unlink_inode(struct dentry * dentry) struct inode *inode = dentry->d_inode; __d_clear_type_and_inode(dentry); hlist_del_init(&dentry->d_u.d_alias); - dentry_rcuwalk_barrier(dentry); + dentry_rcuwalk_invalidate(dentry); spin_unlock(&dentry->d_lock); spin_unlock(&inode->i_lock); if (!inode->i_nlink) @@ -494,7 +494,7 @@ void __d_drop(struct dentry *dentry) __hlist_bl_del(&dentry->d_hash); dentry->d_hash.pprev = NULL; hlist_bl_unlock(b); - dentry_rcuwalk_barrier(dentry); + dentry_rcuwalk_invalidate(dentry); } } EXPORT_SYMBOL(__d_drop); @@ -1752,7 +1752,7 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode) if (inode) hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry); __d_set_inode_and_type(dentry, inode, add_flags); - dentry_rcuwalk_barrier(dentry); + dentry_rcuwalk_invalidate(dentry); spin_unlock(&dentry->d_lock); fsnotify_d_instantiate(dentry, inode); } diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 5f68d0a391ce..c07e3a536099 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -266,13 +266,13 @@ static inline void write_seqcount_end(seqcount_t *s) } /** - * write_seqcount_barrier - invalidate in-progress read-side seq operations + * write_seqcount_invalidate - invalidate in-progress read-side seq operations * @s: pointer to seqcount_t * - * After write_seqcount_barrier, no read-side seq operations will complete + * After write_seqcount_invalidate, no read-side seq operations will complete * successfully and see data older than this. */ -static inline void write_seqcount_barrier(seqcount_t *s) +static inline void write_seqcount_invalidate(seqcount_t *s) { smp_wmb(); s->sequence+=2; From c4bfa3f5f906aee2e084c5b1fb15caf876338ef8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 17 Jun 2015 14:29:24 +0200 Subject: [PATCH 105/106] seqcount: Introduce raw_write_seqcount_barrier() Introduce raw_write_seqcount_barrier(), a new construct that can be used to provide write barrier semantics in seqcount read loops instead of the usual consistency guarantee. 
raw_write_seqcount_barrier() is equivalent to: raw_write_seqcount_begin(); raw_write_seqcount_end(); but it avoids issuing two back-to-back smp_wmb() instructions.

This construct works because the read side will 'stall' when observing odd values. This means that -- referring to the example in the comment below -- even though there is no (matching) read barrier between the loads of X and Y, we cannot observe !x && !y, because: - if we observe Y == false we must observe the first sequence increment, which makes us loop, until - we observe !(seq & 1) -- the second sequence increment -- at which time we must also observe Y == true.

Suggested-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: umgwanakikbuti@gmail.com Cc: ktkhai@parallels.com Cc: rostedt@goodmis.org Cc: juri.lelli@gmail.com Cc: pang.xunlei@linaro.org Cc: oleg@redhat.com Cc: wanpeng.li@linux.intel.com Cc: Al Viro Cc: Linus Torvalds Cc: Paul E. McKenney Link: http://lkml.kernel.org/r/20150617122924.GP3644@twins.programming.kicks-ass.net Signed-off-by: Thomas Gleixner --- include/linux/seqlock.h | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+)

diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index c07e3a536099..486e685a226a 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -233,6 +233,47 @@ static inline void raw_write_seqcount_end(seqcount_t *s) s->sequence++; } +/** + * raw_write_seqcount_barrier - do a seq write barrier + * @s: pointer to seqcount_t + * + * This can be used to provide an ordering guarantee instead of the + * usual consistency guarantee. It is one wmb cheaper, because we can + * collapse the two back-to-back wmb()s. + * + * seqcount_t seq; + * bool X = true, Y = false; + * + * void read(void) + * { + * bool x, y; + * + * do { + * int s = read_seqcount_begin(&seq); + * + * x = X; y = Y; + * + * } while (read_seqcount_retry(&seq, s)); + * + * BUG_ON(!x && !y); + * } + * + * void write(void) + * { + * Y = true; + * + * raw_write_seqcount_barrier(seq); + * + * X = false; + * } + */ +static inline void raw_write_seqcount_barrier(seqcount_t *s) +{ + s->sequence++; + smp_wmb(); + s->sequence++; +} + /* * raw_write_seqcount_latch - redirect readers to even/odd copy * @s: pointer to seqcount_t

From 887d9dc989eb0154492e41e7c07492edbb088ba1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Jun 2015 14:46:48 +0200 Subject: [PATCH 106/106] hrtimer: Allow hrtimer::function() to free the timer

Currently an hrtimer callback function cannot free its own timer because __run_hrtimer() still needs to clear HRTIMER_STATE_CALLBACK after it. Freeing the timer would result in a clear use-after-free. Solve this by using a scheme similar to regular timers; track the current running timer in hrtimer_cpu_base::running.
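A hedged sketch of the usage pattern this enables (the structure and function names are hypothetical):

    #include <linux/hrtimer.h>
    #include <linux/slab.h>

    struct one_shot {
            struct hrtimer timer;
            /* ... payload ... */
    };

    static enum hrtimer_restart one_shot_fn(struct hrtimer *t)
    {
            struct one_shot *os = container_of(t, struct one_shot, timer);

            /* With ->running tracked in the cpu_base, nothing touches the
             * hrtimer after this callback returns, so a one-shot callback
             * may free the object containing its own timer: */
            kfree(os);
            return HRTIMER_NORESTART;
    }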
Suggested-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: ktkhai@parallels.com Cc: rostedt@goodmis.org Cc: juri.lelli@gmail.com Cc: pang.xunlei@linaro.org Cc: wanpeng.li@linux.intel.com Cc: Al Viro Cc: Linus Torvalds Cc: Paul McKenney Cc: Oleg Nesterov Cc: umgwanakikbuti@gmail.com Link: http://lkml.kernel.org/r/20150611124743.471563047@infradead.org Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 41 ++++++--------- kernel/time/hrtimer.c | 114 ++++++++++++++++++++++++++++++++-------- 2 files changed, 107 insertions(+), 48 deletions(-) diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 2f9e57d3d126..5db055821ef3 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -53,30 +53,25 @@ enum hrtimer_restart { * * 0x00 inactive * 0x01 enqueued into rbtree - * 0x02 callback function running - * 0x04 timer is migrated to another cpu * - * Special cases: - * 0x03 callback function running and enqueued - * (was requeued on another CPU) - * 0x05 timer was migrated on CPU hotunplug + * The callback state is not part of the timer->state because clearing it would + * mean touching the timer after the callback, this makes it impossible to free + * the timer from the callback function. * - * The "callback function running and enqueued" status is only possible on - * SMP. It happens for example when a posix timer expired and the callback + * Therefore we track the callback state in: + * + * timer->base->cpu_base->running == timer + * + * On SMP it is possible to have a "callback function running and enqueued" + * status. It happens for example when a posix timer expired and the callback * queued a signal. Between dropping the lock which protects the posix timer * and reacquiring the base lock of the hrtimer, another CPU can deliver the - * signal and rearm the timer. We have to preserve the callback running state, - * as otherwise the timer could be removed before the softirq code finishes the - * the handling of the timer. - * - * The HRTIMER_STATE_ENQUEUED bit is always or'ed to the current state - * to preserve the HRTIMER_STATE_CALLBACK in the above scenario. + * signal and rearm the timer. * * All state transitions are protected by cpu_base->lock. */ #define HRTIMER_STATE_INACTIVE 0x00 #define HRTIMER_STATE_ENQUEUED 0x01 -#define HRTIMER_STATE_CALLBACK 0x02 /** * struct hrtimer - the basic hrtimer structure @@ -163,6 +158,8 @@ enum hrtimer_base_type { * struct hrtimer_cpu_base - the per cpu clock bases * @lock: lock protecting the base and associated clock bases * and timers + * @seq: seqcount around __run_hrtimer + * @running: pointer to the currently running hrtimer * @cpu: cpu number * @active_bases: Bitfield to mark bases with active timers * @clock_was_set_seq: Sequence counter of clock was set events @@ -184,6 +181,8 @@ enum hrtimer_base_type { */ struct hrtimer_cpu_base { raw_spinlock_t lock; + seqcount_t seq; + struct hrtimer *running; unsigned int cpu; unsigned int active_bases; unsigned int clock_was_set_seq; @@ -391,15 +390,7 @@ extern ktime_t hrtimer_get_remaining(const struct hrtimer *timer); extern u64 hrtimer_get_next_event(void); -/* - * A timer is active, when it is enqueued into the rbtree or the - * callback function is running or it's in the state of being migrated - * to another cpu. 
- */ -static inline int hrtimer_active(const struct hrtimer *timer) -{ - return timer->state != HRTIMER_STATE_INACTIVE; -} +extern bool hrtimer_active(const struct hrtimer *timer); /* * Helper function to check, whether the timer is on one of the queues @@ -415,7 +406,7 @@ static inline int hrtimer_is_queued(struct hrtimer *timer) */ static inline int hrtimer_callback_running(struct hrtimer *timer) { - return timer->state & HRTIMER_STATE_CALLBACK; + return timer->base->cpu_base->running == timer; } /* Forward a hrtimer so it expires after now: */ diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 1604157374d7..f026413de4d6 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -67,6 +67,7 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = { .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock), + .seq = SEQCNT_ZERO(hrtimer_bases.seq), .clock_base = { { @@ -110,6 +111,18 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id) */ #ifdef CONFIG_SMP +/* + * We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base() + * such that hrtimer_callback_running() can unconditionally dereference + * timer->base->cpu_base + */ +static struct hrtimer_cpu_base migration_cpu_base = { + .seq = SEQCNT_ZERO(migration_cpu_base), + .clock_base = { { .cpu_base = &migration_cpu_base, }, }, +}; + +#define migration_base migration_cpu_base.clock_base[0] + /* * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock * means that all timers which are tied to this base via timer->base are @@ -119,8 +132,8 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id) * be found on the lists/queues. * * When the timer's base is locked, and the timer removed from list, it is - * possible to set timer->base = NULL and drop the lock: the timer remains - * locked. + * possible to set timer->base = &migration_base and drop the lock: the timer + * remains locked. */ static struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, @@ -130,7 +143,7 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, for (;;) { base = timer->base; - if (likely(base != NULL)) { + if (likely(base != &migration_base)) { raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); if (likely(base == timer->base)) return base; @@ -194,8 +207,8 @@ again: if (unlikely(hrtimer_callback_running(timer))) return base; - /* See the comment in lock_timer_base() */ - timer->base = NULL; + /* See the comment in lock_hrtimer_base() */ + timer->base = &migration_base; raw_spin_unlock(&base->cpu_base->lock); raw_spin_lock(&new_base->cpu_base->lock); @@ -838,11 +851,7 @@ static int enqueue_hrtimer(struct hrtimer *timer, base->cpu_base->active_bases |= 1 << base->index; - /* - * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the - * state of a possibly running callback. - */ - timer->state |= HRTIMER_STATE_ENQUEUED; + timer->state = HRTIMER_STATE_ENQUEUED; return timerqueue_add(&base->active, &timer->node); } @@ -907,14 +916,9 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool rest timer_stats_hrtimer_clear_start_info(timer); reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases); - if (!restart) { - /* - * We must preserve the CALLBACK state flag here, - * otherwise we could move the timer base in - * switch_hrtimer_base. 
- */ - state &= HRTIMER_STATE_CALLBACK; - } + if (!restart) + state = HRTIMER_STATE_INACTIVE; + __remove_hrtimer(timer, base, state, reprogram); return 1; } @@ -1115,6 +1119,51 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, } EXPORT_SYMBOL_GPL(hrtimer_init); +/* + * A timer is active, when it is enqueued into the rbtree or the + * callback function is running or it's in the state of being migrated + * to another cpu. + * + * It is important for this function to not return a false negative. + */ +bool hrtimer_active(const struct hrtimer *timer) +{ + struct hrtimer_cpu_base *cpu_base; + unsigned int seq; + + do { + cpu_base = READ_ONCE(timer->base->cpu_base); + seq = raw_read_seqcount_begin(&cpu_base->seq); + + if (timer->state != HRTIMER_STATE_INACTIVE || + cpu_base->running == timer) + return true; + + } while (read_seqcount_retry(&cpu_base->seq, seq) || + cpu_base != READ_ONCE(timer->base->cpu_base)); + + return false; +} +EXPORT_SYMBOL_GPL(hrtimer_active); + +/* + * The write_seqcount_barrier()s in __run_hrtimer() split the thing into 3 + * distinct sections: + * + * - queued: the timer is queued + * - callback: the timer is being ran + * - post: the timer is inactive or (re)queued + * + * On the read side we ensure we observe timer->state and cpu_base->running + * from the same section, if anything changed while we looked at it, we retry. + * This includes timer->base changing because sequence numbers alone are + * insufficient for that. + * + * The sequence numbers are required because otherwise we could still observe + * a false negative if the read side got smeared over multiple consequtive + * __run_hrtimer() invocations. + */ + static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, struct hrtimer_clock_base *base, struct hrtimer *timer, ktime_t *now) @@ -1122,10 +1171,21 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, enum hrtimer_restart (*fn)(struct hrtimer *); int restart; - WARN_ON(!irqs_disabled()); + lockdep_assert_held(&cpu_base->lock); debug_deactivate(timer); - __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); + cpu_base->running = timer; + + /* + * Separate the ->running assignment from the ->state assignment. + * + * As with a regular write barrier, this ensures the read side in + * hrtimer_active() cannot observe cpu_base->running == NULL && + * timer->state == INACTIVE. + */ + raw_write_seqcount_barrier(&cpu_base->seq); + + __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0); timer_stats_account_hrtimer(timer); fn = timer->function; @@ -1141,7 +1201,7 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, raw_spin_lock(&cpu_base->lock); /* - * Note: We clear the CALLBACK bit after enqueue_hrtimer and + * Note: We clear the running state after enqueue_hrtimer and * we do not reprogramm the event hardware. Happens either in * hrtimer_start_range_ns() or in hrtimer_interrupt() * @@ -1153,9 +1213,17 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, !(timer->state & HRTIMER_STATE_ENQUEUED)) enqueue_hrtimer(timer, base); - WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK)); + /* + * Separate the ->running assignment from the ->state assignment. + * + * As with a regular write barrier, this ensures the read side in + * hrtimer_active() cannot observe cpu_base->running == NULL && + * timer->state == INACTIVE. 
+ */ + raw_write_seqcount_barrier(&cpu_base->seq); - timer->state &= ~HRTIMER_STATE_CALLBACK; + WARN_ON_ONCE(cpu_base->running != timer); + cpu_base->running = NULL; } static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)