Timer subsystem changes for v6.8:
 - Various preparatory cleanups & enhancements of the timer-wheel code,
   in preparation for the WIP 'pull timers at expiry' timer migration
   model series (which will replace the current 'push timers at enqueue'
   migration model), by Anna-Maria Behnsen:

     - Update comments and clean up confusing variable names
     - Add debug check to warn about time travel
     - Improve/expand timer-wheel tracepoints
     - Optimize away unnecessary IPIs for deferrable timers
     - Restructure & clean up next_expiry_recalc()
     - Clean up forward_timer_base()
     - Introduce __forward_timer_base() and use it to simplify and
       micro-optimize get_next_timer_interrupt()
     - Restructure get_next_timer_interrupt()'s idle logic for better
       readability and to enable a minor optimization

 - Fix the nextevt calculation when no timers are pending

 - Fix the sysfs_get_uname() prototype declaration

Signed-off-by: Ingo Molnar <mingo@kernel.org>

Merge tag 'timers-core-2024-01-08' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull timer subsystem updates from Ingo Molnar (same summary as above).

* tag 'timers-core-2024-01-08' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  timers: Fix nextevt calculation when no timers are pending
  timers: Rework idle logic
  timers: Use already existing function for forwarding timer base
  timers: Split out forward timer base functionality
  timers: Clarify check in forward_timer_base()
  timers: Move store of next event into __next_timer_interrupt()
  timers: Do not IPI for deferrable timers
  tracing/timers: Add tracepoint for tracking timer base is_idle flag
  tracing/timers: Enhance timer_start tracepoint
  tick-sched: Warn when next tick seems to be in the past
  tick/sched: Cleanup confusing variables
  tick-sched: Fix function names in comments
  time: Make sysfs_get_uname() function visible in header
commit f24dc33f8e
@@ -46,22 +46,21 @@ DEFINE_EVENT(timer_class, timer_init,
 
 /**
  * timer_start - called when the timer is started
- * @timer:	pointer to struct timer_list
- * @expires:	the timers expiry time
- * @flags:	the timers flags
+ * @timer:		pointer to struct timer_list
+ * @bucket_expiry:	the bucket expiry time
  */
 TRACE_EVENT(timer_start,
 
	TP_PROTO(struct timer_list *timer,
-		unsigned long expires,
-		unsigned int flags),
+		 unsigned long bucket_expiry),
 
-	TP_ARGS(timer, expires, flags),
+	TP_ARGS(timer, bucket_expiry),
 
	TP_STRUCT__entry(
		__field( void *, timer )
		__field( void *, function )
		__field( unsigned long, expires )
+		__field( unsigned long, bucket_expiry )
		__field( unsigned long, now )
		__field( unsigned int, flags )
	),
@@ -69,15 +68,16 @@ TRACE_EVENT(timer_start,
	TP_fast_assign(
		__entry->timer = timer;
		__entry->function = timer->function;
-		__entry->expires = expires;
+		__entry->expires = timer->expires;
+		__entry->bucket_expiry = bucket_expiry;
		__entry->now = jiffies;
-		__entry->flags = flags;
+		__entry->flags = timer->flags;
	),
 
-	TP_printk("timer=%p function=%ps expires=%lu [timeout=%ld] cpu=%u idx=%u flags=%s",
+	TP_printk("timer=%p function=%ps expires=%lu [timeout=%ld] bucket_expiry=%lu cpu=%u idx=%u flags=%s",
		  __entry->timer, __entry->function, __entry->expires,
		  (long)__entry->expires - __entry->now,
-		  __entry->flags & TIMER_CPUMASK,
+		  __entry->bucket_expiry, __entry->flags & TIMER_CPUMASK,
		  __entry->flags >> TIMER_ARRAYSHIFT,
		  decode_timer_flags(__entry->flags & TIMER_TRACE_FLAGMASK))
 );
@@ -142,6 +142,26 @@ DEFINE_EVENT(timer_class, timer_cancel,
	TP_ARGS(timer)
 );
 
+TRACE_EVENT(timer_base_idle,
+
+	TP_PROTO(bool is_idle, unsigned int cpu),
+
+	TP_ARGS(is_idle, cpu),
+
+	TP_STRUCT__entry(
+		__field( bool, is_idle )
+		__field( unsigned int, cpu )
+	),
+
+	TP_fast_assign(
+		__entry->is_idle = is_idle;
+		__entry->cpu = cpu;
+	),
+
+	TP_printk("is_idle=%d cpu=%d",
+		  __entry->is_idle, __entry->cpu)
+);
+
 #define decode_clockid(type) \
	__print_symbolic(type, \
		{ CLOCK_REALTIME, "CLOCK_REALTIME" }, \
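These hunks appear to come from the timer tracepoint definitions (include/trace/events/timer.h). The TRACE_EVENT() block above only declares the new timer_base_idle event; it fires via the trace_timer_base_idle() calls added in the timer-wheel hunks further down. A minimal sketch of how built-in kernel code could attach a probe to it follows; it is not part of this series, the probe and initcall names are made up, and only the register_trace_<name>() convention generated by TRACE_EVENT() is assumed:

/* Sketch only: attach a probe to the new timer_base_idle tracepoint from
 * built-in code. Probe callbacks receive the registration cookie first,
 * followed by the TP_PROTO() arguments. */
#include <linux/init.h>
#include <linux/printk.h>
#include <linux/types.h>
#include <trace/events/timer.h>

static void probe_timer_base_idle(void *data, bool is_idle, unsigned int cpu)
{
	pr_debug("timer base of CPU %u went %s\n",
		 cpu, is_idle ? "idle" : "busy");
}

static int __init timer_base_idle_probe_init(void)
{
	return register_trace_timer_base_idle(probe_timer_base_idle, NULL);
}
late_initcall(timer_base_idle_probe_init);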
@@ -56,7 +56,6 @@ extern int clockevents_program_event(struct clock_event_device *dev,
				     ktime_t expires, bool force);
 extern void clockevents_handle_noop(struct clock_event_device *dev);
 extern int __clockevents_update_freq(struct clock_event_device *dev, u32 freq);
-extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt);
 
 /* Broadcasting support */
 # ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
@@ -197,3 +196,5 @@ void hrtimers_resume_local(void);
 #else
 #define JIFFIES_SHIFT 8
 #endif
+
+extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt);
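These two hunks appear to be the "time: Make sysfs_get_uname() function visible in header" fix: the prototype moves to the end of kernel/time/tick-internal.h (file name inferred from the context lines) so it is no longer hidden inside a conditional block. sysfs_get_uname() copies a user-supplied name from a sysfs write into a fixed-size buffer, stripping the trailing newline. The sketch below shows the kind of store handler that relies on it; only the sysfs_get_uname() signature and CS_NAME_LEN are real kernel symbols, the attribute and surrounding logic are hypothetical:

#include <linux/clocksource.h>	/* CS_NAME_LEN */
#include <linux/device.h>
#include "tick-internal.h"	/* assumes the sketch lives in kernel/time/ */

/* Hypothetical sysfs 'store' callback sketch. */
static ssize_t example_override_store(struct device *dev,
				      struct device_attribute *attr,
				      const char *buf, size_t count)
{
	char name[CS_NAME_LEN];
	ssize_t ret;

	ret = sysfs_get_uname(buf, name, count);
	if (ret < 0)
		return ret;

	/* ... look up the device called 'name' and switch to it ... */

	return count;
}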
@@ -839,6 +839,10 @@ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
		ts->next_timer = next_tick;
	}
 
+	/* Make sure next_tick is never before basemono! */
+	if (WARN_ON_ONCE(basemono > next_tick))
+		next_tick = basemono;
+
	/*
	 * If the tick is due in the next period, keep it ticking or
	 * force prod the timer.
@@ -887,7 +891,6 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
	struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
	u64 basemono = ts->timer_expires_base;
	u64 expires = ts->timer_expires;
-	ktime_t tick = expires;
 
	/* Make sure we won't be trying to stop it twice in a row. */
	ts->timer_expires_base = 0;
@@ -910,7 +913,7 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
	/* Skip reprogram of event if it's not changed */
	if (ts->tick_stopped && (expires == ts->next_tick)) {
		/* Sanity check: make sure clockevent is actually programmed */
-		if (tick == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer))
+		if (expires == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer))
			return;
 
		WARN_ON_ONCE(1);
@@ -920,11 +923,11 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
	}
 
	/*
-	 * nohz_stop_sched_tick() can be called several times before
-	 * nohz_restart_sched_tick() is called. This happens when
-	 * interrupts arrive which do not cause a reschedule. In the
-	 * first call we save the current tick time, so we can restart
-	 * the scheduler tick in nohz_restart_sched_tick().
+	 * tick_nohz_stop_tick() can be called several times before
+	 * tick_nohz_restart_sched_tick() is called. This happens when
+	 * interrupts arrive which do not cause a reschedule. In the first
+	 * call we save the current tick time, so we can restart the
+	 * scheduler tick in tick_nohz_restart_sched_tick().
	 */
	if (!ts->tick_stopped) {
		calc_load_nohz_start();
@@ -935,7 +938,7 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
		trace_tick_stop(1, TICK_DEP_MASK_NONE);
	}
 
-	ts->next_tick = tick;
+	ts->next_tick = expires;
 
	/*
	 * If the expiration time == KTIME_MAX, then we simply stop
@@ -950,11 +953,11 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
	}
 
	if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
-		hrtimer_start(&ts->sched_timer, tick,
+		hrtimer_start(&ts->sched_timer, expires,
			      HRTIMER_MODE_ABS_PINNED_HARD);
	} else {
-		hrtimer_set_expires(&ts->sched_timer, tick);
-		tick_program_event(tick, 1);
+		hrtimer_set_expires(&ts->sched_timer, expires);
+		tick_program_event(expires, 1);
	}
 }
 
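The first hunk above is the "Warn when next tick seems to be in the past" debug check: tick_nohz_next_event() must never hand back an expiry below basemono (the current base time), otherwise the clockevent would be programmed into the past. A standalone plain-C sketch of the clamping behaviour, with made-up nanosecond values:

#include <stdint.h>
#include <stdio.h>

/* Plain C sketch, not kernel code: clamp a candidate expiry to "now". */
static uint64_t clamp_next_event(uint64_t basemono, uint64_t next_tick)
{
	return next_tick < basemono ? basemono : next_tick;
}

int main(void)
{
	uint64_t now   = 2000000000ULL;	/* hypothetical "now": 2 s in ns */
	uint64_t stale = 1999000000ULL;	/* 1 ms in the past */

	/* prints 2000000000: the stale expiry is pulled up to "now" */
	printf("%llu\n", (unsigned long long)clamp_next_event(now, stale));
	return 0;
}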
@@ -571,18 +571,15 @@ static int calc_wheel_index(unsigned long expires, unsigned long clk,
 static void
 trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer)
 {
-	if (!is_timers_nohz_active())
-		return;
-
	/*
-	 * TODO: This wants some optimizing similar to the code below, but we
-	 * will do that when we switch from push to pull for deferrable timers.
+	 * Deferrable timers do not prevent the CPU from entering dynticks and
+	 * are not taken into account on the idle/nohz_full path. An IPI when a
+	 * new deferrable timer is enqueued will wake up the remote CPU but
+	 * nothing will be done with the deferrable timer base. Therefore skip
+	 * the remote IPI for deferrable timers completely.
	 */
-	if (timer->flags & TIMER_DEFERRABLE) {
-		if (tick_nohz_full_cpu(base->cpu))
-			wake_up_nohz_cpu(base->cpu);
+	if (!is_timers_nohz_active() || timer->flags & TIMER_DEFERRABLE)
		return;
-	}
 
	/*
	 * We might have to IPI the remote CPU if the base is idle and the
@@ -606,7 +603,7 @@ static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
	__set_bit(idx, base->pending_map);
	timer_set_idx(timer, idx);
 
-	trace_timer_start(timer, timer->expires, timer->flags);
+	trace_timer_start(timer, bucket_expiry);
 
	/*
	 * Check whether this is the new first expiring timer. The
|
||||
return get_timer_this_cpu_base(tflags);
|
||||
}
|
||||
|
||||
static inline void forward_timer_base(struct timer_base *base)
|
||||
static inline void __forward_timer_base(struct timer_base *base,
|
||||
unsigned long basej)
|
||||
{
|
||||
unsigned long jnow = READ_ONCE(jiffies);
|
||||
|
||||
/*
|
||||
* No need to forward if we are close enough below jiffies.
|
||||
* Also while executing timers, base->clk is 1 offset ahead
|
||||
* of jiffies to avoid endless requeuing to current jiffies.
|
||||
* Check whether we can forward the base. We can only do that when
|
||||
* @basej is past base->clk otherwise we might rewind base->clk.
|
||||
*/
|
||||
if ((long)(jnow - base->clk) < 1)
|
||||
if (time_before_eq(basej, base->clk))
|
||||
return;
|
||||
|
||||
/*
|
||||
* If the next expiry value is > jiffies, then we fast forward to
|
||||
* jiffies otherwise we forward to the next expiry value.
|
||||
*/
|
||||
if (time_after(base->next_expiry, jnow)) {
|
||||
base->clk = jnow;
|
||||
if (time_after(base->next_expiry, basej)) {
|
||||
base->clk = basej;
|
||||
} else {
|
||||
if (WARN_ON_ONCE(time_before(base->next_expiry, base->clk)))
|
||||
return;
|
||||
base->clk = base->next_expiry;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
static inline void forward_timer_base(struct timer_base *base)
|
||||
{
|
||||
__forward_timer_base(base, READ_ONCE(jiffies));
|
||||
}
|
||||
|
||||
/*
|
||||
* We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means
|
||||
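The clarified check uses the wrap-safe jiffies helpers instead of an open-coded signed subtraction, so time_before_eq(basej, base->clk) reads as "basej has not moved past base->clk" and stays correct across a jiffies wraparound. For reference, these helpers are defined in include/linux/jiffies.h essentially as follows (simplified, the typecheck() wrappers are omitted):

/* Simplified from include/linux/jiffies.h: comparisons are done on the
 * signed difference, which keeps them correct when jiffies wraps around. */
#define time_after(a, b)	((long)((b) - (a)) < 0)
#define time_before(a, b)	time_after(b, a)
#define time_after_eq(a, b)	((long)((a) - (b)) >= 0)
#define time_before_eq(a, b)	time_after_eq(b, a)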
@@ -1803,8 +1803,10 @@ static int next_pending_bucket(struct timer_base *base, unsigned offset,
 /*
  * Search the first expiring timer in the various clock levels. Caller must
  * hold base->lock.
+ *
+ * Store next expiry time in base->next_expiry.
  */
-static unsigned long __next_timer_interrupt(struct timer_base *base)
+static void next_expiry_recalc(struct timer_base *base)
 {
	unsigned long clk, next, adj;
	unsigned lvl, offset = 0;
@@ -1870,10 +1872,9 @@ static unsigned long __next_timer_interrupt(struct timer_base *base)
		clk += adj;
	}
 
+	base->next_expiry = next;
	base->next_expiry_recalc = false;
	base->timers_pending = !(next == base->clk + NEXT_TIMER_MAX_DELTA);
-
-	return next;
 }
 
 #ifdef CONFIG_NO_HZ_COMMON
@@ -1921,8 +1922,9 @@ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires)
 u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 {
	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
+	unsigned long nextevt = basej + NEXT_TIMER_MAX_DELTA;
	u64 expires = KTIME_MAX;
-	unsigned long nextevt;
+	bool was_idle;
 
	/*
	 * Pretend that there is no timer pending if the cpu is offline.
@@ -1933,37 +1935,44 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 
	raw_spin_lock(&base->lock);
	if (base->next_expiry_recalc)
-		base->next_expiry = __next_timer_interrupt(base);
-	nextevt = base->next_expiry;
+		next_expiry_recalc(base);
 
	/*
	 * We have a fresh next event. Check whether we can forward the
-	 * base. We can only do that when @basej is past base->clk
-	 * otherwise we might rewind base->clk.
+	 * base.
	 */
-	if (time_after(basej, base->clk)) {
-		if (time_after(nextevt, basej))
-			base->clk = basej;
-		else if (time_after(nextevt, base->clk))
-			base->clk = nextevt;
+	__forward_timer_base(base, basej);
+
+	if (base->timers_pending) {
+		nextevt = base->next_expiry;
+
+		/* If we missed a tick already, force 0 delta */
+		if (time_before(nextevt, basej))
+			nextevt = basej;
+		expires = basem + (u64)(nextevt - basej) * TICK_NSEC;
+	} else {
+		/*
+		 * Move next_expiry for the empty base into the future to
+		 * prevent a unnecessary raise of the timer softirq when the
+		 * next_expiry value will be reached even if there is no timer
+		 * pending.
+		 */
+		base->next_expiry = nextevt;
	}
 
-	if (time_before_eq(nextevt, basej)) {
-		expires = basem;
-		base->is_idle = false;
-	} else {
-		if (base->timers_pending)
-			expires = basem + (u64)(nextevt - basej) * TICK_NSEC;
-		/*
-		 * If we expect to sleep more than a tick, mark the base idle.
-		 * Also the tick is stopped so any added timer must forward
-		 * the base clk itself to keep granularity small. This idle
-		 * logic is only maintained for the BASE_STD base, deferrable
-		 * timers may still see large granularity skew (by design).
-		 */
-		if ((expires - basem) > TICK_NSEC)
-			base->is_idle = true;
-	}
+	/*
+	 * Base is idle if the next event is more than a tick away.
+	 *
+	 * If the base is marked idle then any timer add operation must forward
+	 * the base clk itself to keep granularity small. This idle logic is
+	 * only maintained for the BASE_STD base, deferrable timers may still
+	 * see large granularity skew (by design).
+	 */
+	was_idle = base->is_idle;
+	base->is_idle = time_after(nextevt, basej + 1);
+	if (was_idle != base->is_idle)
+		trace_timer_base_idle(base->is_idle, base->cpu);
+
	raw_spin_unlock(&base->lock);
 
	return cmp_next_hrtimer_event(basem, expires);
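The reworked get_next_timer_interrupt() converts the distance to the next pending timer from jiffies into an absolute nanosecond expiry with expires = basem + (u64)(nextevt - basej) * TICK_NSEC, and marks the base idle only when that event is more than one tick away. A standalone worked example of the conversion; HZ and the timestamps are assumptions chosen for illustration:

#include <stdint.h>
#include <stdio.h>

#define HZ		250ULL				/* assumed config */
#define NSEC_PER_SEC	1000000000ULL
#define TICK_NSEC	((NSEC_PER_SEC + HZ / 2) / HZ)	/* 4 ms per tick at HZ=250 */

int main(void)
{
	uint64_t basem = 5000000000ULL;		/* "now" in ns, made up */
	uint64_t basej = 1000, nextevt = 1003;	/* jiffies values, made up */
	uint64_t expires = basem + (nextevt - basej) * TICK_NSEC;

	/* 3 ticks away -> the CPU may sleep ~12 ms before the next timer */
	printf("expires=%llu ns (+%llu ns)\n",
	       (unsigned long long)expires,
	       (unsigned long long)(expires - basem));
	return 0;
}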
@@ -1984,7 +1993,10 @@ void timer_clear_idle(void)
	 * sending the IPI a few instructions smaller for the cost of taking
	 * the lock in the exit from idle path.
	 */
-	base->is_idle = false;
+	if (base->is_idle) {
+		base->is_idle = false;
+		trace_timer_base_idle(false, smp_processor_id());
+	}
 }
 #endif
 
@@ -2015,8 +2027,12 @@ static inline void __run_timers(struct timer_base *base)
		 */
		WARN_ON_ONCE(!levels && !base->next_expiry_recalc
			     && base->timers_pending);
+		/*
+		 * While executing timers, base->clk is set 1 offset ahead of
+		 * jiffies to avoid endless requeuing to current jiffies.
+		 */
		base->clk++;
-		base->next_expiry = __next_timer_interrupt(base);
+		next_expiry_recalc(base);
 
		while (levels--)
			expire_timers(base, heads + levels);