e2f3e35f1f
Before being migrated to a new CPU, a task sees its PELT values synchronized with rq last_update_time. Once done, that same task will also have its sched_avg last_update_time reset. This means the time between the migration and the last clock update will not be accounted for in util_avg and a discontinuity will appear. This issue is amplified by the PELT clock scaling. It currently takes one tick after the CPU goes idle for clock_pelt to catch up with clock_task. This is especially problematic for asymmetric CPU capacity systems which need stable util_avg signals for task placement and energy estimation. Ideally, this problem would be solved by updating the runqueue clocks before the migration. But that would require taking the runqueue lock, which is quite expensive [1]. Instead, estimate the missing time and update the task util_avg with that value. To that end, we need sched_clock_cpu(), but it is a costly function. Limit its usage to the case where the source CPU is idle, as we know this is when the clock is most at risk of being outdated. See the comment in migrate_se_pelt_lag() for more details about how the PELT value is estimated. Note, though, that this estimation doesn't take into account IRQ and Paravirt time. [1] https://lkml.kernel.org/r/20190709115759.10451-1-chris.redpath@arm.com Signed-off-by: Vincent Donnefort <vincent.donnefort@arm.com> Signed-off-by: Vincent Donnefort <vdonnefort@google.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org> Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com> Tested-by: Lukasz Luba <lukasz.luba@arm.com> Link: https://lkml.kernel.org/r/20220621090414.433602-3-vdonnefort@google.com
236 lines
6.0 KiB
C
236 lines
6.0 KiB
C
#ifdef CONFIG_SMP
|
|
#include "sched-pelt.h"
|
|
|
|
/*
 * PELT load-average update entry points (CONFIG_SMP only).
 *
 * Only the prototypes are visible in this header; the definitions live
 * elsewhere.
 */
int __update_load_avg_blocked_se(u64 now, struct sched_entity *se);
|
|
int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se);
|
|
int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq);
|
|
int update_rt_rq_load_avg(u64 now, struct rq *rq, int running);
|
|
int update_dl_rq_load_avg(u64 now, struct rq *rq, int running);
|
|
|
|
#ifdef CONFIG_SCHED_THERMAL_PRESSURE
|
|
/* Declaration only; available when CONFIG_SCHED_THERMAL_PRESSURE is set. */
int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity);
|
|
|
|
static inline u64 thermal_load_avg(struct rq *rq)
|
|
{
|
|
return READ_ONCE(rq->avg_thermal.load_avg);
|
|
}
|
|
#else
|
|
static inline int
|
|
update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity)
|
|
{
|
|
return 0;
|
|
}
|
|
|
|
/* !CONFIG_SCHED_THERMAL_PRESSURE: no thermal signal is tracked, report 0. */
static inline u64 thermal_load_avg(struct rq *rq)
{
	return 0;
}
|
|
#endif
|
|
|
|
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
|
|
/* Declaration only; available when CONFIG_HAVE_SCHED_AVG_IRQ is set. */
int update_irq_load_avg(struct rq *rq, u64 running);
|
|
#else
|
|
static inline int
|
|
update_irq_load_avg(struct rq *rq, u64 running)
|
|
{
|
|
return 0;
|
|
}
|
|
#endif
|
|
|
|
/*
 * Smallest PELT divider value; get_pelt_divider() adds the entity's
 * period_contrib on top of it.
 */
#define PELT_MIN_DIVIDER (LOAD_AVG_MAX - 1024)
|
|
|
|
static inline u32 get_pelt_divider(struct sched_avg *avg)
|
|
{
|
|
return PELT_MIN_DIVIDER + avg->period_contrib;
|
|
}
|
|
|
|
static inline void cfs_se_util_change(struct sched_avg *avg)
|
|
{
|
|
unsigned int enqueued;
|
|
|
|
if (!sched_feat(UTIL_EST))
|
|
return;
|
|
|
|
/* Avoid store if the flag has been already reset */
|
|
enqueued = avg->util_est.enqueued;
|
|
if (!(enqueued & UTIL_AVG_UNCHANGED))
|
|
return;
|
|
|
|
/* Reset flag to report util_avg has been updated */
|
|
enqueued &= ~UTIL_AVG_UNCHANGED;
|
|
WRITE_ONCE(avg->util_est.enqueued, enqueued);
|
|
}
|
|
|
|
static inline u64 rq_clock_pelt(struct rq *rq)
|
|
{
|
|
lockdep_assert_rq_held(rq);
|
|
assert_clock_updated(rq);
|
|
|
|
return rq->clock_pelt - rq->lost_idle_time;
|
|
}
|
|
|
|
/* The rq is idle, we can sync to clock_task */
|
|
static inline void _update_idle_rq_clock_pelt(struct rq *rq)
|
|
{
|
|
rq->clock_pelt = rq_clock_task(rq);
|
|
|
|
u64_u32_store(rq->clock_idle, rq_clock(rq));
|
|
/* Paired with smp_rmb in migrate_se_pelt_lag() */
|
|
smp_wmb();
|
|
u64_u32_store(rq->clock_pelt_idle, rq_clock_pelt(rq));
|
|
}
|
|
|
|
/*
|
|
* The clock_pelt scales the time to reflect the effective amount of
|
|
* computation done during the running delta time but then sync back to
|
|
* clock_task when rq is idle.
|
|
*
|
|
*
|
|
* absolute time | 1| 2| 3| 4| 5| 6| 7| 8| 9|10|11|12|13|14|15|16
|
|
* @ max capacity ------******---------------******---------------
|
|
* @ half capacity ------************---------************---------
|
|
* clock pelt | 1| 2| 3| 4| 7| 8| 9| 10| 11|14|15|16
|
|
*
|
|
*/
|
|
static inline void update_rq_clock_pelt(struct rq *rq, s64 delta)
|
|
{
|
|
if (unlikely(is_idle_task(rq->curr))) {
|
|
_update_idle_rq_clock_pelt(rq);
|
|
return;
|
|
}
|
|
|
|
/*
|
|
* When a rq runs at a lower compute capacity, it will need
|
|
* more time to do the same amount of work than at max
|
|
* capacity. In order to be invariant, we scale the delta to
|
|
* reflect how much work has been really done.
|
|
* Running longer results in stealing idle time that will
|
|
* disturb the load signal compared to max capacity. This
|
|
* stolen idle time will be automatically reflected when the
|
|
* rq will be idle and the clock will be synced with
|
|
* rq_clock_task.
|
|
*/
|
|
|
|
/*
|
|
* Scale the elapsed time to reflect the real amount of
|
|
* computation
|
|
*/
|
|
delta = cap_scale(delta, arch_scale_cpu_capacity(cpu_of(rq)));
|
|
delta = cap_scale(delta, arch_scale_freq_capacity(cpu_of(rq)));
|
|
|
|
rq->clock_pelt += delta;
|
|
}
|
|
|
|
/*
|
|
* When rq becomes idle, we have to check if it has lost idle time
|
|
* because it was fully busy. A rq is fully used when the /Sum util_sum
|
|
* is greater or equal to:
|
|
* (LOAD_AVG_MAX - 1024 + rq->cfs.avg.period_contrib) << SCHED_CAPACITY_SHIFT;
|
|
* For optimization and computing rounding purpose, we don't take into account
|
|
* the position in the current window (period_contrib) and we use the higher
|
|
* bound of util_sum to decide.
|
|
*/
|
|
static inline void update_idle_rq_clock_pelt(struct rq *rq)
|
|
{
|
|
u32 divider = ((LOAD_AVG_MAX - 1024) << SCHED_CAPACITY_SHIFT) - LOAD_AVG_MAX;
|
|
u32 util_sum = rq->cfs.avg.util_sum;
|
|
util_sum += rq->avg_rt.util_sum;
|
|
util_sum += rq->avg_dl.util_sum;
|
|
|
|
/*
|
|
* Reflecting stolen time makes sense only if the idle
|
|
* phase would be present at max capacity. As soon as the
|
|
* utilization of a rq has reached the maximum value, it is
|
|
* considered as an always running rq without idle time to
|
|
* steal. This potential idle time is considered as lost in
|
|
* this case. We keep track of this lost idle time compare to
|
|
* rq's clock_task.
|
|
*/
|
|
if (util_sum >= divider)
|
|
rq->lost_idle_time += rq_clock_task(rq) - rq->clock_pelt;
|
|
|
|
_update_idle_rq_clock_pelt(rq);
|
|
}
|
|
|
|
#ifdef CONFIG_CFS_BANDWIDTH
|
|
static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
|
|
{
|
|
u64 throttled;
|
|
|
|
if (unlikely(cfs_rq->throttle_count))
|
|
throttled = U64_MAX;
|
|
else
|
|
throttled = cfs_rq->throttled_clock_pelt_time;
|
|
|
|
u64_u32_store(cfs_rq->throttled_pelt_idle, throttled);
|
|
}
|
|
|
|
/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
|
|
static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
|
|
{
|
|
if (unlikely(cfs_rq->throttle_count))
|
|
return cfs_rq->throttled_clock_pelt - cfs_rq->throttled_clock_pelt_time;
|
|
|
|
return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_pelt_time;
|
|
}
|
|
#else
|
|
/* !CONFIG_CFS_BANDWIDTH: there is no throttling state to record, no-op. */
static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { }
|
|
/*
 * !CONFIG_CFS_BANDWIDTH: the cfs_rq PELT clock is the PELT clock of the rq
 * that owns @cfs_rq.
 */
static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
{
	struct rq *rq = rq_of(cfs_rq);

	return rq_clock_pelt(rq);
}
|
|
#endif
|
|
|
|
#else
|
|
|
|
static inline int
|
|
update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
|
|
{
|
|
return 0;
|
|
}
|
|
|
|
static inline int
|
|
update_rt_rq_load_avg(u64 now, struct rq *rq, int running)
|
|
{
|
|
return 0;
|
|
}
|
|
|
|
static inline int
|
|
update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
|
|
{
|
|
return 0;
|
|
}
|
|
|
|
static inline int
|
|
update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity)
|
|
{
|
|
return 0;
|
|
}
|
|
|
|
/* !CONFIG_SMP: no thermal signal is tracked, report 0. */
static inline u64 thermal_load_avg(struct rq *rq)
{
	return 0;
}
|
|
|
|
static inline int
|
|
update_irq_load_avg(struct rq *rq, u64 running)
|
|
{
|
|
return 0;
|
|
}
|
|
|
|
/* !CONFIG_SMP: the PELT clock is simply the rq's task clock. */
static inline u64 rq_clock_pelt(struct rq *rq)
{
	u64 task_now = rq_clock_task(rq);

	return task_now;
}
|
|
|
|
static inline void
|
|
update_rq_clock_pelt(struct rq *rq, s64 delta) { }
|
|
|
|
/* !CONFIG_SMP: no separate PELT clock is maintained, no-op. */
static inline void update_idle_rq_clock_pelt(struct rq *rq)
{
}
|
|
|
|
/* !CONFIG_SMP: no-op stub. */
static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { }
|
|
#endif
|
|
|
|
|