2013-04-19 15:10:49 -04:00
/*
* kernel / sched / proc . c
*
* Kernel load calculations , forked from sched / core . c
*/
# include <linux/export.h>
# include "sched.h"
/*
* Global load - average calculations
*
* We take a distributed and async approach to calculating the global load - avg
* in order to minimize overhead .
*
* The global load average is an exponentially decaying average of nr_running +
* nr_uninterruptible .
*
* Once every LOAD_FREQ :
*
*   nr_active = 0;
*   for_each_possible_cpu(cpu)
*       nr_active += cpu_rq(cpu)->nr_running + cpu_rq(cpu)->nr_uninterruptible;
*
*   avenrun[n] = avenrun[n] * exp_n + nr_active * (1 - exp_n)
*
* Due to a number of reasons the above turns into the mess below:
*
* - for_each_possible_cpu ( ) is prohibitively expensive on machines with
* serious number of cpus , therefore we need to take a distributed approach
* to calculating nr_active .
*
*      \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
*                    = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_{j-1}) }
*
* So assuming nr_active : = 0 when we start out - - true per definition , we
* can simply take per - cpu deltas and fold those into a global accumulate
* to obtain the same result . See calc_load_fold_active ( ) .
*
* Furthermore , in order to avoid synchronizing all per - cpu delta folding
* across the machine , we assume 10 ticks is sufficient time for every
* cpu to have completed this task .
*
* This places an upper-bound on the IRQ-off latency of the machine. Then
* again, being late doesn't lose the delta, just wrecks the sample.
*
* - cpu_rq ( ) - > nr_uninterruptible isn ' t accurately tracked per - cpu because
* this would add another cross - cpu cacheline miss and atomic operation
* to the wakeup path . Instead we increment on whatever cpu the task ran
* when it went into uninterruptible state and decrement on whatever cpu
* did the wakeup . This means that only the sum of nr_uninterruptible over
* all cpus yields the correct result .
*
* This covers the NO_HZ = n code , for extra head - aches , see the comment below .
*/
/* Variables and functions for calc_load */
/*
 * Machine-wide count of active tasks, built up from the per-runqueue deltas
 * added by calc_load_account_active() and the idle deltas added by
 * calc_global_load().
 */
atomic_long_t calc_load_tasks ;
/*
 * Jiffies value at which the current sampling window opens; moved forward in
 * LOAD_FREQ steps by calc_global_load() and calc_global_nohz().
 */
unsigned long calc_load_update ;
/*
 * The three load averages, decayed with EXP_1, EXP_5 and EXP_15 respectively.
 * Samples are scaled by FIXED_1 before being averaged in.
 */
unsigned long avenrun [ 3 ] ;
EXPORT_SYMBOL ( avenrun ) ; /* should be removed */
/**
 * get_avenrun - copy out the three load averages
 * @loads:	destination array, receives three entries
 * @offset:	bias added to every average before shifting
 * @shift:	number of bits each biased average is shifted left
 *
 * The averages are only estimates, so they are read without locking.
 */
void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
{
	int i;

	for (i = 0; i < 3; i++)
		loads[i] = (avenrun[i] + offset) << shift;
}
long calc_load_fold_active ( struct rq * this_rq )
{
long nr_active , delta = 0 ;
nr_active = this_rq - > nr_running ;
nr_active + = ( long ) this_rq - > nr_uninterruptible ;
if ( nr_active ! = this_rq - > calc_load_active ) {
delta = nr_active - this_rq - > calc_load_active ;
this_rq - > calc_load_active = nr_active ;
}
return delta ;
}
/*
* a1 = a0 * e + a * ( 1 - e )
*/
static unsigned long
calc_load ( unsigned long load , unsigned long exp , unsigned long active )
{
load * = exp ;
load + = active * ( FIXED_1 - exp ) ;
load + = 1UL < < ( FSHIFT - 1 ) ;
return load > > FSHIFT ;
}
# ifdef CONFIG_NO_HZ_COMMON
/*
* Handle NO_HZ for the global load - average .
*
* Since the above described distributed algorithm to compute the global
* load - average relies on per - cpu sampling from the tick , it is affected by
* NO_HZ .
*
* The basic idea is to fold the nr_active delta into a global idle - delta upon
* entering NO_HZ state such that we can include this as an ' extra ' cpu delta
* when we read the global state .
*
* Obviously reality has to ruin such a delightfully simple scheme :
*
* - When we go NO_HZ idle during the window , we can negate our sample
* contribution , causing under - accounting .
*
* We avoid this by keeping two idle - delta counters and flipping them
* when the window starts , thus separating old and new NO_HZ load .
*
* The only trick is the slight shift in index flip for read vs write .
*
*      0s            5s            10s           15s
*        +10           +10           +10           +10
*        |-|-----------|-|-----------|-|-----------|-|
*    r:0 0 1           1 0           0 1           1 0
*    w:0 1 1           0 0           1 1           0 0
*
* This ensures we'll fold the old idle contribution in this window while
* accumulating the new one.
*
* - When we wake up from NO_HZ idle during the window , we push up our
* contribution , since we effectively move our sample point to a known
* busy state .
*
* This is solved by pushing the window forward , and thus skipping the
* sample , for this cpu ( effectively using the idle - delta for this cpu which
* was in effect at the time the window opened ) . This also solves the issue
* of having to deal with a cpu having been in NOHZ idle for multiple
* LOAD_FREQ intervals .
*
* When making the ILB scale , we should try to pull this in as well .
*/
/*
 * Two idle-delta accumulators.  CPUs entering NO_HZ idle add their pending
 * delta to the slot picked by calc_load_write_idx(); calc_load_fold_idle()
 * drains the slot picked by calc_load_read_idx().
 */
static atomic_long_t calc_load_idle [ 2 ] ;
/*
 * Flip counter, incremented by calc_global_nohz().  Only its lowest bit is
 * used to select a slot.
 */
static int calc_load_idx ;
/*
 * calc_load_write_idx - pick the calc_load_idle[] slot idle entries add to
 *
 * Returns 0 or 1.  This is the read index, advanced by one once jiffies has
 * reached calc_load_update, so that deltas arriving after the window opened
 * land in the other slot.
 */
static inline int calc_load_write_idx ( void )
{
int idx = calc_load_idx ;
/*
 * See calc_global_nohz(): it writes the new update time before flipping
 * the index.  Reading the index first and the time after this barrier
 * means that if we observe the new index, we also observe the new time.
 */
smp_rmb ( ) ;
/*
 * If the folding window started, make sure we start writing in the
 * next idle-delta.
 */
if ( ! time_before ( jiffies , calc_load_update ) )
idx + + ;
return idx & 1 ;
}
/*
 * calc_load_read_idx - pick the calc_load_idle[] slot to be drained
 *
 * Returns the lowest bit of the flip counter, i.e. 0 or 1.
 */
static inline int calc_load_read_idx(void)
{
	int idx = calc_load_idx;

	return idx & 1;
}
/*
 * calc_load_enter_idle - account this CPU's pending delta before NO_HZ idle
 *
 * Folds the current runqueue's active delta and, if it is non-zero, adds it
 * to the idle-delta slot currently open for writing.
 */
void calc_load_enter_idle(void)
{
	struct rq *rq = this_rq();
	long pending;

	/* Nothing changed since the last fold: no idle delta to record. */
	pending = calc_load_fold_active(rq);
	if (!pending)
		return;

	atomic_long_add(pending, &calc_load_idle[calc_load_write_idx()]);
}
void calc_load_exit_idle ( void )
{
struct rq * this_rq = this_rq ( ) ;
/*
* If we ' re still before the sample window , we ' re done .
*/
if ( time_before ( jiffies , this_rq - > calc_load_update ) )
return ;
/*
* We woke inside or after the sample window , this means we ' re already
* accounted through the nohz accounting , so skip the entire deal and
* sync up for the next window .
*/
this_rq - > calc_load_update = calc_load_update ;
if ( time_before ( jiffies , this_rq - > calc_load_update + 10 ) )
this_rq - > calc_load_update + = LOAD_FREQ ;
}
static long calc_load_fold_idle ( void )
{
int idx = calc_load_read_idx ( ) ;
long delta = 0 ;
if ( atomic_long_read ( & calc_load_idle [ idx ] ) )
delta = atomic_long_xchg ( & calc_load_idle [ idx ] , 0 ) ;
return delta ;
}
/**
 * fixed_power_int - compute: x^n, in O(log n) time
 *
 * @x:         base of the power, in fixed-point with @frac_bits fraction bits
 * @frac_bits: fractional bits of @x; must be smaller than the bit width of
 *             unsigned long
 * @n:         power to raise @x to
 *
 * Square-and-multiply over the binary digits of @n:
 * n := \Sum n_i * 2^i (n_i \elem {0, 1}), hence
 * x^n := \Prod x^(n_i * 2^i), which needs one squaring per bit of @n and
 * one extra multiplication per set bit.
 *
 * Every product is rounded to nearest before being scaled back.  With
 * @frac_bits == 0 there is no fraction to round, so the rounding term is 0
 * and the function degenerates to plain integer exponentiation.
 *
 * Returns x^n in the same fixed-point format; for @n == 0 that is 1.0,
 * i.e. 1 << @frac_bits.
 */
static unsigned long
fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
{
	/* One half in fixed-point; shifting by (0 - 1) would be undefined. */
	const unsigned long half = frac_bits ? 1UL << (frac_bits - 1) : 0;
	unsigned long result = 1UL << frac_bits;

	while (n) {
		if (n & 1)
			result = (result * x + half) >> frac_bits;

		n >>= 1;
		if (!n)
			break;	/* skip the final, unused squaring */

		x = (x * x + half) >> frac_bits;
	}

	return result;
}
/*
* a1 = a0 * e + a * ( 1 - e )
*
* a2 = a1 * e + a * ( 1 - e )
* = ( a0 * e + a * ( 1 - e ) ) * e + a * ( 1 - e )
* = a0 * e ^ 2 + a * ( 1 - e ) * ( 1 + e )
*
* a3 = a2 * e + a * ( 1 - e )
* = ( a0 * e ^ 2 + a * ( 1 - e ) * ( 1 + e ) ) * e + a * ( 1 - e )
* = a0 * e ^ 3 + a * ( 1 - e ) * ( 1 + e + e ^ 2 )
*
* . . .
*
* an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^(n-1)) [1]
*    = a0 * e^n + a * (1 - e) * (1 - e^n) / (1 - e)
*    = a0 * e^n + a * (1 - e^n)
*
* [1] application of the geometric series:
*
*              n         1 - x^(n+1)
*     S_n := \Sum x^i = -------------
*             i=0          1 - x
*/
/*
 * calc_load_n - advance a load average by @n intervals in one step
 *
 * Raises @exp to the @n-th power in FSHIFT fixed-point and applies a single
 * calc_load() step with that combined decay factor.
 */
static unsigned long
calc_load_n(unsigned long load, unsigned long exp,
	    unsigned long active, unsigned int n)
{
	unsigned long exp_n = fixed_power_int(exp, FSHIFT, n);

	return calc_load(load, exp_n, active);
}
/*
 * NO_HZ can leave us missing all per-cpu ticks calling
 * calc_load_account_active(), but since an idle CPU folds its delta into
 * calc_load_idle[] per calc_load_enter_idle(), the caller only needs to fold
 * in the pending idle delta before getting here.
 *
 * This function then handles the case where more than one LOAD_FREQ interval
 * went by: it applies the exponential weights adjusted to the number of
 * cycles missed and moves calc_load_update past them.  It always finishes by
 * flipping the idle index.
 */
static void calc_global_nohz ( void )
{
long delta , active , n ;
/* The caller already advanced calc_load_update by one LOAD_FREQ. */
if ( ! time_before ( jiffies , calc_load_update + 10 ) ) {
/*
 * Catch-up, fold however many we are behind still
 */
delta = jiffies - calc_load_update - 10 ;
n = 1 + ( delta / LOAD_FREQ ) ;
active = atomic_long_read ( & calc_load_tasks ) ;
/* A negative count is treated as no load; otherwise scale by FIXED_1. */
active = active > 0 ? active * FIXED_1 : 0 ;
avenrun [ 0 ] = calc_load_n ( avenrun [ 0 ] , EXP_1 , active , n ) ;
avenrun [ 1 ] = calc_load_n ( avenrun [ 1 ] , EXP_5 , active , n ) ;
avenrun [ 2 ] = calc_load_n ( avenrun [ 2 ] , EXP_15 , active , n ) ;
calc_load_update + = n * LOAD_FREQ ;
}
/*
 * Flip the idle index...
 *
 * Make sure we first write the new time then flip the index, so that
 * calc_load_write_idx() will see the new time when it reads the new
 * index, this avoids a double flip messing things up.
 */
smp_wmb ( ) ;
calc_load_idx + + ;
}
# else /* !CONFIG_NO_HZ_COMMON */
/* Without NO_HZ there is never an idle delta to fold. */
static inline long calc_load_fold_idle(void)
{
	return 0;
}

/* Without NO_HZ there are no missed intervals to catch up on. */
static inline void calc_global_nohz(void)
{
}
# endif /* CONFIG_NO_HZ_COMMON */
/*
 * calc_global_load - update the avenrun load estimates 10 ticks after the
 * CPUs have updated calc_load_tasks.
 * @ticks:	not used by this function
 *
 * Does nothing until jiffies reaches calc_load_update + 10.  Then the idle
 * delta is folded into calc_load_tasks, each average is advanced by one
 * interval, the window is moved on by LOAD_FREQ and calc_global_nohz() is
 * given the chance to catch up on further missed intervals.
 */
void calc_global_load(unsigned long ticks)
{
	long nr_active, idle_delta;

	if (time_before(jiffies, calc_load_update + 10))
		return;

	/* Fold the 'old' idle-delta to include all NO_HZ cpus. */
	idle_delta = calc_load_fold_idle();
	if (idle_delta)
		atomic_long_add(idle_delta, &calc_load_tasks);

	/* A negative count is treated as no load; otherwise scale by FIXED_1. */
	nr_active = atomic_long_read(&calc_load_tasks);
	if (nr_active > 0)
		nr_active *= FIXED_1;
	else
		nr_active = 0;

	avenrun[0] = calc_load(avenrun[0], EXP_1, nr_active);
	avenrun[1] = calc_load(avenrun[1], EXP_5, nr_active);
	avenrun[2] = calc_load(avenrun[2], EXP_15, nr_active);

	calc_load_update += LOAD_FREQ;

	/* In case we idled for multiple LOAD_FREQ intervals, catch up in bulk. */
	calc_global_nohz();
}
/*
* Called from update_cpu_load ( ) to periodically update this CPU ' s
* active count .
*/
static void calc_load_account_active ( struct rq * this_rq )
{
long delta ;
if ( time_before ( jiffies , this_rq - > calc_load_update ) )
return ;
delta = calc_load_fold_active ( this_rq ) ;
if ( delta )
atomic_long_add ( delta , & calc_load_tasks ) ;
this_rq - > calc_load_update + = LOAD_FREQ ;
}
/*
* End of global load - average stuff
*/
/*
* The exact cpuload at various idx values , calculated at every tick would be
* load = ( 2 ^ idx - 1 ) / 2 ^ idx * load + 1 / 2 ^ idx * cur_load
*
* If a cpu misses updates for n - 1 ticks ( as it was idle ) and update gets called
* on nth tick when cpu may be busy , then we have :
* load = ( ( 2 ^ idx - 1 ) / 2 ^ idx ) ^ ( n - 1 ) * load
* load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
*
* decay_load_missed ( ) below does efficient calculation of
* load = ( ( 2 ^ idx - 1 ) / 2 ^ idx ) ^ ( n - 1 ) * load
* avoiding 0. . n - 1 loop doing load = ( ( 2 ^ idx - 1 ) / 2 ^ idx ) * load
*
* The calculation is approximated on a 128 point scale .
* degrade_zero_ticks is the number of ticks after which load at any
* particular idx is approximated to be zero .
* degrade_factor is a precomputed table , a row for each load idx .
* Each column corresponds to degradation factor for a power of two ticks ,
* based on 128 point scale .
* Example :
* row 2 , col 3 ( = 12 ) says that the degradation at load idx 2 after
* 8 ticks is 12 / 128 ( which is an approximation of exact factor 3 ^ 8 / 4 ^ 8 ) .
*
* With this power of 2 load factors , we can degrade the load n times
* by looking at 1 bits in n and doing as many mult / shift instead of
* n mult / shifts needed by the exact degradation .
*/
/* Factors in degrade_factor[][] are fractions of 1 << DEGRADE_SHIFT (128). */
# define DEGRADE_SHIFT 7
/*
 * Per load index: number of missed ticks from which decay_load_missed()
 * treats the load as fully decayed and returns 0.
 */
static const unsigned char
degrade_zero_ticks [ CPU_LOAD_IDX_MAX ] = { 0 , 8 , 32 , 64 , 128 } ;
/*
 * Row = load index, column j = decay factor for 2^j missed ticks.
 * Rows that list fewer than DEGRADE_SHIFT + 1 values leave the remaining
 * columns implicitly zero.
 */
static const unsigned char
degrade_factor [ CPU_LOAD_IDX_MAX ] [ DEGRADE_SHIFT + 1 ] = {
{ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 } ,
{ 64 , 32 , 8 , 0 , 0 , 0 , 0 , 0 } ,
{ 96 , 72 , 40 , 12 , 1 , 0 , 0 } ,
{ 112 , 98 , 75 , 43 , 15 , 1 , 0 } ,
{ 120 , 112 , 98 , 76 , 45 , 16 , 2 } } ;
/*
* Update cpu_load for any missed ticks , due to tickless idle . The backlog
* would be when CPU is idle and so we just decay the old load without
* adding any new load .
*/
static unsigned long
decay_load_missed ( unsigned long load , unsigned long missed_updates , int idx )
{
int j = 0 ;
if ( ! missed_updates )
return load ;
if ( missed_updates > = degrade_zero_ticks [ idx ] )
return 0 ;
if ( idx = = 1 )
return load > > missed_updates ;
while ( missed_updates ) {
if ( missed_updates % 2 )
load = ( load * degrade_factor [ idx ] [ j ] ) > > DEGRADE_SHIFT ;
missed_updates > > = 1 ;
j + + ;
}
return load ;
}
/*
* Update rq - > cpu_load [ ] statistics . This function is usually called every
* scheduler tick ( TICK_NSEC ) . With tickless idle this will not be called
* every tick . We fix it up based on jiffies .
*/
static void __update_cpu_load ( struct rq * this_rq , unsigned long this_load ,
unsigned long pending_updates )
{
int i , scale ;
this_rq - > nr_load_updates + + ;
/* Update our load: */
this_rq - > cpu_load [ 0 ] = this_load ; /* Fasttrack for idx 0 */
for ( i = 1 , scale = 2 ; i < CPU_LOAD_IDX_MAX ; i + + , scale + = scale ) {
unsigned long old_load , new_load ;
/* scale is effectively 1 << i now, and >> i divides by scale */
old_load = this_rq - > cpu_load [ i ] ;
old_load = decay_load_missed ( old_load , pending_updates - 1 , i ) ;
new_load = this_load ;
/*
* Round up the averaging division if load is increasing . This
* prevents us from getting stuck on 9 if the load is 10 , for
* example .
*/
if ( new_load > old_load )
new_load + = scale - 1 ;
this_rq - > cpu_load [ i ] = ( old_load * ( scale - 1 ) + new_load ) > > i ;
}
sched_avg_update ( this_rq ) ;
}
2013-06-20 10:18:50 +08:00
# ifdef CONFIG_SMP
2013-06-20 10:18:57 +08:00
static inline unsigned long get_rq_runnable_load ( struct rq * rq )
2013-06-20 10:18:50 +08:00
{
return rq - > cfs . runnable_load_avg ;
}
# else
2013-06-20 10:18:57 +08:00
static inline unsigned long get_rq_runnable_load ( struct rq * rq )
2013-06-20 10:18:50 +08:00
{
return rq - > load . weight ;
}
# endif
2013-04-19 15:10:49 -04:00
# ifdef CONFIG_NO_HZ_COMMON
/*
* There is no sane way to deal with nohz on smp when using jiffies because the
* cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
* causing off - by - one errors in observed deltas ; { 0 , 2 } instead of { 1 , 1 } .
*
* Therefore we cannot use the delta approach from the regular tick since that
* would seriously skew the load calculation . However we ' ll make do for those
* updates happening while idle ( nohz_idle_balance ) or coming out of idle
* ( tick_nohz_idle_exit ) .
*
* This means we might still be one tick off for nohz periods .
*/
/*
* Called from nohz_idle_balance ( ) to update the load ratings before doing the
* idle balance .
*/
void update_idle_cpu_load ( struct rq * this_rq )
{
unsigned long curr_jiffies = ACCESS_ONCE ( jiffies ) ;
2013-06-20 10:18:50 +08:00
unsigned long load = get_rq_runnable_load ( this_rq ) ;
2013-04-19 15:10:49 -04:00
unsigned long pending_updates ;
/*
* bail if there ' s load or we ' re actually up - to - date .
*/
if ( load | | curr_jiffies = = this_rq - > last_load_update_tick )
return ;
pending_updates = curr_jiffies - this_rq - > last_load_update_tick ;
this_rq - > last_load_update_tick = curr_jiffies ;
__update_cpu_load ( this_rq , load , pending_updates ) ;
}
/*
* Called from tick_nohz_idle_exit ( ) - - try and fix up the ticks we missed .
*/
void update_cpu_load_nohz ( void )
{
struct rq * this_rq = this_rq ( ) ;
unsigned long curr_jiffies = ACCESS_ONCE ( jiffies ) ;
unsigned long pending_updates ;
if ( curr_jiffies = = this_rq - > last_load_update_tick )
return ;
raw_spin_lock ( & this_rq - > lock ) ;
pending_updates = curr_jiffies - this_rq - > last_load_update_tick ;
if ( pending_updates ) {
this_rq - > last_load_update_tick = curr_jiffies ;
/*
* We were idle , this means load 0 , the current load might be
* ! 0 due to remote wakeups and the sort .
*/
__update_cpu_load ( this_rq , 0 , pending_updates ) ;
}
raw_spin_unlock ( & this_rq - > lock ) ;
}
# endif /* CONFIG_NO_HZ_COMMON */
/*
* Called from scheduler_tick ( )
*/
void update_cpu_load_active ( struct rq * this_rq )
{
2013-06-20 10:18:50 +08:00
unsigned long load = get_rq_runnable_load ( this_rq ) ;
2013-04-19 15:10:49 -04:00
/*
* See the mess around update_idle_cpu_load ( ) / update_cpu_load_nohz ( ) .
*/
this_rq - > last_load_update_tick = jiffies ;
2013-06-20 10:18:50 +08:00
__update_cpu_load ( this_rq , load , 1 ) ;
2013-04-19 15:10:49 -04:00
calc_load_account_active ( this_rq ) ;
}