2007-07-09 20:51:58 +04:00
/*
 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
 *
 *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 *
 *  Interactivity improvements by Mike Galbraith
 *  (C) 2007 Mike Galbraith <efault@gmx.de>
 *
 *  Various enhancements by Dmitry Adamushko.
 *  (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
 *
 *  Group scheduling enhancements by Srivatsa Vaddagiri
 *  Copyright IBM Corporation, 2007
 *  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
 *
 *  Scaled math optimizations by Thomas Gleixner
 *  Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
2007-08-25 20:41:53 +04:00
 *
 *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
 *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
2007-07-09 20:51:58 +04:00
 */
2008-01-25 23:08:34 +03:00
#include <linux/latencytop.h>
2007-07-09 20:51:58 +04:00
/*
2007-08-25 20:41:53 +04:00
 * Targeted preemption latency for CPU-bound tasks:
2007-11-26 23:21:49 +03:00
 * (default: 20ms * (1 + ilog(ncpus)), units: nanoseconds)
2007-07-09 20:51:58 +04:00
 *
2007-08-25 20:41:53 +04:00
 * NOTE: this latency value is not the same as the concept of
2007-10-15 19:00:14 +04:00
 * 'timeslice length' - timeslices in CFS are of variable length
 * and have no persistent notion like in traditional, time-slice
 * based scheduling concepts.
2007-07-09 20:51:58 +04:00
 *
2007-10-15 19:00:14 +04:00
 * (to see the precise effective timeslice length of your workload,
 *  run vmstat and monitor the context-switches (cs) field)
2007-07-09 20:51:58 +04:00
 */
2007-11-10 00:39:38 +03:00
unsigned int sysctl_sched_latency = 20000000ULL;
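/*
 * Illustrative example (not part of the original source): with the 20ms
 * base value and the ilog(ncpus) scaling described above, a 4-CPU machine
 * would start out with roughly 20ms * (1 + ilog2(4)) = 60ms of targeted
 * latency before any sysctl tuning.
 */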
2007-10-15 19:00:02 +04:00
/*
2007-11-10 00:39:37 +03:00
 * Minimal preemption granularity for CPU-bound tasks:
2007-11-26 23:21:49 +03:00
 * (default: 4 msec * (1 + ilog(ncpus)), units: nanoseconds)
2007-10-15 19:00:02 +04:00
 */
2007-11-26 23:21:49 +03:00
unsigned int sysctl_sched_min_granularity = 4000000ULL;
2007-08-25 20:41:53 +04:00
/*
2007-11-10 00:39:37 +03:00
 * sched_nr_latency is kept at sysctl_sched_latency / sysctl_sched_min_granularity
 */
2007-11-26 23:21:49 +03:00
static unsigned int sched_nr_latency = 5;
2007-11-10 00:39:37 +03:00
/*
 * After fork, the child runs first (default). If set to 0 then
 * the parent will (try to) run first.
2007-08-25 20:41:53 +04:00
 */
2007-11-10 00:39:37 +03:00
const_debug unsigned int sysctl_sched_child_runs_first = 1;
2007-07-09 20:51:58 +04:00
2007-09-20 01:34:46 +04:00
/*
 * sys_sched_yield() compat mode
 *
 * This option switches the aggressive yield implementation of the
 * old scheduler back on.
 */
unsigned int __read_mostly sysctl_sched_compat_yield;
2007-07-09 20:51:58 +04:00
/*
 * SCHED_OTHER wake-up granularity.
2008-06-27 15:41:16 +04:00
 * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds)
2007-07-09 20:51:58 +04:00
 *
 * This option delays the preemption effects of decoupled workloads
 * and reduces their over-scheduling. Synchronous workloads will still
 * have immediate wakeup/sleep latencies.
 */
2008-06-27 15:41:16 +04:00
unsigned int sysctl_sched_wakeup_granularity = 5000000UL;
2007-07-09 20:51:58 +04:00
2007-10-15 19:00:18 +04:00
const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
2007-07-09 20:51:58 +04:00
/**************************************************************
 * CFS operations on generic schedulable entities:
 */
2008-04-19 21:45:00 +04:00
static inline struct task_struct *task_of(struct sched_entity *se)
{
	return container_of(se, struct task_struct, se);
}
2007-10-15 19:00:03 +04:00
#ifdef CONFIG_FAIR_GROUP_SCHED
2007-07-09 20:51:58 +04:00
2007-10-15 19:00:03 +04:00
/* cpu runqueue to which this cfs_rq is attached */
2007-07-09 20:51:58 +04:00
static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
{
2007-10-15 19:00:03 +04:00
	return cfs_rq->rq;
2007-07-09 20:51:58 +04:00
}
2007-10-15 19:00:03 +04:00
/* An entity is a task if it doesn't "own" a runqueue */
#define entity_is_task(se)	(!se->my_q)
2007-07-09 20:51:58 +04:00
2008-04-19 21:45:00 +04:00
/* Walk up the scheduling-entity hierarchy */
#define for_each_sched_entity(se) \
		for (; se; se = se->parent)
static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
{
	return p->se.cfs_rq;
}
/* runqueue on which this entity is (to be) queued */
static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
{
	return se->cfs_rq;
}
/* runqueue "owned" by this group */
static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{
	return grp->my_q;
}
/* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on
 * another cpu ('this_cpu')
 */
static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
{
	return cfs_rq->tg->cfs_rq[this_cpu];
}
/* Iterate through all leaf cfs_rq's on a runqueue */
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
/* Do the two (enqueued) entities belong to the same group? */
static inline int
is_same_group(struct sched_entity *se, struct sched_entity *pse)
{
	if (se->cfs_rq == pse->cfs_rq)
		return 1;
	return 0;
}
static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
	return se->parent;
}
2007-10-15 19:00:03 +04:00
#else	/* CONFIG_FAIR_GROUP_SCHED */
2007-07-09 20:51:58 +04:00
2007-10-15 19:00:03 +04:00
static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
{
	return container_of(cfs_rq, struct rq, cfs);
2007-07-09 20:51:58 +04:00
}
#define entity_is_task(se)	1
2008-04-19 21:45:00 +04:00
#define for_each_sched_entity(se) \
		for (; se; se = NULL)
2007-07-09 20:51:58 +04:00
2008-04-19 21:45:00 +04:00
static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
2007-07-09 20:51:58 +04:00
{
2008-04-19 21:45:00 +04:00
	return &task_rq(p)->cfs;
2007-07-09 20:51:58 +04:00
}
2008-04-19 21:45:00 +04:00
static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
{
	struct task_struct *p = task_of(se);
	struct rq *rq = task_rq(p);
	return &rq->cfs;
}
/* runqueue "owned" by this group */
static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{
	return NULL;
}
static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
{
	return &cpu_rq(this_cpu)->cfs;
}
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
static inline int
is_same_group(struct sched_entity *se, struct sched_entity *pse)
{
	return 1;
}
static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
	return NULL;
}
#endif	/* CONFIG_FAIR_GROUP_SCHED */
2007-07-09 20:51:58 +04:00
/**************************************************************
 * Scheduling class tree data structure manipulation methods:
 */
2007-10-15 19:00:14 +04:00
static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime)
2007-10-15 19:00:07 +04:00
{
2007-10-15 19:00:11 +04:00
	s64 delta = (s64)(vruntime - min_vruntime);
	if (delta > 0)
2007-10-15 19:00:07 +04:00
		min_vruntime = vruntime;
	return min_vruntime;
}
2007-10-15 19:00:14 +04:00
static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
2007-10-15 19:00:12 +04:00
{
	s64 delta = (s64)(vruntime - min_vruntime);
	if (delta < 0)
		min_vruntime = vruntime;
	return min_vruntime;
}
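/*
 * Note (illustrative): the helpers above compare (s64)(vruntime -
 * min_vruntime) against zero rather than comparing the two u64 values
 * directly, so they keep working once vruntime wraps past U64_MAX, as
 * long as the two values stay within 2^63 of each other.
 */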
2007-10-15 19:00:14 +04:00
static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
2007-10-15 19:00:05 +04:00
{
2007-10-15 19:00:07 +04:00
	return se->vruntime - cfs_rq->min_vruntime;
2007-10-15 19:00:05 +04:00
}
2007-07-09 20:51:58 +04:00
/*
 * Enqueue an entity into the rb-tree:
 */
2007-10-15 19:00:14 +04:00
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
2007-07-09 20:51:58 +04:00
{
	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
	struct rb_node *parent = NULL;
	struct sched_entity *entry;
2007-10-15 19:00:05 +04:00
	s64 key = entity_key(cfs_rq, se);
2007-07-09 20:51:58 +04:00
	int leftmost = 1;
	/*
	 * Find the right place in the rbtree:
	 */
	while (*link) {
		parent = *link;
		entry = rb_entry(parent, struct sched_entity, run_node);
		/*
		 * We don't care about collisions. Nodes with
		 * the same key stay together.
		 */
2007-10-15 19:00:05 +04:00
		if (key < entity_key(cfs_rq, entry)) {
2007-07-09 20:51:58 +04:00
			link = &parent->rb_left;
		} else {
			link = &parent->rb_right;
			leftmost = 0;
		}
	}
	/*
	 * Maintain a cache of leftmost tree entries (it is frequently
	 * used):
	 */
2008-03-14 22:55:51 +03:00
	if (leftmost) {
2007-10-15 19:00:11 +04:00
		cfs_rq->rb_leftmost = &se->run_node;
2008-03-14 22:55:51 +03:00
		/*
		 * maintain cfs_rq->min_vruntime to be a monotonically
		 * increasing value tracking the leftmost vruntime in the tree.
		 */
		cfs_rq->min_vruntime =
			max_vruntime(cfs_rq->min_vruntime, se->vruntime);
	}
2007-07-09 20:51:58 +04:00
	rb_link_node(&se->run_node, parent, link);
	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
}
2007-10-15 19:00:14 +04:00
static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
2007-07-09 20:51:58 +04:00
{
2008-03-14 22:55:51 +03:00
	if (cfs_rq->rb_leftmost == &se->run_node) {
		struct rb_node *next_node;
		struct sched_entity *next;
		next_node = rb_next(&se->run_node);
		cfs_rq->rb_leftmost = next_node;
		if (next_node) {
			next = rb_entry(next_node,
					struct sched_entity, run_node);
			cfs_rq->min_vruntime =
				max_vruntime(cfs_rq->min_vruntime,
					     next->vruntime);
		}
	}
2007-10-15 19:00:04 +04:00
2008-03-14 23:12:12 +03:00
	if (cfs_rq->next == se)
		cfs_rq->next = NULL;
2007-07-09 20:51:58 +04:00
	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
}
static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq)
{
	return cfs_rq->rb_leftmost;
}
static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
{
	return rb_entry(first_fair(cfs_rq), struct sched_entity, run_node);
}
2007-10-15 19:00:05 +04:00
static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
2008-02-22 12:32:21 +03:00
	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
2007-10-15 19:00:05 +04:00
2008-02-22 10:55:53 +03:00
	if (!last)
		return NULL;
2008-02-22 12:32:21 +03:00
	return rb_entry(last, struct sched_entity, run_node);
2007-10-15 19:00:05 +04:00
}
2007-07-09 20:51:58 +04:00
/**************************************************************
 * Scheduling class statistics methods:
 */
2007-11-10 00:39:37 +03:00
#ifdef CONFIG_SCHED_DEBUG
int sched_nr_latency_handler(struct ctl_table *table, int write,
		struct file *filp, void __user *buffer, size_t *lenp,
		loff_t *ppos)
{
	int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
	if (ret || !write)
		return ret;
	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
					sysctl_sched_min_granularity);
	return 0;
}
#endif
2007-10-15 19:00:13 +04:00
2008-06-27 15:41:11 +04:00
/*
 * delta *= w / rw
 */
static inline unsigned long
calc_delta_weight(unsigned long delta, struct sched_entity *se)
{
	for_each_sched_entity(se) {
		delta = calc_delta_mine(delta,
				se->load.weight, &cfs_rq_of(se)->load);
	}
	return delta;
}
/*
 * delta *= rw / w
 */
static inline unsigned long
calc_delta_fair(unsigned long delta, struct sched_entity *se)
{
	for_each_sched_entity(se) {
		delta = calc_delta_mine(delta,
				cfs_rq_of(se)->load.weight, &se->load);
	}
	return delta;
}
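/*
 * Illustrative example, assuming the standard nice-to-weight table: for a
 * nice-0 entity (weight 1024) on a runqueue with a total weight of 2048,
 * calc_delta_weight() yields delta * 1024/2048 = delta/2, while
 * calc_delta_fair() scales the other way: delta * 2048/1024 = 2*delta.
 */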
2007-10-15 19:00:13 +04:00
/*
 * The idea is to set a period in which each task runs once.
 *
 * When there are too many tasks (> sched_nr_latency) we have to stretch
 * this period because otherwise the slices get too small.
 *
 * p = (nr <= nl) ? l : l*nr/nl
 */
2007-10-15 19:00:04 +04:00
static u64 __sched_period(unsigned long nr_running)
{
	u64 period = sysctl_sched_latency;
2007-11-10 00:39:37 +03:00
	unsigned long nr_latency = sched_nr_latency;
2007-10-15 19:00:04 +04:00
	if (unlikely(nr_running > nr_latency)) {
2008-01-25 23:08:21 +03:00
		period = sysctl_sched_min_granularity;
2007-10-15 19:00:04 +04:00
		period *= nr_running;
	}
	return period;
}
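/*
 * Illustrative example with the defaults above (20ms latency, 4ms minimum
 * granularity, sched_nr_latency = 5): 3 runnable tasks keep the period at
 * 20ms, while 10 runnable tasks stretch it to 10 * 4ms = 40ms so that no
 * slice drops below the minimum granularity.
 */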
2007-10-15 19:00:13 +04:00
/*
 * We calculate the wall-time slice from the period by taking a part
 * proportional to the weight.
 *
 * s = p*w/rw
 */
2007-10-15 19:00:05 +04:00
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
2007-08-25 20:41:53 +04:00
{
2008-06-27 15:41:11 +04:00
	return calc_delta_weight(__sched_period(cfs_rq->nr_running), se);
2007-07-09 20:51:58 +04:00
}
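/*
 * Illustrative example: with a 20ms period and two runnable tasks of
 * weight 1024 (nice 0) and 335 (nice +5), the nice-0 task gets a wall-time
 * slice of roughly 20ms * 1024/1359 ~= 15ms and the nice +5 task the
 * remaining ~5ms.
 */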
2007-10-15 19:00:13 +04:00
/*
2008-04-19 21:45:00 +04:00
 * We calculate the vruntime slice of a to-be-inserted task.
2007-10-15 19:00:13 +04:00
 *
2008-06-27 15:41:11 +04:00
 * vs = s*rw/w = p
2007-10-15 19:00:13 +04:00
 */
2008-04-19 21:45:00 +04:00
static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
2007-10-15 19:00:10 +04:00
{
2008-04-19 21:45:00 +04:00
	unsigned long nr_running = cfs_rq->nr_running;
2007-10-15 19:00:10 +04:00
2008-04-19 21:45:00 +04:00
	if (!se->on_rq)
		nr_running++;
2007-10-15 19:00:10 +04:00
2008-06-27 15:41:11 +04:00
	return __sched_period(nr_running);
}
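/*
 * Illustrative example: since the vruntime slice equals the period, a new
 * entity entering a runqueue that already holds 3 tasks gets
 * sched_vslice_add() = __sched_period(4) = 20ms (with the defaults above);
 * START_DEBIT in place_entity() below uses this to push a freshly forked
 * task one full period into the future.
 */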
/*
 * The goal of calc_delta_asym() is to be asymmetric around NICE_0_LOAD, in
 * that it favours >=0 over <0.
 *
 *   -20         |
 *               |
 *     0 --------+-------
 *             .'
 *    19     .'
 *
 */
static unsigned long
calc_delta_asym(unsigned long delta, struct sched_entity *se)
{
	struct load_weight lw = {
		.weight = NICE_0_LOAD,
		.inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT)
	};
2007-10-15 19:00:12 +04:00
2008-04-19 21:45:00 +04:00
	for_each_sched_entity(se) {
2008-06-27 15:41:11 +04:00
		struct load_weight *se_lw = &se->load;
2008-06-27 15:41:13 +04:00
		unsigned long rw = cfs_rq_of(se)->load.weight;
2008-04-19 21:45:00 +04:00
2008-06-27 15:41:12 +04:00
#ifdef CONFIG_FAIR_GROUP_SCHED
		struct cfs_rq *cfs_rq = se->my_q;
		struct task_group *tg = NULL;
		if (cfs_rq)
			tg = cfs_rq->tg;
		if (tg && tg->shares < NICE_0_LOAD) {
			/*
			 * scale shares to what it would have been had
			 * tg->weight been NICE_0_LOAD:
			 *
			 *   weight = 1024 * shares / tg->weight
			 */
			lw.weight *= se->load.weight;
			lw.weight /= tg->shares;
			lw.inv_weight = 0;
			se_lw = &lw;
2008-06-27 15:41:13 +04:00
			rw += lw.weight - se->load.weight;
2008-06-27 15:41:12 +04:00
		} else
#endif
2008-04-19 21:45:00 +04:00
2008-06-27 15:41:13 +04:00
		if (se->load.weight < NICE_0_LOAD) {
2008-06-27 15:41:11 +04:00
			se_lw = &lw;
2008-06-27 15:41:13 +04:00
			rw += NICE_0_LOAD - se->load.weight;
		}
2008-04-19 21:45:00 +04:00
2008-06-27 15:41:13 +04:00
		delta = calc_delta_mine(delta, rw, se_lw);
2008-04-19 21:45:00 +04:00
	}
2008-06-27 15:41:11 +04:00
	return delta;
2007-10-15 19:00:10 +04:00
}
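/*
 * Illustrative example: if the entity is nice +19 (weight 15) and shares
 * its runqueue with one nice-0 task (rw = 1039), an unclamped delta*rw/w
 * would scale delta by roughly 69x; with the NICE_0_LOAD clamping above,
 * rw becomes 2048 and the divisor 1024, so the factor is only about 2.
 */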
2007-07-09 20:51:58 +04:00
/*
 * Update the current task's runtime statistics. Skip current tasks that
 * are not in our scheduling class.
 */
static inline void
2007-10-15 19:00:03 +04:00
__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
	      unsigned long delta_exec)
2007-07-09 20:51:58 +04:00
{
2007-10-15 19:00:06 +04:00
	unsigned long delta_exec_weighted;
2007-07-09 20:51:58 +04:00
2007-08-02 19:41:40 +04:00
	schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max));
2007-07-09 20:51:58 +04:00
	curr->sum_exec_runtime += delta_exec;
2007-10-15 19:00:06 +04:00
	schedstat_add(cfs_rq, exec_clock, delta_exec);
2008-06-27 15:41:11 +04:00
	delta_exec_weighted = calc_delta_fair(delta_exec, curr);
2007-10-15 19:00:04 +04:00
	curr->vruntime += delta_exec_weighted;
2007-07-09 20:51:58 +04:00
}
2007-08-09 13:16:47 +04:00
static void update_curr(struct cfs_rq *cfs_rq)
2007-07-09 20:51:58 +04:00
{
2007-10-15 19:00:03 +04:00
	struct sched_entity *curr = cfs_rq->curr;
2007-10-15 19:00:03 +04:00
	u64 now = rq_of(cfs_rq)->clock;
2007-07-09 20:51:58 +04:00
	unsigned long delta_exec;
	if (unlikely(!curr))
		return;
	/*
	 * Get the amount of time the current task was running
	 * since the last time we changed load (this cannot
	 * overflow on 32 bits):
	 */
2007-10-15 19:00:03 +04:00
	delta_exec = (unsigned long)(now - curr->exec_start);
2007-07-09 20:51:58 +04:00
2007-10-15 19:00:03 +04:00
	__update_curr(cfs_rq, curr, delta_exec);
	curr->exec_start = now;
2007-12-02 22:04:49 +03:00
	if (entity_is_task(curr)) {
		struct task_struct *curtask = task_of(curr);
		cpuacct_charge(curtask, delta_exec);
	}
2007-07-09 20:51:58 +04:00
}
static inline void
2007-08-09 13:16:47 +04:00
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
2007-07-09 20:51:58 +04:00
{
2007-08-09 13:16:47 +04:00
	schedstat_set(se->wait_start, rq_of(cfs_rq)->clock);
2007-07-09 20:51:58 +04:00
}
/*
 * Task is being enqueued - update stats:
 */
2007-08-09 13:16:47 +04:00
static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2007-07-09 20:51:58 +04:00
{
	/*
	 * Are we enqueueing a waiting task? (for current tasks
	 * a dequeue/enqueue event is a NOP)
	 */
2007-10-15 19:00:03 +04:00
	if (se != cfs_rq->curr)
2007-08-09 13:16:47 +04:00
		update_stats_wait_start(cfs_rq, se);
2007-07-09 20:51:58 +04:00
}
static void
2007-08-09 13:16:47 +04:00
update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
2007-07-09 20:51:58 +04:00
{
2007-10-15 19:00:06 +04:00
	schedstat_set(se->wait_max, max(se->wait_max,
			rq_of(cfs_rq)->clock - se->wait_start));
2008-01-25 23:08:35 +03:00
	schedstat_set(se->wait_count, se->wait_count + 1);
	schedstat_set(se->wait_sum, se->wait_sum +
			rq_of(cfs_rq)->clock - se->wait_start);
2007-08-02 19:41:40 +04:00
	schedstat_set(se->wait_start, 0);
2007-07-09 20:51:58 +04:00
}
static inline void
2007-08-09 13:16:48 +04:00
update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2007-07-09 20:51:58 +04:00
{
	/*
	 * Mark the end of the wait period if dequeueing a
	 * waiting task:
	 */
2007-10-15 19:00:03 +04:00
	if (se != cfs_rq->curr)
2007-08-09 13:16:47 +04:00
		update_stats_wait_end(cfs_rq, se);
2007-07-09 20:51:58 +04:00
}
/*
 * We are picking a new current task - update its stats:
 */
static inline void
2007-08-09 13:16:47 +04:00
update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
2007-07-09 20:51:58 +04:00
{
	/*
	 * We are starting a new run period:
	 */
2007-08-09 13:16:47 +04:00
	se->exec_start = rq_of(cfs_rq)->clock;
2007-07-09 20:51:58 +04:00
}
/**************************************************
 * Scheduling class queueing methods:
 */
2008-06-27 15:41:14 +04:00
#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
static void
add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
{
	cfs_rq->task_weight += weight;
}
#else
static inline void
add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
{
}
#endif
2007-10-15 19:00:07 +04:00
static void
account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	update_load_add(&cfs_rq->load, se->load.weight);
2008-06-27 15:41:14 +04:00
	if (!parent_entity(se))
		inc_cpu_load(rq_of(cfs_rq), se->load.weight);
	if (entity_is_task(se))
		add_cfs_task_weight(cfs_rq, se->load.weight);
2007-10-15 19:00:07 +04:00
	cfs_rq->nr_running++;
	se->on_rq = 1;
2008-04-19 21:45:00 +04:00
	list_add(&se->group_node, &cfs_rq->tasks);
2007-10-15 19:00:07 +04:00
}
static void
account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	update_load_sub(&cfs_rq->load, se->load.weight);
2008-06-27 15:41:14 +04:00
	if (!parent_entity(se))
		dec_cpu_load(rq_of(cfs_rq), se->load.weight);
	if (entity_is_task(se))
		add_cfs_task_weight(cfs_rq, -se->load.weight);
2007-10-15 19:00:07 +04:00
	cfs_rq->nr_running--;
	se->on_rq = 0;
2008-04-19 21:45:00 +04:00
	list_del_init(&se->group_node);
2007-10-15 19:00:07 +04:00
}
2007-08-09 13:16:48 +04:00
static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
2007-07-09 20:51:58 +04:00
{
#ifdef CONFIG_SCHEDSTATS
	if (se->sleep_start) {
2007-08-09 13:16:47 +04:00
		u64 delta = rq_of(cfs_rq)->clock - se->sleep_start;
2008-01-25 23:08:34 +03:00
		struct task_struct *tsk = task_of(se);
2007-07-09 20:51:58 +04:00
		if ((s64)delta < 0)
			delta = 0;
		if (unlikely(delta > se->sleep_max))
			se->sleep_max = delta;
		se->sleep_start = 0;
		se->sum_sleep_runtime += delta;
2008-01-25 23:08:34 +03:00
		account_scheduler_latency(tsk, delta >> 10, 1);
2007-07-09 20:51:58 +04:00
	}
	if (se->block_start) {
2007-08-09 13:16:47 +04:00
		u64 delta = rq_of(cfs_rq)->clock - se->block_start;
2008-01-25 23:08:34 +03:00
		struct task_struct *tsk = task_of(se);
2007-07-09 20:51:58 +04:00
		if ((s64)delta < 0)
			delta = 0;
		if (unlikely(delta > se->block_max))
			se->block_max = delta;
		se->block_start = 0;
		se->sum_sleep_runtime += delta;
2007-10-02 16:13:08 +04:00
		/*
		 * Blocking time is in units of nanosecs, so shift by 20 to
		 * get a milliseconds-range estimation of the amount of
		 * time that the task spent sleeping:
		 */
		if (unlikely(prof_on == SLEEP_PROFILING)) {
2007-10-15 19:00:06 +04:00
2007-10-02 16:13:08 +04:00
			profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk),
				     delta >> 20);
		}
2008-01-25 23:08:34 +03:00
		account_scheduler_latency(tsk, delta >> 10, 0);
2007-07-09 20:51:58 +04:00
	}
#endif
}
2007-10-15 19:00:10 +04:00
static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHED_DEBUG
	s64 d = se->vruntime - cfs_rq->min_vruntime;
	if (d < 0)
		d = -d;
	if (d > 3*sysctl_sched_latency)
		schedstat_inc(cfs_rq, nr_spread_over);
#endif
}
2007-10-15 19:00:05 +04:00
static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
{
2007-10-15 19:00:10 +04:00
	u64 vruntime;
2007-10-15 19:00:05 +04:00
2008-03-14 22:55:51 +03:00
	if (first_fair(cfs_rq)) {
		vruntime = min_vruntime(cfs_rq->min_vruntime,
				__pick_next_entity(cfs_rq)->vruntime);
	} else
		vruntime = cfs_rq->min_vruntime;
2007-10-15 19:00:05 +04:00
2007-11-10 00:39:37 +03:00
	/*
	 * The 'current' period is already promised to the current tasks,
	 * however the extra weight of the new task will slow them down a
	 * little, so place the new task so that it fits in the slot that
	 * stays open at the end.
	 */
2007-10-15 19:00:05 +04:00
	if (initial && sched_feat(START_DEBIT))
2007-10-15 19:00:13 +04:00
		vruntime += sched_vslice_add(cfs_rq, se);
2007-10-15 19:00:05 +04:00
2007-10-15 19:00:11 +04:00
	if (!initial) {
2007-11-10 00:39:37 +03:00
		/* sleeps up to a single latency don't count. */
2008-06-27 15:41:11 +04:00
		if (sched_feat(NEW_FAIR_SLEEPERS)) {
			unsigned long thresh = sysctl_sched_latency;
			/*
			 * convert the sleeper threshold into virtual time
			 */
			if (sched_feat(NORMALIZED_SLEEPER))
				thresh = calc_delta_fair(thresh, se);
			vruntime -= thresh;
		}
2007-10-15 19:00:11 +04:00
2007-11-10 00:39:37 +03:00
		/* ensure we never gain time by being placed backwards. */
		vruntime = max_vruntime(se->vruntime, vruntime);
2007-10-15 19:00:05 +04:00
	}
2007-10-15 19:00:10 +04:00
	se->vruntime = vruntime;
2007-10-15 19:00:05 +04:00
}
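/*
 * Illustrative example: with NEW_FAIR_SLEEPERS and the 20ms default
 * latency, a task waking from a long sleep is placed at most one latency
 * period of virtual time before the current minimum; the max_vruntime()
 * above then makes sure a task that only napped briefly keeps its own,
 * larger vruntime and gains nothing from sleeping.
 */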
2007-07-09 20:51:58 +04:00
static void
2007-10-15 19:00:08 +04:00
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
2007-07-09 20:51:58 +04:00
{
	/*
2007-10-15 19:00:13 +04:00
	 * Update run-time statistics of the 'current'.
2007-07-09 20:51:58 +04:00
	 */
2007-08-09 13:16:47 +04:00
	update_curr(cfs_rq);
2008-05-06 01:56:17 +04:00
	account_entity_enqueue(cfs_rq, se);
2007-07-09 20:51:58 +04:00
2007-10-15 19:00:04 +04:00
	if (wakeup) {
2007-10-15 19:00:05 +04:00
		place_entity(cfs_rq, se, 0);
2007-08-09 13:16:48 +04:00
		enqueue_sleeper(cfs_rq, se);
2007-10-15 19:00:04 +04:00
	}
2007-07-09 20:51:58 +04:00
2007-08-09 13:16:47 +04:00
	update_stats_enqueue(cfs_rq, se);
2007-10-15 19:00:10 +04:00
	check_spread(cfs_rq, se);
2007-10-15 19:00:08 +04:00
	if (se != cfs_rq->curr)
		__enqueue_entity(cfs_rq, se);
2007-07-09 20:51:58 +04:00
}
static void
2007-08-09 13:16:48 +04:00
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
2007-07-09 20:51:58 +04:00
{
2007-10-15 19:00:13 +04:00
	/*
	 * Update run-time statistics of the 'current'.
	 */
	update_curr(cfs_rq);
2007-08-09 13:16:48 +04:00
	update_stats_dequeue(cfs_rq, se);
2007-10-15 19:00:06 +04:00
	if (sleep) {
2007-10-15 19:00:10 +04:00
#ifdef CONFIG_SCHEDSTATS
2007-07-09 20:51:58 +04:00
		if (entity_is_task(se)) {
			struct task_struct *tsk = task_of(se);
			if (tsk->state & TASK_INTERRUPTIBLE)
2007-08-09 13:16:47 +04:00
				se->sleep_start = rq_of(cfs_rq)->clock;
2007-07-09 20:51:58 +04:00
			if (tsk->state & TASK_UNINTERRUPTIBLE)
2007-08-09 13:16:47 +04:00
				se->block_start = rq_of(cfs_rq)->clock;
2007-07-09 20:51:58 +04:00
		}
2007-10-15 19:00:06 +04:00
#endif
2007-10-15 19:00:10 +04:00
	}
2007-10-15 19:00:08 +04:00
	if (se != cfs_rq->curr)
2007-10-15 19:00:07 +04:00
		__dequeue_entity(cfs_rq, se);
	account_entity_dequeue(cfs_rq, se);
2007-07-09 20:51:58 +04:00
}
/*
 * Preempt the current task with a newly woken task if needed:
 */
2007-09-05 16:32:49 +04:00
static void
2007-10-15 19:00:05 +04:00
check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
2007-07-09 20:51:58 +04:00
{
2007-09-05 16:32:49 +04:00
	unsigned long ideal_runtime, delta_exec;
2007-10-15 19:00:05 +04:00
	ideal_runtime = sched_slice(cfs_rq, curr);
2007-09-05 16:32:49 +04:00
	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
2007-11-10 00:39:39 +03:00
	if (delta_exec > ideal_runtime)
2007-07-09 20:51:58 +04:00
		resched_task(rq_of(cfs_rq)->curr);
}
2007-10-15 19:00:08 +04:00
static void
2007-08-09 13:16:48 +04:00
set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
2007-07-09 20:51:58 +04:00
{
2007-10-15 19:00:08 +04:00
	/* 'current' is not kept within the tree. */
	if (se->on_rq) {
		/*
		 * Any task has to be enqueued before it gets to execute on
		 * a CPU. So account for the time it spent waiting on the
		 * runqueue.
		 */
		update_stats_wait_end(cfs_rq, se);
		__dequeue_entity(cfs_rq, se);
	}
2007-08-09 13:16:47 +04:00
	update_stats_curr_start(cfs_rq, se);
2007-10-15 19:00:03 +04:00
	cfs_rq->curr = se;
2007-10-15 19:00:02 +04:00
#ifdef CONFIG_SCHEDSTATS
	/*
	 * Track our maximum slice length, if the CPU's load is at
	 * least twice that of our own weight (i.e. don't track it
	 * when there are only lesser-weight tasks around):
	 */
2007-10-15 19:00:06 +04:00
	if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
2007-10-15 19:00:02 +04:00
		se->slice_max = max(se->slice_max,
			se->sum_exec_runtime - se->prev_sum_exec_runtime);
	}
#endif
2007-09-05 16:32:49 +04:00
	se->prev_sum_exec_runtime = se->sum_exec_runtime;
2007-07-09 20:51:58 +04:00
}
2008-03-14 23:12:12 +03:00
static struct sched_entity *
pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
2008-06-27 15:41:16 +04:00
	struct rq *rq = rq_of(cfs_rq);
	u64 pair_slice = rq->clock - cfs_rq->pair_start;
2008-03-14 23:12:12 +03:00
2008-06-27 15:41:16 +04:00
	if (!cfs_rq->next || pair_slice > sched_slice(cfs_rq, cfs_rq->next)) {
		cfs_rq->pair_start = rq->clock;
2008-03-14 23:12:12 +03:00
		return se;
2008-06-27 15:41:16 +04:00
	}
2008-03-14 23:12:12 +03:00
	return cfs_rq->next;
}
2007-08-09 13:16:48 +04:00
static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
2007-07-09 20:51:58 +04:00
{
2007-10-15 19:00:13 +04:00
	struct sched_entity *se = NULL;
2007-07-09 20:51:58 +04:00
2007-10-15 19:00:13 +04:00
	if (first_fair(cfs_rq)) {
		se = __pick_next_entity(cfs_rq);
2008-03-14 23:12:12 +03:00
		se = pick_next(cfs_rq, se);
2007-10-15 19:00:13 +04:00
		set_next_entity(cfs_rq, se);
	}
2007-07-09 20:51:58 +04:00
	return se;
}
2007-08-09 13:16:48 +04:00
static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
2007-07-09 20:51:58 +04:00
{
	/*
	 * If still on the runqueue then deactivate_task()
	 * was not called and update_curr() has to be done:
	 */
	if (prev->on_rq)
2007-08-09 13:16:47 +04:00
		update_curr(cfs_rq);
2007-07-09 20:51:58 +04:00
2007-10-15 19:00:10 +04:00
	check_spread(cfs_rq, prev);
2007-10-15 19:00:07 +04:00
	if (prev->on_rq) {
2007-08-09 13:16:47 +04:00
		update_stats_wait_start(cfs_rq, prev);
2007-10-15 19:00:07 +04:00
		/* Put 'current' back into the tree. */
		__enqueue_entity(cfs_rq, prev);
	}
2007-10-15 19:00:03 +04:00
	cfs_rq->curr = NULL;
2007-07-09 20:51:58 +04:00
}
2008-01-25 23:08:29 +03:00
static void
entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
2007-07-09 20:51:58 +04:00
{
	/*
2007-10-15 19:00:07 +04:00
	 * Update run-time statistics of the 'current'.
2007-07-09 20:51:58 +04:00
	 */
2007-10-15 19:00:07 +04:00
	update_curr(cfs_rq);
2007-07-09 20:51:58 +04:00
2008-01-25 23:08:29 +03:00
#ifdef CONFIG_SCHED_HRTICK
	/*
	 * queued ticks are scheduled to match the slice, so don't bother
	 * validating it and just reschedule.
	 */
2008-04-25 05:17:55 +04:00
	if (queued) {
		resched_task(rq_of(cfs_rq)->curr);
		return;
	}
2008-01-25 23:08:29 +03:00
	/*
	 * don't let the period tick interfere with the hrtick preemption
	 */
	if (!sched_feat(DOUBLE_TICK) &&
			hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
		return;
#endif
2007-10-15 19:00:14 +04:00
	if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT))
2007-10-15 19:00:05 +04:00
		check_preempt_tick(cfs_rq, curr);
2007-07-09 20:51:58 +04:00
}
/**************************************************
 * CFS operations on tasks:
 */
2008-01-25 23:08:29 +03:00
#ifdef CONFIG_SCHED_HRTICK
static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
{
	struct sched_entity *se = &p->se;
	struct cfs_rq *cfs_rq = cfs_rq_of(se);
	WARN_ON(task_rq(p) != rq);
	if (hrtick_enabled(rq) && cfs_rq->nr_running > 1) {
		u64 slice = sched_slice(cfs_rq, se);
		u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
		s64 delta = slice - ran;
		if (delta < 0) {
			if (rq->curr == p)
				resched_task(p);
			return;
		}
		/*
		 * Don't schedule slices shorter than 10000ns, that just
		 * doesn't make sense. Rely on vruntime for fairness.
		 */
2008-07-18 20:01:23 +04:00
		if (rq->curr != p)
2008-07-28 13:53:11 +04:00
			delta = max_t(s64, 10000LL, delta);
2008-01-25 23:08:29 +03:00
2008-07-18 20:01:23 +04:00
		hrtick_start(rq, delta);
2008-01-25 23:08:29 +03:00
	}
}
2008-06-24 22:09:43 +04:00
#else /* !CONFIG_SCHED_HRTICK */
2008-01-25 23:08:29 +03:00
static inline void
hrtick_start_fair(struct rq *rq, struct task_struct *p)
{
}
#endif
2007-07-09 20:51:58 +04:00
/*
 * The enqueue_task method is called before nr_running is
 * increased. Here we update the fair scheduling stats and
 * then put the task into the rbtree:
 */
2007-08-09 13:16:48 +04:00
static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
2007-07-09 20:51:58 +04:00
{
	struct cfs_rq *cfs_rq;
2008-02-25 19:34:02 +03:00
	struct sched_entity *se = &p->se;
2007-07-09 20:51:58 +04:00
	for_each_sched_entity(se) {
2008-02-25 19:34:02 +03:00
		if (se->on_rq)
2007-07-09 20:51:58 +04:00
			break;
		cfs_rq = cfs_rq_of(se);
2007-10-15 19:00:08 +04:00
		enqueue_entity(cfs_rq, se, wakeup);
2007-10-15 19:00:12 +04:00
		wakeup = 1;
2007-07-09 20:51:58 +04:00
	}
2008-01-25 23:08:29 +03:00
	hrtick_start_fair(rq, rq->curr);
2007-07-09 20:51:58 +04:00
}
/*
 * The dequeue_task method is called before nr_running is
 * decreased. We remove the task from the rbtree and
 * update the fair scheduling stats:
 */
2007-08-09 13:16:48 +04:00
static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
2007-07-09 20:51:58 +04:00
{
	struct cfs_rq *cfs_rq;
2008-02-25 19:34:02 +03:00
	struct sched_entity *se = &p->se;
2007-07-09 20:51:58 +04:00
	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);
2007-08-09 13:16:48 +04:00
		dequeue_entity(cfs_rq, se, sleep);
2007-07-09 20:51:58 +04:00
		/* Don't dequeue parent if it has other entities besides us */
2008-02-25 19:34:02 +03:00
		if (cfs_rq->load.weight)
2007-07-09 20:51:58 +04:00
			break;
2007-10-15 19:00:12 +04:00
		sleep = 1;
2007-07-09 20:51:58 +04:00
	}
2008-01-25 23:08:29 +03:00
	hrtick_start_fair(rq, rq->curr);
2007-07-09 20:51:58 +04:00
}
/*
2007-09-20 01:34:46 +04:00
 * sched_yield() support is very simple - we dequeue and enqueue.
 *
 * If compat_yield is turned on then we requeue to the end of the tree.
2007-07-09 20:51:58 +04:00
 */
2007-10-15 19:00:08 +04:00
static void yield_task_fair(struct rq *rq)
2007-07-09 20:51:58 +04:00
{
2007-12-04 19:04:39 +03:00
	struct task_struct *curr = rq->curr;
	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
	struct sched_entity *rightmost, *se = &curr->se;
2007-07-09 20:51:58 +04:00
	/*
2007-09-20 01:34:46 +04:00
	 * Are we the only task in the tree?
	 */
	if (unlikely(cfs_rq->nr_running == 1))
		return;
2007-12-04 19:04:39 +03:00
	if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) {
2008-05-03 20:29:28 +04:00
		update_rq_clock(rq);
2007-09-20 01:34:46 +04:00
		/*
2007-10-15 19:00:13 +04:00
		 * Update run-time statistics of the 'current'.
2007-09-20 01:34:46 +04:00
		 */
2007-10-15 19:00:12 +04:00
		update_curr(cfs_rq);
2007-09-20 01:34:46 +04:00
		return;
	}
	/*
	 * Find the rightmost entry in the rbtree:
2007-07-09 20:51:58 +04:00
	 */
2007-10-15 19:00:12 +04:00
	rightmost = __pick_last_entity(cfs_rq);
2007-09-20 01:34:46 +04:00
	/*
	 * Already in the rightmost position?
	 */
2008-02-18 15:39:37 +03:00
	if (unlikely(!rightmost || rightmost->vruntime < se->vruntime))
2007-09-20 01:34:46 +04:00
		return;
	/*
	 * Minimally necessary key value to be last in the tree:
2007-10-15 19:00:12 +04:00
	 * Upon rescheduling, sched_class::put_prev_task() will place
	 * 'current' within the tree based on its new key value.
2007-09-20 01:34:46 +04:00
	 */
2007-10-15 19:00:07 +04:00
	se->vruntime = rightmost->vruntime + 1;
2007-07-09 20:51:58 +04:00
}
2008-01-25 23:08:09 +03:00
/*
 * wake_idle() will wake a task on an idle cpu if task->cpu is
 * not idle and an idle cpu is available. The span of cpus to
 * search starts with cpus closest then further out as needed,
 * so we always favor a closer, idle cpu.
2008-07-15 15:43:49 +04:00
 * Domains may include CPUs that are not usable for migration,
 * hence we need to mask them out (cpu_active_map)
2008-01-25 23:08:09 +03:00
 *
 * Returns the CPU we should wake onto.
 */
#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
static int wake_idle(int cpu, struct task_struct *p)
{
	cpumask_t tmp;
	struct sched_domain *sd;
	int i;
	/*
	 * If it is idle, then it is the best cpu to run this task.
	 *
	 * This cpu is also the best, if it has more than one task already.
	 * Siblings must also be busy (in most cases) as they didn't already
	 * pick up the extra load from this cpu and hence we need not check
	 * sibling runqueue info. This will avoid the checks and cache miss
	 * penalties associated with that.
	 */
2008-04-28 20:40:01 +04:00
	if (idle_cpu(cpu) || cpu_rq(cpu)->cfs.nr_running > 1)
2008-01-25 23:08:09 +03:00
		return cpu;
	for_each_domain(cpu, sd) {
2008-04-15 09:04:23 +04:00
		if ((sd->flags & SD_WAKE_IDLE)
		    || ((sd->flags & SD_WAKE_IDLE_FAR)
			&& !task_hot(p, task_rq(p)->clock, sd))) {
2008-01-25 23:08:09 +03:00
			cpus_and(tmp, sd->span, p->cpus_allowed);
2008-07-15 15:43:49 +04:00
			cpus_and(tmp, tmp, cpu_active_map);
2008-05-12 23:21:13 +04:00
			for_each_cpu_mask_nr(i, tmp) {
2008-01-25 23:08:09 +03:00
				if (idle_cpu(i)) {
					if (i != task_cpu(p)) {
						schedstat_inc(p,
						       se.nr_wakeups_idle);
					}
					return i;
				}
			}
		} else {
			break;
		}
	}
	return cpu;
}
2008-06-24 22:09:43 +04:00
#else /* !ARCH_HAS_SCHED_WAKE_IDLE*/
2008-01-25 23:08:09 +03:00
static inline int wake_idle(int cpu, struct task_struct *p)
{
	return cpu;
}
#endif
#ifdef CONFIG_SMP
2008-03-16 22:36:10 +03:00
2008-03-19 03:42:00 +03:00
static const struct sched_class fair_sched_class;
2008-06-27 15:41:27 +04:00
#ifdef CONFIG_FAIR_GROUP_SCHED
2008-06-27 15:41:39 +04:00
/*
 * effective_load() calculates the load change as seen from the root_task_group
 *
 * Adding load to a group doesn't make a group heavier, but can cause movement
 * of group shares between cpus. Assuming the shares were perfectly aligned one
 * can calculate the shift in shares.
 *
 * The problem is that perfectly aligning the shares is rather expensive, hence
 * we try to avoid doing that too often - see update_shares(), which ratelimits
 * this change.
 *
 * We compensate this by not only taking the current delta into account, but
 * also considering the delta between when the shares were last adjusted and
 * now.
 *
 * We still saw a performance dip; some tracing taught us that between
 * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased
 * significantly. Therefore try to bias the error in the direction of failing
 * the affine wakeup.
 *
 */
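/*
 * Worked example (illustrative): with per-cpu runqueue weights
 * rw_i = {2, 4, 1, 0} the group's shares are distributed as
 * s_i = {2/7, 4/7, 1/7, 0}. A wakeup of weight 1 on cpu0 gives
 * rw'_i = {3, 4, 1, 0} and s'_i = {3/8, 4/8, 1/8, 0}, i.e. cpu0 gains
 * 3/8 - 2/7 = 5/56 of the group weight, while for an affine decision
 * against cpu1 that cpu would lose 4/7 - 4/8 = 4/56.
 */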
2008-06-27 15:41:38 +04:00
static long effective_load(struct task_group *tg, int cpu,
		long wl, long wg)
2008-06-27 15:41:27 +04:00
{
2008-06-27 15:41:30 +04:00
	struct sched_entity *se = tg->se[cpu];
2008-06-27 15:41:38 +04:00
	long more_w;
	if (!tg->parent)
		return wl;
2008-06-27 15:41:39 +04:00
	/*
	 * By not taking the decrease of shares on the other cpu into
	 * account our error leans towards reducing the affine wakeups.
	 */
	if (!wl && sched_feat(ASYM_EFF_LOAD))
		return wl;
2008-06-27 15:41:38 +04:00
	/*
	 * Instead of using this increment, also add the difference
	 * between when the shares were last updated and now.
	 */
	more_w = se->my_q->load.weight - se->my_q->rq_weight;
	wl += more_w;
	wg += more_w;
2008-06-27 15:41:27 +04:00
2008-06-27 15:41:30 +04:00
	for_each_sched_entity(se) {
#define D(n) (likely(n) ? (n) : 1)
2008-06-27 15:41:32 +04:00
		long S, rw, s, a, b;
2008-06-27 15:41:30 +04:00
		S = se->my_q->tg->shares;
		s = se->my_q->shares;
2008-06-27 15:41:38 +04:00
		rw = se->my_q->rq_weight;
2008-06-27 15:41:27 +04:00
2008-06-27 15:41:32 +04:00
		a = S*(rw + wl);
		b = S*rw + s*wg;
2008-06-27 15:41:30 +04:00
2008-06-27 15:41:32 +04:00
		wl = s*(a-b)/D(b);
2008-06-27 15:41:37 +04:00
		/*
		 * Assume the group is already running and will
		 * thus already be accounted for in the weight.
		 *
		 * That is, moving shares between CPUs does not
		 * alter the group weight.
		 */
2008-06-27 15:41:30 +04:00
		wg = 0;
#undef D
	}
2008-06-27 15:41:27 +04:00
2008-06-27 15:41:30 +04:00
	return wl;
2008-06-27 15:41:27 +04:00
}
2008-06-27 15:41:30 +04:00
2008-06-27 15:41:27 +04:00
#else
2008-06-27 15:41:30 +04:00
2008-06-27 15:41:37 +04:00
static inline unsigned long effective_load(struct task_group *tg, int cpu,
		unsigned long wl, unsigned long wg)
2008-06-27 15:41:30 +04:00
{
2008-06-27 15:41:37 +04:00
	return wl;
2008-06-27 15:41:27 +04:00
}
2008-06-27 15:41:30 +04:00
2008-06-27 15:41:27 +04:00
#endif
2008-03-16 22:36:10 +03:00
static int
2008-03-19 03:42:00 +03:00
wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
	    struct task_struct *p, int prev_cpu, int this_cpu, int sync,
	    int idx, unsigned long load, unsigned long this_load,
2008-03-16 22:36:10 +03:00
	    unsigned int imbalance)
{
2008-03-19 03:42:00 +03:00
	struct task_struct *curr = this_rq->curr;
2008-06-27 15:41:37 +04:00
	struct task_group *tg;
2008-03-16 22:36:10 +03:00
	unsigned long tl = this_load;
	unsigned long tl_per_task;
2008-06-27 15:41:37 +04:00
	unsigned long weight;
2008-05-29 13:11:41 +04:00
	int balanced;
2008-03-16 22:36:10 +03:00
2008-05-29 13:11:41 +04:00
	if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
2008-03-16 22:36:10 +03:00
		return 0;
2008-05-29 13:11:41 +04:00
	/*
	 * If sync wakeup then subtract the (maximum possible)
	 * effect of the currently running task from the load
	 * of the current CPU:
	 */
2008-06-27 15:41:37 +04:00
	if (sync) {
		tg = task_group(current);
		weight = current->se.load.weight;
		tl += effective_load(tg, this_cpu, -weight, -weight);
		load += effective_load(tg, prev_cpu, 0, -weight);
	}
2008-05-29 13:11:41 +04:00
2008-06-27 15:41:37 +04:00
	tg = task_group(p);
	weight = p->se.load.weight;
2008-05-29 13:11:41 +04:00
2008-06-27 15:41:37 +04:00
	balanced = 100*(tl + effective_load(tg, this_cpu, weight, weight)) <=
		imbalance*(load + effective_load(tg, prev_cpu, 0, weight));
2008-05-29 13:11:41 +04:00
2008-03-16 22:36:10 +03:00
	/*
2008-03-19 03:42:00 +03:00
	 * If the currently running task will sleep within
	 * a reasonable amount of time then attract this newly
	 * woken task:
2008-03-16 22:36:10 +03:00
	 */
2008-06-28 00:30:00 +04:00
	if (sync && balanced) {
2008-03-19 03:42:00 +03:00
		if (curr->se.avg_overlap < sysctl_sched_migration_cost &&
2008-06-28 00:30:00 +04:00
		    p->se.avg_overlap < sysctl_sched_migration_cost)
2008-03-19 03:42:00 +03:00
			return 1;
	}
2008-03-16 22:36:10 +03:00
	schedstat_inc(p, se.nr_wakeups_affine_attempts);
	tl_per_task = cpu_avg_load_per_task(this_cpu);
2008-03-16 22:56:26 +03:00
	if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) ||
2008-05-29 13:11:41 +04:00
			balanced) {
2008-03-16 22:36:10 +03:00
		/*
		 * This domain has SD_WAKE_AFFINE and
		 * p is cache cold in this domain, and
		 * there is no bad imbalance.
		 */
		schedstat_inc(this_sd, ttwu_move_affine);
		schedstat_inc(p, se.nr_wakeups_affine);
		return 1;
	}
	return 0;
}
2008-01-25 23:08:09 +03:00
static int select_task_rq_fair(struct task_struct *p, int sync)
{
	struct sched_domain *sd, *this_sd = NULL;
2008-03-16 22:56:26 +03:00
	int prev_cpu, this_cpu, new_cpu;
2008-03-16 22:36:10 +03:00
	unsigned long load, this_load;
2008-03-19 03:42:00 +03:00
	struct rq *rq, *this_rq;
2008-03-16 22:36:10 +03:00
	unsigned int imbalance;
	int idx;
2008-01-25 23:08:09 +03:00
2008-03-16 22:56:26 +03:00
	prev_cpu	= task_cpu(p);
	rq		= task_rq(p);
	this_cpu	= smp_processor_id();
2008-03-19 03:42:00 +03:00
	this_rq		= cpu_rq(this_cpu);
2008-03-16 22:56:26 +03:00
	new_cpu		= prev_cpu;
2008-01-25 23:08:09 +03:00
2008-03-16 22:56:26 +03:00
	/*
	 * 'this_sd' is the first domain that both
	 * this_cpu and prev_cpu are present in:
	 */
2008-01-25 23:08:09 +03:00
	for_each_domain(this_cpu, sd) {
2008-03-16 22:56:26 +03:00
		if (cpu_isset(prev_cpu, sd->span)) {
2008-01-25 23:08:09 +03:00
			this_sd = sd;
			break;
		}
	}
	if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
2008-03-16 23:21:47 +03:00
		goto out;
2008-01-25 23:08:09 +03:00
	/*
	 * Check for affine wakeup and passive balancing possibilities.
	 */
2008-03-16 22:36:10 +03:00
	if (!this_sd)
2008-03-16 23:21:47 +03:00
		goto out;
2008-01-25 23:08:09 +03:00
2008-03-16 22:36:10 +03:00
	idx = this_sd->wake_idx;
	imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
2008-03-16 22:56:26 +03:00
	load = source_load(prev_cpu, idx);
2008-03-16 22:36:10 +03:00
	this_load = target_load(this_cpu, idx);
2008-03-19 03:42:00 +03:00
	if (wake_affine(rq, this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx,
				load, this_load, imbalance))
		return this_cpu;
	if (prev_cpu == this_cpu)
2008-03-16 23:21:47 +03:00
		goto out;
2008-03-16 22:36:10 +03:00
	/*
	 * Start passive balancing when half the imbalance_pct
	 * limit is reached.
	 */
	if (this_sd->flags & SD_WAKE_BALANCE) {
		if (imbalance*this_load <= 100*load) {
			schedstat_inc(this_sd, ttwu_move_balance);
			schedstat_inc(p, se.nr_wakeups_passive);
2008-03-19 03:42:00 +03:00
			return this_cpu;
2008-01-25 23:08:09 +03:00
		}
	}
2008-03-16 23:21:47 +03:00
out:
2008-01-25 23:08:09 +03:00
	return wake_idle(new_cpu, p);
}
#endif /* CONFIG_SMP */
2008-04-19 21:44:57 +04:00
static unsigned long wakeup_gran(struct sched_entity *se)
{
	unsigned long gran = sysctl_sched_wakeup_granularity;
	/*
2008-06-27 15:41:11 +04:00
	 * More easily preempt -nice tasks, while not making it harder for
	 * +nice tasks.
2008-04-19 21:44:57 +04:00
	 */
2008-06-27 15:41:12 +04:00
	if (sched_feat(ASYM_GRAN))
		gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se);
	else
		gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se);
2008-04-19 21:44:57 +04:00
	return gran;
}
/*
 * Should 'se' preempt 'curr'.
 *
 *             |s1
 *        |s2
 *   |s3
 *         g
 *      |<--->|c
 *
 *  w(c, s1) = -1
 *  w(c, s2) =  0
 *  w(c, s3) =  1
 *
 */
static int
wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
{
	s64 gran, vdiff = curr->vruntime - se->vruntime;
	if (vdiff < 0)
		return -1;
	gran = wakeup_gran(curr);
	if (vdiff > gran)
		return 1;
	return 0;
}
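/*
 * Illustrative example: if the waking entity trails the current one by
 * 3ms of virtual time and wakeup_gran() evaluates to 5ms, then vdiff (3ms)
 * does not exceed gran, the function returns 0 and check_preempt_wakeup()
 * below leaves the current task running.
 */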
2008-01-25 23:08:09 +03:00
2008-04-19 21:44:59 +04:00
/* return depth at which a sched entity is present in the hierarchy */
static inline int depth_se(struct sched_entity *se)
{
	int depth = 0;
	for_each_sched_entity(se)
		depth++;
	return depth;
}
2007-07-09 20:51:58 +04:00
/*
 * Preempt the current task with a newly woken task if needed:
 */
2008-09-21 01:38:02 +04:00
static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
2007-07-09 20:51:58 +04:00
{
	struct task_struct *curr = rq->curr;
2007-10-15 19:00:12 +04:00
	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
2007-10-15 19:00:12 +04:00
	struct sched_entity *se = &curr->se, *pse = &p->se;
2008-04-19 21:44:59 +04:00
	int se_depth, pse_depth;
2007-07-09 20:51:58 +04:00
	if (unlikely(rt_prio(p->prio))) {
2007-08-09 13:16:47 +04:00
		update_rq_clock(rq);
2007-08-09 13:16:47 +04:00
		update_curr(cfs_rq);
2007-07-09 20:51:58 +04:00
		resched_task(curr);
		return;
	}
2008-03-14 23:12:12 +03:00
2008-03-19 03:42:00 +03:00
	if (unlikely(se == pse))
		return;
2008-08-28 13:12:49 +04:00
	/*
	 * We can come here with TIF_NEED_RESCHED already set from new task
	 * wake up path.
	 */
	if (test_tsk_need_resched(curr))
		return;
2008-03-14 23:12:12 +03:00
	cfs_rq_of(pse)->next = pse;
2007-10-15 19:00:18 +04:00
	/*
	 * Batch tasks do not preempt (their preemption is driven by
	 * the tick):
	 */
	if (unlikely(p->policy == SCHED_BATCH))
		return;
2007-07-09 20:51:58 +04:00
2007-11-10 00:39:39 +03:00
	if (!sched_feat(WAKEUP_PREEMPT))
		return;
2007-10-15 19:00:12 +04:00
2008-09-21 01:38:02 +04:00
	if (sched_feat(WAKEUP_OVERLAP) && sync &&
			se->avg_overlap < sysctl_sched_migration_cost &&
			pse->avg_overlap < sysctl_sched_migration_cost) {
		resched_task(curr);
		return;
	}
2008-04-19 21:44:59 +04:00
	/*
	 * The preemption test can only be made between sibling entities that
	 * are in the same cfs_rq, i.e. that have a common parent. Walk up the
	 * hierarchy of both tasks until we find their ancestors that are
	 * siblings of a common parent.
	 */
	/* First walk up until both entities are at the same depth */
	se_depth = depth_se(se);
	pse_depth = depth_se(pse);
	while (se_depth > pse_depth) {
		se_depth--;
		se = parent_entity(se);
	}
	while (pse_depth > se_depth) {
		pse_depth--;
		pse = parent_entity(pse);
	}
2007-11-10 00:39:39 +03:00
	while (!is_same_group(se, pse)) {
		se = parent_entity(se);
		pse = parent_entity(pse);
2007-10-15 19:00:14 +04:00
	}
2007-11-10 00:39:39 +03:00
2008-04-19 21:44:57 +04:00
	if (wakeup_preempt_entity(se, pse) == 1)
2007-11-10 00:39:39 +03:00
		resched_task(curr);
2007-07-09 20:51:58 +04:00
}
2007-08-09 13:16:48 +04:00
static struct task_struct *pick_next_task_fair(struct rq *rq)
2007-07-09 20:51:58 +04:00
{
2008-01-25 23:08:29 +03:00
	struct task_struct *p;
2007-07-09 20:51:58 +04:00
	struct cfs_rq *cfs_rq = &rq->cfs;
	struct sched_entity *se;
	if (unlikely(!cfs_rq->nr_running))
		return NULL;
	do {
2007-08-09 13:16:48 +04:00
		se = pick_next_entity(cfs_rq);
2007-07-09 20:51:58 +04:00
		cfs_rq = group_cfs_rq(se);
	} while (cfs_rq);
2008-01-25 23:08:29 +03:00
	p = task_of(se);
	hrtick_start_fair(rq, p);
	return p;
2007-07-09 20:51:58 +04:00
}
/*
 * Account for a descheduled task:
 */
2007-08-09 13:16:49 +04:00
static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
2007-07-09 20:51:58 +04:00
{
	struct sched_entity *se = &prev->se;
	struct cfs_rq *cfs_rq;
	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);
2007-08-09 13:16:48 +04:00
		put_prev_entity(cfs_rq, se);
2007-07-09 20:51:58 +04:00
	}
}
2007-10-24 20:23:51 +04:00
#ifdef CONFIG_SMP
2007-07-09 20:51:58 +04:00
/**************************************************
 * Fair scheduling class load-balancing methods:
 */
/*
 * Load-balancing iterator. Note: while the runqueue stays locked
 * during the whole iteration, the current task might be
 * dequeued so the iterator has to be dequeue-safe. Here we
 * achieve that by always pre-iterating before returning
 * the current task:
 */
2007-10-15 19:00:13 +04:00
static struct task_struct *
2008-04-19 21:45:00 +04:00
__load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next)
2007-07-09 20:51:58 +04:00
{
2008-04-19 21:44:59 +04:00
	struct task_struct *p = NULL;
	struct sched_entity *se;
2007-07-09 20:51:58 +04:00
2008-08-11 15:32:02 +04:00
	if (next == &cfs_rq->tasks)
		return NULL;
	/* Skip over entities that are not tasks */
	do {
2008-04-19 21:45:00 +04:00
		se = list_entry(next, struct sched_entity, group_node);
		next = next->next;
2008-08-11 15:32:02 +04:00
	} while (next != &cfs_rq->tasks && !entity_is_task(se));
2008-04-19 21:44:59 +04:00
2008-09-06 13:20:23 +04:00
	if (next == &cfs_rq->tasks && !entity_is_task(se))
2008-08-11 15:32:02 +04:00
		return NULL;
2008-04-19 21:45:00 +04:00
	cfs_rq->balance_iterator = next;
2008-08-11 15:32:02 +04:00
	if (entity_is_task(se))
		p = task_of(se);
2007-07-09 20:51:58 +04:00
	return p;
}
static struct task_struct *load_balance_start_fair(void *arg)
{
	struct cfs_rq *cfs_rq = arg;
2008-04-19 21:45:00 +04:00
	return __load_balance_iterator(cfs_rq, cfs_rq->tasks.next);
2007-07-09 20:51:58 +04:00
}
static struct task_struct *load_balance_next_fair(void *arg)
{
	struct cfs_rq *cfs_rq = arg;
2008-04-19 21:45:00 +04:00
	return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator);
2007-07-09 20:51:58 +04:00
}
2008-06-27 15:41:14 +04:00
static unsigned long
__load_balance_fair ( struct rq * this_rq , int this_cpu , struct rq * busiest ,
unsigned long max_load_move , struct sched_domain * sd ,
enum cpu_idle_type idle , int * all_pinned , int * this_best_prio ,
struct cfs_rq * cfs_rq )
2008-02-25 19:34:02 +03:00
{
2008-06-27 15:41:14 +04:00
struct rq_iterator cfs_rq_iterator ;
2008-02-25 19:34:02 +03:00
2008-06-27 15:41:14 +04:00
cfs_rq_iterator . start = load_balance_start_fair ;
cfs_rq_iterator . next = load_balance_next_fair ;
cfs_rq_iterator . arg = cfs_rq ;
2008-02-25 19:34:02 +03:00
2008-06-27 15:41:14 +04:00
return balance_tasks ( this_rq , this_cpu , busiest ,
max_load_move , sd , idle , all_pinned ,
this_best_prio , & cfs_rq_iterator ) ;
2008-02-25 19:34:02 +03:00
}
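/*
 * Illustrative sketch, not part of this file: how a {start, next, arg}
 * iterator such as cfs_rq_iterator above is typically consumed by a generic
 * walker.  The real balance_tasks() is defined elsewhere and also applies
 * load and priority limits; this stripped-down version (hypothetical names)
 * only shows the calling convention.
 */
struct item_iterator {
	void *(*start)(void *arg);
	void *(*next)(void *arg);
	void *arg;
};

/* Visit every item the iterator yields and count them. */
static int walk_items(struct item_iterator *it)
{
	int visited = 0;
	void *item;

	for (item = it->start(it->arg); item; item = it->next(it->arg))
		visited++;

	return visited;
}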
2008-06-27 15:41:14 +04:00
# ifdef CONFIG_FAIR_GROUP_SCHED
2007-08-09 13:16:46 +04:00
static unsigned long
2007-07-09 20:51:58 +04:00
load_balance_fair ( struct rq * this_rq , int this_cpu , struct rq * busiest ,
2007-10-24 20:23:51 +04:00
unsigned long max_load_move ,
2007-08-09 13:16:46 +04:00
struct sched_domain * sd , enum cpu_idle_type idle ,
int * all_pinned , int * this_best_prio )
2007-07-09 20:51:58 +04:00
{
long rem_load_move = max_load_move ;
2008-06-27 15:41:14 +04:00
int busiest_cpu = cpu_of ( busiest ) ;
struct task_group * tg ;
2008-04-19 21:45:00 +04:00
2008-06-27 15:41:14 +04:00
rcu_read_lock ( ) ;
2008-06-27 15:41:23 +04:00
update_h_load ( busiest_cpu ) ;
2008-04-19 21:45:00 +04:00
2008-09-22 21:06:09 +04:00
list_for_each_entry_rcu ( tg , & task_groups , list ) {
2008-06-27 15:41:23 +04:00
struct cfs_rq * busiest_cfs_rq = tg - > cfs_rq [ busiest_cpu ] ;
2008-06-27 15:41:29 +04:00
unsigned long busiest_h_load = busiest_cfs_rq - > h_load ;
unsigned long busiest_weight = busiest_cfs_rq - > load . weight ;
2008-06-27 15:41:36 +04:00
u64 rem_load , moved_load ;
2008-04-19 21:45:00 +04:00
2008-06-27 15:41:14 +04:00
/*
* empty group
*/
2008-06-27 15:41:23 +04:00
if ( ! busiest_cfs_rq - > task_weight )
2007-07-09 20:51:58 +04:00
continue ;
2008-06-27 15:41:36 +04:00
rem_load = ( u64 ) rem_load_move * busiest_weight ;
rem_load = div_u64 ( rem_load , busiest_h_load + 1 ) ;
2007-07-09 20:51:58 +04:00
2008-06-27 15:41:14 +04:00
moved_load = __load_balance_fair ( this_rq , this_cpu , busiest ,
2008-06-27 15:41:20 +04:00
rem_load , sd , idle , all_pinned , this_best_prio ,
2008-06-27 15:41:14 +04:00
tg - > cfs_rq [ busiest_cpu ] ) ;
2007-07-09 20:51:58 +04:00
2008-06-27 15:41:14 +04:00
if ( ! moved_load )
2007-07-09 20:51:58 +04:00
continue ;
2008-06-27 15:41:29 +04:00
moved_load * = busiest_h_load ;
2008-06-27 15:41:36 +04:00
moved_load = div_u64 ( moved_load , busiest_weight + 1 ) ;
2007-07-09 20:51:58 +04:00
2008-06-27 15:41:14 +04:00
rem_load_move - = moved_load ;
if ( rem_load_move < 0 )
2007-07-09 20:51:58 +04:00
break ;
}
2008-06-27 15:41:14 +04:00
rcu_read_unlock ( ) ;
2007-07-09 20:51:58 +04:00
2007-08-09 13:16:46 +04:00
return max_load_move - rem_load_move ;
2007-07-09 20:51:58 +04:00
}
2008-06-27 15:41:14 +04:00
# else
static unsigned long
load_balance_fair ( struct rq * this_rq , int this_cpu , struct rq * busiest ,
unsigned long max_load_move ,
struct sched_domain * sd , enum cpu_idle_type idle ,
int * all_pinned , int * this_best_prio )
{
return __load_balance_fair ( this_rq , this_cpu , busiest ,
max_load_move , sd , idle , all_pinned ,
this_best_prio , & busiest - > cfs ) ;
}
# endif
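/*
 * Illustrative sketch, not part of this file: the two div_u64() scalings in
 * the group-scheduling load_balance_fair() above, written with plain 64-bit
 * userspace arithmetic (hypothetical names).  A load budget expressed in
 * hierarchical h_load units is scaled down into the group's own weight units
 * before tasks are moved, and the amount actually moved is scaled back up so
 * it can be subtracted from the hierarchical budget.  The "+ 1" terms avoid
 * dividing by zero.
 */
#include <stdint.h>

static uint64_t load_to_group_units(uint64_t rem_load_move,
				    uint64_t group_weight,
				    uint64_t group_h_load)
{
	return rem_load_move * group_weight / (group_h_load + 1);
}

static uint64_t load_to_hierarchy_units(uint64_t moved_group_load,
					uint64_t group_weight,
					uint64_t group_h_load)
{
	return moved_group_load * group_h_load / (group_weight + 1);
}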
2007-07-09 20:51:58 +04:00
2007-10-24 20:23:51 +04:00
static int
move_one_task_fair ( struct rq * this_rq , int this_cpu , struct rq * busiest ,
struct sched_domain * sd , enum cpu_idle_type idle )
{
struct cfs_rq * busy_cfs_rq ;
struct rq_iterator cfs_rq_iterator ;
cfs_rq_iterator . start = load_balance_start_fair ;
cfs_rq_iterator . next = load_balance_next_fair ;
for_each_leaf_cfs_rq ( busiest , busy_cfs_rq ) {
/*
* pass busy_cfs_rq argument into
* load_balance_ [ start | next ] _fair iterators
*/
cfs_rq_iterator . arg = busy_cfs_rq ;
if ( iter_move_one_task ( this_rq , this_cpu , busiest , sd , idle ,
& cfs_rq_iterator ) )
return 1 ;
}
return 0 ;
}
2008-06-24 22:09:43 +04:00
# endif /* CONFIG_SMP */
2007-10-24 20:23:51 +04:00
2007-07-09 20:51:58 +04:00
/*
* scheduler tick hitting a task of our scheduling class :
*/
2008-01-25 23:08:29 +03:00
static void task_tick_fair ( struct rq * rq , struct task_struct * curr , int queued )
2007-07-09 20:51:58 +04:00
{
struct cfs_rq * cfs_rq ;
struct sched_entity * se = & curr - > se ;
for_each_sched_entity ( se ) {
cfs_rq = cfs_rq_of ( se ) ;
2008-01-25 23:08:29 +03:00
entity_tick ( cfs_rq , se , queued ) ;
2007-07-09 20:51:58 +04:00
}
}
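/*
 * Illustrative sketch, not part of this file: with group scheduling enabled,
 * for_each_sched_entity() walks from a task's own entity up through its
 * enclosing groups via a parent pointer, so every level of the hierarchy gets
 * ticked.  A standalone parent-chain walk (hypothetical names) looks like:
 */
struct entity { const char *name; struct entity *parent; };

static int tick_hierarchy(struct entity *se)
{
	int levels = 0;

	for (; se; se = se->parent)	/* task entity, then each enclosing group */
		levels++;		/* stand-in for entity_tick() at this level */

	return levels;
}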
2007-10-29 23:18:11 +03:00
# define swap(a, b) do { typeof(a) tmp = (a); (a) = (b); (b) = tmp; } while (0)
2007-10-15 19:00:04 +04:00
2007-07-09 20:51:58 +04:00
/*
* Share the fairness runtime between parent and child , so that the
* total amount of CPU pressure stays the same - new tasks
* get a chance to run but frequent forkers are not allowed to
* monopolize the CPU . Note : the parent runqueue is locked ,
* the child is not running yet .
*/
2007-08-09 13:16:49 +04:00
static void task_new_fair ( struct rq * rq , struct task_struct * p )
2007-07-09 20:51:58 +04:00
{
struct cfs_rq * cfs_rq = task_cfs_rq ( p ) ;
2007-10-15 19:00:03 +04:00
struct sched_entity * se = & p - > se , * curr = cfs_rq - > curr ;
2007-10-15 19:00:14 +04:00
int this_cpu = smp_processor_id ( ) ;
2007-07-09 20:51:58 +04:00
sched_info_queued ( p ) ;
2007-08-28 14:53:24 +04:00
update_curr ( cfs_rq ) ;
2007-10-15 19:00:05 +04:00
place_entity ( cfs_rq , se , 1 ) ;
2007-10-15 19:00:04 +04:00
2007-11-10 00:39:39 +03:00
/* 'curr' will be NULL if the child belongs to a different group */
2007-10-15 19:00:14 +04:00
if ( sysctl_sched_child_runs_first & & this_cpu = = task_cpu ( p ) & &
2007-11-10 00:39:39 +03:00
curr & & curr - > vruntime < se - > vruntime ) {
2007-10-15 19:00:08 +04:00
/*
2007-10-15 19:00:08 +04:00
* Upon rescheduling , sched_class : : put_prev_task ( ) will place
* ' current ' within the tree based on its new key value .
*/
2007-10-15 19:00:04 +04:00
swap ( curr - > vruntime , se - > vruntime ) ;
2008-08-28 13:12:49 +04:00
resched_task ( rq - > curr ) ;
2007-10-15 19:00:04 +04:00
}
2007-07-09 20:51:58 +04:00
2007-10-17 18:55:11 +04:00
enqueue_task_fair ( rq , p , 0 ) ;
2007-07-09 20:51:58 +04:00
}
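/*
 * Illustrative sketch, not part of this file: why swapping vruntimes above
 * makes the child run first.  CFS picks the runnable entity with the smallest
 * vruntime, so if the parent currently holds the smaller value, exchanging
 * the two hands that advantage to the child (hypothetical standalone code).
 */
#include <assert.h>

#define swap_u64(a, b) do { unsigned long long t = (a); (a) = (b); (b) = t; } while (0)

static void child_runs_first_demo(void)
{
	unsigned long long parent_vruntime = 1000;	/* parent queued earlier  */
	unsigned long long child_vruntime  = 1500;	/* child placed after it  */

	if (parent_vruntime < child_vruntime)
		swap_u64(parent_vruntime, child_vruntime);

	/* the child now owns the smaller vruntime and will be picked first */
	assert(child_vruntime < parent_vruntime);
}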
2008-01-25 23:08:22 +03:00
/*
* Priority of the task has changed . Check to see if we preempt
* the current task .
*/
static void prio_changed_fair ( struct rq * rq , struct task_struct * p ,
int oldprio , int running )
{
/*
* Reschedule if we are currently running on this runqueue and
* our priority decreased , or if we are not currently running on
* this runqueue and our priority is higher than the current ' s
*/
if ( running ) {
if ( p - > prio > oldprio )
resched_task ( rq - > curr ) ;
} else
2008-09-21 01:38:02 +04:00
check_preempt_curr ( rq , p , 0 ) ;
2008-01-25 23:08:22 +03:00
}
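/*
 * Illustrative sketch, not part of this file: the reschedule decision above
 * as a standalone predicate (hypothetical name).  Remember that a lower
 * numeric 'prio' means higher priority, so 'p->prio > oldprio' means the
 * task's priority just dropped.
 */
static int resched_directly(int running, int oldprio, int newprio)
{
	if (running)
		return newprio > oldprio;	/* we got weaker: let others run */
	/* not running: the code above defers to check_preempt_curr() instead */
	return 0;
}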
/*
* We switched to the sched_fair class .
*/
static void switched_to_fair ( struct rq * rq , struct task_struct * p ,
int running )
{
/*
* We were most likely switched from sched_rt , so
* kick off the schedule if running , otherwise just see
* if we can still preempt the current task .
*/
if ( running )
resched_task ( rq - > curr ) ;
else
2008-09-21 01:38:02 +04:00
check_preempt_curr ( rq , p , 0 ) ;
2008-01-25 23:08:22 +03:00
}
2007-10-15 19:00:08 +04:00
/*
 * Account for a task changing its policy or group .
 *
 * This routine is mostly called to set cfs_rq - > curr field when a task
 * migrates between groups / classes .
 */
static void set_curr_task_fair ( struct rq * rq )
{
struct sched_entity * se = & rq - > curr - > se ;
for_each_sched_entity ( se )
set_next_entity ( cfs_rq_of ( se ) , se ) ;
}
2008-02-29 23:21:01 +03:00
# ifdef CONFIG_FAIR_GROUP_SCHED
static void moved_group_fair ( struct task_struct * p )
{
struct cfs_rq * cfs_rq = task_cfs_rq ( p ) ;
update_curr ( cfs_rq ) ;
place_entity ( cfs_rq , & p - > se , 1 ) ;
}
# endif
2007-07-09 20:51:58 +04:00
/*
* All the scheduling class methods :
*/
2007-10-15 19:00:12 +04:00
static const struct sched_class fair_sched_class = {
. next = & idle_sched_class ,
2007-07-09 20:51:58 +04:00
. enqueue_task = enqueue_task_fair ,
. dequeue_task = dequeue_task_fair ,
. yield_task = yield_task_fair ,
2008-01-25 23:08:09 +03:00
# ifdef CONFIG_SMP
. select_task_rq = select_task_rq_fair ,
# endif /* CONFIG_SMP */
2007-07-09 20:51:58 +04:00
2007-10-15 19:00:05 +04:00
. check_preempt_curr = check_preempt_wakeup ,
2007-07-09 20:51:58 +04:00
. pick_next_task = pick_next_task_fair ,
. put_prev_task = put_prev_task_fair ,
2007-10-24 20:23:51 +04:00
# ifdef CONFIG_SMP
2007-07-09 20:51:58 +04:00
. load_balance = load_balance_fair ,
2007-10-24 20:23:51 +04:00
. move_one_task = move_one_task_fair ,
2007-10-24 20:23:51 +04:00
# endif
2007-07-09 20:51:58 +04:00
2007-10-15 19:00:08 +04:00
. set_curr_task = set_curr_task_fair ,
2007-07-09 20:51:58 +04:00
. task_tick = task_tick_fair ,
. task_new = task_new_fair ,
2008-01-25 23:08:22 +03:00
. prio_changed = prio_changed_fair ,
. switched_to = switched_to_fair ,
2008-02-29 23:21:01 +03:00
# ifdef CONFIG_FAIR_GROUP_SCHED
. moved_group = moved_group_fair ,
# endif
2007-07-09 20:51:58 +04:00
} ;
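/*
 * Illustrative sketch, not part of this file: the scheduling-class object
 * above is a table of function pointers, and the core scheduler dispatches
 * through it without knowing which policy it is talking to.  A stripped-down
 * standalone version of that pattern (hypothetical names):
 */
#include <stdio.h>

struct mini_sched_class {
	const char *name;
	void (*task_tick)(int cpu);
};

static void fair_tick(int cpu)
{
	printf("fair tick on cpu %d\n", cpu);
}

static const struct mini_sched_class mini_fair_class = {
	.name		= "fair",
	.task_tick	= fair_tick,
};

/* The core only ever sees the ops table. */
static void core_tick(const struct mini_sched_class *class, int cpu)
{
	class->task_tick(cpu);
}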
# ifdef CONFIG_SCHED_DEBUG
2007-08-09 13:16:47 +04:00
static void print_cfs_stats ( struct seq_file * m , int cpu )
2007-07-09 20:51:58 +04:00
{
struct cfs_rq * cfs_rq ;
2008-01-25 23:08:34 +03:00
rcu_read_lock ( ) ;
2007-08-09 13:16:51 +04:00
for_each_leaf_cfs_rq ( cpu_rq ( cpu ) , cfs_rq )
2007-08-09 13:16:47 +04:00
print_cfs_rq ( m , cpu , cfs_rq ) ;
2008-01-25 23:08:34 +03:00
rcu_read_unlock ( ) ;
2007-07-09 20:51:58 +04:00
}
# endif