2007-07-09 18:51:58 +02:00
/*
* Completely Fair Scheduling ( CFS ) Class ( SCHED_NORMAL / SCHED_BATCH )
*
* Copyright ( C ) 2007 Red Hat , Inc . , Ingo Molnar < mingo @ redhat . com >
*
* Interactivity improvements by Mike Galbraith
* ( C ) 2007 Mike Galbraith < efault @ gmx . de >
*
* Various enhancements by Dmitry Adamushko .
* ( C ) 2007 Dmitry Adamushko < dmitry . adamushko @ gmail . com >
*
* Group scheduling enhancements by Srivatsa Vaddagiri
* Copyright IBM Corporation , 2007
* Author : Srivatsa Vaddagiri < vatsa @ linux . vnet . ibm . com >
*
* Scaled math optimizations by Thomas Gleixner
* Copyright ( C ) 2007 , Thomas Gleixner < tglx @ linutronix . de >
2007-08-25 18:41:53 +02:00
*
* Adaptive scheduling granularity , math enhancements by Peter Zijlstra
* Copyright ( C ) 2007 Red Hat , Inc . , Peter Zijlstra < pzijlstr @ redhat . com >
2007-07-09 18:51:58 +02:00
*/
/*
2007-08-25 18:41:53 +02:00
* Targeted preemption latency for CPU - bound tasks :
2007-11-09 22:39:38 +01:00
* ( default : 20 ms * ilog ( ncpus ) , units : nanoseconds )
2007-07-09 18:51:58 +02:00
*
2007-08-25 18:41:53 +02:00
* NOTE : this latency value is not the same as the concept of
2007-10-15 17:00:14 +02:00
* ' timeslice length ' - timeslices in CFS are of variable length
* and have no persistent notion like in traditional , time - slice
* based scheduling concepts .
2007-07-09 18:51:58 +02:00
*
2007-10-15 17:00:14 +02:00
* ( to see the precise effective timeslice length of your workload ,
* run vmstat and monitor the context - switches ( cs ) field )
2007-07-09 18:51:58 +02:00
*/
2007-11-09 22:39:38 +01:00
unsigned int sysctl_sched_latency = 20000000ULL ;
2007-10-15 17:00:02 +02:00
/*
2007-11-09 22:39:37 +01:00
* Minimal preemption granularity for CPU - bound tasks :
2007-11-09 22:39:38 +01:00
* ( default : 1 msec * ilog ( ncpus ) , units : nanoseconds )
2007-10-15 17:00:02 +02:00
*/
2007-11-09 22:39:38 +01:00
unsigned int sysctl_sched_min_granularity = 1000000ULL ;
2007-08-25 18:41:53 +02:00
/*
2007-11-09 22:39:37 +01:00
* is kept at sysctl_sched_latency / sysctl_sched_min_granularity
*/
2007-11-09 22:39:38 +01:00
unsigned int sched_nr_latency = 20 ;
2007-11-09 22:39:37 +01:00
/*
* After fork , child runs first . ( default ) If set to 0 then
* parent will ( try to ) run first .
2007-08-25 18:41:53 +02:00
*/
2007-11-09 22:39:37 +01:00
const_debug unsigned int sysctl_sched_child_runs_first = 1 ;
2007-07-09 18:51:58 +02:00
2007-09-19 23:34:46 +02:00
/*
* sys_sched_yield ( ) compat mode
*
* This option switches the agressive yield implementation of the
* old scheduler back on .
*/
unsigned int __read_mostly sysctl_sched_compat_yield ;
2007-07-09 18:51:58 +02:00
/*
* SCHED_BATCH wake - up granularity .
2007-11-09 22:39:38 +01:00
* ( default : 10 msec * ilog ( ncpus ) , units : nanoseconds )
2007-07-09 18:51:58 +02:00
*
* This option delays the preemption effects of decoupled workloads
* and reduces their over - scheduling . Synchronous workloads will still
* have immediate wakeup / sleep latencies .
*/
2007-11-09 22:39:38 +01:00
unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL ;
2007-07-09 18:51:58 +02:00
/*
* SCHED_OTHER wake - up granularity .
2007-11-09 22:39:38 +01:00
* ( default : 10 msec * ilog ( ncpus ) , units : nanoseconds )
2007-07-09 18:51:58 +02:00
*
* This option delays the preemption effects of decoupled workloads
* and reduces their over - scheduling . Synchronous workloads will still
* have immediate wakeup / sleep latencies .
*/
2007-11-09 22:39:38 +01:00
unsigned int sysctl_sched_wakeup_granularity = 10000000UL ;
2007-07-09 18:51:58 +02:00
2007-10-15 17:00:18 +02:00
const_debug unsigned int sysctl_sched_migration_cost = 500000UL ;
2007-07-09 18:51:58 +02:00
/**************************************************************
* CFS operations on generic schedulable entities :
*/
2007-10-15 17:00:03 +02:00
# ifdef CONFIG_FAIR_GROUP_SCHED
2007-07-09 18:51:58 +02:00
2007-10-15 17:00:03 +02:00
/* cpu runqueue to which this cfs_rq is attached */
2007-07-09 18:51:58 +02:00
static inline struct rq * rq_of ( struct cfs_rq * cfs_rq )
{
2007-10-15 17:00:03 +02:00
return cfs_rq - > rq ;
2007-07-09 18:51:58 +02:00
}
2007-10-15 17:00:03 +02:00
/*
 * An entity is a task if it doesn't "own" a runqueue (its my_q is NULL).
 * The argument is parenthesized so that any pointer-valued expression
 * (e.g. &entity or ptr + 1) can be passed safely.
 */
#define entity_is_task(se)	(!(se)->my_q)
2007-07-09 18:51:58 +02:00
2007-10-15 17:00:03 +02:00
# else /* CONFIG_FAIR_GROUP_SCHED */
2007-07-09 18:51:58 +02:00
2007-10-15 17:00:03 +02:00
/*
 * Without group scheduling the cfs_rq is the 'cfs' member embedded in
 * struct rq, so the runqueue is recovered from the member address.
 */
static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
{
	struct rq *enclosing = container_of(cfs_rq, struct rq, cfs);

	return enclosing;
}
/* Without CONFIG_FAIR_GROUP_SCHED every entity is reported as a task. */
# define entity_is_task(se) 1
# endif /* CONFIG_FAIR_GROUP_SCHED */
/*
 * Map a scheduling entity back to the task_struct that embeds it as
 * its 'se' member. Only meaningful when the entity really is a task.
 */
static inline struct task_struct *task_of(struct sched_entity *se)
{
	struct task_struct *owner = container_of(se, struct task_struct, se);

	return owner;
}
/**************************************************************
* Scheduling class tree data structure manipulation methods :
*/
2007-10-15 17:00:14 +02:00
/*
 * Return whichever of the two vruntime values is later. The comparison
 * is done on the signed difference, so it stays correct when the u64
 * counters wrap around.
 */
static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime)
{
	s64 ahead = (s64)(vruntime - min_vruntime);

	return ahead > 0 ? vruntime : min_vruntime;
}
2007-10-15 17:00:14 +02:00
/*
 * Return whichever of the two vruntime values is earlier, using the
 * signed difference so that u64 wraparound is handled.
 */
static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
{
	s64 ahead = (s64)(vruntime - min_vruntime);

	return ahead < 0 ? vruntime : min_vruntime;
}
2007-10-15 17:00:14 +02:00
static inline s64 entity_key ( struct cfs_rq * cfs_rq , struct sched_entity * se )
2007-10-15 17:00:05 +02:00
{
2007-10-15 17:00:07 +02:00
return se - > vruntime - cfs_rq - > min_vruntime ;
2007-10-15 17:00:05 +02:00
}
2007-07-09 18:51:58 +02:00
/*
* Enqueue an entity into the rb - tree :
*/
2007-10-15 17:00:14 +02:00
static void __enqueue_entity ( struct cfs_rq * cfs_rq , struct sched_entity * se )
2007-07-09 18:51:58 +02:00
{
struct rb_node * * link = & cfs_rq - > tasks_timeline . rb_node ;
struct rb_node * parent = NULL ;
struct sched_entity * entry ;
2007-10-15 17:00:05 +02:00
s64 key = entity_key ( cfs_rq , se ) ;
2007-07-09 18:51:58 +02:00
int leftmost = 1 ;
/*
* Find the right place in the rbtree :
*/
while ( * link ) {
parent = * link ;
entry = rb_entry ( parent , struct sched_entity , run_node ) ;
/*
* We dont care about collisions . Nodes with
* the same key stay together .
*/
2007-10-15 17:00:05 +02:00
if ( key < entity_key ( cfs_rq , entry ) ) {
2007-07-09 18:51:58 +02:00
link = & parent - > rb_left ;
} else {
link = & parent - > rb_right ;
leftmost = 0 ;
}
}
/*
* Maintain a cache of leftmost tree entries ( it is frequently
* used ) :
*/
if ( leftmost )
2007-10-15 17:00:11 +02:00
cfs_rq - > rb_leftmost = & se - > run_node ;
2007-07-09 18:51:58 +02:00
rb_link_node ( & se - > run_node , parent , link ) ;
rb_insert_color ( & se - > run_node , & cfs_rq - > tasks_timeline ) ;
}
2007-10-15 17:00:14 +02:00
static void __dequeue_entity ( struct cfs_rq * cfs_rq , struct sched_entity * se )
2007-07-09 18:51:58 +02:00
{
if ( cfs_rq - > rb_leftmost = = & se - > run_node )
2007-10-15 17:00:11 +02:00
cfs_rq - > rb_leftmost = rb_next ( & se - > run_node ) ;
2007-10-15 17:00:04 +02:00
2007-07-09 18:51:58 +02:00
rb_erase ( & se - > run_node , & cfs_rq - > tasks_timeline ) ;
}
static inline struct rb_node * first_fair ( struct cfs_rq * cfs_rq )
{
return cfs_rq - > rb_leftmost ;
}
/*
 * Entity owning the leftmost tree node. No NULL check is done here:
 * callers in this file test first_fair() (or nr_running) beforehand.
 */
static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
{
	struct rb_node *leftmost = first_fair(cfs_rq);

	return rb_entry(leftmost, struct sched_entity, run_node);
}
2007-10-15 17:00:05 +02:00
static inline struct sched_entity * __pick_last_entity ( struct cfs_rq * cfs_rq )
{
struct rb_node * * link = & cfs_rq - > tasks_timeline . rb_node ;
struct sched_entity * se = NULL ;
struct rb_node * parent ;
while ( * link ) {
parent = * link ;
se = rb_entry ( parent , struct sched_entity , run_node ) ;
link = & parent - > rb_right ;
}
return se ;
}
2007-07-09 18:51:58 +02:00
/**************************************************************
* Scheduling class statistics methods :
*/
2007-11-09 22:39:37 +01:00
# ifdef CONFIG_SCHED_DEBUG
/*
 * sysctl handler: let proc_dointvec_minmax() process the request and,
 * after a successful write, recompute sched_nr_latency as
 * ceil(sysctl_sched_latency / sysctl_sched_min_granularity).
 *
 * Returns the proc_dointvec_minmax() result (0 on success).
 */
int sched_nr_latency_handler(struct ctl_table *table, int write,
		struct file *filp, void __user *buffer, size_t *lenp,
		loff_t *ppos)
{
	int err = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);

	/* Reads and failed writes leave the derived value untouched. */
	if (!err && write)
		sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
					sysctl_sched_min_granularity);

	return err;
}
# endif
2007-10-15 17:00:13 +02:00
/*
* The idea is to set a period in which each task runs once .
*
* When there are too many tasks ( sysctl_sched_nr_latency ) we have to stretch
* this period because otherwise the slices get too small .
*
* p = ( nr < = nl ) ? l : l * nr / nl
*/
2007-10-15 17:00:04 +02:00
static u64 __sched_period ( unsigned long nr_running )
{
u64 period = sysctl_sched_latency ;
2007-11-09 22:39:37 +01:00
unsigned long nr_latency = sched_nr_latency ;
2007-10-15 17:00:04 +02:00
if ( unlikely ( nr_running > nr_latency ) ) {
period * = nr_running ;
do_div ( period , nr_latency ) ;
}
return period ;
}
2007-10-15 17:00:13 +02:00
/*
* We calculate the wall - time slice from the period by taking a part
* proportional to the weight .
*
* s = p * w / rw
*/
2007-10-15 17:00:05 +02:00
static u64 sched_slice ( struct cfs_rq * cfs_rq , struct sched_entity * se )
2007-08-25 18:41:53 +02:00
{
2007-10-15 17:00:13 +02:00
u64 slice = __sched_period ( cfs_rq - > nr_running ) ;
2007-08-25 18:41:53 +02:00
2007-10-15 17:00:13 +02:00
slice * = se - > load . weight ;
do_div ( slice , cfs_rq - > load . weight ) ;
2007-08-25 18:41:53 +02:00
2007-10-15 17:00:13 +02:00
return slice ;
2007-07-09 18:51:58 +02:00
}
2007-10-15 17:00:13 +02:00
/*
* We calculate the vruntime slice .
*
* vs = s / w = p / rw
*/
static u64 __sched_vslice ( unsigned long rq_weight , unsigned long nr_running )
2007-10-15 17:00:10 +02:00
{
2007-10-15 17:00:13 +02:00
u64 vslice = __sched_period ( nr_running ) ;
2007-10-15 17:00:10 +02:00
2007-11-09 22:39:37 +01:00
vslice * = NICE_0_LOAD ;
2007-10-15 17:00:13 +02:00
do_div ( vslice , rq_weight ) ;
2007-10-15 17:00:10 +02:00
2007-10-15 17:00:13 +02:00
return vslice ;
}
2007-10-15 17:00:12 +02:00
2007-10-15 17:00:13 +02:00
static u64 sched_vslice ( struct cfs_rq * cfs_rq )
{
return __sched_vslice ( cfs_rq - > load . weight , cfs_rq - > nr_running ) ;
}
static u64 sched_vslice_add ( struct cfs_rq * cfs_rq , struct sched_entity * se )
{
return __sched_vslice ( cfs_rq - > load . weight + se - > load . weight ,
cfs_rq - > nr_running + 1 ) ;
2007-10-15 17:00:10 +02:00
}
2007-07-09 18:51:58 +02:00
/*
* Update the current task ' s runtime statistics . Skip current tasks that
* are not in our scheduling class .
*/
static inline void
2007-10-15 17:00:03 +02:00
__update_curr ( struct cfs_rq * cfs_rq , struct sched_entity * curr ,
unsigned long delta_exec )
2007-07-09 18:51:58 +02:00
{
2007-10-15 17:00:06 +02:00
unsigned long delta_exec_weighted ;
2007-10-15 17:00:12 +02:00
u64 vruntime ;
2007-07-09 18:51:58 +02:00
2007-08-02 17:41:40 +02:00
schedstat_set ( curr - > exec_max , max ( ( u64 ) delta_exec , curr - > exec_max ) ) ;
2007-07-09 18:51:58 +02:00
curr - > sum_exec_runtime + = delta_exec ;
2007-10-15 17:00:06 +02:00
schedstat_add ( cfs_rq , exec_clock , delta_exec ) ;
2007-10-15 17:00:04 +02:00
delta_exec_weighted = delta_exec ;
if ( unlikely ( curr - > load . weight ! = NICE_0_LOAD ) ) {
delta_exec_weighted = calc_delta_fair ( delta_exec_weighted ,
& curr - > load ) ;
}
curr - > vruntime + = delta_exec_weighted ;
2007-10-15 17:00:07 +02:00
/*
* maintain cfs_rq - > min_vruntime to be a monotonic increasing
* value tracking the leftmost vruntime in the tree .
*/
if ( first_fair ( cfs_rq ) ) {
2007-10-15 17:00:12 +02:00
vruntime = min_vruntime ( curr - > vruntime ,
__pick_next_entity ( cfs_rq ) - > vruntime ) ;
2007-10-15 17:00:07 +02:00
} else
2007-10-15 17:00:12 +02:00
vruntime = curr - > vruntime ;
2007-10-15 17:00:07 +02:00
cfs_rq - > min_vruntime =
2007-10-15 17:00:12 +02:00
max_vruntime ( cfs_rq - > min_vruntime , vruntime ) ;
2007-07-09 18:51:58 +02:00
}
2007-08-09 11:16:47 +02:00
static void update_curr ( struct cfs_rq * cfs_rq )
2007-07-09 18:51:58 +02:00
{
2007-10-15 17:00:03 +02:00
struct sched_entity * curr = cfs_rq - > curr ;
2007-10-15 17:00:03 +02:00
u64 now = rq_of ( cfs_rq ) - > clock ;
2007-07-09 18:51:58 +02:00
unsigned long delta_exec ;
if ( unlikely ( ! curr ) )
return ;
/*
* Get the amount of time the current task was running
* since the last time we changed load ( this cannot
* overflow on 32 bits ) :
*/
2007-10-15 17:00:03 +02:00
delta_exec = ( unsigned long ) ( now - curr - > exec_start ) ;
2007-07-09 18:51:58 +02:00
2007-10-15 17:00:03 +02:00
__update_curr ( cfs_rq , curr , delta_exec ) ;
curr - > exec_start = now ;
2007-07-09 18:51:58 +02:00
}
static inline void
2007-08-09 11:16:47 +02:00
update_stats_wait_start ( struct cfs_rq * cfs_rq , struct sched_entity * se )
2007-07-09 18:51:58 +02:00
{
2007-08-09 11:16:47 +02:00
schedstat_set ( se - > wait_start , rq_of ( cfs_rq ) - > clock ) ;
2007-07-09 18:51:58 +02:00
}
/*
* Task is being enqueued - update stats :
*/
2007-08-09 11:16:47 +02:00
static void update_stats_enqueue ( struct cfs_rq * cfs_rq , struct sched_entity * se )
2007-07-09 18:51:58 +02:00
{
/*
* Are we enqueueing a waiting task ? ( for current tasks
* a dequeue / enqueue event is a NOP )
*/
2007-10-15 17:00:03 +02:00
if ( se ! = cfs_rq - > curr )
2007-08-09 11:16:47 +02:00
update_stats_wait_start ( cfs_rq , se ) ;
2007-07-09 18:51:58 +02:00
}
static void
2007-08-09 11:16:47 +02:00
update_stats_wait_end ( struct cfs_rq * cfs_rq , struct sched_entity * se )
2007-07-09 18:51:58 +02:00
{
2007-10-15 17:00:06 +02:00
schedstat_set ( se - > wait_max , max ( se - > wait_max ,
rq_of ( cfs_rq ) - > clock - se - > wait_start ) ) ;
2007-08-02 17:41:40 +02:00
schedstat_set ( se - > wait_start , 0 ) ;
2007-07-09 18:51:58 +02:00
}
static inline void
2007-08-09 11:16:48 +02:00
update_stats_dequeue ( struct cfs_rq * cfs_rq , struct sched_entity * se )
2007-07-09 18:51:58 +02:00
{
/*
* Mark the end of the wait period if dequeueing a
* waiting task :
*/
2007-10-15 17:00:03 +02:00
if ( se ! = cfs_rq - > curr )
2007-08-09 11:16:47 +02:00
update_stats_wait_end ( cfs_rq , se ) ;
2007-07-09 18:51:58 +02:00
}
/*
* We are picking a new current task - update its stats :
*/
static inline void
2007-08-09 11:16:47 +02:00
update_stats_curr_start ( struct cfs_rq * cfs_rq , struct sched_entity * se )
2007-07-09 18:51:58 +02:00
{
/*
* We are starting a new run period :
*/
2007-08-09 11:16:47 +02:00
se - > exec_start = rq_of ( cfs_rq ) - > clock ;
2007-07-09 18:51:58 +02:00
}
/**************************************************
* Scheduling class queueing methods :
*/
2007-10-15 17:00:07 +02:00
static void
account_entity_enqueue ( struct cfs_rq * cfs_rq , struct sched_entity * se )
{
update_load_add ( & cfs_rq - > load , se - > load . weight ) ;
cfs_rq - > nr_running + + ;
se - > on_rq = 1 ;
}
static void
account_entity_dequeue ( struct cfs_rq * cfs_rq , struct sched_entity * se )
{
update_load_sub ( & cfs_rq - > load , se - > load . weight ) ;
cfs_rq - > nr_running - - ;
se - > on_rq = 0 ;
}
2007-08-09 11:16:48 +02:00
/*
 * Schedstats only: a sleeper is coming back. Close any open sleep or
 * block interval, tracking its maximum and adding it to the total
 * sleep runtime. Compiles to an empty function without
 * CONFIG_SCHEDSTATS.
 */
static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHEDSTATS
	if (se->sleep_start) {
		u64 slept = rq_of(cfs_rq)->clock - se->sleep_start;

		/* A clock that appears to run backwards counts as zero. */
		if ((s64)slept < 0)
			slept = 0;

		if (unlikely(slept > se->sleep_max))
			se->sleep_max = slept;

		se->sum_sleep_runtime += slept;
		se->sleep_start = 0;
	}

	if (se->block_start) {
		u64 blocked = rq_of(cfs_rq)->clock - se->block_start;

		if ((s64)blocked < 0)
			blocked = 0;

		if (unlikely(blocked > se->block_max))
			se->block_max = blocked;

		se->sum_sleep_runtime += blocked;
		se->block_start = 0;

		/*
		 * Blocking time is in units of nanosecs, so shift by 20 to
		 * get a milliseconds-range estimation of the amount of
		 * time that the task spent sleeping.
		 *
		 * NOTE(review): task_of() is applied without an
		 * entity_is_task() check - confirm callers only reach this
		 * with task entities when sleep profiling is on.
		 */
		if (unlikely(prof_on == SLEEP_PROFILING)) {
			struct task_struct *sleeper = task_of(se);

			profile_hits(SLEEP_PROFILING,
				     (void *)get_wchan(sleeper),
				     blocked >> 20);
		}
	}
#endif
}
2007-10-15 17:00:10 +02:00
/*
 * Debug aid: count the cases where an entity's vruntime is more than
 * three latency periods away from the queue's min_vruntime, in either
 * direction. Empty without CONFIG_SCHED_DEBUG.
 */
static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHED_DEBUG
	s64 spread = se->vruntime - cfs_rq->min_vruntime;

	/* Only the magnitude of the distance matters. */
	if (spread < 0)
		spread = -spread;

	if (spread > 3 * sysctl_sched_latency)
		schedstat_inc(cfs_rq, nr_spread_over);
#endif
}
2007-10-15 17:00:05 +02:00
static void
place_entity ( struct cfs_rq * cfs_rq , struct sched_entity * se , int initial )
{
2007-10-15 17:00:10 +02:00
u64 vruntime ;
2007-10-15 17:00:05 +02:00
2007-10-15 17:00:10 +02:00
vruntime = cfs_rq - > min_vruntime ;
2007-10-15 17:00:05 +02:00
2007-10-15 17:00:13 +02:00
if ( sched_feat ( TREE_AVG ) ) {
2007-10-15 17:00:05 +02:00
struct sched_entity * last = __pick_last_entity ( cfs_rq ) ;
if ( last ) {
2007-10-15 17:00:10 +02:00
vruntime + = last - > vruntime ;
vruntime > > = 1 ;
2007-10-15 17:00:05 +02:00
}
2007-10-15 17:00:10 +02:00
} else if ( sched_feat ( APPROX_AVG ) & & cfs_rq - > nr_running )
2007-10-15 17:00:13 +02:00
vruntime + = sched_vslice ( cfs_rq ) / 2 ;
2007-10-15 17:00:05 +02:00
2007-11-09 22:39:37 +01:00
/*
* The ' current ' period is already promised to the current tasks ,
* however the extra weight of the new task will slow them down a
* little , place the new task so that it fits in the slot that
* stays open at the end .
*/
2007-10-15 17:00:05 +02:00
if ( initial & & sched_feat ( START_DEBIT ) )
2007-10-15 17:00:13 +02:00
vruntime + = sched_vslice_add ( cfs_rq , se ) ;
2007-10-15 17:00:05 +02:00
2007-10-15 17:00:11 +02:00
if ( ! initial ) {
2007-11-09 22:39:37 +01:00
/* sleeps upto a single latency don't count. */
2007-10-15 17:00:14 +02:00
if ( sched_feat ( NEW_FAIR_SLEEPERS ) & & entity_is_task ( se ) & &
task_of ( se ) - > policy ! = SCHED_BATCH )
2007-10-15 17:00:11 +02:00
vruntime - = sysctl_sched_latency ;
2007-11-09 22:39:37 +01:00
/* ensure we never gain time by being placed backwards. */
vruntime = max_vruntime ( se - > vruntime , vruntime ) ;
2007-10-15 17:00:05 +02:00
}
2007-10-15 17:00:10 +02:00
se - > vruntime = vruntime ;
2007-10-15 17:00:05 +02:00
}
2007-07-09 18:51:58 +02:00
static void
2007-10-15 17:00:08 +02:00
enqueue_entity ( struct cfs_rq * cfs_rq , struct sched_entity * se , int wakeup )
2007-07-09 18:51:58 +02:00
{
/*
2007-10-15 17:00:13 +02:00
* Update run - time statistics of the ' current ' .
2007-07-09 18:51:58 +02:00
*/
2007-08-09 11:16:47 +02:00
update_curr ( cfs_rq ) ;
2007-07-09 18:51:58 +02:00
2007-10-15 17:00:04 +02:00
if ( wakeup ) {
2007-10-15 17:00:05 +02:00
place_entity ( cfs_rq , se , 0 ) ;
2007-08-09 11:16:48 +02:00
enqueue_sleeper ( cfs_rq , se ) ;
2007-10-15 17:00:04 +02:00
}
2007-07-09 18:51:58 +02:00
2007-08-09 11:16:47 +02:00
update_stats_enqueue ( cfs_rq , se ) ;
2007-10-15 17:00:10 +02:00
check_spread ( cfs_rq , se ) ;
2007-10-15 17:00:08 +02:00
if ( se ! = cfs_rq - > curr )
__enqueue_entity ( cfs_rq , se ) ;
2007-10-15 17:00:07 +02:00
account_entity_enqueue ( cfs_rq , se ) ;
2007-07-09 18:51:58 +02:00
}
static void
2007-08-09 11:16:48 +02:00
dequeue_entity ( struct cfs_rq * cfs_rq , struct sched_entity * se , int sleep )
2007-07-09 18:51:58 +02:00
{
2007-10-15 17:00:13 +02:00
/*
* Update run - time statistics of the ' current ' .
*/
update_curr ( cfs_rq ) ;
2007-08-09 11:16:48 +02:00
update_stats_dequeue ( cfs_rq , se ) ;
2007-10-15 17:00:06 +02:00
if ( sleep ) {
2007-10-15 17:00:10 +02:00
# ifdef CONFIG_SCHEDSTATS
2007-07-09 18:51:58 +02:00
if ( entity_is_task ( se ) ) {
struct task_struct * tsk = task_of ( se ) ;
if ( tsk - > state & TASK_INTERRUPTIBLE )
2007-08-09 11:16:47 +02:00
se - > sleep_start = rq_of ( cfs_rq ) - > clock ;
2007-07-09 18:51:58 +02:00
if ( tsk - > state & TASK_UNINTERRUPTIBLE )
2007-08-09 11:16:47 +02:00
se - > block_start = rq_of ( cfs_rq ) - > clock ;
2007-07-09 18:51:58 +02:00
}
2007-10-15 17:00:06 +02:00
# endif
2007-10-15 17:00:10 +02:00
}
2007-10-15 17:00:08 +02:00
if ( se ! = cfs_rq - > curr )
2007-10-15 17:00:07 +02:00
__dequeue_entity ( cfs_rq , se ) ;
account_entity_dequeue ( cfs_rq , se ) ;
2007-07-09 18:51:58 +02:00
}
/*
* Preempt the current task with a newly woken task if needed :
*/
2007-09-05 14:32:49 +02:00
static void
2007-10-15 17:00:05 +02:00
check_preempt_tick ( struct cfs_rq * cfs_rq , struct sched_entity * curr )
2007-07-09 18:51:58 +02:00
{
2007-09-05 14:32:49 +02:00
unsigned long ideal_runtime , delta_exec ;
2007-10-15 17:00:05 +02:00
ideal_runtime = sched_slice ( cfs_rq , curr ) ;
2007-09-05 14:32:49 +02:00
delta_exec = curr - > sum_exec_runtime - curr - > prev_sum_exec_runtime ;
2007-11-09 22:39:39 +01:00
if ( delta_exec > ideal_runtime )
2007-07-09 18:51:58 +02:00
resched_task ( rq_of ( cfs_rq ) - > curr ) ;
}
2007-10-15 17:00:08 +02:00
static void
2007-08-09 11:16:48 +02:00
set_next_entity ( struct cfs_rq * cfs_rq , struct sched_entity * se )
2007-07-09 18:51:58 +02:00
{
2007-10-15 17:00:08 +02:00
/* 'current' is not kept within the tree. */
if ( se - > on_rq ) {
/*
* Any task has to be enqueued before it get to execute on
* a CPU . So account for the time it spent waiting on the
* runqueue .
*/
update_stats_wait_end ( cfs_rq , se ) ;
__dequeue_entity ( cfs_rq , se ) ;
}
2007-08-09 11:16:47 +02:00
update_stats_curr_start ( cfs_rq , se ) ;
2007-10-15 17:00:03 +02:00
cfs_rq - > curr = se ;
2007-10-15 17:00:02 +02:00
# ifdef CONFIG_SCHEDSTATS
/*
* Track our maximum slice length , if the CPU ' s load is at
* least twice that of our own weight ( i . e . dont track it
* when there are only lesser - weight tasks around ) :
*/
2007-10-15 17:00:06 +02:00
if ( rq_of ( cfs_rq ) - > load . weight > = 2 * se - > load . weight ) {
2007-10-15 17:00:02 +02:00
se - > slice_max = max ( se - > slice_max ,
se - > sum_exec_runtime - se - > prev_sum_exec_runtime ) ;
}
# endif
2007-09-05 14:32:49 +02:00
se - > prev_sum_exec_runtime = se - > sum_exec_runtime ;
2007-07-09 18:51:58 +02:00
}
2007-08-09 11:16:48 +02:00
static struct sched_entity * pick_next_entity ( struct cfs_rq * cfs_rq )
2007-07-09 18:51:58 +02:00
{
2007-10-15 17:00:13 +02:00
struct sched_entity * se = NULL ;
2007-07-09 18:51:58 +02:00
2007-10-15 17:00:13 +02:00
if ( first_fair ( cfs_rq ) ) {
se = __pick_next_entity ( cfs_rq ) ;
set_next_entity ( cfs_rq , se ) ;
}
2007-07-09 18:51:58 +02:00
return se ;
}
2007-08-09 11:16:48 +02:00
static void put_prev_entity ( struct cfs_rq * cfs_rq , struct sched_entity * prev )
2007-07-09 18:51:58 +02:00
{
/*
* If still on the runqueue then deactivate_task ( )
* was not called and update_curr ( ) has to be done :
*/
if ( prev - > on_rq )
2007-08-09 11:16:47 +02:00
update_curr ( cfs_rq ) ;
2007-07-09 18:51:58 +02:00
2007-10-15 17:00:10 +02:00
check_spread ( cfs_rq , prev ) ;
2007-10-15 17:00:07 +02:00
if ( prev - > on_rq ) {
2007-08-09 11:16:47 +02:00
update_stats_wait_start ( cfs_rq , prev ) ;
2007-10-15 17:00:07 +02:00
/* Put 'current' back into the tree. */
__enqueue_entity ( cfs_rq , prev ) ;
}
2007-10-15 17:00:03 +02:00
cfs_rq - > curr = NULL ;
2007-07-09 18:51:58 +02:00
}
static void entity_tick ( struct cfs_rq * cfs_rq , struct sched_entity * curr )
{
/*
2007-10-15 17:00:07 +02:00
* Update run - time statistics of the ' current ' .
2007-07-09 18:51:58 +02:00
*/
2007-10-15 17:00:07 +02:00
update_curr ( cfs_rq ) ;
2007-07-09 18:51:58 +02:00
2007-10-15 17:00:14 +02:00
if ( cfs_rq - > nr_running > 1 | | ! sched_feat ( WAKEUP_PREEMPT ) )
2007-10-15 17:00:05 +02:00
check_preempt_tick ( cfs_rq , curr ) ;
2007-07-09 18:51:58 +02:00
}
/**************************************************
* CFS operations on tasks :
*/
# ifdef CONFIG_FAIR_GROUP_SCHED
/* Walk up scheduling entities hierarchy */
# define for_each_sched_entity(se) \
for ( ; se ; se = se - > parent )
static inline struct cfs_rq * task_cfs_rq ( struct task_struct * p )
{
return p - > se . cfs_rq ;
}
/* runqueue on which this entity is (to be) queued */
static inline struct cfs_rq * cfs_rq_of ( struct sched_entity * se )
{
return se - > cfs_rq ;
}
/* runqueue "owned" by this group */
static inline struct cfs_rq * group_cfs_rq ( struct sched_entity * grp )
{
return grp - > my_q ;
}
/* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on
* another cpu ( ' this_cpu ' )
*/
static inline struct cfs_rq * cpu_cfs_rq ( struct cfs_rq * cfs_rq , int this_cpu )
{
2007-10-15 17:00:07 +02:00
return cfs_rq - > tg - > cfs_rq [ this_cpu ] ;
2007-07-09 18:51:58 +02:00
}
/* Iterate thr' all leaf cfs_rq's on a runqueue */
# define for_each_leaf_cfs_rq(rq, cfs_rq) \
list_for_each_entry ( cfs_rq , & rq - > leaf_cfs_rq_list , leaf_cfs_rq_list )
2007-10-15 17:00:12 +02:00
/* Do the two (enqueued) entities belong to the same group ? */
static inline int
is_same_group ( struct sched_entity * se , struct sched_entity * pse )
2007-07-09 18:51:58 +02:00
{
2007-10-15 17:00:12 +02:00
if ( se - > cfs_rq = = pse - > cfs_rq )
2007-07-09 18:51:58 +02:00
return 1 ;
return 0 ;
}
2007-10-15 17:00:12 +02:00
static inline struct sched_entity * parent_entity ( struct sched_entity * se )
{
return se - > parent ;
}
2007-07-09 18:51:58 +02:00
# else /* CONFIG_FAIR_GROUP_SCHED */
# define for_each_sched_entity(se) \
for ( ; se ; se = NULL )
static inline struct cfs_rq * task_cfs_rq ( struct task_struct * p )
{
return & task_rq ( p ) - > cfs ;
}
static inline struct cfs_rq * cfs_rq_of ( struct sched_entity * se )
{
struct task_struct * p = task_of ( se ) ;
struct rq * rq = task_rq ( p ) ;
return & rq - > cfs ;
}
/* runqueue "owned" by this group */
static inline struct cfs_rq * group_cfs_rq ( struct sched_entity * grp )
{
return NULL ;
}
static inline struct cfs_rq * cpu_cfs_rq ( struct cfs_rq * cfs_rq , int this_cpu )
{
return & cpu_rq ( this_cpu ) - > cfs ;
}
# define for_each_leaf_cfs_rq(rq, cfs_rq) \
for ( cfs_rq = & rq - > cfs ; cfs_rq ; cfs_rq = NULL )
2007-10-15 17:00:12 +02:00
static inline int
is_same_group ( struct sched_entity * se , struct sched_entity * pse )
2007-07-09 18:51:58 +02:00
{
return 1 ;
}
2007-10-15 17:00:12 +02:00
static inline struct sched_entity * parent_entity ( struct sched_entity * se )
{
return NULL ;
}
2007-07-09 18:51:58 +02:00
# endif /* CONFIG_FAIR_GROUP_SCHED */
/*
* The enqueue_task method is called before nr_running is
* increased . Here we update the fair scheduling stats and
* then put the task into the rbtree :
*/
2007-08-09 11:16:48 +02:00
static void enqueue_task_fair ( struct rq * rq , struct task_struct * p , int wakeup )
2007-07-09 18:51:58 +02:00
{
struct cfs_rq * cfs_rq ;
struct sched_entity * se = & p - > se ;
for_each_sched_entity ( se ) {
if ( se - > on_rq )
break ;
cfs_rq = cfs_rq_of ( se ) ;
2007-10-15 17:00:08 +02:00
enqueue_entity ( cfs_rq , se , wakeup ) ;
2007-10-15 17:00:12 +02:00
wakeup = 1 ;
2007-07-09 18:51:58 +02:00
}
}
/*
* The dequeue_task method is called before nr_running is
* decreased . We remove the task from the rbtree and
* update the fair scheduling stats :
*/
2007-08-09 11:16:48 +02:00
static void dequeue_task_fair ( struct rq * rq , struct task_struct * p , int sleep )
2007-07-09 18:51:58 +02:00
{
struct cfs_rq * cfs_rq ;
struct sched_entity * se = & p - > se ;
for_each_sched_entity ( se ) {
cfs_rq = cfs_rq_of ( se ) ;
2007-08-09 11:16:48 +02:00
dequeue_entity ( cfs_rq , se , sleep ) ;
2007-07-09 18:51:58 +02:00
/* Don't dequeue parent if it has other entities besides us */
if ( cfs_rq - > load . weight )
break ;
2007-10-15 17:00:12 +02:00
sleep = 1 ;
2007-07-09 18:51:58 +02:00
}
}
/*
2007-09-19 23:34:46 +02:00
* sched_yield ( ) support is very simple - we dequeue and enqueue .
*
* If compat_yield is turned on then we requeue to the end of the tree .
2007-07-09 18:51:58 +02:00
*/
2007-10-15 17:00:08 +02:00
static void yield_task_fair ( struct rq * rq )
2007-07-09 18:51:58 +02:00
{
2007-10-15 17:00:08 +02:00
struct cfs_rq * cfs_rq = task_cfs_rq ( rq - > curr ) ;
2007-10-15 17:00:08 +02:00
struct sched_entity * rightmost , * se = & rq - > curr - > se ;
2007-07-09 18:51:58 +02:00
/*
2007-09-19 23:34:46 +02:00
* Are we the only task in the tree ?
*/
if ( unlikely ( cfs_rq - > nr_running = = 1 ) )
return ;
if ( likely ( ! sysctl_sched_compat_yield ) ) {
__update_rq_clock ( rq ) ;
/*
2007-10-15 17:00:13 +02:00
* Update run - time statistics of the ' current ' .
2007-09-19 23:34:46 +02:00
*/
2007-10-15 17:00:12 +02:00
update_curr ( cfs_rq ) ;
2007-09-19 23:34:46 +02:00
return ;
}
/*
* Find the rightmost entry in the rbtree :
2007-07-09 18:51:58 +02:00
*/
2007-10-15 17:00:12 +02:00
rightmost = __pick_last_entity ( cfs_rq ) ;
2007-09-19 23:34:46 +02:00
/*
* Already in the rightmost position ?
*/
2007-10-15 17:00:12 +02:00
if ( unlikely ( rightmost - > vruntime < se - > vruntime ) )
2007-09-19 23:34:46 +02:00
return ;
/*
* Minimally necessary key value to be last in the tree :
2007-10-15 17:00:12 +02:00
* Upon rescheduling , sched_class : : put_prev_task ( ) will place
* ' current ' within the tree based on its new key value .
2007-09-19 23:34:46 +02:00
*/
2007-10-15 17:00:07 +02:00
se - > vruntime = rightmost - > vruntime + 1 ;
2007-07-09 18:51:58 +02:00
}
/*
* Preempt the current task with a newly woken task if needed :
*/
2007-10-15 17:00:05 +02:00
static void check_preempt_wakeup ( struct rq * rq , struct task_struct * p )
2007-07-09 18:51:58 +02:00
{
struct task_struct * curr = rq - > curr ;
2007-10-15 17:00:12 +02:00
struct cfs_rq * cfs_rq = task_cfs_rq ( curr ) ;
2007-10-15 17:00:12 +02:00
struct sched_entity * se = & curr - > se , * pse = & p - > se ;
2007-11-09 22:39:39 +01:00
unsigned long gran ;
2007-07-09 18:51:58 +02:00
if ( unlikely ( rt_prio ( p - > prio ) ) ) {
2007-08-09 11:16:47 +02:00
update_rq_clock ( rq ) ;
2007-08-09 11:16:47 +02:00
update_curr ( cfs_rq ) ;
2007-07-09 18:51:58 +02:00
resched_task ( curr ) ;
return ;
}
2007-10-15 17:00:18 +02:00
/*
* Batch tasks do not preempt ( their preemption is driven by
* the tick ) :
*/
if ( unlikely ( p - > policy = = SCHED_BATCH ) )
return ;
2007-07-09 18:51:58 +02:00
2007-11-09 22:39:39 +01:00
if ( ! sched_feat ( WAKEUP_PREEMPT ) )
return ;
2007-10-15 17:00:12 +02:00
2007-11-09 22:39:39 +01:00
while ( ! is_same_group ( se , pse ) ) {
se = parent_entity ( se ) ;
pse = parent_entity ( pse ) ;
2007-10-15 17:00:14 +02:00
}
2007-11-09 22:39:39 +01:00
gran = sysctl_sched_wakeup_granularity ;
if ( unlikely ( se - > load . weight ! = NICE_0_LOAD ) )
gran = calc_delta_fair ( gran , & se - > load ) ;
2007-11-09 22:39:39 +01:00
if ( pse - > vruntime + gran < se - > vruntime )
2007-11-09 22:39:39 +01:00
resched_task ( curr ) ;
2007-07-09 18:51:58 +02:00
}
2007-08-09 11:16:48 +02:00
static struct task_struct * pick_next_task_fair ( struct rq * rq )
2007-07-09 18:51:58 +02:00
{
struct cfs_rq * cfs_rq = & rq - > cfs ;
struct sched_entity * se ;
if ( unlikely ( ! cfs_rq - > nr_running ) )
return NULL ;
do {
2007-08-09 11:16:48 +02:00
se = pick_next_entity ( cfs_rq ) ;
2007-07-09 18:51:58 +02:00
cfs_rq = group_cfs_rq ( se ) ;
} while ( cfs_rq ) ;
return task_of ( se ) ;
}
/*
* Account for a descheduled task :
*/
2007-08-09 11:16:49 +02:00
static void put_prev_task_fair ( struct rq * rq , struct task_struct * prev )
2007-07-09 18:51:58 +02:00
{
struct sched_entity * se = & prev - > se ;
struct cfs_rq * cfs_rq ;
for_each_sched_entity ( se ) {
cfs_rq = cfs_rq_of ( se ) ;
2007-08-09 11:16:48 +02:00
put_prev_entity ( cfs_rq , se ) ;
2007-07-09 18:51:58 +02:00
}
}
2007-10-24 18:23:51 +02:00
# ifdef CONFIG_SMP
2007-07-09 18:51:58 +02:00
/**************************************************
* Fair scheduling class load - balancing methods :
*/
/*
* Load - balancing iterator . Note : while the runqueue stays locked
* during the whole iteration , the current task might be
* dequeued so the iterator has to be dequeue - safe . Here we
* achieve that by always pre - iterating before returning
* the current task :
*/
2007-10-15 17:00:13 +02:00
static struct task_struct *
2007-07-09 18:51:58 +02:00
__load_balance_iterator ( struct cfs_rq * cfs_rq , struct rb_node * curr )
{
struct task_struct * p ;
if ( ! curr )
return NULL ;
p = rb_entry ( curr , struct task_struct , se . run_node ) ;
cfs_rq - > rb_load_balance_curr = rb_next ( curr ) ;
return p ;
}
/*
 * Iterator 'start' callback: @arg is the cfs_rq to walk. Begins the walk
 * at the node returned by first_fair() for that cfs_rq.
 */
static struct task_struct *load_balance_start_fair(void *arg)
{
	struct cfs_rq *walked = arg;
	struct rb_node *first = first_fair(walked);

	return __load_balance_iterator(walked, first);
}
static struct task_struct * load_balance_next_fair ( void * arg )
{
struct cfs_rq * cfs_rq = arg ;
return __load_balance_iterator ( cfs_rq , cfs_rq - > rb_load_balance_curr ) ;
}
2007-08-09 11:16:46 +02:00
# ifdef CONFIG_FAIR_GROUP_SCHED
2007-07-09 18:51:58 +02:00
static int cfs_rq_best_prio ( struct cfs_rq * cfs_rq )
{
struct sched_entity * curr ;
struct task_struct * p ;
if ( ! cfs_rq - > nr_running )
return MAX_PRIO ;
2007-10-15 17:00:09 +02:00
curr = cfs_rq - > curr ;
if ( ! curr )
curr = __pick_next_entity ( cfs_rq ) ;
2007-07-09 18:51:58 +02:00
p = task_of ( curr ) ;
return p - > prio ;
}
2007-08-09 11:16:46 +02:00
# endif
2007-07-09 18:51:58 +02:00
sched: simplify move_tasks()
The move_tasks() function is currently multiplexed with two distinct
capabilities:
1. attempt to move a specified amount of weighted load from one run
queue to another; and
2. attempt to move a specified number of tasks from one run queue to
another.
The first of these capabilities is used in two places, load_balance()
and load_balance_idle(), and in both of these cases the return value of
move_tasks() is used purely to decide if tasks/load were moved and no
notice of the actual number of tasks moved is taken.
The second capability is used in exactly one place,
active_load_balance(), to attempt to move exactly one task and, as
before, the return value is only used as an indicator of success or failure.
This multiplexing of move_tasks() was introduced, by me, as part of the
smpnice patches and was motivated by the fact that the alternative, one
function to move specified load and one to move a single task, would
have led to two functions of roughly the same complexity as the old
move_tasks() (or the new balance_tasks()). However, the new modular
design of the new CFS scheduler allows a simpler solution to be adopted
and this patch addresses that solution by:
1. adding a new function, move_one_task(), to be used by
active_load_balance(); and
2. making move_tasks() a single purpose function that tries to move a
specified weighted load and returns 1 for success and 0 for failure.
One of the consequences of these changes is that neither move_one_task()
nor the new move_tasks() cares how many tasks sched_class.load_balance()
moves and this enables its interface to be simplified by returning the
amount of load moved as its result and removing the load_moved pointer
from the argument list. This helps simplify the new move_tasks() and
slightly reduces the amount of work done in each of
sched_class.load_balance()'s implementations.
Further simplification, e.g. changes to balance_tasks(), are possible
but (slightly) complicated by the special needs of load_balance_fair()
so I've left them to a later patch (if this one gets accepted).
NB Since move_tasks() gets called with two run queue locks held even
small reductions in overhead are worthwhile.
[ mingo@elte.hu ]
this change also reduces code size nicely:
text data bss dec hex filename
39216 3618 24 42858 a76a sched.o.before
39173 3618 24 42815 a73f sched.o.after
Signed-off-by: Peter Williams <pwil3058@bigpond.net.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2007-08-09 11:16:46 +02:00
static unsigned long
2007-07-09 18:51:58 +02:00
load_balance_fair ( struct rq * this_rq , int this_cpu , struct rq * busiest ,
2007-10-24 18:23:51 +02:00
unsigned long max_load_move ,
2007-08-09 11:16:46 +02:00
struct sched_domain * sd , enum cpu_idle_type idle ,
int * all_pinned , int * this_best_prio )
2007-07-09 18:51:58 +02:00
{
struct cfs_rq * busy_cfs_rq ;
long rem_load_move = max_load_move ;
struct rq_iterator cfs_rq_iterator ;
cfs_rq_iterator . start = load_balance_start_fair ;
cfs_rq_iterator . next = load_balance_next_fair ;
for_each_leaf_cfs_rq ( busiest , busy_cfs_rq ) {
2007-08-09 11:16:46 +02:00
# ifdef CONFIG_FAIR_GROUP_SCHED
2007-07-09 18:51:58 +02:00
struct cfs_rq * this_cfs_rq ;
2007-08-10 23:05:11 +02:00
long imbalance ;
2007-07-09 18:51:58 +02:00
unsigned long maxload ;
this_cfs_rq = cpu_cfs_rq ( busy_cfs_rq , this_cpu ) ;
2007-08-10 23:05:11 +02:00
imbalance = busy_cfs_rq - > load . weight - this_cfs_rq - > load . weight ;
2007-07-09 18:51:58 +02:00
/* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
if ( imbalance < = 0 )
continue ;
/* Don't pull more than imbalance/2 */
imbalance / = 2 ;
maxload = min ( rem_load_move , imbalance ) ;
2007-08-09 11:16:46 +02:00
* this_best_prio = cfs_rq_best_prio ( this_cfs_rq ) ;
# else
2007-08-10 23:05:11 +02:00
# define maxload rem_load_move
2007-08-09 11:16:46 +02:00
# endif
2007-10-24 18:23:51 +02:00
/*
* pass busy_cfs_rq argument into
2007-07-09 18:51:58 +02:00
* load_balance_ [ start | next ] _fair iterators
*/
cfs_rq_iterator . arg = busy_cfs_rq ;
2007-10-24 18:23:51 +02:00
rem_load_move - = balance_tasks ( this_rq , this_cpu , busiest ,
maxload , sd , idle , all_pinned ,
this_best_prio ,
& cfs_rq_iterator ) ;
2007-07-09 18:51:58 +02:00
2007-10-24 18:23:51 +02:00
if ( rem_load_move < = 0 )
2007-07-09 18:51:58 +02:00
break ;
}
sched: simplify move_tasks()
The move_tasks() function is currently multiplexed with two distinct
capabilities:
1. attempt to move a specified amount of weighted load from one run
queue to another; and
2. attempt to move a specified number of tasks from one run queue to
another.
The first of these capabilities is used in two places, load_balance()
and load_balance_idle(), and in both of these cases the return value of
move_tasks() is used purely to decide if tasks/load were moved and no
notice of the actual number of tasks moved is taken.
The second capability is used in exactly one place,
active_load_balance(), to attempt to move exactly one task and, as
before, the return value is only used as an indicator of success or failure.
This multiplexing of sched_task() was introduced, by me, as part of the
smpnice patches and was motivated by the fact that the alternative, one
function to move specified load and one to move a single task, would
have led to two functions of roughly the same complexity as the old
move_tasks() (or the new balance_tasks()). However, the new modular
design of the new CFS scheduler allows a simpler solution to be adopted
and this patch addresses that solution by:
1. adding a new function, move_one_task(), to be used by
active_load_balance(); and
2. making move_tasks() a single purpose function that tries to move a
specified weighted load and returns 1 for success and 0 for failure.
One of the consequences of these changes is that neither move_one_task()
or the new move_tasks() care how many tasks sched_class.load_balance()
moves and this enables its interface to be simplified by returning the
amount of load moved as its result and removing the load_moved pointer
from the argument list. This helps simplify the new move_tasks() and
slightly reduces the amount of work done in each of
sched_class.load_balance()'s implementations.
Further simplification, e.g. changes to balance_tasks(), are possible
but (slightly) complicated by the special needs of load_balance_fair()
so I've left them to a later patch (if this one gets accepted).
NB Since move_tasks() gets called with two run queue locks held even
small reductions in overhead are worthwhile.
[ mingo@elte.hu ]
this change also reduces code size nicely:
text data bss dec hex filename
39216 3618 24 42858 a76a sched.o.before
39173 3618 24 42815 a73f sched.o.after
Signed-off-by: Peter Williams <pwil3058@bigpond.net.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2007-08-09 11:16:46 +02:00
return max_load_move - rem_load_move ;
2007-07-09 18:51:58 +02:00
}
2007-10-24 18:23:51 +02:00
static int
move_one_task_fair ( struct rq * this_rq , int this_cpu , struct rq * busiest ,
struct sched_domain * sd , enum cpu_idle_type idle )
{
struct cfs_rq * busy_cfs_rq ;
struct rq_iterator cfs_rq_iterator ;
cfs_rq_iterator . start = load_balance_start_fair ;
cfs_rq_iterator . next = load_balance_next_fair ;
for_each_leaf_cfs_rq ( busiest , busy_cfs_rq ) {
/*
* pass busy_cfs_rq argument into
* load_balance_ [ start | next ] _fair iterators
*/
cfs_rq_iterator . arg = busy_cfs_rq ;
if ( iter_move_one_task ( this_rq , this_cpu , busiest , sd , idle ,
& cfs_rq_iterator ) )
return 1 ;
}
return 0 ;
}
2007-10-24 18:23:51 +02:00
# endif
2007-10-24 18:23:51 +02:00
2007-07-09 18:51:58 +02:00
/*
* scheduler tick hitting a task of our scheduling class :
*/
static void task_tick_fair ( struct rq * rq , struct task_struct * curr )
{
struct cfs_rq * cfs_rq ;
struct sched_entity * se = & curr - > se ;
for_each_sched_entity ( se ) {
cfs_rq = cfs_rq_of ( se ) ;
entity_tick ( cfs_rq , se ) ;
}
}
2007-10-29 21:18:11 +01:00
/*
 * swap() - exchange the values of two lvalues of the same type.
 *
 * The temporary deliberately has an unusual name: with a plain 'tmp',
 * swap(tmp, x) would declare an inner 'tmp' initialised from itself and
 * leave the caller's variables untouched. __typeof__ is used so the macro
 * also builds in strict ISO modes where the bare 'typeof' keyword is off.
 *
 * Note: @a and @b are each evaluated twice; do not pass expressions with
 * side effects.
 */
#define swap(a, b) \
	do { __typeof__(a) swap_tmp_ = (a); (a) = (b); (b) = swap_tmp_; } while (0)
2007-10-15 17:00:04 +02:00
2007-07-09 18:51:58 +02:00
/*
* Share the fairness runtime between parent and child , thus the
* total amount of pressure for CPU stays equal - new tasks
* get a chance to run but frequent forkers are not allowed to
* monopolize the CPU . Note : the parent runqueue is locked ,
* the child is not running yet .
*/
2007-08-09 11:16:49 +02:00
static void task_new_fair ( struct rq * rq , struct task_struct * p )
2007-07-09 18:51:58 +02:00
{
struct cfs_rq * cfs_rq = task_cfs_rq ( p ) ;
2007-10-15 17:00:03 +02:00
struct sched_entity * se = & p - > se , * curr = cfs_rq - > curr ;
2007-10-15 17:00:14 +02:00
int this_cpu = smp_processor_id ( ) ;
2007-07-09 18:51:58 +02:00
sched_info_queued ( p ) ;
2007-08-28 12:53:24 +02:00
update_curr ( cfs_rq ) ;
2007-10-15 17:00:05 +02:00
place_entity ( cfs_rq , se , 1 ) ;
2007-10-15 17:00:04 +02:00
sched: fix copy_namespace() <-> sched_fork() dependency in do_fork
Sukadev Bhattiprolu reported a kernel crash with control groups.
There are couple of problems discovered by Suka's test:
- The test requires the cgroup filesystem to be mounted with
atleast the cpu and ns options (i.e both namespace and cpu
controllers are active in the same hierarchy).
# mkdir /dev/cpuctl
# mount -t cgroup -ocpu,ns none cpuctl
(or simply)
# mount -t cgroup none cpuctl -> Will activate all controllers
in same hierarchy.
- The test invokes clone() with CLONE_NEWNS set. This causes a a new child
to be created, also a new group (do_fork->copy_namespaces->ns_cgroup_clone->
cgroup_clone) and the child is attached to the new group (cgroup_clone->
attach_task->sched_move_task). At this point in time, the child's scheduler
related fields are uninitialized (including its on_rq field, which it has
inherited from parent). As a result sched_move_task thinks its on
runqueue, when it isn't.
As a solution to this problem, I moved sched_fork() call, which
initializes scheduler related fields on a new task, before
copy_namespaces(). I am not sure though whether moving up will
cause other side-effects. Do you see any issue?
- The second problem exposed by this test is that task_new_fair()
assumes that parent and child will be part of the same group (which
needn't be as this test shows). As a result, cfs_rq->curr can be NULL
for the child.
The solution is to test for curr pointer being NULL in
task_new_fair().
With the patch below, I could run ns_exec() fine w/o a crash.
Reported-by: Sukadev Bhattiprolu <sukadev@us.ibm.com>
Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2007-11-09 22:39:39 +01:00
/* 'curr' will be NULL if the child belongs to a different group */
2007-10-15 17:00:14 +02:00
if ( sysctl_sched_child_runs_first & & this_cpu = = task_cpu ( p ) & &
sched: fix copy_namespace() <-> sched_fork() dependency in do_fork
Sukadev Bhattiprolu reported a kernel crash with control groups.
There are couple of problems discovered by Suka's test:
- The test requires the cgroup filesystem to be mounted with
atleast the cpu and ns options (i.e both namespace and cpu
controllers are active in the same hierarchy).
# mkdir /dev/cpuctl
# mount -t cgroup -ocpu,ns none cpuctl
(or simply)
# mount -t cgroup none cpuctl -> Will activate all controllers
in same hierarchy.
- The test invokes clone() with CLONE_NEWNS set. This causes a a new child
to be created, also a new group (do_fork->copy_namespaces->ns_cgroup_clone->
cgroup_clone) and the child is attached to the new group (cgroup_clone->
attach_task->sched_move_task). At this point in time, the child's scheduler
related fields are uninitialized (including its on_rq field, which it has
inherited from parent). As a result sched_move_task thinks its on
runqueue, when it isn't.
As a solution to this problem, I moved sched_fork() call, which
initializes scheduler related fields on a new task, before
copy_namespaces(). I am not sure though whether moving up will
cause other side-effects. Do you see any issue?
- The second problem exposed by this test is that task_new_fair()
assumes that parent and child will be part of the same group (which
needn't be as this test shows). As a result, cfs_rq->curr can be NULL
for the child.
The solution is to test for curr pointer being NULL in
task_new_fair().
With the patch below, I could run ns_exec() fine w/o a crash.
Reported-by: Sukadev Bhattiprolu <sukadev@us.ibm.com>
Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2007-11-09 22:39:39 +01:00
curr & & curr - > vruntime < se - > vruntime ) {
2007-10-15 17:00:08 +02:00
/*
2007-10-15 17:00:08 +02:00
* Upon rescheduling , sched_class : : put_prev_task ( ) will place
* ' current ' within the tree based on its new key value .
*/
2007-10-15 17:00:04 +02:00
swap ( curr - > vruntime , se - > vruntime ) ;
}
2007-07-09 18:51:58 +02:00
2007-10-17 16:55:11 +02:00
enqueue_task_fair ( rq , p , 0 ) ;
2007-10-15 17:00:02 +02:00
resched_task ( rq - > curr ) ;
2007-07-09 18:51:58 +02:00
}
2007-10-15 17:00:08 +02:00
/* Account for a task changing its policy or group.
*
* This routine is mostly called to set cfs_rq - > curr field when a task
* migrates between groups / classes .
*/
static void set_curr_task_fair ( struct rq * rq )
{
struct sched_entity * se = & rq - > curr - > se ;
for_each_sched_entity ( se )
set_next_entity ( cfs_rq_of ( se ) , se ) ;
}
2007-07-09 18:51:58 +02:00
/*
* All the scheduling class methods :
*/
2007-10-15 17:00:12 +02:00
static const struct sched_class fair_sched_class = {
. next = & idle_sched_class ,
2007-07-09 18:51:58 +02:00
. enqueue_task = enqueue_task_fair ,
. dequeue_task = dequeue_task_fair ,
. yield_task = yield_task_fair ,
2007-10-15 17:00:05 +02:00
. check_preempt_curr = check_preempt_wakeup ,
2007-07-09 18:51:58 +02:00
. pick_next_task = pick_next_task_fair ,
. put_prev_task = put_prev_task_fair ,
2007-10-24 18:23:51 +02:00
# ifdef CONFIG_SMP
2007-07-09 18:51:58 +02:00
. load_balance = load_balance_fair ,
2007-10-24 18:23:51 +02:00
. move_one_task = move_one_task_fair ,
2007-10-24 18:23:51 +02:00
# endif
2007-07-09 18:51:58 +02:00
2007-10-15 17:00:08 +02:00
. set_curr_task = set_curr_task_fair ,
2007-07-09 18:51:58 +02:00
. task_tick = task_tick_fair ,
. task_new = task_new_fair ,
} ;
# ifdef CONFIG_SCHED_DEBUG
2007-08-09 11:16:47 +02:00
static void print_cfs_stats ( struct seq_file * m , int cpu )
2007-07-09 18:51:58 +02:00
{
struct cfs_rq * cfs_rq ;
2007-10-15 17:00:09 +02:00
# ifdef CONFIG_FAIR_GROUP_SCHED
print_cfs_rq ( m , cpu , & cpu_rq ( cpu ) - > cfs ) ;
# endif
2007-08-09 11:16:51 +02:00
for_each_leaf_cfs_rq ( cpu_rq ( cpu ) , cfs_rq )
2007-08-09 11:16:47 +02:00
print_cfs_rq ( m , cpu , cfs_rq ) ;
2007-07-09 18:51:58 +02:00
}
# endif