2007-07-09 20:51:58 +04:00
/*
* Completely Fair Scheduling ( CFS ) Class ( SCHED_NORMAL / SCHED_BATCH )
*
* Copyright ( C ) 2007 Red Hat , Inc . , Ingo Molnar < mingo @ redhat . com >
*
* Interactivity improvements by Mike Galbraith
* ( C ) 2007 Mike Galbraith < efault @ gmx . de >
*
* Various enhancements by Dmitry Adamushko .
* ( C ) 2007 Dmitry Adamushko < dmitry . adamushko @ gmail . com >
*
* Group scheduling enhancements by Srivatsa Vaddagiri
* Copyright IBM Corporation , 2007
* Author : Srivatsa Vaddagiri < vatsa @ linux . vnet . ibm . com >
*
* Scaled math optimizations by Thomas Gleixner
* Copyright ( C ) 2007 , Thomas Gleixner < tglx @ linutronix . de >
2007-08-25 20:41:53 +04:00
*
* Adaptive scheduling granularity , math enhancements by Peter Zijlstra
* Copyright ( C ) 2007 Red Hat , Inc . , Peter Zijlstra < pzijlstr @ redhat . com >
2007-07-09 20:51:58 +04:00
*/
/*
2007-08-25 20:41:53 +04:00
* Targeted preemption latency for CPU - bound tasks :
* ( default : 20 ms , units : nanoseconds )
2007-07-09 20:51:58 +04:00
*
2007-08-25 20:41:53 +04:00
* NOTE : this latency value is not the same as the concept of
* ' timeslice length ' - timeslices in CFS are of variable length .
* ( to see the precise effective timeslice length of your workload ,
* run vmstat and monitor the context - switches field )
2007-07-09 20:51:58 +04:00
*
* On SMP systems the value of this is multiplied by 1 + log2 of the
* number of CPUs. (i.e. factor 2x on 2-way systems, 3x on 4-way
* systems, 4x on 8-way systems, 5x on 16-way systems, etc.)
2007-08-25 20:41:53 +04:00
* Targeted preemption latency for CPU - bound tasks :
2007-07-09 20:51:58 +04:00
*/
2007-10-15 19:00:02 +04:00
const_debug unsigned int sysctl_sched_latency = 20000000ULL ;
/*
* After fork , child runs first . ( default ) If set to 0 then
* parent will ( try to ) run first .
*/
const_debug unsigned int sysctl_sched_child_runs_first = 1 ;
2007-08-25 20:41:53 +04:00
/*
* Minimal preemption granularity for CPU - bound tasks :
* ( default : 2 msec , units : nanoseconds )
*/
2007-08-25 20:41:53 +04:00
unsigned int sysctl_sched_min_granularity __read_mostly = 2000000ULL ;
2007-07-09 20:51:58 +04:00
2007-09-20 01:34:46 +04:00
/*
* sys_sched_yield ( ) compat mode
*
* This option switches the aggressive yield implementation of the
* old scheduler back on.
*/
unsigned int __read_mostly sysctl_sched_compat_yield ;
2007-07-09 20:51:58 +04:00
/*
* SCHED_BATCH wake - up granularity .
2007-08-24 22:39:10 +04:00
* ( default : 25 msec , units : nanoseconds )
2007-07-09 20:51:58 +04:00
*
* This option delays the preemption effects of decoupled workloads
* and reduces their over - scheduling . Synchronous workloads will still
* have immediate wakeup / sleep latencies .
*/
2007-10-15 19:00:02 +04:00
const_debug unsigned int sysctl_sched_batch_wakeup_granularity = 25000000UL ;
2007-07-09 20:51:58 +04:00
/*
* SCHED_OTHER wake - up granularity .
* (default: 2 msec, units: nanoseconds)
*
* This option delays the preemption effects of decoupled workloads
* and reduces their over - scheduling . Synchronous workloads will still
* have immediate wakeup / sleep latencies .
*/
2007-10-15 19:00:05 +04:00
const_debug unsigned int sysctl_sched_wakeup_granularity = 2000000UL ;
2007-07-09 20:51:58 +04:00
/* NOTE(review): not referenced in the visible part of this file - confirm it is still used. */
unsigned int sysctl_sched_runtime_limit __read_mostly ;
/* Declared here, defined elsewhere (extern). */
extern struct sched_class fair_sched_class ;
/**************************************************************
* CFS operations on generic schedulable entities :
*/
2007-10-15 19:00:03 +04:00
# ifdef CONFIG_FAIR_GROUP_SCHED
2007-07-09 20:51:58 +04:00
2007-10-15 19:00:03 +04:00
/* cpu runqueue to which this cfs_rq is attached */
2007-07-09 20:51:58 +04:00
static inline struct rq * rq_of ( struct cfs_rq * cfs_rq )
{
2007-10-15 19:00:03 +04:00
return cfs_rq - > rq ;
2007-07-09 20:51:58 +04:00
}
2007-10-15 19:00:03 +04:00
/*
 * An entity is a task if it doesn't "own" a runqueue (my_q is NULL).
 * The argument is parenthesized so any pointer expression can be passed.
 */
#define entity_is_task(se)	(!(se)->my_q)
2007-07-09 20:51:58 +04:00
2007-10-15 19:00:03 +04:00
# else /* CONFIG_FAIR_GROUP_SCHED */
2007-07-09 20:51:58 +04:00
2007-10-15 19:00:03 +04:00
static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
{
	/* Here the cfs_rq is the 'cfs' member embedded in struct rq. */
	struct rq *owner = container_of(cfs_rq, struct rq, cfs);

	return owner;
}
/* Without CONFIG_FAIR_GROUP_SCHED every entity is treated as a task. */
# define entity_is_task(se) 1
# endif /* CONFIG_FAIR_GROUP_SCHED */
/* Map a scheduling entity back to the task_struct that embeds it as 'se'. */
static inline struct task_struct *task_of(struct sched_entity *se)
{
	struct task_struct *tsk = container_of(se, struct task_struct, se);

	return tsk;
}
/**************************************************************
* Scheduling class tree data structure manipulation methods :
*/
2007-10-15 19:00:04 +04:00
static inline void
set_leftmost ( struct cfs_rq * cfs_rq , struct rb_node * leftmost )
{
struct sched_entity * se ;
cfs_rq - > rb_leftmost = leftmost ;
if ( leftmost ) {
se = rb_entry ( leftmost , struct sched_entity , run_node ) ;
2007-10-15 19:00:05 +04:00
if ( ( se - > vruntime > cfs_rq - > min_vruntime ) | |
( cfs_rq - > min_vruntime > ( 1ULL < < 61 ) & &
se - > vruntime < ( 1ULL < < 50 ) ) )
cfs_rq - > min_vruntime = se - > vruntime ;
2007-10-15 19:00:04 +04:00
}
}
2007-10-15 19:00:05 +04:00
s64 entity_key ( struct cfs_rq * cfs_rq , struct sched_entity * se )
{
return se - > fair_key - cfs_rq - > min_vruntime ;
}
2007-07-09 20:51:58 +04:00
/*
* Enqueue an entity into the rb - tree :
*/
2007-10-15 19:00:04 +04:00
static void
2007-07-09 20:51:58 +04:00
__enqueue_entity ( struct cfs_rq * cfs_rq , struct sched_entity * se )
{
struct rb_node * * link = & cfs_rq - > tasks_timeline . rb_node ;
struct rb_node * parent = NULL ;
struct sched_entity * entry ;
2007-10-15 19:00:05 +04:00
s64 key = entity_key ( cfs_rq , se ) ;
2007-07-09 20:51:58 +04:00
int leftmost = 1 ;
/*
* Find the right place in the rbtree :
*/
while ( * link ) {
parent = * link ;
entry = rb_entry ( parent , struct sched_entity , run_node ) ;
/*
* We dont care about collisions . Nodes with
* the same key stay together .
*/
2007-10-15 19:00:05 +04:00
if ( key < entity_key ( cfs_rq , entry ) ) {
2007-07-09 20:51:58 +04:00
link = & parent - > rb_left ;
} else {
link = & parent - > rb_right ;
leftmost = 0 ;
}
}
/*
* Maintain a cache of leftmost tree entries ( it is frequently
* used ) :
*/
if ( leftmost )
2007-10-15 19:00:04 +04:00
set_leftmost ( cfs_rq , & se - > run_node ) ;
2007-07-09 20:51:58 +04:00
rb_link_node ( & se - > run_node , parent , link ) ;
rb_insert_color ( & se - > run_node , & cfs_rq - > tasks_timeline ) ;
update_load_add ( & cfs_rq - > load , se - > load . weight ) ;
cfs_rq - > nr_running + + ;
se - > on_rq = 1 ;
}
2007-10-15 19:00:04 +04:00
static void
2007-07-09 20:51:58 +04:00
__dequeue_entity ( struct cfs_rq * cfs_rq , struct sched_entity * se )
{
if ( cfs_rq - > rb_leftmost = = & se - > run_node )
2007-10-15 19:00:04 +04:00
set_leftmost ( cfs_rq , rb_next ( & se - > run_node ) ) ;
2007-07-09 20:51:58 +04:00
rb_erase ( & se - > run_node , & cfs_rq - > tasks_timeline ) ;
update_load_sub ( & cfs_rq - > load , se - > load . weight ) ;
cfs_rq - > nr_running - - ;
se - > on_rq = 0 ;
}
static inline struct rb_node * first_fair ( struct cfs_rq * cfs_rq )
{
return cfs_rq - > rb_leftmost ;
}
/* Entity that owns the leftmost node; no NULL check is done here. */
static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
{
	struct rb_node *leftmost = first_fair(cfs_rq);

	return rb_entry(leftmost, struct sched_entity, run_node);
}
2007-10-15 19:00:05 +04:00
static inline struct sched_entity * __pick_last_entity ( struct cfs_rq * cfs_rq )
{
struct rb_node * * link = & cfs_rq - > tasks_timeline . rb_node ;
struct sched_entity * se = NULL ;
struct rb_node * parent ;
while ( * link ) {
parent = * link ;
se = rb_entry ( parent , struct sched_entity , run_node ) ;
link = & parent - > rb_right ;
}
return se ;
}
2007-07-09 20:51:58 +04:00
/**************************************************************
* Scheduling class statistics methods :
*/
2007-10-15 19:00:04 +04:00
/*
 * Scheduling period for 'nr_running' entities.
 *
 * The period is sysctl_sched_latency as long as nr_running does not
 * exceed nr_latency (= latency / min_granularity); beyond that it is
 * scaled up proportionally (latency * nr_running / nr_latency).
 *
 * Both divisors are guarded: a zero min_granularity leaves the period
 * unstretched, and a granularity larger than the latency (nr_latency
 * would be 0) is treated as nr_latency == 1 instead of dividing by zero.
 */
static u64 __sched_period(unsigned long nr_running)
{
	u64 period = sysctl_sched_latency;
	unsigned long nr_latency;

	/* No minimum slice configured: nothing to stretch the period for. */
	if (unlikely(!sysctl_sched_min_granularity))
		return period;

	nr_latency = sysctl_sched_latency / sysctl_sched_min_granularity;
	if (unlikely(!nr_latency))
		nr_latency = 1;

	if (unlikely(nr_running > nr_latency)) {
		period *= nr_running;
		do_div(period, nr_latency);
	}

	return period;
}
2007-10-15 19:00:05 +04:00
static u64 sched_slice ( struct cfs_rq * cfs_rq , struct sched_entity * se )
2007-08-25 20:41:53 +04:00
{
2007-10-15 19:00:05 +04:00
u64 period = __sched_period ( cfs_rq - > nr_running ) ;
2007-08-25 20:41:53 +04:00
2007-10-15 19:00:05 +04:00
period * = se - > load . weight ;
do_div ( period , cfs_rq - > load . weight ) ;
2007-08-25 20:41:53 +04:00
2007-10-15 19:00:05 +04:00
return period ;
2007-07-09 20:51:58 +04:00
}
/*
* Update the current task ' s runtime statistics . Skip current tasks that
* are not in our scheduling class .
*/
static inline void
2007-10-15 19:00:03 +04:00
__update_curr ( struct cfs_rq * cfs_rq , struct sched_entity * curr ,
unsigned long delta_exec )
2007-07-09 20:51:58 +04:00
{
2007-10-15 19:00:06 +04:00
unsigned long delta_exec_weighted ;
2007-07-09 20:51:58 +04:00
2007-08-02 19:41:40 +04:00
schedstat_set ( curr - > exec_max , max ( ( u64 ) delta_exec , curr - > exec_max ) ) ;
2007-07-09 20:51:58 +04:00
curr - > sum_exec_runtime + = delta_exec ;
2007-10-15 19:00:06 +04:00
schedstat_add ( cfs_rq , exec_clock , delta_exec ) ;
2007-10-15 19:00:04 +04:00
delta_exec_weighted = delta_exec ;
if ( unlikely ( curr - > load . weight ! = NICE_0_LOAD ) ) {
delta_exec_weighted = calc_delta_fair ( delta_exec_weighted ,
& curr - > load ) ;
}
curr - > vruntime + = delta_exec_weighted ;
2007-07-09 20:51:58 +04:00
}
2007-08-09 13:16:47 +04:00
static void update_curr ( struct cfs_rq * cfs_rq )
2007-07-09 20:51:58 +04:00
{
2007-10-15 19:00:03 +04:00
struct sched_entity * curr = cfs_rq - > curr ;
2007-10-15 19:00:03 +04:00
u64 now = rq_of ( cfs_rq ) - > clock ;
2007-07-09 20:51:58 +04:00
unsigned long delta_exec ;
if ( unlikely ( ! curr ) )
return ;
/*
* Get the amount of time the current task was running
* since the last time we changed load ( this cannot
* overflow on 32 bits ) :
*/
2007-10-15 19:00:03 +04:00
delta_exec = ( unsigned long ) ( now - curr - > exec_start ) ;
2007-07-09 20:51:58 +04:00
2007-10-15 19:00:03 +04:00
__update_curr ( cfs_rq , curr , delta_exec ) ;
curr - > exec_start = now ;
2007-07-09 20:51:58 +04:00
}
static inline void
2007-08-09 13:16:47 +04:00
update_stats_wait_start ( struct cfs_rq * cfs_rq , struct sched_entity * se )
2007-07-09 20:51:58 +04:00
{
2007-08-09 13:16:47 +04:00
schedstat_set ( se - > wait_start , rq_of ( cfs_rq ) - > clock ) ;
2007-07-09 20:51:58 +04:00
}
static inline unsigned long
2007-10-15 19:00:04 +04:00
calc_weighted ( unsigned long delta , struct sched_entity * se )
2007-07-09 20:51:58 +04:00
{
2007-10-15 19:00:04 +04:00
unsigned long weight = se - > load . weight ;
2007-07-09 20:51:58 +04:00
2007-10-15 19:00:04 +04:00
if ( unlikely ( weight ! = NICE_0_LOAD ) )
return ( u64 ) delta * se - > load . weight > > NICE_0_SHIFT ;
else
return delta ;
2007-07-09 20:51:58 +04:00
}
/*
* Task is being enqueued - update stats :
*/
2007-08-09 13:16:47 +04:00
static void update_stats_enqueue ( struct cfs_rq * cfs_rq , struct sched_entity * se )
2007-07-09 20:51:58 +04:00
{
/*
* Are we enqueueing a waiting task ? ( for current tasks
* a dequeue / enqueue event is a NOP )
*/
2007-10-15 19:00:03 +04:00
if ( se ! = cfs_rq - > curr )
2007-08-09 13:16:47 +04:00
update_stats_wait_start ( cfs_rq , se ) ;
2007-07-09 20:51:58 +04:00
/*
* Update the key :
*/
2007-10-15 19:00:04 +04:00
se - > fair_key = se - > vruntime ;
2007-07-09 20:51:58 +04:00
}
static void
2007-08-09 13:16:47 +04:00
update_stats_wait_end ( struct cfs_rq * cfs_rq , struct sched_entity * se )
2007-07-09 20:51:58 +04:00
{
2007-10-15 19:00:06 +04:00
schedstat_set ( se - > wait_max , max ( se - > wait_max ,
rq_of ( cfs_rq ) - > clock - se - > wait_start ) ) ;
2007-08-02 19:41:40 +04:00
schedstat_set ( se - > wait_start , 0 ) ;
2007-07-09 20:51:58 +04:00
}
static inline void
2007-08-09 13:16:48 +04:00
update_stats_dequeue ( struct cfs_rq * cfs_rq , struct sched_entity * se )
2007-07-09 20:51:58 +04:00
{
2007-08-09 13:16:47 +04:00
update_curr ( cfs_rq ) ;
2007-07-09 20:51:58 +04:00
/*
* Mark the end of the wait period if dequeueing a
* waiting task :
*/
2007-10-15 19:00:03 +04:00
if ( se ! = cfs_rq - > curr )
2007-08-09 13:16:47 +04:00
update_stats_wait_end ( cfs_rq , se ) ;
2007-07-09 20:51:58 +04:00
}
/*
* We are picking a new current task - update its stats :
*/
static inline void
2007-08-09 13:16:47 +04:00
update_stats_curr_start ( struct cfs_rq * cfs_rq , struct sched_entity * se )
2007-07-09 20:51:58 +04:00
{
/*
* We are starting a new run period :
*/
2007-08-09 13:16:47 +04:00
se - > exec_start = rq_of ( cfs_rq ) - > clock ;
2007-07-09 20:51:58 +04:00
}
/*
* We are descheduling a task - update its stats :
*/
static inline void
2007-08-09 13:16:48 +04:00
update_stats_curr_end ( struct cfs_rq * cfs_rq , struct sched_entity * se )
2007-07-09 20:51:58 +04:00
{
se - > exec_start = 0 ;
}
/**************************************************
* Scheduling class queueing methods :
*/
2007-08-09 13:16:48 +04:00
/*
 * Schedstats only: close the sleep/block interval recorded in 'se',
 * updating the per-entity maxima and the accumulated sleep runtime.
 */
static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHEDSTATS
	if (se->sleep_start) {
		u64 slept = rq_of(cfs_rq)->clock - se->sleep_start;

		/* A negative interval is clamped to zero. */
		if ((s64)slept < 0)
			slept = 0;

		if (unlikely(slept > se->sleep_max))
			se->sleep_max = slept;

		se->sleep_start = 0;
		se->sum_sleep_runtime += slept;
	}

	if (se->block_start) {
		u64 blocked = rq_of(cfs_rq)->clock - se->block_start;

		if ((s64)blocked < 0)
			blocked = 0;

		if (unlikely(blocked > se->block_max))
			se->block_max = blocked;

		se->block_start = 0;
		se->sum_sleep_runtime += blocked;

		/*
		 * Blocking time is in nanoseconds; shifting right by 20
		 * gives a milliseconds-range estimate for the sleep
		 * profiler.
		 */
		if (unlikely(prof_on == SLEEP_PROFILING)) {
			struct task_struct *tsk = task_of(se);

			profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk),
				     blocked >> 20);
		}
	}
#endif
}
2007-10-15 19:00:05 +04:00
static void
place_entity ( struct cfs_rq * cfs_rq , struct sched_entity * se , int initial )
{
u64 min_runtime , latency ;
min_runtime = cfs_rq - > min_vruntime ;
2007-10-15 19:00:05 +04:00
if ( sched_feat ( USE_TREE_AVG ) ) {
struct sched_entity * last = __pick_last_entity ( cfs_rq ) ;
if ( last ) {
min_runtime = __pick_next_entity ( cfs_rq ) - > vruntime ;
min_runtime + = last - > vruntime ;
min_runtime > > = 1 ;
}
} else if ( sched_feat ( APPROX_AVG ) )
min_runtime + = sysctl_sched_latency / 2 ;
if ( initial & & sched_feat ( START_DEBIT ) )
min_runtime + = sched_slice ( cfs_rq , se ) ;
2007-10-15 19:00:05 +04:00
if ( ! initial & & sched_feat ( NEW_FAIR_SLEEPERS ) ) {
latency = sysctl_sched_latency ;
if ( min_runtime > latency )
min_runtime - = latency ;
else
min_runtime = 0 ;
}
se - > vruntime = max ( se - > vruntime , min_runtime ) ;
}
2007-07-09 20:51:58 +04:00
/* Put 'se' on 'cfs_rq'; 'wakeup' selects the wakeup placement path. */
static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
{
	/* Bring the current entity's accounting up to date first. */
	update_curr(cfs_rq);

	if (wakeup) {
		place_entity(cfs_rq, se, 0);
		enqueue_sleeper(cfs_rq, se);
	}

	update_stats_enqueue(cfs_rq, se);
	__enqueue_entity(cfs_rq, se);
}
/*
 * Take 'se' off 'cfs_rq'. With schedstats and 'sleep' set, a task entity
 * records when it started sleeping or blocking.
 */
static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
{
	update_stats_dequeue(cfs_rq, se);

#ifdef CONFIG_SCHEDSTATS
	if (sleep && entity_is_task(se)) {
		struct task_struct *tsk = task_of(se);

		if (tsk->state & TASK_INTERRUPTIBLE)
			se->sleep_start = rq_of(cfs_rq)->clock;
		if (tsk->state & TASK_UNINTERRUPTIBLE)
			se->block_start = rq_of(cfs_rq)->clock;
	}
#endif

	__dequeue_entity(cfs_rq, se);
}
/*
* Preempt the current task with a newly woken task if needed :
*/
2007-09-05 16:32:49 +04:00
static void
2007-10-15 19:00:05 +04:00
check_preempt_tick ( struct cfs_rq * cfs_rq , struct sched_entity * curr )
2007-07-09 20:51:58 +04:00
{
2007-09-05 16:32:49 +04:00
unsigned long ideal_runtime , delta_exec ;
2007-10-15 19:00:05 +04:00
ideal_runtime = sched_slice ( cfs_rq , curr ) ;
2007-09-05 16:32:49 +04:00
delta_exec = curr - > sum_exec_runtime - curr - > prev_sum_exec_runtime ;
if ( delta_exec > ideal_runtime )
2007-07-09 20:51:58 +04:00
resched_task ( rq_of ( cfs_rq ) - > curr ) ;
}
static inline void
2007-08-09 13:16:48 +04:00
set_next_entity ( struct cfs_rq * cfs_rq , struct sched_entity * se )
2007-07-09 20:51:58 +04:00
{
/*
* Any task has to be enqueued before it get to execute on
* a CPU . So account for the time it spent waiting on the
2007-10-15 19:00:06 +04:00
* runqueue .
2007-07-09 20:51:58 +04:00
*/
2007-08-09 13:16:47 +04:00
update_stats_wait_end ( cfs_rq , se ) ;
2007-08-09 13:16:47 +04:00
update_stats_curr_start ( cfs_rq , se ) ;
2007-10-15 19:00:03 +04:00
cfs_rq - > curr = se ;
2007-10-15 19:00:02 +04:00
# ifdef CONFIG_SCHEDSTATS
/*
* Track our maximum slice length , if the CPU ' s load is at
* least twice that of our own weight ( i . e . dont track it
* when there are only lesser - weight tasks around ) :
*/
2007-10-15 19:00:06 +04:00
if ( rq_of ( cfs_rq ) - > load . weight > = 2 * se - > load . weight ) {
2007-10-15 19:00:02 +04:00
se - > slice_max = max ( se - > slice_max ,
se - > sum_exec_runtime - se - > prev_sum_exec_runtime ) ;
}
# endif
2007-09-05 16:32:49 +04:00
se - > prev_sum_exec_runtime = se - > sum_exec_runtime ;
2007-07-09 20:51:58 +04:00
}
2007-08-09 13:16:48 +04:00
/* Select the leftmost entity and install it as the running one. */
static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
{
	struct sched_entity *next;

	next = __pick_next_entity(cfs_rq);
	set_next_entity(cfs_rq, next);

	return next;
}
2007-08-09 13:16:48 +04:00
static void put_prev_entity ( struct cfs_rq * cfs_rq , struct sched_entity * prev )
2007-07-09 20:51:58 +04:00
{
/*
* If still on the runqueue then deactivate_task ( )
* was not called and update_curr ( ) has to be done :
*/
if ( prev - > on_rq )
2007-08-09 13:16:47 +04:00
update_curr ( cfs_rq ) ;
2007-07-09 20:51:58 +04:00
2007-08-09 13:16:48 +04:00
update_stats_curr_end ( cfs_rq , prev ) ;
2007-07-09 20:51:58 +04:00
if ( prev - > on_rq )
2007-08-09 13:16:47 +04:00
update_stats_wait_start ( cfs_rq , prev ) ;
2007-10-15 19:00:03 +04:00
cfs_rq - > curr = NULL ;
2007-07-09 20:51:58 +04:00
}
static void entity_tick ( struct cfs_rq * cfs_rq , struct sched_entity * curr )
{
/*
* Dequeue and enqueue the task to update its
* position within the tree :
*/
2007-08-09 13:16:48 +04:00
dequeue_entity ( cfs_rq , curr , 0 ) ;
2007-08-09 13:16:48 +04:00
enqueue_entity ( cfs_rq , curr , 0 ) ;
2007-07-09 20:51:58 +04:00
2007-10-15 19:00:05 +04:00
if ( cfs_rq - > nr_running > 1 )
check_preempt_tick ( cfs_rq , curr ) ;
2007-07-09 20:51:58 +04:00
}
/**************************************************
* CFS operations on tasks :
*/
# ifdef CONFIG_FAIR_GROUP_SCHED
/*
 * Walk up the scheduling entity hierarchy via ->parent until NULL.
 * 'se' must be a modifiable pointer lvalue; it is parenthesized so
 * expressions such as *pp expand correctly.
 */
#define for_each_sched_entity(se) \
		for (; (se); (se) = (se)->parent)
static inline struct cfs_rq * task_cfs_rq ( struct task_struct * p )
{
return p - > se . cfs_rq ;
}
/* runqueue on which this entity is (to be) queued */
static inline struct cfs_rq * cfs_rq_of ( struct sched_entity * se )
{
return se - > cfs_rq ;
}
/* runqueue "owned" by this group */
static inline struct cfs_rq * group_cfs_rq ( struct sched_entity * grp )
{
return grp - > my_q ;
}
/* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on
* another cpu ( ' this_cpu ' )
*/
static inline struct cfs_rq * cpu_cfs_rq ( struct cfs_rq * cfs_rq , int this_cpu )
{
/* A later patch will take group into account */
return & cpu_rq ( this_cpu ) - > cfs ;
}
/*
 * Iterate through all leaf cfs_rq's on a runqueue.
 * 'rq' is parenthesized so any pointer expression can be passed.
 */
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
	list_for_each_entry(cfs_rq, &(rq)->leaf_cfs_rq_list, leaf_cfs_rq_list)
/* Do the two (enqueued) tasks belong to the same group ? */
static inline int is_same_group ( struct task_struct * curr , struct task_struct * p )
{
if ( curr - > se . cfs_rq = = p - > se . cfs_rq )
return 1 ;
return 0 ;
}
# else /* CONFIG_FAIR_GROUP_SCHED */
/*
 * Without group scheduling there is no hierarchy to walk: the body runs
 * at most once because 'se' is set to NULL after the first pass.
 */
# define for_each_sched_entity(se) \
for ( ; se ; se = NULL )
static inline struct cfs_rq * task_cfs_rq ( struct task_struct * p )
{
return & task_rq ( p ) - > cfs ;
}
static inline struct cfs_rq * cfs_rq_of ( struct sched_entity * se )
{
struct task_struct * p = task_of ( se ) ;
struct rq * rq = task_rq ( p ) ;
return & rq - > cfs ;
}
/* runqueue "owned" by this group */
static inline struct cfs_rq * group_cfs_rq ( struct sched_entity * grp )
{
/* Without group scheduling no entity owns a runqueue. */
return NULL ;
}
static inline struct cfs_rq * cpu_cfs_rq ( struct cfs_rq * cfs_rq , int this_cpu )
{
return & cpu_rq ( this_cpu ) - > cfs ;
}
/*
 * Without group scheduling the only leaf queue is the one embedded in
 * the runqueue, so the body runs exactly once. 'rq' is parenthesized so
 * any pointer expression can be passed.
 */
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
		for (cfs_rq = &(rq)->cfs; cfs_rq; cfs_rq = NULL)
static inline int is_same_group ( struct task_struct * curr , struct task_struct * p )
{
/* Without group scheduling all tasks count as the same group. */
return 1 ;
}
# endif /* CONFIG_FAIR_GROUP_SCHED */
/*
* The enqueue_task method is called before nr_running is
* increased . Here we update the fair scheduling stats and
* then put the task into the rbtree :
*/
2007-08-09 13:16:48 +04:00
static void enqueue_task_fair ( struct rq * rq , struct task_struct * p , int wakeup )
2007-07-09 20:51:58 +04:00
{
struct cfs_rq * cfs_rq ;
struct sched_entity * se = & p - > se ;
for_each_sched_entity ( se ) {
if ( se - > on_rq )
break ;
cfs_rq = cfs_rq_of ( se ) ;
2007-08-09 13:16:48 +04:00
enqueue_entity ( cfs_rq , se , wakeup ) ;
2007-07-09 20:51:58 +04:00
}
}
/*
* The dequeue_task method is called before nr_running is
* decreased . We remove the task from the rbtree and
* update the fair scheduling stats :
*/
2007-08-09 13:16:48 +04:00
static void dequeue_task_fair ( struct rq * rq , struct task_struct * p , int sleep )
2007-07-09 20:51:58 +04:00
{
struct cfs_rq * cfs_rq ;
struct sched_entity * se = & p - > se ;
for_each_sched_entity ( se ) {
cfs_rq = cfs_rq_of ( se ) ;
2007-08-09 13:16:48 +04:00
dequeue_entity ( cfs_rq , se , sleep ) ;
2007-07-09 20:51:58 +04:00
/* Don't dequeue parent if it has other entities besides us */
if ( cfs_rq - > load . weight )
break ;
}
}
/*
2007-09-20 01:34:46 +04:00
* sched_yield ( ) support is very simple - we dequeue and enqueue .
*
* If compat_yield is turned on then we requeue to the end of the tree .
2007-07-09 20:51:58 +04:00
*/
static void yield_task_fair ( struct rq * rq , struct task_struct * p )
{
struct cfs_rq * cfs_rq = task_cfs_rq ( p ) ;
2007-09-20 01:34:46 +04:00
struct rb_node * * link = & cfs_rq - > tasks_timeline . rb_node ;
struct sched_entity * rightmost , * se = & p - > se ;
struct rb_node * parent ;
2007-07-09 20:51:58 +04:00
/*
2007-09-20 01:34:46 +04:00
* Are we the only task in the tree ?
*/
if ( unlikely ( cfs_rq - > nr_running = = 1 ) )
return ;
if ( likely ( ! sysctl_sched_compat_yield ) ) {
__update_rq_clock ( rq ) ;
/*
* Dequeue and enqueue the task to update its
* position within the tree :
*/
dequeue_entity ( cfs_rq , & p - > se , 0 ) ;
enqueue_entity ( cfs_rq , & p - > se , 0 ) ;
return ;
}
/*
* Find the rightmost entry in the rbtree :
2007-07-09 20:51:58 +04:00
*/
2007-09-20 01:34:46 +04:00
do {
parent = * link ;
link = & parent - > rb_right ;
} while ( * link ) ;
rightmost = rb_entry ( parent , struct sched_entity , run_node ) ;
/*
* Already in the rightmost position ?
*/
if ( unlikely ( rightmost = = se ) )
return ;
/*
* Minimally necessary key value to be last in the tree :
*/
se - > fair_key = rightmost - > fair_key + 1 ;
if ( cfs_rq - > rb_leftmost = = & se - > run_node )
cfs_rq - > rb_leftmost = rb_next ( & se - > run_node ) ;
/*
* Relink the task to the rightmost position :
*/
rb_erase ( & se - > run_node , & cfs_rq - > tasks_timeline ) ;
rb_link_node ( & se - > run_node , parent , link ) ;
rb_insert_color ( & se - > run_node , & cfs_rq - > tasks_timeline ) ;
2007-07-09 20:51:58 +04:00
}
/*
* Preempt the current task with a newly woken task if needed :
*/
2007-10-15 19:00:05 +04:00
static void check_preempt_wakeup ( struct rq * rq , struct task_struct * p )
2007-07-09 20:51:58 +04:00
{
struct task_struct * curr = rq - > curr ;
struct cfs_rq * cfs_rq = task_cfs_rq ( curr ) ;
if ( unlikely ( rt_prio ( p - > prio ) ) ) {
2007-08-09 13:16:47 +04:00
update_rq_clock ( rq ) ;
2007-08-09 13:16:47 +04:00
update_curr ( cfs_rq ) ;
2007-07-09 20:51:58 +04:00
resched_task ( curr ) ;
return ;
}
2007-10-15 19:00:05 +04:00
if ( is_same_group ( curr , p ) ) {
s64 delta = curr - > se . vruntime - p - > se . vruntime ;
2007-07-09 20:51:58 +04:00
2007-10-15 19:00:05 +04:00
if ( delta > ( s64 ) sysctl_sched_wakeup_granularity )
resched_task ( curr ) ;
}
2007-07-09 20:51:58 +04:00
}
2007-08-09 13:16:48 +04:00
static struct task_struct * pick_next_task_fair ( struct rq * rq )
2007-07-09 20:51:58 +04:00
{
struct cfs_rq * cfs_rq = & rq - > cfs ;
struct sched_entity * se ;
if ( unlikely ( ! cfs_rq - > nr_running ) )
return NULL ;
do {
2007-08-09 13:16:48 +04:00
se = pick_next_entity ( cfs_rq ) ;
2007-07-09 20:51:58 +04:00
cfs_rq = group_cfs_rq ( se ) ;
} while ( cfs_rq ) ;
return task_of ( se ) ;
}
/*
* Account for a descheduled task :
*/
2007-08-09 13:16:49 +04:00
static void put_prev_task_fair ( struct rq * rq , struct task_struct * prev )
2007-07-09 20:51:58 +04:00
{
struct sched_entity * se = & prev - > se ;
struct cfs_rq * cfs_rq ;
for_each_sched_entity ( se ) {
cfs_rq = cfs_rq_of ( se ) ;
2007-08-09 13:16:48 +04:00
put_prev_entity ( cfs_rq , se ) ;
2007-07-09 20:51:58 +04:00
}
}
/**************************************************
* Fair scheduling class load - balancing methods :
*/
/*
* Load - balancing iterator . Note : while the runqueue stays locked
* during the whole iteration , the current task might be
* dequeued so the iterator has to be dequeue - safe . Here we
* achieve that by always pre - iterating before returning
* the current task :
*/
static inline struct task_struct *
__load_balance_iterator ( struct cfs_rq * cfs_rq , struct rb_node * curr )
{
struct task_struct * p ;
if ( ! curr )
return NULL ;
p = rb_entry ( curr , struct task_struct , se . run_node ) ;
cfs_rq - > rb_load_balance_curr = rb_next ( curr ) ;
return p ;
}
/* Iterator start: 'arg' is the cfs_rq; begin at its leftmost node. */
static struct task_struct *load_balance_start_fair(void *arg)
{
	struct cfs_rq *cfs_rq = arg;
	struct rb_node *first = first_fair(cfs_rq);

	return __load_balance_iterator(cfs_rq, first);
}
static struct task_struct * load_balance_next_fair ( void * arg )
{
struct cfs_rq * cfs_rq = arg ;
return __load_balance_iterator ( cfs_rq , cfs_rq - > rb_load_balance_curr ) ;
}
2007-08-09 13:16:46 +04:00
# ifdef CONFIG_FAIR_GROUP_SCHED
2007-07-09 20:51:58 +04:00
static int cfs_rq_best_prio ( struct cfs_rq * cfs_rq )
{
struct sched_entity * curr ;
struct task_struct * p ;
if ( ! cfs_rq - > nr_running )
return MAX_PRIO ;
curr = __pick_next_entity ( cfs_rq ) ;
p = task_of ( curr ) ;
return p - > prio ;
}
2007-08-09 13:16:46 +04:00
# endif
2007-07-09 20:51:58 +04:00
sched: simplify move_tasks()
The move_tasks() function is currently multiplexed with two distinct
capabilities:
1. attempt to move a specified amount of weighted load from one run
queue to another; and
2. attempt to move a specified number of tasks from one run queue to
another.
The first of these capabilities is used in two places, load_balance()
and load_balance_idle(), and in both of these cases the return value of
move_tasks() is used purely to decide if tasks/load were moved and no
notice of the actual number of tasks moved is taken.
The second capability is used in exactly one place,
active_load_balance(), to attempt to move exactly one task and, as
before, the return value is only used as an indicator of success or failure.
This multiplexing of sched_task() was introduced, by me, as part of the
smpnice patches and was motivated by the fact that the alternative, one
function to move specified load and one to move a single task, would
have led to two functions of roughly the same complexity as the old
move_tasks() (or the new balance_tasks()). However, the new modular
design of the new CFS scheduler allows a simpler solution to be adopted
and this patch addresses that solution by:
1. adding a new function, move_one_task(), to be used by
active_load_balance(); and
2. making move_tasks() a single purpose function that tries to move a
specified weighted load and returns 1 for success and 0 for failure.
One of the consequences of these changes is that neither move_one_task()
or the new move_tasks() care how many tasks sched_class.load_balance()
moves and this enables its interface to be simplified by returning the
amount of load moved as its result and removing the load_moved pointer
from the argument list. This helps simplify the new move_tasks() and
slightly reduces the amount of work done in each of
sched_class.load_balance()'s implementations.
Further simplification, e.g. changes to balance_tasks(), are possible
but (slightly) complicated by the special needs of load_balance_fair()
so I've left them to a later patch (if this one gets accepted).
NB Since move_tasks() gets called with two run queue locks held even
small reductions in overhead are worthwhile.
[ mingo@elte.hu ]
this change also reduces code size nicely:
text data bss dec hex filename
39216 3618 24 42858 a76a sched.o.before
39173 3618 24 42815 a73f sched.o.after
Signed-off-by: Peter Williams <pwil3058@bigpond.net.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2007-08-09 13:16:46 +04:00
static unsigned long
2007-07-09 20:51:58 +04:00
load_balance_fair ( struct rq * this_rq , int this_cpu , struct rq * busiest ,
2007-08-09 13:16:46 +04:00
unsigned long max_nr_move , unsigned long max_load_move ,
struct sched_domain * sd , enum cpu_idle_type idle ,
int * all_pinned , int * this_best_prio )
2007-07-09 20:51:58 +04:00
{
struct cfs_rq * busy_cfs_rq ;
unsigned long load_moved , total_nr_moved = 0 , nr_moved ;
long rem_load_move = max_load_move ;
struct rq_iterator cfs_rq_iterator ;
cfs_rq_iterator . start = load_balance_start_fair ;
cfs_rq_iterator . next = load_balance_next_fair ;
for_each_leaf_cfs_rq ( busiest , busy_cfs_rq ) {
2007-08-09 13:16:46 +04:00
# ifdef CONFIG_FAIR_GROUP_SCHED
2007-07-09 20:51:58 +04:00
struct cfs_rq * this_cfs_rq ;
2007-08-11 01:05:11 +04:00
long imbalance ;
2007-07-09 20:51:58 +04:00
unsigned long maxload ;
this_cfs_rq = cpu_cfs_rq ( busy_cfs_rq , this_cpu ) ;
2007-08-11 01:05:11 +04:00
imbalance = busy_cfs_rq - > load . weight - this_cfs_rq - > load . weight ;
2007-07-09 20:51:58 +04:00
/* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
if ( imbalance < = 0 )
continue ;
/* Don't pull more than imbalance/2 */
imbalance / = 2 ;
maxload = min ( rem_load_move , imbalance ) ;
2007-08-09 13:16:46 +04:00
* this_best_prio = cfs_rq_best_prio ( this_cfs_rq ) ;
# else
2007-08-11 01:05:11 +04:00
# define maxload rem_load_move
2007-08-09 13:16:46 +04:00
# endif
2007-07-09 20:51:58 +04:00
/* pass busy_cfs_rq argument into
* load_balance_ [ start | next ] _fair iterators
*/
cfs_rq_iterator . arg = busy_cfs_rq ;
nr_moved = balance_tasks ( this_rq , this_cpu , busiest ,
max_nr_move , maxload , sd , idle , all_pinned ,
2007-08-09 13:16:46 +04:00
& load_moved , this_best_prio , & cfs_rq_iterator ) ;
2007-07-09 20:51:58 +04:00
total_nr_moved + = nr_moved ;
max_nr_move - = nr_moved ;
rem_load_move - = load_moved ;
if ( max_nr_move < = 0 | | rem_load_move < = 0 )
break ;
}
sched: simplify move_tasks()
The move_tasks() function is currently multiplexed with two distinct
capabilities:
1. attempt to move a specified amount of weighted load from one run
queue to another; and
2. attempt to move a specified number of tasks from one run queue to
another.
The first of these capabilities is used in two places, load_balance()
and load_balance_idle(), and in both of these cases the return value of
move_tasks() is used purely to decide if tasks/load were moved and no
notice of the actual number of tasks moved is taken.
The second capability is used in exactly one place,
active_load_balance(), to attempt to move exactly one task and, as
before, the return value is only used as an indicator of success or failure.
This multiplexing of sched_task() was introduced, by me, as part of the
smpnice patches and was motivated by the fact that the alternative, one
function to move specified load and one to move a single task, would
have led to two functions of roughly the same complexity as the old
move_tasks() (or the new balance_tasks()). However, the new modular
design of the new CFS scheduler allows a simpler solution to be adopted
and this patch addresses that solution by:
1. adding a new function, move_one_task(), to be used by
active_load_balance(); and
2. making move_tasks() a single purpose function that tries to move a
specified weighted load and returns 1 for success and 0 for failure.
One of the consequences of these changes is that neither move_one_task()
nor the new move_tasks() cares how many tasks sched_class.load_balance()
moves and this enables its interface to be simplified by returning the
amount of load moved as its result and removing the load_moved pointer
from the argument list. This helps simplify the new move_tasks() and
slightly reduces the amount of work done in each of
sched_class.load_balance()'s implementations.
Further simplification, e.g. changes to balance_tasks(), are possible
but (slightly) complicated by the special needs of load_balance_fair()
so I've left them to a later patch (if this one gets accepted).
NB Since move_tasks() gets called with two run queue locks held even
small reductions in overhead are worthwhile.
[ mingo@elte.hu ]
this change also reduces code size nicely:
text data bss dec hex filename
39216 3618 24 42858 a76a sched.o.before
39173 3618 24 42815 a73f sched.o.after
Signed-off-by: Peter Williams <pwil3058@bigpond.net.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2007-08-09 13:16:46 +04:00
return max_load_move - rem_load_move ;
2007-07-09 20:51:58 +04:00
}
/*
* scheduler tick hitting a task of our scheduling class :
*/
static void task_tick_fair ( struct rq * rq , struct task_struct * curr )
{
struct cfs_rq * cfs_rq ;
struct sched_entity * se = & curr - > se ;
for_each_sched_entity ( se ) {
cfs_rq = cfs_rq_of ( se ) ;
entity_tick ( cfs_rq , se ) ;
}
}
2007-10-15 19:00:04 +04:00
/*
 * swap(a, b) - exchange the values of two lvalues of the same type.
 *
 * The temporary uses a deliberately unusual name so that an argument
 * which happens to be called 'tmp' is not captured by the macro's own
 * local (which would turn the swap into a self-initialisation).
 * Both arguments are evaluated more than once; do not pass expressions
 * with side effects.
 */
#define swap(a, b) \
	do { __typeof__(a) _swap_tmp = (a); (a) = (b); (b) = _swap_tmp; } while (0)
2007-07-09 20:51:58 +04:00
/*
* Share the fairness runtime between parent and child , thus the
* total amount of pressure for CPU stays equal - new tasks
* get a chance to run but frequent forkers are not allowed to
* monopolize the CPU . Note : the parent runqueue is locked ,
* the child is not running yet .
*/
2007-08-09 13:16:49 +04:00
static void task_new_fair ( struct rq * rq , struct task_struct * p )
2007-07-09 20:51:58 +04:00
{
struct cfs_rq * cfs_rq = task_cfs_rq ( p ) ;
2007-10-15 19:00:03 +04:00
struct sched_entity * se = & p - > se , * curr = cfs_rq - > curr ;
2007-07-09 20:51:58 +04:00
sched_info_queued ( p ) ;
2007-08-28 14:53:24 +04:00
update_curr ( cfs_rq ) ;
2007-10-15 19:00:05 +04:00
place_entity ( cfs_rq , se , 1 ) ;
2007-10-15 19:00:04 +04:00
if ( sysctl_sched_child_runs_first & &
curr - > vruntime < se - > vruntime ) {
dequeue_entity ( cfs_rq , curr , 0 ) ;
swap ( curr - > vruntime , se - > vruntime ) ;
enqueue_entity ( cfs_rq , curr , 0 ) ;
}
2007-07-09 20:51:58 +04:00
2007-10-15 19:00:04 +04:00
update_stats_enqueue ( cfs_rq , se ) ;
2007-07-09 20:51:58 +04:00
__enqueue_entity ( cfs_rq , se ) ;
2007-10-15 19:00:02 +04:00
resched_task ( rq - > curr ) ;
2007-07-09 20:51:58 +04:00
}
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
 * Account for a task changing its policy or group.
 *
 * Calls set_next_entity() for each entity visited by
 * for_each_sched_entity(), starting at the entity of rq->curr, on the
 * cfs_rq that cfs_rq_of() reports for it. This routine is mostly called
 * to set the cfs_rq->curr field when a task migrates between
 * groups/classes.
 *
 * @rq: runqueue whose current task is being (re)registered
 */
static void set_curr_task_fair(struct rq *rq)
{
	struct sched_entity *se;

	se = &rq->curr->se;
	for_each_sched_entity(se) {
		struct cfs_rq *cfs_rq = cfs_rq_of(se);

		set_next_entity(cfs_rq, se);
	}
}
#else
/* Without CONFIG_FAIR_GROUP_SCHED this hook is an empty stub. */
static void set_curr_task_fair(struct rq *rq)
{
}
#endif
/*
 * Method table of the fair scheduling class: every sched_class hook
 * listed here is bound to the corresponding fair-class implementation.
 */
struct sched_class fair_sched_class __read_mostly = {
	.enqueue_task		= enqueue_task_fair,
	.dequeue_task		= dequeue_task_fair,
	.yield_task		= yield_task_fair,

	.check_preempt_curr	= check_preempt_wakeup,

	.pick_next_task		= pick_next_task_fair,
	.put_prev_task		= put_prev_task_fair,

	.load_balance		= load_balance_fair,

	.set_curr_task		= set_curr_task_fair,
	.task_tick		= task_tick_fair,
	.task_new		= task_new_fair,
};
# ifdef CONFIG_SCHED_DEBUG
2007-08-09 13:16:47 +04:00
static void print_cfs_stats ( struct seq_file * m , int cpu )
2007-07-09 20:51:58 +04:00
{
struct cfs_rq * cfs_rq ;
2007-08-09 13:16:51 +04:00
for_each_leaf_cfs_rq ( cpu_rq ( cpu ) , cfs_rq )
2007-08-09 13:16:47 +04:00
print_cfs_rq ( m , cpu , cfs_rq ) ;
2007-07-09 20:51:58 +04:00
}
# endif