From 4d9842776a23e52ec4c60e0a79f5e1bbe91e463e Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Mon, 29 Dec 2008 09:39:49 -0500 Subject: [PATCH 01/11] sched: cleanup inc/dec_rt_tasks Move some common definitions up to the function prologe to simplify the body logic. Signed-off-by: Gregory Haskins --- kernel/sched_rt.c | 40 ++++++++++++++++------------------------ 1 file changed, 16 insertions(+), 24 deletions(-) diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 1bbd99014011..0a5277233452 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -550,30 +550,28 @@ static void update_curr_rt(struct rq *rq) static inline void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { - WARN_ON(!rt_prio(rt_se_prio(rt_se))); + int prio = rt_se_prio(rt_se); +#ifdef CONFIG_SMP + struct rq *rq = rq_of_rt_rq(rt_rq); +#endif + + WARN_ON(!rt_prio(prio)); rt_rq->rt_nr_running++; #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED - if (rt_se_prio(rt_se) < rt_rq->highest_prio) { -#ifdef CONFIG_SMP - struct rq *rq = rq_of_rt_rq(rt_rq); -#endif + if (prio < rt_rq->highest_prio) { - rt_rq->highest_prio = rt_se_prio(rt_se); + rt_rq->highest_prio = prio; #ifdef CONFIG_SMP if (rq->online) - cpupri_set(&rq->rd->cpupri, rq->cpu, - rt_se_prio(rt_se)); + cpupri_set(&rq->rd->cpupri, rq->cpu, prio); #endif } #endif #ifdef CONFIG_SMP - if (rt_se->nr_cpus_allowed > 1) { - struct rq *rq = rq_of_rt_rq(rt_rq); - + if (rt_se->nr_cpus_allowed > 1) rq->rt.rt_nr_migratory++; - } - update_rt_migration(rq_of_rt_rq(rt_rq)); + update_rt_migration(rq); #endif #ifdef CONFIG_RT_GROUP_SCHED if (rt_se_boosted(rt_se)) @@ -590,6 +588,7 @@ static inline void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { #ifdef CONFIG_SMP + struct rq *rq = rq_of_rt_rq(rt_rq); int highest_prio = rt_rq->highest_prio; #endif @@ -611,20 +610,13 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) rt_rq->highest_prio = MAX_RT_PRIO; #endif #ifdef CONFIG_SMP - if (rt_se->nr_cpus_allowed > 1) { - struct rq *rq = rq_of_rt_rq(rt_rq); + if (rt_se->nr_cpus_allowed > 1) rq->rt.rt_nr_migratory--; - } - if (rt_rq->highest_prio != highest_prio) { - struct rq *rq = rq_of_rt_rq(rt_rq); + if (rq->online && rt_rq->highest_prio != highest_prio) + cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio); - if (rq->online) - cpupri_set(&rq->rd->cpupri, rq->cpu, - rt_rq->highest_prio); - } - - update_rt_migration(rq_of_rt_rq(rt_rq)); + update_rt_migration(rq); #endif /* CONFIG_SMP */ #ifdef CONFIG_RT_GROUP_SCHED if (rt_se_boosted(rt_se)) From e864c499d9e57805ae1f9e7ea404dd223759cd53 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Mon, 29 Dec 2008 09:39:49 -0500 Subject: [PATCH 02/11] sched: track the next-highest priority on each runqueue We will use this later in the series to reduce the amount of rq-lock contention during a pull operation Signed-off-by: Gregory Haskins --- kernel/sched.c | 8 +++-- kernel/sched_rt.c | 81 +++++++++++++++++++++++++++++++++++------------ 2 files changed, 67 insertions(+), 22 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 756d981d91a4..7729f9a45a8b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -463,7 +463,10 @@ struct rt_rq { struct rt_prio_array active; unsigned long rt_nr_running; #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED - int highest_prio; /* highest queued rt task prio */ + struct { + int curr; /* highest queued rt task prio */ + int next; /* next highest */ + } highest_prio; #endif #ifdef CONFIG_SMP unsigned long rt_nr_migratory; @@ -8169,7 +8172,8 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) __set_bit(MAX_RT_PRIO, array->bitmap); #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED - rt_rq->highest_prio = MAX_RT_PRIO; + rt_rq->highest_prio.curr = MAX_RT_PRIO; + rt_rq->highest_prio.next = MAX_RT_PRIO; #endif #ifdef CONFIG_SMP rt_rq->rt_nr_migratory = 0; diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 0a5277233452..ad36d7232236 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -108,7 +108,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) if (rt_rq->rt_nr_running) { if (rt_se && !on_rt_rq(rt_se)) enqueue_rt_entity(rt_se); - if (rt_rq->highest_prio < curr->prio) + if (rt_rq->highest_prio.curr < curr->prio) resched_task(curr); } } @@ -473,7 +473,7 @@ static inline int rt_se_prio(struct sched_rt_entity *rt_se) struct rt_rq *rt_rq = group_rt_rq(rt_se); if (rt_rq) - return rt_rq->highest_prio; + return rt_rq->highest_prio.curr; #endif return rt_task_of(rt_se)->prio; @@ -547,6 +547,21 @@ static void update_curr_rt(struct rq *rq) } } +#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED + +static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu); + +static inline int next_prio(struct rq *rq) +{ + struct task_struct *next = pick_next_highest_task_rt(rq, rq->cpu); + + if (next && rt_prio(next->prio)) + return next->prio; + else + return MAX_RT_PRIO; +} +#endif + static inline void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { @@ -558,14 +573,32 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) WARN_ON(!rt_prio(prio)); rt_rq->rt_nr_running++; #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED - if (prio < rt_rq->highest_prio) { + if (prio < rt_rq->highest_prio.curr) { - rt_rq->highest_prio = prio; + /* + * If the new task is higher in priority than anything on the + * run-queue, we have a new high that must be published to + * the world. We also know that the previous high becomes + * our next-highest. + */ + rt_rq->highest_prio.next = rt_rq->highest_prio.curr; + rt_rq->highest_prio.curr = prio; #ifdef CONFIG_SMP if (rq->online) cpupri_set(&rq->rd->cpupri, rq->cpu, prio); #endif - } + } else if (prio == rt_rq->highest_prio.curr) + /* + * If the next task is equal in priority to the highest on + * the run-queue, then we implicitly know that the next highest + * task cannot be any lower than current + */ + rt_rq->highest_prio.next = prio; + else if (prio < rt_rq->highest_prio.next) + /* + * Otherwise, we need to recompute next-highest + */ + rt_rq->highest_prio.next = next_prio(rq); #endif #ifdef CONFIG_SMP if (rt_se->nr_cpus_allowed > 1) @@ -589,7 +622,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { #ifdef CONFIG_SMP struct rq *rq = rq_of_rt_rq(rt_rq); - int highest_prio = rt_rq->highest_prio; + int highest_prio = rt_rq->highest_prio.curr; #endif WARN_ON(!rt_prio(rt_se_prio(rt_se))); @@ -597,24 +630,32 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) rt_rq->rt_nr_running--; #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED if (rt_rq->rt_nr_running) { - struct rt_prio_array *array; + int prio = rt_se_prio(rt_se); - WARN_ON(rt_se_prio(rt_se) < rt_rq->highest_prio); - if (rt_se_prio(rt_se) == rt_rq->highest_prio) { - /* recalculate */ - array = &rt_rq->active; - rt_rq->highest_prio = + WARN_ON(prio < rt_rq->highest_prio.curr); + + /* + * This may have been our highest or next-highest priority + * task and therefore we may have some recomputation to do + */ + if (prio == rt_rq->highest_prio.curr) { + struct rt_prio_array *array = &rt_rq->active; + + rt_rq->highest_prio.curr = sched_find_first_bit(array->bitmap); - } /* otherwise leave rq->highest prio alone */ + } + + if (prio <= rt_rq->highest_prio.next) + rt_rq->highest_prio.next = next_prio(rq); } else - rt_rq->highest_prio = MAX_RT_PRIO; + rt_rq->highest_prio.curr = MAX_RT_PRIO; #endif #ifdef CONFIG_SMP if (rt_se->nr_cpus_allowed > 1) rq->rt.rt_nr_migratory--; - if (rq->online && rt_rq->highest_prio != highest_prio) - cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio); + if (rq->online && rt_rq->highest_prio.curr != highest_prio) + cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr); update_rt_migration(rq); #endif /* CONFIG_SMP */ @@ -1064,7 +1105,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) } /* If this rq is still suitable use it. */ - if (lowest_rq->rt.highest_prio > task->prio) + if (lowest_rq->rt.highest_prio.curr > task->prio) break; /* try again */ @@ -1252,7 +1293,7 @@ static int pull_rt_task(struct rq *this_rq) static void pre_schedule_rt(struct rq *rq, struct task_struct *prev) { /* Try to pull RT tasks here if we lower this rq's prio */ - if (unlikely(rt_task(prev)) && rq->rt.highest_prio > prev->prio) + if (unlikely(rt_task(prev)) && rq->rt.highest_prio.curr > prev->prio) pull_rt_task(rq); } @@ -1338,7 +1379,7 @@ static void rq_online_rt(struct rq *rq) __enable_runtime(rq); - cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio); + cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr); } /* Assumes rq->lock is held */ @@ -1429,7 +1470,7 @@ static void prio_changed_rt(struct rq *rq, struct task_struct *p, * can release the rq lock and p could migrate. * Only reschedule if p is still on the same runqueue. */ - if (p->prio > rq->rt.highest_prio && rq->curr == p) + if (p->prio > rq->rt.highest_prio.curr && rq->curr == p) resched_task(p); #else /* For UP simply resched on drop of prio */ From a8728944efe23417e38bf22063f06d9d8ee21d59 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Mon, 29 Dec 2008 09:39:49 -0500 Subject: [PATCH 03/11] sched: use highest_prio.curr for pull threshold highest_prio.curr is actually a more accurate way to keep track of the pull_rt_task() threshold since it is always up to date, even if the "next" task migrates during double_lock. Therefore, stop looking at the "next" task object and simply use the highest_prio.curr. Signed-off-by: Gregory Haskins --- kernel/sched_rt.c | 31 ++++++------------------------- 1 file changed, 6 insertions(+), 25 deletions(-) diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index ad36d7232236..f8fb3edadcaa 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1207,14 +1207,12 @@ static void push_rt_tasks(struct rq *rq) static int pull_rt_task(struct rq *this_rq) { int this_cpu = this_rq->cpu, ret = 0, cpu; - struct task_struct *p, *next; + struct task_struct *p; struct rq *src_rq; if (likely(!rt_overloaded(this_rq))) return 0; - next = pick_next_task_rt(this_rq); - for_each_cpu(cpu, this_rq->rd->rto_mask) { if (this_cpu == cpu) continue; @@ -1223,17 +1221,9 @@ static int pull_rt_task(struct rq *this_rq) /* * We can potentially drop this_rq's lock in * double_lock_balance, and another CPU could - * steal our next task - hence we must cause - * the caller to recalculate the next task - * in that case: + * alter this_rq */ - if (double_lock_balance(this_rq, src_rq)) { - struct task_struct *old_next = next; - - next = pick_next_task_rt(this_rq); - if (next != old_next) - ret = 1; - } + double_lock_balance(this_rq, src_rq); /* * Are there still pullable RT tasks? @@ -1247,7 +1237,7 @@ static int pull_rt_task(struct rq *this_rq) * Do we have an RT task that preempts * the to-be-scheduled task? */ - if (p && (!next || (p->prio < next->prio))) { + if (p && (p->prio < this_rq->rt.highest_prio.curr)) { WARN_ON(p == src_rq->curr); WARN_ON(!p->se.on_rq); @@ -1257,12 +1247,9 @@ static int pull_rt_task(struct rq *this_rq) * This is just that p is wakeing up and hasn't * had a chance to schedule. We only pull * p if it is lower in priority than the - * current task on the run queue or - * this_rq next task is lower in prio than - * the current task on that rq. + * current task on the run queue */ - if (p->prio < src_rq->curr->prio || - (next && next->prio < src_rq->curr->prio)) + if (p->prio < src_rq->curr->prio) goto skip; ret = 1; @@ -1275,13 +1262,7 @@ static int pull_rt_task(struct rq *this_rq) * case there's an even higher prio task * in another runqueue. (low likelyhood * but possible) - * - * Update next so that we won't pick a task - * on another cpu with a priority lower (or equal) - * than the one we just picked. */ - next = p; - } skip: double_unlock_balance(this_rq, src_rq); From 74ab8e4f6412c0b2d730fe5de28dc21de8b92c01 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Mon, 29 Dec 2008 09:39:50 -0500 Subject: [PATCH 04/11] sched: use highest_prio.next to optimize pull operations We currently take the rq->lock for every cpu in an overload state during pull_rt_tasks(). However, we now have enough information via the highest_prio.[curr|next] fields to determine if there is any tasks of interest to warrant the overhead of the rq->lock, before we actually take it. So we use this information to reduce lock contention during the pull for the case where the source-rq doesnt have tasks that preempt the current task. Signed-off-by: Gregory Haskins --- kernel/sched_rt.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index f8fb3edadcaa..d047f288c411 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1218,6 +1218,18 @@ static int pull_rt_task(struct rq *this_rq) continue; src_rq = cpu_rq(cpu); + + /* + * Don't bother taking the src_rq->lock if the next highest + * task is known to be lower-priority than our current task. + * This may look racy, but if this value is about to go + * logically higher, the src_rq will push this task away. + * And if its going logically lower, we do not care + */ + if (src_rq->rt.highest_prio.next >= + this_rq->rt.highest_prio.curr) + continue; + /* * We can potentially drop this_rq's lock in * double_lock_balance, and another CPU could From 777c2f389e463428fd7e2871051a84d7fe84b172 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Mon, 29 Dec 2008 09:39:50 -0500 Subject: [PATCH 05/11] sched: only try to push a task on wakeup if it is migratable There is no sense in wasting time trying to push a task away that cannot move anywhere else. We gain no benefit from trying to push other tasks at this point, so if the task being woken up is non migratable, just skip the whole operation. This reduces overhead in the wakeup path for certain tasks. Signed-off-by: Gregory Haskins --- kernel/sched_rt.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index d047f288c411..8d33843cb2c4 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1314,7 +1314,8 @@ static void task_wake_up_rt(struct rq *rq, struct task_struct *p) { if (!task_running(rq, p) && !test_tsk_need_resched(rq->curr) && - rq->rt.overloaded) + rq->rt.overloaded && + p->rt.nr_cpus_allowed > 1) push_rt_tasks(rq); } From 7e96fa5875d4a9be18d74d3ca7b90518d05bc426 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Mon, 29 Dec 2008 09:39:50 -0500 Subject: [PATCH 06/11] sched: pull only one task during NEWIDLE balancing to limit critical section git-id c4acb2c0669c5c5c9b28e9d02a34b5c67edf7092 attempted to limit newidle critical section length by stopping after at least one task was moved. Further investigation has shown that there are other paths nested further inside the algorithm which still remain that allow long latencies to occur with newidle balancing. This patch applies the same technique inside balance_tasks() to limit the duration of this optional balancing operation. Signed-off-by: Gregory Haskins CC: Nick Piggin --- kernel/sched.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/kernel/sched.c b/kernel/sched.c index 7729f9a45a8b..94d9a6c5ff94 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2984,6 +2984,16 @@ next: pulled++; rem_load_move -= p->se.load.weight; +#ifdef CONFIG_PREEMPT + /* + * NEWIDLE balancing is a source of latency, so preemptible kernels + * will stop after the first task is pulled to minimize the critical + * section. + */ + if (idle == CPU_NEWLY_IDLE) + goto out; +#endif + /* * We only want to steal up to the prescribed amount of weighted load. */ @@ -3030,9 +3040,15 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, sd, idle, all_pinned, &this_best_prio); class = class->next; +#ifdef CONFIG_PREEMPT + /* + * NEWIDLE balancing is a source of latency, so preemptible + * kernels will stop after the first task is pulled to minimize + * the critical section. + */ if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) break; - +#endif } while (class && max_load_move > total_load_moved); return total_load_moved > 0; From 8f45e2b516201d1bf681e6026fa5276385def565 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Mon, 29 Dec 2008 09:39:51 -0500 Subject: [PATCH 07/11] sched: make double-lock-balance fair double_lock balance() currently favors logically lower cpus since they often do not have to release their own lock to acquire a second lock. The result is that logically higher cpus can get starved when there is a lot of pressure on the RQs. This can result in higher latencies on higher cpu-ids. This patch makes the algorithm more fair by forcing all paths to have to release both locks before acquiring them again. Since callsites to double_lock_balance already consider it a potential preemption/reschedule point, they have the proper logic to recheck for atomicity violations. Signed-off-by: Gregory Haskins --- kernel/sched.c | 51 +++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 44 insertions(+), 7 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 94d9a6c5ff94..8fca364f3593 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1608,21 +1608,42 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd) #endif +#ifdef CONFIG_PREEMPT + /* - * double_lock_balance - lock the busiest runqueue, this_rq is locked already. + * fair double_lock_balance: Safely acquires both rq->locks in a fair + * way at the expense of forcing extra atomic operations in all + * invocations. This assures that the double_lock is acquired using the + * same underlying policy as the spinlock_t on this architecture, which + * reduces latency compared to the unfair variant below. However, it + * also adds more overhead and therefore may reduce throughput. */ -static int double_lock_balance(struct rq *this_rq, struct rq *busiest) +static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) + __releases(this_rq->lock) + __acquires(busiest->lock) + __acquires(this_rq->lock) +{ + spin_unlock(&this_rq->lock); + double_rq_lock(this_rq, busiest); + + return 1; +} + +#else +/* + * Unfair double_lock_balance: Optimizes throughput at the expense of + * latency by eliminating extra atomic operations when the locks are + * already in proper order on entry. This favors lower cpu-ids and will + * grant the double lock to lower cpus over higher ids under contention, + * regardless of entry order into the function. + */ +static int _double_lock_balance(struct rq *this_rq, struct rq *busiest) __releases(this_rq->lock) __acquires(busiest->lock) __acquires(this_rq->lock) { int ret = 0; - if (unlikely(!irqs_disabled())) { - /* printk() doesn't work good under rq->lock */ - spin_unlock(&this_rq->lock); - BUG_ON(1); - } if (unlikely(!spin_trylock(&busiest->lock))) { if (busiest < this_rq) { spin_unlock(&this_rq->lock); @@ -1635,6 +1656,22 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest) return ret; } +#endif /* CONFIG_PREEMPT */ + +/* + * double_lock_balance - lock the busiest runqueue, this_rq is locked already. + */ +static int double_lock_balance(struct rq *this_rq, struct rq *busiest) +{ + if (unlikely(!irqs_disabled())) { + /* printk() doesn't work good under rq->lock */ + spin_unlock(&this_rq->lock); + BUG_ON(1); + } + + return _double_lock_balance(this_rq, busiest); +} + static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) __releases(busiest->lock) { From 967fc04671feea4dbf780c9e55a0bc8fcf68a14e Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Mon, 29 Dec 2008 09:39:52 -0500 Subject: [PATCH 08/11] sched: add sched_class->needs_post_schedule() member We currently run class->post_schedule() outside of the rq->lock, which means that we need to test for the need to post_schedule outside of the lock to avoid a forced reacquistion. This is currently not a problem as we only look at rq->rt.overloaded. However, we want to enhance this going forward to look at more state to reduce the need to post_schedule to a bare minimum set. Therefore, we introduce a new member-func called needs_post_schedule() which tests for the post_schedule condtion without actually performing the work. Therefore it is safe to call this function before the rq->lock is released, because we are guaranteed not to drop the lock at an intermediate point (such as what post_schedule() may do). We will use this later in the series [ rostedt: removed paranoid BUG_ON ] Signed-off-by: Gregory Haskins --- include/linux/sched.h | 1 + kernel/sched.c | 8 +++++++- kernel/sched_rt.c | 24 ++++++++++++++---------- 3 files changed, 22 insertions(+), 11 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index e5f928a079e8..836a86c32a65 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1012,6 +1012,7 @@ struct sched_class { struct rq *busiest, struct sched_domain *sd, enum cpu_idle_type idle); void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); + int (*needs_post_schedule) (struct rq *this_rq); void (*post_schedule) (struct rq *this_rq); void (*task_wake_up) (struct rq *this_rq, struct task_struct *task); diff --git a/kernel/sched.c b/kernel/sched.c index 8fca364f3593..3acbad8991a2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2621,6 +2621,12 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) { struct mm_struct *mm = rq->prev_mm; long prev_state; +#ifdef CONFIG_SMP + int post_schedule = 0; + + if (current->sched_class->needs_post_schedule) + post_schedule = current->sched_class->needs_post_schedule(rq); +#endif rq->prev_mm = NULL; @@ -2639,7 +2645,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) finish_arch_switch(prev); finish_lock_switch(rq, prev); #ifdef CONFIG_SMP - if (current->sched_class->post_schedule) + if (post_schedule) current->sched_class->post_schedule(rq); #endif diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 8d33843cb2c4..b0b6ea4ed674 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1290,20 +1290,23 @@ static void pre_schedule_rt(struct rq *rq, struct task_struct *prev) pull_rt_task(rq); } +/* + * assumes rq->lock is held + */ +static int needs_post_schedule_rt(struct rq *rq) +{ + return rq->rt.overloaded ? 1 : 0; +} + static void post_schedule_rt(struct rq *rq) { /* - * If we have more than one rt_task queued, then - * see if we can push the other rt_tasks off to other CPUS. - * Note we may release the rq lock, and since - * the lock was owned by prev, we need to release it - * first via finish_lock_switch and then reaquire it here. + * This is only called if needs_post_schedule_rt() indicates that + * we need to push tasks away */ - if (unlikely(rq->rt.overloaded)) { - spin_lock_irq(&rq->lock); - push_rt_tasks(rq); - spin_unlock_irq(&rq->lock); - } + spin_lock_irq(&rq->lock); + push_rt_tasks(rq); + spin_unlock_irq(&rq->lock); } /* @@ -1557,6 +1560,7 @@ static const struct sched_class rt_sched_class = { .rq_online = rq_online_rt, .rq_offline = rq_offline_rt, .pre_schedule = pre_schedule_rt, + .needs_post_schedule = needs_post_schedule_rt, .post_schedule = post_schedule_rt, .task_wake_up = task_wake_up_rt, .switched_from = switched_from_rt, From 4075134e40804821f90866d7de56802e4dcecb1e Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Mon, 29 Dec 2008 09:39:53 -0500 Subject: [PATCH 09/11] plist: fix PLIST_NODE_INIT to work with debug enabled It seems that PLIST_NODE_INIT breaks if used and DEBUG_PI_LIST is defined. Since there are no current users of PLIST_NODE_INIT, this has gone undetected. This patch fixes the build issue that enables the DEBUG_PI_LIST later in the series when we use it in init_task.h Signed-off-by: Gregory Haskins --- include/linux/plist.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/include/linux/plist.h b/include/linux/plist.h index 85de2f055874..45926d77d6ac 100644 --- a/include/linux/plist.h +++ b/include/linux/plist.h @@ -96,6 +96,10 @@ struct plist_node { # define PLIST_HEAD_LOCK_INIT(_lock) #endif +#define _PLIST_HEAD_INIT(head) \ + .prio_list = LIST_HEAD_INIT((head).prio_list), \ + .node_list = LIST_HEAD_INIT((head).node_list) + /** * PLIST_HEAD_INIT - static struct plist_head initializer * @head: struct plist_head variable name @@ -103,8 +107,7 @@ struct plist_node { */ #define PLIST_HEAD_INIT(head, _lock) \ { \ - .prio_list = LIST_HEAD_INIT((head).prio_list), \ - .node_list = LIST_HEAD_INIT((head).node_list), \ + _PLIST_HEAD_INIT(head), \ PLIST_HEAD_LOCK_INIT(&(_lock)) \ } @@ -116,7 +119,7 @@ struct plist_node { #define PLIST_NODE_INIT(node, __prio) \ { \ .prio = (__prio), \ - .plist = PLIST_HEAD_INIT((node).plist, NULL), \ + .plist = { _PLIST_HEAD_INIT((node).plist) }, \ } /** From 917b627d4d981dc614519d7b34ea31a976b14e12 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Mon, 29 Dec 2008 09:39:53 -0500 Subject: [PATCH 10/11] sched: create "pushable_tasks" list to limit pushing to one attempt The RT scheduler employs a "push/pull" design to actively balance tasks within the system (on a per disjoint cpuset basis). When a task is awoken, it is immediately determined if there are any lower priority cpus which should be preempted. This is opposed to the way normal SCHED_OTHER tasks behave, which will wait for a periodic rebalancing operation to occur before spreading out load. When a particular RQ has more than 1 active RT task, it is said to be in an "overloaded" state. Once this occurs, the system enters the active balancing mode, where it will try to push the task away, or persuade a different cpu to pull it over. The system will stay in this state until the system falls back below the <= 1 queued RT task per RQ. However, the current implementation suffers from a limitation in the push logic. Once overloaded, all tasks (other than current) on the RQ are analyzed on every push operation, even if it was previously unpushable (due to affinity, etc). Whats more, the operation stops at the first task that is unpushable and will not look at items lower in the queue. This causes two problems: 1) We can have the same tasks analyzed over and over again during each push, which extends out the fast path in the scheduler for no gain. Consider a RQ that has dozens of tasks that are bound to a core. Each one of those tasks will be encountered and skipped for each push operation while they are queued. 2) There may be lower-priority tasks under the unpushable task that could have been successfully pushed, but will never be considered until either the unpushable task is cleared, or a pull operation succeeds. The net result is a potential latency source for mid priority tasks. This patch aims to rectify these two conditions by introducing a new priority sorted list: "pushable_tasks". A task is added to the list each time a task is activated or preempted. It is removed from the list any time it is deactivated, made current, or fails to push. This works because a task only needs to be attempted to push once. After an initial failure to push, the other cpus will eventually try to pull the task when the conditions are proper. This also solves the problem that we don't completely analyze all tasks due to encountering an unpushable tasks. Now every task will have a push attempted (when appropriate). This reduces latency both by shorting the critical section of the rq->lock for certain workloads, and by making sure the algorithm considers all eligible tasks in the system. [ rostedt: added a couple more BUG_ONs ] Signed-off-by: Gregory Haskins Acked-by: Steven Rostedt --- include/linux/init_task.h | 1 + include/linux/sched.h | 1 + kernel/sched.c | 4 ++ kernel/sched_rt.c | 119 ++++++++++++++++++++++++++++++++------ 4 files changed, 107 insertions(+), 18 deletions(-) diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 23fd8909b9e5..6851225f44a7 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -140,6 +140,7 @@ extern struct group_info init_groups; .nr_cpus_allowed = NR_CPUS, \ }, \ .tasks = LIST_HEAD_INIT(tsk.tasks), \ + .pushable_tasks = PLIST_NODE_INIT(tsk.pushable_tasks, MAX_PRIO), \ .ptraced = LIST_HEAD_INIT(tsk.ptraced), \ .ptrace_entry = LIST_HEAD_INIT(tsk.ptrace_entry), \ .real_parent = &tsk, \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 836a86c32a65..440cabb2d432 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1179,6 +1179,7 @@ struct task_struct { #endif struct list_head tasks; + struct plist_node pushable_tasks; struct mm_struct *mm, *active_mm; diff --git a/kernel/sched.c b/kernel/sched.c index 3acbad8991a2..24ab80c28765 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -471,6 +471,7 @@ struct rt_rq { #ifdef CONFIG_SMP unsigned long rt_nr_migratory; int overloaded; + struct plist_head pushable_tasks; #endif int rt_throttled; u64 rt_time; @@ -2481,6 +2482,8 @@ void sched_fork(struct task_struct *p, int clone_flags) /* Want to start with kernel preemption disabled. */ task_thread_info(p)->preempt_count = 1; #endif + plist_node_init(&p->pushable_tasks, MAX_PRIO); + put_cpu(); } @@ -8237,6 +8240,7 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) #ifdef CONFIG_SMP rt_rq->rt_nr_migratory = 0; rt_rq->overloaded = 0; + plist_head_init(&rq->rt.pushable_tasks, &rq->lock); #endif rt_rq->rt_time = 0; diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index b0b6ea4ed674..fe9da6084c87 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -49,6 +49,24 @@ static void update_rt_migration(struct rq *rq) rq->rt.overloaded = 0; } } + +static void enqueue_pushable_task(struct rq *rq, struct task_struct *p) +{ + plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); + plist_node_init(&p->pushable_tasks, p->prio); + plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks); +} + +static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) +{ + plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); +} + +#else + +#define enqueue_pushable_task(rq, p) do { } while (0) +#define dequeue_pushable_task(rq, p) do { } while (0) + #endif /* CONFIG_SMP */ static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) @@ -751,6 +769,9 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) enqueue_rt_entity(rt_se); + if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) + enqueue_pushable_task(rq, p); + inc_cpu_load(rq, p->se.load.weight); } @@ -761,6 +782,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) update_curr_rt(rq); dequeue_rt_entity(rt_se); + dequeue_pushable_task(rq, p); + dec_cpu_load(rq, p->se.load.weight); } @@ -911,7 +934,7 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, return next; } -static struct task_struct *pick_next_task_rt(struct rq *rq) +static struct task_struct *_pick_next_task_rt(struct rq *rq) { struct sched_rt_entity *rt_se; struct task_struct *p; @@ -933,6 +956,18 @@ static struct task_struct *pick_next_task_rt(struct rq *rq) p = rt_task_of(rt_se); p->se.exec_start = rq->clock; + + return p; +} + +static struct task_struct *pick_next_task_rt(struct rq *rq) +{ + struct task_struct *p = _pick_next_task_rt(rq); + + /* The running task is never eligible for pushing */ + if (p) + dequeue_pushable_task(rq, p); + return p; } @@ -940,6 +975,13 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) { update_curr_rt(rq); p->se.exec_start = 0; + + /* + * The previous task needs to be made eligible for pushing + * if it is still active + */ + if (p->se.on_rq && p->rt.nr_cpus_allowed > 1) + enqueue_pushable_task(rq, p); } #ifdef CONFIG_SMP @@ -1116,6 +1158,31 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) return lowest_rq; } +static inline int has_pushable_tasks(struct rq *rq) +{ + return !plist_head_empty(&rq->rt.pushable_tasks); +} + +static struct task_struct *pick_next_pushable_task(struct rq *rq) +{ + struct task_struct *p; + + if (!has_pushable_tasks(rq)) + return NULL; + + p = plist_first_entry(&rq->rt.pushable_tasks, + struct task_struct, pushable_tasks); + + BUG_ON(rq->cpu != task_cpu(p)); + BUG_ON(task_current(rq, p)); + BUG_ON(p->rt.nr_cpus_allowed <= 1); + + BUG_ON(!p->se.on_rq); + BUG_ON(!rt_task(p)); + + return p; +} + /* * If the current CPU has more than one RT task, see if the non * running task can migrate over to a CPU that is running a task @@ -1125,13 +1192,12 @@ static int push_rt_task(struct rq *rq) { struct task_struct *next_task; struct rq *lowest_rq; - int ret = 0; int paranoid = RT_MAX_TRIES; if (!rq->rt.overloaded) return 0; - next_task = pick_next_highest_task_rt(rq, -1); + next_task = pick_next_pushable_task(rq); if (!next_task) return 0; @@ -1163,12 +1229,19 @@ static int push_rt_task(struct rq *rq) * so it is possible that next_task has changed. * If it has, then try again. */ - task = pick_next_highest_task_rt(rq, -1); + task = pick_next_pushable_task(rq); if (unlikely(task != next_task) && task && paranoid--) { put_task_struct(next_task); next_task = task; goto retry; } + + /* + * Once we have failed to push this task, we will not + * try again, since the other cpus will pull from us + * when they are ready + */ + dequeue_pushable_task(rq, next_task); goto out; } @@ -1180,23 +1253,12 @@ static int push_rt_task(struct rq *rq) double_unlock_balance(rq, lowest_rq); - ret = 1; out: put_task_struct(next_task); - return ret; + return 1; } -/* - * TODO: Currently we just use the second highest prio task on - * the queue, and stop when it can't migrate (or there's - * no more RT tasks). There may be a case where a lower - * priority RT task has a different affinity than the - * higher RT task. In this case the lower RT task could - * possibly be able to migrate where as the higher priority - * RT task could not. We currently ignore this issue. - * Enhancements are welcome! - */ static void push_rt_tasks(struct rq *rq) { /* push_rt_task will return true if it moved an RT */ @@ -1295,7 +1357,7 @@ static void pre_schedule_rt(struct rq *rq, struct task_struct *prev) */ static int needs_post_schedule_rt(struct rq *rq) { - return rq->rt.overloaded ? 1 : 0; + return has_pushable_tasks(rq); } static void post_schedule_rt(struct rq *rq) @@ -1317,7 +1379,7 @@ static void task_wake_up_rt(struct rq *rq, struct task_struct *p) { if (!task_running(rq, p) && !test_tsk_need_resched(rq->curr) && - rq->rt.overloaded && + has_pushable_tasks(rq) && p->rt.nr_cpus_allowed > 1) push_rt_tasks(rq); } @@ -1354,6 +1416,24 @@ static void set_cpus_allowed_rt(struct task_struct *p, if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) { struct rq *rq = task_rq(p); + if (!task_current(rq, p)) { + /* + * Make sure we dequeue this task from the pushable list + * before going further. It will either remain off of + * the list because we are no longer pushable, or it + * will be requeued. + */ + if (p->rt.nr_cpus_allowed > 1) + dequeue_pushable_task(rq, p); + + /* + * Requeue if our weight is changing and still > 1 + */ + if (weight > 1) + enqueue_pushable_task(rq, p); + + } + if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) { rq->rt.rt_nr_migratory++; } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) { @@ -1538,6 +1618,9 @@ static void set_curr_task_rt(struct rq *rq) struct task_struct *p = rq->curr; p->se.exec_start = rq->clock; + + /* The running task is never eligible for pushing */ + dequeue_pushable_task(rq, p); } static const struct sched_class rt_sched_class = { From 1563513d34ed4b12ef32bc2adde4a53ce05701a1 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Mon, 29 Dec 2008 09:39:53 -0500 Subject: [PATCH 11/11] RT: fix push_rt_task() to handle dequeue_pushable properly A panic was discovered by Chirag Jog where a BUG_ON sanity check in the new "pushable_task" logic would trigger a panic under certain circumstances: http://lkml.org/lkml/2008/9/25/189 Gilles Carry discovered that the root cause was attributed to the pushable_tasks list getting corrupted in the push_rt_task logic. This was the result of a dropped rq lock in double_lock_balance allowing a task in the process of being pushed to potentially migrate away, and thus corrupt the pushable_tasks() list. I traced back the problem as introduced by the pushable_tasks patch that went in recently. There is a "retry" path in push_rt_task() that actually had a compound conditional to decide whether to retry or exit. I missed the meaning behind the rationale for the virtual "if(!task) goto out;" portion of the compound statement and thus did not handle it properly. The new pushable_tasks logic actually creates three distinct conditions: 1) an untouched and unpushable task should be dequeued 2) a migrated task where more pushable tasks remain should be retried 3) a migrated task where no more pushable tasks exist should exit The original logic mushed (1) and (3) together, resulting in the system dequeuing a migrated task (against an unlocked foreign run-queue nonetheless). To fix this, we get rid of the notion of "paranoid" and we support the three unique conditions properly. The paranoid feature is no longer relevant with the new pushable logic (since pushable naturally limits the loop) anyway, so lets just remove it. Reported-By: Chirag Jog Found-by: Gilles Carry Signed-off-by: Gregory Haskins --- kernel/sched_rt.c | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index fe9da6084c87..64a8f0aa117b 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1192,7 +1192,6 @@ static int push_rt_task(struct rq *rq) { struct task_struct *next_task; struct rq *lowest_rq; - int paranoid = RT_MAX_TRIES; if (!rq->rt.overloaded) return 0; @@ -1226,23 +1225,34 @@ static int push_rt_task(struct rq *rq) struct task_struct *task; /* * find lock_lowest_rq releases rq->lock - * so it is possible that next_task has changed. - * If it has, then try again. + * so it is possible that next_task has migrated. + * + * We need to make sure that the task is still on the same + * run-queue and is also still the next task eligible for + * pushing. */ task = pick_next_pushable_task(rq); - if (unlikely(task != next_task) && task && paranoid--) { - put_task_struct(next_task); - next_task = task; - goto retry; + if (task_cpu(next_task) == rq->cpu && task == next_task) { + /* + * If we get here, the task hasnt moved at all, but + * it has failed to push. We will not try again, + * since the other cpus will pull from us when they + * are ready. + */ + dequeue_pushable_task(rq, next_task); + goto out; } + if (!task) + /* No more tasks, just exit */ + goto out; + /* - * Once we have failed to push this task, we will not - * try again, since the other cpus will pull from us - * when they are ready + * Something has shifted, try again. */ - dequeue_pushable_task(rq, next_task); - goto out; + put_task_struct(next_task); + next_task = task; + goto retry; } deactivate_task(rq, next_task, 0);