sched/rt: Fix PI handling vs. sched_setscheduler()
Andrea Parri reported:
> I found that the following scenario (with CONFIG_RT_GROUP_SCHED=y) is not
> handled correctly:
>
> T1 (prio = 20)
> lock(rtmutex);
>
> T2 (prio = 20)
> blocks on rtmutex (rt_nr_boosted = 0 on T1's rq)
>
> T1 (prio = 20)
> sys_set_scheduler(prio = 0)
> [new_effective_prio == oldprio]
> T1 prio = 20 (rt_nr_boosted = 0 on T1's rq)
>
> The last step is incorrect as T1 is now boosted (c.f., rt_se_boosted());
> in particular, if we continue with
>
> T1 (prio = 20)
> unlock(rtmutex)
> wakeup(T2)
> adjust_prio(T1)
> [prio != rt_mutex_getprio(T1)]
> dequeue(T1)
> rt_nr_boosted = (unsigned long)(-1)
> ...
> T1 prio = 0
>
> then we end up leaving rt_nr_boosted in an "inconsistent" state.
>
> The simple program attached could reproduce the previous scenario; note
> that, as a consequence of the presence of this state, the "assertion"
>
> WARN_ON(!rt_nr_running && rt_nr_boosted)
>
> from dec_rt_group() may trigger.
So normally we dequeue/enqueue tasks in sched_setscheduler(), which
would ensure the accounting stays correct. However in the early PI path
we fail to do so.
So this was introduced at around v3.14, by:
c365c292d0
("sched: Consider pi boosting in setscheduler()")
which fixed another problem exactly because that dequeue/enqueue, joy.
Fix this by teaching rt about DEQUEUE_SAVE/ENQUEUE_RESTORE and have it
preserve runqueue location with that option. This requires decoupling
the on_rt_rq() state from being on the list.
In order to allow for explicit movement during the SAVE/RESTORE,
introduce {DE,EN}QUEUE_MOVE. We still must use SAVE/RESTORE in these
cases to preserve other invariants.
Respecting the SAVE/RESTORE flags also has the (nice) side-effect that
things like sys_nice()/sys_sched_setaffinity() also do not reorder
FIFO tasks (whereas they used to before this patch).
Reported-by: Andrea Parri <parri.andrea@gmail.com>
Tested-by: Andrea Parri <parri.andrea@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Juri Lelli <juri.lelli@arm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
This commit is contained in:
parent
41d9339733
commit
ff77e46853
@ -1293,6 +1293,8 @@ struct sched_rt_entity {
|
||||
unsigned long timeout;
|
||||
unsigned long watchdog_stamp;
|
||||
unsigned int time_slice;
|
||||
unsigned short on_rq;
|
||||
unsigned short on_list;
|
||||
|
||||
struct sched_rt_entity *back;
|
||||
#ifdef CONFIG_RT_GROUP_SCHED
|
||||
|
@ -2221,6 +2221,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
|
||||
__dl_clear_params(p);
|
||||
|
||||
INIT_LIST_HEAD(&p->rt.run_list);
|
||||
p->rt.timeout = 0;
|
||||
p->rt.time_slice = sched_rr_timeslice;
|
||||
p->rt.on_rq = 0;
|
||||
p->rt.on_list = 0;
|
||||
|
||||
#ifdef CONFIG_PREEMPT_NOTIFIERS
|
||||
INIT_HLIST_HEAD(&p->preempt_notifiers);
|
||||
@ -3531,7 +3535,7 @@ EXPORT_SYMBOL(default_wake_function);
|
||||
*/
|
||||
void rt_mutex_setprio(struct task_struct *p, int prio)
|
||||
{
|
||||
int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE;
|
||||
int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
|
||||
struct rq *rq;
|
||||
const struct sched_class *prev_class;
|
||||
|
||||
@ -3559,11 +3563,15 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
|
||||
|
||||
trace_sched_pi_setprio(p, prio);
|
||||
oldprio = p->prio;
|
||||
|
||||
if (oldprio == prio)
|
||||
queue_flag &= ~DEQUEUE_MOVE;
|
||||
|
||||
prev_class = p->sched_class;
|
||||
queued = task_on_rq_queued(p);
|
||||
running = task_current(rq, p);
|
||||
if (queued)
|
||||
dequeue_task(rq, p, DEQUEUE_SAVE);
|
||||
dequeue_task(rq, p, queue_flag);
|
||||
if (running)
|
||||
put_prev_task(rq, p);
|
||||
|
||||
@ -3581,7 +3589,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
|
||||
if (!dl_prio(p->normal_prio) ||
|
||||
(pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
|
||||
p->dl.dl_boosted = 1;
|
||||
enqueue_flag |= ENQUEUE_REPLENISH;
|
||||
queue_flag |= ENQUEUE_REPLENISH;
|
||||
} else
|
||||
p->dl.dl_boosted = 0;
|
||||
p->sched_class = &dl_sched_class;
|
||||
@ -3589,7 +3597,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
|
||||
if (dl_prio(oldprio))
|
||||
p->dl.dl_boosted = 0;
|
||||
if (oldprio < prio)
|
||||
enqueue_flag |= ENQUEUE_HEAD;
|
||||
queue_flag |= ENQUEUE_HEAD;
|
||||
p->sched_class = &rt_sched_class;
|
||||
} else {
|
||||
if (dl_prio(oldprio))
|
||||
@ -3604,7 +3612,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
|
||||
if (running)
|
||||
p->sched_class->set_curr_task(rq);
|
||||
if (queued)
|
||||
enqueue_task(rq, p, enqueue_flag);
|
||||
enqueue_task(rq, p, queue_flag);
|
||||
|
||||
check_class_changed(rq, p, prev_class, oldprio);
|
||||
out_unlock:
|
||||
@ -3960,6 +3968,7 @@ static int __sched_setscheduler(struct task_struct *p,
|
||||
const struct sched_class *prev_class;
|
||||
struct rq *rq;
|
||||
int reset_on_fork;
|
||||
int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
|
||||
|
||||
/* may grab non-irq protected spin_locks */
|
||||
BUG_ON(in_interrupt());
|
||||
@ -4142,17 +4151,14 @@ change:
|
||||
* itself.
|
||||
*/
|
||||
new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
|
||||
if (new_effective_prio == oldprio) {
|
||||
__setscheduler_params(p, attr);
|
||||
task_rq_unlock(rq, p, &flags);
|
||||
return 0;
|
||||
}
|
||||
if (new_effective_prio == oldprio)
|
||||
queue_flags &= ~DEQUEUE_MOVE;
|
||||
}
|
||||
|
||||
queued = task_on_rq_queued(p);
|
||||
running = task_current(rq, p);
|
||||
if (queued)
|
||||
dequeue_task(rq, p, DEQUEUE_SAVE);
|
||||
dequeue_task(rq, p, queue_flags);
|
||||
if (running)
|
||||
put_prev_task(rq, p);
|
||||
|
||||
@ -4162,15 +4168,14 @@ change:
|
||||
if (running)
|
||||
p->sched_class->set_curr_task(rq);
|
||||
if (queued) {
|
||||
int enqueue_flags = ENQUEUE_RESTORE;
|
||||
/*
|
||||
* We enqueue to tail when the priority of a task is
|
||||
* increased (user space view).
|
||||
*/
|
||||
if (oldprio <= p->prio)
|
||||
enqueue_flags |= ENQUEUE_HEAD;
|
||||
if (oldprio < p->prio)
|
||||
queue_flags |= ENQUEUE_HEAD;
|
||||
|
||||
enqueue_task(rq, p, enqueue_flags);
|
||||
enqueue_task(rq, p, queue_flags);
|
||||
}
|
||||
|
||||
check_class_changed(rq, p, prev_class, oldprio);
|
||||
@ -7958,7 +7963,7 @@ void sched_move_task(struct task_struct *tsk)
|
||||
queued = task_on_rq_queued(tsk);
|
||||
|
||||
if (queued)
|
||||
dequeue_task(rq, tsk, DEQUEUE_SAVE);
|
||||
dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
|
||||
if (unlikely(running))
|
||||
put_prev_task(rq, tsk);
|
||||
|
||||
@ -7982,7 +7987,7 @@ void sched_move_task(struct task_struct *tsk)
|
||||
if (unlikely(running))
|
||||
tsk->sched_class->set_curr_task(rq);
|
||||
if (queued)
|
||||
enqueue_task(rq, tsk, ENQUEUE_RESTORE);
|
||||
enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE);
|
||||
|
||||
task_rq_unlock(rq, tsk, &flags);
|
||||
}
|
||||
|
@ -436,7 +436,7 @@ static void dequeue_top_rt_rq(struct rt_rq *rt_rq);
|
||||
|
||||
static inline int on_rt_rq(struct sched_rt_entity *rt_se)
|
||||
{
|
||||
return !list_empty(&rt_se->run_list);
|
||||
return rt_se->on_rq;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_RT_GROUP_SCHED
|
||||
@ -482,8 +482,8 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
|
||||
return rt_se->my_q;
|
||||
}
|
||||
|
||||
static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head);
|
||||
static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
|
||||
static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
|
||||
static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
|
||||
|
||||
static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
|
||||
{
|
||||
@ -499,7 +499,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
|
||||
if (!rt_se)
|
||||
enqueue_top_rt_rq(rt_rq);
|
||||
else if (!on_rt_rq(rt_se))
|
||||
enqueue_rt_entity(rt_se, false);
|
||||
enqueue_rt_entity(rt_se, 0);
|
||||
|
||||
if (rt_rq->highest_prio.curr < curr->prio)
|
||||
resched_curr(rq);
|
||||
@ -516,7 +516,7 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
|
||||
if (!rt_se)
|
||||
dequeue_top_rt_rq(rt_rq);
|
||||
else if (on_rt_rq(rt_se))
|
||||
dequeue_rt_entity(rt_se);
|
||||
dequeue_rt_entity(rt_se, 0);
|
||||
}
|
||||
|
||||
static inline int rt_rq_throttled(struct rt_rq *rt_rq)
|
||||
@ -1166,7 +1166,30 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
|
||||
dec_rt_group(rt_se, rt_rq);
|
||||
}
|
||||
|
||||
static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
|
||||
/*
|
||||
* Change rt_se->run_list location unless SAVE && !MOVE
|
||||
*
|
||||
* assumes ENQUEUE/DEQUEUE flags match
|
||||
*/
|
||||
static inline bool move_entity(unsigned int flags)
|
||||
{
|
||||
if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array)
|
||||
{
|
||||
list_del_init(&rt_se->run_list);
|
||||
|
||||
if (list_empty(array->queue + rt_se_prio(rt_se)))
|
||||
__clear_bit(rt_se_prio(rt_se), array->bitmap);
|
||||
|
||||
rt_se->on_list = 0;
|
||||
}
|
||||
|
||||
static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
|
||||
{
|
||||
struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
|
||||
struct rt_prio_array *array = &rt_rq->active;
|
||||
@ -1179,26 +1202,37 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
|
||||
* get throttled and the current group doesn't have any other
|
||||
* active members.
|
||||
*/
|
||||
if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
|
||||
if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) {
|
||||
if (rt_se->on_list)
|
||||
__delist_rt_entity(rt_se, array);
|
||||
return;
|
||||
}
|
||||
|
||||
if (head)
|
||||
list_add(&rt_se->run_list, queue);
|
||||
else
|
||||
list_add_tail(&rt_se->run_list, queue);
|
||||
__set_bit(rt_se_prio(rt_se), array->bitmap);
|
||||
if (move_entity(flags)) {
|
||||
WARN_ON_ONCE(rt_se->on_list);
|
||||
if (flags & ENQUEUE_HEAD)
|
||||
list_add(&rt_se->run_list, queue);
|
||||
else
|
||||
list_add_tail(&rt_se->run_list, queue);
|
||||
|
||||
__set_bit(rt_se_prio(rt_se), array->bitmap);
|
||||
rt_se->on_list = 1;
|
||||
}
|
||||
rt_se->on_rq = 1;
|
||||
|
||||
inc_rt_tasks(rt_se, rt_rq);
|
||||
}
|
||||
|
||||
static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
|
||||
static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
|
||||
{
|
||||
struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
|
||||
struct rt_prio_array *array = &rt_rq->active;
|
||||
|
||||
list_del_init(&rt_se->run_list);
|
||||
if (list_empty(array->queue + rt_se_prio(rt_se)))
|
||||
__clear_bit(rt_se_prio(rt_se), array->bitmap);
|
||||
if (move_entity(flags)) {
|
||||
WARN_ON_ONCE(!rt_se->on_list);
|
||||
__delist_rt_entity(rt_se, array);
|
||||
}
|
||||
rt_se->on_rq = 0;
|
||||
|
||||
dec_rt_tasks(rt_se, rt_rq);
|
||||
}
|
||||
@ -1207,7 +1241,7 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
|
||||
* Because the prio of an upper entry depends on the lower
|
||||
* entries, we must remove entries top - down.
|
||||
*/
|
||||
static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
|
||||
static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)
|
||||
{
|
||||
struct sched_rt_entity *back = NULL;
|
||||
|
||||
@ -1220,31 +1254,31 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
|
||||
|
||||
for (rt_se = back; rt_se; rt_se = rt_se->back) {
|
||||
if (on_rt_rq(rt_se))
|
||||
__dequeue_rt_entity(rt_se);
|
||||
__dequeue_rt_entity(rt_se, flags);
|
||||
}
|
||||
}
|
||||
|
||||
static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
|
||||
static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
|
||||
{
|
||||
struct rq *rq = rq_of_rt_se(rt_se);
|
||||
|
||||
dequeue_rt_stack(rt_se);
|
||||
dequeue_rt_stack(rt_se, flags);
|
||||
for_each_sched_rt_entity(rt_se)
|
||||
__enqueue_rt_entity(rt_se, head);
|
||||
__enqueue_rt_entity(rt_se, flags);
|
||||
enqueue_top_rt_rq(&rq->rt);
|
||||
}
|
||||
|
||||
static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
|
||||
static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
|
||||
{
|
||||
struct rq *rq = rq_of_rt_se(rt_se);
|
||||
|
||||
dequeue_rt_stack(rt_se);
|
||||
dequeue_rt_stack(rt_se, flags);
|
||||
|
||||
for_each_sched_rt_entity(rt_se) {
|
||||
struct rt_rq *rt_rq = group_rt_rq(rt_se);
|
||||
|
||||
if (rt_rq && rt_rq->rt_nr_running)
|
||||
__enqueue_rt_entity(rt_se, false);
|
||||
__enqueue_rt_entity(rt_se, flags);
|
||||
}
|
||||
enqueue_top_rt_rq(&rq->rt);
|
||||
}
|
||||
@ -1260,7 +1294,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
|
||||
if (flags & ENQUEUE_WAKEUP)
|
||||
rt_se->timeout = 0;
|
||||
|
||||
enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
|
||||
enqueue_rt_entity(rt_se, flags);
|
||||
|
||||
if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
|
||||
enqueue_pushable_task(rq, p);
|
||||
@ -1271,7 +1305,7 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
|
||||
struct sched_rt_entity *rt_se = &p->rt;
|
||||
|
||||
update_curr_rt(rq);
|
||||
dequeue_rt_entity(rt_se);
|
||||
dequeue_rt_entity(rt_se, flags);
|
||||
|
||||
dequeue_pushable_task(rq, p);
|
||||
}
|
||||
|
@ -1130,18 +1130,40 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
|
||||
extern const int sched_prio_to_weight[40];
|
||||
extern const u32 sched_prio_to_wmult[40];
|
||||
|
||||
/*
|
||||
* {de,en}queue flags:
|
||||
*
|
||||
* DEQUEUE_SLEEP - task is no longer runnable
|
||||
* ENQUEUE_WAKEUP - task just became runnable
|
||||
*
|
||||
* SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks
|
||||
* are in a known state which allows modification. Such pairs
|
||||
* should preserve as much state as possible.
|
||||
*
|
||||
* MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location
|
||||
* in the runqueue.
|
||||
*
|
||||
* ENQUEUE_HEAD - place at front of runqueue (tail if not specified)
|
||||
* ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline)
|
||||
* ENQUEUE_WAKING - sched_class::task_waking was called
|
||||
*
|
||||
*/
|
||||
|
||||
#define DEQUEUE_SLEEP 0x01
|
||||
#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */
|
||||
#define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */
|
||||
|
||||
#define ENQUEUE_WAKEUP 0x01
|
||||
#define ENQUEUE_HEAD 0x02
|
||||
#define ENQUEUE_RESTORE 0x02
|
||||
#define ENQUEUE_MOVE 0x04
|
||||
|
||||
#define ENQUEUE_HEAD 0x08
|
||||
#define ENQUEUE_REPLENISH 0x10
|
||||
#ifdef CONFIG_SMP
|
||||
#define ENQUEUE_WAKING 0x04 /* sched_class::task_waking was called */
|
||||
#define ENQUEUE_WAKING 0x20
|
||||
#else
|
||||
#define ENQUEUE_WAKING 0x00
|
||||
#endif
|
||||
#define ENQUEUE_REPLENISH 0x08
|
||||
#define ENQUEUE_RESTORE 0x10
|
||||
|
||||
#define DEQUEUE_SLEEP 0x01
|
||||
#define DEQUEUE_SAVE 0x02
|
||||
|
||||
#define RETRY_TASK ((void *)-1UL)
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user