Merge branch 'for-5.2-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup fixes from Tejun Heo: "This has an unusually high density of tricky fixes: - task_get_css() could deadlock when it races against a dying cgroup. - cgroup.procs didn't list dying thread group leaders that still have live threads. This could mislead readers to think that a cgroup is empty when it's not. Fixed by making PROCS iterator include dead tasks. I made a couple of mistakes making this change and this pull request contains a couple of follow-up patches. - When a cpuset runs out of online cpus, it updates the cpumasks of its member tasks in bizarre ways. Joel improved the behavior significantly" * 'for-5.2-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: cpuset: restore sanity to cpuset_cpus_allowed_fallback() cgroup: Fix css_task_iter_advance_css_set() cset skip condition cgroup: css_task_iter_skip()'d iterators must be advanced before accessed cgroup: Include dying leaders with live threads in PROCS iterations cgroup: Implement css_task_iter_skip() cgroup: Call cgroup_release() before __exit_signal() docs cgroups: add another example size for hugetlb cgroup: Use css_tryget() instead of css_tryget_online() in task_get_css()
This commit is contained in:
commit
0011572c88
@ -32,14 +32,18 @@ Brief summary of control files
|
||||
hugetlb.<hugepagesize>.usage_in_bytes # show current usage for "hugepagesize" hugetlb
|
||||
hugetlb.<hugepagesize>.failcnt # show the number of allocation failures due to the HugeTLB limit
|
||||
|
||||
For a system supporting two hugepage size (16M and 16G) the control
|
||||
For a system supporting three hugepage sizes (64k, 32M and 1G), the control
|
||||
files include:
|
||||
|
||||
hugetlb.16GB.limit_in_bytes
|
||||
hugetlb.16GB.max_usage_in_bytes
|
||||
hugetlb.16GB.usage_in_bytes
|
||||
hugetlb.16GB.failcnt
|
||||
hugetlb.16MB.limit_in_bytes
|
||||
hugetlb.16MB.max_usage_in_bytes
|
||||
hugetlb.16MB.usage_in_bytes
|
||||
hugetlb.16MB.failcnt
|
||||
hugetlb.1GB.limit_in_bytes
|
||||
hugetlb.1GB.max_usage_in_bytes
|
||||
hugetlb.1GB.usage_in_bytes
|
||||
hugetlb.1GB.failcnt
|
||||
hugetlb.64KB.limit_in_bytes
|
||||
hugetlb.64KB.max_usage_in_bytes
|
||||
hugetlb.64KB.usage_in_bytes
|
||||
hugetlb.64KB.failcnt
|
||||
hugetlb.32MB.limit_in_bytes
|
||||
hugetlb.32MB.max_usage_in_bytes
|
||||
hugetlb.32MB.usage_in_bytes
|
||||
hugetlb.32MB.failcnt
|
||||
|
@ -221,6 +221,7 @@ struct css_set {
|
||||
*/
|
||||
struct list_head tasks;
|
||||
struct list_head mg_tasks;
|
||||
struct list_head dying_tasks;
|
||||
|
||||
/* all css_task_iters currently walking this cset */
|
||||
struct list_head task_iters;
|
||||
|
@ -43,6 +43,9 @@
|
||||
/* walk all threaded css_sets in the domain */
|
||||
#define CSS_TASK_ITER_THREADED (1U << 1)
|
||||
|
||||
/* internal flags */
|
||||
#define CSS_TASK_ITER_SKIPPED (1U << 16)
|
||||
|
||||
/* a css_task_iter should be treated as an opaque object */
|
||||
struct css_task_iter {
|
||||
struct cgroup_subsys *ss;
|
||||
@ -57,6 +60,7 @@ struct css_task_iter {
|
||||
struct list_head *task_pos;
|
||||
struct list_head *tasks_head;
|
||||
struct list_head *mg_tasks_head;
|
||||
struct list_head *dying_tasks_head;
|
||||
|
||||
struct css_set *cur_cset;
|
||||
struct css_set *cur_dcset;
|
||||
@ -487,7 +491,7 @@ static inline struct cgroup_subsys_state *task_css(struct task_struct *task,
|
||||
*
|
||||
* Find the css for the (@task, @subsys_id) combination, increment a
|
||||
* reference on and return it. This function is guaranteed to return a
|
||||
* valid css.
|
||||
* valid css. The returned css may already have been offlined.
|
||||
*/
|
||||
static inline struct cgroup_subsys_state *
|
||||
task_get_css(struct task_struct *task, int subsys_id)
|
||||
@ -497,7 +501,13 @@ task_get_css(struct task_struct *task, int subsys_id)
|
||||
rcu_read_lock();
|
||||
while (true) {
|
||||
css = task_css(task, subsys_id);
|
||||
if (likely(css_tryget_online(css)))
|
||||
/*
|
||||
* Can't use css_tryget_online() here. A task which has
|
||||
* PF_EXITING set may stay associated with an offline css.
|
||||
* If such task calls this function, css_tryget_online()
|
||||
* will keep failing.
|
||||
*/
|
||||
if (likely(css_tryget(css)))
|
||||
break;
|
||||
cpu_relax();
|
||||
}
|
||||
|
@ -215,7 +215,8 @@ static struct cftype cgroup_base_files[];
|
||||
|
||||
static int cgroup_apply_control(struct cgroup *cgrp);
|
||||
static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
|
||||
static void css_task_iter_advance(struct css_task_iter *it);
|
||||
static void css_task_iter_skip(struct css_task_iter *it,
|
||||
struct task_struct *task);
|
||||
static int cgroup_destroy_locked(struct cgroup *cgrp);
|
||||
static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
|
||||
struct cgroup_subsys *ss);
|
||||
@ -738,6 +739,7 @@ struct css_set init_css_set = {
|
||||
.dom_cset = &init_css_set,
|
||||
.tasks = LIST_HEAD_INIT(init_css_set.tasks),
|
||||
.mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
|
||||
.dying_tasks = LIST_HEAD_INIT(init_css_set.dying_tasks),
|
||||
.task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
|
||||
.threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets),
|
||||
.cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
|
||||
@ -843,6 +845,21 @@ static void css_set_update_populated(struct css_set *cset, bool populated)
|
||||
cgroup_update_populated(link->cgrp, populated);
|
||||
}
|
||||
|
||||
/*
|
||||
* @task is leaving, advance task iterators which are pointing to it so
|
||||
* that they can resume at the next position. Advancing an iterator might
|
||||
* remove it from the list, use safe walk. See css_task_iter_skip() for
|
||||
* details.
|
||||
*/
|
||||
static void css_set_skip_task_iters(struct css_set *cset,
|
||||
struct task_struct *task)
|
||||
{
|
||||
struct css_task_iter *it, *pos;
|
||||
|
||||
list_for_each_entry_safe(it, pos, &cset->task_iters, iters_node)
|
||||
css_task_iter_skip(it, task);
|
||||
}
|
||||
|
||||
/**
|
||||
* css_set_move_task - move a task from one css_set to another
|
||||
* @task: task being moved
|
||||
@ -868,22 +885,9 @@ static void css_set_move_task(struct task_struct *task,
|
||||
css_set_update_populated(to_cset, true);
|
||||
|
||||
if (from_cset) {
|
||||
struct css_task_iter *it, *pos;
|
||||
|
||||
WARN_ON_ONCE(list_empty(&task->cg_list));
|
||||
|
||||
/*
|
||||
* @task is leaving, advance task iterators which are
|
||||
* pointing to it so that they can resume at the next
|
||||
* position. Advancing an iterator might remove it from
|
||||
* the list, use safe walk. See css_task_iter_advance*()
|
||||
* for details.
|
||||
*/
|
||||
list_for_each_entry_safe(it, pos, &from_cset->task_iters,
|
||||
iters_node)
|
||||
if (it->task_pos == &task->cg_list)
|
||||
css_task_iter_advance(it);
|
||||
|
||||
css_set_skip_task_iters(from_cset, task);
|
||||
list_del_init(&task->cg_list);
|
||||
if (!css_set_populated(from_cset))
|
||||
css_set_update_populated(from_cset, false);
|
||||
@ -1210,6 +1214,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,
|
||||
cset->dom_cset = cset;
|
||||
INIT_LIST_HEAD(&cset->tasks);
|
||||
INIT_LIST_HEAD(&cset->mg_tasks);
|
||||
INIT_LIST_HEAD(&cset->dying_tasks);
|
||||
INIT_LIST_HEAD(&cset->task_iters);
|
||||
INIT_LIST_HEAD(&cset->threaded_csets);
|
||||
INIT_HLIST_NODE(&cset->hlist);
|
||||
@ -4408,15 +4413,18 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it)
|
||||
it->task_pos = NULL;
|
||||
return;
|
||||
}
|
||||
} while (!css_set_populated(cset));
|
||||
} while (!css_set_populated(cset) && list_empty(&cset->dying_tasks));
|
||||
|
||||
if (!list_empty(&cset->tasks))
|
||||
it->task_pos = cset->tasks.next;
|
||||
else
|
||||
else if (!list_empty(&cset->mg_tasks))
|
||||
it->task_pos = cset->mg_tasks.next;
|
||||
else
|
||||
it->task_pos = cset->dying_tasks.next;
|
||||
|
||||
it->tasks_head = &cset->tasks;
|
||||
it->mg_tasks_head = &cset->mg_tasks;
|
||||
it->dying_tasks_head = &cset->dying_tasks;
|
||||
|
||||
/*
|
||||
* We don't keep css_sets locked across iteration steps and thus
|
||||
@ -4442,9 +4450,20 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it)
|
||||
list_add(&it->iters_node, &cset->task_iters);
|
||||
}
|
||||
|
||||
static void css_task_iter_skip(struct css_task_iter *it,
|
||||
struct task_struct *task)
|
||||
{
|
||||
lockdep_assert_held(&css_set_lock);
|
||||
|
||||
if (it->task_pos == &task->cg_list) {
|
||||
it->task_pos = it->task_pos->next;
|
||||
it->flags |= CSS_TASK_ITER_SKIPPED;
|
||||
}
|
||||
}
|
||||
|
||||
static void css_task_iter_advance(struct css_task_iter *it)
|
||||
{
|
||||
struct list_head *next;
|
||||
struct task_struct *task;
|
||||
|
||||
lockdep_assert_held(&css_set_lock);
|
||||
repeat:
|
||||
@ -4454,25 +4473,40 @@ repeat:
|
||||
* consumed first and then ->mg_tasks. After ->mg_tasks,
|
||||
* we move onto the next cset.
|
||||
*/
|
||||
next = it->task_pos->next;
|
||||
|
||||
if (next == it->tasks_head)
|
||||
next = it->mg_tasks_head->next;
|
||||
|
||||
if (next == it->mg_tasks_head)
|
||||
css_task_iter_advance_css_set(it);
|
||||
if (it->flags & CSS_TASK_ITER_SKIPPED)
|
||||
it->flags &= ~CSS_TASK_ITER_SKIPPED;
|
||||
else
|
||||
it->task_pos = next;
|
||||
it->task_pos = it->task_pos->next;
|
||||
|
||||
if (it->task_pos == it->tasks_head)
|
||||
it->task_pos = it->mg_tasks_head->next;
|
||||
if (it->task_pos == it->mg_tasks_head)
|
||||
it->task_pos = it->dying_tasks_head->next;
|
||||
if (it->task_pos == it->dying_tasks_head)
|
||||
css_task_iter_advance_css_set(it);
|
||||
} else {
|
||||
/* called from start, proceed to the first cset */
|
||||
css_task_iter_advance_css_set(it);
|
||||
}
|
||||
|
||||
/* if PROCS, skip over tasks which aren't group leaders */
|
||||
if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos &&
|
||||
!thread_group_leader(list_entry(it->task_pos, struct task_struct,
|
||||
cg_list)))
|
||||
goto repeat;
|
||||
if (!it->task_pos)
|
||||
return;
|
||||
|
||||
task = list_entry(it->task_pos, struct task_struct, cg_list);
|
||||
|
||||
if (it->flags & CSS_TASK_ITER_PROCS) {
|
||||
/* if PROCS, skip over tasks which aren't group leaders */
|
||||
if (!thread_group_leader(task))
|
||||
goto repeat;
|
||||
|
||||
/* and dying leaders w/o live member threads */
|
||||
if (!atomic_read(&task->signal->live))
|
||||
goto repeat;
|
||||
} else {
|
||||
/* skip all dying ones */
|
||||
if (task->flags & PF_EXITING)
|
||||
goto repeat;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
@ -4528,6 +4562,10 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
|
||||
|
||||
spin_lock_irq(&css_set_lock);
|
||||
|
||||
/* @it may be half-advanced by skips, finish advancing */
|
||||
if (it->flags & CSS_TASK_ITER_SKIPPED)
|
||||
css_task_iter_advance(it);
|
||||
|
||||
if (it->task_pos) {
|
||||
it->cur_task = list_entry(it->task_pos, struct task_struct,
|
||||
cg_list);
|
||||
@ -6009,6 +6047,7 @@ void cgroup_exit(struct task_struct *tsk)
|
||||
if (!list_empty(&tsk->cg_list)) {
|
||||
spin_lock_irq(&css_set_lock);
|
||||
css_set_move_task(tsk, cset, NULL, false);
|
||||
list_add_tail(&tsk->cg_list, &cset->dying_tasks);
|
||||
cset->nr_tasks--;
|
||||
|
||||
WARN_ON_ONCE(cgroup_task_frozen(tsk));
|
||||
@ -6034,6 +6073,13 @@ void cgroup_release(struct task_struct *task)
|
||||
do_each_subsys_mask(ss, ssid, have_release_callback) {
|
||||
ss->release(task);
|
||||
} while_each_subsys_mask();
|
||||
|
||||
if (use_task_css_set_links) {
|
||||
spin_lock_irq(&css_set_lock);
|
||||
css_set_skip_task_iters(task_css_set(task), task);
|
||||
list_del_init(&task->cg_list);
|
||||
spin_unlock_irq(&css_set_lock);
|
||||
}
|
||||
}
|
||||
|
||||
void cgroup_free(struct task_struct *task)
|
||||
|
@ -3254,10 +3254,23 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
|
||||
spin_unlock_irqrestore(&callback_lock, flags);
|
||||
}
|
||||
|
||||
/**
|
||||
* cpuset_cpus_allowed_fallback - final fallback before complete catastrophe.
|
||||
* @tsk: pointer to task_struct with which the scheduler is struggling
|
||||
*
|
||||
* Description: In the case that the scheduler cannot find an allowed cpu in
|
||||
* tsk->cpus_allowed, we fall back to task_cs(tsk)->cpus_allowed. In legacy
|
||||
* mode however, this value is the same as task_cs(tsk)->effective_cpus,
|
||||
* which will not contain a sane cpumask during cases such as cpu hotplugging.
|
||||
* This is the absolute last resort for the scheduler and it is only used if
|
||||
* _every_ other avenue has been traveled.
|
||||
**/
|
||||
|
||||
void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
|
||||
{
|
||||
rcu_read_lock();
|
||||
do_set_cpus_allowed(tsk, task_cs(tsk)->effective_cpus);
|
||||
do_set_cpus_allowed(tsk, is_in_v2_mode() ?
|
||||
task_cs(tsk)->cpus_allowed : cpu_possible_mask);
|
||||
rcu_read_unlock();
|
||||
|
||||
/*
|
||||
|
@ -195,6 +195,7 @@ repeat:
|
||||
rcu_read_unlock();
|
||||
|
||||
proc_flush_task(p);
|
||||
cgroup_release(p);
|
||||
|
||||
write_lock_irq(&tasklist_lock);
|
||||
ptrace_release_task(p);
|
||||
@ -220,7 +221,6 @@ repeat:
|
||||
}
|
||||
|
||||
write_unlock_irq(&tasklist_lock);
|
||||
cgroup_release(p);
|
||||
release_thread(p);
|
||||
call_rcu(&p->rcu, delayed_put_task_struct);
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user