From 295458e67284f57d154ec8156a22797c0cfb044a Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 19 Feb 2015 17:34:46 +0300 Subject: [PATCH 1/6] cgroup: call cgroup_subsys->bind on cgroup subsys initialization Currently, we call cgroup_subsys->bind only on unmount, remount, and when creating a new root on mount. Since the default hierarchy root is created in cgroup_init, we will not call cgroup_subsys->bind if the default hierarchy is freshly mounted. As a result, some controllers will behave incorrectly (most notably, the "memory" controller will not enable hierarchy support). Fix this by calling cgroup_subsys->bind right after initializing a cgroup subsystem. Signed-off-by: Vladimir Davydov Signed-off-by: Tejun Heo --- kernel/cgroup.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 29a7b2cc593e..21a4b6d61e21 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5040,6 +5040,9 @@ int __init cgroup_init(void) WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes)); WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes)); } + + if (ss->bind) + ss->bind(init_css_set.subsys[ssid]); } cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); From 587945147cfe48c84dd92c022e25aad6d92c0da8 Mon Sep 17 00:00:00 2001 From: Bandan Das Date: Mon, 2 Mar 2015 17:51:10 -0500 Subject: [PATCH 2/6] cgroup: Use kvfree in pidlist_free() The wrapper already calls the appropriate free function, use it instead of spinning our own. Signed-off-by: Bandan Das Acked-by: Zefan Li Signed-off-by: Tejun Heo --- kernel/cgroup.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 21a4b6d61e21..a220fdb66568 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3806,10 +3806,7 @@ static void *pidlist_allocate(int count) static void pidlist_free(void *p) { - if (is_vmalloc_addr(p)) - vfree(p); - else - kfree(p); + kvfree(p); } /* From 695df2132cfe3782784d4b0e1f0027168c1830ca Mon Sep 17 00:00:00 2001 From: Zefan Li Date: Wed, 4 Mar 2015 17:09:33 +0800 Subject: [PATCH 3/6] cpuset: initialize cpuset a bit early Now we call ss->bind() in cgroup_init(), so cgroup_init() will call cpuset_bind() and then the latter will access top_cpuset's cpumask, which is NULL, because cpuset_init() is called after cgroup_init() The simplest fix is to swap cgroup_init() and cpuset_init(). Cc: Vladimir Davydov Fixes: 295458e67284 ("cgroup: call cgroup_subsys->bind on cgroup subsys initialization") Reported by: Ming Lei Signed-off-by: Zefan Li Signed-off-by: Tejun Heo Acked-by: Vladimir Davydov --- init/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/init/main.c b/init/main.c index 6f0f1c5ff8cc..4a6974e67839 100644 --- a/init/main.c +++ b/init/main.c @@ -654,8 +654,8 @@ asmlinkage __visible void __init start_kernel(void) page_writeback_init(); proc_root_init(); nsfs_init(); - cgroup_init(); cpuset_init(); + cgroup_init(); taskstats_init_early(); delayacct_init(); From 3fa0818b3c85e9bb55e3ac96c9523b87e44eab9e Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 9 Mar 2015 12:12:07 -0400 Subject: [PATCH 4/6] sched, isolcpu: make cpu_isolated_map visible outside scheduler Needed by the next patch. Also makes cpu_isolated_map present when compiled without SMP and/or with CONFIG_NR_CPUS=1, like the other cpu masks. At some point we may want to clean things up so cpumasks do not exist in UP kernels. Maybe something for the CONFIG_TINY crowd. Cc: Peter Zijlstra Cc: Clark Williams Cc: Li Zefan Cc: Ingo Molnar Cc: Luiz Capitulino Cc: David Rientjes Cc: Mike Galbraith Cc: cgroups@vger.kernel.org Signed-off-by: Rik van Riel Acked-by: Zefan Li Signed-off-by: Tejun Heo --- include/linux/sched.h | 2 ++ kernel/sched/core.c | 6 +++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 6d77432e14ff..ca365d79480c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -329,6 +329,8 @@ extern asmlinkage void schedule_tail(struct task_struct *prev); extern void init_idle(struct task_struct *idle, int cpu); extern void init_idle_bootup_task(struct task_struct *idle); +extern cpumask_var_t cpu_isolated_map; + extern int runqueue_is_locked(int cpu); #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f0f831e8a345..b578bb23410b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -306,6 +306,9 @@ __read_mostly int scheduler_running; */ int sysctl_sched_rt_runtime = 950000; +/* cpus with isolated domains */ +cpumask_var_t cpu_isolated_map; + /* * this_rq_lock - lock this runqueue and disable interrupts. */ @@ -5811,9 +5814,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) update_top_cache_domain(cpu); } -/* cpus with isolated domains */ -static cpumask_var_t cpu_isolated_map; - /* Setup the mask of cpus configured for isolated domains */ static int __init isolated_cpu_setup(char *str) { From 47b8ea7186aae7f474ec4c98f43eaa8da719cd83 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 9 Mar 2015 12:12:08 -0400 Subject: [PATCH 5/6] cpusets, isolcpus: exclude isolcpus from load balancing in cpusets Ensure that cpus specified with the isolcpus= boot commandline option stay outside of the load balancing in the kernel scheduler. Operations like load balancing can introduce unwanted latencies, which is exactly what the isolcpus= commandline is there to prevent. Previously, simply creating a new cpuset, without even touching the cpuset.cpus field inside the new cpuset, would undo the effects of isolcpus=, by creating a scheduler domain spanning the whole system, and setting up load balancing inside that domain. The cpuset root cpuset.cpus file is read-only, so there was not even a way to undo that effect. This does not impact the majority of cpusets users, since isolcpus= is a fairly specialized feature used for realtime purposes. Cc: Peter Zijlstra Cc: Clark Williams Cc: Li Zefan Cc: Ingo Molnar Cc: Luiz Capitulino Cc: Mike Galbraith Cc: cgroups@vger.kernel.org Signed-off-by: Rik van Riel Tested-by: David Rientjes Acked-by: Peter Zijlstra (Intel) Acked-by: David Rientjes Acked-by: Zefan Li Signed-off-by: Tejun Heo --- kernel/cpuset.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index fc7f4748d34a..c68f0721df10 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -622,6 +622,7 @@ static int generate_sched_domains(cpumask_var_t **domains, int csn; /* how many cpuset ptrs in csa so far */ int i, j, k; /* indices for partition finding loops */ cpumask_var_t *doms; /* resulting partition; i.e. sched domains */ + cpumask_var_t non_isolated_cpus; /* load balanced CPUs */ struct sched_domain_attr *dattr; /* attributes for custom domains */ int ndoms = 0; /* number of sched domains in result */ int nslot; /* next empty doms[] struct cpumask slot */ @@ -631,6 +632,10 @@ static int generate_sched_domains(cpumask_var_t **domains, dattr = NULL; csa = NULL; + if (!alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL)) + goto done; + cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); + /* Special case for the 99% of systems with one, full, sched domain */ if (is_sched_load_balance(&top_cpuset)) { ndoms = 1; @@ -643,7 +648,8 @@ static int generate_sched_domains(cpumask_var_t **domains, *dattr = SD_ATTR_INIT; update_domain_attr_tree(dattr, &top_cpuset); } - cpumask_copy(doms[0], top_cpuset.effective_cpus); + cpumask_and(doms[0], top_cpuset.effective_cpus, + non_isolated_cpus); goto done; } @@ -666,7 +672,8 @@ static int generate_sched_domains(cpumask_var_t **domains, * the corresponding sched domain. */ if (!cpumask_empty(cp->cpus_allowed) && - !is_sched_load_balance(cp)) + !(is_sched_load_balance(cp) && + cpumask_intersects(cp->cpus_allowed, non_isolated_cpus))) continue; if (is_sched_load_balance(cp)) @@ -748,6 +755,7 @@ restart: if (apn == b->pn) { cpumask_or(dp, dp, b->effective_cpus); + cpumask_and(dp, dp, non_isolated_cpus); if (dattr) update_domain_attr_tree(dattr + nslot, b); @@ -760,6 +768,7 @@ restart: BUG_ON(nslot != ndoms); done: + free_cpumask_var(non_isolated_cpus); kfree(csa); /* From 34ebe933417e16f46bc30ea77a66e7f30d0cf0f8 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 9 Mar 2015 12:12:10 -0400 Subject: [PATCH 6/6] cpuset, isolcpus: document relationship between cpusets & isolcpus Document the subtly changed relationship between cpusets and isolcpus. Turns out the old documentation did not match the code... Signed-off-by: Rik van Riel Suggested-by: Peter Zijlstra Acked-by: Zefan Li Signed-off-by: Tejun Heo --- Documentation/cgroups/cpusets.txt | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/Documentation/cgroups/cpusets.txt b/Documentation/cgroups/cpusets.txt index f2235a162529..fdf7dff3f607 100644 --- a/Documentation/cgroups/cpusets.txt +++ b/Documentation/cgroups/cpusets.txt @@ -392,8 +392,10 @@ Put simply, it costs less to balance between two smaller sched domains than one big one, but doing so means that overloads in one of the two domains won't be load balanced to the other one. -By default, there is one sched domain covering all CPUs, except those -marked isolated using the kernel boot time "isolcpus=" argument. +By default, there is one sched domain covering all CPUs, including those +marked isolated using the kernel boot time "isolcpus=" argument. However, +the isolated CPUs will not participate in load balancing, and will not +have tasks running on them unless explicitly assigned. This default load balancing across all CPUs is not well suited for the following two situations: @@ -465,6 +467,10 @@ such partially load balanced cpusets, as they may be artificially constrained to some subset of the CPUs allowed to them, for lack of load balancing to the other CPUs. +CPUs in "cpuset.isolcpus" were excluded from load balancing by the +isolcpus= kernel boot option, and will never be load balanced regardless +of the value of "cpuset.sched_load_balance" in any cpuset. + 1.7.1 sched_load_balance implementation details. ------------------------------------------------