Task Control Groups: make cpusets a client of cgroups
Remove the filesystem support logic from the cpusets system and make cpusets a cgroup subsystem. The "cpuset" filesystem becomes a dummy filesystem; attempts to mount it get passed through to the cgroup filesystem with the appropriate options to emulate the old cpuset filesystem behaviour.

Signed-off-by: Paul Menage <menage@google.com>
Cc: Serge E. Hallyn <serue@us.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Paul Jackson <pj@sgi.com>
Cc: Kirill Korotaev <dev@openvz.org>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: Srivatsa Vaddagiri <vatsa@in.ibm.com>
Cc: Cedric Le Goater <clg@fr.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
commit 8793d854ed (parent 81a6a5cdd2)
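The mount passthrough described in the commit message can be sketched as follows in the 2.6.24-era VFS API. This is an illustrative outline, not a verbatim quote of the kernel/cpuset.c diff (suppressed further below); the identifiers cpuset_get_sb and cpuset_fs_type and the exact mount-option string are assumptions based on the behaviour described here and in the documentation changes.

#include <linux/fs.h>
#include <linux/mount.h>

/* Illustrative sketch: mounting -t cpuset simply re-mounts the "cgroup"
 * filesystem with the cpuset option, so the old interface keeps working. */
static int cpuset_get_sb(struct file_system_type *fs_type,
                         int flags, const char *unused_dev_name,
                         void *data, struct vfsmount *mnt)
{
        struct file_system_type *cgroup_fs = get_fs_type("cgroup");
        int ret = -ENODEV;

        if (cgroup_fs) {
                char mountopts[] = "cpuset";    /* assumed option string */
                ret = cgroup_fs->get_sb(cgroup_fs, flags,
                                        unused_dev_name, mountopts, mnt);
                put_filesystem(cgroup_fs);
        }
        return ret;
}

static struct file_system_type cpuset_fs_type = {
        .name   = "cpuset",
        .get_sb = cpuset_get_sb,  /* no kill_sb: the superblock belongs to the cgroup fs type */
};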
@@ -7,6 +7,7 @@ Written by Simon.Derr@bull.net
Portions Copyright (c) 2004-2006 Silicon Graphics, Inc.
Modified by Paul Jackson <pj@sgi.com>
Modified by Christoph Lameter <clameter@sgi.com>
Modified by Paul Menage <menage@google.com>

CONTENTS:
=========
@@ -16,10 +17,9 @@ CONTENTS:
1.2 Why are cpusets needed ?
1.3 How are cpusets implemented ?
1.4 What are exclusive cpusets ?
1.5 What does notify_on_release do ?
1.6 What is memory_pressure ?
1.7 What is memory spread ?
1.8 How do I use cpusets ?
1.5 What is memory_pressure ?
1.6 What is memory spread ?
1.7 How do I use cpusets ?
2. Usage Examples and Syntax
2.1 Basic Usage
2.2 Adding/removing cpus
@@ -44,18 +44,19 @@ hierarchy visible in a virtual file system. These are the essential
hooks, beyond what is already present, required to manage dynamic
job placement on large systems.

Each task has a pointer to a cpuset. Multiple tasks may reference
the same cpuset. Requests by a task, using the sched_setaffinity(2)
system call to include CPUs in its CPU affinity mask, and using the
mbind(2) and set_mempolicy(2) system calls to include Memory Nodes
in its memory policy, are both filtered through that tasks cpuset,
filtering out any CPUs or Memory Nodes not in that cpuset. The
scheduler will not schedule a task on a CPU that is not allowed in
its cpus_allowed vector, and the kernel page allocator will not
allocate a page on a node that is not allowed in the requesting tasks
mems_allowed vector.
Cpusets use the generic cgroup subsystem described in
Documentation/cgroup.txt.

User level code may create and destroy cpusets by name in the cpuset
Requests by a task, using the sched_setaffinity(2) system call to
include CPUs in its CPU affinity mask, and using the mbind(2) and
set_mempolicy(2) system calls to include Memory Nodes in its memory
policy, are both filtered through that tasks cpuset, filtering out any
CPUs or Memory Nodes not in that cpuset. The scheduler will not
schedule a task on a CPU that is not allowed in its cpus_allowed
vector, and the kernel page allocator will not allocate a page on a
node that is not allowed in the requesting tasks mems_allowed vector.

User level code may create and destroy cpusets by name in the cgroup
virtual file system, manage the attributes and permissions of these
cpusets and which CPUs and Memory Nodes are assigned to each cpuset,
specify and query to which cpuset a task is assigned, and list the
@@ -115,7 +116,7 @@ Cpusets extends these two mechanisms as follows:
 - Cpusets are sets of allowed CPUs and Memory Nodes, known to the
   kernel.
 - Each task in the system is attached to a cpuset, via a pointer
   in the task structure to a reference counted cpuset structure.
   in the task structure to a reference counted cgroup structure.
 - Calls to sched_setaffinity are filtered to just those CPUs
   allowed in that tasks cpuset.
 - Calls to mbind and set_mempolicy are filtered to just
@@ -145,15 +146,10 @@ into the rest of the kernel, none in performance critical paths:
 - in page_alloc.c, to restrict memory to allowed nodes.
 - in vmscan.c, to restrict page recovery to the current cpuset.

In addition a new file system, of type "cpuset" may be mounted,
typically at /dev/cpuset, to enable browsing and modifying the cpusets
presently known to the kernel. No new system calls are added for
cpusets - all support for querying and modifying cpusets is via
this cpuset file system.

Each task under /proc has an added file named 'cpuset', displaying
the cpuset name, as the path relative to the root of the cpuset file
system.
You should mount the "cgroup" filesystem type in order to enable
browsing and modifying the cpusets presently known to the kernel. No
new system calls are added for cpusets - all support for querying and
modifying cpusets is via this cpuset file system.

The /proc/<pid>/status file for each task has two added lines,
displaying the tasks cpus_allowed (on which CPUs it may be scheduled)
@@ -163,16 +159,15 @@ in the format seen in the following example:
Cpus_allowed: ffffffff,ffffffff,ffffffff,ffffffff
Mems_allowed: ffffffff,ffffffff

Each cpuset is represented by a directory in the cpuset file system
containing the following files describing that cpuset:
Each cpuset is represented by a directory in the cgroup file system
containing (on top of the standard cgroup files) the following
files describing that cpuset:

 - cpus: list of CPUs in that cpuset
 - mems: list of Memory Nodes in that cpuset
 - memory_migrate flag: if set, move pages to cpusets nodes
 - cpu_exclusive flag: is cpu placement exclusive?
 - mem_exclusive flag: is memory placement exclusive?
 - tasks: list of tasks (by pid) attached to that cpuset
 - notify_on_release flag: run /sbin/cpuset_release_agent on exit?
 - memory_pressure: measure of how much paging pressure in cpuset

In addition, the root cpuset only has the following file:
@@ -237,21 +232,7 @@ such as requests from interrupt handlers, is allowed to be taken
outside even a mem_exclusive cpuset.


1.5 What does notify_on_release do ?
------------------------------------

If the notify_on_release flag is enabled (1) in a cpuset, then whenever
the last task in the cpuset leaves (exits or attaches to some other
cpuset) and the last child cpuset of that cpuset is removed, then
the kernel runs the command /sbin/cpuset_release_agent, supplying the
pathname (relative to the mount point of the cpuset file system) of the
abandoned cpuset. This enables automatic removal of abandoned cpusets.
The default value of notify_on_release in the root cpuset at system
boot is disabled (0). The default value of other cpusets at creation
is the current value of their parents notify_on_release setting.


1.6 What is memory_pressure ?
1.5 What is memory_pressure ?
-----------------------------
The memory_pressure of a cpuset provides a simple per-cpuset metric
of the rate that the tasks in a cpuset are attempting to free up in
@@ -308,7 +289,7 @@ the tasks in the cpuset, in units of reclaims attempted per second,
times 1000.


1.7 What is memory spread ?
1.6 What is memory spread ?
---------------------------
There are two boolean flag files per cpuset that control where the
kernel allocates pages for the file system buffers and related in
@@ -379,7 +360,7 @@ data set, the memory allocation across the nodes in the jobs cpuset
can become very uneven.


1.8 How do I use cpusets ?
1.7 How do I use cpusets ?
--------------------------

In order to minimize the impact of cpusets on critical kernel
@@ -469,7 +450,7 @@ than stress the kernel.
To start a new job that is to be contained within a cpuset, the steps are:

 1) mkdir /dev/cpuset
 2) mount -t cpuset none /dev/cpuset
 2) mount -t cgroup -ocpuset cpuset /dev/cpuset
 3) Create the new cpuset by doing mkdir's and write's (or echo's) in
    the /dev/cpuset virtual file system.
 4) Start a task that will be the "founding father" of the new job.
@@ -481,7 +462,7 @@ For example, the following sequence of commands will setup a cpuset
named "Charlie", containing just CPUs 2 and 3, and Memory Node 1,
and then start a subshell 'sh' in that cpuset:

mount -t cpuset none /dev/cpuset
mount -t cgroup -ocpuset cpuset /dev/cpuset
cd /dev/cpuset
mkdir Charlie
cd Charlie
@@ -513,7 +494,7 @@ Creating, modifying, using the cpusets can be done through the cpuset
virtual filesystem.

To mount it, type:
# mount -t cpuset none /dev/cpuset
# mount -t cgroup -o cpuset cpuset /dev/cpuset

Then under /dev/cpuset you can find a tree that corresponds to the
tree of the cpusets in the system. For instance, /dev/cpuset
@@ -556,6 +537,18 @@ To remove a cpuset, just use rmdir:
This will fail if the cpuset is in use (has cpusets inside, or has
processes attached).

Note that for legacy reasons, the "cpuset" filesystem exists as a
wrapper around the cgroup filesystem.

The command

mount -t cpuset X /dev/cpuset

is equivalent to

mount -t cgroup -ocpuset X /dev/cpuset
echo "/sbin/cpuset_release_agent" > /dev/cpuset/release_agent

2.2 Adding/removing cpus
------------------------

@@ -2131,7 +2131,7 @@ static const struct pid_entry tgid_base_stuff[] = {
#ifdef CONFIG_SCHEDSTATS
INF("schedstat", S_IRUGO, pid_schedstat),
#endif
#ifdef CONFIG_CPUSETS
#ifdef CONFIG_PROC_PID_CPUSET
REG("cpuset", S_IRUGO, cpuset),
#endif
#ifdef CONFIG_CGROUPS
@@ -2420,7 +2420,7 @@ static const struct pid_entry tid_base_stuff[] = {
#ifdef CONFIG_SCHEDSTATS
INF("schedstat", S_IRUGO, pid_schedstat),
#endif
#ifdef CONFIG_CPUSETS
#ifdef CONFIG_PROC_PID_CPUSET
REG("cpuset", S_IRUGO, cpuset),
#endif
#ifdef CONFIG_CGROUPS
@@ -7,4 +7,10 @@

/* */

#ifdef CONFIG_CPUSETS
SUBSYS(cpuset)
#endif

/* */

/* */
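For context, the single SUBSYS(cpuset) entry above is enough to give the subsystem a compile-time id because include/linux/cgroup.h re-includes this header under different definitions of SUBSYS(). A minimal sketch of that pattern, paraphrased from the cgroup header rather than quoted from a hunk of this commit:

/* cgroup.h builds the subsystem id enum by textually including
 * cgroup_subsys.h; each SUBSYS(foo) line becomes foo_subsys_id. */
#define SUBSYS(_x) _x ## _subsys_id,
enum cgroup_subsys_id {
#include <linux/cgroup_subsys.h>
        CGROUP_SUBSYS_COUNT
};
#undef SUBSYS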
@@ -11,6 +11,7 @@
#include <linux/sched.h>
#include <linux/cpumask.h>
#include <linux/nodemask.h>
#include <linux/cgroup.h>

#ifdef CONFIG_CPUSETS

@@ -19,8 +20,6 @@ extern int number_of_cpusets; /* How many cpusets are defined in system? */
extern int cpuset_init_early(void);
extern int cpuset_init(void);
extern void cpuset_init_smp(void);
extern void cpuset_fork(struct task_struct *p);
extern void cpuset_exit(struct task_struct *p);
extern cpumask_t cpuset_cpus_allowed(struct task_struct *p);
extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
#define cpuset_current_mems_allowed (current->mems_allowed)
@@ -76,13 +75,13 @@ static inline int cpuset_do_slab_mem_spread(void)

extern void cpuset_track_online_nodes(void);

extern int current_cpuset_is_being_rebound(void);

#else /* !CONFIG_CPUSETS */

static inline int cpuset_init_early(void) { return 0; }
static inline int cpuset_init(void) { return 0; }
static inline void cpuset_init_smp(void) {}
static inline void cpuset_fork(struct task_struct *p) {}
static inline void cpuset_exit(struct task_struct *p) {}

static inline cpumask_t cpuset_cpus_allowed(struct task_struct *p)
{
@@ -148,6 +147,11 @@ static inline int cpuset_do_slab_mem_spread(void)

static inline void cpuset_track_online_nodes(void) {}

static inline int current_cpuset_is_being_rebound(void)
{
return 0;
}

#endif /* !CONFIG_CPUSETS */

#endif /* _LINUX_CPUSET_H */
@@ -148,14 +148,6 @@ extern void mpol_rebind_task(struct task_struct *tsk,
const nodemask_t *new);
extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new);
extern void mpol_fix_fork_child_flag(struct task_struct *p);
#define set_cpuset_being_rebound(x) (cpuset_being_rebound = (x))

#ifdef CONFIG_CPUSETS
#define current_cpuset_is_being_rebound() \
(cpuset_being_rebound == current->cpuset)
#else
#define current_cpuset_is_being_rebound() 0
#endif

extern struct mempolicy default_policy;
extern struct zonelist *huge_zonelist(struct vm_area_struct *vma,
@@ -173,8 +165,6 @@ static inline void check_highest_zone(enum zone_type k)
int do_migrate_pages(struct mm_struct *mm,
const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags);

extern void *cpuset_being_rebound; /* Trigger mpol_copy vma rebind */

#else

struct mempolicy {};
@@ -248,8 +238,6 @@ static inline void mpol_fix_fork_child_flag(struct task_struct *p)
{
}

#define set_cpuset_being_rebound(x) do {} while (0)

static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma,
unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol)
{
@@ -756,8 +756,6 @@ static inline int above_background_load(void)
}

struct io_context; /* See blkdev.h */
struct cpuset;

#define NGROUPS_SMALL 32
#define NGROUPS_PER_BLOCK ((int)(PAGE_SIZE / sizeof(gid_t)))
struct group_info {
@@ -1125,7 +1123,6 @@ struct task_struct {
short il_next;
#endif
#ifdef CONFIG_CPUSETS
struct cpuset *cpuset;
nodemask_t mems_allowed;
int cpuset_mems_generation;
int cpuset_mem_spread_rotor;
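With the struct cpuset pointer gone from task_struct (hunk above), a task reaches its cpuset through its cgroup subsystem state instead. A sketch of that accessor pattern, with an abbreviated struct cpuset, is shown below; the field layout and the task_cs() name are assumptions based on the cgroup API added by this series, not a quote of the suppressed kernel/cpuset.c diff.

/* Sketch: the cpuset embeds a cgroup_subsys_state, and a task's cpuset
 * is found via its cgroup rather than via a task_struct pointer. */
struct cpuset {
        struct cgroup_subsys_state css;  /* embedded cgroup state */
        cpumask_t cpus_allowed;          /* CPUs allowed to tasks in this cpuset */
        nodemask_t mems_allowed;         /* Memory Nodes allowed to tasks */
        /* ... flags, parent, counters ... */
};

static inline struct cpuset *task_cs(struct task_struct *task)
{
        return container_of(task_subsys_state(task, cpuset_subsys_id),
                            struct cpuset, css);
}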
@@ -280,7 +280,7 @@ config CGROUPS

config CPUSETS
bool "Cpuset support"
depends on SMP
depends on SMP && CGROUPS
help
This option will let you create and manage CPUSETs which
allow dynamically partitioning a system into sets of CPUs and
@@ -330,6 +330,11 @@ config SYSFS_DEPRECATED
If you are using a distro that was released in 2006 or later,
it should be safe to say N here.

config PROC_PID_CPUSET
bool "Include legacy /proc/<pid>/cpuset file"
depends on CPUSETS
default y

config RELAY
bool "Kernel->user space relay support (formerly relayfs)"
help
kernel/cpuset.c (1188 lines changed): file diff suppressed because it is too large.
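The suppressed kernel/cpuset.c diff is where the bulk of the conversion happens: the cpuset-specific filesystem code is deleted and a cgroup subsystem registration takes its place. A rough sketch of its shape follows, with callback bodies omitted; the names follow the cgroup API introduced by this patch series, and the exact member list is an assumption rather than the verbatim file.

static struct cgroup_subsys_state *cpuset_create(struct cgroup_subsys *ss,
                                                 struct cgroup *cont);
static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont);
static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
                             struct task_struct *tsk);
static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
                          struct cgroup *oldcont, struct task_struct *tsk);
static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont);
static void cpuset_post_clone(struct cgroup_subsys *ss, struct cgroup *cont);

struct cgroup_subsys cpuset_subsys = {
        .name = "cpuset",
        .create = cpuset_create,          /* allocate a cpuset when a cgroup dir is made */
        .destroy = cpuset_destroy,        /* free it on rmdir */
        .can_attach = cpuset_can_attach,  /* refuse attach if cpus/mems are empty */
        .attach = cpuset_attach,          /* rebind the task's CPUs and Memory Nodes */
        .populate = cpuset_populate,      /* create the cpus, mems, flag, ... files */
        .post_clone = cpuset_post_clone,
        .subsys_id = cpuset_subsys_id,    /* generated from SUBSYS(cpuset) above */
        .early_init = 1,                  /* the top cpuset is needed early at boot */
};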
@@ -31,7 +31,6 @@
#include <linux/taskstats_kern.h>
#include <linux/delayacct.h>
#include <linux/freezer.h>
#include <linux/cpuset.h>
#include <linux/cgroup.h>
#include <linux/syscalls.h>
#include <linux/signal.h>
@@ -973,7 +972,6 @@ fastcall NORET_TYPE void do_exit(long code)
__exit_fs(tsk);
check_stack_usage();
exit_thread();
cpuset_exit(tsk);
cgroup_exit(tsk, 1);
exit_keys(tsk);

@@ -29,7 +29,6 @@
#include <linux/nsproxy.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/cgroup.h>
#include <linux/security.h>
#include <linux/swap.h>
@@ -1089,7 +1088,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
#endif
p->io_context = NULL;
p->audit_context = NULL;
cpuset_fork(p);
cgroup_fork(p);
#ifdef CONFIG_NUMA
p->mempolicy = mpol_copy(p->mempolicy);
@@ -1330,7 +1328,6 @@ bad_fork_cleanup_policy:
mpol_free(p->mempolicy);
bad_fork_cleanup_cgroup:
#endif
cpuset_exit(p);
cgroup_exit(p, cgroup_callbacks_done);
bad_fork_cleanup_delays_binfmt:
delayacct_tsk_free(p);
@@ -1388,7 +1388,6 @@ EXPORT_SYMBOL(alloc_pages_current);
* keeps mempolicies cpuset relative after its cpuset moves. See
* further kernel/cpuset.c update_nodemask().
*/
void *cpuset_being_rebound;

/* Slow path of a mempolicy copy */
struct mempolicy *__mpol_copy(struct mempolicy *old)
@@ -2019,4 +2018,3 @@ out:
m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
return 0;
}
