clone3: allow spawning processes into cgroups
This adds support for creating a process in a different cgroup than its parent. Callers can limit and account processes and threads right from the moment they are spawned: - A service manager can directly spawn new services into dedicated cgroups. - A process can be directly created in a frozen cgroup and will be frozen as well. - The initial accounting jitter experienced by process supervisors and daemons is eliminated with this. - Threaded applications or even thread implementations can choose to create a specific cgroup layout where each thread is spawned directly into a dedicated cgroup. This feature is limited to the unified hierarchy. Callers need to pass a directory file descriptor for the target cgroup. The caller can choose to pass an O_PATH file descriptor. All usual migration restrictions apply, i.e. there can be no processes in inner nodes. In general, creating a process directly in a target cgroup adheres to all migration restrictions. One of the biggest advantages of this feature is that CLONE_INTO_CGROUP does not need to grab the write side of the cgroup_threadgroup_rwsem. This global lock makes moving tasks/threads around super expensive. With clone3() this lock is avoided. Cc: Tejun Heo <tj@kernel.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Li Zefan <lizefan@huawei.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: cgroups@vger.kernel.org Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com> Signed-off-by: Tejun Heo <tj@kernel.org>
This commit is contained in:
parent
f3553220d4
commit
ef2c41cf38
@ -628,8 +628,9 @@ struct cgroup_subsys {
|
||||
void (*cancel_attach)(struct cgroup_taskset *tset);
|
||||
void (*attach)(struct cgroup_taskset *tset);
|
||||
void (*post_attach)(void);
|
||||
int (*can_fork)(struct task_struct *task);
|
||||
void (*cancel_fork)(struct task_struct *task);
|
||||
int (*can_fork)(struct task_struct *task,
|
||||
struct css_set *cset);
|
||||
void (*cancel_fork)(struct task_struct *task, struct css_set *cset);
|
||||
void (*fork)(struct task_struct *task);
|
||||
void (*exit)(struct task_struct *task);
|
||||
void (*release)(struct task_struct *task);
|
||||
|
@ -27,6 +27,8 @@
|
||||
|
||||
#include <linux/cgroup-defs.h>
|
||||
|
||||
struct kernel_clone_args;
|
||||
|
||||
#ifdef CONFIG_CGROUPS
|
||||
|
||||
/*
|
||||
@ -119,9 +121,12 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
|
||||
struct pid *pid, struct task_struct *tsk);
|
||||
|
||||
void cgroup_fork(struct task_struct *p);
|
||||
extern int cgroup_can_fork(struct task_struct *p);
|
||||
extern void cgroup_cancel_fork(struct task_struct *p);
|
||||
extern void cgroup_post_fork(struct task_struct *p);
|
||||
extern int cgroup_can_fork(struct task_struct *p,
|
||||
struct kernel_clone_args *kargs);
|
||||
extern void cgroup_cancel_fork(struct task_struct *p,
|
||||
struct kernel_clone_args *kargs);
|
||||
extern void cgroup_post_fork(struct task_struct *p,
|
||||
struct kernel_clone_args *kargs);
|
||||
void cgroup_exit(struct task_struct *p);
|
||||
void cgroup_release(struct task_struct *p);
|
||||
void cgroup_free(struct task_struct *p);
|
||||
@ -705,9 +710,12 @@ static inline int cgroupstats_build(struct cgroupstats *stats,
|
||||
struct dentry *dentry) { return -EINVAL; }
|
||||
|
||||
static inline void cgroup_fork(struct task_struct *p) {}
|
||||
static inline int cgroup_can_fork(struct task_struct *p) { return 0; }
|
||||
static inline void cgroup_cancel_fork(struct task_struct *p) {}
|
||||
static inline void cgroup_post_fork(struct task_struct *p) {}
|
||||
static inline int cgroup_can_fork(struct task_struct *p,
|
||||
struct kernel_clone_args *kargs) { return 0; }
|
||||
static inline void cgroup_cancel_fork(struct task_struct *p,
|
||||
struct kernel_clone_args *kargs) {}
|
||||
static inline void cgroup_post_fork(struct task_struct *p,
|
||||
struct kernel_clone_args *kargs) {}
|
||||
static inline void cgroup_exit(struct task_struct *p) {}
|
||||
static inline void cgroup_release(struct task_struct *p) {}
|
||||
static inline void cgroup_free(struct task_struct *p) {}
|
||||
|
@ -13,6 +13,7 @@
|
||||
struct task_struct;
|
||||
struct rusage;
|
||||
union thread_union;
|
||||
struct css_set;
|
||||
|
||||
/* All the bits taken by the old clone syscall. */
|
||||
#define CLONE_LEGACY_FLAGS 0xffffffffULL
|
||||
@ -29,6 +30,9 @@ struct kernel_clone_args {
|
||||
pid_t *set_tid;
|
||||
/* Number of elements in *set_tid */
|
||||
size_t set_tid_size;
|
||||
int cgroup;
|
||||
struct cgroup *cgrp;
|
||||
struct css_set *cset;
|
||||
};
|
||||
|
||||
/*
|
||||
|
@ -35,6 +35,7 @@
|
||||
|
||||
/* Flags for the clone3() syscall. */
|
||||
#define CLONE_CLEAR_SIGHAND 0x100000000ULL /* Clear any signal handler and reset to SIG_DFL. */
|
||||
#define CLONE_INTO_CGROUP 0x200000000ULL /* Clone into a specific cgroup given the right permissions. */
|
||||
|
||||
/*
|
||||
* cloning flags intersect with CSIGNAL so can be used with unshare and clone3
|
||||
@ -81,6 +82,8 @@
|
||||
* @set_tid_size: This defines the size of the array referenced
|
||||
* in @set_tid. This cannot be larger than the
|
||||
* kernel's limit of nested PID namespaces.
|
||||
* @cgroup: If CLONE_INTO_CGROUP is specified set this to
|
||||
* a file descriptor for the cgroup.
|
||||
*
|
||||
* The structure is versioned by size and thus extensible.
|
||||
* New struct members must go at the end of the struct and
|
||||
@ -97,11 +100,13 @@ struct clone_args {
|
||||
__aligned_u64 tls;
|
||||
__aligned_u64 set_tid;
|
||||
__aligned_u64 set_tid_size;
|
||||
__aligned_u64 cgroup;
|
||||
};
|
||||
#endif
|
||||
|
||||
#define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */
|
||||
#define CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */
|
||||
#define CLONE_ARGS_SIZE_VER2 88 /* sizeof third published struct */
|
||||
|
||||
/*
|
||||
* Scheduling policies
|
||||
|
@ -5881,8 +5881,7 @@ out:
|
||||
* @child: pointer to task_struct of forking parent process.
|
||||
*
|
||||
* A task is associated with the init_css_set until cgroup_post_fork()
|
||||
* attaches it to the parent's css_set. Empty cg_list indicates that
|
||||
* @child isn't holding reference to its css_set.
|
||||
* attaches it to the target css_set.
|
||||
*/
|
||||
void cgroup_fork(struct task_struct *child)
|
||||
{
|
||||
@ -5908,24 +5907,154 @@ static struct cgroup *cgroup_get_from_file(struct file *f)
|
||||
return cgrp;
|
||||
}
|
||||
|
||||
/**
|
||||
* cgroup_css_set_fork - find or create a css_set for a child process
|
||||
* @kargs: the arguments passed to create the child process
|
||||
*
|
||||
* This function finds or creates a new css_set which the child
|
||||
* process will be attached to in cgroup_post_fork(). By default,
|
||||
* the child process will be given the same css_set as its parent.
|
||||
*
|
||||
* If CLONE_INTO_CGROUP is specified this function will try to find an
|
||||
* existing css_set which includes the requested cgroup and if not create
|
||||
* a new css_set that the child will be attached to later. If this function
|
||||
* succeeds it will hold cgroup_threadgroup_rwsem on return. If
|
||||
* CLONE_INTO_CGROUP is requested this function will grab cgroup mutex
|
||||
* before grabbing cgroup_threadgroup_rwsem and will hold a reference
|
||||
* to the target cgroup.
|
||||
*/
|
||||
static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
|
||||
__acquires(&cgroup_mutex) __acquires(&cgroup_threadgroup_rwsem)
|
||||
{
|
||||
int ret;
|
||||
struct cgroup *dst_cgrp = NULL;
|
||||
struct css_set *cset;
|
||||
struct super_block *sb;
|
||||
struct file *f;
|
||||
|
||||
if (kargs->flags & CLONE_INTO_CGROUP)
|
||||
mutex_lock(&cgroup_mutex);
|
||||
|
||||
cgroup_threadgroup_change_begin(current);
|
||||
|
||||
spin_lock_irq(&css_set_lock);
|
||||
cset = task_css_set(current);
|
||||
get_css_set(cset);
|
||||
spin_unlock_irq(&css_set_lock);
|
||||
|
||||
if (!(kargs->flags & CLONE_INTO_CGROUP)) {
|
||||
kargs->cset = cset;
|
||||
return 0;
|
||||
}
|
||||
|
||||
f = fget_raw(kargs->cgroup);
|
||||
if (!f) {
|
||||
ret = -EBADF;
|
||||
goto err;
|
||||
}
|
||||
sb = f->f_path.dentry->d_sb;
|
||||
|
||||
dst_cgrp = cgroup_get_from_file(f);
|
||||
if (IS_ERR(dst_cgrp)) {
|
||||
ret = PTR_ERR(dst_cgrp);
|
||||
dst_cgrp = NULL;
|
||||
goto err;
|
||||
}
|
||||
|
||||
if (cgroup_is_dead(dst_cgrp)) {
|
||||
ret = -ENODEV;
|
||||
goto err;
|
||||
}
|
||||
|
||||
/*
|
||||
* Verify that the target cgroup is writable for us. This is
|
||||
* usually done by the vfs layer but since we're not going through
|
||||
* the vfs layer here we need to do it "manually".
|
||||
*/
|
||||
ret = cgroup_may_write(dst_cgrp, sb);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb,
|
||||
!(kargs->flags & CLONE_THREAD));
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
kargs->cset = find_css_set(cset, dst_cgrp);
|
||||
if (!kargs->cset) {
|
||||
ret = -ENOMEM;
|
||||
goto err;
|
||||
}
|
||||
|
||||
put_css_set(cset);
|
||||
fput(f);
|
||||
kargs->cgrp = dst_cgrp;
|
||||
return ret;
|
||||
|
||||
err:
|
||||
cgroup_threadgroup_change_end(current);
|
||||
mutex_unlock(&cgroup_mutex);
|
||||
if (f)
|
||||
fput(f);
|
||||
if (dst_cgrp)
|
||||
cgroup_put(dst_cgrp);
|
||||
put_css_set(cset);
|
||||
if (kargs->cset)
|
||||
put_css_set(kargs->cset);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* cgroup_css_set_put_fork - drop references we took during fork
|
||||
* @kargs: the arguments passed to create the child process
|
||||
*
|
||||
* Drop references to the prepared css_set and target cgroup if
|
||||
* CLONE_INTO_CGROUP was requested.
|
||||
*/
|
||||
static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs)
|
||||
__releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
|
||||
{
|
||||
cgroup_threadgroup_change_end(current);
|
||||
|
||||
if (kargs->flags & CLONE_INTO_CGROUP) {
|
||||
struct cgroup *cgrp = kargs->cgrp;
|
||||
struct css_set *cset = kargs->cset;
|
||||
|
||||
mutex_unlock(&cgroup_mutex);
|
||||
|
||||
if (cset) {
|
||||
put_css_set(cset);
|
||||
kargs->cset = NULL;
|
||||
}
|
||||
|
||||
if (cgrp) {
|
||||
cgroup_put(cgrp);
|
||||
kargs->cgrp = NULL;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* cgroup_can_fork - called on a new task before the process is exposed
|
||||
* @child: the child process
|
||||
*
|
||||
* This prepares a new css_set for the child process which the child will
|
||||
* be attached to in cgroup_post_fork().
|
||||
* This calls the subsystem can_fork() callbacks. If a subsystem's can_fork()
|
||||
* callback returns an error, the fork aborts with that error code. This
|
||||
* allows for a cgroup subsystem to conditionally allow or deny new forks.
|
||||
*/
|
||||
int cgroup_can_fork(struct task_struct *child)
|
||||
__acquires(&cgroup_threadgroup_rwsem) __releases(&cgroup_threadgroup_rwsem)
|
||||
int cgroup_can_fork(struct task_struct *child, struct kernel_clone_args *kargs)
|
||||
{
|
||||
struct cgroup_subsys *ss;
|
||||
int i, j, ret;
|
||||
|
||||
cgroup_threadgroup_change_begin(current);
|
||||
ret = cgroup_css_set_fork(kargs);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
do_each_subsys_mask(ss, i, have_canfork_callback) {
|
||||
ret = ss->can_fork(child);
|
||||
ret = ss->can_fork(child, kargs->cset);
|
||||
if (ret)
|
||||
goto out_revert;
|
||||
} while_each_subsys_mask();
|
||||
@ -5937,32 +6066,34 @@ out_revert:
|
||||
if (j >= i)
|
||||
break;
|
||||
if (ss->cancel_fork)
|
||||
ss->cancel_fork(child);
|
||||
ss->cancel_fork(child, kargs->cset);
|
||||
}
|
||||
|
||||
cgroup_threadgroup_change_end(current);
|
||||
cgroup_css_set_put_fork(kargs);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* cgroup_cancel_fork - called if a fork failed after cgroup_can_fork()
|
||||
* @child: the child process
|
||||
*
|
||||
* This calls the cancel_fork() callbacks if a fork failed *after*
|
||||
* cgroup_can_fork() succeeded.
|
||||
*/
|
||||
void cgroup_cancel_fork(struct task_struct *child)
|
||||
__releases(&cgroup_threadgroup_rwsem)
|
||||
* cgroup_cancel_fork - called if a fork failed after cgroup_can_fork()
|
||||
* @child: the child process
|
||||
* @kargs: the arguments passed to create the child process
|
||||
*
|
||||
* This calls the cancel_fork() callbacks if a fork failed *after*
|
||||
* cgroup_can_fork() succeeded and cleans up references we took to
|
||||
* prepare a new css_set for the child process in cgroup_can_fork().
|
||||
*/
|
||||
void cgroup_cancel_fork(struct task_struct *child,
|
||||
struct kernel_clone_args *kargs)
|
||||
{
|
||||
struct cgroup_subsys *ss;
|
||||
int i;
|
||||
|
||||
for_each_subsys(ss, i)
|
||||
if (ss->cancel_fork)
|
||||
ss->cancel_fork(child);
|
||||
ss->cancel_fork(child, kargs->cset);
|
||||
|
||||
cgroup_threadgroup_change_end(current);
|
||||
cgroup_css_set_put_fork(kargs);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -5972,22 +6103,27 @@ void cgroup_cancel_fork(struct task_struct *child)
|
||||
* Attach the child process to its css_set calling the subsystem fork()
|
||||
* callbacks.
|
||||
*/
|
||||
void cgroup_post_fork(struct task_struct *child)
|
||||
__releases(&cgroup_threadgroup_rwsem)
|
||||
void cgroup_post_fork(struct task_struct *child,
|
||||
struct kernel_clone_args *kargs)
|
||||
__releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
|
||||
{
|
||||
struct cgroup_subsys *ss;
|
||||
struct css_set *cset;
|
||||
int i;
|
||||
|
||||
cset = kargs->cset;
|
||||
kargs->cset = NULL;
|
||||
|
||||
spin_lock_irq(&css_set_lock);
|
||||
|
||||
/* init tasks are special, only link regular threads */
|
||||
if (likely(child->pid)) {
|
||||
WARN_ON_ONCE(!list_empty(&child->cg_list));
|
||||
cset = task_css_set(current); /* current is @child's parent */
|
||||
get_css_set(cset);
|
||||
cset->nr_tasks++;
|
||||
css_set_move_task(child, NULL, cset, false);
|
||||
} else {
|
||||
put_css_set(cset);
|
||||
cset = NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -6020,7 +6156,16 @@ void cgroup_post_fork(struct task_struct *child)
|
||||
ss->fork(child);
|
||||
} while_each_subsys_mask();
|
||||
|
||||
cgroup_threadgroup_change_end(current);
|
||||
/* Make the new cset the root_cset of the new cgroup namespace. */
|
||||
if (kargs->flags & CLONE_NEWCGROUP) {
|
||||
struct css_set *rcset = child->nsproxy->cgroup_ns->root_cset;
|
||||
|
||||
get_css_set(cset);
|
||||
child->nsproxy->cgroup_ns->root_cset = cset;
|
||||
put_css_set(rcset);
|
||||
}
|
||||
|
||||
cgroup_css_set_put_fork(kargs);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -33,6 +33,7 @@
|
||||
#include <linux/atomic.h>
|
||||
#include <linux/cgroup.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/sched/task.h>
|
||||
|
||||
#define PIDS_MAX (PID_MAX_LIMIT + 1ULL)
|
||||
#define PIDS_MAX_STR "max"
|
||||
@ -214,13 +215,16 @@ static void pids_cancel_attach(struct cgroup_taskset *tset)
|
||||
* task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies
|
||||
* on cgroup_threadgroup_change_begin() held by the copy_process().
|
||||
*/
|
||||
static int pids_can_fork(struct task_struct *task)
|
||||
static int pids_can_fork(struct task_struct *task, struct css_set *cset)
|
||||
{
|
||||
struct cgroup_subsys_state *css;
|
||||
struct pids_cgroup *pids;
|
||||
int err;
|
||||
|
||||
css = task_css_check(current, pids_cgrp_id, true);
|
||||
if (cset)
|
||||
css = cset->subsys[pids_cgrp_id];
|
||||
else
|
||||
css = task_css_check(current, pids_cgrp_id, true);
|
||||
pids = css_pids(css);
|
||||
err = pids_try_charge(pids, 1);
|
||||
if (err) {
|
||||
@ -235,12 +239,15 @@ static int pids_can_fork(struct task_struct *task)
|
||||
return err;
|
||||
}
|
||||
|
||||
static void pids_cancel_fork(struct task_struct *task)
|
||||
static void pids_cancel_fork(struct task_struct *task, struct css_set *cset)
|
||||
{
|
||||
struct cgroup_subsys_state *css;
|
||||
struct pids_cgroup *pids;
|
||||
|
||||
css = task_css_check(current, pids_cgrp_id, true);
|
||||
if (cset)
|
||||
css = cset->subsys[pids_cgrp_id];
|
||||
else
|
||||
css = task_css_check(current, pids_cgrp_id, true);
|
||||
pids = css_pids(css);
|
||||
pids_uncharge(pids, 1);
|
||||
}
|
||||
|
@ -2180,7 +2180,7 @@ static __latent_entropy struct task_struct *copy_process(
|
||||
* between here and cgroup_post_fork() if an organisation operation is in
|
||||
* progress.
|
||||
*/
|
||||
retval = cgroup_can_fork(p);
|
||||
retval = cgroup_can_fork(p, args);
|
||||
if (retval)
|
||||
goto bad_fork_put_pidfd;
|
||||
|
||||
@ -2287,7 +2287,7 @@ static __latent_entropy struct task_struct *copy_process(
|
||||
write_unlock_irq(&tasklist_lock);
|
||||
|
||||
proc_fork_connector(p);
|
||||
cgroup_post_fork(p);
|
||||
cgroup_post_fork(p, args);
|
||||
perf_event_fork(p);
|
||||
|
||||
trace_task_newtask(p, clone_flags);
|
||||
@ -2298,7 +2298,7 @@ static __latent_entropy struct task_struct *copy_process(
|
||||
bad_fork_cancel_cgroup:
|
||||
spin_unlock(¤t->sighand->siglock);
|
||||
write_unlock_irq(&tasklist_lock);
|
||||
cgroup_cancel_fork(p);
|
||||
cgroup_cancel_fork(p, args);
|
||||
bad_fork_put_pidfd:
|
||||
if (clone_flags & CLONE_PIDFD) {
|
||||
fput(pidfile);
|
||||
@ -2627,6 +2627,9 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
|
||||
!valid_signal(args.exit_signal)))
|
||||
return -EINVAL;
|
||||
|
||||
if ((args.flags & CLONE_INTO_CGROUP) && args.cgroup < 0)
|
||||
return -EINVAL;
|
||||
|
||||
*kargs = (struct kernel_clone_args){
|
||||
.flags = args.flags,
|
||||
.pidfd = u64_to_user_ptr(args.pidfd),
|
||||
@ -2637,6 +2640,7 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
|
||||
.stack_size = args.stack_size,
|
||||
.tls = args.tls,
|
||||
.set_tid_size = args.set_tid_size,
|
||||
.cgroup = args.cgroup,
|
||||
};
|
||||
|
||||
if (args.set_tid &&
|
||||
@ -2680,7 +2684,8 @@ static inline bool clone3_stack_valid(struct kernel_clone_args *kargs)
|
||||
static bool clone3_args_valid(struct kernel_clone_args *kargs)
|
||||
{
|
||||
/* Verify that no unknown flags are passed along. */
|
||||
if (kargs->flags & ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND))
|
||||
if (kargs->flags &
|
||||
~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP))
|
||||
return false;
|
||||
|
||||
/*
|
||||
|
Loading…
Reference in New Issue
Block a user