cgroup: add support for eBPF programs
This patch adds two sets of eBPF program pointers to struct cgroup: one for programs that are directly pinned to a cgroup, and one for programs that are effective for it. To illustrate the logic behind that, assume the following example cgroup hierarchy: A - B - C \ D - E (the "\" marks a second branch below B, so C and D are both children of B, and E is a child of D). If only B has a program attached, it will be effective for B, C, D and E. If D then attaches a program itself, that will be effective for both D and E, and the program in B will only affect B and C. Only one program of a given type is effective for a cgroup. Attaching and detaching programs will be done through the bpf(2) syscall. For now, ingress and egress inet socket filtering are the only supported use-cases. Signed-off-by: Daniel Mack <daniel@zonque.org> Acked-by: Alexei Starovoitov <ast@kernel.org> Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
parent
0e33661de4
commit
3007098494
79
include/linux/bpf-cgroup.h
Normal file
79
include/linux/bpf-cgroup.h
Normal file
@ -0,0 +1,79 @@
|
|||||||
|
#ifndef _BPF_CGROUP_H
|
||||||
|
#define _BPF_CGROUP_H
|
||||||
|
|
||||||
|
#include <linux/bpf.h>
|
||||||
|
#include <linux/jump_label.h>
|
||||||
|
#include <uapi/linux/bpf.h>
|
||||||
|
|
||||||
|
struct sock;
|
||||||
|
struct cgroup;
|
||||||
|
struct sk_buff;
|
||||||
|
|
||||||
|
#ifdef CONFIG_CGROUP_BPF
|
||||||
|
|
||||||
|
extern struct static_key_false cgroup_bpf_enabled_key;
|
||||||
|
#define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key)
|
||||||
|
|
||||||
|
struct cgroup_bpf {
|
||||||
|
/*
|
||||||
|
* Store two sets of bpf_prog pointers, one for programs that are
|
||||||
|
* pinned directly to this cgroup, and one for those that are effective
|
||||||
|
* when this cgroup is accessed.
|
||||||
|
*/
|
||||||
|
struct bpf_prog *prog[MAX_BPF_ATTACH_TYPE];
|
||||||
|
struct bpf_prog *effective[MAX_BPF_ATTACH_TYPE];
|
||||||
|
};
|
||||||
|
|
||||||
|
void cgroup_bpf_put(struct cgroup *cgrp);
|
||||||
|
void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent);
|
||||||
|
|
||||||
|
void __cgroup_bpf_update(struct cgroup *cgrp,
|
||||||
|
struct cgroup *parent,
|
||||||
|
struct bpf_prog *prog,
|
||||||
|
enum bpf_attach_type type);
|
||||||
|
|
||||||
|
/* Wrapper for __cgroup_bpf_update() protected by cgroup_mutex */
|
||||||
|
void cgroup_bpf_update(struct cgroup *cgrp,
|
||||||
|
struct bpf_prog *prog,
|
||||||
|
enum bpf_attach_type type);
|
||||||
|
|
||||||
|
int __cgroup_bpf_run_filter(struct sock *sk,
|
||||||
|
struct sk_buff *skb,
|
||||||
|
enum bpf_attach_type type);
|
||||||
|
|
||||||
|
/* Wrappers for __cgroup_bpf_run_filter() guarded by cgroup_bpf_enabled. */
|
||||||
|
#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) \
|
||||||
|
({ \
|
||||||
|
int __ret = 0; \
|
||||||
|
if (cgroup_bpf_enabled) \
|
||||||
|
__ret = __cgroup_bpf_run_filter(sk, skb, \
|
||||||
|
BPF_CGROUP_INET_INGRESS); \
|
||||||
|
\
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
|
||||||
|
#define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) \
|
||||||
|
({ \
|
||||||
|
int __ret = 0; \
|
||||||
|
if (cgroup_bpf_enabled && sk && sk == skb->sk) { \
|
||||||
|
typeof(sk) __sk = sk_to_full_sk(sk); \
|
||||||
|
if (sk_fullsock(__sk)) \
|
||||||
|
__ret = __cgroup_bpf_run_filter(__sk, skb, \
|
||||||
|
BPF_CGROUP_INET_EGRESS); \
|
||||||
|
} \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
struct cgroup_bpf {};
|
||||||
|
static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
|
||||||
|
static inline void cgroup_bpf_inherit(struct cgroup *cgrp,
|
||||||
|
struct cgroup *parent) {}
|
||||||
|
|
||||||
|
#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; })
|
||||||
|
#define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; })
|
||||||
|
|
||||||
|
#endif /* CONFIG_CGROUP_BPF */
|
||||||
|
|
||||||
|
#endif /* _BPF_CGROUP_H */
|
@ -16,6 +16,7 @@
|
|||||||
#include <linux/percpu-refcount.h>
|
#include <linux/percpu-refcount.h>
|
||||||
#include <linux/percpu-rwsem.h>
|
#include <linux/percpu-rwsem.h>
|
||||||
#include <linux/workqueue.h>
|
#include <linux/workqueue.h>
|
||||||
|
#include <linux/bpf-cgroup.h>
|
||||||
|
|
||||||
#ifdef CONFIG_CGROUPS
|
#ifdef CONFIG_CGROUPS
|
||||||
|
|
||||||
@ -300,6 +301,9 @@ struct cgroup {
|
|||||||
/* used to schedule release agent */
|
/* used to schedule release agent */
|
||||||
struct work_struct release_agent_work;
|
struct work_struct release_agent_work;
|
||||||
|
|
||||||
|
/* used to store eBPF programs */
|
||||||
|
struct cgroup_bpf bpf;
|
||||||
|
|
||||||
/* ids of the ancestors at each level including self */
|
/* ids of the ancestors at each level including self */
|
||||||
int ancestor_ids[];
|
int ancestor_ids[];
|
||||||
};
|
};
|
||||||
|
12
init/Kconfig
12
init/Kconfig
@ -1154,6 +1154,18 @@ config CGROUP_PERF
|
|||||||
|
|
||||||
Say N if unsure.
|
Say N if unsure.
|
||||||
|
|
||||||
|
config CGROUP_BPF
|
||||||
|
bool "Support for eBPF programs attached to cgroups"
|
||||||
|
depends on BPF_SYSCALL && SOCK_CGROUP_DATA
|
||||||
|
help
|
||||||
|
Allow attaching eBPF programs to a cgroup using the bpf(2)
|
||||||
|
syscall command BPF_PROG_ATTACH.
|
||||||
|
|
||||||
|
In which context these programs are accessed depends on the type
|
||||||
|
of attachment. For instance, programs that are attached using
|
||||||
|
BPF_CGROUP_INET_INGRESS will be executed on the ingress path of
|
||||||
|
inet sockets.
|
||||||
|
|
||||||
config CGROUP_DEBUG
|
config CGROUP_DEBUG
|
||||||
bool "Example controller"
|
bool "Example controller"
|
||||||
default n
|
default n
|
||||||
|
@ -5,3 +5,4 @@ obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list
|
|||||||
ifeq ($(CONFIG_PERF_EVENTS),y)
|
ifeq ($(CONFIG_PERF_EVENTS),y)
|
||||||
obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
|
obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
|
||||||
endif
|
endif
|
||||||
|
obj-$(CONFIG_CGROUP_BPF) += cgroup.o
|
||||||
|
167
kernel/bpf/cgroup.c
Normal file
167
kernel/bpf/cgroup.c
Normal file
@ -0,0 +1,167 @@
|
|||||||
|
/*
|
||||||
|
* Functions to manage eBPF programs attached to cgroups
|
||||||
|
*
|
||||||
|
* Copyright (c) 2016 Daniel Mack
|
||||||
|
*
|
||||||
|
* This file is subject to the terms and conditions of version 2 of the GNU
|
||||||
|
* General Public License. See the file COPYING in the main directory of the
|
||||||
|
* Linux distribution for more details.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <linux/kernel.h>
|
||||||
|
#include <linux/atomic.h>
|
||||||
|
#include <linux/cgroup.h>
|
||||||
|
#include <linux/slab.h>
|
||||||
|
#include <linux/bpf.h>
|
||||||
|
#include <linux/bpf-cgroup.h>
|
||||||
|
#include <net/sock.h>
|
||||||
|
|
||||||
|
DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key);
|
||||||
|
EXPORT_SYMBOL(cgroup_bpf_enabled_key);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* cgroup_bpf_put() - put references of all bpf programs
|
||||||
|
* @cgrp: the cgroup to modify
|
||||||
|
*/
|
||||||
|
void cgroup_bpf_put(struct cgroup *cgrp)
|
||||||
|
{
|
||||||
|
unsigned int type;
|
||||||
|
|
||||||
|
for (type = 0; type < ARRAY_SIZE(cgrp->bpf.prog); type++) {
|
||||||
|
struct bpf_prog *prog = cgrp->bpf.prog[type];
|
||||||
|
|
||||||
|
if (prog) {
|
||||||
|
bpf_prog_put(prog);
|
||||||
|
static_branch_dec(&cgroup_bpf_enabled_key);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* cgroup_bpf_inherit() - inherit effective programs from parent
|
||||||
|
* @cgrp: the cgroup to modify
|
||||||
|
* @parent: the parent to inherit from
|
||||||
|
*/
|
||||||
|
void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent)
|
||||||
|
{
|
||||||
|
unsigned int type;
|
||||||
|
|
||||||
|
for (type = 0; type < ARRAY_SIZE(cgrp->bpf.effective); type++) {
|
||||||
|
struct bpf_prog *e;
|
||||||
|
|
||||||
|
e = rcu_dereference_protected(parent->bpf.effective[type],
|
||||||
|
lockdep_is_held(&cgroup_mutex));
|
||||||
|
rcu_assign_pointer(cgrp->bpf.effective[type], e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* __cgroup_bpf_update() - Update the pinned program of a cgroup, and
|
||||||
|
* propagate the change to descendants
|
||||||
|
* @cgrp: The cgroup whose descendants to traverse
|
||||||
|
* @parent: The parent of @cgrp, or %NULL if @cgrp is the root
|
||||||
|
* @prog: A new program to pin
|
||||||
|
* @type: Type of pinning operation (ingress/egress)
|
||||||
|
*
|
||||||
|
* Each cgroup has a set of two pointers for bpf programs; one for eBPF
|
||||||
|
* programs it owns, and one for the program that is effective for execution.
|
||||||
|
*
|
||||||
|
* If @prog is not %NULL, this function attaches a new program to the cgroup and
|
||||||
|
* releases the one that is currently attached, if any. @prog is then made
|
||||||
|
* the effective program of type @type in that cgroup.
|
||||||
|
*
|
||||||
|
* If @prog is %NULL, the currently attached program of type @type is released,
|
||||||
|
* and the effective program of the parent cgroup (if any) is inherited to
|
||||||
|
* @cgrp.
|
||||||
|
*
|
||||||
|
* Then, the descendants of @cgrp are walked and the effective program for
|
||||||
|
* each of them is set to the effective program of @cgrp unless the
|
||||||
|
* descendant has its own program attached, in which case the subbranch is
|
||||||
|
* skipped. This ensures that delegated subcgroups with own programs are left
|
||||||
|
* untouched.
|
||||||
|
*
|
||||||
|
* Must be called with cgroup_mutex held.
|
||||||
|
*/
|
||||||
|
void __cgroup_bpf_update(struct cgroup *cgrp,
|
||||||
|
struct cgroup *parent,
|
||||||
|
struct bpf_prog *prog,
|
||||||
|
enum bpf_attach_type type)
|
||||||
|
{
|
||||||
|
struct bpf_prog *old_prog, *effective;
|
||||||
|
struct cgroup_subsys_state *pos;
|
||||||
|
|
||||||
|
old_prog = xchg(cgrp->bpf.prog + type, prog);
|
||||||
|
|
||||||
|
effective = (!prog && parent) ?
|
||||||
|
rcu_dereference_protected(parent->bpf.effective[type],
|
||||||
|
lockdep_is_held(&cgroup_mutex)) :
|
||||||
|
prog;
|
||||||
|
|
||||||
|
css_for_each_descendant_pre(pos, &cgrp->self) {
|
||||||
|
struct cgroup *desc = container_of(pos, struct cgroup, self);
|
||||||
|
|
||||||
|
/* skip the subtree if the descendant has its own program */
|
||||||
|
if (desc->bpf.prog[type] && desc != cgrp)
|
||||||
|
pos = css_rightmost_descendant(pos);
|
||||||
|
else
|
||||||
|
rcu_assign_pointer(desc->bpf.effective[type],
|
||||||
|
effective);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (prog)
|
||||||
|
static_branch_inc(&cgroup_bpf_enabled_key);
|
||||||
|
|
||||||
|
if (old_prog) {
|
||||||
|
bpf_prog_put(old_prog);
|
||||||
|
static_branch_dec(&cgroup_bpf_enabled_key);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* __cgroup_bpf_run_filter() - Run a program for packet filtering
|
||||||
|
* @sk: The socket sending or receiving traffic
|
||||||
|
* @skb: The skb that is being sent or received
|
||||||
|
* @type: The type of program to be executed
|
||||||
|
*
|
||||||
|
* If no socket is passed, or the socket is not of type INET or INET6,
|
||||||
|
* this function does nothing and returns 0.
|
||||||
|
*
|
||||||
|
* The program type passed in via @type must be suitable for network
|
||||||
|
* filtering. No further check is performed to assert that.
|
||||||
|
*
|
||||||
|
* This function will return %-EPERM if an attached program was found
|
||||||
|
* and if it returned != 1 during execution. In all other cases, 0 is returned.
|
||||||
|
*/
|
||||||
|
int __cgroup_bpf_run_filter(struct sock *sk,
|
||||||
|
struct sk_buff *skb,
|
||||||
|
enum bpf_attach_type type)
|
||||||
|
{
|
||||||
|
struct bpf_prog *prog;
|
||||||
|
struct cgroup *cgrp;
|
||||||
|
int ret = 0;
|
||||||
|
|
||||||
|
if (!sk || !sk_fullsock(sk))
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
if (sk->sk_family != AF_INET &&
|
||||||
|
sk->sk_family != AF_INET6)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
|
||||||
|
|
||||||
|
rcu_read_lock();
|
||||||
|
|
||||||
|
prog = rcu_dereference(cgrp->bpf.effective[type]);
|
||||||
|
if (prog) {
|
||||||
|
unsigned int offset = skb->data - skb_network_header(skb);
|
||||||
|
|
||||||
|
__skb_push(skb, offset);
|
||||||
|
ret = bpf_prog_run_save_cb(prog, skb) == 1 ? 0 : -EPERM;
|
||||||
|
__skb_pull(skb, offset);
|
||||||
|
}
|
||||||
|
|
||||||
|
rcu_read_unlock();
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL(__cgroup_bpf_run_filter);
|
@ -5074,6 +5074,8 @@ static void css_release_work_fn(struct work_struct *work)
|
|||||||
if (cgrp->kn)
|
if (cgrp->kn)
|
||||||
RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv,
|
RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv,
|
||||||
NULL);
|
NULL);
|
||||||
|
|
||||||
|
cgroup_bpf_put(cgrp);
|
||||||
}
|
}
|
||||||
|
|
||||||
mutex_unlock(&cgroup_mutex);
|
mutex_unlock(&cgroup_mutex);
|
||||||
@ -5281,6 +5283,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
|
|||||||
if (!cgroup_on_dfl(cgrp))
|
if (!cgroup_on_dfl(cgrp))
|
||||||
cgrp->subtree_control = cgroup_control(cgrp);
|
cgrp->subtree_control = cgroup_control(cgrp);
|
||||||
|
|
||||||
|
if (parent)
|
||||||
|
cgroup_bpf_inherit(cgrp, parent);
|
||||||
|
|
||||||
cgroup_propagate_control(cgrp);
|
cgroup_propagate_control(cgrp);
|
||||||
|
|
||||||
/* @cgrp doesn't have dir yet so the following will only create csses */
|
/* @cgrp doesn't have dir yet so the following will only create csses */
|
||||||
@ -6495,6 +6500,19 @@ static __init int cgroup_namespaces_init(void)
|
|||||||
}
|
}
|
||||||
subsys_initcall(cgroup_namespaces_init);
|
subsys_initcall(cgroup_namespaces_init);
|
||||||
|
|
||||||
|
#ifdef CONFIG_CGROUP_BPF
|
||||||
|
void cgroup_bpf_update(struct cgroup *cgrp,
|
||||||
|
struct bpf_prog *prog,
|
||||||
|
enum bpf_attach_type type)
|
||||||
|
{
|
||||||
|
struct cgroup *parent = cgroup_parent(cgrp);
|
||||||
|
|
||||||
|
mutex_lock(&cgroup_mutex);
|
||||||
|
__cgroup_bpf_update(cgrp, parent, prog, type);
|
||||||
|
mutex_unlock(&cgroup_mutex);
|
||||||
|
}
|
||||||
|
#endif /* CONFIG_CGROUP_BPF */
|
||||||
|
|
||||||
#ifdef CONFIG_CGROUP_DEBUG
|
#ifdef CONFIG_CGROUP_DEBUG
|
||||||
static struct cgroup_subsys_state *
|
static struct cgroup_subsys_state *
|
||||||
debug_css_alloc(struct cgroup_subsys_state *parent_css)
|
debug_css_alloc(struct cgroup_subsys_state *parent_css)
|
||||||
|
Loading…
Reference in New Issue
Block a user