mirror of
https://github.com/systemd/systemd.git
synced 2025-01-24 06:04:05 +03:00
Merge pull request #10062 from rgushchin/device
Support cgroup v2 bpf-based device controller
This commit is contained in:
commit
3316429f19
@ -2767,6 +2767,8 @@ static const char *cgroup_controller_table[_CGROUP_CONTROLLER_MAX] = {
|
||||
[CGROUP_CONTROLLER_MEMORY] = "memory",
|
||||
[CGROUP_CONTROLLER_DEVICES] = "devices",
|
||||
[CGROUP_CONTROLLER_PIDS] = "pids",
|
||||
[CGROUP_CONTROLLER_BPF_FIREWALL] = "bpf-firewall",
|
||||
[CGROUP_CONTROLLER_BPF_DEVICES] = "bpf-devices",
|
||||
};
|
||||
|
||||
DEFINE_STRING_TABLE_LOOKUP(cgroup_controller, CGroupController);
|
||||
|
@ -19,6 +19,7 @@
|
||||
|
||||
/* An enum of well known cgroup controllers */
|
||||
typedef enum CGroupController {
|
||||
/* Original cgroup controllers */
|
||||
CGROUP_CONTROLLER_CPU,
|
||||
CGROUP_CONTROLLER_CPUACCT, /* v1 only */
|
||||
CGROUP_CONTROLLER_IO, /* v2 only */
|
||||
@ -26,6 +27,11 @@ typedef enum CGroupController {
|
||||
CGROUP_CONTROLLER_MEMORY,
|
||||
CGROUP_CONTROLLER_DEVICES, /* v1 only */
|
||||
CGROUP_CONTROLLER_PIDS,
|
||||
|
||||
/* BPF-based pseudo-controllers, v2 only */
|
||||
CGROUP_CONTROLLER_BPF_FIREWALL,
|
||||
CGROUP_CONTROLLER_BPF_DEVICES,
|
||||
|
||||
_CGROUP_CONTROLLER_MAX,
|
||||
_CGROUP_CONTROLLER_INVALID = -1,
|
||||
} CGroupController;
|
||||
@ -41,6 +47,8 @@ typedef enum CGroupMask {
|
||||
CGROUP_MASK_MEMORY = CGROUP_CONTROLLER_TO_MASK(CGROUP_CONTROLLER_MEMORY),
|
||||
CGROUP_MASK_DEVICES = CGROUP_CONTROLLER_TO_MASK(CGROUP_CONTROLLER_DEVICES),
|
||||
CGROUP_MASK_PIDS = CGROUP_CONTROLLER_TO_MASK(CGROUP_CONTROLLER_PIDS),
|
||||
CGROUP_MASK_BPF_FIREWALL = CGROUP_CONTROLLER_TO_MASK(CGROUP_CONTROLLER_BPF_FIREWALL),
|
||||
CGROUP_MASK_BPF_DEVICES = CGROUP_CONTROLLER_TO_MASK(CGROUP_CONTROLLER_BPF_DEVICES),
|
||||
_CGROUP_MASK_ALL = CGROUP_CONTROLLER_TO_MASK(_CGROUP_CONTROLLER_MAX) - 1
|
||||
} CGroupMask;
|
||||
|
||||
|
247
src/core/bpf-devices.c
Normal file
247
src/core/bpf-devices.c
Normal file
@ -0,0 +1,247 @@
|
||||
/* SPDX-License-Identifier: LGPL-2.1+ */
|
||||
#include <linux/libbpf.h>
|
||||
|
||||
#include "bpf-devices.h"
|
||||
#include "bpf-program.h"
|
||||
|
||||
#define PASS_JUMP_OFF 4096
|
||||
|
||||
static int bpf_access_type(const char *acc) {
|
||||
int r = 0;
|
||||
|
||||
assert(acc);
|
||||
|
||||
for (; *acc; acc++)
|
||||
switch(*acc) {
|
||||
case 'r':
|
||||
r |= BPF_DEVCG_ACC_READ;
|
||||
break;
|
||||
case 'w':
|
||||
r |= BPF_DEVCG_ACC_WRITE;
|
||||
break;
|
||||
case 'm':
|
||||
r |= BPF_DEVCG_ACC_MKNOD;
|
||||
break;
|
||||
default:
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
int cgroup_bpf_whitelist_device(BPFProgram *prog, int type, int major, int minor, const char *acc) {
|
||||
struct bpf_insn insn[] = {
|
||||
BPF_JMP_IMM(BPF_JNE, BPF_REG_2, type, 6), /* compare device type */
|
||||
BPF_MOV32_REG(BPF_REG_1, BPF_REG_3), /* calculate access type */
|
||||
BPF_ALU32_IMM(BPF_AND, BPF_REG_1, 0),
|
||||
BPF_JMP_REG(BPF_JNE, BPF_REG_1, BPF_REG_3, 3), /* compare access type */
|
||||
BPF_JMP_IMM(BPF_JNE, BPF_REG_4, major, 2), /* compare major */
|
||||
BPF_JMP_IMM(BPF_JNE, BPF_REG_5, minor, 1), /* compare minor */
|
||||
BPF_JMP_A(PASS_JUMP_OFF), /* jump to PASS */
|
||||
};
|
||||
int r, access;
|
||||
|
||||
assert(prog);
|
||||
assert(acc);
|
||||
|
||||
access = bpf_access_type(acc);
|
||||
if (access <= 0)
|
||||
return -EINVAL;
|
||||
|
||||
insn[2].imm = access;
|
||||
|
||||
r = bpf_program_add_instructions(prog, insn, ELEMENTSOF(insn));
|
||||
if (r < 0)
|
||||
log_error_errno(r, "Extending device control BPF program failed: %m");
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
int cgroup_bpf_whitelist_major(BPFProgram *prog, int type, int major, const char *acc) {
|
||||
struct bpf_insn insn[] = {
|
||||
BPF_JMP_IMM(BPF_JNE, BPF_REG_2, type, 5), /* compare device type */
|
||||
BPF_MOV32_REG(BPF_REG_1, BPF_REG_3), /* calculate access type */
|
||||
BPF_ALU32_IMM(BPF_AND, BPF_REG_1, 0),
|
||||
BPF_JMP_REG(BPF_JNE, BPF_REG_1, BPF_REG_3, 2), /* compare access type */
|
||||
BPF_JMP_IMM(BPF_JNE, BPF_REG_4, major, 1), /* compare major */
|
||||
BPF_JMP_A(PASS_JUMP_OFF), /* jump to PASS */
|
||||
};
|
||||
int r, access;
|
||||
|
||||
assert(prog);
|
||||
assert(acc);
|
||||
|
||||
access = bpf_access_type(acc);
|
||||
if (access <= 0)
|
||||
return -EINVAL;
|
||||
|
||||
insn[2].imm = access;
|
||||
|
||||
r = bpf_program_add_instructions(prog, insn, ELEMENTSOF(insn));
|
||||
if (r < 0)
|
||||
log_error_errno(r, "Extending device control BPF program failed: %m");
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
int cgroup_init_device_bpf(BPFProgram **ret, CGroupDevicePolicy policy, bool whitelist) {
|
||||
struct bpf_insn pre_insn[] = {
|
||||
/* load device type to r2 */
|
||||
BPF_LDX_MEM(BPF_H, BPF_REG_2, BPF_REG_1,
|
||||
offsetof(struct bpf_cgroup_dev_ctx, access_type)),
|
||||
|
||||
/* load access type to r3 */
|
||||
BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
|
||||
offsetof(struct bpf_cgroup_dev_ctx, access_type)),
|
||||
BPF_ALU32_IMM(BPF_RSH, BPF_REG_3, 16),
|
||||
|
||||
/* load major number to r4 */
|
||||
BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1,
|
||||
offsetof(struct bpf_cgroup_dev_ctx, major)),
|
||||
|
||||
/* load minor number to r5 */
|
||||
BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_1,
|
||||
offsetof(struct bpf_cgroup_dev_ctx, minor)),
|
||||
};
|
||||
|
||||
_cleanup_(bpf_program_unrefp) BPFProgram *prog = NULL;
|
||||
int r;
|
||||
|
||||
assert(ret);
|
||||
|
||||
if (policy == CGROUP_AUTO && !whitelist)
|
||||
return 0;
|
||||
|
||||
r = bpf_program_new(BPF_PROG_TYPE_CGROUP_DEVICE, &prog);
|
||||
if (r < 0)
|
||||
return log_error_errno(r, "Loading device control BPF program failed: %m");
|
||||
|
||||
if (policy == CGROUP_CLOSED || whitelist) {
|
||||
r = bpf_program_add_instructions(prog, pre_insn, ELEMENTSOF(pre_insn));
|
||||
if (r < 0)
|
||||
return log_error_errno(r, "Extending device control BPF program failed: %m");
|
||||
}
|
||||
|
||||
*ret = TAKE_PTR(prog);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int cgroup_apply_device_bpf(Unit *u, BPFProgram *prog, CGroupDevicePolicy policy, bool whitelist) {
|
||||
struct bpf_insn post_insn[] = {
|
||||
/* return DENY */
|
||||
BPF_MOV64_IMM(BPF_REG_0, 0),
|
||||
BPF_JMP_A(1),
|
||||
|
||||
};
|
||||
|
||||
struct bpf_insn exit_insn[] = {
|
||||
/* else return ALLOW */
|
||||
BPF_MOV64_IMM(BPF_REG_0, 1),
|
||||
BPF_EXIT_INSN()
|
||||
};
|
||||
|
||||
_cleanup_free_ char *path = NULL;
|
||||
uint32_t flags;
|
||||
int r;
|
||||
|
||||
if (!prog) {
|
||||
/* Remove existing program. */
|
||||
u->bpf_device_control_installed = bpf_program_unref(u->bpf_device_control_installed);
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (policy != CGROUP_STRICT || whitelist) {
|
||||
size_t off;
|
||||
|
||||
r = bpf_program_add_instructions(prog, post_insn, ELEMENTSOF(post_insn));
|
||||
if (r < 0)
|
||||
return log_error_errno(r, "Extending device control BPF program failed: %m");
|
||||
|
||||
/* Fixup PASS_JUMP_OFF jump offsets. */
|
||||
for (off = 0; off < prog->n_instructions; off++) {
|
||||
struct bpf_insn *ins = &prog->instructions[off];
|
||||
|
||||
if (ins->code == (BPF_JMP | BPF_JA) && ins->off == PASS_JUMP_OFF)
|
||||
ins->off = prog->n_instructions - off - 1;
|
||||
}
|
||||
} else
|
||||
/* Explicitly forbid everything. */
|
||||
exit_insn[0].imm = 0;
|
||||
|
||||
r = bpf_program_add_instructions(prog, exit_insn, ELEMENTSOF(exit_insn));
|
||||
if (r < 0)
|
||||
return log_error_errno(r, "Extending device control BPF program failed: %m");
|
||||
|
||||
r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &path);
|
||||
if (r < 0)
|
||||
return log_error_errno(r, "Failed to determine cgroup path: %m");
|
||||
|
||||
flags = (u->type == UNIT_SLICE || unit_cgroup_delegate(u)) ? BPF_F_ALLOW_MULTI : 0;
|
||||
|
||||
/* Unref the old BPF program (which will implicitly detach it) right before attaching the new program. */
|
||||
u->bpf_device_control_installed = bpf_program_unref(u->bpf_device_control_installed);
|
||||
|
||||
r = bpf_program_cgroup_attach(prog, BPF_CGROUP_DEVICE, path, flags);
|
||||
if (r < 0)
|
||||
return log_error_errno(r, "Attaching device control BPF program to cgroup %s failed: %m", path);
|
||||
|
||||
/* Remember that this BPF program is installed now. */
|
||||
u->bpf_device_control_installed = bpf_program_ref(prog);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int bpf_devices_supported(void) {
|
||||
struct bpf_insn trivial[] = {
|
||||
BPF_MOV64_IMM(BPF_REG_0, 1),
|
||||
BPF_EXIT_INSN()
|
||||
};
|
||||
|
||||
_cleanup_(bpf_program_unrefp) BPFProgram *program = NULL;
|
||||
static int supported = -1;
|
||||
int r;
|
||||
|
||||
/* Checks whether BPF device controller is supported. For this, we check five things:
|
||||
*
|
||||
* a) whether we are privileged
|
||||
* b) whether the unified hierarchy is being used
|
||||
* c) the BPF implementation in the kernel supports BPF_PROG_TYPE_CGROUP_DEVICE programs, which we require
|
||||
*/
|
||||
|
||||
if (supported >= 0)
|
||||
return supported;
|
||||
|
||||
if (geteuid() != 0) {
|
||||
log_debug("Not enough privileges, BPF device control is not supported.");
|
||||
return supported = 0;
|
||||
}
|
||||
|
||||
r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
|
||||
if (r < 0)
|
||||
return log_error_errno(r, "Can't determine whether the unified hierarchy is used: %m");
|
||||
if (r == 0) {
|
||||
log_debug("Not running with unified cgroups, BPF device control is not supported.");
|
||||
return supported = 0;
|
||||
}
|
||||
|
||||
r = bpf_program_new(BPF_PROG_TYPE_CGROUP_DEVICE, &program);
|
||||
if (r < 0) {
|
||||
log_debug_errno(r, "Can't allocate CGROUP DEVICE BPF program, BPF device control is not supported: %m");
|
||||
return supported = 0;
|
||||
}
|
||||
|
||||
r = bpf_program_add_instructions(program, trivial, ELEMENTSOF(trivial));
|
||||
if (r < 0) {
|
||||
log_debug_errno(r, "Can't add trivial instructions to CGROUP DEVICE BPF program, BPF device control is not supported: %m");
|
||||
return supported = 0;
|
||||
}
|
||||
|
||||
r = bpf_program_load_kernel(program, NULL, 0);
|
||||
if (r < 0) {
|
||||
log_debug_errno(r, "Can't load kernel CGROUP DEVICE BPF program, BPF device control is not supported: %m");
|
||||
return supported = 0;
|
||||
}
|
||||
|
||||
return supported;
|
||||
}
|
16
src/core/bpf-devices.h
Normal file
16
src/core/bpf-devices.h
Normal file
@ -0,0 +1,16 @@
|
||||
/* SPDX-License-Identifier: LGPL-2.1+ */
|
||||
#pragma once
|
||||
|
||||
#include <inttypes.h>
|
||||
|
||||
#include "unit.h"
|
||||
|
||||
struct BPFProgram;
|
||||
|
||||
int bpf_devices_supported(void);
|
||||
|
||||
int cgroup_bpf_whitelist_device(BPFProgram *p, int type, int major, int minor, const char *acc);
|
||||
int cgroup_bpf_whitelist_major(BPFProgram *p, int type, int major, const char *acc);
|
||||
|
||||
int cgroup_init_device_bpf(BPFProgram **ret, CGroupDevicePolicy policy, bool whitelist);
|
||||
int cgroup_apply_device_bpf(Unit *u, BPFProgram *p, CGroupDevicePolicy policy, bool whitelist);
|
@ -7,6 +7,7 @@
|
||||
#include "blockdev-util.h"
|
||||
#include "bpf-firewall.h"
|
||||
#include "btrfs-util.h"
|
||||
#include "bpf-devices.h"
|
||||
#include "bus-error.h"
|
||||
#include "cgroup-util.h"
|
||||
#include "cgroup.h"
|
||||
@ -386,8 +387,7 @@ static int lookup_block_device(const char *p, dev_t *ret) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int whitelist_device(const char *path, const char *node, const char *acc) {
|
||||
char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
|
||||
static int whitelist_device(BPFProgram *prog, const char *path, const char *node, const char *acc) {
|
||||
struct stat st;
|
||||
bool ignore_notfound;
|
||||
int r;
|
||||
@ -414,23 +414,34 @@ static int whitelist_device(const char *path, const char *node, const char *acc)
|
||||
return -ENODEV;
|
||||
}
|
||||
|
||||
sprintf(buf,
|
||||
"%c %u:%u %s",
|
||||
S_ISCHR(st.st_mode) ? 'c' : 'b',
|
||||
major(st.st_rdev), minor(st.st_rdev),
|
||||
acc);
|
||||
if (cg_all_unified() > 0) {
|
||||
if (!prog)
|
||||
return 0;
|
||||
|
||||
r = cg_set_attribute("devices", path, "devices.allow", buf);
|
||||
if (r < 0)
|
||||
log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
|
||||
"Failed to set devices.allow on %s: %m", path);
|
||||
cgroup_bpf_whitelist_device(prog, S_ISCHR(st.st_mode) ? BPF_DEVCG_DEV_CHAR : BPF_DEVCG_DEV_BLOCK,
|
||||
major(st.st_rdev), minor(st.st_rdev), acc);
|
||||
} else {
|
||||
char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
|
||||
|
||||
sprintf(buf,
|
||||
"%c %u:%u %s",
|
||||
S_ISCHR(st.st_mode) ? 'c' : 'b',
|
||||
major(st.st_rdev), minor(st.st_rdev),
|
||||
acc);
|
||||
|
||||
r = cg_set_attribute("devices", path, "devices.allow", buf);
|
||||
if (r < 0)
|
||||
log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING,
|
||||
r, "Failed to set devices.allow on %s: %m", path);
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
|
||||
static int whitelist_major(BPFProgram *prog, const char *path, const char *name, char type, const char *acc) {
|
||||
_cleanup_fclose_ FILE *f = NULL;
|
||||
char line[LINE_MAX];
|
||||
char *p, *w;
|
||||
bool good = false;
|
||||
int r;
|
||||
|
||||
@ -443,7 +454,6 @@ static int whitelist_major(const char *path, const char *name, char type, const
|
||||
return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);
|
||||
|
||||
FOREACH_LINE(line, f, goto fail) {
|
||||
char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
|
||||
unsigned maj;
|
||||
|
||||
truncate_nl(line);
|
||||
@ -485,16 +495,27 @@ static int whitelist_major(const char *path, const char *name, char type, const
|
||||
if (fnmatch(name, w, 0) != 0)
|
||||
continue;
|
||||
|
||||
sprintf(buf,
|
||||
"%c %u:* %s",
|
||||
type,
|
||||
maj,
|
||||
acc);
|
||||
if (cg_all_unified() > 0) {
|
||||
if (!prog)
|
||||
continue;
|
||||
|
||||
r = cg_set_attribute("devices", path, "devices.allow", buf);
|
||||
if (r < 0)
|
||||
log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
|
||||
"Failed to set devices.allow on %s: %m", path);
|
||||
cgroup_bpf_whitelist_major(prog,
|
||||
type == 'c' ? BPF_DEVCG_DEV_CHAR : BPF_DEVCG_DEV_BLOCK,
|
||||
maj, acc);
|
||||
} else {
|
||||
char buf[2+DECIMAL_STR_MAX(unsigned)+3+4];
|
||||
|
||||
sprintf(buf,
|
||||
"%c %u:* %s",
|
||||
type,
|
||||
maj,
|
||||
acc);
|
||||
|
||||
r = cg_set_attribute("devices", path, "devices.allow", buf);
|
||||
if (r < 0)
|
||||
log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING,
|
||||
r, "Failed to set devices.allow on %s: %m", path);
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
@ -770,7 +791,6 @@ static void cgroup_apply_firewall(Unit *u) {
|
||||
static void cgroup_context_apply(
|
||||
Unit *u,
|
||||
CGroupMask apply_mask,
|
||||
bool apply_bpf,
|
||||
ManagerState state) {
|
||||
|
||||
const char *path;
|
||||
@ -781,7 +801,7 @@ static void cgroup_context_apply(
|
||||
assert(u);
|
||||
|
||||
/* Nothing to do? Exit early! */
|
||||
if (apply_mask == 0 && !apply_bpf)
|
||||
if (apply_mask == 0)
|
||||
return;
|
||||
|
||||
/* Some cgroup attributes are not supported on the root cgroup, hence silently ignore */
|
||||
@ -1020,20 +1040,27 @@ static void cgroup_context_apply(
|
||||
}
|
||||
}
|
||||
|
||||
if ((apply_mask & CGROUP_MASK_DEVICES) && !is_root) {
|
||||
if ((apply_mask & (CGROUP_MASK_DEVICES | CGROUP_MASK_BPF_DEVICES)) && !is_root) {
|
||||
_cleanup_(bpf_program_unrefp) BPFProgram *prog = NULL;
|
||||
CGroupDeviceAllow *a;
|
||||
|
||||
/* Changing the devices list of a populated cgroup
|
||||
* might result in EINVAL, hence ignore EINVAL
|
||||
* here. */
|
||||
if (cg_all_unified() > 0) {
|
||||
r = cgroup_init_device_bpf(&prog, c->device_policy, c->device_allow);
|
||||
if (r < 0)
|
||||
log_unit_warning_errno(u, r, "Failed to initialize device control bpf program: %m");
|
||||
} else {
|
||||
/* Changing the devices list of a populated cgroup
|
||||
* might result in EINVAL, hence ignore EINVAL
|
||||
* here. */
|
||||
|
||||
if (c->device_allow || c->device_policy != CGROUP_AUTO)
|
||||
r = cg_set_attribute("devices", path, "devices.deny", "a");
|
||||
else
|
||||
r = cg_set_attribute("devices", path, "devices.allow", "a");
|
||||
if (r < 0)
|
||||
log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
|
||||
"Failed to reset devices.list: %m");
|
||||
if (c->device_allow || c->device_policy != CGROUP_AUTO)
|
||||
r = cg_set_attribute("devices", path, "devices.deny", "a");
|
||||
else
|
||||
r = cg_set_attribute("devices", path, "devices.allow", "a");
|
||||
if (r < 0)
|
||||
log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
|
||||
"Failed to reset devices.list: %m");
|
||||
}
|
||||
|
||||
if (c->device_policy == CGROUP_CLOSED ||
|
||||
(c->device_policy == CGROUP_AUTO && c->device_allow)) {
|
||||
@ -1052,10 +1079,10 @@ static void cgroup_context_apply(
|
||||
const char *x, *y;
|
||||
|
||||
NULSTR_FOREACH_PAIR(x, y, auto_devices)
|
||||
whitelist_device(path, x, y);
|
||||
whitelist_device(prog, path, x, y);
|
||||
|
||||
/* PTS (/dev/pts) devices may not be duplicated, but accessed */
|
||||
whitelist_major(path, "pts", 'c', "rw");
|
||||
whitelist_major(prog, path, "pts", 'c', "rw");
|
||||
}
|
||||
|
||||
LIST_FOREACH(device_allow, a, c->device_allow) {
|
||||
@ -1075,14 +1102,26 @@ static void cgroup_context_apply(
|
||||
acc[k++] = 0;
|
||||
|
||||
if (path_startswith(a->path, "/dev/"))
|
||||
whitelist_device(path, a->path, acc);
|
||||
whitelist_device(prog, path, a->path, acc);
|
||||
else if ((val = startswith(a->path, "block-")))
|
||||
whitelist_major(path, val, 'b', acc);
|
||||
whitelist_major(prog, path, val, 'b', acc);
|
||||
else if ((val = startswith(a->path, "char-")))
|
||||
whitelist_major(path, val, 'c', acc);
|
||||
whitelist_major(prog, path, val, 'c', acc);
|
||||
else
|
||||
log_unit_debug(u, "Ignoring device %s while writing cgroup attribute.", a->path);
|
||||
}
|
||||
|
||||
r = cgroup_apply_device_bpf(u, prog, c->device_policy, c->device_allow);
|
||||
if (r < 0) {
|
||||
static bool warned = false;
|
||||
|
||||
log_full_errno(warned ? LOG_DEBUG : LOG_WARNING, r,
|
||||
"Unit %s configures device ACL, but the local system doesn't seem to support the BPF-based device controller.\n"
|
||||
"Proceeding WITHOUT applying ACL (all devices will be accessible)!\n"
|
||||
"(This warning is only shown for the first loaded unit using device ACL.)", u->id);
|
||||
|
||||
warned = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (apply_mask & CGROUP_MASK_PIDS) {
|
||||
@ -1127,7 +1166,7 @@ static void cgroup_context_apply(
|
||||
}
|
||||
}
|
||||
|
||||
if (apply_bpf)
|
||||
if (apply_mask & CGROUP_MASK_BPF_FIREWALL)
|
||||
cgroup_apply_firewall(u);
|
||||
}
|
||||
|
||||
@ -1152,7 +1191,7 @@ CGroupMask cgroup_context_get_mask(CGroupContext *c) {
|
||||
|
||||
if (c->device_allow ||
|
||||
c->device_policy != CGROUP_AUTO)
|
||||
mask |= CGROUP_MASK_DEVICES;
|
||||
mask |= CGROUP_MASK_DEVICES | CGROUP_MASK_BPF_DEVICES;
|
||||
|
||||
if (c->tasks_accounting ||
|
||||
c->tasks_max != CGROUP_LIMIT_MAX)
|
||||
@ -1161,6 +1200,15 @@ CGroupMask cgroup_context_get_mask(CGroupContext *c) {
|
||||
return mask;
|
||||
}
|
||||
|
||||
CGroupMask unit_get_bpf_mask(Unit *u) {
|
||||
CGroupMask mask = 0;
|
||||
|
||||
if (unit_get_needs_bpf_firewall(u))
|
||||
mask |= CGROUP_MASK_BPF_FIREWALL;
|
||||
|
||||
return mask;
|
||||
}
|
||||
|
||||
CGroupMask unit_get_own_mask(Unit *u) {
|
||||
CGroupContext *c;
|
||||
|
||||
@ -1170,7 +1218,7 @@ CGroupMask unit_get_own_mask(Unit *u) {
|
||||
if (!c)
|
||||
return 0;
|
||||
|
||||
return cgroup_context_get_mask(c) | unit_get_delegate_mask(u);
|
||||
return cgroup_context_get_mask(c) | unit_get_bpf_mask(u) | unit_get_delegate_mask(u);
|
||||
}
|
||||
|
||||
CGroupMask unit_get_delegate_mask(Unit *u) {
|
||||
@ -1278,7 +1326,7 @@ CGroupMask unit_get_enable_mask(Unit *u) {
|
||||
return mask;
|
||||
}
|
||||
|
||||
bool unit_get_needs_bpf(Unit *u) {
|
||||
bool unit_get_needs_bpf_firewall(Unit *u) {
|
||||
CGroupContext *c;
|
||||
Unit *p;
|
||||
assert(u);
|
||||
@ -1508,8 +1556,7 @@ int unit_pick_cgroup_path(Unit *u) {
|
||||
static int unit_create_cgroup(
|
||||
Unit *u,
|
||||
CGroupMask target_mask,
|
||||
CGroupMask enable_mask,
|
||||
bool needs_bpf) {
|
||||
CGroupMask enable_mask) {
|
||||
|
||||
CGroupContext *c;
|
||||
int r;
|
||||
@ -1549,7 +1596,6 @@ static int unit_create_cgroup(
|
||||
u->cgroup_realized = true;
|
||||
u->cgroup_realized_mask = target_mask;
|
||||
u->cgroup_enabled_mask = enable_mask;
|
||||
u->cgroup_bpf_state = needs_bpf ? UNIT_CGROUP_BPF_ON : UNIT_CGROUP_BPF_OFF;
|
||||
|
||||
if (u->type != UNIT_SLICE && !unit_cgroup_delegate(u)) {
|
||||
|
||||
@ -1725,16 +1771,14 @@ static void cgroup_xattr_apply(Unit *u) {
|
||||
static bool unit_has_mask_realized(
|
||||
Unit *u,
|
||||
CGroupMask target_mask,
|
||||
CGroupMask enable_mask,
|
||||
bool needs_bpf) {
|
||||
CGroupMask enable_mask) {
|
||||
|
||||
assert(u);
|
||||
|
||||
return u->cgroup_realized &&
|
||||
u->cgroup_realized_mask == target_mask &&
|
||||
u->cgroup_enabled_mask == enable_mask &&
|
||||
((needs_bpf && u->cgroup_bpf_state == UNIT_CGROUP_BPF_ON) ||
|
||||
(!needs_bpf && u->cgroup_bpf_state == UNIT_CGROUP_BPF_OFF));
|
||||
u->cgroup_invalidated_mask == 0;
|
||||
}
|
||||
|
||||
static void unit_add_to_cgroup_realize_queue(Unit *u) {
|
||||
@ -1765,7 +1809,6 @@ static void unit_remove_from_cgroup_realize_queue(Unit *u) {
|
||||
* Returns 0 on success and < 0 on failure. */
|
||||
static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
|
||||
CGroupMask target_mask, enable_mask;
|
||||
bool needs_bpf, apply_bpf;
|
||||
int r;
|
||||
|
||||
assert(u);
|
||||
@ -1774,16 +1817,10 @@ static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
|
||||
|
||||
target_mask = unit_get_target_mask(u);
|
||||
enable_mask = unit_get_enable_mask(u);
|
||||
needs_bpf = unit_get_needs_bpf(u);
|
||||
|
||||
if (unit_has_mask_realized(u, target_mask, enable_mask, needs_bpf))
|
||||
if (unit_has_mask_realized(u, target_mask, enable_mask))
|
||||
return 0;
|
||||
|
||||
/* Make sure we apply the BPF filters either when one is configured, or if none is configured but previously
|
||||
* the state was anything but off. This way, if a unit with a BPF filter applied is reconfigured to lose it
|
||||
* this will trickle down properly to cgroupfs. */
|
||||
apply_bpf = needs_bpf || u->cgroup_bpf_state != UNIT_CGROUP_BPF_OFF;
|
||||
|
||||
/* First, realize parents */
|
||||
if (UNIT_ISSET(u->slice)) {
|
||||
r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
|
||||
@ -1792,12 +1829,12 @@ static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
|
||||
}
|
||||
|
||||
/* And then do the real work */
|
||||
r = unit_create_cgroup(u, target_mask, enable_mask, needs_bpf);
|
||||
r = unit_create_cgroup(u, target_mask, enable_mask);
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
/* Finally, apply the necessary attributes. */
|
||||
cgroup_context_apply(u, target_mask, apply_bpf, state);
|
||||
cgroup_context_apply(u, target_mask, state);
|
||||
cgroup_xattr_apply(u);
|
||||
|
||||
return 0;
|
||||
@ -1863,8 +1900,7 @@ static void unit_add_siblings_to_cgroup_realize_queue(Unit *u) {
|
||||
* any changes. */
|
||||
if (unit_has_mask_realized(m,
|
||||
unit_get_target_mask(m),
|
||||
unit_get_enable_mask(m),
|
||||
unit_get_needs_bpf(m)))
|
||||
unit_get_enable_mask(m)))
|
||||
continue;
|
||||
|
||||
unit_add_to_cgroup_realize_queue(m);
|
||||
@ -1946,6 +1982,8 @@ void unit_prune_cgroup(Unit *u) {
|
||||
u->cgroup_realized = false;
|
||||
u->cgroup_realized_mask = 0;
|
||||
u->cgroup_enabled_mask = 0;
|
||||
|
||||
u->bpf_device_control_installed = bpf_program_unref(u->bpf_device_control_installed);
|
||||
}
|
||||
|
||||
int unit_search_main_pid(Unit *u, pid_t *ret) {
|
||||
@ -2207,11 +2245,30 @@ static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents,
|
||||
}
|
||||
}
|
||||
|
||||
static int cg_bpf_mask_supported(CGroupMask *ret) {
|
||||
CGroupMask mask = 0;
|
||||
int r;
|
||||
|
||||
/* BPF-based firewall */
|
||||
r = bpf_firewall_supported();
|
||||
if (r > 0)
|
||||
mask |= CGROUP_MASK_BPF_FIREWALL;
|
||||
|
||||
/* BPF-based device access control */
|
||||
r = bpf_devices_supported();
|
||||
if (r > 0)
|
||||
mask |= CGROUP_MASK_BPF_DEVICES;
|
||||
|
||||
*ret = mask;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int manager_setup_cgroup(Manager *m) {
|
||||
_cleanup_free_ char *path = NULL;
|
||||
const char *scope_path;
|
||||
CGroupController c;
|
||||
int r, all_unified;
|
||||
CGroupMask mask;
|
||||
char *e;
|
||||
|
||||
assert(m);
|
||||
@ -2341,10 +2398,18 @@ int manager_setup_cgroup(Manager *m) {
|
||||
if (!all_unified && m->test_run_flags == 0)
|
||||
(void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
|
||||
|
||||
/* 8. Figure out which controllers are supported, and log about it */
|
||||
/* 8. Figure out which controllers are supported */
|
||||
r = cg_mask_supported(&m->cgroup_supported);
|
||||
if (r < 0)
|
||||
return log_error_errno(r, "Failed to determine supported controllers: %m");
|
||||
|
||||
/* 9. Figure out which bpf-based pseudo-controllers are supported */
|
||||
r = cg_bpf_mask_supported(&mask);
|
||||
if (r < 0)
|
||||
return log_error_errno(r, "Failed to determine supported bpf-based pseudo-controllers: %m");
|
||||
m->cgroup_supported |= mask;
|
||||
|
||||
/* 10. Log which controllers are supported */
|
||||
for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
|
||||
log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & CGROUP_CONTROLLER_TO_MASK(c)));
|
||||
|
||||
@ -2718,10 +2783,10 @@ void unit_invalidate_cgroup_bpf(Unit *u) {
|
||||
if (!UNIT_HAS_CGROUP_CONTEXT(u))
|
||||
return;
|
||||
|
||||
if (u->cgroup_bpf_state == UNIT_CGROUP_BPF_INVALIDATED) /* NOP? */
|
||||
if (u->cgroup_invalidated_mask & CGROUP_MASK_BPF_FIREWALL) /* NOP? */
|
||||
return;
|
||||
|
||||
u->cgroup_bpf_state = UNIT_CGROUP_BPF_INVALIDATED;
|
||||
u->cgroup_invalidated_mask |= CGROUP_MASK_BPF_FIREWALL;
|
||||
unit_add_to_cgroup_realize_queue(u);
|
||||
|
||||
/* If we are a slice unit, we also need to put compile a new BPF program for all our children, as the IP access
|
||||
|
@ -157,7 +157,8 @@ CGroupMask unit_get_subtree_mask(Unit *u);
|
||||
CGroupMask unit_get_target_mask(Unit *u);
|
||||
CGroupMask unit_get_enable_mask(Unit *u);
|
||||
|
||||
bool unit_get_needs_bpf(Unit *u);
|
||||
bool unit_get_needs_bpf_firewall(Unit *u);
|
||||
CGroupMask unit_get_bpf_mask(Unit *u);
|
||||
|
||||
void unit_update_cgroup_members_masks(Unit *u);
|
||||
|
||||
|
@ -1201,7 +1201,7 @@ static int bump_rlimit_memlock(struct rlimit *saved_rlimit) {
|
||||
if (getrlimit(RLIMIT_MEMLOCK, saved_rlimit) < 0)
|
||||
return log_warning_errno(errno, "Reading RLIMIT_MEMLOCK failed, ignoring: %m");
|
||||
|
||||
r = setrlimit_closest(RLIMIT_MEMLOCK, &RLIMIT_MAKE_CONST(1024ULL*1024ULL*16ULL));
|
||||
r = setrlimit_closest(RLIMIT_MEMLOCK, &RLIMIT_MAKE_CONST(1024ULL*1024ULL*64ULL));
|
||||
if (r < 0)
|
||||
return log_warning_errno(r, "Setting RLIMIT_MEMLOCK failed, ignoring: %m");
|
||||
|
||||
|
@ -5,6 +5,8 @@ libcore_la_sources = '''
|
||||
audit-fd.h
|
||||
automount.c
|
||||
automount.h
|
||||
bpf-devices.c
|
||||
bpf-devices.h
|
||||
bpf-firewall.c
|
||||
bpf-firewall.h
|
||||
cgroup.c
|
||||
|
@ -93,7 +93,7 @@ Unit *unit_new(Manager *m, size_t size) {
|
||||
u->ref_uid = UID_INVALID;
|
||||
u->ref_gid = GID_INVALID;
|
||||
u->cpu_usage_last = NSEC_INFINITY;
|
||||
u->cgroup_bpf_state = UNIT_CGROUP_BPF_INVALIDATED;
|
||||
u->cgroup_invalidated_mask |= CGROUP_MASK_BPF_FIREWALL;
|
||||
|
||||
u->ip_accounting_ingress_map_fd = -1;
|
||||
u->ip_accounting_egress_map_fd = -1;
|
||||
@ -666,6 +666,8 @@ void unit_free(Unit *u) {
|
||||
bpf_program_unref(u->ip_bpf_egress);
|
||||
bpf_program_unref(u->ip_bpf_egress_installed);
|
||||
|
||||
bpf_program_unref(u->bpf_device_control_installed);
|
||||
|
||||
condition_free_list(u->conditions);
|
||||
condition_free_list(u->asserts);
|
||||
|
||||
@ -3253,7 +3255,7 @@ int unit_serialize(Unit *u, FILE *f, FDSet *fds, bool serialize_jobs) {
|
||||
unit_serialize_item(u, f, "cgroup-realized", yes_no(u->cgroup_realized));
|
||||
(void) unit_serialize_cgroup_mask(f, "cgroup-realized-mask", u->cgroup_realized_mask);
|
||||
(void) unit_serialize_cgroup_mask(f, "cgroup-enabled-mask", u->cgroup_enabled_mask);
|
||||
unit_serialize_item_format(u, f, "cgroup-bpf-realized", "%i", u->cgroup_bpf_state);
|
||||
(void) unit_serialize_cgroup_mask(f, "cgroup-invalidated-mask", u->cgroup_invalidated_mask);
|
||||
|
||||
if (uid_is_valid(u->ref_uid))
|
||||
unit_serialize_item_format(u, f, "ref-uid", UID_FMT, u->ref_uid);
|
||||
@ -3568,18 +3570,11 @@ int unit_deserialize(Unit *u, FILE *f, FDSet *fds) {
|
||||
log_unit_debug(u, "Failed to parse cgroup-enabled-mask %s, ignoring.", v);
|
||||
continue;
|
||||
|
||||
} else if (streq(l, "cgroup-bpf-realized")) {
|
||||
int i;
|
||||
} else if (streq(l, "cgroup-invalidated-mask")) {
|
||||
|
||||
r = safe_atoi(v, &i);
|
||||
r = cg_mask_from_string(v, &u->cgroup_invalidated_mask);
|
||||
if (r < 0)
|
||||
log_unit_debug(u, "Failed to parse cgroup BPF state %s, ignoring.", v);
|
||||
else
|
||||
u->cgroup_bpf_state =
|
||||
i < 0 ? UNIT_CGROUP_BPF_INVALIDATED :
|
||||
i > 0 ? UNIT_CGROUP_BPF_ON :
|
||||
UNIT_CGROUP_BPF_OFF;
|
||||
|
||||
log_unit_debug(u, "Failed to parse cgroup-invalidated-mask %s, ignoring.", v);
|
||||
continue;
|
||||
|
||||
} else if (streq(l, "ref-uid")) {
|
||||
|
@ -105,12 +105,6 @@ struct UnitRef {
|
||||
LIST_FIELDS(UnitRef, refs_by_target);
|
||||
};
|
||||
|
||||
typedef enum UnitCGroupBPFState {
|
||||
UNIT_CGROUP_BPF_OFF = 0,
|
||||
UNIT_CGROUP_BPF_ON = 1,
|
||||
UNIT_CGROUP_BPF_INVALIDATED = -1,
|
||||
} UnitCGroupBPFState;
|
||||
|
||||
typedef struct Unit {
|
||||
Manager *manager;
|
||||
|
||||
@ -258,10 +252,14 @@ typedef struct Unit {
|
||||
char *cgroup_path;
|
||||
CGroupMask cgroup_realized_mask;
|
||||
CGroupMask cgroup_enabled_mask;
|
||||
CGroupMask cgroup_invalidated_mask;
|
||||
CGroupMask cgroup_subtree_mask;
|
||||
CGroupMask cgroup_members_mask;
|
||||
int cgroup_inotify_wd;
|
||||
|
||||
/* Device Controller BPF program */
|
||||
BPFProgram *bpf_device_control_installed;
|
||||
|
||||
/* IP BPF Firewalling/accounting */
|
||||
int ip_accounting_ingress_map_fd;
|
||||
int ip_accounting_egress_map_fd;
|
||||
@ -336,8 +334,6 @@ typedef struct Unit {
|
||||
bool cgroup_members_mask_valid:1;
|
||||
bool cgroup_subtree_mask_valid:1;
|
||||
|
||||
UnitCGroupBPFState cgroup_bpf_state:2;
|
||||
|
||||
/* Reset cgroup accounting next time we fork something off */
|
||||
bool reset_accounting:1;
|
||||
|
||||
|
@ -174,6 +174,16 @@ struct bpf_insn;
|
||||
.off = OFF, \
|
||||
.imm = IMM })
|
||||
|
||||
/* Unconditional jumps */
|
||||
|
||||
#define BPF_JMP_A(OFF) \
|
||||
((struct bpf_insn) { \
|
||||
.code = BPF_JMP | BPF_JA, \
|
||||
.dst_reg = 0, \
|
||||
.src_reg = 0, \
|
||||
.off = OFF, \
|
||||
.imm = 0 })
|
||||
|
||||
/* Raw code statement block */
|
||||
|
||||
#define BPF_RAW_INSN(CODE, DST, SRC, OFF, IMM) \
|
||||
|
@ -100,7 +100,7 @@ static void test_cg_mask_to_string_one(CGroupMask mask, const char *t) {
|
||||
|
||||
static void test_cg_mask_to_string(void) {
|
||||
test_cg_mask_to_string_one(0, NULL);
|
||||
test_cg_mask_to_string_one(_CGROUP_MASK_ALL, "cpu cpuacct io blkio memory devices pids");
|
||||
test_cg_mask_to_string_one(_CGROUP_MASK_ALL, "cpu cpuacct io blkio memory devices pids bpf-firewall bpf-devices");
|
||||
test_cg_mask_to_string_one(CGROUP_MASK_CPU, "cpu");
|
||||
test_cg_mask_to_string_one(CGROUP_MASK_CPUACCT, "cpuacct");
|
||||
test_cg_mask_to_string_one(CGROUP_MASK_IO, "io");
|
||||
|
Loading…
x
Reference in New Issue
Block a user