diff --git a/man/org.freedesktop.systemd1.xml b/man/org.freedesktop.systemd1.xml index 614871bce20..e1b9a5e490e 100644 --- a/man/org.freedesktop.systemd1.xml +++ b/man/org.freedesktop.systemd1.xml @@ -2474,6 +2474,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { readonly u ManagedOOMMemoryPressureLimit = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly s ManagedOOMPreference = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly a(ss) BPFProgram = [...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly as Environment = ['...', ...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") @@ -3008,6 +3010,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { + + @@ -3566,6 +3570,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { + + @@ -4251,6 +4257,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { readonly u ManagedOOMMemoryPressureLimit = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly s ManagedOOMPreference = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly a(ss) BPFProgram = [...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly as Environment = ['...', ...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") @@ -4811,6 +4819,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { + + @@ -5365,6 +5375,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { + + @@ -5952,6 +5964,8 @@ node /org/freedesktop/systemd1/unit/home_2emount { readonly u ManagedOOMMemoryPressureLimit = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly s ManagedOOMPreference = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly a(ss) BPFProgram = [...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly as Environment = ['...', ...]; 
@org.freedesktop.DBus.Property.EmitsChangedSignal("const") @@ -6440,6 +6454,8 @@ node /org/freedesktop/systemd1/unit/home_2emount { + + @@ -6912,6 +6928,8 @@ node /org/freedesktop/systemd1/unit/home_2emount { + + @@ -7620,6 +7638,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { readonly u ManagedOOMMemoryPressureLimit = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly s ManagedOOMPreference = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly a(ss) BPFProgram = [...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly as Environment = ['...', ...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") @@ -8094,6 +8114,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { + + @@ -8552,6 +8574,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { + + @@ -9113,6 +9137,8 @@ node /org/freedesktop/systemd1/unit/system_2eslice { readonly u ManagedOOMMemoryPressureLimit = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly s ManagedOOMPreference = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly a(ss) BPFProgram = [...]; }; interface org.freedesktop.DBus.Peer { ... }; interface org.freedesktop.DBus.Introspectable { ... 
}; @@ -9251,6 +9277,8 @@ node /org/freedesktop/systemd1/unit/system_2eslice { + + @@ -9393,6 +9421,8 @@ node /org/freedesktop/systemd1/unit/system_2eslice { + + @@ -9554,6 +9584,8 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope { readonly u ManagedOOMMemoryPressureLimit = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly s ManagedOOMPreference = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly a(ss) BPFProgram = [...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly s KillMode = '...'; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") @@ -9708,6 +9740,8 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope { + + @@ -9876,6 +9910,8 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope { + + diff --git a/man/systemd.resource-control.xml b/man/systemd.resource-control.xml index 1bc45a9f007..a2d01f7afbf 100644 --- a/man/systemd.resource-control.xml +++ b/man/systemd.resource-control.xml @@ -696,6 +696,12 @@ If these settings are used multiple times in the same unit all the specified programs are attached. If an empty string is assigned to these settings the program list is reset and all previous specified programs ignored. + If the path BPF_FS_PROGRAM_PATH in an IPIngressFilterPath= assignment + is already being handled by a BPFProgram= ingress hook, e.g. + BPFProgram=ingress:BPF_FS_PROGRAM_PATH, + the assignment will still be considered valid and the program will be attached to the cgroup. The same applies to + an IPEgressFilterPath= path and the egress hook. + Note that for socket-activated services, the IP filter programs configured on the socket unit apply to all sockets associated with it directly, but not to any sockets created by the ultimately activated services for it. Conversely, the IP filter programs configured for the service are not applied to any sockets passed into @@ -710,6 +716,52 @@ + + BPFProgram=type:program-path + + Add a custom cgroup BPF program. 
+ + BPFProgram= allows attaching BPF hooks to the cgroup of a systemd unit. + (This generalizes the functionality exposed via IPEgressFilterPath= for egress and + IPIngressFilterPath= for ingress.) + Cgroup-bpf hooks in the form of BPF programs loaded to the BPF filesystem are attached with cgroup-bpf attach + flags determined by the unit. For details about attachment types and flags see <ulink url="https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/plain/include/uapi/linux/bpf.h"/>. + For general BPF documentation please refer to <ulink url="https://www.kernel.org/doc/html/latest/bpf/index.html"/>. + + The specification of a BPF program consists of a type followed by a + program-path with : as the separator: + type:program-path. + + type is the string name of the BPF attach type, as also used by + bpftool. type can be one of egress, + ingress, sock_create, sock_ops, + device, bind4, bind6, + connect4, connect6, post_bind4, + post_bind6, sendmsg4, sendmsg6, + sysctl, recvmsg4, recvmsg6, + getsockopt, setsockopt. + + Setting BPFProgram= to an empty value makes previous assignments ineffective. + Multiple assignments of the same type:program-path + value have the same effect as a single assignment: the program with the path program-path + will be attached to cgroup hook type just once. + If a BPF egress program pinned to program-path is already being + handled by IPEgressFilterPath=, the BPFProgram= + assignment will still be considered valid and the program will be attached to the cgroup. + The same applies to the ingress hook and IPIngressFilterPath= assignments. + + BPF programs passed with BPFProgram= are attached to the cgroup of a unit with the BPF + attach flag multi, which allows further attachments of the same + type within the cgroup hierarchy topped by the unit cgroup. 
+ + Examples: +BPFProgram=egress:/sys/fs/bpf/egress-hook +BPFProgram=bind6:/sys/fs/bpf/sock-addr-hook + + + + DeviceAllow= diff --git a/src/basic/cgroup-util.c b/src/basic/cgroup-util.c index fb68d286f0a..1ac1f6dff0b 100644 --- a/src/basic/cgroup-util.c +++ b/src/basic/cgroup-util.c @@ -2163,6 +2163,7 @@ static const char *const cgroup_controller_table[_CGROUP_CONTROLLER_MAX] = { [CGROUP_CONTROLLER_PIDS] = "pids", [CGROUP_CONTROLLER_BPF_FIREWALL] = "bpf-firewall", [CGROUP_CONTROLLER_BPF_DEVICES] = "bpf-devices", + [CGROUP_CONTROLLER_BPF_FOREIGN] = "bpf-foreign", }; DEFINE_STRING_TABLE_LOOKUP(cgroup_controller, CGroupController); diff --git a/src/basic/cgroup-util.h b/src/basic/cgroup-util.h index f79e384147d..8894fd9b0af 100644 --- a/src/basic/cgroup-util.h +++ b/src/basic/cgroup-util.h @@ -30,6 +30,7 @@ typedef enum CGroupController { /* BPF-based pseudo-controllers, v2 only */ CGROUP_CONTROLLER_BPF_FIREWALL, CGROUP_CONTROLLER_BPF_DEVICES, + CGROUP_CONTROLLER_BPF_FOREIGN, _CGROUP_CONTROLLER_MAX, _CGROUP_CONTROLLER_INVALID = -EINVAL, @@ -49,6 +50,7 @@ typedef enum CGroupMask { CGROUP_MASK_PIDS = CGROUP_CONTROLLER_TO_MASK(CGROUP_CONTROLLER_PIDS), CGROUP_MASK_BPF_FIREWALL = CGROUP_CONTROLLER_TO_MASK(CGROUP_CONTROLLER_BPF_FIREWALL), CGROUP_MASK_BPF_DEVICES = CGROUP_CONTROLLER_TO_MASK(CGROUP_CONTROLLER_BPF_DEVICES), + CGROUP_MASK_BPF_FOREIGN = CGROUP_CONTROLLER_TO_MASK(CGROUP_CONTROLLER_BPF_FOREIGN), /* All real cgroup v1 controllers */ CGROUP_MASK_V1 = CGROUP_MASK_CPU|CGROUP_MASK_CPUACCT|CGROUP_MASK_BLKIO|CGROUP_MASK_MEMORY|CGROUP_MASK_DEVICES|CGROUP_MASK_PIDS, @@ -57,7 +59,7 @@ typedef enum CGroupMask { CGROUP_MASK_V2 = CGROUP_MASK_CPU|CGROUP_MASK_CPUSET|CGROUP_MASK_IO|CGROUP_MASK_MEMORY|CGROUP_MASK_PIDS, /* All cgroup v2 BPF pseudo-controllers */ - CGROUP_MASK_BPF = CGROUP_MASK_BPF_FIREWALL|CGROUP_MASK_BPF_DEVICES, + CGROUP_MASK_BPF = CGROUP_MASK_BPF_FIREWALL|CGROUP_MASK_BPF_DEVICES|CGROUP_MASK_BPF_FOREIGN, _CGROUP_MASK_ALL = 
CGROUP_CONTROLLER_TO_MASK(_CGROUP_CONTROLLER_MAX) - 1 } CGroupMask; diff --git a/src/core/bpf-firewall.c b/src/core/bpf-firewall.c index 0f588b6ca5f..02e33399c3e 100644 --- a/src/core/bpf-firewall.c +++ b/src/core/bpf-firewall.c @@ -698,8 +698,7 @@ int bpf_firewall_install(Unit *u) { if (r < 0) return log_unit_error_errno(u, r, "Failed to determine cgroup path: %m"); - flags = (supported == BPF_FIREWALL_SUPPORTED_WITH_MULTI && - (u->type == UNIT_SLICE || unit_cgroup_delegate(u))) ? BPF_F_ALLOW_MULTI : 0; + flags = supported == BPF_FIREWALL_SUPPORTED_WITH_MULTI ? BPF_F_ALLOW_MULTI : 0; /* Unref the old BPF program (which will implicitly detach it) right before attaching the new program, to * minimize the time window when we don't account for IP traffic. */ @@ -707,8 +706,7 @@ int bpf_firewall_install(Unit *u) { u->ip_bpf_ingress_installed = bpf_program_unref(u->ip_bpf_ingress_installed); if (u->ip_bpf_egress) { - r = bpf_program_cgroup_attach(u->ip_bpf_egress, BPF_CGROUP_INET_EGRESS, path, - flags | (set_isempty(u->ip_bpf_custom_egress) ? 0 : BPF_F_ALLOW_MULTI)); + r = bpf_program_cgroup_attach(u->ip_bpf_egress, BPF_CGROUP_INET_EGRESS, path, flags); if (r < 0) return log_unit_error_errno(u, r, "Attaching egress BPF program to cgroup %s failed: %m", path); @@ -717,8 +715,7 @@ int bpf_firewall_install(Unit *u) { } if (u->ip_bpf_ingress) { - r = bpf_program_cgroup_attach(u->ip_bpf_ingress, BPF_CGROUP_INET_INGRESS, path, - flags | (set_isempty(u->ip_bpf_custom_ingress) ? 
0 : BPF_F_ALLOW_MULTI)); + r = bpf_program_cgroup_attach(u->ip_bpf_ingress, BPF_CGROUP_INET_INGRESS, path, flags); if (r < 0) return log_unit_error_errno(u, r, "Attaching ingress BPF program to cgroup %s failed: %m", path); diff --git a/src/core/bpf-foreign.c b/src/core/bpf-foreign.c new file mode 100644 index 00000000000..98655bda3c6 --- /dev/null +++ b/src/core/bpf-foreign.c @@ -0,0 +1,151 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include "bpf-foreign.h" +#include "bpf-program.h" +#include "cgroup.h" +#include "memory-util.h" +#include "mountpoint-util.h" +#include "set.h" + +typedef struct BPFForeignKey BPFForeignKey; +struct BPFForeignKey { + uint32_t prog_id; + uint32_t attach_type; +}; + +static int bpf_foreign_key_new(uint32_t prog_id, + enum bpf_attach_type attach_type, + BPFForeignKey **ret) { + _cleanup_free_ BPFForeignKey *p = NULL; + + assert(ret); + + p = new(BPFForeignKey, 1); + if (!p) + return log_oom(); + + *p = (BPFForeignKey) { + .prog_id = prog_id, + .attach_type = attach_type, + }; + + *ret = TAKE_PTR(p); + + return 0; +} + +static int bpf_foreign_key_compare_func(const BPFForeignKey *a, const BPFForeignKey *b) { + int r = CMP(a->prog_id, b->prog_id); + if (r != 0) + return r; + + return CMP(a->attach_type, b->attach_type); +} + +static void bpf_foreign_key_hash_func(const BPFForeignKey *p, struct siphash *h) { + siphash24_compress(&p->prog_id, sizeof(p->prog_id), h); + siphash24_compress(&p->attach_type, sizeof(p->attach_type), h); +} + +DEFINE_PRIVATE_HASH_OPS_FULL(bpf_foreign_by_key_hash_ops, + BPFForeignKey, bpf_foreign_key_hash_func, bpf_foreign_key_compare_func, free, + BPFProgram, bpf_program_unref); + +static int attach_programs(Unit *u, const char *path, Hashmap* foreign_by_key, uint32_t attach_flags) { + const BPFForeignKey *key; + BPFProgram *prog; + int r; + + assert(u); + + HASHMAP_FOREACH_KEY(prog, key, foreign_by_key) { + r = bpf_program_cgroup_attach(prog, key->attach_type, path, attach_flags); + if (r < 0) + return 
log_unit_error_errno(u, r, "Attaching foreign BPF program to cgroup %s failed: %m", path); + } + + return 0; +} + +/* + * Prepare foreign BPF program for installation: + * - Load the program from BPF filesystem to the kernel; + * - Store program FD identified by program ID and attach type in the unit. + */ +static int bpf_foreign_prepare( + Unit *u, + enum bpf_attach_type attach_type, + const char *bpffs_path) { + _cleanup_(bpf_program_unrefp) BPFProgram *prog = NULL; + _cleanup_free_ BPFForeignKey *key = NULL; + uint32_t prog_id; + int r; + + assert(u); + assert(bpffs_path); + + r = bpf_program_new_from_bpffs_path(bpffs_path, &prog); + if (r < 0) + return log_unit_error_errno(u, r, "Failed to create foreign BPFProgram: %m"); + + r = bpf_program_get_id_by_fd(prog->kernel_fd, &prog_id); + if (r < 0) + return log_unit_error_errno(u, r, "Failed to get BPF program id by fd: %m"); + + r = bpf_foreign_key_new(prog_id, attach_type, &key); + if (r < 0) + return log_unit_error_errno(u, r, + "Failed to create foreign BPF program key from path '%s': %m", bpffs_path); + + r = hashmap_ensure_put(&u->bpf_foreign_by_key, &bpf_foreign_by_key_hash_ops, key, prog); + if (r == -EEXIST) { + log_unit_warning_errno(u, r, "Foreign BPF program already exists, ignoring: %m"); + return 0; + } + if (r < 0) + return log_unit_error_errno(u, r, "Failed to put foreign BPFProgram into map: %m"); + + TAKE_PTR(key); + TAKE_PTR(prog); + + return 0; +} + +int bpf_foreign_supported(void) { + int r; + + r = cg_all_unified(); + if (r <= 0) + return r; + + return path_is_mount_point("/sys/fs/bpf", NULL, 0); +} + +int bpf_foreign_install(Unit *u) { + _cleanup_free_ char *cgroup_path = NULL; + CGroupBPFForeignProgram *p; + CGroupContext *cc; + int r; + + assert(u); + + cc = unit_get_cgroup_context(u); + if (!cc) + return 0; + + r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &cgroup_path); + if (r < 0) + return log_unit_error_errno(u, r, "Failed to get cgroup path: %m"); + + 
LIST_FOREACH(programs, p, cc->bpf_foreign_programs) { + r = bpf_foreign_prepare(u, p->attach_type, p->bpffs_path); + if (r < 0) + return log_unit_error_errno(u, r, "Failed to prepare foreign BPF hashmap: %m"); + } + + r = attach_programs(u, cgroup_path, u->bpf_foreign_by_key, BPF_F_ALLOW_MULTI); + if (r < 0) + return log_unit_error_errno(u, r, "Failed to install foreign BPF programs: %m"); + + return 0; +} diff --git a/src/core/bpf-foreign.h b/src/core/bpf-foreign.h new file mode 100644 index 00000000000..7704986e3e8 --- /dev/null +++ b/src/core/bpf-foreign.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#pragma once + +#include "unit.h" + +int bpf_foreign_supported(void); +/* + * Attach cgroup-bpf programs foreign to systemd, i.e. loaded to the kernel by an entity + * external to systemd. + */ +int bpf_foreign_install(Unit *u); diff --git a/src/core/cgroup.c b/src/core/cgroup.c index 96073b108b2..8b5df7610c8 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -8,6 +8,7 @@ #include "blockdev-util.h" #include "bpf-devices.h" #include "bpf-firewall.h" +#include "bpf-foreign.h" #include "btrfs-util.h" #include "bus-error.h" #include "cgroup-setup.h" @@ -190,6 +191,15 @@ void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockI free(b); } +void cgroup_context_remove_bpf_foreign_program(CGroupContext *c, CGroupBPFForeignProgram *p) { + assert(c); + assert(p); + + LIST_REMOVE(programs, c->bpf_foreign_programs, p); + free(p->bpffs_path); + free(p); +} + void cgroup_context_done(CGroupContext *c) { assert(c); @@ -217,6 +227,9 @@ void cgroup_context_done(CGroupContext *c) { c->ip_filters_ingress = strv_free(c->ip_filters_ingress); c->ip_filters_egress = strv_free(c->ip_filters_egress); + while (c->bpf_foreign_programs) + cgroup_context_remove_bpf_foreign_program(c, c->bpf_foreign_programs); + cpu_set_reset(&c->cpuset_cpus); cpu_set_reset(&c->cpuset_mems); } @@ -360,6 +373,7 @@ void cgroup_context_dump(Unit *u, FILE* f, const char 
*prefix) { CGroupIODeviceLatency *l; CGroupBlockIODeviceBandwidth *b; CGroupBlockIODeviceWeight *w; + CGroupBPFForeignProgram *p; CGroupDeviceAllow *a; CGroupContext *c; IPAddressAccessItem *iaai; @@ -544,6 +558,10 @@ void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) { STRV_FOREACH(path, c->ip_filters_egress) fprintf(f, "%sIPEgressFilterPath: %s\n", prefix, *path); + + LIST_FOREACH(programs, p, c->bpf_foreign_programs) /* one "type:path" dump line per configured foreign program */ + fprintf(f, "%sBPFProgram: %s:%s\n", + prefix, bpf_cgroup_attach_type_to_string(p->attach_type), p->bpffs_path); } int cgroup_add_device_allow(CGroupContext *c, const char *dev, const char *mode) { @@ -575,6 +593,34 @@ int cgroup_add_device_allow(CGroupContext *c, const char *dev, const char *mode) return 0; } +int cgroup_add_bpf_foreign_program(CGroupContext *c, uint32_t attach_type, const char *bpffs_path) { + CGroupBPFForeignProgram *p; + _cleanup_free_ char *d = NULL; + + assert(c); + assert(bpffs_path); + + if (!path_is_normalized(bpffs_path) || !path_is_absolute(bpffs_path)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Path is not normalized: %m"); + + d = strdup(bpffs_path); + if (!d) + return log_oom(); + + p = new(CGroupBPFForeignProgram, 1); + if (!p) + return log_oom(); + + *p = (CGroupBPFForeignProgram) { + .attach_type = attach_type, + .bpffs_path = TAKE_PTR(d), + }; + + LIST_PREPEND(programs, c->bpf_foreign_programs, TAKE_PTR(p)); + + return 0; +} + #define UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(entry) \ uint64_t unit_get_ancestor_##entry(Unit *u) { \ CGroupContext *c; \ @@ -1115,6 +1161,12 @@ static void set_io_weight(Unit *u, const char *controller, uint64_t weight) { (void) set_attribute_and_warn(u, controller, p, buf); } +static void cgroup_apply_bpf_foreign_program(Unit *u) { + assert(u); + + (void) bpf_foreign_install(u); +} + static void cgroup_context_apply( Unit *u, CGroupMask apply_mask, @@ -1428,6 +1480,9 @@ static void cgroup_context_apply( if (apply_mask & CGROUP_MASK_BPF_FIREWALL) cgroup_apply_firewall(u); + 
+ if (apply_mask & CGROUP_MASK_BPF_FOREIGN) + cgroup_apply_bpf_foreign_program(u); } static bool unit_get_needs_bpf_firewall(Unit *u) { @@ -1460,6 +1515,17 @@ static bool unit_get_needs_bpf_firewall(Unit *u) { return false; } +static bool unit_get_needs_bpf_foreign_program(Unit *u) { + CGroupContext *c; + assert(u); + + c = unit_get_cgroup_context(u); + if (!c) + return false; + + return !LIST_IS_EMPTY(c->bpf_foreign_programs); +} + static CGroupMask unit_get_cgroup_mask(Unit *u) { CGroupMask mask = 0; CGroupContext *c; @@ -1511,6 +1577,9 @@ static CGroupMask unit_get_bpf_mask(Unit *u) { if (unit_get_needs_bpf_firewall(u)) mask |= CGROUP_MASK_BPF_FIREWALL; + if (unit_get_needs_bpf_foreign_program(u)) + mask |= CGROUP_MASK_BPF_FOREIGN; + return mask; } @@ -2989,6 +3058,11 @@ static int cg_bpf_mask_supported(CGroupMask *ret) { if (r > 0) mask |= CGROUP_MASK_BPF_DEVICES; + /* BPF pinned prog */ + r = bpf_foreign_supported(); + if (r > 0) + mask |= CGROUP_MASK_BPF_FOREIGN; + *ret = mask; return 0; } diff --git a/src/core/cgroup.h b/src/core/cgroup.h index fa79ba15239..be3060eba7c 100644 --- a/src/core/cgroup.h +++ b/src/core/cgroup.h @@ -31,6 +31,7 @@ typedef struct CGroupIODeviceLimit CGroupIODeviceLimit; typedef struct CGroupIODeviceLatency CGroupIODeviceLatency; typedef struct CGroupBlockIODeviceWeight CGroupBlockIODeviceWeight; typedef struct CGroupBlockIODeviceBandwidth CGroupBlockIODeviceBandwidth; +typedef struct CGroupBPFForeignProgram CGroupBPFForeignProgram; typedef enum CGroupDevicePolicy { /* When devices listed, will allow those, plus built-in ones, if none are listed will allow @@ -94,6 +95,12 @@ struct CGroupBlockIODeviceBandwidth { uint64_t wbps; }; +struct CGroupBPFForeignProgram { + LIST_FIELDS(CGroupBPFForeignProgram, programs); + uint32_t attach_type; + char *bpffs_path; +}; + struct CGroupContext { bool cpu_accounting; bool io_accounting; @@ -142,6 +149,7 @@ struct CGroupContext { char **ip_filters_ingress; char **ip_filters_egress; + 
LIST_HEAD(CGroupBPFForeignProgram, bpf_foreign_programs); /* For legacy hierarchies */ uint64_t cpu_shares; @@ -202,8 +210,10 @@ void cgroup_context_free_io_device_limit(CGroupContext *c, CGroupIODeviceLimit * void cgroup_context_free_io_device_latency(CGroupContext *c, CGroupIODeviceLatency *l); void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w); void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b); +void cgroup_context_remove_bpf_foreign_program(CGroupContext *c, CGroupBPFForeignProgram *p); int cgroup_add_device_allow(CGroupContext *c, const char *dev, const char *mode); +int cgroup_add_bpf_foreign_program(CGroupContext *c, uint32_t attach_type, const char *path); void cgroup_oomd_xattr_apply(Unit *u, const char *cgroup_path); diff --git a/src/core/dbus-cgroup.c b/src/core/dbus-cgroup.c index 033fd857d66..60a2ad78162 100644 --- a/src/core/dbus-cgroup.c +++ b/src/core/dbus-cgroup.c @@ -5,6 +5,7 @@ #include "af-list.h" #include "alloc-util.h" #include "bpf-firewall.h" +#include "bpf-foreign.h" #include "bus-get-properties.h" #include "cgroup-util.h" #include "cgroup.h" @@ -347,6 +348,33 @@ static int property_get_ip_address_access( return sd_bus_message_close_container(reply); } +static int property_get_bpf_foreign_program( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + CGroupContext *c = userdata; + CGroupBPFForeignProgram *p; + int r; + + r = sd_bus_message_open_container(reply, 'a', "(ss)"); + if (r < 0) + return r; + + LIST_FOREACH(programs, p, c->bpf_foreign_programs) { + const char *attach_type = bpf_cgroup_attach_type_to_string(p->attach_type); + + r = sd_bus_message_append(reply, "(ss)", attach_type, p->bpffs_path); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + const sd_bus_vtable bus_cgroup_vtable[] = { 
SD_BUS_VTABLE_START(0), SD_BUS_PROPERTY("Delegate", "b", bus_property_get_bool, offsetof(CGroupContext, delegate), 0), @@ -398,6 +426,7 @@ const sd_bus_vtable bus_cgroup_vtable[] = { SD_BUS_PROPERTY("ManagedOOMMemoryPressure", "s", property_get_managed_oom_mode, offsetof(CGroupContext, moom_mem_pressure), 0), SD_BUS_PROPERTY("ManagedOOMMemoryPressureLimit", "u", NULL, offsetof(CGroupContext, moom_mem_pressure_limit), 0), SD_BUS_PROPERTY("ManagedOOMPreference", "s", property_get_managed_oom_preference, offsetof(CGroupContext, moom_preference), 0), + SD_BUS_PROPERTY("BPFProgram", "a(ss)", property_get_bpf_foreign_program, 0, 0), SD_BUS_VTABLE_END }; @@ -570,6 +599,85 @@ static int bus_cgroup_set_transient_property( } } + return 1; + } else if (streq(name, "BPFProgram")) { + const char *a, *p; + size_t n = 0; + + r = sd_bus_message_enter_container(message, 'a', "(ss)"); + if (r < 0) + return r; + + while ((r = sd_bus_message_read(message, "(ss)", &a, &p)) > 0) { + int attach_type = bpf_cgroup_attach_type_from_string(a); + if (attach_type < 0) + return sd_bus_error_setf( + error, + SD_BUS_ERROR_INVALID_ARGS, + "%s expects a valid BPF attach type, got '%s'.", + name, a); + + if (!path_is_normalized(p) || !path_is_absolute(p)) + return sd_bus_error_setf( + error, + SD_BUS_ERROR_INVALID_ARGS, + "%s= expects a normalized absolute path.", + name); + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + r = cgroup_add_bpf_foreign_program(c, attach_type, p); + if (r < 0) + return r; + } + n++; + } + if (r < 0) + return r; + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + _cleanup_free_ char *buf = NULL; + _cleanup_fclose_ FILE *f = NULL; + CGroupBPFForeignProgram *fp; + size_t size = 0; + + if (n == 0) + while (c->bpf_foreign_programs) + cgroup_context_remove_bpf_foreign_program(c, c->bpf_foreign_programs); + + f = open_memstream_unlocked(&buf, &size); + if (!f) + return -ENOMEM; + + fputs(name, f); + fputs("=\n", f); + + 
LIST_FOREACH(programs, fp, c->bpf_foreign_programs) + fprintf(f, "%s=%s:%s\n", name, + bpf_cgroup_attach_type_to_string(fp->attach_type), + fp->bpffs_path); + + r = fflush_and_check(f); + if (r < 0) + return r; + + unit_write_setting(u, flags, name, buf); + + if (!LIST_IS_EMPTY(c->bpf_foreign_programs)) { + r = bpf_foreign_supported(); + if (r < 0) + return r; + if (r == 0) + log_full(LOG_DEBUG, + "Transient unit %s configures a BPF program pinned to BPF " + "filesystem, but the local system does not support that.\n" + "Starting this unit will fail!", u->id); + } + } + return 1; } diff --git a/src/core/load-fragment-gperf.gperf.m4 b/src/core/load-fragment-gperf.gperf.m4 index 5ef785c0dee..4bd1207e2c5 100644 --- a/src/core/load-fragment-gperf.gperf.m4 +++ b/src/core/load-fragment-gperf.gperf.m4 @@ -234,7 +234,8 @@ $1.ManagedOOMSwap, config_parse_managed_oom_mode, $1.ManagedOOMMemoryPressure, config_parse_managed_oom_mode, 0, offsetof($1, cgroup_context.moom_mem_pressure) $1.ManagedOOMMemoryPressureLimit, config_parse_managed_oom_mem_pressure_limit, 0, offsetof($1, cgroup_context.moom_mem_pressure_limit) $1.ManagedOOMPreference, config_parse_managed_oom_preference, 0, offsetof($1, cgroup_context.moom_preference) -$1.NetClass, config_parse_warn_compat, DISABLED_LEGACY, 0' +$1.NetClass, config_parse_warn_compat, DISABLED_LEGACY, 0 +$1.BPFProgram, config_parse_bpf_foreign_program, 0, offsetof($1, cgroup_context)' )m4_dnl Unit.Description, config_parse_unit_string_printf, 0, offsetof(Unit, description) Unit.Documentation, config_parse_documentation, 0, offsetof(Unit, documentation) diff --git a/src/core/load-fragment.c b/src/core/load-fragment.c index 1a1e58976ae..a0c403a60c9 100644 --- a/src/core/load-fragment.c +++ b/src/core/load-fragment.c @@ -19,6 +19,7 @@ #include "all-units.h" #include "alloc-util.h" #include "bpf-firewall.h" +#include "bpf-program.h" #include "bus-error.h" #include "bus-internal.h" #include "bus-util.h" @@ -5581,6 +5582,64 @@ int 
config_parse_ip_filter_bpf_progs( return 0; } +int config_parse_bpf_foreign_program( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + _cleanup_free_ char *resolved = NULL, *word = NULL; + CGroupContext *c = data; + Unit *u = userdata; + int attach_type, r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + if (isempty(rvalue)) { /* an empty assignment resets the list of configured programs */ + while (c->bpf_foreign_programs) + cgroup_context_remove_bpf_foreign_program(c, c->bpf_foreign_programs); + + return 0; + } + + r = extract_first_word(&rvalue, &word, ":", 0); /* word receives the attach type; rvalue is advanced to the remainder (the path) */ + if (r == -ENOMEM) + return log_oom(); + if (r <= 0 || isempty(rvalue)) { /* parse error, missing type, or no path after the ':' -- reject before rvalue is used below */ + log_syntax(unit, LOG_WARNING, filename, line, r < 0 ? r : 0, "Failed to parse foreign BPF program in %s=, expected 'type:program-path', ignoring.", lvalue); + return 0; + } + + attach_type = bpf_cgroup_attach_type_from_string(word); + if (attach_type < 0) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "Unknown BPF attach type=%s, ignoring: %s", word, rvalue); + return 0; + } + + r = unit_full_printf(u, rvalue, &resolved); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to resolve unit specifiers in '%s', ignoring: %m", rvalue); + return 0; + } + + r = path_simplify_and_warn(resolved, PATH_CHECK_ABSOLUTE, unit, filename, line, lvalue); + if (r < 0) + return 0; + + r = cgroup_add_bpf_foreign_program(c, attach_type, resolved); + if (r < 0) + return log_error_errno(r, "Failed to add foreign BPF program to cgroup context: %m"); + + return 0; +} + static int merge_by_names(Unit **u, Set *names, const char *id) { char *k; int r; diff --git a/src/core/load-fragment.h b/src/core/load-fragment.h index 4746a8a792b..e99c9a40559 100644 --- a/src/core/load-fragment.h +++ b/src/core/load-fragment.h @@ -140,6 +140,7 @@ CONFIG_PARSER_PROTOTYPE(config_parse_swap_priority); CONFIG_PARSER_PROTOTYPE(config_parse_mount_images); CONFIG_PARSER_PROTOTYPE(config_parse_socket_timestamping); 
CONFIG_PARSER_PROTOTYPE(config_parse_extension_images); +CONFIG_PARSER_PROTOTYPE(config_parse_bpf_foreign_program); /* gperf prototypes */ const struct ConfigPerfItem* load_fragment_gperf_lookup(const char *key, GPERF_LEN_TYPE length); diff --git a/src/core/meson.build b/src/core/meson.build index a389c906b36..a1294f3a725 100644 --- a/src/core/meson.build +++ b/src/core/meson.build @@ -11,6 +11,8 @@ libcore_sources = ''' bpf-devices.h bpf-firewall.c bpf-firewall.h + bpf-foreign.c + bpf-foreign.h cgroup.c cgroup.h core-varlink.c diff --git a/src/core/unit.c b/src/core/unit.c index 2c5dc54379a..cf83272dcbe 100644 --- a/src/core/unit.c +++ b/src/core/unit.c @@ -11,6 +11,7 @@ #include "all-units.h" #include "alloc-util.h" #include "bpf-firewall.h" +#include "bpf-foreign.h" #include "bus-common-errors.h" #include "bus-util.h" #include "cgroup-setup.h" @@ -723,6 +724,8 @@ Unit* unit_free(Unit *u) { set_free(u->ip_bpf_custom_ingress_installed); set_free(u->ip_bpf_custom_egress_installed); + hashmap_free(u->bpf_foreign_by_key); + bpf_program_unref(u->bpf_device_control_installed); condition_free_list(u->conditions); diff --git a/src/core/unit.h b/src/core/unit.h index 6d38e666803..128122b8df7 100644 --- a/src/core/unit.h +++ b/src/core/unit.h @@ -305,6 +305,10 @@ typedef struct Unit { Set *ip_bpf_custom_egress; Set *ip_bpf_custom_egress_installed; + /* BPF programs managed (e.g. loaded to kernel) by an entity external to systemd, + * attached to unit cgroup by provided program fd and attach type. 
*/ + Hashmap *bpf_foreign_by_key; + uint64_t ip_accounting_extra[_CGROUP_IP_ACCOUNTING_METRIC_MAX]; /* Low-priority event source which is used to remove watched PIDs that have gone away, and subscribe to any new diff --git a/src/shared/bpf-program.c b/src/shared/bpf-program.c index 10239142af3..a8a34521fd6 100644 --- a/src/shared/bpf-program.c +++ b/src/shared/bpf-program.c @@ -11,6 +11,50 @@ #include "memory-util.h" #include "missing_syscall.h" #include "path-util.h" +#include "string-table.h" + +static const char *const bpf_cgroup_attach_type_table[__MAX_BPF_ATTACH_TYPE] = { + [BPF_CGROUP_INET_INGRESS] = "ingress", + [BPF_CGROUP_INET_EGRESS] = "egress", + [BPF_CGROUP_INET_SOCK_CREATE] = "sock_create", + [BPF_CGROUP_SOCK_OPS] = "sock_ops", + [BPF_CGROUP_DEVICE] = "device", + [BPF_CGROUP_INET4_BIND] = "bind4", + [BPF_CGROUP_INET6_BIND] = "bind6", + [BPF_CGROUP_INET4_CONNECT] = "connect4", + [BPF_CGROUP_INET6_CONNECT] = "connect6", + [BPF_CGROUP_INET4_POST_BIND] = "post_bind4", + [BPF_CGROUP_INET6_POST_BIND] = "post_bind6", + [BPF_CGROUP_UDP4_SENDMSG] = "sendmsg4", + [BPF_CGROUP_UDP6_SENDMSG] = "sendmsg6", + [BPF_CGROUP_SYSCTL] = "sysctl", + [BPF_CGROUP_UDP4_RECVMSG] = "recvmsg4", + [BPF_CGROUP_UDP6_RECVMSG] = "recvmsg6", + [BPF_CGROUP_GETSOCKOPT] = "getsockopt", + [BPF_CGROUP_SETSOCKOPT] = "setsockopt", +}; + +DEFINE_STRING_TABLE_LOOKUP(bpf_cgroup_attach_type, int); + + /* struct bpf_prog_info info must be initialized since its value is both input and output + * for BPF_OBJ_GET_INFO_BY_FD syscall. */ +static int bpf_program_get_info_by_fd(int prog_fd, struct bpf_prog_info *info, uint32_t info_len) { + union bpf_attr attr; + + /* Explicitly memset to zero since some compilers may produce non-zero-initialized padding when + * structured initialization is used. 
+ * Refer to https://github.com/systemd/systemd/issues/18164 + */ + zero(attr); + attr.info.bpf_fd = prog_fd; + attr.info.info_len = info_len; + attr.info.info = PTR_TO_UINT64(info); + + if (bpf(BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr)) < 0) + return -errno; + + return 0; +} int bpf_program_new(uint32_t prog_type, BPFProgram **ret) { _cleanup_(bpf_program_unrefp) BPFProgram *p = NULL; @@ -28,6 +72,38 @@ int bpf_program_new(uint32_t prog_type, BPFProgram **ret) { return 0; } +int bpf_program_new_from_bpffs_path(const char *path, BPFProgram **ret) { + _cleanup_(bpf_program_unrefp) BPFProgram *p = NULL; + struct bpf_prog_info info = {}; + int r; + + assert(path); + assert(ret); + + p = new(BPFProgram, 1); + if (!p) + return -ENOMEM; + + *p = (BPFProgram) { + .prog_type = BPF_PROG_TYPE_UNSPEC, + .n_ref = 1, + .kernel_fd = -1, + }; + + r = bpf_program_load_from_bpf_fs(p, path); + if (r < 0) + return r; + + r = bpf_program_get_info_by_fd(p->kernel_fd, &info, sizeof(info)); + if (r < 0) + return r; + + p->prog_type = info.type; + *ret = TAKE_PTR(p); + + return 0; +} + static BPFProgram *bpf_program_free(BPFProgram *p) { assert(p); @@ -254,3 +330,31 @@ int bpf_map_lookup_element(int fd, const void *key, void *value) { return 0; } + +int bpf_program_pin(int prog_fd, const char *bpffs_path) { + union bpf_attr attr; + + zero(attr); + attr.pathname = PTR_TO_UINT64((void *) bpffs_path); + attr.bpf_fd = prog_fd; + + if (bpf(BPF_OBJ_PIN, &attr, sizeof(attr)) < 0) + return -errno; + + return 0; +} + +int bpf_program_get_id_by_fd(int prog_fd, uint32_t *ret_id) { /* stores the id reported by bpf_program_get_info_by_fd() in *ret_id; returns 0 or a negative errno */ + struct bpf_prog_info info = {}; + int r; + + assert(ret_id); + + r = bpf_program_get_info_by_fd(prog_fd, &info, sizeof(info)); + if (r < 0) + return r; + + *ret_id = info.id; + + return 0; +} diff --git a/src/shared/bpf-program.h b/src/shared/bpf-program.h index eef77f9d8e1..86fd338c93c 100644 --- a/src/shared/bpf-program.h +++ b/src/shared/bpf-program.h @@ -26,8 +26,9 @@ struct BPFProgram { }; int 
bpf_program_new(uint32_t prog_type, BPFProgram **ret); -BPFProgram *bpf_program_unref(BPFProgram *p); +int bpf_program_new_from_bpffs_path(const char *path, BPFProgram **ret); /* BPFProgram for an object already pinned at path */ BPFProgram *bpf_program_ref(BPFProgram *p); +BPFProgram *bpf_program_unref(BPFProgram *p); int bpf_program_add_instructions(BPFProgram *p, const struct bpf_insn *insn, size_t count); int bpf_program_load_kernel(BPFProgram *p, char *log_buf, size_t log_size); @@ -35,9 +36,14 @@ int bpf_program_load_from_bpf_fs(BPFProgram *p, const char *path); int bpf_program_cgroup_attach(BPFProgram *p, int type, const char *path, uint32_t flags); int bpf_program_cgroup_detach(BPFProgram *p); +int bpf_program_pin(int prog_fd, const char *bpffs_path); /* pin prog_fd at bpffs_path (BPF_OBJ_PIN) */ +int bpf_program_get_id_by_fd(int prog_fd, uint32_t *ret_id); /* id from the program info of prog_fd */ int bpf_map_new(enum bpf_map_type type, size_t key_size, size_t value_size, size_t max_entries, uint32_t flags); int bpf_map_update_element(int fd, const void *key, void *value); int bpf_map_lookup_element(int fd, const void *key, void *value); +int bpf_cgroup_attach_type_from_string(const char *str) _pure_; +const char *bpf_cgroup_attach_type_to_string(int attach_type) _const_; + DEFINE_TRIVIAL_CLEANUP_FUNC(BPFProgram*, bpf_program_unref); diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c index 84d4729334c..eb28c359244 100644 --- a/src/shared/bus-unit-util.c +++ b/src/shared/bus-unit-util.c @@ -842,6 +842,26 @@ static int bus_append_cgroup_property(sd_bus_message *m, const char *field, cons return 1; } + if (streq(field, "BPFProgram")) { /* An empty value appends an empty a(ss) array; otherwise the text before the first ':' and the remainder are appended as a single (ss) entry. */ + if (isempty(eq)) + r = sd_bus_message_append(m, "(sv)", field, "a(ss)", 0); + else { + _cleanup_free_ char *word = NULL; + + r = extract_first_word(&eq, &word, ":", 0); /* eq is advanced past the extracted word, so below it holds the rest of the value */ + if (r == -ENOMEM) + return log_oom(); + if (r < 0) + return log_error_errno(r, "Failed to parse %s: %m", field); + + r = sd_bus_message_append(m, "(sv)", field, "a(ss)", 1, word, eq); + } + if (r < 0) + return bus_log_create_error(r); + + return 1; + } + return 0; } diff --git 
a/src/systemctl/systemctl-show.c b/src/systemctl/systemctl-show.c index c3c81f03fbd..2fe3d8c509e 100644 --- a/src/systemctl/systemctl-show.c +++ b/src/systemctl/systemctl-show.c @@ -1694,6 +1694,23 @@ static int print_property(const char *name, const char *expected_value, sd_bus_m return 1; + } else if (streq(name, "BPFProgram")) { /* Prints every (ss) entry of the array as "<first>:<second>". */ + const char *a, *p; + + r = sd_bus_message_enter_container(m, SD_BUS_TYPE_ARRAY, "(ss)"); + if (r < 0) + return bus_log_parse_error(r); + + while ((r = sd_bus_message_read(m, "(ss)", &a, &p)) > 0) + bus_print_property_valuef(name, expected_value, value, "%s:%s", a, p); + if (r < 0) + return bus_log_parse_error(r); + + r = sd_bus_message_exit_container(m); + if (r < 0) + return bus_log_parse_error(r); + + return 1; } break; diff --git a/src/test/meson.build b/src/test/meson.build index 1aa0c56b623..e077c8e03f2 100644 --- a/src/test/meson.build +++ b/src/test/meson.build @@ -324,6 +324,12 @@ tests += [ libblkid], core_includes], + [['src/test/test-bpf-foreign-programs.c'], + [libcore, + libshared], + [], + core_includes], + [['src/test/test-watch-pid.c'], [libcore, libshared], diff --git a/src/test/test-bpf-foreign-programs.c b/src/test/test-bpf-foreign-programs.c new file mode 100644 index 00000000000..e7039240777 --- /dev/null +++ b/src/test/test-bpf-foreign-programs.c @@ -0,0 +1,332 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include +#include +#include +#include +#include + +#include "bpf-foreign.h" +#include "load-fragment.h" +#include "manager.h" +#include "process-util.h" +#include "rlimit-util.h" +#include "rm-rf.h" +#include "service.h" +#include "tests.h" +#include "unit.h" +#include "virt.h" + +struct Test { /* One unit setting to apply: the option name, the program type to load, the attach type, and the path the program is pinned at. */ + const char *option_name; + enum bpf_prog_type prog_type; + enum bpf_attach_type attach_type; + const char *bpffs_path; +}; + +typedef struct Test Test; + +#define BPFFS_PATH(prog_suffix) ("/sys/fs/bpf/test-bpf-foreing-" # prog_suffix) /* NOTE(review): '#' stringifies the argument, and every caller passes a string literal, so the double quotes become part of the path; "foreing" also looks like a typo for "foreign". */ +static const Test single_prog[] = { + { + .option_name = "BPFProgram", + 
.prog_type = BPF_PROG_TYPE_CGROUP_SKB, + .attach_type = BPF_CGROUP_INET_INGRESS, + .bpffs_path = BPFFS_PATH("trivial-skb"), + }, +}; +static const Test path_split_test[] = { /* The path contains ':' characters, the same separator used between attach type and path in the option string. */ + { + .option_name = "BPFProgram", + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, + .attach_type = BPF_CGROUP_INET_INGRESS, + .bpffs_path = BPFFS_PATH("path:split:test"), + }, +}; + +static const Test same_prog_same_hook[] = { /* Two identical entries: same path, same attach type. */ + { + .option_name = "BPFProgram", + .prog_type = BPF_PROG_TYPE_CGROUP_SOCK, + .attach_type = BPF_CGROUP_INET_SOCK_CREATE, + .bpffs_path = BPFFS_PATH("trivial-sock"), + }, + { + .option_name = "BPFProgram", + .prog_type = BPF_PROG_TYPE_CGROUP_SOCK, + .attach_type = BPF_CGROUP_INET_SOCK_CREATE, + .bpffs_path = BPFFS_PATH("trivial-sock"), + } +}; + +static const Test multi_prog_same_hook[] = { /* Two different paths on the same attach type. */ + { + .option_name = "BPFProgram", + .prog_type = BPF_PROG_TYPE_CGROUP_SOCK, + .attach_type = BPF_CGROUP_INET_SOCK_CREATE, + .bpffs_path = BPFFS_PATH("trivial-sock-0"), + }, + { + .option_name = "BPFProgram", + .prog_type = BPF_PROG_TYPE_CGROUP_SOCK, + .attach_type = BPF_CGROUP_INET_SOCK_CREATE, + .bpffs_path = BPFFS_PATH("trivial-sock-1"), + } +}; + +static const Test same_prog_multi_hook[] = { /* One path on both the ingress and the egress attach type. */ + { + .option_name = "BPFProgram", + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, + .attach_type = BPF_CGROUP_INET_INGRESS, + .bpffs_path = BPFFS_PATH("trivial-skb"), + }, + { + .option_name = "BPFProgram", + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, + .attach_type = BPF_CGROUP_INET_EGRESS, + .bpffs_path = BPFFS_PATH("trivial-skb"), + } +}; + +static const Test same_prog_multi_option_0[] = { /* One path configured through both BPFProgram and IPIngressFilterPath. */ + { + .option_name = "BPFProgram", + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, + .attach_type = BPF_CGROUP_INET_INGRESS, + .bpffs_path = BPFFS_PATH("trivial-skb"), + }, + { + .option_name = "IPIngressFilterPath", + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, + .attach_type = BPF_CGROUP_INET_INGRESS, + .bpffs_path = BPFFS_PATH("trivial-skb"), + } +}; + +static const Test same_prog_multi_option_1[] = { /* One path configured through both IPEgressFilterPath and BPFProgram. */ + { + .option_name = 
"IPEgressFilterPath", + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, + .attach_type = BPF_CGROUP_INET_EGRESS, + .bpffs_path = BPFFS_PATH("trivial-skb"), + }, + { + .option_name = "BPFProgram", + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, + .attach_type = BPF_CGROUP_INET_EGRESS, + .bpffs_path = BPFFS_PATH("trivial-skb"), + } +}; +#undef BPFFS_PATH + +static int bpf_foreign_test_to_string(enum bpf_attach_type attach_type, const char *bpffs_path, char **ret_str) { /* Stores "<attach type name>:<path>" in *ret_str; failures trip assert_se(), so the only return value is 0. */ + const char *s = NULL; + + assert_se(bpffs_path); + assert_se(ret_str); + + assert_se(s = bpf_cgroup_attach_type_to_string(attach_type)); + assert_se(*ret_str = strjoin(s, ":", bpffs_path)); + + return 0; +} + +static char **unlink_paths_and_free(char **paths) { /* unlink()s every entry, ignoring errors, then releases the vector with strv_free(). */ + char **i; + + STRV_FOREACH(i, paths) + (void) unlink(*i); + + return strv_free(paths); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(char **, unlink_paths_and_free); + +static int pin_programs(Unit *u, CGroupContext *cc, const Test *test_suite, size_t test_suite_size, char ***paths_ret) { /* Loads the two-instruction program for every entry and pins it once per distinct path; on success the pinned paths are handed to *paths_ret. u and cc are not referenced in the body. */ + _cleanup_(unlink_paths_and_freep) char **bpffs_paths = NULL; + static const struct bpf_insn trivial[] = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN() + }; + char log_buf[0xffff]; + int r; + + assert_se(paths_ret); + + for (size_t i = 0; i < test_suite_size; i++) { + _cleanup_(bpf_program_unrefp) BPFProgram *prog = NULL; + _cleanup_free_ char *str = NULL; + + r = bpf_foreign_test_to_string(test_suite[i].attach_type, test_suite[i].bpffs_path, &str); /* str is only used in the error messages below */ + if (r < 0) + return log_error_errno(r, "Failed to convert program to string"); + + r = bpf_program_new(test_suite[i].prog_type, &prog); + if (r < 0) + return log_error_errno(r, "Failed to create program '%s'", str); + + r = bpf_program_add_instructions(prog, trivial, ELEMENTSOF(trivial)); + if (r < 0) + return log_error_errno(r, "Failed to add trivial instructions for '%s'", str); + + r = bpf_program_load_kernel(prog, log_buf, ELEMENTSOF(log_buf)); + if (r < 0) + return log_error_errno(r, "Failed to load BPF program '%s'", str); + + 
if (strv_contains(bpffs_paths, test_suite[i].bpffs_path)) + continue; /* path already recorded by an earlier entry, so it is not pinned a second time */ + + r = strv_extend(&bpffs_paths, test_suite[i].bpffs_path); + if (r < 0) + return log_error_errno(r, "Failed to put path into a vector: %m"); + + r = bpf_program_pin(prog->kernel_fd, test_suite[i].bpffs_path); + if (r < 0) + return log_error_errno(r, "Failed to pin BPF program '%s'", str); + } + + *paths_ret = TAKE_PTR(bpffs_paths); + return 0; +} + +static int test_bpf_cgroup_programs(Manager *m, const char *unit_name, const Test *test_suite, size_t test_suite_size) { /* Pins the suite's programs, feeds every entry to the matching config parser on a new Service unit, then starts the unit and waits for it to finish. */ + _cleanup_(unlink_paths_and_freep) char **bpffs_paths = NULL; /* the cleanup handler unlinks the pinned files when this goes out of scope */ + _cleanup_(unit_freep) Unit *u = NULL; + CGroupContext *cc = NULL; + int cld_code, r; + + assert_se(u = unit_new(m, sizeof(Service))); + assert_se(unit_add_name(u, unit_name) == 0); + assert_se(cc = unit_get_cgroup_context(u)); + + r = pin_programs(u, cc, test_suite, test_suite_size, &bpffs_paths); + if (r < 0) + return log_error_errno(r, "Failed to pin programs: %m"); + + for (size_t i = 0; i < test_suite_size; i++) { + if (streq(test_suite[i].option_name, "BPFProgram")) { /* BPFProgram= takes the composed "<attach type name>:<path>" string */ + _cleanup_free_ char *option = NULL; + r = bpf_foreign_test_to_string(test_suite[i].attach_type, test_suite[i].bpffs_path, &option); + if (r < 0) + return log_error_errno(r, "Failed to compose option string: %m"); + r = config_parse_bpf_foreign_program( + u->id, "filename", 1, "Service", 1, test_suite[i].option_name, 0, option, cc, u); + + if (r < 0) + return log_error_errno(r, "Failed to parse option string '%s': %m", option); + } else if (STR_IN_SET(test_suite[i].option_name, "IPIngressFilterPath", "IPEgressFilterPath")) { /* the IP filter options take the bare path and are parsed into the ingress or egress filter list */ + const char *option = test_suite[i].bpffs_path; + void *paths = NULL; + + if (streq(test_suite[i].option_name, "IPIngressFilterPath")) + paths = &cc->ip_filters_ingress; + else + paths = &cc->ip_filters_egress; + + r = config_parse_ip_filter_bpf_progs( + u->id, "filename", 1, "Service", 1, test_suite[i].option_name, 0, option, paths, u); + if (r < 0) + return 
log_error_errno(r, "Failed to parse option string '%s': %m", option); + } + } + + r = config_parse_exec( + u->id, + "filename", + 1, + "Service", + 1, + "ExecStart", + SERVICE_EXEC_START, + "-/bin/ping -c 5 127.0.0.1 -W 1", + SERVICE(u)->exec_command, + u); + if (r < 0) + return log_error_errno(r, "Failed to parse ExecStart"); + + SERVICE(u)->type = SERVICE_ONESHOT; + u->load_state = UNIT_LOADED; + + r = unit_start(u); + if (r < 0) + return log_error_errno(r, "Unit start failed %m"); + + while (!IN_SET(SERVICE(u)->state, SERVICE_DEAD, SERVICE_FAILED)) { /* run the event loop until the service reaches a terminal state */ + r = sd_event_run(m->event, UINT64_MAX); + if (r < 0) + return log_error_errno(r, "Event run failed %m"); /* sd_event_run() reports failure through its return value, so log r rather than errno */ + } + + cld_code = SERVICE(u)->exec_command[SERVICE_EXEC_START]->exec_status.code; + if (cld_code != CLD_EXITED) + return log_error_errno(SYNTHETIC_ERRNO(EBUSY), + "ExecStart didn't exited, code='%s'", sigchld_code_to_string(cld_code)); + + if (SERVICE(u)->state != SERVICE_DEAD) + return log_error_errno(SYNTHETIC_ERRNO(EBUSY), "Service is not dead"); + + return r; +} + +int main(int argc, char *argv[]) { /* Skips when the environment cannot run the test (container, non-root, no mlock, no unified cgroup hierarchy), otherwise runs every scenario table. */ + _cleanup_(rm_rf_physical_and_freep) char *runtime_dir = NULL; + _cleanup_(manager_freep) Manager *m = NULL; + _cleanup_free_ char *unit_dir = NULL; + struct rlimit rl; + int r; + + test_setup_logging(LOG_DEBUG); + + if (detect_container() > 0) + return log_tests_skipped("test-bpf fails inside LXC and Docker containers: https://github.com/systemd/systemd/issues/9666"); + + if (getuid() != 0) + return log_tests_skipped("not running as root"); + + assert_se(getrlimit(RLIMIT_MEMLOCK, &rl) >= 0); + rl.rlim_cur = rl.rlim_max = MAX(rl.rlim_max, CAN_MEMLOCK_SIZE); + (void) setrlimit_closest(RLIMIT_MEMLOCK, &rl); + + if (!can_memlock()) + return log_tests_skipped("Can't use mlock(), skipping."); + + r = cg_all_unified(); + if (r <= 0) + return log_tests_skipped_errno(r, "Unified hierarchy is required, skipping."); + + r = enter_cgroup_subroot(NULL); + if (r == -ENOMEDIUM) + return log_tests_skipped("cgroupfs not 
available"); + + assert_se(get_testdata_dir("units", &unit_dir) >= 0); + assert_se(set_unit_path(unit_dir) >= 0); + assert_se(runtime_dir = setup_fake_runtime_dir()); + + assert_se(manager_new(UNIT_FILE_USER, MANAGER_TEST_RUN_BASIC, &m) >= 0); + assert_se(manager_startup(m, NULL, NULL) >= 0); /* each call below runs one scenario table under its own unit name; a negative result fails the assert */ + + assert_se(test_bpf_cgroup_programs(m, + "single_prog.service", single_prog, ELEMENTSOF(single_prog)) >= 0); + assert_se(test_bpf_cgroup_programs(m, + "multi_prog_same_hook.service", + multi_prog_same_hook, ELEMENTSOF(multi_prog_same_hook)) >= 0); + assert_se(test_bpf_cgroup_programs(m, + "same_prog_multi_hook.service", + same_prog_multi_hook, ELEMENTSOF(same_prog_multi_hook)) >= 0); + assert_se(test_bpf_cgroup_programs(m, + "same_prog_multi_option_0.service", + same_prog_multi_option_0, ELEMENTSOF(same_prog_multi_option_0)) >= 0); + assert_se(test_bpf_cgroup_programs(m, + "same_prog_multi_option_1.service", + same_prog_multi_option_1, ELEMENTSOF(same_prog_multi_option_1)) >= 0); + assert_se(test_bpf_cgroup_programs(m, + "same_prog_same_hook.service", + same_prog_same_hook, + ELEMENTSOF(same_prog_same_hook)) >= 0); + assert_se(test_bpf_cgroup_programs(m, + "path_split_test.service", + path_split_test, + ELEMENTSOF(path_split_test)) >= 0); + return 0; +} diff --git a/src/test/test-cgroup-mask.c b/src/test/test-cgroup-mask.c index b53e327c63d..d721946f713 100644 --- a/src/test/test-cgroup-mask.c +++ b/src/test/test-cgroup-mask.c @@ -140,7 +140,7 @@ static void test_cg_mask_to_string_one(CGroupMask mask, const char *t) { static void test_cg_mask_to_string(void) { test_cg_mask_to_string_one(0, NULL); - test_cg_mask_to_string_one(_CGROUP_MASK_ALL, "cpu cpuacct cpuset io blkio memory devices pids bpf-firewall bpf-devices"); + test_cg_mask_to_string_one(_CGROUP_MASK_ALL, "cpu cpuacct cpuset io blkio memory devices pids bpf-firewall bpf-devices bpf-foreign"); test_cg_mask_to_string_one(CGROUP_MASK_CPU, "cpu"); test_cg_mask_to_string_one(CGROUP_MASK_CPUACCT, "cpuacct"); 
test_cg_mask_to_string_one(CGROUP_MASK_CPUSET, "cpuset"); diff --git a/test/fuzz/fuzz-unit-file/directives.mount b/test/fuzz/fuzz-unit-file/directives.mount index 4cc64a96b6e..f1a8d19d18e 100644 --- a/test/fuzz/fuzz-unit-file/directives.mount +++ b/test/fuzz/fuzz-unit-file/directives.mount @@ -4,6 +4,7 @@ AllowedCPUs= AllowedMemoryNodes= AmbientCapabilities= AppArmorProfile= +BPFProgram= BindPaths= BindReadOnlyPaths= BlockIOAccounting= diff --git a/test/fuzz/fuzz-unit-file/directives.scope b/test/fuzz/fuzz-unit-file/directives.scope index 0af372f7cab..97ca9b47aa0 100644 --- a/test/fuzz/fuzz-unit-file/directives.scope +++ b/test/fuzz/fuzz-unit-file/directives.scope @@ -2,6 +2,7 @@ scope [Scope] AllowedCPUs= AllowedMemoryNodes= +BPFProgram= BlockIOAccounting= BlockIODeviceWeight= BlockIOReadBandwidth= diff --git a/test/fuzz/fuzz-unit-file/directives.service b/test/fuzz/fuzz-unit-file/directives.service index ea3c93d93d1..c2069c1a785 100644 --- a/test/fuzz/fuzz-unit-file/directives.service +++ b/test/fuzz/fuzz-unit-file/directives.service @@ -28,6 +28,7 @@ AssertPathIsSymbolicLink= AssertSecurity= AssertUser= AssertVirtualization= +BPFProgram= Before= BindTo= BindsTo= diff --git a/test/fuzz/fuzz-unit-file/directives.slice b/test/fuzz/fuzz-unit-file/directives.slice index 2a35b3e5181..b96d0628ad7 100644 --- a/test/fuzz/fuzz-unit-file/directives.slice +++ b/test/fuzz/fuzz-unit-file/directives.slice @@ -2,6 +2,7 @@ slice [Slice] AllowedCPUs= AllowedMemoryNodes= +BPFProgram= BlockIOAccounting= BlockIODeviceWeight= BlockIOReadBandwidth= diff --git a/test/fuzz/fuzz-unit-file/directives.socket b/test/fuzz/fuzz-unit-file/directives.socket index 3931601ead0..79e04a2e84b 100644 --- a/test/fuzz/fuzz-unit-file/directives.socket +++ b/test/fuzz/fuzz-unit-file/directives.socket @@ -5,6 +5,7 @@ AllowedCPUs= AllowedMemoryNodes= AmbientCapabilities= AppArmorProfile= +BPFProgram= Backlog= BindIPv6Only= BindPaths= diff --git a/test/fuzz/fuzz-unit-file/directives.swap 
b/test/fuzz/fuzz-unit-file/directives.swap index 3998f8bd333..c3b63aac56a 100644 --- a/test/fuzz/fuzz-unit-file/directives.swap +++ b/test/fuzz/fuzz-unit-file/directives.swap @@ -4,6 +4,7 @@ AllowedCPUs= AllowedMemoryNodes= AmbientCapabilities= AppArmorProfile= +BPFProgram= BindPaths= BindReadOnlyPaths= BlockIOAccounting=