From 18f573aaf9802ddac494c94a59968e0b34155695 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 5 Sep 2017 19:20:29 +0200 Subject: [PATCH 01/42] core: make sure to dump cgroup context when unit_dump() is called for all unit types For some reason we didn't dump the cgroup context for a number of unit types, including service units. Not sure how this wasn't noticed before... Add this in. --- src/core/mount.c | 1 + src/core/service.c | 2 ++ src/core/socket.c | 2 ++ src/core/swap.c | 1 + 4 files changed, 6 insertions(+) diff --git a/src/core/mount.c b/src/core/mount.c index c3805ee055f..472f54242cd 100644 --- a/src/core/mount.c +++ b/src/core/mount.c @@ -736,6 +736,7 @@ static void mount_dump(Unit *u, FILE *f, const char *prefix) { exec_context_dump(&m->exec_context, f, prefix); kill_context_dump(&m->kill_context, f, prefix); + cgroup_context_dump(&m->cgroup_context, f, prefix); } static int mount_spawn(Mount *m, ExecCommand *c, pid_t *_pid) { diff --git a/src/core/service.c b/src/core/service.c index c9a7222cc64..2144884f9e0 100644 --- a/src/core/service.c +++ b/src/core/service.c @@ -803,6 +803,8 @@ static void service_dump(Unit *u, FILE *f, const char *prefix) { "%sFile Descriptor Store Current: %u\n", prefix, s->n_fd_store_max, prefix, s->n_fd_store); + + cgroup_context_dump(&s->cgroup_context, f, prefix); } static int service_load_pid_file(Service *s, bool may_warn) { diff --git a/src/core/socket.c b/src/core/socket.c index 9d8367e90bd..a82e7d21870 100644 --- a/src/core/socket.c +++ b/src/core/socket.c @@ -852,6 +852,8 @@ static void socket_dump(Unit *u, FILE *f, const char *prefix) { exec_command_dump_list(s->exec_command[c], f, prefix2); } + + cgroup_context_dump(&s->cgroup_context, f, prefix); } static int instance_from_socket(int fd, unsigned nr, char **instance) { diff --git a/src/core/swap.c b/src/core/swap.c index 9553ee16a83..303f62d25af 100644 --- a/src/core/swap.c +++ b/src/core/swap.c @@ -602,6 +602,7 @@ static void swap_dump(Unit *u, FILE 
*f, const char *prefix) { exec_context_dump(&s->exec_context, f, prefix); kill_context_dump(&s->kill_context, f, prefix); + cgroup_context_dump(&s->cgroup_context, f, prefix); } static int swap_spawn(Swap *s, ExecCommand *c, pid_t *_pid) { From 8b238b13b1fd133826a6eb0515d75a4c501016ae Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Mon, 4 Sep 2017 18:18:04 +0200 Subject: [PATCH 02/42] cgroup-util: minor coding style adjustment --- src/basic/cgroup-util.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/basic/cgroup-util.c b/src/basic/cgroup-util.c index 5dea0789784..f6f4033ad8b 100644 --- a/src/basic/cgroup-util.c +++ b/src/basic/cgroup-util.c @@ -103,9 +103,12 @@ int cg_read_pid(FILE *f, pid_t *_pid) { return 1; } -int cg_read_event(const char *controller, const char *path, const char *event, - char **val) -{ +int cg_read_event( + const char *controller, + const char *path, + const char *event, + char **val) { + _cleanup_free_ char *events = NULL, *content = NULL; char *p, *line; int r; From 7cce4fb7f703fa928caf10e5b4e235625ee4bf80 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 5 Sep 2017 11:17:01 +0200 Subject: [PATCH 03/42] cgroup: always invalidate "cpu" and "cpuacct" together This doesn't really matter, as we never invalidate cpuacct explicitly, and there's no real reason to care for it explicitly, however it's prettier if we always treat cpu and cpuacct as belonging together, the same way we consider "io" and "blkio" to belong together. 
--- src/core/cgroup.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/core/cgroup.c b/src/core/cgroup.c index c806d6b7cb4..9013a0810b3 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -2167,6 +2167,9 @@ void unit_invalidate_cgroup(Unit *u, CGroupMask m) { if (m & (CGROUP_MASK_IO | CGROUP_MASK_BLKIO)) m |= CGROUP_MASK_IO | CGROUP_MASK_BLKIO; + if (m & (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT)) + m |= CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT; + if ((u->cgroup_realized_mask & m) == 0) return; From 10bd3e2e4c21bbf74480e686b9de564a9f6d0a4e Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 5 Sep 2017 11:40:47 +0200 Subject: [PATCH 04/42] manager: watching the cgroup2 inotify fd is safe in test runs too Less deviation between test runs and normal runs is always a good idea, hence enable more stuff that is safe in test runs --- src/core/cgroup.c | 108 ++++++++++++++++++++++------------------------ 1 file changed, 51 insertions(+), 57 deletions(-) diff --git a/src/core/cgroup.c b/src/core/cgroup.c index 9013a0810b3..ffb0f49cd6b 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -1756,6 +1756,7 @@ static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, int manager_setup_cgroup(Manager *m) { _cleanup_free_ char *path = NULL; + const char *scope_path; CGroupController c; int r, all_unified; char *e; @@ -1813,74 +1814,67 @@ int manager_setup_cgroup(Manager *m) { log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER_LEGACY ". File system hierarchy is at %s.", path); } - if (!m->test_run_flags) { - const char *scope_path; + /* 3. Install agent */ + if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) { - /* 3. Install agent */ - if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) { + /* In the unified hierarchy we can get + * cgroup empty notifications via inotify. */ - /* In the unified hierarchy we can get - * cgroup empty notifications via inotify. 
*/ + m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source); + safe_close(m->cgroup_inotify_fd); - m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source); - safe_close(m->cgroup_inotify_fd); + m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC); + if (m->cgroup_inotify_fd < 0) + return log_error_errno(errno, "Failed to create control group inotify object: %m"); - m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC); - if (m->cgroup_inotify_fd < 0) - return log_error_errno(errno, "Failed to create control group inotify object: %m"); - - r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m); - if (r < 0) - return log_error_errno(r, "Failed to watch control group inotify object: %m"); - - /* Process cgroup empty notifications early, but after service notifications and SIGCHLD. Also - * see handling of cgroup agent notifications, for the classic cgroup hierarchy support. */ - r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-5); - if (r < 0) - return log_error_errno(r, "Failed to set priority of inotify event source: %m"); - - (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify"); - - } else if (MANAGER_IS_SYSTEM(m)) { - - /* On the legacy hierarchy we only get - * notifications via cgroup agents. (Which - * isn't really reliable, since it does not - * generate events when control groups with - * children run empty. */ - - r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH); - if (r < 0) - log_warning_errno(r, "Failed to install release agent, ignoring: %m"); - else if (r > 0) - log_debug("Installed release agent."); - else if (r == 0) - log_debug("Release agent already installed."); - } - - /* 4. Make sure we are in the special "init.scope" unit in the root slice. 
*/ - scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE); - r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0); + r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m); if (r < 0) - return log_error_errno(r, "Failed to create %s control group: %m", scope_path); + return log_error_errno(r, "Failed to watch control group inotify object: %m"); - /* also, move all other userspace processes remaining - * in the root cgroup into that scope. */ - r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, 0); + /* Process cgroup empty notifications early, but after service notifications and SIGCHLD. Also + * see handling of cgroup agent notifications, for the classic cgroup hierarchy support. */ + r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_NORMAL-5); if (r < 0) - log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m"); + return log_error_errno(r, "Failed to set priority of inotify event source: %m"); - /* 5. And pin it, so that it cannot be unmounted */ - safe_close(m->pin_cgroupfs_fd); - m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK); - if (m->pin_cgroupfs_fd < 0) - return log_error_errno(errno, "Failed to open pin file: %m"); + (void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify"); - /* 6. Always enable hierarchical support if it exists... */ - if (!all_unified) - (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1"); + } else if (MANAGER_IS_SYSTEM(m) && m->test_run_flags == 0) { + + /* On the legacy hierarchy we only get notifications via cgroup agents. (Which isn't really reliable, + * since it does not generate events when control groups with children run empty. 
*/ + + r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH); + if (r < 0) + log_warning_errno(r, "Failed to install release agent, ignoring: %m"); + else if (r > 0) + log_debug("Installed release agent."); + else if (r == 0) + log_debug("Release agent already installed."); } + /* 4. Make sure we are in the special "init.scope" unit in the root slice. */ + scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE); + r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0); + if (r < 0) + return log_error_errno(r, "Failed to create %s control group: %m", scope_path); + + /* also, move all other userspace processes remaining + * in the root cgroup into that scope. */ + r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, 0); + if (r < 0) + log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m"); + + /* 5. And pin it, so that it cannot be unmounted */ + safe_close(m->pin_cgroupfs_fd); + m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK); + if (m->pin_cgroupfs_fd < 0) + return log_error_errno(errno, "Failed to open pin file: %m"); + + /* 6. Always enable hierarchical support if it exists... */ + if (!all_unified && m->test_run_flags == 0) + (void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1"); + /* 7. Figure out which controllers are supported */ r = cg_mask_supported(&m->cgroup_supported); if (r < 0) From bd389aa73473ee1a72a28d4612ac0e5a190cfea6 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Mon, 4 Sep 2017 18:19:07 +0200 Subject: [PATCH 05/42] manager: initialize timeouts when allocating a naked Manager object This way we can safely run manager objects from tests and good timeouts apply. Without this all timeouts are set to 0, which means they fire instantly, when run from tests which do not explicitly configure them (the way main.c does). 
--- src/core/manager.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/core/manager.c b/src/core/manager.c index 46036aa50c7..032e75d7985 100644 --- a/src/core/manager.c +++ b/src/core/manager.c @@ -616,6 +616,9 @@ int manager_new(UnitFileScope scope, unsigned test_run_flags, Manager **_m) { m->default_timer_accuracy_usec = USEC_PER_MINUTE; m->default_tasks_accounting = true; m->default_tasks_max = UINT64_MAX; + m->default_timeout_start_usec = DEFAULT_TIMEOUT_USEC; + m->default_timeout_stop_usec = DEFAULT_TIMEOUT_USEC; + m->default_restart_usec = DEFAULT_RESTART_USEC; #ifdef ENABLE_EFI if (MANAGER_IS_SYSTEM(m) && detect_container() <= 0) From 5a941f5f212f38501836ccfe6164feb518b6f471 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 1 Sep 2017 14:40:02 +0200 Subject: [PATCH 06/42] in-addr-util: be more systematic with naming our functions Let's rename all our functions that process IPv4 in_addr structures to in4_addr_xyz(), following the already established naming logic for this. Leave the in_addr_xyz() prefix for functions that process the IPv4/IPv6 in_addr_union union instead. 
--- src/basic/in-addr-util.c | 14 +++++++------- src/basic/in-addr-util.h | 8 ++++---- src/libsystemd-network/sd-dhcp-lease.c | 4 ++-- src/libsystemd-network/sd-dhcp-server.c | 2 +- src/network/networkd-address.c | 2 +- src/network/networkd-dhcp4.c | 6 +++--- src/shared/firewall-util.c | 4 ++-- 7 files changed, 20 insertions(+), 20 deletions(-) diff --git a/src/basic/in-addr-util.c b/src/basic/in-addr-util.c index d52fdad3ac9..d6b0a9634ac 100644 --- a/src/basic/in-addr-util.c +++ b/src/basic/in-addr-util.c @@ -371,13 +371,13 @@ int in_addr_ifindex_from_string_auto(const char *s, int *family, union in_addr_u return r; } -unsigned char in_addr_netmask_to_prefixlen(const struct in_addr *addr) { +unsigned char in4_addr_netmask_to_prefixlen(const struct in_addr *addr) { assert(addr); return 32 - u32ctz(be32toh(addr->s_addr)); } -struct in_addr* in_addr_prefixlen_to_netmask(struct in_addr *addr, unsigned char prefixlen) { +struct in_addr* in4_addr_prefixlen_to_netmask(struct in_addr *addr, unsigned char prefixlen) { assert(addr); assert(prefixlen <= 32); @@ -390,7 +390,7 @@ struct in_addr* in_addr_prefixlen_to_netmask(struct in_addr *addr, unsigned char return addr; } -int in_addr_default_prefixlen(const struct in_addr *addr, unsigned char *prefixlen) { +int in4_addr_default_prefixlen(const struct in_addr *addr, unsigned char *prefixlen) { uint8_t msb_octet = *(uint8_t*) addr; /* addr may not be aligned, so make sure we only access it byte-wise */ @@ -414,18 +414,18 @@ int in_addr_default_prefixlen(const struct in_addr *addr, unsigned char *prefixl return 0; } -int in_addr_default_subnet_mask(const struct in_addr *addr, struct in_addr *mask) { +int in4_addr_default_subnet_mask(const struct in_addr *addr, struct in_addr *mask) { unsigned char prefixlen; int r; assert(addr); assert(mask); - r = in_addr_default_prefixlen(addr, &prefixlen); + r = in4_addr_default_prefixlen(addr, &prefixlen); if (r < 0) return r; - in_addr_prefixlen_to_netmask(mask, prefixlen); + 
in4_addr_prefixlen_to_netmask(mask, prefixlen); return 0; } @@ -435,7 +435,7 @@ int in_addr_mask(int family, union in_addr_union *addr, unsigned char prefixlen) if (family == AF_INET) { struct in_addr mask; - if (!in_addr_prefixlen_to_netmask(&mask, prefixlen)) + if (!in4_addr_prefixlen_to_netmask(&mask, prefixlen)) return -EINVAL; addr->in.s_addr &= mask.s_addr; diff --git a/src/basic/in-addr-util.h b/src/basic/in-addr-util.h index 14e27246b59..80aad0833c3 100644 --- a/src/basic/in-addr-util.h +++ b/src/basic/in-addr-util.h @@ -55,10 +55,10 @@ int in_addr_ifindex_to_string(int family, const union in_addr_union *u, int ifin int in_addr_from_string(int family, const char *s, union in_addr_union *ret); int in_addr_from_string_auto(const char *s, int *family, union in_addr_union *ret); int in_addr_ifindex_from_string_auto(const char *s, int *family, union in_addr_union *ret, int *ifindex); -unsigned char in_addr_netmask_to_prefixlen(const struct in_addr *addr); -struct in_addr* in_addr_prefixlen_to_netmask(struct in_addr *addr, unsigned char prefixlen); -int in_addr_default_prefixlen(const struct in_addr *addr, unsigned char *prefixlen); -int in_addr_default_subnet_mask(const struct in_addr *addr, struct in_addr *mask); +unsigned char in4_addr_netmask_to_prefixlen(const struct in_addr *addr); +struct in_addr* in4_addr_prefixlen_to_netmask(struct in_addr *addr, unsigned char prefixlen); +int in4_addr_default_prefixlen(const struct in_addr *addr, unsigned char *prefixlen); +int in4_addr_default_subnet_mask(const struct in_addr *addr, struct in_addr *mask); int in_addr_mask(int family, union in_addr_union *addr, unsigned char prefixlen); int in_addr_prefix_from_string(const char *p, int family, union in_addr_union *ret_prefix, uint8_t *ret_prefixlen); diff --git a/src/libsystemd-network/sd-dhcp-lease.c b/src/libsystemd-network/sd-dhcp-lease.c index 6f0e51720a1..1ab569765d8 100644 --- a/src/libsystemd-network/sd-dhcp-lease.c +++ b/src/libsystemd-network/sd-dhcp-lease.c @@ 
-471,7 +471,7 @@ static int lease_parse_routes( struct sd_dhcp_route *route = *routes + *routes_size; int r; - r = in_addr_default_prefixlen((struct in_addr*) option, &route->dst_prefixlen); + r = in4_addr_default_prefixlen((struct in_addr*) option, &route->dst_prefixlen); if (r < 0) { log_debug("Failed to determine destination prefix length from class based IP, ignoring"); continue; @@ -1253,7 +1253,7 @@ int dhcp_lease_set_default_subnet_mask(sd_dhcp_lease *lease) { address.s_addr = lease->address; /* fall back to the default subnet masks based on address class */ - r = in_addr_default_subnet_mask(&address, &mask); + r = in4_addr_default_subnet_mask(&address, &mask); if (r < 0) return r; diff --git a/src/libsystemd-network/sd-dhcp-server.c b/src/libsystemd-network/sd-dhcp-server.c index 5a59c377f8c..727cc16ab56 100644 --- a/src/libsystemd-network/sd-dhcp-server.c +++ b/src/libsystemd-network/sd-dhcp-server.c @@ -56,7 +56,7 @@ int sd_dhcp_server_configure_pool(sd_dhcp_server *server, struct in_addr *addres assert_return(address->s_addr != INADDR_ANY, -EINVAL); assert_return(prefixlen <= 32, -ERANGE); - assert_se(in_addr_prefixlen_to_netmask(&netmask_addr, prefixlen)); + assert_se(in4_addr_prefixlen_to_netmask(&netmask_addr, prefixlen)); netmask = netmask_addr.s_addr; server_off = be32toh(address->s_addr & ~netmask); diff --git a/src/network/networkd-address.c b/src/network/networkd-address.c index 7f536b4ba9a..8f625975fbb 100644 --- a/src/network/networkd-address.c +++ b/src/network/networkd-address.c @@ -768,7 +768,7 @@ int config_parse_address(const char *unit, } if (!e && f == AF_INET) { - r = in_addr_default_prefixlen(&buffer.in, &n->prefixlen); + r = in4_addr_default_prefixlen(&buffer.in, &n->prefixlen); if (r < 0) { log_syntax(unit, LOG_ERR, filename, line, r, "Prefix length not specified, and a default one can not be deduced for '%s', ignoring assignment", address); return 0; diff --git a/src/network/networkd-dhcp4.c b/src/network/networkd-dhcp4.c index 
9c69979c7bb..3b5bacd13bf 100644 --- a/src/network/networkd-dhcp4.c +++ b/src/network/networkd-dhcp4.c @@ -237,7 +237,7 @@ static int dhcp_lease_lost(Link *link) { if (r >= 0) { r = sd_dhcp_lease_get_netmask(link->dhcp_lease, &netmask); if (r >= 0) - prefixlen = in_addr_netmask_to_prefixlen(&netmask); + prefixlen = in4_addr_netmask_to_prefixlen(&netmask); address->family = AF_INET; address->in_addr.in = addr; @@ -316,7 +316,7 @@ static int dhcp4_update_address(Link *link, assert(netmask); assert(lifetime); - prefixlen = in_addr_netmask_to_prefixlen(netmask); + prefixlen = in4_addr_netmask_to_prefixlen(netmask); r = address_new(&addr); if (r < 0) @@ -406,7 +406,7 @@ static int dhcp_lease_acquired(sd_dhcp_client *client, Link *link) { if (r < 0) return log_link_error_errno(link, r, "DHCP error: No netmask: %m"); - prefixlen = in_addr_netmask_to_prefixlen(&netmask); + prefixlen = in4_addr_netmask_to_prefixlen(&netmask); r = sd_dhcp_lease_get_router(lease, &gateway); if (r < 0 && r != -ENODATA) diff --git a/src/shared/firewall-util.c b/src/shared/firewall-util.c index 952fc48c459..3a6e987ee18 100644 --- a/src/shared/firewall-util.c +++ b/src/shared/firewall-util.c @@ -72,7 +72,7 @@ static int entry_fill_basics( } if (source) { entry->ip.src = source->in; - in_addr_prefixlen_to_netmask(&entry->ip.smsk, source_prefixlen); + in4_addr_prefixlen_to_netmask(&entry->ip.smsk, source_prefixlen); } if (out_interface) { @@ -84,7 +84,7 @@ static int entry_fill_basics( } if (destination) { entry->ip.dst = destination->in; - in_addr_prefixlen_to_netmask(&entry->ip.dmsk, destination_prefixlen); + in4_addr_prefixlen_to_netmask(&entry->ip.dmsk, destination_prefixlen); } return 0; From 4e2d5273619f25125851bb85c20fae5ffe389ce4 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 1 Sep 2017 14:08:16 +0200 Subject: [PATCH 07/42] in-addr-util: prefix return parameters with ret_ --- src/basic/in-addr-util.c | 10 +++++----- src/basic/in-addr-util.h | 2 +- 2 files changed, 6 
insertions(+), 6 deletions(-) diff --git a/src/basic/in-addr-util.c b/src/basic/in-addr-util.c index d6b0a9634ac..2198783ec78 100644 --- a/src/basic/in-addr-util.c +++ b/src/basic/in-addr-util.c @@ -308,22 +308,22 @@ int in_addr_from_string(int family, const char *s, union in_addr_union *ret) { return 0; } -int in_addr_from_string_auto(const char *s, int *family, union in_addr_union *ret) { +int in_addr_from_string_auto(const char *s, int *ret_family, union in_addr_union *ret) { int r; assert(s); r = in_addr_from_string(AF_INET, s, ret); if (r >= 0) { - if (family) - *family = AF_INET; + if (ret_family) + *ret_family = AF_INET; return 0; } r = in_addr_from_string(AF_INET6, s, ret); if (r >= 0) { - if (family) - *family = AF_INET6; + if (ret_family) + *ret_family = AF_INET6; return 0; } diff --git a/src/basic/in-addr-util.h b/src/basic/in-addr-util.h index 80aad0833c3..8401d662e69 100644 --- a/src/basic/in-addr-util.h +++ b/src/basic/in-addr-util.h @@ -53,7 +53,7 @@ int in_addr_prefix_next(int family, union in_addr_union *u, unsigned prefixlen); int in_addr_to_string(int family, const union in_addr_union *u, char **ret); int in_addr_ifindex_to_string(int family, const union in_addr_union *u, int ifindex, char **ret); int in_addr_from_string(int family, const char *s, union in_addr_union *ret); -int in_addr_from_string_auto(const char *s, int *family, union in_addr_union *ret); +int in_addr_from_string_auto(const char *s, int *ret_family, union in_addr_union *ret); int in_addr_ifindex_from_string_auto(const char *s, int *family, union in_addr_union *ret, int *ifindex); unsigned char in4_addr_netmask_to_prefixlen(const struct in_addr *addr); struct in_addr* in4_addr_prefixlen_to_netmask(struct in_addr *addr, unsigned char prefixlen); From f4912f3a74a1f55be371a515418083bd5d05169c Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 1 Sep 2017 14:25:59 +0200 Subject: [PATCH 08/42] in-addr-util: add new helper call in_addr_prefix_from_string_auto() This is much 
like in_addr_prefix_from_string(), but automatically determines whether IPv4 or IPv6 addresses are specified. Also adds a test for it. --- src/basic/in-addr-util.c | 88 ++++++++++++++++++++++++++++++------ src/basic/in-addr-util.h | 4 +- src/test/meson.build | 4 ++ src/test/test-in-addr-util.c | 75 ++++++++++++++++++++++++++++++ 4 files changed, 155 insertions(+), 16 deletions(-) create mode 100644 src/test/test-in-addr-util.c diff --git a/src/basic/in-addr-util.c b/src/basic/in-addr-util.c index 2198783ec78..94f06258aac 100644 --- a/src/basic/in-addr-util.c +++ b/src/basic/in-addr-util.c @@ -465,10 +465,33 @@ int in_addr_mask(int family, union in_addr_union *addr, unsigned char prefixlen) return -EAFNOSUPPORT; } -int in_addr_prefix_from_string(const char *p, int family, union in_addr_union *ret_prefix, uint8_t *ret_prefixlen) { +int in_addr_parse_prefixlen(int family, const char *p, unsigned char *ret) { + uint8_t u; + int r; + + if (!IN_SET(family, AF_INET, AF_INET6)) + return -EAFNOSUPPORT; + + r = safe_atou8(p, &u); + if (r < 0) + return r; + + if (u > FAMILY_ADDRESS_SIZE(family) * 8) + return -ERANGE; + + *ret = u; + return 0; +} + +int in_addr_prefix_from_string( + const char *p, + int family, + union in_addr_union *ret_prefix, + unsigned char *ret_prefixlen) { + union in_addr_union buffer; const char *e, *l; - uint8_t k; + unsigned char k; int r; assert(p); @@ -486,23 +509,58 @@ int in_addr_prefix_from_string(const char *p, int family, union in_addr_union *r if (r < 0) return r; - k = FAMILY_ADDRESS_SIZE(family) * 8; - if (e) { - uint8_t n; - - r = safe_atou8(e + 1, &n); + r = in_addr_parse_prefixlen(family, e+1, &k); if (r < 0) return r; + } else + k = FAMILY_ADDRESS_SIZE(family) * 8; - if (n > k) - return -ERANGE; - - k = n; - } - - *ret_prefix = buffer; - *ret_prefixlen = k; + if (ret_prefix) + *ret_prefix = buffer; + if (ret_prefixlen) + *ret_prefixlen = k; return 0; } + +int in_addr_prefix_from_string_auto( + const char *p, + int *ret_family, + union 
in_addr_union *ret_prefix, + unsigned char *ret_prefixlen) { + + union in_addr_union buffer; + const char *e, *l; + unsigned char k; + int family, r; + + assert(p); + + e = strchr(p, '/'); + if (e) + l = strndupa(p, e - p); + else + l = p; + + r = in_addr_from_string_auto(l, &family, &buffer); + if (r < 0) + return r; + + if (e) { + r = in_addr_parse_prefixlen(family, e+1, &k); + if (r < 0) + return r; + } else + k = FAMILY_ADDRESS_SIZE(family) * 8; + + if (ret_family) + *ret_family = family; + if (ret_prefix) + *ret_prefix = buffer; + if (ret_prefixlen) + *ret_prefixlen = k; + + return 0; + +} diff --git a/src/basic/in-addr-util.h b/src/basic/in-addr-util.h index 8401d662e69..bb57c089bfd 100644 --- a/src/basic/in-addr-util.h +++ b/src/basic/in-addr-util.h @@ -60,7 +60,9 @@ struct in_addr* in4_addr_prefixlen_to_netmask(struct in_addr *addr, unsigned cha int in4_addr_default_prefixlen(const struct in_addr *addr, unsigned char *prefixlen); int in4_addr_default_subnet_mask(const struct in_addr *addr, struct in_addr *mask); int in_addr_mask(int family, union in_addr_union *addr, unsigned char prefixlen); -int in_addr_prefix_from_string(const char *p, int family, union in_addr_union *ret_prefix, uint8_t *ret_prefixlen); +int in_addr_parse_prefixlen(int family, const char *p, unsigned char *ret); +int in_addr_prefix_from_string(const char *p, int family, union in_addr_union *ret_prefix, unsigned char *ret_prefixlen); +int in_addr_prefix_from_string_auto(const char *p, int *ret_family, union in_addr_union *ret_prefix, unsigned char *ret_prefixlen); static inline size_t FAMILY_ADDRESS_SIZE(int family) { assert(family == AF_INET || family == AF_INET6); diff --git a/src/test/meson.build b/src/test/meson.build index 57f76559a7f..b1543cdcd2e 100644 --- a/src/test/meson.build +++ b/src/test/meson.build @@ -277,6 +277,10 @@ tests += [ [], []], + [['src/test/test-in-addr-util.c'], + [], + []], + [['src/test/test-barrier.c'], [], []], diff --git a/src/test/test-in-addr-util.c 
b/src/test/test-in-addr-util.c new file mode 100644 index 00000000000..8b7a1229fee --- /dev/null +++ b/src/test/test-in-addr-util.c @@ -0,0 +1,75 @@ +/*** + This file is part of systemd + + Copyright 2017 Lennart Poettering + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see . +***/ + +#include + +#include "in-addr-util.h" + +static void test_in_addr_prefix_from_string(const char *p, int family, int ret, const union in_addr_union *u, unsigned char prefixlen) { + union in_addr_union q; + unsigned char l; + int r; + + r = in_addr_prefix_from_string(p, family, &q, &l); + assert_se(r == ret); + + if (r >= 0) { + int f; + + assert_se(in_addr_equal(family, &q, u)); + assert_se(l == prefixlen); + + r = in_addr_prefix_from_string_auto(p, &f, &q, &l); + assert_se(r >= 0); + + assert_se(f == family); + assert_se(in_addr_equal(family, &q, u)); + assert_se(l == prefixlen); + } +} + +int main(int argc, char *argv[]) { + test_in_addr_prefix_from_string("", AF_INET, -EINVAL, NULL, 0); + test_in_addr_prefix_from_string("/", AF_INET, -EINVAL, NULL, 0); + test_in_addr_prefix_from_string("/8", AF_INET, -EINVAL, NULL, 0); + test_in_addr_prefix_from_string("1.2.3.4", AF_INET, 0, &(union in_addr_union) { .in = (struct in_addr) { .s_addr = htobe32(0x01020304) } }, 32); + test_in_addr_prefix_from_string("1.2.3.4/0", AF_INET, 0, &(union in_addr_union) { .in = (struct in_addr) { .s_addr = htobe32(0x01020304) } }, 0); + 
test_in_addr_prefix_from_string("1.2.3.4/1", AF_INET, 0, &(union in_addr_union) { .in = (struct in_addr) { .s_addr = htobe32(0x01020304) } }, 1); + test_in_addr_prefix_from_string("1.2.3.4/2", AF_INET, 0, &(union in_addr_union) { .in = (struct in_addr) { .s_addr = htobe32(0x01020304) } }, 2); + test_in_addr_prefix_from_string("1.2.3.4/32", AF_INET, 0, &(union in_addr_union) { .in = (struct in_addr) { .s_addr = htobe32(0x01020304) } }, 32); + test_in_addr_prefix_from_string("1.2.3.4/33", AF_INET, -ERANGE, NULL, 0); + test_in_addr_prefix_from_string("1.2.3.4/-1", AF_INET, -ERANGE, NULL, 0); + test_in_addr_prefix_from_string("::1", AF_INET, -EINVAL, NULL, 0); + + test_in_addr_prefix_from_string("", AF_INET6, -EINVAL, NULL, 0); + test_in_addr_prefix_from_string("/", AF_INET6, -EINVAL, NULL, 0); + test_in_addr_prefix_from_string("/8", AF_INET6, -EINVAL, NULL, 0); + test_in_addr_prefix_from_string("::1", AF_INET6, 0, &(union in_addr_union) { .in6 = IN6ADDR_LOOPBACK_INIT }, 128); + test_in_addr_prefix_from_string("::1/0", AF_INET6, 0, &(union in_addr_union) { .in6 = IN6ADDR_LOOPBACK_INIT }, 0); + test_in_addr_prefix_from_string("::1/1", AF_INET6, 0, &(union in_addr_union) { .in6 = IN6ADDR_LOOPBACK_INIT }, 1); + test_in_addr_prefix_from_string("::1/2", AF_INET6, 0, &(union in_addr_union) { .in6 = IN6ADDR_LOOPBACK_INIT }, 2); + test_in_addr_prefix_from_string("::1/32", AF_INET6, 0, &(union in_addr_union) { .in6 = IN6ADDR_LOOPBACK_INIT }, 32); + test_in_addr_prefix_from_string("::1/33", AF_INET6, 0, &(union in_addr_union) { .in6 = IN6ADDR_LOOPBACK_INIT }, 33); + test_in_addr_prefix_from_string("::1/64", AF_INET6, 0, &(union in_addr_union) { .in6 = IN6ADDR_LOOPBACK_INIT }, 64); + test_in_addr_prefix_from_string("::1/128", AF_INET6, 0, &(union in_addr_union) { .in6 = IN6ADDR_LOOPBACK_INIT }, 128); + test_in_addr_prefix_from_string("::1/129", AF_INET6, -ERANGE, NULL, 0); + test_in_addr_prefix_from_string("::1/-1", AF_INET6, -ERANGE, NULL, 0); + + return 0; +} From 
3f0c2342c04663d7d86748d815095df889212c15 Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Fri, 28 Oct 2016 17:37:54 +0200 Subject: [PATCH 09/42] build-sys: add new kernel bpf.h drop-in The defines we need are pretty comprehensive and new, hence copy in the full header from the kernel. --- src/shared/linux/bpf.h | 673 ++++++++++++++++++++++++++++++++++ src/shared/linux/bpf_common.h | 55 +++ src/shared/linux/libbpf.h | 198 ++++++++++ 3 files changed, 926 insertions(+) create mode 100644 src/shared/linux/bpf.h create mode 100644 src/shared/linux/bpf_common.h create mode 100644 src/shared/linux/libbpf.h diff --git a/src/shared/linux/bpf.h b/src/shared/linux/bpf.h new file mode 100644 index 00000000000..8477b446097 --- /dev/null +++ b/src/shared/linux/bpf.h @@ -0,0 +1,673 @@ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ */ +#ifndef __LINUX_BPF_H__ +#define __LINUX_BPF_H__ + +#include +#include + +/* Extended instruction set based on top of classic BPF */ + +/* instruction classes */ +#define BPF_ALU64 0x07 /* alu mode in double word width */ + +/* ld/ldx fields */ +#define BPF_DW 0x18 /* double word */ +#define BPF_XADD 0xc0 /* exclusive add */ + +/* alu/jmp fields */ +#define BPF_MOV 0xb0 /* mov reg to reg */ +#define BPF_ARSH 0xc0 /* sign extending arithmetic shift right */ + +/* change endianness of a register */ +#define BPF_END 0xd0 /* flags for endianness conversion: */ +#define BPF_TO_LE 0x00 /* convert to little-endian */ +#define BPF_TO_BE 0x08 /* convert to big-endian */ +#define BPF_FROM_LE BPF_TO_LE +#define BPF_FROM_BE BPF_TO_BE + +#define BPF_JNE 0x50 /* jump != */ +#define BPF_JSGT 0x60 /* SGT is signed '>', GT in x86 */ +#define BPF_JSGE 0x70 /* SGE is signed '>=', GE in x86 */ +#define BPF_CALL 0x80 /* function call */ +#define BPF_EXIT 0x90 /* function return */ + +/* Register numbers */ +enum { + BPF_REG_0 = 0, + BPF_REG_1, + BPF_REG_2, + BPF_REG_3, + BPF_REG_4, + BPF_REG_5, + BPF_REG_6, + BPF_REG_7, + BPF_REG_8, + BPF_REG_9, + BPF_REG_10, + __MAX_BPF_REG, +}; + +/* BPF has 10 general purpose 64-bit registers and stack frame. */ +#define MAX_BPF_REG __MAX_BPF_REG + +struct bpf_insn { + __u8 code; /* opcode */ + __u8 dst_reg:4; /* dest register */ + __u8 src_reg:4; /* source register */ + __s16 off; /* signed offset */ + __s32 imm; /* signed immediate constant */ +}; + +/* Key of an a BPF_MAP_TYPE_LPM_TRIE entry */ +struct bpf_lpm_trie_key { + __u32 prefixlen; /* up to 32 for AF_INET, 128 for AF_INET6 */ + __u8 data[0]; /* Arbitrary size */ +}; + +/* BPF syscall commands, see bpf(2) man-page for details. 
*/ +enum bpf_cmd { + BPF_MAP_CREATE, + BPF_MAP_LOOKUP_ELEM, + BPF_MAP_UPDATE_ELEM, + BPF_MAP_DELETE_ELEM, + BPF_MAP_GET_NEXT_KEY, + BPF_PROG_LOAD, + BPF_OBJ_PIN, + BPF_OBJ_GET, + BPF_PROG_ATTACH, + BPF_PROG_DETACH, + BPF_PROG_TEST_RUN, +}; + +enum bpf_map_type { + BPF_MAP_TYPE_UNSPEC, + BPF_MAP_TYPE_HASH, + BPF_MAP_TYPE_ARRAY, + BPF_MAP_TYPE_PROG_ARRAY, + BPF_MAP_TYPE_PERF_EVENT_ARRAY, + BPF_MAP_TYPE_PERCPU_HASH, + BPF_MAP_TYPE_PERCPU_ARRAY, + BPF_MAP_TYPE_STACK_TRACE, + BPF_MAP_TYPE_CGROUP_ARRAY, + BPF_MAP_TYPE_LRU_HASH, + BPF_MAP_TYPE_LRU_PERCPU_HASH, + BPF_MAP_TYPE_LPM_TRIE, + BPF_MAP_TYPE_ARRAY_OF_MAPS, + BPF_MAP_TYPE_HASH_OF_MAPS, +}; + +enum bpf_prog_type { + BPF_PROG_TYPE_UNSPEC, + BPF_PROG_TYPE_SOCKET_FILTER, + BPF_PROG_TYPE_KPROBE, + BPF_PROG_TYPE_SCHED_CLS, + BPF_PROG_TYPE_SCHED_ACT, + BPF_PROG_TYPE_TRACEPOINT, + BPF_PROG_TYPE_XDP, + BPF_PROG_TYPE_PERF_EVENT, + BPF_PROG_TYPE_CGROUP_SKB, + BPF_PROG_TYPE_CGROUP_SOCK, + BPF_PROG_TYPE_LWT_IN, + BPF_PROG_TYPE_LWT_OUT, + BPF_PROG_TYPE_LWT_XMIT, +}; + +enum bpf_attach_type { + BPF_CGROUP_INET_INGRESS, + BPF_CGROUP_INET_EGRESS, + BPF_CGROUP_INET_SOCK_CREATE, + __MAX_BPF_ATTACH_TYPE +}; + +#define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE + +/* If BPF_F_ALLOW_OVERRIDE flag is used in BPF_PROG_ATTACH command + * to the given target_fd cgroup the descendent cgroup will be able to + * override effective bpf program that was inherited from this cgroup + */ +#define BPF_F_ALLOW_OVERRIDE (1U << 0) + +/* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the + * verifier will perform strict alignment checking as if the kernel + * has been built with CONFIG_EFFICIENT_UNALIGNED_ACCESS not set, + * and NET_IP_ALIGN defined to 2. 
+ */ +#define BPF_F_STRICT_ALIGNMENT (1U << 0) + +#define BPF_PSEUDO_MAP_FD 1 + +/* flags for BPF_MAP_UPDATE_ELEM command */ +#define BPF_ANY 0 /* create new element or update existing */ +#define BPF_NOEXIST 1 /* create new element if it didn't exist */ +#define BPF_EXIST 2 /* update existing element */ + +#define BPF_F_NO_PREALLOC (1U << 0) +/* Instead of having one common LRU list in the + * BPF_MAP_TYPE_LRU_[PERCPU_]HASH map, use a percpu LRU list + * which can scale and perform better. + * Note, the LRU nodes (including free nodes) cannot be moved + * across different LRU lists. + */ +#define BPF_F_NO_COMMON_LRU (1U << 1) + +union bpf_attr { + struct { /* anonymous struct used by BPF_MAP_CREATE command */ + __u32 map_type; /* one of enum bpf_map_type */ + __u32 key_size; /* size of key in bytes */ + __u32 value_size; /* size of value in bytes */ + __u32 max_entries; /* max number of entries in a map */ + __u32 map_flags; /* prealloc or not */ + __u32 inner_map_fd; /* fd pointing to the inner map */ + }; + + struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ + __u32 map_fd; + __aligned_u64 key; + union { + __aligned_u64 value; + __aligned_u64 next_key; + }; + __u64 flags; + }; + + struct { /* anonymous struct used by BPF_PROG_LOAD command */ + __u32 prog_type; /* one of enum bpf_prog_type */ + __u32 insn_cnt; + __aligned_u64 insns; + __aligned_u64 license; + __u32 log_level; /* verbosity level of verifier */ + __u32 log_size; /* size of user buffer */ + __aligned_u64 log_buf; /* user supplied buffer */ + __u32 kern_version; /* checked when prog_type=kprobe */ + __u32 prog_flags; + }; + + struct { /* anonymous struct used by BPF_OBJ_* commands */ + __aligned_u64 pathname; + __u32 bpf_fd; + }; + + struct { /* anonymous struct used by BPF_PROG_ATTACH/DETACH commands */ + __u32 target_fd; /* container object to attach to */ + __u32 attach_bpf_fd; /* eBPF program to attach */ + __u32 attach_type; + __u32 attach_flags; + }; + + struct { /* anonymous 
struct used by BPF_PROG_TEST_RUN command */ + __u32 prog_fd; + __u32 retval; + __u32 data_size_in; + __u32 data_size_out; + __aligned_u64 data_in; + __aligned_u64 data_out; + __u32 repeat; + __u32 duration; + } test; +} __attribute__((aligned(8))); + +/* BPF helper function descriptions: + * + * void *bpf_map_lookup_elem(&map, &key) + * Return: Map value or NULL + * + * int bpf_map_update_elem(&map, &key, &value, flags) + * Return: 0 on success or negative error + * + * int bpf_map_delete_elem(&map, &key) + * Return: 0 on success or negative error + * + * int bpf_probe_read(void *dst, int size, void *src) + * Return: 0 on success or negative error + * + * u64 bpf_ktime_get_ns(void) + * Return: current ktime + * + * int bpf_trace_printk(const char *fmt, int fmt_size, ...) + * Return: length of buffer written or negative error + * + * u32 bpf_prandom_u32(void) + * Return: random value + * + * u32 bpf_raw_smp_processor_id(void) + * Return: SMP processor ID + * + * int bpf_skb_store_bytes(skb, offset, from, len, flags) + * store bytes into packet + * @skb: pointer to skb + * @offset: offset within packet from skb->mac_header + * @from: pointer where to copy bytes from + * @len: number of bytes to store into packet + * @flags: bit 0 - if true, recompute skb->csum + * other bits - reserved + * Return: 0 on success or negative error + * + * int bpf_l3_csum_replace(skb, offset, from, to, flags) + * recompute IP checksum + * @skb: pointer to skb + * @offset: offset within packet where IP checksum is located + * @from: old value of header field + * @to: new value of header field + * @flags: bits 0-3 - size of header field + * other bits - reserved + * Return: 0 on success or negative error + * + * int bpf_l4_csum_replace(skb, offset, from, to, flags) + * recompute TCP/UDP checksum + * @skb: pointer to skb + * @offset: offset within packet where TCP/UDP checksum is located + * @from: old value of header field + * @to: new value of header field + * @flags: bits 0-3 - size of 
header field + * bit 4 - is pseudo header + * other bits - reserved + * Return: 0 on success or negative error + * + * int bpf_tail_call(ctx, prog_array_map, index) + * jump into another BPF program + * @ctx: context pointer passed to next program + * @prog_array_map: pointer to map which type is BPF_MAP_TYPE_PROG_ARRAY + * @index: index inside array that selects specific program to run + * Return: 0 on success or negative error + * + * int bpf_clone_redirect(skb, ifindex, flags) + * redirect to another netdev + * @skb: pointer to skb + * @ifindex: ifindex of the net device + * @flags: bit 0 - if set, redirect to ingress instead of egress + * other bits - reserved + * Return: 0 on success or negative error + * + * u64 bpf_get_current_pid_tgid(void) + * Return: current->tgid << 32 | current->pid + * + * u64 bpf_get_current_uid_gid(void) + * Return: current_gid << 32 | current_uid + * + * int bpf_get_current_comm(char *buf, int size_of_buf) + * stores current->comm into buf + * Return: 0 on success or negative error + * + * u32 bpf_get_cgroup_classid(skb) + * retrieve a proc's classid + * @skb: pointer to skb + * Return: classid if != 0 + * + * int bpf_skb_vlan_push(skb, vlan_proto, vlan_tci) + * Return: 0 on success or negative error + * + * int bpf_skb_vlan_pop(skb) + * Return: 0 on success or negative error + * + * int bpf_skb_get_tunnel_key(skb, key, size, flags) + * int bpf_skb_set_tunnel_key(skb, key, size, flags) + * retrieve or populate tunnel metadata + * @skb: pointer to skb + * @key: pointer to 'struct bpf_tunnel_key' + * @size: size of 'struct bpf_tunnel_key' + * @flags: room for future extensions + * Return: 0 on success or negative error + * + * u64 bpf_perf_event_read(&map, index) + * Return: Number events read or error code + * + * int bpf_redirect(ifindex, flags) + * redirect to another netdev + * @ifindex: ifindex of the net device + * @flags: bit 0 - if set, redirect to ingress instead of egress + * other bits - reserved + * Return: TC_ACT_REDIRECT 
+ * + * u32 bpf_get_route_realm(skb) + * retrieve a dst's tclassid + * @skb: pointer to skb + * Return: realm if != 0 + * + * int bpf_perf_event_output(ctx, map, index, data, size) + * output perf raw sample + * @ctx: struct pt_regs* + * @map: pointer to perf_event_array map + * @index: index of event in the map + * @data: data on stack to be output as raw data + * @size: size of data + * Return: 0 on success or negative error + * + * int bpf_get_stackid(ctx, map, flags) + * walk user or kernel stack and return id + * @ctx: struct pt_regs* + * @map: pointer to stack_trace map + * @flags: bits 0-7 - number of stack frames to skip + * bit 8 - collect user stack instead of kernel + * bit 9 - compare stacks by hash only + * bit 10 - if two different stacks hash into the same stackid + * discard old + * other bits - reserved + * Return: >= 0 stackid on success or negative error + * + * s64 bpf_csum_diff(from, from_size, to, to_size, seed) + * calculate csum diff + * @from: raw from buffer + * @from_size: length of from buffer + * @to: raw to buffer + * @to_size: length of to buffer + * @seed: optional seed + * Return: csum result or negative error code + * + * int bpf_skb_get_tunnel_opt(skb, opt, size) + * retrieve tunnel options metadata + * @skb: pointer to skb + * @opt: pointer to raw tunnel option data + * @size: size of @opt + * Return: option size + * + * int bpf_skb_set_tunnel_opt(skb, opt, size) + * populate tunnel options metadata + * @skb: pointer to skb + * @opt: pointer to raw tunnel option data + * @size: size of @opt + * Return: 0 on success or negative error + * + * int bpf_skb_change_proto(skb, proto, flags) + * Change protocol of the skb. Currently supported is v4 -> v6, + * v6 -> v4 transitions. The helper will also resize the skb. eBPF + * program is expected to fill the new headers via skb_store_bytes + * and lX_csum_replace. 
+ * @skb: pointer to skb + * @proto: new skb->protocol type + * @flags: reserved + * Return: 0 on success or negative error + * + * int bpf_skb_change_type(skb, type) + * Change packet type of skb. + * @skb: pointer to skb + * @type: new skb->pkt_type type + * Return: 0 on success or negative error + * + * int bpf_skb_under_cgroup(skb, map, index) + * Check cgroup2 membership of skb + * @skb: pointer to skb + * @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type + * @index: index of the cgroup in the bpf_map + * Return: + * == 0 skb failed the cgroup2 descendant test + * == 1 skb succeeded the cgroup2 descendant test + * < 0 error + * + * u32 bpf_get_hash_recalc(skb) + * Retrieve and possibly recalculate skb->hash. + * @skb: pointer to skb + * Return: hash + * + * u64 bpf_get_current_task(void) + * Returns current task_struct + * Return: current + * + * int bpf_probe_write_user(void *dst, void *src, int len) + * safely attempt to write to a location + * @dst: destination address in userspace + * @src: source address on stack + * @len: number of bytes to copy + * Return: 0 on success or negative error + * + * int bpf_current_task_under_cgroup(map, index) + * Check cgroup2 membership of current task + * @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type + * @index: index of the cgroup in the bpf_map + * Return: + * == 0 current failed the cgroup2 descendant test + * == 1 current succeeded the cgroup2 descendant test + * < 0 error + * + * int bpf_skb_change_tail(skb, len, flags) + * The helper will resize the skb to the given new size, to be used f.e. + * with control messages. + * @skb: pointer to skb + * @len: new skb length + * @flags: reserved + * Return: 0 on success or negative error + * + * int bpf_skb_pull_data(skb, len) + * The helper will pull in non-linear data in case the skb is non-linear + * and not all of len are part of the linear section. Only needed for + * read/write with direct packet access. 
+ * @skb: pointer to skb + * @len: len to make read/writeable + * Return: 0 on success or negative error + * + * s64 bpf_csum_update(skb, csum) + * Adds csum into skb->csum in case of CHECKSUM_COMPLETE. + * @skb: pointer to skb + * @csum: csum to add + * Return: csum on success or negative error + * + * void bpf_set_hash_invalid(skb) + * Invalidate current skb->hash. + * @skb: pointer to skb + * + * int bpf_get_numa_node_id() + * Return: Id of current NUMA node. + * + * int bpf_skb_change_head() + * Grows headroom of skb and adjusts MAC header offset accordingly. + * Will extend/reallocate as required automatically. + * May change skb data pointer and will thus invalidate any check + * performed for direct packet access. + * @skb: pointer to skb + * @len: length of header to be pushed in front + * @flags: Flags (unused for now) + * Return: 0 on success or negative error + * + * int bpf_xdp_adjust_head(xdp_md, delta) + * Adjust the xdp_md.data by delta + * @xdp_md: pointer to xdp_md + * @delta: A positive/negative integer to be added to xdp_md.data + * Return: 0 on success or negative on error + * + * int bpf_probe_read_str(void *dst, int size, const void *unsafe_ptr) + * Copy a NUL terminated string from unsafe address. In case the string + * length is smaller than size, the target is not padded with further NUL + * bytes. In case the string length is larger than size, just count-1 + * bytes are copied and the last byte is set to NUL. + * @dst: destination address + * @size: maximum number of bytes to copy, including the trailing NUL + * @unsafe_ptr: unsafe address + * Return: + * > 0 length of the string including the trailing NUL on success + * < 0 error + * + * u64 bpf_get_socket_cookie(skb) + * Get the cookie for the socket stored inside sk_buff. 
+ * @skb: pointer to skb + * Return: 8 Bytes non-decreasing number on success or 0 if the socket + * field is missing inside sk_buff + * + * u32 bpf_get_socket_uid(skb) + * Get the owner uid of the socket stored inside sk_buff. + * @skb: pointer to skb + * Return: uid of the socket owner on success or overflowuid if failed. + */ +#define __BPF_FUNC_MAPPER(FN) \ + FN(unspec), \ + FN(map_lookup_elem), \ + FN(map_update_elem), \ + FN(map_delete_elem), \ + FN(probe_read), \ + FN(ktime_get_ns), \ + FN(trace_printk), \ + FN(get_prandom_u32), \ + FN(get_smp_processor_id), \ + FN(skb_store_bytes), \ + FN(l3_csum_replace), \ + FN(l4_csum_replace), \ + FN(tail_call), \ + FN(clone_redirect), \ + FN(get_current_pid_tgid), \ + FN(get_current_uid_gid), \ + FN(get_current_comm), \ + FN(get_cgroup_classid), \ + FN(skb_vlan_push), \ + FN(skb_vlan_pop), \ + FN(skb_get_tunnel_key), \ + FN(skb_set_tunnel_key), \ + FN(perf_event_read), \ + FN(redirect), \ + FN(get_route_realm), \ + FN(perf_event_output), \ + FN(skb_load_bytes), \ + FN(get_stackid), \ + FN(csum_diff), \ + FN(skb_get_tunnel_opt), \ + FN(skb_set_tunnel_opt), \ + FN(skb_change_proto), \ + FN(skb_change_type), \ + FN(skb_under_cgroup), \ + FN(get_hash_recalc), \ + FN(get_current_task), \ + FN(probe_write_user), \ + FN(current_task_under_cgroup), \ + FN(skb_change_tail), \ + FN(skb_pull_data), \ + FN(csum_update), \ + FN(set_hash_invalid), \ + FN(get_numa_node_id), \ + FN(skb_change_head), \ + FN(xdp_adjust_head), \ + FN(probe_read_str), \ + FN(get_socket_cookie), \ + FN(get_socket_uid), + +/* integer value in 'imm' field of BPF_CALL instruction selects which helper + * function eBPF program intends to call + */ +#define __BPF_ENUM_FN(x) BPF_FUNC_ ## x +enum bpf_func_id { + __BPF_FUNC_MAPPER(__BPF_ENUM_FN) + __BPF_FUNC_MAX_ID, +}; +#undef __BPF_ENUM_FN + +/* All flags used by eBPF helper functions, placed here. */ + +/* BPF_FUNC_skb_store_bytes flags. 
*/ +#define BPF_F_RECOMPUTE_CSUM (1ULL << 0) +#define BPF_F_INVALIDATE_HASH (1ULL << 1) + +/* BPF_FUNC_l3_csum_replace and BPF_FUNC_l4_csum_replace flags. + * First 4 bits are for passing the header field size. + */ +#define BPF_F_HDR_FIELD_MASK 0xfULL + +/* BPF_FUNC_l4_csum_replace flags. */ +#define BPF_F_PSEUDO_HDR (1ULL << 4) +#define BPF_F_MARK_MANGLED_0 (1ULL << 5) +#define BPF_F_MARK_ENFORCE (1ULL << 6) + +/* BPF_FUNC_clone_redirect and BPF_FUNC_redirect flags. */ +#define BPF_F_INGRESS (1ULL << 0) + +/* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */ +#define BPF_F_TUNINFO_IPV6 (1ULL << 0) + +/* BPF_FUNC_get_stackid flags. */ +#define BPF_F_SKIP_FIELD_MASK 0xffULL +#define BPF_F_USER_STACK (1ULL << 8) +#define BPF_F_FAST_STACK_CMP (1ULL << 9) +#define BPF_F_REUSE_STACKID (1ULL << 10) + +/* BPF_FUNC_skb_set_tunnel_key flags. */ +#define BPF_F_ZERO_CSUM_TX (1ULL << 1) +#define BPF_F_DONT_FRAGMENT (1ULL << 2) + +/* BPF_FUNC_perf_event_output and BPF_FUNC_perf_event_read flags. */ +#define BPF_F_INDEX_MASK 0xffffffffULL +#define BPF_F_CURRENT_CPU BPF_F_INDEX_MASK +/* BPF_FUNC_perf_event_output for sk_buff input context. */ +#define BPF_F_CTXLEN_MASK (0xfffffULL << 32) + +/* user accessible mirror of in-kernel sk_buff. + * new fields can only be added to the end of this structure + */ +struct __sk_buff { + __u32 len; + __u32 pkt_type; + __u32 mark; + __u32 queue_mapping; + __u32 protocol; + __u32 vlan_present; + __u32 vlan_tci; + __u32 vlan_proto; + __u32 priority; + __u32 ingress_ifindex; + __u32 ifindex; + __u32 tc_index; + __u32 cb[5]; + __u32 hash; + __u32 tc_classid; + __u32 data; + __u32 data_end; + __u32 napi_id; +}; + +struct bpf_tunnel_key { + __u32 tunnel_id; + union { + __u32 remote_ipv4; + __u32 remote_ipv6[4]; + }; + __u8 tunnel_tos; + __u8 tunnel_ttl; + __u16 tunnel_ext; + __u32 tunnel_label; +}; + +/* Generic BPF return codes which all BPF program types may support. 
+ * The values are binary compatible with their TC_ACT_* counter-part to + * provide backwards compatibility with existing SCHED_CLS and SCHED_ACT + * programs. + * + * XDP is handled separately, see XDP_*. + */ +enum bpf_ret_code { + BPF_OK = 0, + /* 1 reserved */ + BPF_DROP = 2, + /* 3-6 reserved */ + BPF_REDIRECT = 7, + /* >127 are reserved for prog type specific return codes */ +}; + +struct bpf_sock { + __u32 bound_dev_if; + __u32 family; + __u32 type; + __u32 protocol; +}; + +#define XDP_PACKET_HEADROOM 256 + +/* User return codes for XDP prog type. + * A valid XDP program must return one of these defined values. All other + * return codes are reserved for future use. Unknown return codes will result + * in packet drop. + */ +enum xdp_action { + XDP_ABORTED = 0, + XDP_DROP, + XDP_PASS, + XDP_TX, +}; + +/* user accessible metadata for XDP packet hook + * new fields must be added to the end of this structure + */ +struct xdp_md { + __u32 data; + __u32 data_end; +}; + +#endif /* __LINUX_BPF_H__ */ diff --git a/src/shared/linux/bpf_common.h b/src/shared/linux/bpf_common.h new file mode 100644 index 00000000000..afe7433b989 --- /dev/null +++ b/src/shared/linux/bpf_common.h @@ -0,0 +1,55 @@ +#ifndef __LINUX_BPF_COMMON_H__ +#define __LINUX_BPF_COMMON_H__ + +/* Instruction classes */ +#define BPF_CLASS(code) ((code) & 0x07) +#define BPF_LD 0x00 +#define BPF_LDX 0x01 +#define BPF_ST 0x02 +#define BPF_STX 0x03 +#define BPF_ALU 0x04 +#define BPF_JMP 0x05 +#define BPF_RET 0x06 +#define BPF_MISC 0x07 + +/* ld/ldx fields */ +#define BPF_SIZE(code) ((code) & 0x18) +#define BPF_W 0x00 +#define BPF_H 0x08 +#define BPF_B 0x10 +#define BPF_MODE(code) ((code) & 0xe0) +#define BPF_IMM 0x00 +#define BPF_ABS 0x20 +#define BPF_IND 0x40 +#define BPF_MEM 0x60 +#define BPF_LEN 0x80 +#define BPF_MSH 0xa0 + +/* alu/jmp fields */ +#define BPF_OP(code) ((code) & 0xf0) +#define BPF_ADD 0x00 +#define BPF_SUB 0x10 +#define BPF_MUL 0x20 +#define BPF_DIV 0x30 +#define BPF_OR 0x40 +#define 
BPF_AND 0x50 +#define BPF_LSH 0x60 +#define BPF_RSH 0x70 +#define BPF_NEG 0x80 +#define BPF_MOD 0x90 +#define BPF_XOR 0xa0 + +#define BPF_JA 0x00 +#define BPF_JEQ 0x10 +#define BPF_JGT 0x20 +#define BPF_JGE 0x30 +#define BPF_JSET 0x40 +#define BPF_SRC(code) ((code) & 0x08) +#define BPF_K 0x00 +#define BPF_X 0x08 + +#ifndef BPF_MAXINSNS +#define BPF_MAXINSNS 4096 +#endif + +#endif /* __LINUX_BPF_COMMON_H__ */ diff --git a/src/shared/linux/libbpf.h b/src/shared/linux/libbpf.h new file mode 100644 index 00000000000..1989e3a869b --- /dev/null +++ b/src/shared/linux/libbpf.h @@ -0,0 +1,198 @@ +/* eBPF mini library */ +#ifndef __LIBBPF_H +#define __LIBBPF_H + +#include + +struct bpf_insn; + +/* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg */ + +#define BPF_ALU64_REG(OP, DST, SRC) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_OP(OP) | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = 0, \ + .imm = 0 }) + +#define BPF_ALU32_REG(OP, DST, SRC) \ + ((struct bpf_insn) { \ + .code = BPF_ALU | BPF_OP(OP) | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = 0, \ + .imm = 0 }) + +/* ALU ops on immediates, bpf_add|sub|...: dst_reg += imm32 */ + +#define BPF_ALU64_IMM(OP, DST, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_OP(OP) | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = 0, \ + .imm = IMM }) + +#define BPF_ALU32_IMM(OP, DST, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_ALU | BPF_OP(OP) | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = 0, \ + .imm = IMM }) + +/* Short form of mov, dst_reg = src_reg */ + +#define BPF_MOV64_REG(DST, SRC) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_MOV | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = 0, \ + .imm = 0 }) + +#define BPF_MOV32_REG(DST, SRC) \ + ((struct bpf_insn) { \ + .code = BPF_ALU | BPF_MOV | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = 0, \ + .imm = 0 }) + +/* Short form of mov, dst_reg = imm32 */ + +#define BPF_MOV64_IMM(DST, IMM) \ 
+ ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_MOV | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = 0, \ + .imm = IMM }) + +#define BPF_MOV32_IMM(DST, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_ALU | BPF_MOV | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = 0, \ + .imm = IMM }) + +/* BPF_LD_IMM64 macro encodes single 'load 64-bit immediate' insn */ +#define BPF_LD_IMM64(DST, IMM) \ + BPF_LD_IMM64_RAW(DST, 0, IMM) + +#define BPF_LD_IMM64_RAW(DST, SRC, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_LD | BPF_DW | BPF_IMM, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = 0, \ + .imm = (__u32) (IMM) }), \ + ((struct bpf_insn) { \ + .code = 0, /* zero is reserved opcode */ \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = ((__u64) (IMM)) >> 32 }) + +#ifndef BPF_PSEUDO_MAP_FD +# define BPF_PSEUDO_MAP_FD 1 +#endif + +/* pseudo BPF_LD_IMM64 insn used to refer to process-local map_fd */ +#define BPF_LD_MAP_FD(DST, MAP_FD) \ + BPF_LD_IMM64_RAW(DST, BPF_PSEUDO_MAP_FD, MAP_FD) + + +/* Direct packet access, R0 = *(uint *) (skb->data + imm32) */ + +#define BPF_LD_ABS(SIZE, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_LD | BPF_SIZE(SIZE) | BPF_ABS, \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = IMM }) + +/* Memory load, dst_reg = *(uint *) (src_reg + off16) */ + +#define BPF_LDX_MEM(SIZE, DST, SRC, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = 0 }) + +/* Memory store, *(uint *) (dst_reg + off16) = src_reg */ + +#define BPF_STX_MEM(SIZE, DST, SRC, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_STX | BPF_SIZE(SIZE) | BPF_MEM, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = 0 }) + +/* Atomic memory add, *(uint *)(dst_reg + off16) += src_reg */ + +#define BPF_STX_XADD(SIZE, DST, SRC, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_STX | BPF_SIZE(SIZE) | BPF_XADD, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + 
.imm = 0 }) + +/* Memory store, *(uint *) (dst_reg + off16) = imm32 */ + +#define BPF_ST_MEM(SIZE, DST, OFF, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_ST | BPF_SIZE(SIZE) | BPF_MEM, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = OFF, \ + .imm = IMM }) + +/* Conditional jumps against registers, if (dst_reg 'op' src_reg) goto pc + off16 */ + +#define BPF_JMP_REG(OP, DST, SRC, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_OP(OP) | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = 0 }) + +/* Conditional jumps against immediates, if (dst_reg 'op' imm32) goto pc + off16 */ + +#define BPF_JMP_IMM(OP, DST, IMM, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_OP(OP) | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = OFF, \ + .imm = IMM }) + +/* Raw code statement block */ + +#define BPF_RAW_INSN(CODE, DST, SRC, OFF, IMM) \ + ((struct bpf_insn) { \ + .code = CODE, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = IMM }) + +/* Program exit */ + +#define BPF_EXIT_INSN() \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_EXIT, \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = 0 }) + +#endif From 71e5200f94b22589922704aa4abdf95d4fe2e528 Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Tue, 18 Oct 2016 17:57:10 +0200 Subject: [PATCH 10/42] Add abstraction model for BPF programs This object takes a number of bpf_insn members and wraps them together with the in-kernel reference id. Will be needed by the firewall code. 
--- meson.build | 2 + src/basic/bpf-program.c | 182 ++++++++++++++++++++++++++++++++++++ src/basic/bpf-program.h | 55 +++++++++++ src/basic/meson.build | 20 ++-- src/basic/missing_syscall.h | 32 +++++++ 5 files changed, 282 insertions(+), 9 deletions(-) create mode 100644 src/basic/bpf-program.c create mode 100644 src/basic/bpf-program.h diff --git a/meson.build b/meson.build index 3e85442a6fe..d72fc6f1486 100644 --- a/meson.build +++ b/meson.build @@ -443,6 +443,8 @@ foreach ident : [ #include '''], ['copy_file_range', '''#include #include '''], + ['bpf', '''#include + #include '''], ['explicit_bzero' , '''#include '''], ] diff --git a/src/basic/bpf-program.c b/src/basic/bpf-program.c new file mode 100644 index 00000000000..9326176743f --- /dev/null +++ b/src/basic/bpf-program.c @@ -0,0 +1,182 @@ +/*** + This file is part of systemd. + + Copyright 2016 Daniel Mack + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see <http://www.gnu.org/licenses/>. 
+***/ + +#include +#include +#include +#include + +#include "alloc-util.h" +#include "bpf-program.h" +#include "fd-util.h" +#include "log.h" +#include "missing.h" + +int bpf_program_new(uint32_t prog_type, BPFProgram **ret) { + _cleanup_(bpf_program_unrefp) BPFProgram *p = NULL; + + p = new0(BPFProgram, 1); + if (!p) + return log_oom(); + + p->prog_type = prog_type; + p->kernel_fd = -1; + + *ret = p; + p = NULL; + return 0; +} + +BPFProgram *bpf_program_unref(BPFProgram *p) { + if (!p) + return NULL; + + safe_close(p->kernel_fd); + free(p->instructions); + + return mfree(p); +} + +int bpf_program_add_instructions(BPFProgram *p, const struct bpf_insn *instructions, size_t count) { + + assert(p); + + if (!GREEDY_REALLOC(p->instructions, p->allocated, p->n_instructions + count)) + return -ENOMEM; + + memcpy(p->instructions + p->n_instructions, instructions, sizeof(struct bpf_insn) * count); + p->n_instructions += count; + + return 0; +} + +int bpf_program_load_kernel(BPFProgram *p, char *log_buf, size_t log_size) { + union bpf_attr attr; + + assert(p); + + if (p->kernel_fd >= 0) + return -EBUSY; + + attr = (union bpf_attr) { + .prog_type = p->prog_type, + .insns = PTR_TO_UINT64(p->instructions), + .insn_cnt = p->n_instructions, + .license = PTR_TO_UINT64("GPL"), + .log_buf = PTR_TO_UINT64(log_buf), + .log_level = !!log_buf, + .log_size = log_size, + }; + + p->kernel_fd = bpf(BPF_PROG_LOAD, &attr, sizeof(attr)); + if (p->kernel_fd < 0) + return -errno; + + return 0; +} + +int bpf_program_cgroup_attach(BPFProgram *p, int type, const char *path) { + _cleanup_close_ int fd = -1; + union bpf_attr attr; + + assert(p); + assert(type >= 0); + assert(path); + + fd = open(path, O_DIRECTORY|O_RDONLY|O_CLOEXEC); + if (fd < 0) + return -errno; + + attr = (union bpf_attr) { + .attach_type = type, + .target_fd = fd, + .attach_bpf_fd = p->kernel_fd, + }; + + if (bpf(BPF_PROG_ATTACH, &attr, sizeof(attr)) < 0) + return -errno; + + return 0; +} + +int bpf_program_cgroup_detach(int type, 
const char *path) { + _cleanup_close_ int fd = -1; + union bpf_attr attr; + + assert(path); + + fd = open(path, O_DIRECTORY|O_RDONLY|O_CLOEXEC); + if (fd < 0) + return -errno; + + attr = (union bpf_attr) { + .attach_type = type, + .target_fd = fd, + }; + + if (bpf(BPF_PROG_DETACH, &attr, sizeof(attr)) < 0) + return -errno; + + return 0; +} + +int bpf_map_new(enum bpf_map_type type, size_t key_size, size_t value_size, size_t max_entries, uint32_t flags) { + union bpf_attr attr = { + .map_type = type, + .key_size = key_size, + .value_size = value_size, + .max_entries = max_entries, + .map_flags = flags, + }; + int fd; + + fd = bpf(BPF_MAP_CREATE, &attr, sizeof(attr)); + if (fd < 0) + return -errno; + + return fd; +} + +int bpf_map_update_element(int fd, const void *key, void *value) { + + union bpf_attr attr = { + .map_fd = fd, + .key = PTR_TO_UINT64(key), + .value = PTR_TO_UINT64(value), + }; + + if (bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)) < 0) + return -errno; + + return 0; +} + +int bpf_map_lookup_element(int fd, const void *key, void *value) { + + union bpf_attr attr = { + .map_fd = fd, + .key = PTR_TO_UINT64(key), + .value = PTR_TO_UINT64(value), + }; + + if (bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr)) < 0) + return -errno; + + return 0; +} diff --git a/src/basic/bpf-program.h b/src/basic/bpf-program.h new file mode 100644 index 00000000000..0dd150b60a0 --- /dev/null +++ b/src/basic/bpf-program.h @@ -0,0 +1,55 @@ +#pragma once + +/*** + This file is part of systemd. + + Copyright 2016 Daniel Mack + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see . + + [Except for the stuff copy/pasted from the kernel sources, see below] +***/ + +#include +#include +#include + +#include "list.h" +#include "macro.h" + +typedef struct BPFProgram BPFProgram; + +struct BPFProgram { + int kernel_fd; + uint32_t prog_type; + + size_t n_instructions; + size_t allocated; + struct bpf_insn *instructions; +}; + +int bpf_program_new(uint32_t prog_type, BPFProgram **ret); +BPFProgram *bpf_program_unref(BPFProgram *p); + +int bpf_program_add_instructions(BPFProgram *p, const struct bpf_insn *insn, size_t count); +int bpf_program_load_kernel(BPFProgram *p, char *log_buf, size_t log_size); + +int bpf_program_cgroup_attach(BPFProgram *p, int type, const char *path); +int bpf_program_cgroup_detach(int type, const char *path); + +int bpf_map_new(enum bpf_map_type type, size_t key_size, size_t value_size, size_t max_entries, uint32_t flags); +int bpf_map_update_element(int fd, const void *key, void *value); +int bpf_map_lookup_element(int fd, const void *key, void *value); + +DEFINE_TRIVIAL_CLEANUP_FUNC(BPFProgram*, bpf_program_unref); diff --git a/src/basic/meson.build b/src/basic/meson.build index 67cc27274d6..994336fde2d 100644 --- a/src/basic/meson.build +++ b/src/basic/meson.build @@ -1,4 +1,6 @@ basic_sources_plain = files(''' + MurmurHash2.c + MurmurHash2.h af-list.c af-list.h alloc-util.c @@ -16,6 +18,8 @@ basic_sources_plain = files(''' bitmap.c bitmap.h blkid-util.h + bpf-program.c + bpf-program.h btrfs-ctree.h btrfs-util.c btrfs-util.h @@ -24,10 +28,10 @@ basic_sources_plain = files(''' bus-label.h calendarspec.c calendarspec.h - capability-util.c - capability-util.h cap-list.c cap-list.h + capability-util.c + capability-util.h cgroup-util.c cgroup-util.h chattr-util.c @@ -61,10 +65,10 @@ basic_sources_plain = files(''' extract-word.h fd-util.c fd-util.h - 
fileio.c - fileio.h fileio-label.c fileio-label.h + fileio.c + fileio.h format-util.h fs-util.c fs-util.h @@ -82,9 +86,9 @@ basic_sources_plain = files(''' hostname-util.h in-addr-util.c in-addr-util.h - ioprio.h io-util.c io-util.h + ioprio.h journal-importer.c journal-importer.h khash.c @@ -106,13 +110,11 @@ basic_sources_plain = files(''' mempool.c mempool.h missing_syscall.h + mkdir-label.c mkdir.c mkdir.h - mkdir-label.c mount-util.c mount-util.h - MurmurHash2.c - MurmurHash2.h nss-util.h ordered-set.c ordered-set.h @@ -138,9 +140,9 @@ basic_sources_plain = files(''' rlimit-util.h rm-rf.c rm-rf.h - securebits.h securebits-util.c securebits-util.h + securebits.h selinux-util.c selinux-util.h set.h diff --git a/src/basic/missing_syscall.h b/src/basic/missing_syscall.h index 898116c7b31..17cde5e74f0 100644 --- a/src/basic/missing_syscall.h +++ b/src/basic/missing_syscall.h @@ -22,6 +22,8 @@ /* Missing glibc definitions to access certain kernel APIs */ +#include + #if !HAVE_DECL_PIVOT_ROOT static inline int pivot_root(const char *new_root, const char *put_old) { return syscall(SYS_pivot_root, new_root, put_old); @@ -316,3 +318,33 @@ static inline ssize_t copy_file_range(int fd_in, loff_t *off_in, # endif } #endif + +#if !HAVE_DECL_BPF +# ifndef __NR_bpf +# if defined __i386__ +# define __NR_bpf 357 +# elif defined __x86_64__ +# define __NR_bpf 321 +# elif defined __aarch64__ +# define __NR_bpf 280 +# elif defined __sparc__ +# define __NR_bpf 349 +# elif defined __s390__ +# define __NR_bpf 351 +# else +# warning "__NR_bpf not defined for your architecture" +# endif +# endif + +union bpf_attr; + +static inline int bpf(int cmd, union bpf_attr *attr, size_t size) { +#ifdef __NR_bpf + return (int) syscall(__NR_bpf, cmd, attr, size); +#else + errno = ENOSYS; + return -1; +#endif +} + +#endif From b36672e072e32060d6c25acfb51e409bc617b754 Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Thu, 3 Nov 2016 17:30:06 +0100 Subject: [PATCH 11/42] Add IP address address ACL 
representation and parser Add a config directive parser that takes an rvalue consisting of multiple space-separated IPv4 or IPv6 addresses, each with an optional netmask in CIDR notation, and appends the parsed entries to a linked list of IPAddressAccessItem objects. The code actually using this will be added later. --- src/core/ip-address-access.c | 165 ++++++++++++++++++++++++++++ src/core/ip-address-access.h | 36 +++++++ src/core/meson.build | 202 ++++++++++++++++++----------------- 3 files changed, 303 insertions(+), 100 deletions(-) create mode 100644 src/core/ip-address-access.c create mode 100644 src/core/ip-address-access.h diff --git a/src/core/ip-address-access.c b/src/core/ip-address-access.c new file mode 100644 index 00000000000..6a89bb23c18 --- /dev/null +++ b/src/core/ip-address-access.c @@ -0,0 +1,165 @@ +/*** + This file is part of systemd. + + Copyright 2016 Daniel Mack + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see . 
+***/ + +#include +#include + +#include "alloc-util.h" +#include "extract-word.h" +#include "hostname-util.h" +#include "ip-address-access.h" +#include "parse-util.h" +#include "string-util.h" + +int config_parse_ip_address_access( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + IPAddressAccessItem **list = data; + const char *p; + int r; + + assert(list); + + if (isempty(rvalue)) { + *list = ip_address_access_free_all(*list); + return 0; + } + + p = rvalue; + + for (;;) { + _cleanup_free_ IPAddressAccessItem *a = NULL; + _cleanup_free_ char *word = NULL; + + r = extract_first_word(&p, &word, NULL, 0); + if (r == 0) + break; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid syntax, ignoring: %s", rvalue); + break; + } + + a = new0(IPAddressAccessItem, 1); + if (!a) + return log_oom(); + + if (streq(word, "any")) { + /* "any" is a shortcut for 0.0.0.0/0 and ::/0 */ + + a->family = AF_INET; + LIST_APPEND(items, *list, a); + + a = new0(IPAddressAccessItem, 1); + if (!a) + return log_oom(); + + a->family = AF_INET6; + + } else if (is_localhost(word)) { + /* "localhost" is a shortcut for 127.0.0.0/8 and ::1/128 */ + + a->family = AF_INET; + a->address.in.s_addr = htobe32(0x7f000000); + a->prefixlen = 8; + LIST_APPEND(items, *list, a); + + a = new0(IPAddressAccessItem, 1); + if (!a) + return log_oom(); + + a->family = AF_INET6; + a->address.in6 = (struct in6_addr) IN6ADDR_LOOPBACK_INIT; + a->prefixlen = 128; + + } else if (streq(word, "link-local")) { + + /* "link-local" is a shortcut for 169.254.0.0/16 and fe80::/64 */ + + a->family = AF_INET; + a->address.in.s_addr = htobe32((UINT32_C(169) << 24 | UINT32_C(254) << 16)); + a->prefixlen = 16; + LIST_APPEND(items, *list, a); + + a = new0(IPAddressAccessItem, 1); + if (!a) + return log_oom(); + + a->family 
= AF_INET6; + a->address.in6 = (struct in6_addr) { + .__in6_u.__u6_addr32[0] = htobe32(0xfe800000) + }; + a->prefixlen = 64; + + } else if (streq(word, "multicast")) { + + /* "multicast" is a shortcut for 224.0.0.0/4 and ff00::/8 */ + + a->family = AF_INET; + a->address.in.s_addr = htobe32((UINT32_C(224) << 24)); + a->prefixlen = 4; + LIST_APPEND(items, *list, a); + + a = new0(IPAddressAccessItem, 1); + if (!a) + return log_oom(); + + a->family = AF_INET6; + a->address.in6 = (struct in6_addr) { + .__in6_u.__u6_addr32[0] = htobe32(0xff000000) + }; + a->prefixlen = 8; + + } else { + r = in_addr_prefix_from_string_auto(word, &a->family, &a->address, &a->prefixlen); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Address prefix is invalid, ignoring assignment: %s", word); + return 0; + } + } + + LIST_APPEND(items, *list, a); + a = NULL; + } + + return 0; +} + +IPAddressAccessItem* ip_address_access_free_all(IPAddressAccessItem *first) { + IPAddressAccessItem *next, *p = first; + + while (p) { + next = p->items_next; + free(p); + + p = next; + } + + return NULL; +} diff --git a/src/core/ip-address-access.h b/src/core/ip-address-access.h new file mode 100644 index 00000000000..eea20b4848b --- /dev/null +++ b/src/core/ip-address-access.h @@ -0,0 +1,36 @@ +#pragma once + +/*** + This file is part of systemd. + + Copyright 2016 Daniel Mack + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see . 
+***/ + +#include "in-addr-util.h" +#include "list.h" + +typedef struct IPAddressAccessItem IPAddressAccessItem; + +struct IPAddressAccessItem { + int family; + unsigned char prefixlen; + union in_addr_union address; + LIST_FIELDS(IPAddressAccessItem, items); +}; + +int config_parse_ip_address_access(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); + +IPAddressAccessItem* ip_address_access_free_all(IPAddressAccessItem *first); diff --git a/src/core/meson.build b/src/core/meson.build index 569eed9cad3..bea0d3528d1 100644 --- a/src/core/meson.build +++ b/src/core/meson.build @@ -1,114 +1,116 @@ libcore_la_sources = ''' - unit.c - unit.h - unit-printf.c - unit-printf.h - job.c - job.h - manager.c - manager.h - transaction.c - transaction.h - load-fragment.c - load-fragment.h - service.c - service.h - socket.c - socket.h - target.c - target.h - device.c - device.h - mount.c - mount.h + audit-fd.c + audit-fd.h automount.c automount.h - swap.c - swap.h - timer.c - timer.h - path.c - path.h - slice.c - slice.h - scope.c - scope.h - load-dropin.c - load-dropin.h - execute.c - execute.h - dynamic-user.c - dynamic-user.h - kill.c - kill.h - dbus.c - dbus.h - dbus-manager.c - dbus-manager.h - dbus-unit.c - dbus-unit.h - dbus-job.c - dbus-job.h - dbus-service.c - dbus-service.h - dbus-socket.c - dbus-socket.h - dbus-target.c - dbus-target.h - dbus-device.c - dbus-device.h - dbus-mount.c - dbus-mount.h - dbus-automount.c - dbus-automount.h - dbus-swap.c - dbus-swap.h - dbus-timer.c - dbus-timer.h - dbus-path.c - dbus-path.h - dbus-slice.c - dbus-slice.h - dbus-scope.c - dbus-scope.h - dbus-execute.c - dbus-execute.h - dbus-kill.c - dbus-kill.h - dbus-cgroup.c - dbus-cgroup.h cgroup.c cgroup.h + dbus-automount.c + dbus-automount.h + dbus-cgroup.c + dbus-cgroup.h + dbus-device.c + dbus-device.h + dbus-execute.c + dbus-execute.h + dbus-job.c + 
dbus-job.h + dbus-kill.c + dbus-kill.h + dbus-manager.c + dbus-manager.h + dbus-mount.c + dbus-mount.h + dbus-path.c + dbus-path.h + dbus-scope.c + dbus-scope.h + dbus-service.c + dbus-service.h + dbus-slice.c + dbus-slice.h + dbus-socket.c + dbus-socket.h + dbus-swap.c + dbus-swap.h + dbus-target.c + dbus-target.h + dbus-timer.c + dbus-timer.h + dbus-unit.c + dbus-unit.h + dbus.c + dbus.h + device.c + device.h + dynamic-user.c + dynamic-user.h + emergency-action.c + emergency-action.h + execute.c + execute.h + hostname-setup.c + hostname-setup.h + ima-setup.c + ima-setup.h + ip-address-access.c + ip-address-access.h + job.c + job.h + kill.c + kill.h + killall.c + killall.h + kmod-setup.c + kmod-setup.h + load-dropin.c + load-dropin.h + load-fragment.c + load-fragment.h + locale-setup.c + locale-setup.h + loopback-setup.c + loopback-setup.h + machine-id-setup.c + machine-id-setup.h + manager.c + manager.h + mount-setup.c + mount-setup.h + mount.c + mount.h + namespace.c + namespace.h + path.c + path.h + scope.c + scope.h selinux-access.c selinux-access.h selinux-setup.c selinux-setup.h - smack-setup.c - smack-setup.h - ima-setup.c - ima-setup.h - locale-setup.h - locale-setup.c - hostname-setup.c - hostname-setup.h - machine-id-setup.c - machine-id-setup.h - mount-setup.c - mount-setup.h - kmod-setup.c - kmod-setup.h - loopback-setup.h - loopback-setup.c - namespace.c - namespace.h - killall.h - killall.c - audit-fd.c - audit-fd.h + service.c + service.h show-status.c show-status.h - emergency-action.c - emergency-action.h + slice.c + slice.h + smack-setup.c + smack-setup.h + socket.c + socket.h + swap.c + swap.h + target.c + target.h + timer.c + timer.h + transaction.c + transaction.h + unit-printf.c + unit-printf.h + unit.c + unit.h '''.split() load_fragment_gperf_gperf = custom_target( From 6a48d82f0285f5654ec98a5f4b06752eb707b248 Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Fri, 11 Nov 2016 19:59:19 +0100 Subject: [PATCH 12/42] cgroup: add fields to 
accommodate eBPF related details Add pointers for compiled eBPF programs as well as list heads for allowed and denied hosts for both directions. --- src/core/cgroup.c | 3 +++ src/core/cgroup.h | 7 ++++++- src/core/manager.h | 1 + src/core/system.conf | 2 ++ src/core/unit.c | 20 ++++++++++++++++++++ src/core/unit.h | 13 +++++++++++++ 6 files changed, 45 insertions(+), 1 deletion(-) diff --git a/src/core/cgroup.c b/src/core/cgroup.c index ffb0f49cd6b..62cbe08f13f 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -141,6 +141,9 @@ void cgroup_context_done(CGroupContext *c) { while (c->device_allow) cgroup_context_free_device_allow(c, c->device_allow); + + c->ip_address_allow = ip_address_access_free_all(c->ip_address_allow); + c->ip_address_deny = ip_address_access_free_all(c->ip_address_deny); } void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) { diff --git a/src/core/cgroup.h b/src/core/cgroup.h index 4cd168f63e5..2baf4d20e99 100644 --- a/src/core/cgroup.h +++ b/src/core/cgroup.h @@ -21,9 +21,10 @@ #include +#include "cgroup-util.h" +#include "ip-address-access.h" #include "list.h" #include "time-util.h" -#include "cgroup-util.h" typedef struct CGroupContext CGroupContext; typedef struct CGroupDeviceAllow CGroupDeviceAllow; @@ -87,6 +88,7 @@ struct CGroupContext { bool blockio_accounting; bool memory_accounting; bool tasks_accounting; + bool ip_accounting; /* For unified hierarchy */ uint64_t cpu_weight; @@ -103,6 +105,9 @@ struct CGroupContext { uint64_t memory_max; uint64_t memory_swap_max; + LIST_HEAD(IPAddressAccessItem, ip_address_allow); + LIST_HEAD(IPAddressAccessItem, ip_address_deny); + /* For legacy hierarchies */ uint64_t cpu_shares; uint64_t startup_cpu_shares; diff --git a/src/core/manager.h b/src/core/manager.h index 713d2db70cd..8880b3aab53 100644 --- a/src/core/manager.h +++ b/src/core/manager.h @@ -29,6 +29,7 @@ #include "cgroup-util.h" #include "fdset.h" #include "hashmap.h" +#include "ip-address-access.h" #include 
"list.h" #include "ratelimit.h" diff --git a/src/core/system.conf b/src/core/system.conf index 746572b7ff2..88f646e2fe1 100644 --- a/src/core/system.conf +++ b/src/core/system.conf @@ -60,3 +60,5 @@ #DefaultLimitNICE= #DefaultLimitRTPRIO= #DefaultLimitRTTIME= +#IPAddressAllow= +#IPAddressDeny= diff --git a/src/core/unit.c b/src/core/unit.c index df89f3d01f1..6451b755607 100644 --- a/src/core/unit.c +++ b/src/core/unit.c @@ -35,6 +35,7 @@ #include "dropin.h" #include "escape.h" #include "execute.h" +#include "fd-util.h" #include "fileio-label.h" #include "format-util.h" #include "id128-util.h" @@ -103,6 +104,13 @@ Unit *unit_new(Manager *m, size_t size) { u->ref_gid = GID_INVALID; u->cpu_usage_last = NSEC_INFINITY; + u->ip_accounting_ingress_map_fd = -1; + u->ip_accounting_egress_map_fd = -1; + u->ipv4_allow_map_fd = -1; + u->ipv6_allow_map_fd = -1; + u->ipv4_deny_map_fd = -1; + u->ipv6_deny_map_fd = -1; + RATELIMIT_INIT(u->start_limit, m->default_start_limit_interval, m->default_start_limit_burst); RATELIMIT_INIT(u->auto_stop_ratelimit, 10 * USEC_PER_SEC, 16); @@ -156,6 +164,7 @@ static void unit_init(Unit *u) { cc->blockio_accounting = u->manager->default_blockio_accounting; cc->memory_accounting = u->manager->default_memory_accounting; cc->tasks_accounting = u->manager->default_tasks_accounting; + cc->ip_accounting = u->manager->default_ip_accounting; if (u->type != UNIT_SLICE) cc->tasks_max = u->manager->default_tasks_max; @@ -610,6 +619,17 @@ void unit_free(Unit *u) { while (u->refs) unit_ref_unset(u->refs); + safe_close(u->ip_accounting_ingress_map_fd); + safe_close(u->ip_accounting_egress_map_fd); + + safe_close(u->ipv4_allow_map_fd); + safe_close(u->ipv6_allow_map_fd); + safe_close(u->ipv4_deny_map_fd); + safe_close(u->ipv6_deny_map_fd); + + bpf_program_unref(u->ip_bpf_ingress); + bpf_program_unref(u->ip_bpf_egress); + free(u); } diff --git a/src/core/unit.h b/src/core/unit.h index 4d9751a4069..95c41fcceae 100644 --- a/src/core/unit.h +++ b/src/core/unit.h 
@@ -28,6 +28,7 @@ typedef struct UnitVTable UnitVTable; typedef struct UnitRef UnitRef; typedef struct UnitStatusMessageFormats UnitStatusMessageFormats; +#include "bpf-program.h" #include "condition.h" #include "emergency-action.h" #include "install.h" @@ -205,6 +206,18 @@ struct Unit { CGroupMask cgroup_members_mask; int cgroup_inotify_wd; + /* IP BPF Firewalling/accounting */ + int ip_accounting_ingress_map_fd; + int ip_accounting_egress_map_fd; + + int ipv4_allow_map_fd; + int ipv6_allow_map_fd; + int ipv4_deny_map_fd; + int ipv6_deny_map_fd; + + BPFProgram *ip_bpf_ingress; + BPFProgram *ip_bpf_egress; + /* How to start OnFailure units */ JobMode on_failure_job_mode; From 1988a9d12015990c145a6e8515d5e22ef88b32cb Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Fri, 11 Nov 2016 19:41:16 +0100 Subject: [PATCH 13/42] Add firewall eBPF compiler --- src/core/bpf-firewall.c | 672 ++++++++++++++++++++++++++++++++++++++++ src/core/bpf-firewall.h | 32 ++ src/core/meson.build | 2 + 3 files changed, 706 insertions(+) create mode 100644 src/core/bpf-firewall.c create mode 100644 src/core/bpf-firewall.h diff --git a/src/core/bpf-firewall.c b/src/core/bpf-firewall.c new file mode 100644 index 00000000000..732c36fc1a8 --- /dev/null +++ b/src/core/bpf-firewall.c @@ -0,0 +1,672 @@ +/*** + This file is part of systemd. + + Copyright 2016 Daniel Mack + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see . 
+***/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "alloc-util.h" +#include "bpf-firewall.h" +#include "bpf-program.h" +#include "fd-util.h" +#include "ip-address-access.h" +#include "unit.h" + +enum { + MAP_KEY_PACKETS, + MAP_KEY_BYTES, +}; + +enum { + ACCESS_ALLOWED = 1, + ACCESS_DENIED = 2, +}; + +/* Compile instructions for one list of addresses, one direction and one specific verdict on matches. */ + +static int add_lookup_instructions( + BPFProgram *p, + int map_fd, + int protocol, + bool is_ingress, + int verdict) { + + int r, addr_offset, addr_size; + + assert(p); + assert(map_fd >= 0); + + switch (protocol) { + + case ETH_P_IP: + addr_size = sizeof(uint32_t); + addr_offset = is_ingress ? + offsetof(struct iphdr, saddr) : + offsetof(struct iphdr, daddr); + break; + + case ETH_P_IPV6: + addr_size = 4 * sizeof(uint32_t); + addr_offset = is_ingress ? + offsetof(struct ip6_hdr, ip6_src.s6_addr) : + offsetof(struct ip6_hdr, ip6_dst.s6_addr); + break; + + default: + return -EAFNOSUPPORT; + } + + do { + /* Compare IPv4 with one word instruction (32bit) */ + struct bpf_insn insn[] = { + /* If skb->protocol != ETH_P_IP, skip this whole block. The offset will be set later. */ + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, htobe16(protocol), 0), + + /* + * Call into BPF_FUNC_skb_load_bytes to load the dst/src IP address + * + * R1: Pointer to the skb + * R2: Data offset + * R3: Destination buffer on the stack (r10 - 4) + * R4: Number of bytes to read (4) + */ + + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), + BPF_MOV32_IMM(BPF_REG_2, addr_offset), + + BPF_MOV64_REG(BPF_REG_3, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -addr_size), + + BPF_MOV32_IMM(BPF_REG_4, addr_size), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes), + + /* + * Call into BPF_FUNC_map_lookup_elem to see if the address matches any entry in the + * LPM trie map. 
For this to work, the prefixlen field of 'struct bpf_lpm_trie_key' + * has to be set to the maximum possible value. + * + * On success, the looked up value is stored in R0. For this application, the actual + * value doesn't matter, however; we just set the bit in @verdict in R8 if we found any + * matching value. + */ + + BPF_LD_MAP_FD(BPF_REG_1, map_fd), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -addr_size - sizeof(uint32_t)), + BPF_ST_MEM(BPF_W, BPF_REG_2, 0, addr_size * 8), + + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), + BPF_ALU32_IMM(BPF_OR, BPF_REG_8, verdict), + }; + + /* Jump label fixup */ + insn[0].off = ELEMENTSOF(insn) - 1; + + r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn)); + if (r < 0) + return r; + + } while (false); + + return 0; +} + +static int bpf_firewall_compile_bpf( + Unit *u, + bool is_ingress, + BPFProgram **ret) { + + struct bpf_insn pre_insn[] = { + /* + * When the eBPF program is entered, R1 contains the address of the skb. + * However, R1-R5 are scratch registers that are not preserved when calling + * into kernel functions, so we need to save anything that's supposed to + * stay around to R6-R9. Save the skb to R6. + */ + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + + /* + * Although we cannot access the skb data directly from eBPF programs used in this + * scenario, the kernel has prepared some fields for us to access through struct __sk_buff. + * Load the protocol (IPv4, IPv6) used by the packet in flight once and cache it in R7 + * for later use. + */ + BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, offsetof(struct __sk_buff, protocol)), + + /* + * R8 is used to keep track of whether any address check has explicitly allowed or denied the packet + * through ACCESS_DENIED or ACCESS_ALLOWED bits. Reset them both to 0 in the beginning. 
+ */ + BPF_MOV32_IMM(BPF_REG_8, 0), + }; + + /* + * The access checkers compiled for the configured allowance and denial lists + * write to R8 at runtime. The following code prepares for an early exit that + * skips the accounting if the packet is denied. + * + * R0 = 1 + * if (R8 == ACCESS_DENIED) + * R0 = 0 + * + * This means that if both ACCESS_DENIED and ACCESS_ALLOWED are set, the packet + * is allowed to pass. + */ + struct bpf_insn post_insn[] = { + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_JMP_IMM(BPF_JNE, BPF_REG_8, ACCESS_DENIED, 1), + BPF_MOV64_IMM(BPF_REG_0, 0), + }; + + _cleanup_(bpf_program_unrefp) BPFProgram *p = NULL; + int accounting_map_fd, r; + bool access_enabled; + + assert(u); + assert(ret); + + accounting_map_fd = is_ingress ? + u->ip_accounting_ingress_map_fd : + u->ip_accounting_egress_map_fd; + + access_enabled = + u->ipv4_allow_map_fd >= 0 || + u->ipv6_allow_map_fd >= 0 || + u->ipv4_deny_map_fd >= 0 || + u->ipv6_deny_map_fd >= 0; + + if (accounting_map_fd < 0 && !access_enabled) { + *ret = NULL; + return 0; + } + + r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, &p); + if (r < 0) + return r; + + r = bpf_program_add_instructions(p, pre_insn, ELEMENTSOF(pre_insn)); + if (r < 0) + return r; + + if (access_enabled) { + /* + * The simple rule this function translates into eBPF instructions is: + * + * - Access will be granted when an address matches an entry in the unit's allow maps + * - Otherwise, access will be denied when an address matches an entry in the unit's deny maps + * - Otherwise, access will be granted + */ + + if (u->ipv4_deny_map_fd >= 0) { + r = add_lookup_instructions(p, u->ipv4_deny_map_fd, ETH_P_IP, is_ingress, ACCESS_DENIED); + if (r < 0) + return r; + } + + if (u->ipv6_deny_map_fd >= 0) { + r = add_lookup_instructions(p, u->ipv6_deny_map_fd, ETH_P_IPV6, is_ingress, ACCESS_DENIED); + if (r < 0) + return r; + } + + if (u->ipv4_allow_map_fd >= 0) { + r = add_lookup_instructions(p, u->ipv4_allow_map_fd, ETH_P_IP, is_ingress, ACCESS_ALLOWED); + if (r < 0) 
+ return r; + } + + if (u->ipv6_allow_map_fd >= 0) { + r = add_lookup_instructions(p, u->ipv6_allow_map_fd, ETH_P_IPV6, is_ingress, ACCESS_ALLOWED); + if (r < 0) + return r; + } + } + + r = bpf_program_add_instructions(p, post_insn, ELEMENTSOF(post_insn)); + if (r < 0) + return r; + + if (accounting_map_fd >= 0) { + struct bpf_insn insn[] = { + /* + * If R0 == 0, the packet will be denied; skip the accounting instructions in this case. + * The jump label will be fixed up later. + */ + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 0), + + /* Count packets */ + BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_PACKETS), /* r0 = 0 */ + BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */ + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */ + BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd), /* load map fd to r1 */ + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), + BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */ + BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */ + + /* Count bytes */ + BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_BYTES), /* r0 = 1 */ + BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */ + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */ + BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), + BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, offsetof(struct __sk_buff, len)), /* r1 = skb->len */ + BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */ + + /* Allow the packet to pass */ + BPF_MOV64_IMM(BPF_REG_0, 1), + }; + + /* Jump label fixup */ + insn[0].off = ELEMENTSOF(insn) - 1; + + r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn)); + if (r < 0) + return r; + } + + do { + /* + * Exit from the eBPF program, R0 contains 
the verdict. + * 0 means the packet is denied, 1 means the packet may pass. + */ + struct bpf_insn insn[] = { + BPF_EXIT_INSN() + }; + + r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn)); + if (r < 0) + return r; + } while (false); + + *ret = p; + p = NULL; + + return 0; +} + +static int bpf_firewall_count_access_items(IPAddressAccessItem *list, size_t *n_ipv4, size_t *n_ipv6) { + IPAddressAccessItem *a; + + assert(n_ipv4); + assert(n_ipv6); + + LIST_FOREACH(items, a, list) { + switch (a->family) { + + case AF_INET: + (*n_ipv4)++; + break; + + case AF_INET6: + (*n_ipv6)++; + break; + + default: + return -EAFNOSUPPORT; + } + } + + return 0; +} + +static int bpf_firewall_add_access_items( + IPAddressAccessItem *list, + int ipv4_map_fd, + int ipv6_map_fd, + int verdict) { + + struct bpf_lpm_trie_key *key_ipv4, *key_ipv6; + uint64_t value = verdict; + IPAddressAccessItem *a; + int r; + + key_ipv4 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t)); + key_ipv6 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t) * 4); + + LIST_FOREACH(items, a, list) { + switch (a->family) { + + case AF_INET: + key_ipv4->prefixlen = a->prefixlen; + memcpy(key_ipv4->data, &a->address, sizeof(uint32_t)); + + r = bpf_map_update_element(ipv4_map_fd, key_ipv4, &value); + if (r < 0) + return r; + + break; + + case AF_INET6: + key_ipv6->prefixlen = a->prefixlen; + memcpy(key_ipv6->data, &a->address, 4 * sizeof(uint32_t)); + + r = bpf_map_update_element(ipv6_map_fd, key_ipv6, &value); + if (r < 0) + return r; + + break; + + default: + return -EAFNOSUPPORT; + } + } + + return 0; +} + +static int bpf_firewall_prepare_access_maps( + Unit *u, + int verdict, + int *ret_ipv4_map_fd, + int *ret_ipv6_map_fd) { + + _cleanup_close_ int ipv4_map_fd = -1, ipv6_map_fd = -1; + size_t n_ipv4 = 0, n_ipv6 = 0; + Unit *p; + int r; + + assert(ret_ipv4_map_fd); + assert(ret_ipv6_map_fd); + + for (p = u; p; p = UNIT_DEREF(p->slice)) { + CGroupContext *cc; + + cc = 
unit_get_cgroup_context(p); + if (!cc) + continue; + + bpf_firewall_count_access_items(verdict == ACCESS_ALLOWED ? cc->ip_address_allow : cc->ip_address_deny, &n_ipv4, &n_ipv6); + } + + if (n_ipv4 > 0) { + ipv4_map_fd = bpf_map_new( + BPF_MAP_TYPE_LPM_TRIE, + offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t), + sizeof(uint64_t), + n_ipv4, + BPF_F_NO_PREALLOC); + if (ipv4_map_fd < 0) + return ipv4_map_fd; + } + + if (n_ipv6 > 0) { + ipv6_map_fd = bpf_map_new( + BPF_MAP_TYPE_LPM_TRIE, + offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t)*4, + sizeof(uint64_t), + n_ipv6, + BPF_F_NO_PREALLOC); + if (ipv6_map_fd < 0) + return ipv6_map_fd; + } + + for (p = u; p; p = UNIT_DEREF(p->slice)) { + CGroupContext *cc; + + cc = unit_get_cgroup_context(p); + if (!cc) + continue; + + r = bpf_firewall_add_access_items(verdict == ACCESS_ALLOWED ? cc->ip_address_allow : cc->ip_address_deny, + ipv4_map_fd, ipv6_map_fd, verdict); + if (r < 0) + return r; + } + + *ret_ipv4_map_fd = ipv4_map_fd; + *ret_ipv6_map_fd = ipv6_map_fd; + + ipv4_map_fd = ipv6_map_fd = -1; + return 0; +} + +static int bpf_firewall_prepare_accounting_maps(bool enabled, int *fd_ingress, int *fd_egress) { + int r; + + assert(fd_ingress); + assert(fd_egress); + + if (enabled) { + if (*fd_ingress < 0) { + r = bpf_map_new(BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0); + if (r < 0) + return r; + + *fd_ingress = r; + } + + if (*fd_egress < 0) { + + r = bpf_map_new(BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0); + if (r < 0) + return r; + + *fd_egress = r; + } + } else { + *fd_ingress = safe_close(*fd_ingress); + *fd_egress = safe_close(*fd_egress); + } + + return 0; +} + +int bpf_firewall_compile(Unit *u) { + CGroupContext *cc; + int r; + + assert(u); + + r = bpf_firewall_supported(); + if (r < 0) + return r; + if (r == 0) { + log_debug("BPF firewalling not supported on this systemd, proceeding without."); + return -EOPNOTSUPP; + } + + /* Note that when we compile a new firewall we 
first flush out the access maps and the BPF programs themselves, + * but we reuse the accounting maps. That way the firewall in effect always maps to the actual + * configuration, but we don't flush out the accounting unnecessarily */ + + u->ip_bpf_ingress = bpf_program_unref(u->ip_bpf_ingress); + u->ip_bpf_egress = bpf_program_unref(u->ip_bpf_egress); + + u->ipv4_allow_map_fd = safe_close(u->ipv4_allow_map_fd); + u->ipv4_deny_map_fd = safe_close(u->ipv4_deny_map_fd); + + u->ipv6_allow_map_fd = safe_close(u->ipv6_allow_map_fd); + u->ipv6_deny_map_fd = safe_close(u->ipv6_deny_map_fd); + + cc = unit_get_cgroup_context(u); + if (!cc) + return -EINVAL; + + r = bpf_firewall_prepare_access_maps(u, ACCESS_ALLOWED, &u->ipv4_allow_map_fd, &u->ipv6_allow_map_fd); + if (r < 0) + return log_error_errno(r, "Preparation of eBPF allow maps failed: %m"); + + r = bpf_firewall_prepare_access_maps(u, ACCESS_DENIED, &u->ipv4_deny_map_fd, &u->ipv6_deny_map_fd); + if (r < 0) + return log_error_errno(r, "Preparation of eBPF deny maps failed: %m"); + + r = bpf_firewall_prepare_accounting_maps(cc->ip_accounting, &u->ip_accounting_ingress_map_fd, &u->ip_accounting_egress_map_fd); + if (r < 0) + return log_error_errno(r, "Preparation of eBPF accounting maps failed: %m"); + + r = bpf_firewall_compile_bpf(u, true, &u->ip_bpf_ingress); + if (r < 0) + return log_error_errno(r, "Compilation for ingress BPF program failed: %m"); + + r = bpf_firewall_compile_bpf(u, false, &u->ip_bpf_egress); + if (r < 0) + return log_error_errno(r, "Compilation for egress BPF program failed: %m"); + + return 0; +} + +int bpf_firewall_install(Unit *u) { + _cleanup_free_ char *path = NULL; + int r; + + assert(u); + + r = bpf_firewall_supported(); + if (r < 0) + return r; + if (r == 0) { + log_debug("BPF firewalling not supported on this systemd, proceeding without."); + return -EOPNOTSUPP; + } + + r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &path); + if (r < 0) + return log_error_errno(r, 
"Failed to determine cgroup path: %m"); + + if (u->ip_bpf_egress) { + r = bpf_program_load_kernel(u->ip_bpf_egress, NULL, 0); + if (r < 0) + return log_error_errno(r, "Kernel upload of egress BPF program failed: %m"); + + r = bpf_program_cgroup_attach(u->ip_bpf_egress, BPF_CGROUP_INET_EGRESS, path); + if (r < 0) + return log_error_errno(r, "Attaching egress BPF program to cgroup %s failed: %m", path); + } else { + r = bpf_program_cgroup_detach(BPF_CGROUP_INET_EGRESS, path); + if (r < 0) + return log_full_errno(r == -ENOENT ? LOG_DEBUG : LOG_ERR, r, + "Detaching egress BPF program from cgroup failed: %m"); + } + + if (u->ip_bpf_ingress) { + r = bpf_program_load_kernel(u->ip_bpf_ingress, NULL, 0); + if (r < 0) + return log_error_errno(r, "Kernel upload of ingress BPF program failed: %m"); + + r = bpf_program_cgroup_attach(u->ip_bpf_ingress, BPF_CGROUP_INET_INGRESS, path); + if (r < 0) + return log_error_errno(r, "Attaching ingress BPF program to cgroup %s failed: %m", path); + } else { + r = bpf_program_cgroup_detach(BPF_CGROUP_INET_INGRESS, path); + if (r < 0) + return log_full_errno(r == -ENOENT ? 
LOG_DEBUG : LOG_ERR, r, + "Detaching ingress BPF program from cgroup failed: %m"); + } + + return 0; +} + +int bpf_firewall_read_accounting(int map_fd, uint64_t *ret_bytes, uint64_t *ret_packets) { + uint64_t key, packets; + int r; + + if (map_fd < 0) + return -EBADF; + + if (ret_packets) { + key = MAP_KEY_PACKETS; + r = bpf_map_lookup_element(map_fd, &key, &packets); + if (r < 0) + return r; + } + + if (ret_bytes) { + key = MAP_KEY_BYTES; + r = bpf_map_lookup_element(map_fd, &key, ret_bytes); + if (r < 0) + return r; + } + + if (ret_packets) + *ret_packets = packets; + + return 0; +} + +int bpf_firewall_reset_accounting(int map_fd) { + uint64_t key, value = 0; + int r; + + if (map_fd < 0) + return -EBADF; + + key = MAP_KEY_PACKETS; + r = bpf_map_update_element(map_fd, &key, &value); + if (r < 0) + return r; + + key = MAP_KEY_BYTES; + return bpf_map_update_element(map_fd, &key, &value); +} + + +int bpf_firewall_supported(void) { + static int supported = -1; + int fd, r; + + /* Checks whether BPF firewalling is supported. 
For this, we check three things: + * + * a) whether we are privileged + * b) whether the unified hierarchy is being used + * c) the BPF implementation in the kernel supports BPF LPM TRIE maps, which we require + * + */ + + if (supported >= 0) + return supported; + + if (geteuid() != 0) + return supported = false; + + r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER); + if (r < 0) + return log_error_errno(r, "Can't determine whether the unified hierarchy is used: %m"); + if (r == 0) + return supported = false; + + fd = bpf_map_new(BPF_MAP_TYPE_LPM_TRIE, + offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint64_t), + sizeof(uint64_t), + 1, + BPF_F_NO_PREALLOC); + if (fd < 0) { + log_debug_errno(fd, "Can't allocate BPF LPM TRIE map, BPF firewalling is not supported: %m"); + return supported = false; + } + + safe_close(fd); + + return supported = true; +} diff --git a/src/core/bpf-firewall.h b/src/core/bpf-firewall.h new file mode 100644 index 00000000000..870e314e0e0 --- /dev/null +++ b/src/core/bpf-firewall.h @@ -0,0 +1,32 @@ +#pragma once + +/*** + This file is part of systemd. + + Copyright 2016 Daniel Mack + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see . 
+***/ + +#include + +#include "unit.h" + +int bpf_firewall_supported(void); + +int bpf_firewall_compile(Unit *u); +int bpf_firewall_install(Unit *u); + +int bpf_firewall_read_accounting(int map_fd, uint64_t *ret_bytes, uint64_t *ret_packets); +int bpf_firewall_reset_accounting(int map_fd); diff --git a/src/core/meson.build b/src/core/meson.build index bea0d3528d1..ac600be117d 100644 --- a/src/core/meson.build +++ b/src/core/meson.build @@ -3,6 +3,8 @@ libcore_la_sources = ''' audit-fd.h automount.c automount.h + bpf-firewall.c + bpf-firewall.h cgroup.c cgroup.h dbus-automount.c From 906c06f64a87cce5378191c092d10432543ea907 Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Tue, 5 Sep 2017 19:27:53 +0200 Subject: [PATCH 14/42] cgroup, unit, fragment parser: make use of new firewall functions --- src/core/cgroup.c | 190 +++++++++++++++++++++++--- src/core/cgroup.h | 18 ++- src/core/dbus-unit.c | 38 ++++++ src/core/load-fragment-gperf.gperf.m4 | 3 + src/core/mount.c | 9 +- src/core/mount.h | 2 +- src/core/scope.c | 3 +- src/core/service.c | 9 +- src/core/service.h | 2 +- src/core/slice.c | 3 +- src/core/socket.c | 9 +- src/core/socket.h | 2 +- src/core/swap.c | 9 +- src/core/swap.h | 2 +- src/core/unit.c | 15 ++ src/core/unit.h | 8 ++ 16 files changed, 280 insertions(+), 42 deletions(-) diff --git a/src/core/cgroup.c b/src/core/cgroup.c index 62cbe08f13f..47c2ad98a81 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -21,6 +21,7 @@ #include #include "alloc-util.h" +#include "bpf-firewall.h" #include "cgroup-util.h" #include "cgroup.h" #include "fd-util.h" @@ -30,9 +31,9 @@ #include "path-util.h" #include "process-util.h" #include "special.h" +#include "stdio-util.h" #include "string-table.h" #include "string-util.h" -#include "stdio-util.h" #define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC) @@ -648,7 +649,27 @@ static void cgroup_apply_unified_memory_limit(Unit *u, const char *file, uint64_ "Failed to set %s: %m", file); } -static void 
cgroup_context_apply(Unit *u, CGroupMask mask, ManagerState state) { +static void cgroup_apply_firewall(Unit *u, CGroupContext *c) { + int r; + + if (u->type == UNIT_SLICE) /* Skip this for slice units, they are inner cgroup nodes, and since bpf/cgroup is + * not recursive we don't ever touch the bpf on them */ + return; + + r = bpf_firewall_compile(u); + if (r < 0) + return; + + (void) bpf_firewall_install(u); + return; +} + +static void cgroup_context_apply( + Unit *u, + CGroupMask apply_mask, + bool apply_bpf, + ManagerState state) { + const char *path; CGroupContext *c; bool is_root; @@ -662,7 +683,8 @@ static void cgroup_context_apply(Unit *u, CGroupMask mask, ManagerState state) { assert(c); assert(path); - if (mask == 0) + /* Nothing to do? Exit early! */ + if (apply_mask == 0 && !apply_bpf) return; /* Some cgroup attributes are not supported on the root cgroup, @@ -676,9 +698,11 @@ static void cgroup_context_apply(Unit *u, CGroupMask mask, ManagerState state) { * cgroup trees (assuming we are running in a container then), * and missing cgroups, i.e. EROFS and ENOENT. 
*/ - if ((mask & CGROUP_MASK_CPU) && !is_root) { - bool has_weight = cgroup_context_has_cpu_weight(c); - bool has_shares = cgroup_context_has_cpu_shares(c); + if ((apply_mask & CGROUP_MASK_CPU) && !is_root) { + bool has_weight, has_shares; + + has_weight = cgroup_context_has_cpu_weight(c); + has_shares = cgroup_context_has_cpu_shares(c); if (cg_all_unified() > 0) { uint64_t weight; @@ -715,7 +739,7 @@ static void cgroup_context_apply(Unit *u, CGroupMask mask, ManagerState state) { } } - if (mask & CGROUP_MASK_IO) { + if (apply_mask & CGROUP_MASK_IO) { bool has_io = cgroup_context_has_io_config(c); bool has_blockio = cgroup_context_has_blockio_config(c); @@ -792,7 +816,7 @@ static void cgroup_context_apply(Unit *u, CGroupMask mask, ManagerState state) { } } - if (mask & CGROUP_MASK_BLKIO) { + if (apply_mask & CGROUP_MASK_BLKIO) { bool has_io = cgroup_context_has_io_config(c); bool has_blockio = cgroup_context_has_blockio_config(c); @@ -859,7 +883,7 @@ static void cgroup_context_apply(Unit *u, CGroupMask mask, ManagerState state) { } } - if ((mask & CGROUP_MASK_MEMORY) && !is_root) { + if ((apply_mask & CGROUP_MASK_MEMORY) && !is_root) { if (cg_all_unified() > 0) { uint64_t max, swap_max = CGROUP_LIMIT_MAX; @@ -899,7 +923,7 @@ static void cgroup_context_apply(Unit *u, CGroupMask mask, ManagerState state) { } } - if ((mask & CGROUP_MASK_DEVICES) && !is_root) { + if ((apply_mask & CGROUP_MASK_DEVICES) && !is_root) { CGroupDeviceAllow *a; /* Changing the devices list of a populated cgroup @@ -963,7 +987,7 @@ static void cgroup_context_apply(Unit *u, CGroupMask mask, ManagerState state) { } } - if ((mask & CGROUP_MASK_PIDS) && !is_root) { + if ((apply_mask & CGROUP_MASK_PIDS) && !is_root) { if (c->tasks_max != CGROUP_LIMIT_MAX) { char buf[DECIMAL_STR_MAX(uint64_t) + 2]; @@ -977,6 +1001,9 @@ static void cgroup_context_apply(Unit *u, CGroupMask mask, ManagerState state) { log_unit_full(u, IN_SET(r, -ENOENT, -EROFS, -EACCES) ? 
LOG_DEBUG : LOG_WARNING, r, "Failed to set pids.max: %m"); } + + if (apply_bpf) + cgroup_apply_firewall(u, c); } CGroupMask cgroup_context_get_mask(CGroupContext *c) { @@ -1123,6 +1150,39 @@ CGroupMask unit_get_enable_mask(Unit *u) { return mask; } +bool unit_get_needs_bpf(Unit *u) { + CGroupContext *c; + Unit *p; + assert(u); + + /* We never attach BPF to slice units, as they are inner cgroup nodes and cgroup/BPF is not recursive at the + * moment. */ + if (u->type == UNIT_SLICE) + return false; + + c = unit_get_cgroup_context(u); + if (!c) + return false; + + if (c->ip_accounting || + c->ip_address_allow || + c->ip_address_deny) + return true; + + /* If any parent slice has an IP access list defined, it applies too */ + for (p = UNIT_DEREF(u->slice); p; p = UNIT_DEREF(p->slice)) { + c = unit_get_cgroup_context(p); + if (!c) + return false; + + if (c->ip_address_allow || + c->ip_address_deny) + return true; + } + + return false; +} + /* Recurse from a unit up through its containing slices, propagating * mask bits upward. A unit is also member of itself. */ void unit_update_cgroup_members_masks(Unit *u) { @@ -1298,7 +1358,8 @@ int unit_watch_cgroup(Unit *u) { static int unit_create_cgroup( Unit *u, CGroupMask target_mask, - CGroupMask enable_mask) { + CGroupMask enable_mask, + bool needs_bpf) { CGroupContext *c; int r; @@ -1340,6 +1401,7 @@ static int unit_create_cgroup( u->cgroup_realized = true; u->cgroup_realized_mask = target_mask; u->cgroup_enabled_mask = enable_mask; + u->cgroup_bpf_state = needs_bpf ? 
UNIT_CGROUP_BPF_ON : UNIT_CGROUP_BPF_OFF; if (u->type != UNIT_SLICE && !c->delegate) { @@ -1389,10 +1451,19 @@ static void cgroup_xattr_apply(Unit *u) { log_unit_warning_errno(u, r, "Failed to set invocation ID on control group %s, ignoring: %m", u->cgroup_path); } -static bool unit_has_mask_realized(Unit *u, CGroupMask target_mask, CGroupMask enable_mask) { +static bool unit_has_mask_realized( + Unit *u, + CGroupMask target_mask, + CGroupMask enable_mask, + bool needs_bpf) { + assert(u); - return u->cgroup_realized && u->cgroup_realized_mask == target_mask && u->cgroup_enabled_mask == enable_mask; + return u->cgroup_realized && + u->cgroup_realized_mask == target_mask && + u->cgroup_enabled_mask == enable_mask && + ((needs_bpf && u->cgroup_bpf_state == UNIT_CGROUP_BPF_ON) || + (!needs_bpf && u->cgroup_bpf_state == UNIT_CGROUP_BPF_OFF)); } /* Check if necessary controllers and attributes for a unit are in place. @@ -1403,6 +1474,7 @@ static bool unit_has_mask_realized(Unit *u, CGroupMask target_mask, CGroupMask e * Returns 0 on success and < 0 on failure. */ static int unit_realize_cgroup_now(Unit *u, ManagerState state) { CGroupMask target_mask, enable_mask; + bool needs_bpf, apply_bpf; int r; assert(u); @@ -1414,10 +1486,16 @@ static int unit_realize_cgroup_now(Unit *u, ManagerState state) { target_mask = unit_get_target_mask(u); enable_mask = unit_get_enable_mask(u); + needs_bpf = unit_get_needs_bpf(u); - if (unit_has_mask_realized(u, target_mask, enable_mask)) + if (unit_has_mask_realized(u, target_mask, enable_mask, needs_bpf)) return 0; + /* Make sure we apply the BPF filters either when one is configured, or if none is configured but previously + * the state was anything but off. This way, if a unit with a BPF filter applied is reconfigured to lose it + * this will trickle down properly to cgroupfs. 
*/ + apply_bpf = needs_bpf || u->cgroup_bpf_state != UNIT_CGROUP_BPF_OFF; + /* First, realize parents */ if (UNIT_ISSET(u->slice)) { r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state); @@ -1426,12 +1504,12 @@ static int unit_realize_cgroup_now(Unit *u, ManagerState state) { } /* And then do the real work */ - r = unit_create_cgroup(u, target_mask, enable_mask); + r = unit_create_cgroup(u, target_mask, enable_mask, needs_bpf); if (r < 0) return r; /* Finally, apply the necessary attributes. */ - cgroup_context_apply(u, target_mask, state); + cgroup_context_apply(u, target_mask, apply_bpf, state); cgroup_xattr_apply(u); return 0; @@ -1495,7 +1573,10 @@ static void unit_queue_siblings(Unit *u) { /* If the unit doesn't need any new controllers * and has current ones realized, it doesn't need * any changes. */ - if (unit_has_mask_realized(m, unit_get_target_mask(m), unit_get_enable_mask(m))) + if (unit_has_mask_realized(m, + unit_get_target_mask(m), + unit_get_enable_mask(m), + unit_get_needs_bpf(m))) continue; unit_add_to_cgroup_queue(m); @@ -2121,7 +2202,34 @@ int unit_get_cpu_usage(Unit *u, nsec_t *ret) { return 0; } -int unit_reset_cpu_usage(Unit *u) { +int unit_get_ip_accounting( + Unit *u, + CGroupIPAccountingMetric metric, + uint64_t *ret) { + + int fd, r; + + assert(u); + assert(metric >= 0); + assert(metric < _CGROUP_IP_ACCOUNTING_METRIC_MAX); + assert(ret); + + fd = IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_INGRESS_PACKETS) ? 
+ u->ip_accounting_ingress_map_fd : + u->ip_accounting_egress_map_fd; + + if (fd < 0) + return -ENODATA; + + if (IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_EGRESS_BYTES)) + r = bpf_firewall_read_accounting(fd, ret, NULL); + else + r = bpf_firewall_read_accounting(fd, NULL, ret); + + return r; +} + +int unit_reset_cpu_accounting(Unit *u) { nsec_t ns; int r; @@ -2139,6 +2247,20 @@ int unit_reset_cpu_usage(Unit *u) { return 0; } +int unit_reset_ip_accounting(Unit *u) { + int r = 0, q = 0; + + assert(u); + + if (u->ip_accounting_ingress_map_fd >= 0) + r = bpf_firewall_reset_accounting(u->ip_accounting_ingress_map_fd); + + if (u->ip_accounting_egress_map_fd >= 0) + q = bpf_firewall_reset_accounting(u->ip_accounting_egress_map_fd); + + return r < 0 ? r : q; +} + bool unit_cgroup_delegate(Unit *u) { CGroupContext *c; @@ -2174,6 +2296,36 @@ void unit_invalidate_cgroup(Unit *u, CGroupMask m) { unit_add_to_cgroup_queue(u); } +void unit_invalidate_cgroup_bpf(Unit *u) { + assert(u); + + if (!UNIT_HAS_CGROUP_CONTEXT(u)) + return; + + if (u->cgroup_bpf_state == UNIT_CGROUP_BPF_INVALIDATED) + return; + + u->cgroup_bpf_state = UNIT_CGROUP_BPF_INVALIDATED; + unit_add_to_cgroup_queue(u); + + /* If we are a slice unit, we also need to put compile a new BPF program for all our children, as the IP access + * list of our children includes our own. 
*/ + if (u->type == UNIT_SLICE) { + Unit *member; + Iterator i; + + SET_FOREACH(member, u->dependencies[UNIT_BEFORE], i) { + if (member == u) + continue; + + if (UNIT_DEREF(member->slice) != u) + continue; + + unit_invalidate_cgroup_bpf(member); + } + } +} + void manager_invalidate_startup_units(Manager *m) { Iterator i; Unit *u; diff --git a/src/core/cgroup.h b/src/core/cgroup.h index 2baf4d20e99..fcbf8d01ca0 100644 --- a/src/core/cgroup.h +++ b/src/core/cgroup.h @@ -128,6 +128,16 @@ struct CGroupContext { bool delegate; }; +/* Used when querying IP accounting data */ +typedef enum CGroupIPAccountingMetric { + CGROUP_IP_INGRESS_BYTES, + CGROUP_IP_INGRESS_PACKETS, + CGROUP_IP_EGRESS_BYTES, + CGROUP_IP_EGRESS_PACKETS, + _CGROUP_IP_ACCOUNTING_METRIC_MAX, + _CGROUP_IP_ACCOUNTING_METRIC_INVALID = -1, +} CGroupIPAccountingMetric; + #include "unit.h" void cgroup_context_init(CGroupContext *c); @@ -150,6 +160,8 @@ CGroupMask unit_get_subtree_mask(Unit *u); CGroupMask unit_get_target_mask(Unit *u); CGroupMask unit_get_enable_mask(Unit *u); +bool unit_get_needs_bpf(Unit *u); + void unit_update_cgroup_members_masks(Unit *u); char *unit_default_cgroup_path(Unit *u); @@ -177,7 +189,10 @@ int unit_watch_all_pids(Unit *u); int unit_get_memory_current(Unit *u, uint64_t *ret); int unit_get_tasks_current(Unit *u, uint64_t *ret); int unit_get_cpu_usage(Unit *u, nsec_t *ret); -int unit_reset_cpu_usage(Unit *u); +int unit_get_ip_accounting(Unit *u, CGroupIPAccountingMetric metric, uint64_t *ret); + +int unit_reset_cpu_accounting(Unit *u); +int unit_reset_ip_accounting(Unit *u); bool unit_cgroup_delegate(Unit *u); @@ -185,6 +200,7 @@ int unit_notify_cgroup_empty(Unit *u); int manager_notify_cgroup_empty(Manager *m, const char *group); void unit_invalidate_cgroup(Unit *u, CGroupMask m); +void unit_invalidate_cgroup_bpf(Unit *u); void manager_invalidate_startup_units(Manager *m); diff --git a/src/core/dbus-unit.c b/src/core/dbus-unit.c index b0645ce2940..8d2ae964d81 100644 --- 
a/src/core/dbus-unit.c +++ b/src/core/dbus-unit.c @@ -20,6 +20,7 @@ #include "sd-bus.h" #include "alloc-util.h" +#include "bpf-firewall.h" #include "bus-common-errors.h" #include "cgroup-util.h" #include "dbus-job.h" @@ -1051,6 +1052,39 @@ int bus_unit_method_get_processes(sd_bus_message *message, void *userdata, sd_bu return sd_bus_send(NULL, reply, NULL); } +static int property_get_ip_counter( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + CGroupIPAccountingMetric metric; + uint64_t value = (uint64_t) -1; + Unit *u = userdata; + + assert(bus); + assert(reply); + assert(property); + assert(u); + + if (streq(property, "IPIngressBytes")) + metric = CGROUP_IP_INGRESS_BYTES; + else if (streq(property, "IPIngressPackets")) + metric = CGROUP_IP_INGRESS_PACKETS; + else if (streq(property, "IPEgressBytes")) + metric = CGROUP_IP_EGRESS_BYTES; + else { + assert(streq(property, "IPEgressPackets")); + metric = CGROUP_IP_EGRESS_PACKETS; + } + + (void) unit_get_ip_accounting(u, metric, &value); + return sd_bus_message_append(reply, "t", value); +} + const sd_bus_vtable bus_unit_cgroup_vtable[] = { SD_BUS_VTABLE_START(0), SD_BUS_PROPERTY("Slice", "s", property_get_slice, 0, 0), @@ -1058,6 +1092,10 @@ const sd_bus_vtable bus_unit_cgroup_vtable[] = { SD_BUS_PROPERTY("MemoryCurrent", "t", property_get_current_memory, 0, 0), SD_BUS_PROPERTY("CPUUsageNSec", "t", property_get_cpu_usage, 0, 0), SD_BUS_PROPERTY("TasksCurrent", "t", property_get_current_tasks, 0, 0), + SD_BUS_PROPERTY("IPIngressBytes", "t", property_get_ip_counter, 0, 0), + SD_BUS_PROPERTY("IPIngressPackets", "t", property_get_ip_counter, 0, 0), + SD_BUS_PROPERTY("IPEgressBytes", "t", property_get_ip_counter, 0, 0), + SD_BUS_PROPERTY("IPEgressPackets", "t", property_get_ip_counter, 0, 0), SD_BUS_METHOD("GetProcesses", NULL, "a(sus)", bus_unit_method_get_processes, SD_BUS_VTABLE_UNPRIVILEGED), SD_BUS_VTABLE_END }; 
diff --git a/src/core/load-fragment-gperf.gperf.m4 b/src/core/load-fragment-gperf.gperf.m4 index f7d5f248619..cc8aad05a08 100644 --- a/src/core/load-fragment-gperf.gperf.m4 +++ b/src/core/load-fragment-gperf.gperf.m4 @@ -174,6 +174,9 @@ $1.BlockIOWriteBandwidth, config_parse_blockio_bandwidth, 0, $1.TasksAccounting, config_parse_bool, 0, offsetof($1, cgroup_context.tasks_accounting) $1.TasksMax, config_parse_tasks_max, 0, offsetof($1, cgroup_context.tasks_max) $1.Delegate, config_parse_bool, 0, offsetof($1, cgroup_context.delegate) +$1.IPAccounting, config_parse_bool, 0, offsetof($1, cgroup_context.ip_accounting) +$1.IPAddressAllow, config_parse_ip_address_access, 0, offsetof($1, cgroup_context.ip_address_allow) +$1.IPAddressDeny, config_parse_ip_address_access, 0, offsetof($1, cgroup_context.ip_address_deny) $1.NetClass, config_parse_warn_compat, DISABLED_LEGACY, 0' )m4_dnl Unit.Description, config_parse_unit_string_printf, 0, offsetof(Unit, description) diff --git a/src/core/mount.c b/src/core/mount.c index 472f54242cd..46bcf37ae06 100644 --- a/src/core/mount.c +++ b/src/core/mount.c @@ -754,9 +754,10 @@ static int mount_spawn(Mount *m, ExecCommand *c, pid_t *_pid) { assert(_pid); (void) unit_realize_cgroup(UNIT(m)); - if (m->reset_cpu_usage) { - (void) unit_reset_cpu_usage(UNIT(m)); - m->reset_cpu_usage = false; + if (m->reset_accounting) { + (void) unit_reset_cpu_accounting(UNIT(m)); + (void) unit_reset_ip_accounting(UNIT(m)); + m->reset_accounting = false; } r = unit_setup_exec_runtime(UNIT(m)); @@ -1044,7 +1045,7 @@ static int mount_start(Unit *u) { m->result = MOUNT_SUCCESS; m->reload_result = MOUNT_SUCCESS; - m->reset_cpu_usage = true; + m->reset_accounting = true; mount_enter_mounting(m); return 1; diff --git a/src/core/mount.h b/src/core/mount.h index 9f7326ba6ad..f81e4217dfb 100644 --- a/src/core/mount.h +++ b/src/core/mount.h @@ -67,7 +67,7 @@ struct Mount { bool just_mounted:1; bool just_changed:1; - bool reset_cpu_usage:1; + bool reset_accounting:1; 
bool sloppy_options; diff --git a/src/core/scope.c b/src/core/scope.c index a1d5c1cfd54..8f9df3b9b7f 100644 --- a/src/core/scope.c +++ b/src/core/scope.c @@ -333,7 +333,8 @@ static int scope_start(Unit *u) { return r; (void) unit_realize_cgroup(u); - (void) unit_reset_cpu_usage(u); + (void) unit_reset_cpu_accounting(u); + (void) unit_reset_ip_accounting(u); r = unit_attach_pids_to_cgroup(u); if (r < 0) { diff --git a/src/core/service.c b/src/core/service.c index 2144884f9e0..b0ce9bfcfa0 100644 --- a/src/core/service.c +++ b/src/core/service.c @@ -1244,9 +1244,10 @@ static int service_spawn( } (void) unit_realize_cgroup(UNIT(s)); - if (s->reset_cpu_usage) { - (void) unit_reset_cpu_usage(UNIT(s)); - s->reset_cpu_usage = false; + if (s->reset_accounting) { + (void) unit_reset_cpu_accounting(UNIT(s)); + (void) unit_reset_ip_accounting(UNIT(s)); + s->reset_accounting = false; } r = unit_setup_exec_runtime(UNIT(s)); @@ -2138,7 +2139,7 @@ static int service_start(Unit *u) { s->main_pid_known = false; s->main_pid_alien = false; s->forbid_restart = false; - s->reset_cpu_usage = true; + s->reset_accounting = true; s->status_text = mfree(s->status_text); s->status_errno = 0; diff --git a/src/core/service.h b/src/core/service.h index 0ac8bc9a675..16b700637c2 100644 --- a/src/core/service.h +++ b/src/core/service.h @@ -165,7 +165,7 @@ struct Service { bool forbid_restart:1; bool start_timeout_defined:1; - bool reset_cpu_usage:1; + bool reset_accounting:1; char *bus_name; char *bus_name_owner; /* unique name of the current owner */ diff --git a/src/core/slice.c b/src/core/slice.c index ed5d3fd701a..b15f751c82d 100644 --- a/src/core/slice.c +++ b/src/core/slice.c @@ -222,7 +222,8 @@ static int slice_start(Unit *u) { return r; (void) unit_realize_cgroup(u); - (void) unit_reset_cpu_usage(u); + (void) unit_reset_cpu_accounting(u); + (void) unit_reset_ip_accounting(u); slice_set_state(t, SLICE_ACTIVE); return 1; diff --git a/src/core/socket.c b/src/core/socket.c index 
a82e7d21870..ec901fbdd7a 100644 --- a/src/core/socket.c +++ b/src/core/socket.c @@ -1775,9 +1775,10 @@ static int socket_spawn(Socket *s, ExecCommand *c, pid_t *_pid) { assert(_pid); (void) unit_realize_cgroup(UNIT(s)); - if (s->reset_cpu_usage) { - (void) unit_reset_cpu_usage(UNIT(s)); - s->reset_cpu_usage = false; + if (s->reset_accounting) { + (void) unit_reset_cpu_accounting(UNIT(s)); + (void) unit_reset_ip_accounting(UNIT(s)); + s->reset_accounting = false; } r = unit_setup_exec_runtime(UNIT(s)); @@ -2373,7 +2374,7 @@ static int socket_start(Unit *u) { return r; s->result = SOCKET_SUCCESS; - s->reset_cpu_usage = true; + s->reset_accounting = true; socket_enter_start_pre(s); return 1; diff --git a/src/core/socket.h b/src/core/socket.h index 89f4664510b..8c263963c48 100644 --- a/src/core/socket.h +++ b/src/core/socket.h @@ -161,7 +161,7 @@ struct Socket { char *user, *group; - bool reset_cpu_usage:1; + bool reset_accounting:1; char *fdname; diff --git a/src/core/swap.c b/src/core/swap.c index 303f62d25af..d58f68458ba 100644 --- a/src/core/swap.c +++ b/src/core/swap.c @@ -620,9 +620,10 @@ static int swap_spawn(Swap *s, ExecCommand *c, pid_t *_pid) { assert(_pid); (void) unit_realize_cgroup(UNIT(s)); - if (s->reset_cpu_usage) { - (void) unit_reset_cpu_usage(UNIT(s)); - s->reset_cpu_usage = false; + if (s->reset_accounting) { + (void) unit_reset_cpu_accounting(UNIT(s)); + (void) unit_reset_ip_accounting(UNIT(s)); + s->reset_accounting = false; } r = unit_setup_exec_runtime(UNIT(s)); @@ -861,7 +862,7 @@ static int swap_start(Unit *u) { return r; s->result = SWAP_SUCCESS; - s->reset_cpu_usage = true; + s->reset_accounting = true; swap_enter_activating(s); return 1; diff --git a/src/core/swap.h b/src/core/swap.h index b0ef50f1e8f..45da63c5e2d 100644 --- a/src/core/swap.h +++ b/src/core/swap.h @@ -70,7 +70,7 @@ struct Swap { bool is_active:1; bool just_activated:1; - bool reset_cpu_usage:1; + bool reset_accounting:1; SwapResult result; diff --git a/src/core/unit.c 
b/src/core/unit.c index 6451b755607..02c8b2a45d0 100644 --- a/src/core/unit.c +++ b/src/core/unit.c @@ -2818,6 +2818,7 @@ int unit_serialize(Unit *u, FILE *f, FDSet *fds, bool serialize_jobs) { unit_serialize_item(u, f, "cgroup-realized", yes_no(u->cgroup_realized)); (void) unit_serialize_cgroup_mask(f, "cgroup-realized-mask", u->cgroup_realized_mask); (void) unit_serialize_cgroup_mask(f, "cgroup-enabled-mask", u->cgroup_enabled_mask); + unit_serialize_item_format(u, f, "cgroup-bpf-realized", "%i", u->cgroup_bpf_state); if (uid_is_valid(u->ref_uid)) unit_serialize_item_format(u, f, "ref-uid", UID_FMT, u->ref_uid); @@ -3089,6 +3090,20 @@ int unit_deserialize(Unit *u, FILE *f, FDSet *fds) { log_unit_debug(u, "Failed to parse cgroup-enabled-mask %s, ignoring.", v); continue; + } else if (streq(l, "cgroup-bpf-realized")) { + int i; + + r = safe_atoi(v, &i); + if (r < 0) + log_unit_debug(u, "Failed to parse cgroup BPF state %s, ignoring.", v); + else + u->cgroup_bpf_state = + i < 0 ? UNIT_CGROUP_BPF_INVALIDATED : + i > 0 ? UNIT_CGROUP_BPF_ON : + UNIT_CGROUP_BPF_OFF; + + continue; + } else if (streq(l, "ref-uid")) { uid_t uid; diff --git a/src/core/unit.h b/src/core/unit.h index 95c41fcceae..598cc6ede61 100644 --- a/src/core/unit.h +++ b/src/core/unit.h @@ -71,6 +71,12 @@ struct UnitRef { LIST_FIELDS(UnitRef, refs); }; +typedef enum UnitCGroupBPFState { + UNIT_CGROUP_BPF_OFF = 0, + UNIT_CGROUP_BPF_ON = 1, + UNIT_CGROUP_BPF_INVALIDATED = -1, +} UnitCGroupBPFState; + struct Unit { Manager *manager; @@ -267,6 +273,8 @@ struct Unit { bool cgroup_members_mask_valid:1; bool cgroup_subtree_mask_valid:1; + UnitCGroupBPFState cgroup_bpf_state:2; + bool start_limit_hit:1; /* Did we already invoke unit_coldplug() for this unit? 
*/ From 377bfd2d49fad1bbef6f48a7686e28fec00ea7fa Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Fri, 1 Sep 2017 16:04:50 +0200 Subject: [PATCH 15/42] manager: hook up IP accounting defaults --- src/core/main.c | 3 +++ src/core/manager.h | 1 + src/core/system.conf | 1 + src/core/unit.c | 1 + 4 files changed, 6 insertions(+) diff --git a/src/core/main.c b/src/core/main.c index fbf8876a2de..8660a31a2dd 100644 --- a/src/core/main.c +++ b/src/core/main.c @@ -128,6 +128,7 @@ static Set* arg_syscall_archs = NULL; static FILE* arg_serialization = NULL; static bool arg_default_cpu_accounting = false; static bool arg_default_io_accounting = false; +static bool arg_default_ip_accounting = false; static bool arg_default_blockio_accounting = false; static bool arg_default_memory_accounting = false; static bool arg_default_tasks_accounting = true; @@ -748,6 +749,7 @@ static int parse_config_file(void) { { "Manager", "DefaultLimitRTTIME", config_parse_limit, RLIMIT_RTTIME, arg_default_rlimit }, { "Manager", "DefaultCPUAccounting", config_parse_bool, 0, &arg_default_cpu_accounting }, { "Manager", "DefaultIOAccounting", config_parse_bool, 0, &arg_default_io_accounting }, + { "Manager", "DefaultIPAccounting", config_parse_bool, 0, &arg_default_ip_accounting }, { "Manager", "DefaultBlockIOAccounting", config_parse_bool, 0, &arg_default_blockio_accounting }, { "Manager", "DefaultMemoryAccounting", config_parse_bool, 0, &arg_default_memory_accounting }, { "Manager", "DefaultTasksAccounting", config_parse_bool, 0, &arg_default_tasks_accounting }, @@ -792,6 +794,7 @@ static void manager_set_defaults(Manager *m) { m->default_start_limit_burst = arg_default_start_limit_burst; m->default_cpu_accounting = arg_default_cpu_accounting; m->default_io_accounting = arg_default_io_accounting; + m->default_ip_accounting = arg_default_ip_accounting; m->default_blockio_accounting = arg_default_blockio_accounting; m->default_memory_accounting = arg_default_memory_accounting; 
m->default_tasks_accounting = arg_default_tasks_accounting; diff --git a/src/core/manager.h b/src/core/manager.h index 8880b3aab53..e8a62674711 100644 --- a/src/core/manager.h +++ b/src/core/manager.h @@ -272,6 +272,7 @@ struct Manager { bool default_io_accounting; bool default_blockio_accounting; bool default_tasks_accounting; + bool default_ip_accounting; uint64_t default_tasks_max; usec_t default_timer_accuracy_usec; diff --git a/src/core/system.conf b/src/core/system.conf index 88f646e2fe1..6b86eac33db 100644 --- a/src/core/system.conf +++ b/src/core/system.conf @@ -40,6 +40,7 @@ #DefaultEnvironment= #DefaultCPUAccounting=no #DefaultIOAccounting=no +#DefaultIPAccounting=no #DefaultBlockIOAccounting=no #DefaultMemoryAccounting=no #DefaultTasksAccounting=yes diff --git a/src/core/unit.c b/src/core/unit.c index 02c8b2a45d0..dc709d7ca45 100644 --- a/src/core/unit.c +++ b/src/core/unit.c @@ -161,6 +161,7 @@ static void unit_init(Unit *u) { cc->cpu_accounting = u->manager->default_cpu_accounting; cc->io_accounting = u->manager->default_io_accounting; + cc->ip_accounting = u->manager->default_ip_accounting; cc->blockio_accounting = u->manager->default_blockio_accounting; cc->memory_accounting = u->manager->default_memory_accounting; cc->tasks_accounting = u->manager->default_tasks_accounting; From 0e97c93fe5c374432325ca29fe0ec21ac19dd9dd Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Thu, 3 Nov 2016 19:00:09 +0100 Subject: [PATCH 16/42] systemctl: report accounted network traffic in "systemctl status" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This hooks up the exposed D-Bus values and displays them like this: -bash-4.3# systemctl status httpd ● httpd.service - The Apache HTTP Server Loaded: loaded (/etc/systemd/system/httpd.service; enabled; vendor preset: disabled) Active: active (running) since Fri 2016-11-11 20:10:36 CET; 1min 29s ago Main PID: 33 (httpd) Status: "Total requests: 22514; Idle/Busy workers 
92/7;Requests/sec: 259; Bytes served/sec: 87KB/sec" Network: 15.8M in, 51.1M out ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ CGroup: /system.slice/httpd.service ├─ 33 /usr/sbin/httpd -DFOREGROUND ├─ 37 /usr/sbin/httpd -DFOREGROUND ├─112 /usr/sbin/httpd -DFOREGROUND └─119 /usr/sbin/httpd -DFOREGROUND --- src/systemctl/systemctl.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/systemctl/systemctl.c b/src/systemctl/systemctl.c index 318cd35bf60..897fc48b989 100644 --- a/src/systemctl/systemctl.c +++ b/src/systemctl/systemctl.c @@ -3878,6 +3878,9 @@ typedef struct UnitStatusInfo { uint64_t tasks_current; uint64_t tasks_max; + uint64_t ip_ingress_bytes; + uint64_t ip_egress_bytes; + LIST_HEAD(ExecStatusInfo, exec); } UnitStatusInfo; @@ -4194,6 +4197,14 @@ static void print_status_info( if (i->status_errno > 0) printf(" Error: %i (%s)\n", i->status_errno, strerror(i->status_errno)); + if (i->ip_ingress_bytes != (uint64_t) -1 && i->ip_egress_bytes != (uint64_t) -1) { + char buf_in[FORMAT_BYTES_MAX], buf_out[FORMAT_BYTES_MAX]; + + printf(" IP: %s in, %s out\n", + format_bytes(buf_in, sizeof(buf_in), i->ip_ingress_bytes), + format_bytes(buf_out, sizeof(buf_out), i->ip_egress_bytes)); + } + if (i->tasks_current != (uint64_t) -1) { printf(" Tasks: %" PRIu64, i->tasks_current); @@ -4484,6 +4495,10 @@ static int status_property(const char *name, sd_bus_message *m, UnitStatusInfo * i->next_elapse_monotonic = u; else if (streq(name, "NextElapseUSecRealtime")) i->next_elapse_real = u; + else if (streq(name, "IPIngressBytes")) + i->ip_ingress_bytes = u; + else if (streq(name, "IPEgressBytes")) + i->ip_egress_bytes = u; break; } @@ -4998,6 +5013,8 @@ static int show_one( .cpu_usage_nsec = (uint64_t) -1, .tasks_current = (uint64_t) -1, .tasks_max = (uint64_t) -1, + .ip_ingress_bytes = (uint64_t) -1, + .ip_egress_bytes = (uint64_t) -1, }; int r; From c21c99060ba6db8257f575cc9b4b470716c50e0b Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 1 Sep 2017 20:31:44 
+0200 Subject: [PATCH 17/42] cgroup: dump the newly added IP settings in the cgroup context --- src/core/cgroup.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/core/cgroup.c b/src/core/cgroup.c index 47c2ad98a81..af611e7e7bc 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -153,6 +153,7 @@ void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) { CGroupBlockIODeviceBandwidth *b; CGroupBlockIODeviceWeight *w; CGroupDeviceAllow *a; + IPAddressAccessItem *iaai; char u[FORMAT_TIMESPAN_MAX]; assert(c); @@ -166,6 +167,7 @@ void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) { "%sBlockIOAccounting=%s\n" "%sMemoryAccounting=%s\n" "%sTasksAccounting=%s\n" + "%sIPAccounting=%s\n" "%sCPUWeight=%" PRIu64 "\n" "%sStartupCPUWeight=%" PRIu64 "\n" "%sCPUShares=%" PRIu64 "\n" @@ -188,6 +190,7 @@ void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) { prefix, yes_no(c->blockio_accounting), prefix, yes_no(c->memory_accounting), prefix, yes_no(c->tasks_accounting), + prefix, yes_no(c->ip_accounting), prefix, c->cpu_weight, prefix, c->startup_cpu_weight, prefix, c->cpu_shares, @@ -257,6 +260,20 @@ void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) { b->path, format_bytes(buf, sizeof(buf), b->wbps)); } + + LIST_FOREACH(items, iaai, c->ip_address_allow) { + _cleanup_free_ char *k = NULL; + + (void) in_addr_to_string(iaai->family, &iaai->address, &k); + fprintf(f, "%sIPAddressAllow=%s/%u\n", prefix, strnull(k), iaai->prefixlen); + } + + LIST_FOREACH(items, iaai, c->ip_address_deny) { + _cleanup_free_ char *k = NULL; + + (void) in_addr_to_string(iaai->family, &iaai->address, &k); + fprintf(f, "%sIPAddressDeny=%s/%u\n", prefix, strnull(k), iaai->prefixlen); + } } static int lookup_block_device(const char *p, dev_t *dev) { From 3dc5ca9787fcc08317d5e6a689cb1e7eb9ba5384 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 5 Sep 2017 11:16:35 +0200 Subject: [PATCH 18/42] core: 
support IP firewalling to be configured for transient units --- src/core/dbus-cgroup.c | 169 +++++++++++++++++++++++++++++++++++++ src/shared/bus-unit-util.c | 133 +++++++++++++++++++++++++++-- 2 files changed, 295 insertions(+), 7 deletions(-) diff --git a/src/core/dbus-cgroup.c b/src/core/dbus-cgroup.c index c1026e3f5b4..3bb4108ac72 100644 --- a/src/core/dbus-cgroup.c +++ b/src/core/dbus-cgroup.c @@ -17,6 +17,9 @@ along with systemd; If not, see . ***/ +#include + +#include "af-list.h" #include "alloc-util.h" #include "bus-util.h" #include "cgroup-util.h" @@ -206,6 +209,48 @@ static int property_get_device_allow( return sd_bus_message_close_container(reply); } +static int property_get_ip_address_access( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + IPAddressAccessItem** items = userdata, *i; + int r; + + r = sd_bus_message_open_container(reply, 'a', "(iayu)"); + if (r < 0) + return r; + + LIST_FOREACH(items, i, *items) { + + r = sd_bus_message_open_container(reply, 'r', "iayu"); + if (r < 0) + return r; + + r = sd_bus_message_append(reply, "i", i->family); + if (r < 0) + return r; + + r = sd_bus_message_append_array(reply, 'y', &i->address, FAMILY_ADDRESS_SIZE(i->family)); + if (r < 0) + return r; + + r = sd_bus_message_append(reply, "u", (uint32_t) i->prefixlen); + if (r < 0) + return r; + + r = sd_bus_message_close_container(reply); + if (r < 0) + return r; + } + + return sd_bus_message_close_container(reply); +} + const sd_bus_vtable bus_cgroup_vtable[] = { SD_BUS_VTABLE_START(0), SD_BUS_PROPERTY("Delegate", "b", bus_property_get_bool, offsetof(CGroupContext, delegate), 0), @@ -239,6 +284,9 @@ const sd_bus_vtable bus_cgroup_vtable[] = { SD_BUS_PROPERTY("DeviceAllow", "a(ss)", property_get_device_allow, 0, 0), SD_BUS_PROPERTY("TasksAccounting", "b", bus_property_get_bool, offsetof(CGroupContext, tasks_accounting), 0), SD_BUS_PROPERTY("TasksMax", "t", 
NULL, offsetof(CGroupContext, tasks_max), 0), + SD_BUS_PROPERTY("IPAccounting", "b", bus_property_get_bool, offsetof(CGroupContext, ip_accounting), 0), + SD_BUS_PROPERTY("IPAddressAllow", "a(iayu)", property_get_ip_address_access, offsetof(CGroupContext, ip_address_allow), 0), + SD_BUS_PROPERTY("IPAddressDeny", "a(iayu)", property_get_ip_address_access, offsetof(CGroupContext, ip_address_deny), 0), SD_BUS_VTABLE_END }; @@ -1133,6 +1181,7 @@ int bus_cgroup_set_property( } return 1; + } else if (streq(name, "TasksMaxScale")) { uint64_t limit; uint32_t raw; @@ -1152,6 +1201,126 @@ int bus_cgroup_set_property( (uint32_t) (DIV_ROUND_UP((uint64_t) raw * 100U, (uint64_t) UINT32_MAX))); } + return 1; + + } else if (streq(name, "IPAccounting")) { + int b; + + r = sd_bus_message_read(message, "b", &b); + if (r < 0) + return r; + + if (mode != UNIT_CHECK) { + c->ip_accounting = b; + + unit_invalidate_cgroup_bpf(u); + unit_write_drop_in_private(u, mode, name, b ? "IPAccounting=yes" : "IPAccounting=no"); + } + + return 1; + + } else if (STR_IN_SET(name, "IPAddressAllow", "IPAddressDeny")) { + IPAddressAccessItem **list; + size_t n = 0; + + list = streq(name, "IPAddressAllow") ? 
&c->ip_address_allow : &c->ip_address_deny; + + r = sd_bus_message_enter_container(message, 'a', "(iayu)"); + if (r < 0) + return r; + + for (;;) { + const void *ap; + int32_t family; + uint32_t prefixlen; + size_t an; + + r = sd_bus_message_enter_container(message, 'r', "iayu"); + if (r < 0) + return r; + if (r == 0) + break; + + r = sd_bus_message_read(message, "i", &family); + if (r < 0) + return r; + + if (!IN_SET(family, AF_INET, AF_INET6)) + return sd_bus_error_set_errnof(error, EINVAL, "IPAddressAllow= expects IPv4 or IPv6 addresses only."); + + r = sd_bus_message_read_array(message, 'y', &ap, &an); + if (r < 0) + return r; + + if (an != FAMILY_ADDRESS_SIZE(family)) + return sd_bus_error_set_errnof(error, EINVAL, "IP address has wrong size for family (%s, expected %zu, got %zu)", + af_to_name(family), FAMILY_ADDRESS_SIZE(family), an); + + r = sd_bus_message_read(message, "u", &prefixlen); + if (r < 0) + return r; + + if (prefixlen > FAMILY_ADDRESS_SIZE(family)*8) + return sd_bus_error_set_errnof(error, EINVAL, "Prefix length too large for family."); + + if (mode != UNIT_CHECK) { + IPAddressAccessItem *item; + + item = new0(IPAddressAccessItem, 1); + if (!item) + return -ENOMEM; + + item->family = family; + item->prefixlen = prefixlen; + memcpy(&item->address, ap, an); + + LIST_PREPEND(items, *list, item); + } + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + n++; + } + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + if (mode != UNIT_CHECK) { + _cleanup_free_ char *buf = NULL; + _cleanup_fclose_ FILE *f = NULL; + IPAddressAccessItem *item; + size_t size = 0; + + if (n == 0) + *list = ip_address_access_free_all(*list); + + unit_invalidate_cgroup_bpf(u); + f = open_memstream(&buf, &size); + if (!f) + return -ENOMEM; + + fputs_unlocked(name, f); + fputs_unlocked("=\n", f); + + LIST_FOREACH(items, item, *list) { + char buffer[CONST_MAX(INET_ADDRSTRLEN, INET6_ADDRSTRLEN)]; + + errno = 0; + if 
(!inet_ntop(item->family, &item->address, buffer, sizeof(buffer))) + return errno > 0 ? -errno : -EINVAL; + + fprintf(f, "%s=%s/%u\n", name, buffer, item->prefixlen); + } + + r = fflush_and_check(f); + if (r < 0) + return r; + unit_write_drop_in_private(u, mode, name, buf); + } + return 1; } diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c index d6b119987c0..d216df465f4 100644 --- a/src/shared/bus-unit-util.c +++ b/src/shared/bus-unit-util.c @@ -28,6 +28,8 @@ #include "errno-list.h" #include "escape.h" #include "hashmap.h" +#include "hostname-util.h" +#include "in-addr-util.h" #include "list.h" #include "locale-util.h" #include "mount-util.h" @@ -66,6 +68,31 @@ int bus_parse_unit_info(sd_bus_message *message, UnitInfo *u) { &u->job_path); } +static int bus_append_ip_address_access(sd_bus_message *m, int family, const union in_addr_union *prefix, unsigned char prefixlen) { + int r; + + assert(m); + assert(prefix); + + r = sd_bus_message_open_container(m, 'r', "iayu"); + if (r < 0) + return r; + + r = sd_bus_message_append(m, "i", family); + if (r < 0) + return r; + + r = sd_bus_message_append_array(m, 'y', prefix, FAMILY_ADDRESS_SIZE(family)); + if (r < 0) + return r; + + r = sd_bus_message_append(m, "u", prefixlen); + if (r < 0) + return r; + + return sd_bus_message_close_container(m); +} + int bus_append_unit_property_assignment(sd_bus_message *m, const char *assignment) { const char *eq, *field; UnitDependency dep; @@ -207,13 +234,13 @@ int bus_append_unit_property_assignment(sd_bus_message *m, const char *assignmen r = sd_bus_message_append(m, "sv", sn, "t", l.rlim_cur); } else if (STR_IN_SET(field, - "CPUAccounting", "MemoryAccounting", "IOAccounting", "BlockIOAccounting", "TasksAccounting", - "SendSIGHUP", "SendSIGKILL", "WakeSystem", "DefaultDependencies", - "IgnoreSIGPIPE", "TTYVHangup", "TTYReset", "TTYVTDisallocate", "RemainAfterExit", - "PrivateTmp", "PrivateDevices", "PrivateNetwork", "PrivateUsers", "NoNewPrivileges", - 
"SyslogLevelPrefix", "Delegate", "RemainAfterElapse", "MemoryDenyWriteExecute", - "RestrictRealtime", "DynamicUser", "RemoveIPC", "ProtectKernelTunables", - "ProtectKernelModules", "ProtectControlGroups", "MountAPIVFS", + "CPUAccounting", "MemoryAccounting", "IOAccounting", "BlockIOAccounting", + "TasksAccounting", "IPAccounting", "SendSIGHUP", "SendSIGKILL", "WakeSystem", + "DefaultDependencies", "IgnoreSIGPIPE", "TTYVHangup", "TTYReset", "TTYVTDisallocate", + "RemainAfterExit", "PrivateTmp", "PrivateDevices", "PrivateNetwork", "PrivateUsers", + "NoNewPrivileges", "SyslogLevelPrefix", "Delegate", "RemainAfterElapse", + "MemoryDenyWriteExecute", "RestrictRealtime", "DynamicUser", "RemoveIPC", + "ProtectKernelTunables", "ProtectKernelModules", "ProtectControlGroups", "MountAPIVFS", "CPUSchedulingResetOnFork", "LockPersonality")) { r = parse_boolean(eq); @@ -433,6 +460,98 @@ int bus_append_unit_property_assignment(sd_bus_message *m, const char *assignmen r = sd_bus_message_append(m, "v", "a(st)", 1, path, u); } + } else if (STR_IN_SET(field, "IPAddressAllow", "IPAddressDeny")) { + + if (isempty(eq)) + r = sd_bus_message_append(m, "v", "a(iayu)", 0); + else { + unsigned char prefixlen; + union in_addr_union prefix = {}; + int family; + + r = sd_bus_message_open_container(m, 'v', "a(iayu)"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_open_container(m, 'a', "(iayu)"); + if (r < 0) + return bus_log_create_error(r); + + if (streq(eq, "any")) { + /* "any" is a shortcut for 0.0.0.0/0 and ::/0 */ + + r = bus_append_ip_address_access(m, AF_INET, &prefix, 0); + if (r < 0) + return bus_log_create_error(r); + + r = bus_append_ip_address_access(m, AF_INET6, &prefix, 0); + if (r < 0) + return bus_log_create_error(r); + + } else if (is_localhost(eq)) { + /* "localhost" is a shortcut for 127.0.0.0/8 and ::1/128 */ + + prefix.in.s_addr = htobe32(0x7f000000); + r = bus_append_ip_address_access(m, AF_INET, &prefix, 8); + if (r < 0) + return 
bus_log_create_error(r); + + prefix.in6 = (struct in6_addr) IN6ADDR_LOOPBACK_INIT; + r = bus_append_ip_address_access(m, AF_INET6, &prefix, 128); + if (r < 0) + return r; + + } else if (streq(eq, "link-local")) { + + /* "link-local" is a shortcut for 169.254.0.0/16 and fe80::/64 */ + + prefix.in.s_addr = htobe32((UINT32_C(169) << 24 | UINT32_C(254) << 16)); + r = bus_append_ip_address_access(m, AF_INET, &prefix, 16); + if (r < 0) + return bus_log_create_error(r); + + prefix.in6 = (struct in6_addr) { + .__in6_u.__u6_addr32[0] = htobe32(0xfe800000) + }; + r = bus_append_ip_address_access(m, AF_INET6, &prefix, 64); + if (r < 0) + return bus_log_create_error(r); + + } else if (streq(eq, "multicast")) { + + /* "multicast" is a shortcut for 224.0.0.0/4 and ff00::/8 */ + + prefix.in.s_addr = htobe32((UINT32_C(224) << 24)); + r = bus_append_ip_address_access(m, AF_INET, &prefix, 4); + if (r < 0) + return bus_log_create_error(r); + + prefix.in6 = (struct in6_addr) { + .__in6_u.__u6_addr32[0] = htobe32(0xff000000) + }; + r = bus_append_ip_address_access(m, AF_INET6, &prefix, 8); + if (r < 0) + return bus_log_create_error(r); + + } else { + r = in_addr_prefix_from_string_auto(eq, &family, &prefix, &prefixlen); + if (r < 0) + return log_error_errno(r, "Failed to parse IP address prefix: %s", eq); + + r = bus_append_ip_address_access(m, family, &prefix, prefixlen); + if (r < 0) + return bus_log_create_error(r); + } + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + } + } else if (streq(field, "CPUSchedulingPolicy")) { int n; From 2ba6e7381b0d7708df28bf1423a642bb12222614 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 5 Sep 2017 12:19:28 +0200 Subject: [PATCH 19/42] mkosi: when the build fails, show its log output, and propagate error --- mkosi.build | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mkosi.build 
b/mkosi.build index 12e88b909ca..92eb55b130c 100755 --- a/mkosi.build +++ b/mkosi.build @@ -28,7 +28,7 @@ export LC_CTYPE=C.UTF-8 [ -f "$BUILDDIR"/build.ninja ] || meson "$BUILDDIR" ninja -C "$BUILDDIR" all -[ "$WITH_TESTS" = 0 ] || ninja -C "$BUILDDIR" test +[ "$WITH_TESTS" = 0 ] || ninja -C "$BUILDDIR" test || ( RET="$?" ; cat "$BUILDDIR"/meson-logs/testlog.txt ; exit "$RET" ) ninja -C "$BUILDDIR" install mkdir -p "$DESTDIR"/etc From 1274b6c68759be1b06140d13e5ec3a0ce3967e94 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 5 Sep 2017 17:41:34 +0200 Subject: [PATCH 20/42] ip-address-access: minimize IP address lists Let's drop redundant items from the IP address list after parsing. Let's also mask out redundant bits hidden by the prefixlength. --- src/basic/in-addr-util.c | 24 +++++++++++++++++++++ src/basic/in-addr-util.h | 1 + src/core/dbus-cgroup.c | 2 ++ src/core/ip-address-access.c | 42 ++++++++++++++++++++++++++++++++++++ src/core/ip-address-access.h | 2 ++ 5 files changed, 71 insertions(+) diff --git a/src/basic/in-addr-util.c b/src/basic/in-addr-util.c index 94f06258aac..e27faba75fa 100644 --- a/src/basic/in-addr-util.c +++ b/src/basic/in-addr-util.c @@ -465,6 +465,30 @@ int in_addr_mask(int family, union in_addr_union *addr, unsigned char prefixlen) return -EAFNOSUPPORT; } +int in_addr_prefix_covers(int family, + const union in_addr_union *prefix, + unsigned char prefixlen, + const union in_addr_union *address) { + + union in_addr_union masked_prefix, masked_address; + int r; + + assert(prefix); + assert(address); + + masked_prefix = *prefix; + r = in_addr_mask(family, &masked_prefix, prefixlen); + if (r < 0) + return r; + + masked_address = *address; + r = in_addr_mask(family, &masked_address, prefixlen); + if (r < 0) + return r; + + return in_addr_equal(family, &masked_prefix, &masked_address); +} + int in_addr_parse_prefixlen(int family, const char *p, unsigned char *ret) { uint8_t u; int r; diff --git a/src/basic/in-addr-util.h 
b/src/basic/in-addr-util.h index bb57c089bfd..d129bf55854 100644 --- a/src/basic/in-addr-util.h +++ b/src/basic/in-addr-util.h @@ -60,6 +60,7 @@ struct in_addr* in4_addr_prefixlen_to_netmask(struct in_addr *addr, unsigned cha int in4_addr_default_prefixlen(const struct in_addr *addr, unsigned char *prefixlen); int in4_addr_default_subnet_mask(const struct in_addr *addr, struct in_addr *mask); int in_addr_mask(int family, union in_addr_union *addr, unsigned char prefixlen); +int in_addr_prefix_covers(int family, const union in_addr_union *prefix, unsigned char prefixlen, const union in_addr_union *address); int in_addr_parse_prefixlen(int family, const char *p, unsigned char *ret); int in_addr_prefix_from_string(const char *p, int family, union in_addr_union *ret_prefix, unsigned char *ret_prefixlen); int in_addr_prefix_from_string_auto(const char *p, int *ret_family, union in_addr_union *ret_prefix, unsigned char *ret_prefixlen); diff --git a/src/core/dbus-cgroup.c b/src/core/dbus-cgroup.c index 3bb4108ac72..42381eca364 100644 --- a/src/core/dbus-cgroup.c +++ b/src/core/dbus-cgroup.c @@ -1288,6 +1288,8 @@ int bus_cgroup_set_property( if (r < 0) return r; + *list = ip_address_access_reduce(*list); + if (mode != UNIT_CHECK) { _cleanup_free_ char *buf = NULL; _cleanup_fclose_ FILE *f = NULL; diff --git a/src/core/ip-address-access.c b/src/core/ip-address-access.c index 6a89bb23c18..18d28708bed 100644 --- a/src/core/ip-address-access.c +++ b/src/core/ip-address-access.c @@ -148,6 +148,8 @@ int config_parse_ip_address_access( a = NULL; } + *list = ip_address_access_reduce(*list); + return 0; } @@ -163,3 +165,43 @@ IPAddressAccessItem* ip_address_access_free_all(IPAddressAccessItem *first) { return NULL; } + +IPAddressAccessItem* ip_address_access_reduce(IPAddressAccessItem *first) { + IPAddressAccessItem *a, *b, *tmp; + int r; + + /* Drops all entries from the list that are covered by another entry in full, thus removing all redundant + * entries. 
*/ + + LIST_FOREACH_SAFE(items, a, tmp, first) { + + /* Drop irrelevant bits */ + (void) in_addr_mask(a->family, &a->address, a->prefixlen); + + LIST_FOREACH(items, b, first) { + + if (a == b) + continue; + + if (a->family != b->family) + continue; + + if (b->prefixlen > a->prefixlen) + continue; + + r = in_addr_prefix_covers(b->family, + &b->address, + b->prefixlen, + &a->address); + if (r <= 0) + continue; + + /* b covers a fully, then let's drop a */ + + LIST_REMOVE(items, first, a); + free(a); + } + } + + return first; +} diff --git a/src/core/ip-address-access.h b/src/core/ip-address-access.h index eea20b4848b..9aeab1f4f84 100644 --- a/src/core/ip-address-access.h +++ b/src/core/ip-address-access.h @@ -34,3 +34,5 @@ struct IPAddressAccessItem { int config_parse_ip_address_access(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); IPAddressAccessItem* ip_address_access_free_all(IPAddressAccessItem *first); + +IPAddressAccessItem* ip_address_access_reduce(IPAddressAccessItem *first); From db3a59308c67693256f92e9e5541d7f0cbd95bd9 Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Thu, 3 Nov 2016 17:31:25 +0100 Subject: [PATCH 21/42] Add test for eBPF firewall code --- src/test/meson.build | 11 +++ src/test/test-bpf.c | 162 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 173 insertions(+) create mode 100644 src/test/test-bpf.c diff --git a/src/test/meson.build b/src/test/meson.build index b1543cdcd2e..1f3db65781e 100644 --- a/src/test/meson.build +++ b/src/test/meson.build @@ -339,6 +339,17 @@ tests += [ [libbasic], []], + [['src/test/test-bpf.c', + 'src/test/test-helper.c'], + [libcore, + libshared], + [libmount, + threads, + librt, + libseccomp, + libselinux, + libblkid]], + [['src/test/test-hashmap.c', 'src/test/test-hashmap-plain.c', test_hashmap_ordered_c], diff --git a/src/test/test-bpf.c b/src/test/test-bpf.c new file 
mode 100644 index 00000000000..74e9d505612 --- /dev/null +++ b/src/test/test-bpf.c @@ -0,0 +1,162 @@ +/*** + This file is part of systemd. + + Copyright 2016 Daniel Mack + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see . +***/ + +#include +#include +#include + +#include "bpf-firewall.h" +#include "bpf-program.h" +#include "load-fragment.h" +#include "manager.h" +#include "rm-rf.h" +#include "service.h" +#include "test-helper.h" +#include "tests.h" +#include "unit.h" + +int main(int argc, char *argv[]) { + struct bpf_insn exit_insn[] = { + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN() + }; + + _cleanup_(rm_rf_physical_and_freep) char *runtime_dir = NULL; + CGroupContext *cc = NULL; + _cleanup_(bpf_program_unrefp) BPFProgram *p = NULL; + Manager *m = NULL; + Unit *u; + char log_buf[65535]; + int r; + + log_set_max_level(LOG_DEBUG); + log_parse_environment(); + log_open(); + + enter_cgroup_subroot(); + assert_se(set_unit_path(get_testdata_dir("")) >= 0); + assert_se(runtime_dir = setup_fake_runtime_dir()); + + r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, &p); + assert(r == 0); + + r = bpf_program_add_instructions(p, exit_insn, ELEMENTSOF(exit_insn)); + assert(r == 0); + + if (getuid() != 0) { + log_notice("Not running as root, skipping kernel related tests."); + return EXIT_TEST_SKIP; + } + + r = bpf_firewall_supported(); + if (r == 0) { + log_notice("BPF firewalling not supported, skipping"); + return 
EXIT_TEST_SKIP; + } + assert_se(r > 0); + + r = bpf_program_load_kernel(p, log_buf, ELEMENTSOF(log_buf)); + assert(r >= 0); + + p = bpf_program_unref(p); + + /* The simple tests succeeded. Now let's try a full unit-based use-case. */ + + assert_se(manager_new(UNIT_FILE_USER, true, &m) >= 0); + assert_se(manager_startup(m, NULL, NULL) >= 0); + + assert_se(u = unit_new(m, sizeof(Service))); + assert_se(unit_add_name(u, "foo.service") == 0); + assert_se(cc = unit_get_cgroup_context(u)); + u->perpetual = true; + + cc->ip_accounting = true; + + assert_se(config_parse_ip_address_access(u->id, "filename", 1, "Service", 1, "IPAddressAllow", 0, "10.0.1.0/24", &cc->ip_address_allow, NULL) == 0); + assert_se(config_parse_ip_address_access(u->id, "filename", 1, "Service", 1, "IPAddressAllow", 0, "127.0.0.2", &cc->ip_address_allow, NULL) == 0); + assert_se(config_parse_ip_address_access(u->id, "filename", 1, "Service", 1, "IPAddressDeny", 0, "127.0.0.3", &cc->ip_address_deny, NULL) == 0); + assert_se(config_parse_ip_address_access(u->id, "filename", 1, "Service", 1, "IPAddressDeny", 0, "10.0.3.2/24", &cc->ip_address_deny, NULL) == 0); + assert_se(config_parse_ip_address_access(u->id, "filename", 1, "Service", 1, "IPAddressDeny", 0, "127.0.0.1/25", &cc->ip_address_deny, NULL) == 0); + assert_se(config_parse_ip_address_access(u->id, "filename", 1, "Service", 1, "IPAddressDeny", 0, "127.0.0.4", &cc->ip_address_deny, NULL) == 0); + + assert(cc->ip_address_allow); + assert(cc->ip_address_allow->items_next); + assert(!cc->ip_address_allow->items_next->items_next); + + /* The deny list is defined redundantly, let's ensure it got properly reduced */ + assert(cc->ip_address_deny); + assert(cc->ip_address_deny->items_next); + assert(!cc->ip_address_deny->items_next->items_next); + + assert_se(config_parse_exec(u->id, "filename", 1, "Service", 1, "ExecStart", SERVICE_EXEC_START, "/usr/bin/ping -c 1 127.0.0.2 -W 5", SERVICE(u)->exec_command, u) == 0); + assert_se(config_parse_exec(u->id,
"filename", 1, "Service", 1, "ExecStart", SERVICE_EXEC_START, "/usr/bin/ping -c 1 127.0.0.3 -W 5", SERVICE(u)->exec_command, u) == 0); + + assert_se(SERVICE(u)->exec_command[SERVICE_EXEC_START]); + assert_se(SERVICE(u)->exec_command[SERVICE_EXEC_START]->command_next); + assert_se(!SERVICE(u)->exec_command[SERVICE_EXEC_START]->command_next->command_next); + + SERVICE(u)->type = SERVICE_ONESHOT; + u->load_state = UNIT_LOADED; + + unit_dump(u, stdout, NULL); + + r = bpf_firewall_compile(u); + if (IN_SET(r, -ENOTTY, -ENOSYS, -EPERM )) { + /* Kernel doesn't support the necessary bpf bits, or masked out via seccomp? */ + manager_free(m); + return EXIT_TEST_SKIP; + } + assert_se(r >= 0); + + assert(u->ip_bpf_ingress); + assert(u->ip_bpf_egress); + + r = bpf_program_load_kernel(u->ip_bpf_ingress, log_buf, ELEMENTSOF(log_buf)); + + log_notice("log:"); + log_notice("-------"); + log_notice("%s", log_buf); + log_notice("-------"); + + assert(r >= 0); + + r = bpf_program_load_kernel(u->ip_bpf_egress, log_buf, ELEMENTSOF(log_buf)); + + log_notice("log:"); + log_notice("-------"); + log_notice("%s", log_buf); + log_notice("-------"); + + assert(r >= 0); + + assert(unit_start(u) >= 0); + + while (!IN_SET(SERVICE(u)->state, SERVICE_DEAD, SERVICE_FAILED)) + assert_se(sd_event_run(m->event, UINT64_MAX) >= 0); + + assert_se(SERVICE(u)->exec_command[SERVICE_EXEC_START]->exec_status.code == CLD_EXITED && + SERVICE(u)->exec_command[SERVICE_EXEC_START]->exec_status.status == EXIT_SUCCESS); + + assert_se(SERVICE(u)->exec_command[SERVICE_EXEC_START]->command_next->exec_status.code != CLD_EXITED || + SERVICE(u)->exec_command[SERVICE_EXEC_START]->command_next->exec_status.status != EXIT_SUCCESS); + + manager_free(m); + + return 0; +} From 078ba556da253e994724aa5565d570b5d22e0c17 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 6 Sep 2017 17:56:15 +0200 Subject: [PATCH 22/42] core: warn loudly if IP firewalling is configured but not in effect --- src/core/dbus-cgroup.c | 10 
++++++++++ src/core/ip-address-access.c | 10 ++++++++++ 2 files changed, 20 insertions(+) diff --git a/src/core/dbus-cgroup.c b/src/core/dbus-cgroup.c index 42381eca364..f61ca08fcba 100644 --- a/src/core/dbus-cgroup.c +++ b/src/core/dbus-cgroup.c @@ -21,6 +21,7 @@ #include "af-list.h" #include "alloc-util.h" +#include "bpf-firewall.h" #include "bus-util.h" #include "cgroup-util.h" #include "cgroup.h" @@ -1321,6 +1322,15 @@ int bus_cgroup_set_property( if (r < 0) return r; unit_write_drop_in_private(u, mode, name, buf); + + if (*list) { + r = bpf_firewall_supported(); + if (r < 0) + return r; + if (r == 0) + log_warning("Transient unit %s configures an IP firewall, but the local system does not support BPF/cgroup firewalling.\n" + "Proceeding WITHOUT firewalling in effect!", u->id); + } } return 1; diff --git a/src/core/ip-address-access.c b/src/core/ip-address-access.c index 18d28708bed..cfb7d51c4f6 100644 --- a/src/core/ip-address-access.c +++ b/src/core/ip-address-access.c @@ -21,6 +21,7 @@ #include #include "alloc-util.h" +#include "bpf-firewall.h" #include "extract-word.h" #include "hostname-util.h" #include "ip-address-access.h" @@ -150,6 +151,15 @@ int config_parse_ip_address_access( *list = ip_address_access_reduce(*list); + if (*list) { + r = bpf_firewall_supported(); + if (r < 0) + return r; + if (r == 0) + log_warning("File %s:%u configures an IP firewall (%s=%s), but the local system does not support BPF/cgroup based firewalling.\n" + "Proceeding WITHOUT firewalling in effect!", filename, line, lvalue, rvalue); + } + return 0; } From 5ed272cf928b5f2d7420b5568390a8190c8d52fe Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 7 Sep 2017 11:15:27 +0200 Subject: [PATCH 23/42] socket-label: let's use IN_SET, so that we have to call socket_address_family() only once --- src/basic/socket-label.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/basic/socket-label.c b/src/basic/socket-label.c index 6d1dc83874f..6e7cdaac633 100644 
--- a/src/basic/socket-label.c +++ b/src/basic/socket-label.c @@ -83,7 +83,7 @@ int socket_address_listen( return -errno; } - if (socket_address_family(a) == AF_INET || socket_address_family(a) == AF_INET6) { + if (IN_SET(socket_address_family(a), AF_INET, AF_INET6)) { if (bind_to_device) if (setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, bind_to_device, strlen(bind_to_device)+1) < 0) return -errno; From a79279c7fd9f43680da7d5cac382981cf7714f52 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 7 Sep 2017 11:17:43 +0200 Subject: [PATCH 24/42] core: when creating the socket fds for a socket unit, join socket's cgroup first Let's make sure that a socket unit's IPAddressAllow=/IPAddressDeny= settings are in effect on all socket fds associated with it. In order to make this happen we need to make sure the cgroup the fds are associated with is the socket unit's cgroup. The only way to do that is invoking socket()+accept() in it. Since we really don't want to migrate PID 1 around we do this by forking off a helper process, which invokes socket()/accept() and sends the newly created fd to PID 1. Ugly, but works, and there's apparently no better way right now. This generalizes forking off per-unit helper processes in a new function unit_fork_helper_process(), which is then also used by the NSS chown() code of socket units.
--- src/core/socket.c | 257 +++++++++++++++++++++++++++++++++++++--------- src/core/unit.c | 40 ++++++++ src/core/unit.h | 2 + 3 files changed, 248 insertions(+), 51 deletions(-) diff --git a/src/core/socket.c b/src/core/socket.c index ec901fbdd7a..891a2c8fdb3 100644 --- a/src/core/socket.c +++ b/src/core/socket.c @@ -29,6 +29,7 @@ #include #include "alloc-util.h" +#include "bpf-firewall.h" #include "bus-error.h" #include "bus-util.h" #include "copy.h" @@ -37,6 +38,7 @@ #include "exit-status.h" #include "fd-util.h" #include "format-util.h" +#include "in-addr-util.h" #include "io-util.h" #include "label.h" #include "log.h" @@ -56,7 +58,6 @@ #include "unit-name.h" #include "unit.h" #include "user-util.h" -#include "in-addr-util.h" struct SocketPeer { unsigned n_ref; @@ -1437,6 +1438,102 @@ no_label: return 0; } +static int socket_address_listen_do( + Socket *s, + const SocketAddress *address, + const char *label) { + + assert(s); + assert(address); + + return socket_address_listen( + address, + SOCK_CLOEXEC|SOCK_NONBLOCK, + s->backlog, + s->bind_ipv6_only, + s->bind_to_device, + s->reuse_port, + s->free_bind, + s->transparent, + s->directory_mode, + s->socket_mode, + label); +} + +static int socket_address_listen_in_cgroup( + Socket *s, + const SocketAddress *address, + const char *label) { + + _cleanup_close_pair_ int pair[2] = { -1, -1 }; + int fd, r; + pid_t pid; + + assert(s); + assert(address); + + /* This is a wrapper around socket_address_listen(), that forks off a helper process inside the socket's cgroup + * in which the socket is actually created. This way we ensure the socket is actually properly attached to the + * unit's cgroup for the purpose of BPF filtering and such. 
*/ + + if (!IN_SET(address->sockaddr.sa.sa_family, AF_INET, AF_INET6)) + goto shortcut; /* BPF filtering only applies to IPv4 + IPv6, shortcut things for other protocols */ + + r = bpf_firewall_supported(); + if (r < 0) + return r; + if (r == 0) /* If BPF firewalling isn't supported anyway — there's no point in this forking complexity */ + goto shortcut; + + if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pair) < 0) + return log_unit_error_errno(UNIT(s), errno, "Failed to create communication channel: %m"); + + r = unit_fork_helper_process(UNIT(s), &pid); + if (r < 0) + return log_unit_error_errno(UNIT(s), r, "Failed to fork off listener stub process: %m"); + if (r == 0) { + /* Child */ + + pair[0] = safe_close(pair[0]); + + fd = socket_address_listen_do(s, address, label); + if (fd < 0) { + log_unit_error_errno(UNIT(s), fd, "Failed to create listening socket: %m"); + _exit(EXIT_FAILURE); + } + + r = send_one_fd(pair[1], fd, 0); + if (r < 0) { + log_unit_error_errno(UNIT(s), r, "Failed to send listening socket to parent: %m"); + _exit(EXIT_FAILURE); + } + + _exit(EXIT_SUCCESS); + } + + pair[1] = safe_close(pair[1]); + fd = receive_one_fd(pair[0], 0); + + /* We synchronously wait for the helper, as it shouldn't be slow */ + r = wait_for_terminate_and_warn("listen-cgroup-helper", pid, false); + if (r < 0) { + safe_close(fd); + return r; + } + + if (fd < 0) + return log_unit_error_errno(UNIT(s), fd, "Failed to receive listening socket: %m"); + + return fd; + +shortcut: + fd = socket_address_listen_do(s, address, label); + if (fd < 0) + return log_error_errno(fd, "Failed to create listening socket: %m"); + + return fd; +} + static int socket_open_fds(Socket *s) { _cleanup_(mac_selinux_freep) char *label = NULL; bool know_label = false; @@ -1480,18 +1577,7 @@ static int socket_open_fds(Socket *s) { break; } - r = socket_address_listen( - &p->address, - SOCK_CLOEXEC|SOCK_NONBLOCK, - s->backlog, - s->bind_ipv6_only, - s->bind_to_device, - s->reuse_port, - 
s->free_bind, - s->transparent, - s->directory_mode, - s->socket_mode, - label); + r = socket_address_listen_in_cgroup(s, &p->address, label); if (r < 0) goto rollback; @@ -1829,27 +1915,23 @@ static int socket_chown(Socket *s, pid_t *_pid) { /* We have to resolve the user names out-of-process, hence * let's fork here. It's messy, but well, what can we do? */ - pid = fork(); - if (pid < 0) - return -errno; - - if (pid == 0) { - SocketPort *p; + r = unit_fork_helper_process(UNIT(s), &pid); + if (r < 0) + return r; + if (r == 0) { uid_t uid = UID_INVALID; gid_t gid = GID_INVALID; - int ret; + SocketPort *p; - (void) default_signals(SIGNALS_CRASH_HANDLER, SIGNALS_IGNORE, -1); - (void) ignore_signals(SIGPIPE, -1); - log_forget_fds(); + /* Child */ if (!isempty(s->user)) { const char *user = s->user; r = get_user_creds(&user, &uid, &gid, NULL, NULL); if (r < 0) { - ret = EXIT_USER; - goto fail_child; + log_unit_error_errno(UNIT(s), r, "Failed to resolve user %s: %m", user); + _exit(EXIT_USER); } } @@ -1858,8 +1940,8 @@ static int socket_chown(Socket *s, pid_t *_pid) { r = get_group_creds(&group, &gid); if (r < 0) { - ret = EXIT_GROUP; - goto fail_child; + log_unit_error_errno(UNIT(s), r, "Failed to resolve group %s: %m", group); + _exit(EXIT_GROUP); } } @@ -1875,19 +1957,12 @@ static int socket_chown(Socket *s, pid_t *_pid) { continue; if (chown(path, uid, gid) < 0) { - r = -errno; - ret = EXIT_CHOWN; - goto fail_child; + log_unit_error_errno(UNIT(s), errno, "Failed to chown(): %m"); + _exit(EXIT_CHOWN); } } - _exit(0); - - fail_child: - log_open(); - log_error_errno(r, "Failed to chown socket at step %s: %m", exit_status_to_string(ret, EXIT_STATUS_SYSTEMD)); - - _exit(ret); + _exit(EXIT_SUCCESS); } r = unit_watch_pid(UNIT(s), pid); @@ -2699,6 +2774,97 @@ _pure_ static bool socket_check_gc(Unit *u) { return s->n_connections > 0; } +static int socket_accept_do(Socket *s, int fd) { + int cfd; + + assert(s); + assert(fd >= 0); + + for (;;) { + cfd = accept4(fd, NULL, NULL, 
SOCK_NONBLOCK); + if (cfd < 0) { + if (errno == EINTR) + continue; + + return -errno; + } + + break; + } + + return cfd; +} + +static int socket_accept_in_cgroup(Socket *s, SocketPort *p, int fd) { + _cleanup_close_pair_ int pair[2] = { -1, -1 }; + int cfd, r; + pid_t pid; + + assert(s); + assert(p); + assert(fd >= 0); + + /* Similar to socket_address_listen_in_cgroup(), but for accept() rather than socket(): make sure that any + * connection socket is also properly associated with the cgroup. */ + + if (!IN_SET(p->address.sockaddr.sa.sa_family, AF_INET, AF_INET6)) + goto shortcut; + + r = bpf_firewall_supported(); + if (r < 0) + return r; + if (r == 0) + goto shortcut; + + if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pair) < 0) + return log_unit_error_errno(UNIT(s), errno, "Failed to create communication channel: %m"); + + r = unit_fork_helper_process(UNIT(s), &pid); + if (r < 0) + return log_unit_error_errno(UNIT(s), r, "Failed to fork off accept stub process: %m"); + if (r == 0) { + /* Child */ + + pair[0] = safe_close(pair[0]); + + cfd = socket_accept_do(s, fd); + if (cfd < 0) { + log_unit_error_errno(UNIT(s), cfd, "Failed to accept connection socket: %m"); + _exit(EXIT_FAILURE); + } + + r = send_one_fd(pair[1], cfd, 0); + if (r < 0) { + log_unit_error_errno(UNIT(s), r, "Failed to send connection socket to parent: %m"); + _exit(EXIT_FAILURE); + } + + _exit(EXIT_SUCCESS); + } + + pair[1] = safe_close(pair[1]); + cfd = receive_one_fd(pair[0], 0); + + /* We synchronously wait for the helper, as it shouldn't be slow */ + r = wait_for_terminate_and_warn("accept-cgroup-helper", pid, false); + if (r < 0) { + safe_close(cfd); + return r; + } + + if (cfd < 0) + return log_unit_error_errno(UNIT(s), cfd, "Failed to receive connection socket: %m"); + + return cfd; + +shortcut: + cfd = socket_accept_do(s, fd); + if (cfd < 0) + return log_unit_error_errno(UNIT(s), cfd, "Failed to accept connection socket: %m"); + + return cfd; +} + static int 
socket_dispatch_io(sd_event_source *source, int fd, uint32_t revents, void *userdata) { SocketPort *p = userdata; int cfd = -1; @@ -2724,20 +2890,9 @@ static int socket_dispatch_io(sd_event_source *source, int fd, uint32_t revents, p->type == SOCKET_SOCKET && socket_address_can_accept(&p->address)) { - for (;;) { - - cfd = accept4(fd, NULL, NULL, SOCK_NONBLOCK); - if (cfd < 0) { - - if (errno == EINTR) - continue; - - log_unit_error_errno(UNIT(p->socket), errno, "Failed to accept socket: %m"); - goto fail; - } - - break; - } + cfd = socket_accept_in_cgroup(p->socket, p, fd); + if (cfd < 0) + goto fail; socket_apply_socket_options(p->socket, cfd); } diff --git a/src/core/unit.c b/src/core/unit.c index dc709d7ca45..bb40baf2363 100644 --- a/src/core/unit.c +++ b/src/core/unit.c @@ -4476,3 +4476,43 @@ void unit_set_exec_params(Unit *s, ExecParameters *p) { c = unit_get_cgroup_context(s); SET_FLAG(p->flags, EXEC_CGROUP_DELEGATE, c && c->delegate); } + +int unit_fork_helper_process(Unit *u, pid_t *ret) { + pid_t pid; + int r; + + assert(u); + assert(ret); + + /* Forks off a helper process and makes sure it is a member of the unit's cgroup. Returns == 0 in the child, + * and > 0 in the parent. The pid parameter is always filled in with the child's PID. 
*/ + + (void) unit_realize_cgroup(u); + + pid = fork(); + if (pid < 0) + return -errno; + + if (pid == 0) { + + (void) default_signals(SIGNALS_CRASH_HANDLER, SIGNALS_IGNORE, -1); + (void) ignore_signals(SIGPIPE, -1); + + log_close(); + log_open(); + + if (u->cgroup_path) { + r = cg_attach_everywhere(u->manager->cgroup_supported, u->cgroup_path, 0, NULL, NULL); + if (r < 0) { + log_unit_error_errno(u, r, "Failed to join unit cgroup %s: %m", u->cgroup_path); + _exit(EXIT_CGROUP); + } + } + + *ret = getpid_cached(); + return 0; + } + + *ret = pid; + return 1; +} diff --git a/src/core/unit.h b/src/core/unit.h index 598cc6ede61..a707e5259de 100644 --- a/src/core/unit.h +++ b/src/core/unit.h @@ -682,6 +682,8 @@ bool unit_shall_confirm_spawn(Unit *u); void unit_set_exec_params(Unit *s, ExecParameters *p); +int unit_fork_helper_process(Unit *u, pid_t *ret); + /* Macros which append UNIT= or USER_UNIT= to the message */ #define log_unit_full(unit, level, error, ...) \ From 6b659ed87e98a69ab60ce2220f50a64af54a21d3 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 7 Sep 2017 14:07:13 +0200 Subject: [PATCH 25/42] core: serialize/deserialize IP accounting across daemon reload/reexec Make sure the current IP accounting counters aren't lost during reload/reexec. Note that we destroy all BPF file objects during a reload: the BPF programs, the access and the accounting maps. The former two need to be regenerated anyway with the newly loaded configuration data, but the latter one needs to survive reloads/reexec. In this implementation I opted to only save/restore the accounting map content instead of the map itself. While this opens a (theoretic) window where IP traffic is still accounted to the old map after we read it out, and we thus miss a few bytes this has the benefit that we can alter the map layout between versions should the need arise. 
--- src/core/cgroup.c | 15 +++++++++++++-- src/core/unit.c | 32 ++++++++++++++++++++++++++++++++ src/core/unit.h | 3 +++ 3 files changed, 48 insertions(+), 2 deletions(-) diff --git a/src/core/cgroup.c b/src/core/cgroup.c index af611e7e7bc..c6667b39c78 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -2224,6 +2224,7 @@ int unit_get_ip_accounting( CGroupIPAccountingMetric metric, uint64_t *ret) { + uint64_t value; int fd, r; assert(u); @@ -2239,9 +2240,17 @@ int unit_get_ip_accounting( return -ENODATA; if (IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_EGRESS_BYTES)) - r = bpf_firewall_read_accounting(fd, ret, NULL); + r = bpf_firewall_read_accounting(fd, &value, NULL); else - r = bpf_firewall_read_accounting(fd, NULL, ret); + r = bpf_firewall_read_accounting(fd, NULL, &value); + if (r < 0) + return r; + + /* Add in additional metrics from a previous runtime. Note that when reexecing/reloading the daemon we compile + * all BPF programs and maps anew, but serialize the old counters. When deserializing we store them in the + * ip_accounting_extra[] field, and add them in here transparently. */ + + *ret = value + u->ip_accounting_extra[metric]; return r; } @@ -2275,6 +2284,8 @@ int unit_reset_ip_accounting(Unit *u) { if (u->ip_accounting_egress_map_fd >= 0) q = bpf_firewall_reset_accounting(u->ip_accounting_egress_map_fd); + zero(u->ip_accounting_extra); + return r < 0 ? 
r : q; } diff --git a/src/core/unit.c b/src/core/unit.c index bb40baf2363..5ed57644551 100644 --- a/src/core/unit.c +++ b/src/core/unit.c @@ -2770,7 +2770,15 @@ static int unit_serialize_cgroup_mask(FILE *f, const char *key, CGroupMask mask) return r; } +static const char *ip_accounting_metric_field[_CGROUP_IP_ACCOUNTING_METRIC_MAX] = { + [CGROUP_IP_INGRESS_BYTES] = "ip-accounting-ingress-bytes", + [CGROUP_IP_INGRESS_PACKETS] = "ip-accounting-ingress-packets", + [CGROUP_IP_EGRESS_BYTES] = "ip-accounting-egress-bytes", + [CGROUP_IP_EGRESS_PACKETS] = "ip-accounting-egress-packets", +}; + int unit_serialize(Unit *u, FILE *f, FDSet *fds, bool serialize_jobs) { + CGroupIPAccountingMetric m; int r; assert(u); @@ -2831,6 +2839,14 @@ int unit_serialize(Unit *u, FILE *f, FDSet *fds, bool serialize_jobs) { bus_track_serialize(u->bus_track, f, "ref"); + for (m = 0; m < _CGROUP_IP_ACCOUNTING_METRIC_MAX; m++) { + uint64_t v; + + r = unit_get_ip_accounting(u, m, &v); + if (r >= 0) + unit_serialize_item_format(u, f, ip_accounting_metric_field[m], "%" PRIu64, v); + } + if (serialize_jobs) { if (u->job) { fprintf(f, "job\n"); @@ -2937,6 +2953,7 @@ int unit_deserialize(Unit *u, FILE *f, FDSet *fds) { for (;;) { char line[LINE_MAX], *l, *v; + CGroupIPAccountingMetric m; size_t k; if (!fgets(line, sizeof(line), f)) { @@ -3147,6 +3164,21 @@ int unit_deserialize(Unit *u, FILE *f, FDSet *fds) { continue; } + /* Check if this is an IP accounting metric serialization field */ + for (m = 0; m < _CGROUP_IP_ACCOUNTING_METRIC_MAX; m++) + if (streq(l, ip_accounting_metric_field[m])) + break; + if (m < _CGROUP_IP_ACCOUNTING_METRIC_MAX) { + uint64_t c; + + r = safe_atou64(v, &c); + if (r < 0) + log_unit_debug(u, "Failed to parse IP accounting value %s, ignoring.", v); + else + u->ip_accounting_extra[m] = c; + continue; + } + if (unit_can_serialize(u)) { if (rt) { r = exec_runtime_deserialize_item(u, rt, l, v, fds); diff --git a/src/core/unit.h b/src/core/unit.h index a707e5259de..2759bd07a28 
100644 --- a/src/core/unit.h +++ b/src/core/unit.h @@ -34,6 +34,7 @@ typedef struct UnitStatusMessageFormats UnitStatusMessageFormats; #include "install.h" #include "list.h" #include "unit-name.h" +#include "cgroup.h" typedef enum KillOperation { KILL_TERMINATE, @@ -224,6 +225,8 @@ struct Unit { BPFProgram *ip_bpf_ingress; BPFProgram *ip_bpf_egress; + uint64_t ip_accounting_extra[_CGROUP_IP_ACCOUNTING_METRIC_MAX]; + /* How to start OnFailure units */ JobMode on_failure_job_mode; From 58d83430e1276fe8d1224c2b5f76e756d143a375 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 7 Sep 2017 14:32:33 +0200 Subject: [PATCH 26/42] core: when coming back from reload/reexec, reapply all cgroup properties With this change we'll invalidate all cgroup settings after coming back from a daemon reload/reexec, so that the new settings are instantly applied. This is useful for the BPF case, because we don't serialize/deserialize the BPF program fd, and hence have to install a new, updated BPF program when coming back from the reload/reexec. However, this is also useful for the rest of the cgroup settings, as it ensures that user configuration really takes effect wherever we can. 
--- src/core/cgroup.c | 1 + src/core/unit.c | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/src/core/cgroup.c b/src/core/cgroup.c index c6667b39c78..e9cb0d35c91 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -1533,6 +1533,7 @@ static int unit_realize_cgroup_now(Unit *u, ManagerState state) { } static void unit_add_to_cgroup_queue(Unit *u) { + assert(u); if (u->in_cgroup_queue) return; diff --git a/src/core/unit.c b/src/core/unit.c index 5ed57644551..68295d64455 100644 --- a/src/core/unit.c +++ b/src/core/unit.c @@ -3205,6 +3205,11 @@ int unit_deserialize(Unit *u, FILE *f, FDSet *fds) { if (!dual_timestamp_is_set(&u->state_change_timestamp)) dual_timestamp_get(&u->state_change_timestamp); + /* Let's make sure that everything that is deserialized also gets any potential new cgroup settings applied + * after we are done. For that we invalidate anything already realized, so that we can realize it again. */ + unit_invalidate_cgroup(u, _CGROUP_MASK_ALL); + unit_invalidate_cgroup_bpf(u); + return 0; } From cf3b4be101acb396fe3b9504685a970be7f86764 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 7 Sep 2017 16:31:01 +0200 Subject: [PATCH 27/42] cgroup: refuse to return accounting data if accounting isn't turned on We used to be a bit sloppy on this, and handed out accounting data even for units where accounting wasn't explicitly enabled. Let's be stricter here, so that we know the accounting data is actually fully valid. This is necessary, as the accounting data is no longer stored exclusively in cgroupfs, but is partly maintained external of that, and flushed during unit starts. We should hence only expose accounting data we really know is fully current. 
--- src/core/cgroup.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/src/core/cgroup.c b/src/core/cgroup.c index e9cb0d35c91..9a0d374aa8d 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -2088,11 +2088,18 @@ int manager_notify_cgroup_empty(Manager *m, const char *cgroup) { int unit_get_memory_current(Unit *u, uint64_t *ret) { _cleanup_free_ char *v = NULL; + CGroupContext *cc; int r; assert(u); assert(ret); + cc = unit_get_cgroup_context(u); + if (!cc) + return -ENODATA; + if (!cc->memory_accounting) + return -ENODATA; + if (!u->cgroup_path) return -ENODATA; @@ -2116,11 +2123,18 @@ int unit_get_memory_current(Unit *u, uint64_t *ret) { int unit_get_tasks_current(Unit *u, uint64_t *ret) { _cleanup_free_ char *v = NULL; + CGroupContext *cc; int r; assert(u); assert(ret); + cc = unit_get_cgroup_context(u); + if (!cc) + return -ENODATA; + if (!cc->tasks_accounting) + return -ENODATA; + if (!u->cgroup_path) return -ENODATA; @@ -2187,6 +2201,7 @@ static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) { } int unit_get_cpu_usage(Unit *u, nsec_t *ret) { + CGroupContext *cc; nsec_t ns; int r; @@ -2196,6 +2211,12 @@ int unit_get_cpu_usage(Unit *u, nsec_t *ret) { * started. If the cgroup has been removed already, returns the last cached value. To cache the value, simply * call this function with a NULL return value. 
*/ + cc = unit_get_cgroup_context(u); + if (!cc) + return -ENODATA; + if (!cc->cpu_accounting) + return -ENODATA; + r = unit_get_cpu_usage_raw(u, &ns); if (r == -ENODATA && u->cpu_usage_last != NSEC_INFINITY) { /* If we can't get the CPU usage anymore (because the cgroup was already removed, for example), use our @@ -2225,6 +2246,7 @@ int unit_get_ip_accounting( CGroupIPAccountingMetric metric, uint64_t *ret) { + CGroupContext *cc; uint64_t value; int fd, r; @@ -2233,6 +2255,19 @@ int unit_get_ip_accounting( assert(metric < _CGROUP_IP_ACCOUNTING_METRIC_MAX); assert(ret); + /* IP accounting is currently not recursive, and hence we refuse to return any data for slice nodes. Slices are + * inner cgroup nodes and hence have no processes directly attached, hence their counters would be zero + * anyway. And if we block this now we can later open this up, if the kernel learns recursive BPF cgroup + * filters. */ + if (u->type == UNIT_SLICE) + return -ENODATA; + + cc = unit_get_cgroup_context(u); + if (!cc) + return -ENODATA; + if (!cc->ip_accounting) + return -ENODATA; + fd = IN_SET(metric, CGROUP_IP_INGRESS_BYTES, CGROUP_IP_INGRESS_PACKETS) ? 
u->ip_accounting_ingress_map_fd : u->ip_accounting_egress_map_fd; From 8d8631d4c9e4132d0a09b7d16996b1942e379223 Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Sat, 12 Nov 2016 13:38:38 +0100 Subject: [PATCH 28/42] man: document the new ip accounting and filtering directives --- man/systemd-system.conf.xml | 15 ++-- man/systemd.resource-control.xml | 117 +++++++++++++++++++++++++++++++ man/systemd.special.xml | 12 ++-- 3 files changed, 129 insertions(+), 15 deletions(-) diff --git a/man/systemd-system.conf.xml b/man/systemd-system.conf.xml index 336c7a5fd11..81f1b1ef8d0 100644 --- a/man/systemd-system.conf.xml +++ b/man/systemd-system.conf.xml @@ -319,17 +319,14 @@ DefaultBlockIOAccounting= DefaultMemoryAccounting= DefaultTasksAccounting= + DefaultIPAccounting= - Configure the default resource accounting - settings, as configured per-unit by - CPUAccounting=, - BlockIOAccounting=, - MemoryAccounting= and - TasksAccounting=. See + Configure the default resource accounting settings, as configured per-unit by + CPUAccounting=, BlockIOAccounting=, MemoryAccounting=, + TasksAccounting= and IPAccounting=. See systemd.resource-control5 - for details on the per-unit - settings. DefaultTasksAccounting= defaults - to on, the other three settings to off. + for details on the per-unit settings. DefaultTasksAccounting= defaults to on, the other + four settings to off. diff --git a/man/systemd.resource-control.xml b/man/systemd.resource-control.xml index bb69599f998..0c0c91608ac 100644 --- a/man/systemd.resource-control.xml +++ b/man/systemd.resource-control.xml @@ -480,6 +480,123 @@ + + IPAccounting= + + + Takes a boolean argument. If true, turns on IPv4 and IPv6 network traffic accounting for packets sent + or received by the unit. When this option is turned on, all IPv4 and IPv6 sockets created by any process of + the unit are accounted for. 
When this option is used in socket units, it applies to all IPv4 and IPv6 sockets + associated with it (including both listening and connection sockets where this applies). Note that for + socket-activated services, this configuration setting and the accounting data of the service unit and the + socket unit are kept separate, and displayed separately. No propagation of the setting and the collected + statistics is done, in either direction. Moreover, any traffic sent or received on any of the socket unit's + sockets is accounted to the socket unit — and never to the service unit it might have activated, even if the + socket is used by it. Note that IP accounting is currently not supported for slice units, and enabling this + option for them has no effect. The system default for this setting may be controlled with + DefaultIPAccounting= in + systemd-system.conf5. + + + + + IPAddressAllow=ADDRESS[/PREFIXLENGTH]… + IPAddressDeny=ADDRESS[/PREFIXLENGTH]… + + + Turn on address range network traffic filtering for packets sent and received over AF_INET and AF_INET6 + sockets. Both directives take a space separated list of IPv4 or IPv6 addresses, each optionally suffixed + with an address prefix length (separated by a / character). If the latter is omitted, the + address is considered a host address, i.e. the prefix covers the whole address (32 for IPv4, 128 for IPv6). + + + The access lists configured with this option are applied to all sockets created by processes of this + unit (or in the case of socket units, associated with it). The lists are implicitly combined with any lists + configured for any of the parent slice units this unit might be a member of. By default all access lists are + empty. When configured the lists are enforced as follows: + + + Access will be granted in case its destination/source address matches any entry in the + IPAddressAllow= setting. 
+ + Otherwise, access will be denied in case its destination/source address matches any entry + in the IPAddressDeny= setting. + + Otherwise, access will be granted. + + + In order to implement a whitelisting IP firewall, it is recommended to use an + IPAddressDeny=any setting on an upper-level slice unit (such as the + root slice -.slice or the slice containing all system services + system.slice – see + systemd.special7 for + details on these slice units), plus individual per-service IPAddressAllow= lines + permitting network access to relevant services, and only them. + + Note that for socket-activated services, the IP access list configured on the socket unit applies to + all sockets associated with it directly, but not to any sockets created by the ultimately activated services + for it. Conversely, the IP access list configured for the service is not applied to any sockets passed into + the service via socket activation. Thus, it is usually a good idea to replicate the IP access lists on both + the socket and the service unit, however it often makes sense to maintain one list more open and the other + one more restricted, depending on the use case. + + If these settings are used multiple times in the same unit the specified lists are combined. If an + empty string is assigned to these settings the specific access list is reset and all previous settings undone. + + In place of explicit IPv4 or IPv6 address and prefix length specifications a small set of symbolic + names may be used. The following names are defined: + + + Special address/network names + + + + + + + + + Symbolic Name + Definition + Meaning + + + + + + any + 0.0.0.0/0 ::/0 + Any host + + + + localhost + 127.0.0.0/8 ::1/128 + All addresses on the local loopback + + + + link-local + 169.254.0.0/16 fe80::/64 + All link-local IP addresses + + + + multicast + 224.0.0.0/4 ff00::/8 + All IP multicasting addresses + + + +
+ + Note that these settings might not be supported on some systems (for example if eBPF control group + support is not enabled in the underlying kernel or container manager). These settings will have no effect in + that case. If compatibility with such systems is desired it is hence recommended to not exclusively rely on + them for IP security. +
+
+ DeviceAllow= diff --git a/man/systemd.special.xml b/man/systemd.special.xml index 73e1e720e9c..5a831149bcf 100644 --- a/man/systemd.special.xml +++ b/man/systemd.special.xml @@ -1009,17 +1009,17 @@ PartOf=graphical-session.target Special Slice Units - There are four .slice units which form - the basis of the hierarchy for assignment of resources for - services, users, and virtual machines or containers. + There are four .slice units which form the basis of the hierarchy for assignment of + resources for services, users, and virtual machines or containers. See + -.slice7 for details about slice + units. -.slice - The root slice is the root of the hierarchy. It - usually does not contain units directly, but may be used to - set defaults for the whole tree. + The root slice is the root of the slice hierarchy. It usually does not contain units directly, but may + be used to set defaults for the whole tree. From f1c50becda3a6ef44e43f503e138f5f4a4884ce3 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 20 Sep 2017 18:27:53 +0200 Subject: [PATCH 29/42] core: make sure to log invocation ID of units also when doing structured logging --- src/core/execute.c | 5 +++++ src/core/job.c | 2 ++ src/core/manager.c | 4 ++-- src/core/service.c | 2 ++ src/core/unit.c | 3 +++ src/core/unit.h | 1 + 6 files changed, 15 insertions(+), 2 deletions(-) diff --git a/src/core/execute.c b/src/core/execute.c index 28c6b2fc389..6b4336430e9 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -3150,6 +3150,7 @@ static int exec_child( "EXECUTABLE=%s", command->path, LOG_UNIT_MESSAGE(unit, "Executing: %s", line), LOG_UNIT_ID(unit), + LOG_UNIT_INVOCATION_ID(unit), NULL); log_close(); } @@ -3223,6 +3224,7 @@ int exec_spawn(Unit *unit, LOG_UNIT_MESSAGE(unit, "About to execute: %s", line), "EXECUTABLE=%s", command->path, LOG_UNIT_ID(unit), + LOG_UNIT_INVOCATION_ID(unit), NULL); pid = fork(); if (pid < 0) @@ -3254,6 +3256,7 @@ int exec_spawn(Unit *unit, log_struct_errno(LOG_ERR, r, 
"MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR, LOG_UNIT_ID(unit), + LOG_UNIT_INVOCATION_ID(unit), LOG_UNIT_MESSAGE(unit, "%s: %m", error_message), "EXECUTABLE=%s", command->path, @@ -3262,6 +3265,7 @@ int exec_spawn(Unit *unit, log_struct_errno(LOG_INFO, r, "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR, LOG_UNIT_ID(unit), + LOG_UNIT_INVOCATION_ID(unit), LOG_UNIT_MESSAGE(unit, "Skipped spawning %s: %m", command->path), "EXECUTABLE=%s", command->path, @@ -3270,6 +3274,7 @@ int exec_spawn(Unit *unit, log_struct_errno(LOG_ERR, r, "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR, LOG_UNIT_ID(unit), + LOG_UNIT_INVOCATION_ID(unit), LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m", exit_status_to_string(exit_status, EXIT_STATUS_SYSTEMD), command->path), diff --git a/src/core/job.c b/src/core/job.c index 8e2039d3214..dd0733030c0 100644 --- a/src/core/job.c +++ b/src/core/job.c @@ -808,6 +808,7 @@ static void job_log_status_message(Unit *u, JobType t, JobResult result) { LOG_MESSAGE("%s", buf), "RESULT=%s", job_result_to_string(result), LOG_UNIT_ID(u), + LOG_UNIT_INVOCATION_ID(u), NULL); return; } @@ -816,6 +817,7 @@ static void job_log_status_message(Unit *u, JobType t, JobResult result) { LOG_MESSAGE("%s", buf), "RESULT=%s", job_result_to_string(result), LOG_UNIT_ID(u), + LOG_UNIT_INVOCATION_ID(u), mid, NULL); } diff --git a/src/core/manager.c b/src/core/manager.c index 032e75d7985..5cf4bc4ee60 100644 --- a/src/core/manager.c +++ b/src/core/manager.c @@ -631,13 +631,13 @@ int manager_new(UnitFileScope scope, unsigned test_run_flags, Manager **_m) { m->unit_log_format_string = "UNIT=%s"; m->invocation_log_field = "INVOCATION_ID="; - m->invocation_log_format_string = "INVOCATION_ID=" SD_ID128_FORMAT_STR; + m->invocation_log_format_string = "INVOCATION_ID=%s"; } else { m->unit_log_field = "USER_UNIT="; m->unit_log_format_string = "USER_UNIT=%s"; m->invocation_log_field = "USER_INVOCATION_ID="; - m->invocation_log_format_string = "USER_INVOCATION_ID=" SD_ID128_FORMAT_STR; + 
m->invocation_log_format_string = "USER_INVOCATION_ID=%s"; } m->idle_pipe[0] = m->idle_pipe[1] = m->idle_pipe[2] = m->idle_pipe[3] = -1; diff --git a/src/core/service.c b/src/core/service.c index b0ce9bfcfa0..21fc4e2abec 100644 --- a/src/core/service.c +++ b/src/core/service.c @@ -1956,6 +1956,7 @@ static void service_enter_restart(Service *s) { log_struct(LOG_INFO, "MESSAGE_ID=" SD_MESSAGE_UNIT_RESTART_SCHEDULED_STR, LOG_UNIT_ID(UNIT(s)), + LOG_UNIT_INVOCATION_ID(UNIT(s)), LOG_UNIT_MESSAGE(UNIT(s), "Scheduled restart job, restart counter is at %u.", s->n_restarts), "N_RESTARTS=%u", s->n_restarts, NULL); @@ -2951,6 +2952,7 @@ static void service_sigchld_event(Unit *u, pid_t pid, int code, int status) { "EXIT_CODE=%s", sigchld_code_to_string(code), "EXIT_STATUS=%i", status, LOG_UNIT_ID(u), + LOG_UNIT_INVOCATION_ID(u), NULL); if (s->result == SERVICE_SUCCESS) diff --git a/src/core/unit.c b/src/core/unit.c index 68295d64455..65a8c77f86f 100644 --- a/src/core/unit.c +++ b/src/core/unit.c @@ -1544,6 +1544,7 @@ static void unit_status_log_starting_stopping_reloading(Unit *u, JobType t) { log_struct(LOG_INFO, LOG_MESSAGE("%s", buf), LOG_UNIT_ID(u), + LOG_UNIT_INVOCATION_ID(u), mid, NULL); } @@ -4242,6 +4243,7 @@ void unit_warn_if_dir_nonempty(Unit *u, const char* where) { log_struct(LOG_NOTICE, "MESSAGE_ID=" SD_MESSAGE_OVERMOUNTING_STR, LOG_UNIT_ID(u), + LOG_UNIT_INVOCATION_ID(u), LOG_UNIT_MESSAGE(u, "Directory %s to mount over is not empty, mounting anyway.", where), "WHERE=%s", where, NULL); @@ -4264,6 +4266,7 @@ int unit_fail_if_symlink(Unit *u, const char* where) { log_struct(LOG_ERR, "MESSAGE_ID=" SD_MESSAGE_OVERMOUNTING_STR, LOG_UNIT_ID(u), + LOG_UNIT_INVOCATION_ID(u), LOG_UNIT_MESSAGE(u, "Mount on symlink %s not allowed.", where), "WHERE=%s", where, NULL); diff --git a/src/core/unit.h b/src/core/unit.h index 2759bd07a28..9aa00b056f1 100644 --- a/src/core/unit.h +++ b/src/core/unit.h @@ -710,3 +710,4 @@ int unit_fork_helper_process(Unit *u, pid_t *ret); #define 
LOG_UNIT_MESSAGE(unit, fmt, ...) "MESSAGE=%s: " fmt, (unit)->id, ##__VA_ARGS__ #define LOG_UNIT_ID(unit) (unit)->manager->unit_log_format_string, (unit)->id +#define LOG_UNIT_INVOCATION_ID(unit) (unit)->manager->invocation_log_format_string, (unit)->invocation_id_string From dba1bd4396b50f8d86f729ac4762a1ef0612d7a8 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 20 Sep 2017 18:28:29 +0200 Subject: [PATCH 30/42] documentation: document nss-systemd's internal environment variables in ENVIRONMENT.md --- ENVIRONMENT.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/ENVIRONMENT.md b/ENVIRONMENT.md index e542d4ec6fd..4ae561a8924 100644 --- a/ENVIRONMENT.md +++ b/ENVIRONMENT.md @@ -64,3 +64,17 @@ installed systemd tests: * `$SYSTEMD_TEST_DATA` — override the location of test data. This is useful if a test executable is moved to an arbitrary location. + +nss-systemd: + +* `$SYSTEMD_NSS_BYPASS_SYNTHETIC=1` — if set, `nss-systemd` won't synthesize + user/group records for the `root` and `nobody` users if they are missing from + `/etc/passwd`. + +* `$SYSTEMD_NSS_DYNAMIC_BYPASS=1` — if set, `nss-systemd` won't return + user/group records for dynamically registered service users (i.e. users + registered through `DynamicUser=1`). + +* `$SYSTEMD_NSS_BYPASS_BUS=1` — if set, `nss-systemd` won't use D-Bus to do + dynamic user lookups. This is primarily useful to make `nss-systemd` work + safely from within `dbus-daemon`. From 646cc98dc81c4d0edbc1b57e7bca0f474b47e270 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 20 Sep 2017 18:29:08 +0200 Subject: [PATCH 31/42] =?UTF-8?q?job:=20change=20result=20field=20for=20lo?= =?UTF-8?q?g=20message=20about=20job=20result=20RESULT=3D=20=E2=86=92=20JO?= =?UTF-8?q?B=5FRESULT=3D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit So, currently, some of the structured log messages we generated based on jobs carry the result in RESULT=, and others in JOB_RESULT=. 
Let's streamline this, and stick to JOB_RESULT= in one place. This is kind of an API break, but given that currently most software has to check both fields anyway, I think we can get away with it. Why unify on JOB_RESULT= rather than RESULT=? Well, we manage different types of result codes in systemd. Most importantly, besides job results there are also service results, and we should be explicit in what we mean here. --- src/core/job.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/core/job.c b/src/core/job.c index dd0733030c0..f04c8a21683 100644 --- a/src/core/job.c +++ b/src/core/job.c @@ -806,7 +806,8 @@ static void job_log_status_message(Unit *u, JobType t, JobResult result) { default: log_struct(job_result_log_level[result], LOG_MESSAGE("%s", buf), - "RESULT=%s", job_result_to_string(result), + "JOB_TYPE=%s", job_type_to_string(t), + "JOB_RESULT=%s", job_result_to_string(result), LOG_UNIT_ID(u), LOG_UNIT_INVOCATION_ID(u), NULL); @@ -815,7 +816,8 @@ static void job_log_status_message(Unit *u, JobType t, JobResult result) { log_struct(job_result_log_level[result], LOG_MESSAGE("%s", buf), - "RESULT=%s", job_result_to_string(result), + "JOB_TYPE=%s", job_type_to_string(t), + "JOB_RESULT=%s", job_result_to_string(result), LOG_UNIT_ID(u), LOG_UNIT_INVOCATION_ID(u), mid, @@ -823,6 +825,7 @@ static void job_log_status_message(Unit *u, JobType t, JobResult result) { } static void job_emit_status_message(Unit *u, JobType t, JobResult result) { + assert(u); /* No message if the job did not actually do anything due to failed condition. */ if (t == JOB_START && result == JOB_DONE && !u->condition_result) @@ -905,7 +908,7 @@ int job_finish_and_invalidate(Job *j, JobResult result, bool recursive, bool alr * the unit itself. We don't treat JOB_CANCELED as failure in * this context. And JOB_FAILURE is already handled by the * unit itself. 
*/ - if (result == JOB_TIMEOUT || result == JOB_DEPENDENCY) { + if (IN_SET(result, JOB_TIMEOUT, JOB_DEPENDENCY)) { log_struct(LOG_NOTICE, "JOB_TYPE=%s", job_type_to_string(t), "JOB_RESULT=%s", job_result_to_string(result), From e6a7ec4b8e33f38f578e12af9ae9ca7ddde80aac Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 21 Sep 2017 13:52:34 +0200 Subject: [PATCH 32/42] io-util: add new IOVEC_INIT/IOVEC_MAKE macros This adds IOVEC_INIT() and IOVEC_MAKE() for initializing iovec structures from a pointer and a size. On top of these IOVEC_INIT_STRING() and IOVEC_MAKE_STRING() are added which take a string and automatically determine the size of the string using strlen(). This patch removes the old IOVEC_SET_STRING() macro, given that IOVEC_MAKE_STRING() is now useful for similar purposes. Note that the old IOVEC_SET_STRING() invocations were two characters shorter than the new ones using IOVEC_MAKE_STRING(), but I think the new syntax is more readable and more generic as it simply resolves to a C99 literal structure initialization. Moreover, we can use very similar syntax now for initializing strings and pointer+size iovec entries. We can also use the new macros to initialize function parameters on-the-fly or array definitions. And given that we shouldn't have so many ways to do the same stuff, let's just settle on the new macros. 
(This also converts some code to use _cleanup_ where dynamically allocated strings were using IOVEC_SET_STRING() before, to modernize things a bit) --- src/basic/io-util.h | 13 +++---- src/basic/journal-importer.c | 5 ++- src/basic/log.c | 46 +++++++++++----------- src/core/dynamic-user.c | 17 +++------ src/core/execute.c | 6 +-- src/core/show-status.c | 14 +++---- src/coredump/coredump.c | 30 +++++++-------- src/journal/journal-send.c | 70 ++++++++++++++-------------------- src/journal/journald-audit.c | 14 +++---- src/journal/journald-console.c | 22 +++++------ src/journal/journald-kmsg.c | 56 +++++++++++---------------- src/journal/journald-native.c | 2 +- src/journal/journald-server.c | 30 +++++++-------- src/journal/journald-stream.c | 15 ++++---- src/journal/journald-syslog.c | 32 ++++++++-------- src/test/test-fileio.c | 6 +-- 16 files changed, 172 insertions(+), 206 deletions(-) diff --git a/src/basic/io-util.h b/src/basic/io-util.h index 4684ed3bfc7..d9b69adde9a 100644 --- a/src/basic/io-util.h +++ b/src/basic/io-util.h @@ -40,14 +40,6 @@ int fd_wait_for_event(int fd, int event, usec_t timeout); ssize_t sparse_write(int fd, const void *p, size_t sz, size_t run_length); -#define IOVEC_SET_STRING(i, s) \ - do { \ - struct iovec *_i = &(i); \ - char *_s = (char *)(s); \ - _i->iov_base = _s; \ - _i->iov_len = strlen(_s); \ - } while (false) - static inline size_t IOVEC_TOTAL_SIZE(const struct iovec *i, unsigned n) { unsigned j; size_t r = 0; @@ -93,3 +85,8 @@ static inline bool FILE_SIZE_VALID_OR_INFINITY(uint64_t l) { return FILE_SIZE_VALID(l); } + +#define IOVEC_INIT(base, len) { .iov_base = (base), .iov_len = (len) } +#define IOVEC_MAKE(base, len) (struct iovec) IOVEC_INIT(base, len) +#define IOVEC_INIT_STRING(string) IOVEC_INIT((char*) string, strlen(string)) +#define IOVEC_MAKE_STRING(string) (struct iovec) IOVEC_INIT_STRING(string) diff --git a/src/basic/journal-importer.c b/src/basic/journal-importer.c index 7d72effdea9..38ac8deaf34 100644 --- 
a/src/basic/journal-importer.c +++ b/src/basic/journal-importer.c @@ -20,8 +20,9 @@ #include #include "alloc-util.h" -#include "journal-importer.h" #include "fd-util.h" +#include "io-util.h" +#include "journal-importer.h" #include "parse-util.h" #include "string-util.h" #include "unaligned.h" @@ -38,7 +39,7 @@ static int iovw_put(struct iovec_wrapper *iovw, void* data, size_t len) { if (!GREEDY_REALLOC(iovw->iovec, iovw->size_bytes, iovw->count + 1)) return log_oom(); - iovw->iovec[iovw->count++] = (struct iovec) {data, len}; + iovw->iovec[iovw->count++] = IOVEC_MAKE(data, len); return 0; } diff --git a/src/basic/log.c b/src/basic/log.c index 421ae52dc5f..a43e8206779 100644 --- a/src/basic/log.c +++ b/src/basic/log.c @@ -351,22 +351,22 @@ static int write_to_console( if (log_target == LOG_TARGET_CONSOLE_PREFIXED) { xsprintf(prefix, "<%i>", level); - IOVEC_SET_STRING(iovec[n++], prefix); + iovec[n++] = IOVEC_MAKE_STRING(prefix); } highlight = LOG_PRI(level) <= LOG_ERR && show_color; if (show_location) { snprintf(location, sizeof(location), "(%s:%i) ", file, line); - IOVEC_SET_STRING(iovec[n++], location); + iovec[n++] = IOVEC_MAKE_STRING(location); } if (highlight) - IOVEC_SET_STRING(iovec[n++], ANSI_HIGHLIGHT_RED); - IOVEC_SET_STRING(iovec[n++], buffer); + iovec[n++] = IOVEC_MAKE_STRING(ANSI_HIGHLIGHT_RED); + iovec[n++] = IOVEC_MAKE_STRING(buffer); if (highlight) - IOVEC_SET_STRING(iovec[n++], ANSI_NORMAL); - IOVEC_SET_STRING(iovec[n++], "\n"); + iovec[n++] = IOVEC_MAKE_STRING(ANSI_NORMAL); + iovec[n++] = IOVEC_MAKE_STRING("\n"); if (writev(console_fd, iovec, n) < 0) { @@ -425,11 +425,11 @@ static int write_to_syslog( xsprintf(header_pid, "["PID_FMT"]: ", getpid_cached()); - IOVEC_SET_STRING(iovec[0], header_priority); - IOVEC_SET_STRING(iovec[1], header_time); - IOVEC_SET_STRING(iovec[2], program_invocation_short_name); - IOVEC_SET_STRING(iovec[3], header_pid); - IOVEC_SET_STRING(iovec[4], buffer); + iovec[0] = IOVEC_MAKE_STRING(header_priority); + iovec[1] = 
IOVEC_MAKE_STRING(header_time); + iovec[2] = IOVEC_MAKE_STRING(program_invocation_short_name); + iovec[3] = IOVEC_MAKE_STRING(header_pid); + iovec[4] = IOVEC_MAKE_STRING(buffer); /* When using syslog via SOCK_STREAM separate the messages by NUL chars */ if (syslog_is_stream) @@ -470,11 +470,11 @@ static int write_to_kmsg( xsprintf(header_priority, "<%i>", level); xsprintf(header_pid, "["PID_FMT"]: ", getpid_cached()); - IOVEC_SET_STRING(iovec[0], header_priority); - IOVEC_SET_STRING(iovec[1], program_invocation_short_name); - IOVEC_SET_STRING(iovec[2], header_pid); - IOVEC_SET_STRING(iovec[3], buffer); - IOVEC_SET_STRING(iovec[4], "\n"); + iovec[0] = IOVEC_MAKE_STRING(header_priority); + iovec[1] = IOVEC_MAKE_STRING(program_invocation_short_name); + iovec[2] = IOVEC_MAKE_STRING(header_pid); + iovec[3] = IOVEC_MAKE_STRING(buffer); + iovec[4] = IOVEC_MAKE_STRING("\n"); if (writev(kmsg_fd, iovec, ELEMENTSOF(iovec)) < 0) return -errno; @@ -547,10 +547,10 @@ static int write_to_journal( log_do_header(header, sizeof(header), level, error, file, line, func, object_field, object, extra_field, extra); - IOVEC_SET_STRING(iovec[0], header); - IOVEC_SET_STRING(iovec[1], "MESSAGE="); - IOVEC_SET_STRING(iovec[2], buffer); - IOVEC_SET_STRING(iovec[3], "\n"); + iovec[0] = IOVEC_MAKE_STRING(header); + iovec[1] = IOVEC_MAKE_STRING("MESSAGE="); + iovec[2] = IOVEC_MAKE_STRING(buffer); + iovec[3] = IOVEC_MAKE_STRING("\n"); mh.msg_iov = iovec; mh.msg_iovlen = ELEMENTSOF(iovec); @@ -872,7 +872,7 @@ int log_format_iovec( * the next format string */ VA_FORMAT_ADVANCE(format, ap); - IOVEC_SET_STRING(iovec[(*n)++], m); + iovec[(*n)++] = IOVEC_MAKE_STRING(m); if (newline_separator) { iovec[*n].iov_base = (char*) &nl; @@ -893,9 +893,9 @@ int log_struct_internal( const char *func, const char *format, ...) 
{ + LogRealm realm = LOG_REALM_REMOVE_LEVEL(level); char buf[LINE_MAX]; bool found = false; - LogRealm realm = LOG_REALM_REMOVE_LEVEL(level); PROTECT_ERRNO; va_list ap; @@ -926,7 +926,7 @@ int log_struct_internal( /* If the journal is available do structured logging */ log_do_header(header, sizeof(header), level, error, file, line, func, NULL, NULL, NULL, NULL); - IOVEC_SET_STRING(iovec[n++], header); + iovec[n++] = IOVEC_MAKE_STRING(header); va_start(ap, format); r = log_format_iovec(iovec, ELEMENTSOF(iovec), &n, true, error, format, ap); diff --git a/src/core/dynamic-user.c b/src/core/dynamic-user.c index e1846e1adbe..9b0dbaf248f 100644 --- a/src/core/dynamic-user.c +++ b/src/core/dynamic-user.c @@ -23,13 +23,14 @@ #include "dynamic-user.h" #include "fd-util.h" +#include "fileio.h" #include "fs-util.h" +#include "io-util.h" #include "parse-util.h" #include "random-util.h" #include "stdio-util.h" #include "string-util.h" #include "user-util.h" -#include "fileio.h" /* Takes a value generated randomly or by hashing and turns it into a UID in the right range */ #define UID_CLAMP_INTO_RANGE(rnd) (((uid_t) (rnd) % (DYNAMIC_UID_MAX - DYNAMIC_UID_MIN + 1)) + DYNAMIC_UID_MIN) @@ -245,8 +246,8 @@ static int pick_uid(const char *name, uid_t *ret_uid) { /* Let's store the user name in the lock file, so that we can use it for looking up the username for a UID */ l = pwritev(lock_fd, (struct iovec[2]) { - { .iov_base = (char*) name, .iov_len = strlen(name) }, - { .iov_base = (char[1]) { '\n' }, .iov_len = 1 } + IOVEC_INIT_STRING(name), + IOVEC_INIT((char[1]) { '\n' }, 1), }, 2, 0); if (l < 0) { (void) unlink(lock_path); @@ -271,10 +272,7 @@ static int pick_uid(const char *name, uid_t *ret_uid) { static int dynamic_user_pop(DynamicUser *d, uid_t *ret_uid, int *ret_lock_fd) { uid_t uid = UID_INVALID; - struct iovec iov = { - .iov_base = &uid, - .iov_len = sizeof(uid), - }; + struct iovec iov = IOVEC_INIT(&uid, sizeof(uid)); union { struct cmsghdr cmsghdr; uint8_t 
buf[CMSG_SPACE(sizeof(int))]; @@ -314,10 +312,7 @@ static int dynamic_user_pop(DynamicUser *d, uid_t *ret_uid, int *ret_lock_fd) { } static int dynamic_user_push(DynamicUser *d, uid_t uid, int lock_fd) { - struct iovec iov = { - .iov_base = &uid, - .iov_len = sizeof(uid), - }; + struct iovec iov = IOVEC_INIT(&uid, sizeof(uid)); union { struct cmsghdr cmsghdr; uint8_t buf[CMSG_SPACE(sizeof(int))]; diff --git a/src/core/execute.c b/src/core/execute.c index 6b4336430e9..0b49be20007 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -2351,9 +2351,9 @@ static int send_user_lookup( if (writev(user_lookup_fd, (struct iovec[]) { - { .iov_base = &uid, .iov_len = sizeof(uid) }, - { .iov_base = &gid, .iov_len = sizeof(gid) }, - { .iov_base = unit->id, .iov_len = strlen(unit->id) }}, 3) < 0) + IOVEC_INIT(&uid, sizeof(uid)), + IOVEC_INIT(&gid, sizeof(gid)), + IOVEC_INIT_STRING(unit->id) }, 3) < 0) return -errno; return 0; diff --git a/src/core/show-status.c b/src/core/show-status.c index 65f9cb888af..8c945738440 100644 --- a/src/core/show-status.c +++ b/src/core/show-status.c @@ -93,21 +93,21 @@ int status_vprintf(const char *status, bool ellipse, bool ephemeral, const char } if (prev_ephemeral) - IOVEC_SET_STRING(iovec[n++], "\r" ANSI_ERASE_TO_END_OF_LINE); + iovec[n++] = IOVEC_MAKE_STRING("\r" ANSI_ERASE_TO_END_OF_LINE); prev_ephemeral = ephemeral; if (status) { if (!isempty(status)) { - IOVEC_SET_STRING(iovec[n++], "["); - IOVEC_SET_STRING(iovec[n++], status); - IOVEC_SET_STRING(iovec[n++], "] "); + iovec[n++] = IOVEC_MAKE_STRING("["); + iovec[n++] = IOVEC_MAKE_STRING(status); + iovec[n++] = IOVEC_MAKE_STRING("] "); } else - IOVEC_SET_STRING(iovec[n++], status_indent); + iovec[n++] = IOVEC_MAKE_STRING(status_indent); } - IOVEC_SET_STRING(iovec[n++], s); + iovec[n++] = IOVEC_MAKE_STRING(s); if (!ephemeral) - IOVEC_SET_STRING(iovec[n++], "\n"); + iovec[n++] = IOVEC_MAKE_STRING("\n"); if (writev(fd, iovec, n) < 0) return -errno; diff --git a/src/coredump/coredump.c 
b/src/coredump/coredump.c index 57d1af454a3..96a4d400f89 100644 --- a/src/coredump/coredump.c +++ b/src/coredump/coredump.c @@ -749,7 +749,7 @@ static int submit_coredump( const char *coredump_filename; coredump_filename = strjoina("COREDUMP_FILENAME=", filename); - IOVEC_SET_STRING(iovec[n_iovec++], coredump_filename); + iovec[n_iovec++] = IOVEC_MAKE_STRING(coredump_filename); } else if (arg_storage == COREDUMP_STORAGE_EXTERNAL) log_info("The core will not be stored: size %"PRIu64" is greater than %"PRIu64" (the configured maximum)", coredump_size, arg_external_size_max); @@ -804,10 +804,10 @@ log: return 0; } - IOVEC_SET_STRING(iovec[n_iovec++], core_message); + iovec[n_iovec++] = IOVEC_MAKE_STRING(core_message); if (truncated) - IOVEC_SET_STRING(iovec[n_iovec++], "COREDUMP_TRUNCATED=1"); + iovec[n_iovec++] = IOVEC_MAKE_STRING("COREDUMP_TRUNCATED=1"); /* Optionally store the entire coredump in the journal */ if (arg_storage == COREDUMP_STORAGE_JOURNAL) { @@ -817,11 +817,9 @@ log: /* Store the coredump itself in the journal */ r = allocate_journal_field(coredump_fd, (size_t) coredump_size, &coredump_data, &sz); - if (r >= 0) { - iovec[n_iovec].iov_base = coredump_data; - iovec[n_iovec].iov_len = sz; - n_iovec++; - } else + if (r >= 0) + iovec[n_iovec++] = IOVEC_MAKE(coredump_data, sz); + else log_warning_errno(r, "Failed to attach the core to the journal entry: %m"); } else log_info("The core will not be stored: size %"PRIu64" is greater than %"PRIu64" (the configured maximum)", @@ -1070,7 +1068,7 @@ static char* set_iovec_field(struct iovec iovec[27], size_t *n_iovec, const char x = strappend(field, value); if (x) - IOVEC_SET_STRING(iovec[(*n_iovec)++], x); + iovec[(*n_iovec)++] = IOVEC_MAKE_STRING(x); return x; } @@ -1162,7 +1160,7 @@ static int gather_pid_metadata( if (sd_pid_get_owner_uid(pid, &owner_uid) >= 0) { r = asprintf(&t, "COREDUMP_OWNER_UID=" UID_FMT, owner_uid); if (r > 0) - IOVEC_SET_STRING(iovec[(*n_iovec)++], t); + iovec[(*n_iovec)++] = 
IOVEC_MAKE_STRING(t); } if (sd_pid_get_slice(pid, &t) >= 0) @@ -1218,7 +1216,7 @@ static int gather_pid_metadata( t = strjoin("COREDUMP_TIMESTAMP=", context[CONTEXT_TIMESTAMP], "000000", NULL); if (t) - IOVEC_SET_STRING(iovec[(*n_iovec)++], t); + iovec[(*n_iovec)++] = IOVEC_MAKE_STRING(t); if (safe_atoi(context[CONTEXT_SIGNAL], &signo) >= 0 && SIGNAL_VALID(signo)) set_iovec_field(iovec, n_iovec, "COREDUMP_SIGNAL_NAME=SIG", signal_to_string(signo)); @@ -1253,10 +1251,10 @@ static int process_kernel(int argc, char* argv[]) { n_iovec = n_to_free; - IOVEC_SET_STRING(iovec[n_iovec++], "MESSAGE_ID=" SD_MESSAGE_COREDUMP_STR); + iovec[n_iovec++] = IOVEC_MAKE_STRING("MESSAGE_ID=" SD_MESSAGE_COREDUMP_STR); assert_cc(2 == LOG_CRIT); - IOVEC_SET_STRING(iovec[n_iovec++], "PRIORITY=2"); + iovec[n_iovec++] = IOVEC_MAKE_STRING("PRIORITY=2"); assert(n_iovec <= ELEMENTSOF(iovec)); @@ -1344,15 +1342,15 @@ static int process_backtrace(int argc, char *argv[]) { r = log_oom(); goto finish; } - IOVEC_SET_STRING(iovec[n_iovec++], message); + iovec[n_iovec++] = IOVEC_MAKE_STRING(message); } else { for (i = 0; i < importer.iovw.count; i++) iovec[n_iovec++] = importer.iovw.iovec[i]; } - IOVEC_SET_STRING(iovec[n_iovec++], "MESSAGE_ID=" SD_MESSAGE_BACKTRACE_STR); + iovec[n_iovec++] = IOVEC_MAKE_STRING("MESSAGE_ID=" SD_MESSAGE_BACKTRACE_STR); assert_cc(2 == LOG_CRIT); - IOVEC_SET_STRING(iovec[n_iovec++], "PRIORITY=2"); + iovec[n_iovec++] = IOVEC_MAKE_STRING("PRIORITY=2"); assert(n_iovec <= n_allocated); diff --git a/src/journal/journal-send.c b/src/journal/journal-send.c index 440fba67ca6..5d8b394752a 100644 --- a/src/journal/journal-send.c +++ b/src/journal/journal-send.c @@ -114,9 +114,8 @@ _public_ int sd_journal_printv(int priority, const char *format, va_list ap) { if (isempty(buffer+8)) return 0; - zero(iov); - IOVEC_SET_STRING(iov[0], buffer); - IOVEC_SET_STRING(iov[1], p); + iov[0] = IOVEC_MAKE_STRING(buffer); + iov[1] = IOVEC_MAKE_STRING(p); return sd_journal_sendv(iov, 2); } @@ -167,7 
+166,7 @@ _printf_(1, 0) static int fill_iovec_sprintf(const char *format, va_list ap, int (void) strstrip(buffer); /* strip trailing whitespace, keep prefixing whitespace */ - IOVEC_SET_STRING(iov[i++], buffer); + iov[i++] = IOVEC_MAKE_STRING(buffer); format = va_arg(ap, char *); } @@ -259,27 +258,19 @@ _public_ int sd_journal_sendv(const struct iovec *iov, int n) { * newline, then the size (64bit LE), followed * by the data and a final newline */ - w[j].iov_base = iov[i].iov_base; - w[j].iov_len = c - (char*) iov[i].iov_base; - j++; - - IOVEC_SET_STRING(w[j++], "\n"); + w[j++] = IOVEC_MAKE(iov[i].iov_base, c - (char*) iov[i].iov_base); + w[j++] = IOVEC_MAKE_STRING("\n"); l[i] = htole64(iov[i].iov_len - (c - (char*) iov[i].iov_base) - 1); - w[j].iov_base = &l[i]; - w[j].iov_len = sizeof(uint64_t); - j++; - - w[j].iov_base = c + 1; - w[j].iov_len = iov[i].iov_len - (c - (char*) iov[i].iov_base) - 1; - j++; + w[j++] = IOVEC_MAKE(&l[i], sizeof(uint64_t)); + w[j++] = IOVEC_MAKE(c + 1, iov[i].iov_len - (c - (char*) iov[i].iov_base) - 1); } else /* Nothing special? Then just add the line and * append a newline */ w[j++] = iov[i]; - IOVEC_SET_STRING(w[j++], "\n"); + w[j++] = IOVEC_MAKE_STRING("\n"); } if (!have_syslog_identifier && @@ -291,9 +282,9 @@ _public_ int sd_journal_sendv(const struct iovec *iov, int n) { * since everything else is much nicer to retrieve * from the outside. 
*/ - IOVEC_SET_STRING(w[j++], "SYSLOG_IDENTIFIER="); - IOVEC_SET_STRING(w[j++], program_invocation_short_name); - IOVEC_SET_STRING(w[j++], "\n"); + w[j++] = IOVEC_MAKE_STRING("SYSLOG_IDENTIFIER="); + w[j++] = IOVEC_MAKE_STRING(program_invocation_short_name); + w[j++] = IOVEC_MAKE_STRING("\n"); } fd = journal_fd(); @@ -380,9 +371,9 @@ static int fill_iovec_perror_and_send(const char *message, int skip, struct iove xsprintf(error, "ERRNO=%i", _saved_errno_); assert_cc(3 == LOG_ERR); - IOVEC_SET_STRING(iov[skip+0], "PRIORITY=3"); - IOVEC_SET_STRING(iov[skip+1], buffer); - IOVEC_SET_STRING(iov[skip+2], error); + iov[skip+0] = IOVEC_MAKE_STRING("PRIORITY=3"); + iov[skip+1] = IOVEC_MAKE_STRING(buffer); + iov[skip+2] = IOVEC_MAKE_STRING(error); return sd_journal_sendv(iov, skip + 3); } @@ -492,20 +483,19 @@ _public_ int sd_journal_printv_with_location(int priority, const char *file, con * CODE_FUNC=, hence let's do it manually here. */ ALLOCA_CODE_FUNC(f, func); - zero(iov); - IOVEC_SET_STRING(iov[0], buffer); - IOVEC_SET_STRING(iov[1], p); - IOVEC_SET_STRING(iov[2], file); - IOVEC_SET_STRING(iov[3], line); - IOVEC_SET_STRING(iov[4], f); + iov[0] = IOVEC_MAKE_STRING(buffer); + iov[1] = IOVEC_MAKE_STRING(p); + iov[2] = IOVEC_MAKE_STRING(file); + iov[3] = IOVEC_MAKE_STRING(line); + iov[4] = IOVEC_MAKE_STRING(f); return sd_journal_sendv(iov, ELEMENTSOF(iov)); } _public_ int sd_journal_send_with_location(const char *file, const char *line, const char *func, const char *format, ...) 
{ + _cleanup_free_ struct iovec *iov = NULL; int r, i, j; va_list ap; - struct iovec *iov = NULL; char *f; va_start(ap, format); @@ -519,9 +509,9 @@ _public_ int sd_journal_send_with_location(const char *file, const char *line, c ALLOCA_CODE_FUNC(f, func); - IOVEC_SET_STRING(iov[0], file); - IOVEC_SET_STRING(iov[1], line); - IOVEC_SET_STRING(iov[2], f); + iov[0] = IOVEC_MAKE_STRING(file); + iov[1] = IOVEC_MAKE_STRING(line); + iov[2] = IOVEC_MAKE_STRING(f); r = sd_journal_sendv(iov, i); @@ -529,8 +519,6 @@ finish: for (j = 3; j < i; j++) free(iov[j].iov_base); - free(iov); - return r; } @@ -550,9 +538,9 @@ _public_ int sd_journal_sendv_with_location( ALLOCA_CODE_FUNC(f, func); - IOVEC_SET_STRING(niov[n++], file); - IOVEC_SET_STRING(niov[n++], line); - IOVEC_SET_STRING(niov[n++], f); + niov[n++] = IOVEC_MAKE_STRING(file); + niov[n++] = IOVEC_MAKE_STRING(line); + niov[n++] = IOVEC_MAKE_STRING(f); return sd_journal_sendv(niov, n); } @@ -567,9 +555,9 @@ _public_ int sd_journal_perror_with_location( ALLOCA_CODE_FUNC(f, func); - IOVEC_SET_STRING(iov[0], file); - IOVEC_SET_STRING(iov[1], line); - IOVEC_SET_STRING(iov[2], f); + iov[0] = IOVEC_MAKE_STRING(file); + iov[1] = IOVEC_MAKE_STRING(line); + iov[2] = IOVEC_MAKE_STRING(f); return fill_iovec_perror_and_send(message, 3, iov); } diff --git a/src/journal/journald-audit.c b/src/journal/journald-audit.c index 38ac3befddc..869c996aefe 100644 --- a/src/journal/journald-audit.c +++ b/src/journal/journald-audit.c @@ -383,26 +383,26 @@ static void process_audit_string(Server *s, int type, const char *data, size_t s return; } - IOVEC_SET_STRING(iov[n_iov++], "_TRANSPORT=audit"); + iov[n_iov++] = IOVEC_MAKE_STRING("_TRANSPORT=audit"); sprintf(source_time_field, "_SOURCE_REALTIME_TIMESTAMP=%" PRIu64, (usec_t) seconds * USEC_PER_SEC + (usec_t) msec * USEC_PER_MSEC); - IOVEC_SET_STRING(iov[n_iov++], source_time_field); + iov[n_iov++] = IOVEC_MAKE_STRING(source_time_field); sprintf(type_field, "_AUDIT_TYPE=%i", type); - 
IOVEC_SET_STRING(iov[n_iov++], type_field); + iov[n_iov++] = IOVEC_MAKE_STRING(type_field); sprintf(id_field, "_AUDIT_ID=%" PRIu64, id); - IOVEC_SET_STRING(iov[n_iov++], id_field); + iov[n_iov++] = IOVEC_MAKE_STRING(id_field); assert_cc(4 == LOG_FAC(LOG_AUTH)); - IOVEC_SET_STRING(iov[n_iov++], "SYSLOG_FACILITY=4"); - IOVEC_SET_STRING(iov[n_iov++], "SYSLOG_IDENTIFIER=audit"); + iov[n_iov++] = IOVEC_MAKE_STRING("SYSLOG_FACILITY=4"); + iov[n_iov++] = IOVEC_MAKE_STRING("SYSLOG_IDENTIFIER=audit"); type_name = audit_type_name_alloca(type); m = strjoina("MESSAGE=", type_name, " ", p); - IOVEC_SET_STRING(iov[n_iov++], m); + iov[n_iov++] = IOVEC_MAKE_STRING(m); z = n_iov; diff --git a/src/journal/journald-console.c b/src/journal/journald-console.c index 5fbcdb43c2b..039f1a68cef 100644 --- a/src/journal/journald-console.c +++ b/src/journal/journald-console.c @@ -59,9 +59,10 @@ void server_forward_console( struct timespec ts; char tbuf[sizeof("[] ")-1 + DECIMAL_STR_MAX(ts.tv_sec) + DECIMAL_STR_MAX(ts.tv_nsec)-3 + 1]; char header_pid[sizeof("[]: ")-1 + DECIMAL_STR_MAX(pid_t)]; - int n = 0, fd; _cleanup_free_ char *ident_buf = NULL; + _cleanup_close_ int fd = -1; const char *tty; + int n = 0; assert(s); assert(message); @@ -75,7 +76,8 @@ void server_forward_console( xsprintf(tbuf, "[%5"PRI_TIME".%06"PRI_NSEC"] ", ts.tv_sec, (nsec_t)ts.tv_nsec / 1000); - IOVEC_SET_STRING(iovec[n++], tbuf); + + iovec[n++] = IOVEC_MAKE_STRING(tbuf); } /* Second: identifier and PID */ @@ -88,19 +90,19 @@ void server_forward_console( xsprintf(header_pid, "["PID_FMT"]: ", ucred->pid); if (identifier) - IOVEC_SET_STRING(iovec[n++], identifier); + iovec[n++] = IOVEC_MAKE_STRING(identifier); - IOVEC_SET_STRING(iovec[n++], header_pid); + iovec[n++] = IOVEC_MAKE_STRING(header_pid); } else if (identifier) { - IOVEC_SET_STRING(iovec[n++], identifier); - IOVEC_SET_STRING(iovec[n++], ": "); + iovec[n++] = IOVEC_MAKE_STRING(identifier); + iovec[n++] = IOVEC_MAKE_STRING(": "); } /* Fourth: message */ - 
IOVEC_SET_STRING(iovec[n++], message); - IOVEC_SET_STRING(iovec[n++], "\n"); + iovec[n++] = IOVEC_MAKE_STRING(message); + iovec[n++] = IOVEC_MAKE_STRING("\n"); - tty = s->tty_path ? s->tty_path : "/dev/console"; + tty = s->tty_path ?: "/dev/console"; /* Before you ask: yes, on purpose we open/close the console for each log line we write individually. This is a * good strategy to avoid journald getting killed by the kernel's SAK concept (it doesn't fix this entirely, @@ -115,6 +117,4 @@ void server_forward_console( if (writev(fd, iovec, n) < 0) log_debug_errno(errno, "Failed to write to %s for logging: %m", tty); - - safe_close(fd); } diff --git a/src/journal/journald-kmsg.c b/src/journal/journald-kmsg.c index 2be82be5f64..1bad7cb2eeb 100644 --- a/src/journal/journald-kmsg.c +++ b/src/journal/journald-kmsg.c @@ -26,6 +26,7 @@ #include "libudev.h" #include "sd-messages.h" +#include "alloc-util.h" #include "escape.h" #include "fd-util.h" #include "format-util.h" @@ -45,11 +46,11 @@ void server_forward_kmsg( const char *message, const struct ucred *ucred) { + _cleanup_free_ char *ident_buf = NULL; struct iovec iovec[5]; char header_priority[DECIMAL_STR_MAX(priority) + 3], header_pid[sizeof("[]: ")-1 + DECIMAL_STR_MAX(pid_t) + 1]; int n = 0; - char *ident_buf = NULL; assert(s); assert(priority >= 0); @@ -68,7 +69,7 @@ void server_forward_kmsg( /* First: priority field */ xsprintf(header_priority, "<%i>", priority); - IOVEC_SET_STRING(iovec[n++], header_priority); + iovec[n++] = IOVEC_MAKE_STRING(header_priority); /* Second: identifier and PID */ if (ucred) { @@ -80,22 +81,20 @@ void server_forward_kmsg( xsprintf(header_pid, "["PID_FMT"]: ", ucred->pid); if (identifier) - IOVEC_SET_STRING(iovec[n++], identifier); + iovec[n++] = IOVEC_MAKE_STRING(identifier); - IOVEC_SET_STRING(iovec[n++], header_pid); + iovec[n++] = IOVEC_MAKE_STRING(header_pid); } else if (identifier) { - IOVEC_SET_STRING(iovec[n++], identifier); - IOVEC_SET_STRING(iovec[n++], ": "); + iovec[n++] = 
IOVEC_MAKE_STRING(identifier); + iovec[n++] = IOVEC_MAKE_STRING(": "); } /* Fourth: message */ - IOVEC_SET_STRING(iovec[n++], message); - IOVEC_SET_STRING(iovec[n++], "\n"); + iovec[n++] = IOVEC_MAKE_STRING(message); + iovec[n++] = IOVEC_MAKE_STRING("\n"); if (writev(s->dev_kmsg_fd, iovec, n) < 0) log_debug_errno(errno, "Failed to write to /dev/kmsg for logging: %m"); - - free(ident_buf); } static bool is_us(const char *pid) { @@ -111,11 +110,11 @@ static bool is_us(const char *pid) { static void dev_kmsg_record(Server *s, const char *p, size_t l) { struct iovec iovec[N_IOVEC_META_FIELDS + 7 + N_IOVEC_KERNEL_FIELDS + 2 + N_IOVEC_UDEV_FIELDS]; - char *message = NULL, *syslog_priority = NULL, *syslog_pid = NULL, *syslog_facility = NULL, *syslog_identifier = NULL, *source_time = NULL; + _cleanup_free_ char *message = NULL, *syslog_priority = NULL, *syslog_pid = NULL, *syslog_facility = NULL, *syslog_identifier = NULL, *source_time = NULL, *identifier = NULL, *pid = NULL; int priority, r; unsigned n = 0, z = 0, j; unsigned long long usec; - char *identifier = NULL, *pid = NULL, *e, *f, *k; + char *e, *f, *k; uint64_t serial; size_t pl; char *kernel_device = NULL; @@ -216,7 +215,7 @@ static void dev_kmsg_record(Server *s, const char *p, size_t l) { if (startswith(m, "_KERNEL_DEVICE=")) kernel_device = m + 15; - IOVEC_SET_STRING(iovec[n++], m); + iovec[n++] = IOVEC_MAKE_STRING(m); z++; l -= (e - k) + 1; @@ -236,7 +235,7 @@ static void dev_kmsg_record(Server *s, const char *p, size_t l) { if (g) { b = strappend("_UDEV_DEVNODE=", g); if (b) { - IOVEC_SET_STRING(iovec[n++], b); + iovec[n++] = IOVEC_MAKE_STRING(b); z++; } } @@ -245,7 +244,7 @@ static void dev_kmsg_record(Server *s, const char *p, size_t l) { if (g) { b = strappend("_UDEV_SYSNAME=", g); if (b) { - IOVEC_SET_STRING(iovec[n++], b); + iovec[n++] = IOVEC_MAKE_STRING(b); z++; } } @@ -261,7 +260,7 @@ static void dev_kmsg_record(Server *s, const char *p, size_t l) { if (g) { b = strappend("_UDEV_DEVLINK=", g); if 
(b) { - IOVEC_SET_STRING(iovec[n++], b); + iovec[n++] = IOVEC_MAKE_STRING(b); z++; } } @@ -274,18 +273,18 @@ static void dev_kmsg_record(Server *s, const char *p, size_t l) { } if (asprintf(&source_time, "_SOURCE_MONOTONIC_TIMESTAMP=%llu", usec) >= 0) - IOVEC_SET_STRING(iovec[n++], source_time); + iovec[n++] = IOVEC_MAKE_STRING(source_time); - IOVEC_SET_STRING(iovec[n++], "_TRANSPORT=kernel"); + iovec[n++] = IOVEC_MAKE_STRING("_TRANSPORT=kernel"); if (asprintf(&syslog_priority, "PRIORITY=%i", priority & LOG_PRIMASK) >= 0) - IOVEC_SET_STRING(iovec[n++], syslog_priority); + iovec[n++] = IOVEC_MAKE_STRING(syslog_priority); if (asprintf(&syslog_facility, "SYSLOG_FACILITY=%i", LOG_FAC(priority)) >= 0) - IOVEC_SET_STRING(iovec[n++], syslog_facility); + iovec[n++] = IOVEC_MAKE_STRING(syslog_facility); if ((priority & LOG_FACMASK) == LOG_KERN) - IOVEC_SET_STRING(iovec[n++], "SYSLOG_IDENTIFIER=kernel"); + iovec[n++] = IOVEC_MAKE_STRING("SYSLOG_IDENTIFIER=kernel"); else { pl -= syslog_parse_identifier((const char**) &p, &identifier, &pid); @@ -297,33 +296,24 @@ static void dev_kmsg_record(Server *s, const char *p, size_t l) { if (identifier) { syslog_identifier = strappend("SYSLOG_IDENTIFIER=", identifier); if (syslog_identifier) - IOVEC_SET_STRING(iovec[n++], syslog_identifier); + iovec[n++] = IOVEC_MAKE_STRING(syslog_identifier); } if (pid) { syslog_pid = strappend("SYSLOG_PID=", pid); if (syslog_pid) - IOVEC_SET_STRING(iovec[n++], syslog_pid); + iovec[n++] = IOVEC_MAKE_STRING(syslog_pid); } } if (cunescape_length_with_prefix(p, pl, "MESSAGE=", UNESCAPE_RELAX, &message) >= 0) - IOVEC_SET_STRING(iovec[n++], message); + iovec[n++] = IOVEC_MAKE_STRING(message); server_dispatch_message(s, iovec, n, ELEMENTSOF(iovec), NULL, NULL, priority, 0); finish: for (j = 0; j < z; j++) free(iovec[j].iov_base); - - free(message); - free(syslog_priority); - free(syslog_identifier); - free(syslog_pid); - free(syslog_facility); - free(source_time); - free(identifier); - free(pid); } static 
int server_read_dev_kmsg(Server *s) { diff --git a/src/journal/journald-native.c b/src/journal/journald-native.c index 23afe59bd53..554f91460d4 100644 --- a/src/journal/journald-native.c +++ b/src/journal/journald-native.c @@ -282,7 +282,7 @@ static int server_process_entry( } tn = n++; - IOVEC_SET_STRING(iovec[tn], "_TRANSPORT=journal"); + iovec[tn] = IOVEC_MAKE_STRING("_TRANSPORT=journal"); entry_size += strlen("_TRANSPORT=journal"); if (entry_size + n + 1 > ENTRY_SIZE_MAX) { /* data + separators + trailer */ diff --git a/src/journal/journald-server.c b/src/journal/journald-server.c index 27c2571cfc9..2d51be7c89e 100644 --- a/src/journal/journald-server.c +++ b/src/journal/journald-server.c @@ -724,14 +724,14 @@ static void write_to_journal(Server *s, uid_t uid, struct iovec *iovec, unsigned char *k; \ k = newa(char, strlen(field "=") + DECIMAL_STR_MAX(type) + 1); \ sprintf(k, field "=" format, value); \ - IOVEC_SET_STRING(iovec[n++], k); \ + iovec[n++] = IOVEC_MAKE_STRING(k); \ } #define IOVEC_ADD_STRING_FIELD(iovec, n, value, field) \ if (!isempty(value)) { \ char *k; \ k = strjoina(field "=", value); \ - IOVEC_SET_STRING(iovec[n++], k); \ + iovec[n++] = IOVEC_MAKE_STRING(k); \ } #define IOVEC_ADD_ID128_FIELD(iovec, n, value, field) \ @@ -739,7 +739,7 @@ static void write_to_journal(Server *s, uid_t uid, struct iovec *iovec, unsigned char *k; \ k = newa(char, strlen(field "=") + SD_ID128_STRING_MAX); \ sd_id128_to_string(value, stpcpy(k, field "=")); \ - IOVEC_SET_STRING(iovec[n++], k); \ + iovec[n++] = IOVEC_MAKE_STRING(k); \ } #define IOVEC_ADD_SIZED_FIELD(iovec, n, value, value_size, field) \ @@ -747,7 +747,7 @@ static void write_to_journal(Server *s, uid_t uid, struct iovec *iovec, unsigned char *k; \ k = newa(char, strlen(field "=") + value_size + 1); \ *((char*) mempcpy(stpcpy(k, field "="), value, value_size)) = 0; \ - IOVEC_SET_STRING(iovec[n++], k); \ + iovec[n++] = IOVEC_MAKE_STRING(k); \ } \ static void dispatch_message_real( @@ -826,20 +826,20 @@ 
static void dispatch_message_real( if (tv) { sprintf(source_time, "_SOURCE_REALTIME_TIMESTAMP=" USEC_FMT, timeval_load(tv)); - IOVEC_SET_STRING(iovec[n++], source_time); + iovec[n++] = IOVEC_MAKE_STRING(source_time); } /* Note that strictly speaking storing the boot id here is * redundant since the entry includes this in-line * anyway. However, we need this indexed, too. */ if (!isempty(s->boot_id_field)) - IOVEC_SET_STRING(iovec[n++], s->boot_id_field); + iovec[n++] = IOVEC_MAKE_STRING(s->boot_id_field); if (!isempty(s->machine_id_field)) - IOVEC_SET_STRING(iovec[n++], s->machine_id_field); + iovec[n++] = IOVEC_MAKE_STRING(s->machine_id_field); if (!isempty(s->hostname_field)) - IOVEC_SET_STRING(iovec[n++], s->hostname_field); + iovec[n++] = IOVEC_MAKE_STRING(s->hostname_field); assert(n <= m); @@ -870,15 +870,15 @@ void server_driver_message(Server *s, const char *message_id, const char *format assert(format); assert_cc(3 == LOG_FAC(LOG_DAEMON)); - IOVEC_SET_STRING(iovec[n++], "SYSLOG_FACILITY=3"); - IOVEC_SET_STRING(iovec[n++], "SYSLOG_IDENTIFIER=systemd-journald"); + iovec[n++] = IOVEC_MAKE_STRING("SYSLOG_FACILITY=3"); + iovec[n++] = IOVEC_MAKE_STRING("SYSLOG_IDENTIFIER=systemd-journald"); - IOVEC_SET_STRING(iovec[n++], "_TRANSPORT=driver"); + iovec[n++] = IOVEC_MAKE_STRING("_TRANSPORT=driver"); assert_cc(6 == LOG_INFO); - IOVEC_SET_STRING(iovec[n++], "PRIORITY=6"); + iovec[n++] = IOVEC_MAKE_STRING("PRIORITY=6"); if (message_id) - IOVEC_SET_STRING(iovec[n++], message_id); + iovec[n++] = IOVEC_MAKE_STRING(message_id); m = n; va_start(ap, format); @@ -899,8 +899,8 @@ void server_driver_message(Server *s, const char *message_id, const char *format xsprintf(buf, "MESSAGE=Entry printing failed: %s", strerror(-r)); n = 3; - IOVEC_SET_STRING(iovec[n++], "PRIORITY=4"); - IOVEC_SET_STRING(iovec[n++], buf); + iovec[n++] = IOVEC_MAKE_STRING("PRIORITY=4"); + iovec[n++] = IOVEC_MAKE_STRING(buf); dispatch_message_real(s, iovec, n, ELEMENTSOF(iovec), s->my_context, NULL, 
LOG_INFO, 0); } } diff --git a/src/journal/journald-stream.c b/src/journal/journald-stream.c index a44c540f67e..d0b95ea02cc 100644 --- a/src/journal/journald-stream.c +++ b/src/journal/journald-stream.c @@ -282,22 +282,21 @@ static int stdout_stream_log(StdoutStream *s, const char *p, LineBreak line_brea if (s->server->forward_to_wall) server_forward_wall(s->server, priority, s->identifier, p, &s->ucred); - IOVEC_SET_STRING(iovec[n++], "_TRANSPORT=stdout"); - - IOVEC_SET_STRING(iovec[n++], s->id_field); + iovec[n++] = IOVEC_MAKE_STRING("_TRANSPORT=stdout"); + iovec[n++] = IOVEC_MAKE_STRING(s->id_field); syslog_priority[strlen("PRIORITY=")] = '0' + LOG_PRI(priority); - IOVEC_SET_STRING(iovec[n++], syslog_priority); + iovec[n++] = IOVEC_MAKE_STRING(syslog_priority); if (priority & LOG_FACMASK) { xsprintf(syslog_facility, "SYSLOG_FACILITY=%i", LOG_FAC(priority)); - IOVEC_SET_STRING(iovec[n++], syslog_facility); + iovec[n++] = IOVEC_MAKE_STRING(syslog_facility); } if (s->identifier) { syslog_identifier = strappend("SYSLOG_IDENTIFIER=", s->identifier); if (syslog_identifier) - IOVEC_SET_STRING(iovec[n++], syslog_identifier); + iovec[n++] = IOVEC_MAKE_STRING(syslog_identifier); } if (line_break != LINE_BREAK_NEWLINE) { @@ -309,12 +308,12 @@ static int stdout_stream_log(StdoutStream *s, const char *p, LineBreak line_brea c = line_break == LINE_BREAK_NUL ? "_LINE_BREAK=nul" : line_break == LINE_BREAK_LINE_MAX ? 
"_LINE_BREAK=line-max" : "_LINE_BREAK=eof"; - IOVEC_SET_STRING(iovec[n++], c); + iovec[n++] = IOVEC_MAKE_STRING(c); } message = strappend("MESSAGE=", p); if (message) - IOVEC_SET_STRING(iovec[n++], message); + iovec[n++] = IOVEC_MAKE_STRING(message); if (s->context) (void) client_context_maybe_refresh(s->server, s->context, NULL, NULL, 0, NULL, USEC_INFINITY); diff --git a/src/journal/journald-syslog.c b/src/journal/journald-syslog.c index a03c36df347..fa597e47a23 100644 --- a/src/journal/journald-syslog.c +++ b/src/journal/journald-syslog.c @@ -124,7 +124,7 @@ static void forward_syslog_raw(Server *s, int priority, const char *buffer, cons if (LOG_PRI(priority) > s->max_level_syslog) return; - IOVEC_SET_STRING(iovec, buffer); + iovec = IOVEC_MAKE_STRING(buffer); forward_syslog_iovec(s, &iovec, 1, ucred, tv); } @@ -135,7 +135,7 @@ void server_forward_syslog(Server *s, int priority, const char *identifier, cons int n = 0; time_t t; struct tm *tm; - char *ident_buf = NULL; + _cleanup_free_ char *ident_buf = NULL; assert(s); assert(priority >= 0); @@ -147,7 +147,7 @@ void server_forward_syslog(Server *s, int priority, const char *identifier, cons /* First: priority field */ xsprintf(header_priority, "<%i>", priority); - IOVEC_SET_STRING(iovec[n++], header_priority); + iovec[n++] = IOVEC_MAKE_STRING(header_priority); /* Second: timestamp */ t = tv ? 
tv->tv_sec : ((time_t) (now(CLOCK_REALTIME) / USEC_PER_SEC)); @@ -156,7 +156,7 @@ void server_forward_syslog(Server *s, int priority, const char *identifier, cons return; if (strftime(header_time, sizeof(header_time), "%h %e %T ", tm) <= 0) return; - IOVEC_SET_STRING(iovec[n++], header_time); + iovec[n++] = IOVEC_MAKE_STRING(header_time); /* Third: identifier and PID */ if (ucred) { @@ -168,20 +168,18 @@ void server_forward_syslog(Server *s, int priority, const char *identifier, cons xsprintf(header_pid, "["PID_FMT"]: ", ucred->pid); if (identifier) - IOVEC_SET_STRING(iovec[n++], identifier); + iovec[n++] = IOVEC_MAKE_STRING(identifier); - IOVEC_SET_STRING(iovec[n++], header_pid); + iovec[n++] = IOVEC_MAKE_STRING(header_pid); } else if (identifier) { - IOVEC_SET_STRING(iovec[n++], identifier); - IOVEC_SET_STRING(iovec[n++], ": "); + iovec[n++] = IOVEC_MAKE_STRING(identifier); + iovec[n++] = IOVEC_MAKE_STRING(": "); } /* Fourth: message */ - IOVEC_SET_STRING(iovec[n++], message); + iovec[n++] = IOVEC_MAKE_STRING(message); forward_syslog_iovec(s, iovec, n, ucred, tv); - - free(ident_buf); } int syslog_fixup_facility(int priority) { @@ -353,29 +351,29 @@ void server_process_syslog_message( if (s->forward_to_wall) server_forward_wall(s, priority, identifier, buf, ucred); - IOVEC_SET_STRING(iovec[n++], "_TRANSPORT=syslog"); + iovec[n++] = IOVEC_MAKE_STRING("_TRANSPORT=syslog"); xsprintf(syslog_priority, "PRIORITY=%i", priority & LOG_PRIMASK); - IOVEC_SET_STRING(iovec[n++], syslog_priority); + iovec[n++] = IOVEC_MAKE_STRING(syslog_priority); if (priority & LOG_FACMASK) { xsprintf(syslog_facility, "SYSLOG_FACILITY=%i", LOG_FAC(priority)); - IOVEC_SET_STRING(iovec[n++], syslog_facility); + iovec[n++] = IOVEC_MAKE_STRING(syslog_facility); } if (identifier) { syslog_identifier = strjoina("SYSLOG_IDENTIFIER=", identifier); - IOVEC_SET_STRING(iovec[n++], syslog_identifier); + iovec[n++] = IOVEC_MAKE_STRING(syslog_identifier); } if (pid) { syslog_pid = strjoina("SYSLOG_PID=", 
pid); - IOVEC_SET_STRING(iovec[n++], syslog_pid); + iovec[n++] = IOVEC_MAKE_STRING(syslog_pid); } message = strjoina("MESSAGE=", buf); if (message) - IOVEC_SET_STRING(iovec[n++], message); + iovec[n++] = IOVEC_MAKE_STRING(message); if (ucred && pid_is_valid(ucred->pid)) { r = client_context_get(s, ucred->pid, ucred, label, label_len, NULL, &context); diff --git a/src/test/test-fileio.c b/src/test/test-fileio.c index b1d688c89e3..5b09d596359 100644 --- a/src/test/test-fileio.c +++ b/src/test/test-fileio.c @@ -609,9 +609,9 @@ static void test_writing_tmpfile(void) { int fd, r; struct iovec iov[3]; - IOVEC_SET_STRING(iov[0], "abc\n"); - IOVEC_SET_STRING(iov[1], ALPHANUMERICAL "\n"); - IOVEC_SET_STRING(iov[2], ""); + iov[0] = IOVEC_MAKE_STRING("abc\n"); + iov[1] = IOVEC_MAKE_STRING(ALPHANUMERICAL "\n"); + iov[2] = IOVEC_MAKE_STRING(""); fd = mkostemp_safe(name); printf("tmpfile: %s", name); From 8e5430c4bd9d39a5e405794f9c883f48de5205d9 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 21 Sep 2017 14:02:31 +0200 Subject: [PATCH 33/42] nspawn: set up a new session keyring for the container process keyring material should not leak into the container. So far we relied on seccomp to deny access to the keyring, but given that we now made the seccomp configurable, and access to keyctl() and friends may optionally be permitted to containers now let's make sure we disconnect the callers keyring from the keyring of PID 1 in the container. --- src/nspawn/nspawn.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index cf804ed1b35..5ba09a994a9 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -1616,6 +1616,27 @@ static int setup_dev_console(const char *dest, const char *console) { return mount_verbose(LOG_ERR, console, to, NULL, MS_BIND, NULL); } +static int setup_keyring(void) { + key_serial_t keyring; + + /* Allocate a new session keyring for the container. 
This makes sure the keyring of the session systemd-nspawn + * was invoked from doesn't leak into the container. Note that by default we block keyctl() and request_key() + * anyway via seccomp so doing this operation isn't strictly necessary, but in case people explicitly whitelist + * these system calls let's make sure we don't leak anything into the container. */ + + keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0); + if (keyring == -1) { + if (errno == ENOSYS) + log_debug_errno(errno, "Kernel keyring not supported, ignoring."); + else if (IN_SET(errno, EACCES, EPERM)) + log_debug_errno(errno, "Kernel keyring access prohibited, ignoring."); + else + return log_error_errno(errno, "Setting up kernel keyring failed: %m"); + } + + return 0; +} + static int setup_kmsg(const char *dest, int kmsg_socket) { const char *from, *to; _cleanup_umask_ mode_t u; @@ -2642,6 +2663,10 @@ static int outer_child( if (r < 0) return r; + r = setup_keyring(); + if (r < 0) + return r; + r = setup_seccomp(arg_caps_retain, arg_syscall_whitelist, arg_syscall_blacklist); if (r < 0) return r; From 915b1d0174af808dd4cbea1357490febfba7cdc0 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 21 Sep 2017 14:05:35 +0200 Subject: [PATCH 34/42] core: whenever a unit terminates, log its consumed resources to the journal This adds a new recognizable log message for each unit invocation that contains structured information about consumed resources of the unit as a whole after it terminated. This is particularly useful for apps that want to figure out what the resource consumption of a unit given a specific invocation ID was. The log message is only generated for units that have at least one XyzAccounting= property turned on, and currently only covers IP traffic and CPU time metrics.
--- src/basic/log.c | 67 +++++++++++++++ src/basic/log.h | 14 ++++ src/core/unit.c | 168 ++++++++++++++++++++++++++++++++++---- src/systemd/sd-messages.h | 3 + 4 files changed, 235 insertions(+), 17 deletions(-) diff --git a/src/basic/log.c b/src/basic/log.c index a43e8206779..591c6d2a13e 100644 --- a/src/basic/log.c +++ b/src/basic/log.c @@ -975,6 +975,73 @@ int log_struct_internal( return log_dispatch_internal(level, error, file, line, func, NULL, NULL, NULL, NULL, buf + 8); } +int log_struct_iovec_internal( + int level, + int error, + const char *file, + int line, + const char *func, + const struct iovec input_iovec[], + size_t n_input_iovec) { + + LogRealm realm = LOG_REALM_REMOVE_LEVEL(level); + PROTECT_ERRNO; + size_t i; + char *m; + + if (error < 0) + error = -error; + + if (_likely_(LOG_PRI(level) > log_max_level[realm])) + return -error; + + if (log_target == LOG_TARGET_NULL) + return -error; + + if ((level & LOG_FACMASK) == 0) + level = log_facility | LOG_PRI(level); + + if (IN_SET(log_target, LOG_TARGET_AUTO, + LOG_TARGET_JOURNAL_OR_KMSG, + LOG_TARGET_JOURNAL) && + journal_fd >= 0) { + + struct iovec iovec[1 + n_input_iovec*2]; + char header[LINE_MAX]; + struct msghdr mh = { + .msg_iov = iovec, + .msg_iovlen = 1 + n_input_iovec*2, + }; + + log_do_header(header, sizeof(header), level, error, file, line, func, NULL, NULL, NULL, NULL); + iovec[0] = IOVEC_MAKE_STRING(header); + + for (i = 0; i < n_input_iovec; i++) { + iovec[1+i*2] = input_iovec[i]; + iovec[1+i*2+1] = IOVEC_MAKE_STRING("\n"); + } + + if (sendmsg(journal_fd, &mh, MSG_NOSIGNAL) >= 0) + return -errno; + } + + for (i = 0; i < n_input_iovec; i++) { + if (input_iovec[i].iov_len < strlen("MESSAGE=")) + continue; + + if (memcmp(input_iovec[i].iov_base, "MESSAGE=", strlen("MESSAGE=")) == 0) + break; + } + + if (_unlikely_(i >= n_input_iovec)) /* Couldn't find MESSAGE=? 
*/ + return -error; + + m = strndupa(input_iovec[i].iov_base + strlen("MESSAGE="), + input_iovec[i].iov_len - strlen("MESSAGE=")); + + return log_dispatch_internal(level, error, file, line, func, NULL, NULL, NULL, NULL, m); +} + int log_set_target_from_string(const char *e) { LogTarget t; diff --git a/src/basic/log.h b/src/basic/log.h index 186747ff8e2..e3fd3203d04 100644 --- a/src/basic/log.h +++ b/src/basic/log.h @@ -187,6 +187,15 @@ int log_format_iovec( const char *format, va_list ap) _printf_(6, 0); +int log_struct_iovec_internal( + int level, + int error, + const char *file, + int line, + const char *func, + const struct iovec input_iovec[], + size_t n_input_iovec); + /* This modifies the buffer passed! */ int log_dump_internal( int level, @@ -270,6 +279,11 @@ void log_assert_failed_return_realm( error, __FILE__, __LINE__, __func__, __VA_ARGS__) #define log_struct(level, ...) log_struct_errno(level, 0, __VA_ARGS__) +#define log_struct_iovec_errno(level, error, iovec, n_iovec) \ + log_struct_iovec_internal(LOG_REALM_PLUS_LEVEL(LOG_REALM, level), \ + error, __FILE__, __LINE__, __func__, iovec, n_iovec) +#define log_struct_iovec(level, iovec, n_iovec) log_struct_iovec_errno(level, 0, iovec, n_iovec) + /* This modifies the buffer passed! 
*/ #define log_dump(level, buffer) \ log_dump_internal(LOG_REALM_PLUS_LEVEL(LOG_REALM, level), \ diff --git a/src/core/unit.c b/src/core/unit.c index 65a8c77f86f..5526dd805cf 100644 --- a/src/core/unit.c +++ b/src/core/unit.c @@ -39,6 +39,7 @@ #include "fileio-label.h" #include "format-util.h" #include "id128-util.h" +#include "io-util.h" #include "load-dropin.h" #include "load-fragment.h" #include "log.h" @@ -2001,6 +2002,134 @@ void unit_trigger_notify(Unit *u) { UNIT_VTABLE(other)->trigger_notify(other, u); } +static int unit_log_resources(Unit *u) { + + struct iovec iovec[1 + _CGROUP_IP_ACCOUNTING_METRIC_MAX + 4]; + size_t n_message_parts = 0, n_iovec = 0; + char* message_parts[3 + 1], *t; + nsec_t nsec = NSEC_INFINITY; + CGroupIPAccountingMetric m; + size_t i; + int r; + const char* const ip_fields[_CGROUP_IP_ACCOUNTING_METRIC_MAX] = { + [CGROUP_IP_INGRESS_BYTES] = "IP_METRIC_INGRESS_BYTES", + [CGROUP_IP_INGRESS_PACKETS] = "IP_METRIC_INGRESS_PACKETS", + [CGROUP_IP_EGRESS_BYTES] = "IP_METRIC_EGRESS_BYTES", + [CGROUP_IP_EGRESS_PACKETS] = "IP_METRIC_EGRESS_PACKETS", + }; + + assert(u); + + /* Invoked whenever a unit enters failed or dead state. Logs information about consumed resources if resource + * accounting was enabled for a unit. It does this in two ways: a friendly human readable string with reduced + * information and the complete data in structured fields. */ + + (void) unit_get_cpu_usage(u, &nsec); + if (nsec != NSEC_INFINITY) { + char buf[FORMAT_TIMESPAN_MAX] = ""; + + /* Format the CPU time for inclusion in the structured log message */ + if (asprintf(&t, "CPU_USAGE_NSEC=%" PRIu64, nsec) < 0) { + r = log_oom(); + goto finish; + } + iovec[n_iovec++] = IOVEC_MAKE_STRING(t); + + /* Format the CPU time for inclusion in the human language message string */ + format_timespan(buf, sizeof(buf), nsec / NSEC_PER_USEC, USEC_PER_MSEC); + t = strjoin(n_message_parts > 0 ? 
"consumed " : "Consumed ", buf, " CPU time"); + if (!t) { + r = log_oom(); + goto finish; + } + + message_parts[n_message_parts++] = t; + } + + for (m = 0; m < _CGROUP_IP_ACCOUNTING_METRIC_MAX; m++) { + char buf[FORMAT_BYTES_MAX] = ""; + uint64_t value = UINT64_MAX; + + assert(ip_fields[m]); + + (void) unit_get_ip_accounting(u, m, &value); + if (value == UINT64_MAX) + continue; + + /* Format IP accounting data for inclusion in the structured log message */ + if (asprintf(&t, "%s=%" PRIu64, ip_fields[m], value) < 0) { + r = log_oom(); + goto finish; + } + iovec[n_iovec++] = IOVEC_MAKE_STRING(t); + + /* Format the IP accounting data for inclusion in the human language message string, but only for the + * bytes counters (and not for the packets counters) */ + if (m == CGROUP_IP_INGRESS_BYTES) + t = strjoin(n_message_parts > 0 ? "received " : "Received ", + format_bytes(buf, sizeof(buf), value), + " IP traffic"); + else if (m == CGROUP_IP_EGRESS_BYTES) + t = strjoin(n_message_parts > 0 ? "sent " : "Sent ", + format_bytes(buf, sizeof(buf), value), + " IP traffic"); + else + continue; + if (!t) { + r = log_oom(); + goto finish; + } + + message_parts[n_message_parts++] = t; + } + + /* Is there any accounting data available at all? 
*/ + if (n_iovec == 0) { + r = 0; + goto finish; + } + + if (n_message_parts == 0) + t = strjoina("MESSAGE=", u->id, ": Completed"); + else { + _cleanup_free_ char *joined; + + message_parts[n_message_parts] = NULL; + + joined = strv_join(message_parts, ", "); + if (!joined) { + r = log_oom(); + goto finish; + } + + t = strjoina("MESSAGE=", u->id, ": ", joined); + } + + /* The following four fields we allocate on the stack or are static strings, we hence don't want to free them, + * and hence don't increase n_iovec for them */ + iovec[n_iovec] = IOVEC_MAKE_STRING(t); + iovec[n_iovec + 1] = IOVEC_MAKE_STRING("MESSAGE_ID=" SD_MESSAGE_UNIT_RESOURCES_STR); + + t = strjoina(u->manager->unit_log_field, u->id); + iovec[n_iovec + 2] = IOVEC_MAKE_STRING(t); + + t = strjoina(u->manager->invocation_log_field, u->invocation_id_string); + iovec[n_iovec + 3] = IOVEC_MAKE_STRING(t); + + log_struct_iovec(LOG_INFO, iovec, n_iovec + 4); + r = 0; + +finish: + for (i = 0; i < n_message_parts; i++) + free(message_parts[i]); + + for (i = 0; i < n_iovec; i++) + free(iovec[i].iov_base); + + return r; + +} + void unit_notify(Unit *u, UnitActiveState os, UnitActiveState ns, bool reload_success) { Manager *m; bool unexpected; @@ -2172,28 +2301,33 @@ void unit_notify(Unit *u, UnitActiveState os, UnitActiveState ns, bool reload_su manager_send_unit_plymouth(m, u); } else { + /* We don't care about D-Bus going down here, since we'll get an asynchronous notification for it + * anyway. */ - /* We don't care about D-Bus here, since we'll get an - * asynchronous notification for it anyway. */ + if (UNIT_IS_INACTIVE_OR_FAILED(ns) && + !UNIT_IS_INACTIVE_OR_FAILED(os) + && !MANAGER_IS_RELOADING(m)) { - if (u->type == UNIT_SERVICE && - UNIT_IS_INACTIVE_OR_FAILED(ns) && - !UNIT_IS_INACTIVE_OR_FAILED(os) && - !MANAGER_IS_RELOADING(m)) { + /* This unit just stopped/failed. 
*/ + if (u->type == UNIT_SERVICE) { - /* Hmm, if there was no start record written - * write it now, so that we always have a nice - * pair */ - if (!u->in_audit) { - manager_send_unit_audit(m, u, AUDIT_SERVICE_START, ns == UNIT_INACTIVE); + /* Hmm, if there was no start record written + * write it now, so that we always have a nice + * pair */ + if (!u->in_audit) { + manager_send_unit_audit(m, u, AUDIT_SERVICE_START, ns == UNIT_INACTIVE); - if (ns == UNIT_INACTIVE) - manager_send_unit_audit(m, u, AUDIT_SERVICE_STOP, true); - } else - /* Write audit record if we have just finished shutting down */ - manager_send_unit_audit(m, u, AUDIT_SERVICE_STOP, ns == UNIT_INACTIVE); + if (ns == UNIT_INACTIVE) + manager_send_unit_audit(m, u, AUDIT_SERVICE_STOP, true); + } else + /* Write audit record if we have just finished shutting down */ + manager_send_unit_audit(m, u, AUDIT_SERVICE_STOP, ns == UNIT_INACTIVE); - u->in_audit = false; + u->in_audit = false; + } + + /* Write a log message about consumed resources */ + unit_log_resources(u); } } diff --git a/src/systemd/sd-messages.h b/src/systemd/sd-messages.h index 4bc248a4b16..8c23486779f 100644 --- a/src/systemd/sd-messages.h +++ b/src/systemd/sd-messages.h @@ -103,6 +103,9 @@ _SD_BEGIN_DECLARATIONS; #define SD_MESSAGE_UNIT_RESTART_SCHEDULED_STR \ SD_ID128_MAKE_STR(5e,b0,34,94,b6,58,48,70,a5,36,b3,37,29,08,09,b3) +#define SD_MESSAGE_UNIT_RESOURCES SD_ID128_MAKE(ae,8f,7b,86,6b,03,47,b9,af,31,fe,1c,80,b1,27,c0) +#define SD_MESSAGE_UNIT_RESOURCES_STR SD_ID128_MAKE_STR(ae,8f,7b,86,6b,03,47,b9,af,31,fe,1c,80,b1,27,c0) + #define SD_MESSAGE_SPAWN_FAILED SD_ID128_MAKE(64,12,57,65,1c,1b,4e,c9,a8,62,4d,7a,40,a9,e1,e7) #define SD_MESSAGE_SPAWN_FAILED_STR SD_ID128_MAKE_STR(64,12,57,65,1c,1b,4e,c9,a8,62,4d,7a,40,a9,e1,e7) From c4ad3f43ef4cbe73d7d2c4516ab17f0e907dfe16 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 21 Sep 2017 19:37:11 +0200 Subject: [PATCH 35/42] rlimit: don't assume getrlimit() always succeeds In times of 
seccomp it might very well fail, and given that we return failures from this function anyway, let's also propagate getrlimit() failures, just to be safe. --- src/basic/rlimit-util.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/basic/rlimit-util.c b/src/basic/rlimit-util.c index ca834df6213..5c41429f01c 100644 --- a/src/basic/rlimit-util.c +++ b/src/basic/rlimit-util.c @@ -42,7 +42,8 @@ int setrlimit_closest(int resource, const struct rlimit *rlim) { /* So we failed to set the desired setrlimit, then let's try * to get as close as we can */ - assert_se(getrlimit(resource, &highest) == 0); + if (getrlimit(resource, &highest) < 0) + return -errno; fixed.rlim_cur = MIN(rlim->rlim_cur, highest.rlim_max); fixed.rlim_max = MIN(rlim->rlim_max, highest.rlim_max); From fb3ae275cbd6bc17444e0816ee4c7496f2750209 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 21 Sep 2017 19:43:07 +0200 Subject: [PATCH 36/42] main: bump RLIMIT_MEMLOCK for the root user substantially On current kernels BPF_MAP_TYPE_LPM_TRIE bpf maps are charged against RLIMIT_MEMLOCK even for privileged users that have CAP_IPC_LOCK. Given that mlock() generally ignores RLIMIT_MEMLOCK if CAP_IPC_LOCK is set this appears to be an oversight in the kernel. Either way, until that's fixed, let's just bump RLIMIT_MEMLOCK for the root user considerably, as the default is quite limiting, and doesn't permit us to create more than a few TRIE maps.
--- src/core/main.c | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/src/core/main.c b/src/core/main.c index 8660a31a2dd..2dfd48005b7 100644 --- a/src/core/main.c +++ b/src/core/main.c @@ -1205,6 +1205,26 @@ static int bump_rlimit_nofile(struct rlimit *saved_rlimit) { return 0; } +static int bump_rlimit_memlock(struct rlimit *saved_rlimit) { + int r; + + assert(saved_rlimit); + assert(getuid() == 0); + + /* BPF_MAP_TYPE_LPM_TRIE bpf maps are charged against RLIMIT_MEMLOCK, even though we have CAP_IPC_LOCK which + * should normally disable such checks. We need them to implement IPAccessAllow= and IPAccessDeny=, hence let's + * bump the value high enough for the root user. */ + + if (getrlimit(RLIMIT_MEMLOCK, saved_rlimit) < 0) + return log_warning_errno(errno, "Reading RLIMIT_MEMLOCK failed, ignoring: %m"); + + r = setrlimit_closest(RLIMIT_MEMLOCK, &RLIMIT_MAKE_CONST(1024ULL*1024ULL*16ULL)); + if (r < 0) + return log_warning_errno(r, "Setting RLIMIT_MEMLOCK failed, ignoring: %m"); + + return 0; +} + static void test_usr(void) { /* Check that /usr is not a separate fs */ @@ -1388,7 +1408,7 @@ int main(int argc, char *argv[]) { bool queue_default_job = false; bool empty_etc = false; char *switch_root_dir = NULL, *switch_root_init = NULL; - struct rlimit saved_rlimit_nofile = RLIMIT_MAKE_CONST(0); + struct rlimit saved_rlimit_nofile = RLIMIT_MAKE_CONST(0), saved_rlimit_memlock = RLIMIT_MAKE_CONST((rlim_t) -1); const char *error_message = NULL; #ifdef HAVE_SYSV_COMPAT @@ -1815,9 +1835,11 @@ int main(int argc, char *argv[]) { if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) log_warning_errno(errno, "Failed to make us a subreaper: %m"); - if (arg_system) + if (arg_system) { /* Bump up RLIMIT_NOFILE for systemd itself */ (void) bump_rlimit_nofile(&saved_rlimit_nofile); + (void) bump_rlimit_memlock(&saved_rlimit_memlock); + } } r = manager_new(arg_system ? 
UNIT_FILE_SYSTEM : UNIT_FILE_USER, @@ -2051,6 +2073,8 @@ finish: * its child processes */ if (saved_rlimit_nofile.rlim_cur > 0) (void) setrlimit(RLIMIT_NOFILE, &saved_rlimit_nofile); + if (saved_rlimit_memlock.rlim_cur != (rlim_t) -1) + (void) setrlimit(RLIMIT_MEMLOCK, &saved_rlimit_memlock); if (switch_root_dir) { /* Kill all remaining processes from the From ee859930d3528ea0843b3da588315df349824216 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 21 Sep 2017 20:12:11 +0200 Subject: [PATCH 37/42] man: drop misplaced "," before "-.slice" --- man/systemd.slice.xml | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/man/systemd.slice.xml b/man/systemd.slice.xml index 67f7a934481..c46ba7a2e16 100644 --- a/man/systemd.slice.xml +++ b/man/systemd.slice.xml @@ -53,22 +53,15 @@ Description - A unit configuration file whose name ends in - .slice encodes information about a slice which - is a concept for hierarchically managing resources of a group of - processes. This management is performed by creating a node in the - Linux Control Group (cgroup) tree. Units that manage processes - (primarily scope and service units) may be assigned to a specific - slice. For each slice, certain resource limits may be set that - apply to all processes of all units contained in that - slice. Slices are organized hierarchically in a tree. The name of - the slice encodes the location in the tree. The name consists of a - dash-separated series of names, which describes the path to the - slice from the root slice. The root slice is named, - -.slice. Example: - foo-bar.slice is a slice that is located - within foo.slice, which in turn is located in - the root slice -.slice. + A unit configuration file whose name ends in .slice encodes information about a slice + unit. A slice unit is a concept for hierarchically managing resources of a group of processes. 
This management is + performed by creating a node in the Linux Control Group (cgroup) tree. Units that manage processes (primarily scope + and service units) may be assigned to a specific slice. For each slice, certain resource limits may be set that + apply to all processes of all units contained in that slice. Slices are organized hierarchically in a tree. The + name of the slice encodes the location in the tree. The name consists of a dash-separated series of names, which + describes the path to the slice from the root slice. The root slice is named -.slice. Example: + foo-bar.slice is a slice that is located within foo.slice, which in turn + is located in the root slice -.slice. Note that slice units cannot be templated, nor is possible to add multiple names to a slice unit by creating From 1180181a5111d6cb4900727d57fc894a96b8f247 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 21 Sep 2017 20:12:40 +0200 Subject: [PATCH 38/42] man: remove double newlines in systemd.special man page header The comment lines resulted in double newlines in the man page header, which looks quite ugly. Let's rearrange a bit so that these comments don't result in changes in the output. 
--- man/systemd.special.xml | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/man/systemd.special.xml b/man/systemd.special.xml index 5a831149bcf..d74d3d719e8 100644 --- a/man/systemd.special.xml +++ b/man/systemd.special.xml @@ -48,8 +48,7 @@ - - basic.target, + basic.target, bluetooth.target, cryptsetup-pre.target, cryptsetup.target, @@ -107,13 +106,11 @@ time-sync.target, timers.target, umount.target, - - -.slice, + -.slice, system.slice, user.slice, machine.slice, - - dbus.service, + dbus.service, dbus.socket, display-manager.service, system-update-cleanup.service From 1c382774c51afb21abdb776adac57b524648bd46 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 21 Sep 2017 20:22:06 +0200 Subject: [PATCH 39/42] man: document two more special units --- man/systemd.special.xml | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/man/systemd.special.xml b/man/systemd.special.xml index d74d3d719e8..dc0b313b0ca 100644 --- a/man/systemd.special.xml +++ b/man/systemd.special.xml @@ -110,9 +110,11 @@ system.slice, user.slice, machine.slice, - dbus.service, + -.mount, + dbus.service, dbus.socket, display-manager.service, + init.scope, system-update-cleanup.service @@ -128,6 +130,15 @@ Special System Units + + -.mount + + The root mount point, i.e. the mount unit for the / path. This unit is + unconditionally active, during the entire time the system is up, as this mount point is where the basic + userspace is running from. + + + basic.target @@ -323,6 +334,13 @@ directly. + + init.scope + + This scope unit is where the system and service manager (PID 1) itself resides. It is active as long as + the system is running. 
+ + initrd-fs.target From 9f2e6892a2e70ea3ee84d232f5f4ef3bf217ce4f Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 21 Sep 2017 20:38:07 +0200 Subject: [PATCH 40/42] bpf: set BPF_F_ALLOW_OVERRIDE when attaching a cgroup program if Delegate=yes is set Let's permit installing BPF programs in cgroup subtrees if Delegate=yes. Let's not document this precise behaviour for now though, as most likely the logic here should become recursive, but that's only going to happen if the kernel starts supporting that. Until then, support this in a non-recursive fashion. --- src/basic/bpf-program.c | 3 ++- src/basic/bpf-program.h | 2 +- src/core/bpf-firewall.c | 12 ++++++++++-- 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/src/basic/bpf-program.c b/src/basic/bpf-program.c index 9326176743f..ce6f9e44098 100644 --- a/src/basic/bpf-program.c +++ b/src/basic/bpf-program.c @@ -91,7 +91,7 @@ int bpf_program_load_kernel(BPFProgram *p, char *log_buf, size_t log_size) { return 0; } -int bpf_program_cgroup_attach(BPFProgram *p, int type, const char *path) { +int bpf_program_cgroup_attach(BPFProgram *p, int type, const char *path, uint32_t flags) { _cleanup_close_ int fd = -1; union bpf_attr attr; @@ -107,6 +107,7 @@ int bpf_program_cgroup_attach(BPFProgram *p, int type, const char *path) { .attach_type = type, .target_fd = fd, .attach_bpf_fd = p->kernel_fd, + .attach_flags = flags, }; if (bpf(BPF_PROG_ATTACH, &attr, sizeof(attr)) < 0) diff --git a/src/basic/bpf-program.h b/src/basic/bpf-program.h index 0dd150b60a0..35a41ffc446 100644 --- a/src/basic/bpf-program.h +++ b/src/basic/bpf-program.h @@ -45,7 +45,7 @@ BPFProgram *bpf_program_unref(BPFProgram *p); int bpf_program_add_instructions(BPFProgram *p, const struct bpf_insn *insn, size_t count); int bpf_program_load_kernel(BPFProgram *p, char *log_buf, size_t log_size); -int bpf_program_cgroup_attach(BPFProgram *p, int type, const char *path); +int bpf_program_cgroup_attach(BPFProgram *p, int type, const char
*path, uint32_t flags); int bpf_program_cgroup_detach(int type, const char *path); int bpf_map_new(enum bpf_map_type type, size_t key_size, size_t value_size, size_t max_entries, uint32_t flags); diff --git a/src/core/bpf-firewall.c b/src/core/bpf-firewall.c index 732c36fc1a8..909c1c8253f 100644 --- a/src/core/bpf-firewall.c +++ b/src/core/bpf-firewall.c @@ -539,10 +539,18 @@ int bpf_firewall_compile(Unit *u) { int bpf_firewall_install(Unit *u) { _cleanup_free_ char *path = NULL; + CGroupContext *cc; int r; assert(u); + if (!u->cgroup_path) + return -EINVAL; + + cc = unit_get_cgroup_context(u); + if (!cc) + return -EINVAL; + r = bpf_firewall_supported(); if (r < 0) return r; @@ -560,7 +568,7 @@ int bpf_firewall_install(Unit *u) { if (r < 0) return log_error_errno(r, "Kernel upload of egress BPF program failed: %m"); - r = bpf_program_cgroup_attach(u->ip_bpf_egress, BPF_CGROUP_INET_EGRESS, path); + r = bpf_program_cgroup_attach(u->ip_bpf_egress, BPF_CGROUP_INET_EGRESS, path, cc->delegate ? BPF_F_ALLOW_OVERRIDE : 0); if (r < 0) return log_error_errno(r, "Attaching egress BPF program to cgroup %s failed: %m", path); } else { @@ -575,7 +583,7 @@ int bpf_firewall_install(Unit *u) { if (r < 0) return log_error_errno(r, "Kernel upload of ingress BPF program failed: %m"); - r = bpf_program_cgroup_attach(u->ip_bpf_ingress, BPF_CGROUP_INET_INGRESS, path); + r = bpf_program_cgroup_attach(u->ip_bpf_ingress, BPF_CGROUP_INET_INGRESS, path, cc->delegate ? 
BPF_F_ALLOW_OVERRIDE : 0); if (r < 0) return log_error_errno(r, "Attaching ingress BPF program to cgroup %s failed: %m", path); } else { From 22c8321b09959eebe7bbbf609e9305e0d9a699b8 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 7 Sep 2017 20:03:55 +0200 Subject: [PATCH 41/42] update TODO --- TODO | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/TODO b/TODO index cabba100a52..9488cd475e8 100644 --- a/TODO +++ b/TODO @@ -24,6 +24,15 @@ Janitorial Clean-ups: Features: +* fix logging in execute.c: extend log.c to have an optional mode where + log_open() is implicitly done before each log line and log_close() right + after. This way we don't have open fds around but logs will still + work. Because it is slow this mode should used exclusively in the execute.c + case. + +* set IPAddressDeny=any on all services that shouldn't do networking (possibly + combined with IPAddressAllow=localhost). + * dissect: when we discover squashfs, don't claim we had a "writable" partition in systemd-dissect From 40056777309b06e6119a51cf789d2afa421564d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zbigniew=20J=C4=99drzejewski-Szmek?= Date: Tue, 26 Sep 2017 16:04:33 +0200 Subject: [PATCH 42/42] basic/log: fix return value from log_struct_iovec_internal() This returned value so far wasn't used anywhere, so there's no change in behaviour. --- src/basic/log.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/basic/log.c b/src/basic/log.c index 591c6d2a13e..168c6c37ceb 100644 --- a/src/basic/log.c +++ b/src/basic/log.c @@ -1022,7 +1022,7 @@ int log_struct_iovec_internal( } if (sendmsg(journal_fd, &mh, MSG_NOSIGNAL) >= 0) - return -errno; + return -error; } for (i = 0; i < n_input_iovec; i++) {