diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index 19c7b26bcee..94c8e7a2dd3 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -1100,7 +1100,29 @@ BindReadOnlyPaths=/var/lib/systemd Note that the implementation of this setting might be impossible (for example if network namespaces are not available), and the unit should be written in a way that does not solely rely on this setting for - security. + security. + + When this option is used on a socket unit any sockets bound on behalf of this unit will be + bound within a private network namespace. This may be combined with + JoinsNamespaceOf= to listen on sockets inside of network namespaces of other + services. + + + + NetworkNamespacePath= + + Takes an absolute file system path referring to a Linux network namespace + pseudo-file (i.e. a file like /proc/$PID/ns/net or a bind mount or symlink to + one). When set the invoked processes are added to the network namespace referenced by that path. The + path has to point to a valid namespace file at the moment the processes are forked off. If this + option is used PrivateNetwork= has no effect. If this option is used together with + JoinsNamespaceOf= then it only has an effect if this unit is started before any of + the listed units that have PrivateNetwork= or + NetworkNamespacePath= configured, as otherwise the network namespace of those + units is reused. + + When this option is used on a socket unit any sockets bound on behalf of this unit will be + bound within the specified network namespace. diff --git a/man/systemd.unit.xml b/man/systemd.unit.xml index 82c63e1609d..14418c359f4 100644 --- a/man/systemd.unit.xml +++ b/man/systemd.unit.xml @@ -728,23 +728,18 @@ JoinsNamespaceOf= - For units that start processes (such as - service units), lists one or more other units whose network - and/or temporary file namespace to join. 
This only applies to - unit types which support the - PrivateNetwork= and + For units that start processes (such as service units), lists one or more other units + whose network and/or temporary file namespace to join. This only applies to unit types which support + the PrivateNetwork=, NetworkNamespacePath= and PrivateTmp= directives (see - systemd.exec5 - for details). If a unit that has this setting set is started, - its processes will see the same /tmp, - /var/tmp and network namespace as one - listed unit that is started. If multiple listed units are - already started, it is not defined which namespace is joined. - Note that this setting only has an effect if - PrivateNetwork= and/or - PrivateTmp= is enabled for both the unit - that joins the namespace and the unit whose namespace is - joined. + systemd.exec5 for + details). If a unit that has this setting set is started, its processes will see the same + /tmp, /var/tmp and network namespace as one listed unit + that is started. If multiple listed units are already started, it is not defined which namespace is + joined. Note that this setting only has an effect if + PrivateNetwork=/NetworkNamespacePath= and/or + PrivateTmp= is enabled for both the unit that joins the namespace and the unit + whose namespace is joined. 
diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c index f22bf4a371c..0b28643e791 100644 --- a/src/core/dbus-execute.c +++ b/src/core/dbus-execute.c @@ -778,6 +778,7 @@ const sd_bus_vtable bus_exec_vtable[] = { SD_BUS_PROPERTY("MountAPIVFS", "b", bus_property_get_bool, offsetof(ExecContext, mount_apivfs), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("KeyringMode", "s", property_get_exec_keyring_mode, offsetof(ExecContext, keyring_mode), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("ProtectHostname", "b", bus_property_get_bool, offsetof(ExecContext, protect_hostname), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("NetworkNamespacePath", "s", NULL, offsetof(ExecContext, network_namespace_path), SD_BUS_VTABLE_PROPERTY_CONST), /* Obsolete/redundant properties: */ SD_BUS_PROPERTY("Capabilities", "s", property_get_empty_string, 0, SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), @@ -1217,6 +1218,9 @@ int bus_exec_context_set_transient_property( if (streq(name, "MountFlags")) return bus_set_transient_mount_flags(u, name, &c->mount_flags, message, flags, error); + if (streq(name, "NetworkNamespacePath")) + return bus_set_transient_path(u, name, &c->network_namespace_path, message, flags, error); + if (streq(name, "SupplementaryGroups")) { _cleanup_strv_free_ char **l = NULL; char **p; diff --git a/src/core/execute.c b/src/core/execute.c index 3f784dd1797..c6fd82bbf3e 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -3062,6 +3062,14 @@ static int exec_child( } } + if (context->network_namespace_path && runtime && runtime->netns_storage_socket[0] >= 0) { + r = open_netns_path(runtime->netns_storage_socket, context->network_namespace_path); + if (r < 0) { + *exit_status = EXIT_NETWORK; + return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path); + } + } + r = setup_input(context, params, socket_fd, named_iofds); if (r < 0) { *exit_status = EXIT_STDIN; @@ -3272,13 +3280,17 @@ static 
int exec_child( } } - if (context->private_network && runtime && runtime->netns_storage_socket[0] >= 0) { + if ((context->private_network || context->network_namespace_path) && runtime && runtime->netns_storage_socket[0] >= 0) { + if (ns_type_supported(NAMESPACE_NET)) { r = setup_netns(runtime->netns_storage_socket); if (r < 0) { *exit_status = EXIT_NETWORK; return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m"); } + } else if (context->network_namespace_path) { + *exit_status = EXIT_NETWORK; + return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP), "NetworkNamespacePath= is not supported, refusing."); } else log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring."); } @@ -3879,6 +3891,8 @@ void exec_context_done(ExecContext *c) { c->stdin_data = mfree(c->stdin_data); c->stdin_data_size = 0; + + c->network_namespace_path = mfree(c->network_namespace_path); } int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) { @@ -4556,6 +4570,11 @@ void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) { prefix, s); } + if (c->network_namespace_path) + fprintf(f, + "%sNetworkNamespacePath: %s\n", + prefix, c->network_namespace_path); + if (c->syscall_errno > 0) { const char *errno_name; @@ -4855,18 +4874,23 @@ static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) { } static void exec_runtime_freep(ExecRuntime **rt) { - if (*rt) - (void) exec_runtime_free(*rt, false); + (void) exec_runtime_free(*rt, false); } -static int exec_runtime_allocate(ExecRuntime **rt) { - assert(rt); +static int exec_runtime_allocate(ExecRuntime **ret) { + ExecRuntime *n; - *rt = new0(ExecRuntime, 1); - if (!*rt) + assert(ret); + + n = new(ExecRuntime, 1); + if (!n) return -ENOMEM; - (*rt)->netns_storage_socket[0] = (*rt)->netns_storage_socket[1] = -1; + *n = (ExecRuntime) { + .netns_storage_socket = { -1, -1 }, + }; + + *ret = n; 
return 0; } @@ -4929,7 +4953,7 @@ static int exec_runtime_add( static int exec_runtime_make(Manager *m, const ExecContext *c, const char *id, ExecRuntime **ret) { _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL; - _cleanup_close_pair_ int netns_storage_socket[2] = {-1, -1}; + _cleanup_close_pair_ int netns_storage_socket[2] = { -1, -1 }; int r; assert(m); @@ -4937,7 +4961,7 @@ static int exec_runtime_make(Manager *m, const ExecContext *c, const char *id, E assert(id); /* It is not necessary to create ExecRuntime object. */ - if (!c->private_network && !c->private_tmp) + if (!c->private_network && !c->private_tmp && !c->network_namespace_path) return 0; if (c->private_tmp) { @@ -4946,7 +4970,7 @@ static int exec_runtime_make(Manager *m, const ExecContext *c, const char *id, E return r; } - if (c->private_network) { + if (c->private_network || c->network_namespace_path) { if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0) return -errno; } @@ -4956,8 +4980,7 @@ static int exec_runtime_make(Manager *m, const ExecContext *c, const char *id, E return r; /* Avoid cleanup */ - netns_storage_socket[0] = -1; - netns_storage_socket[1] = -1; + netns_storage_socket[0] = netns_storage_socket[1] = -1; return 1; } diff --git a/src/core/execute.h b/src/core/execute.h index 4b5b2d98cef..df6dd9f3886 100644 --- a/src/core/execute.h +++ b/src/core/execute.h @@ -279,6 +279,8 @@ struct ExecContext { bool nice_set:1; bool ioprio_set:1; bool cpu_sched_set:1; + + char *network_namespace_path; }; static inline bool exec_context_restrict_namespaces_set(const ExecContext *c) { diff --git a/src/core/load-fragment-gperf.gperf.m4 b/src/core/load-fragment-gperf.gperf.m4 index 2ac822ef4b2..c7c097d0a4a 100644 --- a/src/core/load-fragment-gperf.gperf.m4 +++ b/src/core/load-fragment-gperf.gperf.m4 @@ -114,6 +114,7 @@ $1.PrivateDevices, config_parse_bool, 0, $1.ProtectKernelTunables, config_parse_bool, 0, offsetof($1, exec_context.protect_kernel_tunables) 
$1.ProtectKernelModules, config_parse_bool, 0, offsetof($1, exec_context.protect_kernel_modules) $1.ProtectControlGroups, config_parse_bool, 0, offsetof($1, exec_context.protect_control_groups) +$1.NetworkNamespacePath, config_parse_unit_path_printf, 0, offsetof($1, exec_context.network_namespace_path) $1.PrivateNetwork, config_parse_bool, 0, offsetof($1, exec_context.private_network) $1.PrivateUsers, config_parse_bool, 0, offsetof($1, exec_context.private_users) $1.PrivateMounts, config_parse_bool, 0, offsetof($1, exec_context.private_mounts) diff --git a/src/core/namespace.c b/src/core/namespace.c index d482c40c240..02ac49d02cf 100644 --- a/src/core/namespace.c +++ b/src/core/namespace.c @@ -1661,14 +1661,14 @@ int setup_netns(int netns_storage_socket[static 2]) { netns = receive_one_fd(netns_storage_socket[0], MSG_DONTWAIT); if (netns == -EAGAIN) { - /* Nothing stored yet, so let's create a new namespace */ + /* Nothing stored yet, so let's create a new namespace. */ if (unshare(CLONE_NEWNET) < 0) { r = -errno; goto fail; } - loopback_setup(); + (void) loopback_setup(); netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY); if (netns < 0) { @@ -1703,6 +1703,59 @@ fail: return r; } +int open_netns_path(int netns_storage_socket[static 2], const char *path) { + _cleanup_close_ int netns = -1; + int q, r; + + assert(netns_storage_socket); + assert(netns_storage_socket[0] >= 0); + assert(netns_storage_socket[1] >= 0); + assert(path); + + /* If the storage socket doesn't contain a netns fd yet, open one via the file system and store it in + * it. This is supposed to be called ahead of time, i.e. before setup_netns() which will allocate a + * new anonymous netns if needed. */ + + if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0) + return -errno; + + netns = receive_one_fd(netns_storage_socket[0], MSG_DONTWAIT); + if (netns == -EAGAIN) { + /* Nothing stored yet. Open the file from the file system. 
*/ + + netns = open(path, O_RDONLY|O_NOCTTY|O_CLOEXEC); + if (netns < 0) { + r = -errno; + goto fail; + } + + r = fd_is_network_ns(netns); + if (r == 0) { /* Not a netns? Refuse early. */ + r = -EINVAL; + goto fail; + } + if (r < 0 && r != -EUCLEAN) /* EUCLEAN: we don't know */ + goto fail; + + r = 1; + + } else if (netns < 0) { + r = netns; + goto fail; + } else + r = 0; /* Already allocated */ + + q = send_one_fd(netns_storage_socket[1], netns, MSG_DONTWAIT); + if (q < 0) { + r = q; + goto fail; + } + +fail: + (void) lockf(netns_storage_socket[0], F_ULOCK, 0); + return r; +} + bool ns_type_supported(NamespaceType type) { const char *t, *ns_proc; diff --git a/src/core/namespace.h b/src/core/namespace.h index ab3983f790c..cd1e8b77bb2 100644 --- a/src/core/namespace.h +++ b/src/core/namespace.h @@ -93,6 +93,7 @@ int setup_tmp_dirs( char **var_tmp_dir); int setup_netns(int netns_storage_socket[static 2]); +int open_netns_path(int netns_storage_socket[static 2], const char *path); const char* protect_home_to_string(ProtectHome p) _const_; ProtectHome protect_home_from_string(const char *s) _pure_; diff --git a/src/core/socket.c b/src/core/socket.c index af95e9027e1..3b60914c867 100644 --- a/src/core/socket.c +++ b/src/core/socket.c @@ -1473,6 +1473,25 @@ static int socket_address_listen_do( log_unit_error_errno(u, error, fmt, strna(_t)); \ }) +static int fork_needed(const SocketAddress *address, const ExecContext *context) { + int r; + + assert(address); + assert(context); + + /* Check if we need to do the cgroup or netns stuff. If not we can do things much simpler. 
*/ + + if (IN_SET(address->sockaddr.sa.sa_family, AF_INET, AF_INET6)) { + r = bpf_firewall_supported(); + if (r < 0) + return r; + if (r != BPF_FIREWALL_UNSUPPORTED) /* If BPF firewalling isn't supported anyway — there's no point in this forking complexity */ + return true; + } + + return context->private_network || context->network_namespace_path; +} + static int socket_address_listen_in_cgroup( Socket *s, const SocketAddress *address, @@ -1485,18 +1504,34 @@ static int socket_address_listen_in_cgroup( assert(s); assert(address); - /* This is a wrapper around socket_address_listen(), that forks off a helper process inside the socket's cgroup - * in which the socket is actually created. This way we ensure the socket is actually properly attached to the - * unit's cgroup for the purpose of BPF filtering and such. */ + /* This is a wrapper around socket_address_listen(), that forks off a helper process inside the + * socket's cgroup and network namespace in which the socket is actually created. This way we ensure + * the socket is actually properly attached to the unit's cgroup for the purpose of BPF filtering and + * such. */ - if (!IN_SET(address->sockaddr.sa.sa_family, AF_INET, AF_INET6)) - goto shortcut; /* BPF filtering only applies to IPv4 + IPv6, shortcut things for other protocols */ - - r = bpf_firewall_supported(); + r = fork_needed(address, &s->exec_context); if (r < 0) return r; - if (r == BPF_FIREWALL_UNSUPPORTED) /* If BPF firewalling isn't supported anyway — there's no point in this forking complexity */ - goto shortcut; + if (r == 0) { + /* Shortcut things... 
*/ + fd = socket_address_listen_do(s, address, label); + if (fd < 0) + return log_address_error_errno(UNIT(s), address, fd, "Failed to create listening socket (%s): %m"); + + return fd; + } + + r = unit_setup_exec_runtime(UNIT(s)); + if (r < 0) + return log_unit_error_errno(UNIT(s), r, "Failed acquire runtime: %m"); + + if (s->exec_context.network_namespace_path && + s->exec_runtime && + s->exec_runtime->netns_storage_socket[0] >= 0) { + r = open_netns_path(s->exec_runtime->netns_storage_socket, s->exec_context.network_namespace_path); + if (r < 0) + return log_unit_error_errno(UNIT(s), r, "Failed to open network namespace path %s: %m", s->exec_context.network_namespace_path); + } if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pair) < 0) return log_unit_error_errno(UNIT(s), errno, "Failed to create communication channel: %m"); @@ -1509,6 +1544,23 @@ static int socket_address_listen_in_cgroup( pair[0] = safe_close(pair[0]); + if ((s->exec_context.private_network || s->exec_context.network_namespace_path) && + s->exec_runtime && + s->exec_runtime->netns_storage_socket[0] >= 0) { + + if (ns_type_supported(NAMESPACE_NET)) { + r = setup_netns(s->exec_runtime->netns_storage_socket); + if (r < 0) { + log_unit_error_errno(UNIT(s), r, "Failed to join network namespace: %m"); + _exit(EXIT_NETWORK); + } + } else if (s->exec_context.network_namespace_path) { + log_unit_error(UNIT(s), "Network namespace path configured but network namespaces not supported."); + _exit(EXIT_NETWORK); + } else + log_unit_warning(UNIT(s), "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring."); + } + fd = socket_address_listen_do(s, address, label); if (fd < 0) { log_address_error_errno(UNIT(s), address, fd, "Failed to create listening socket (%s): %m"); @@ -1538,13 +1590,6 @@ static int socket_address_listen_in_cgroup( return log_address_error_errno(UNIT(s), address, fd, "Failed to receive listening socket (%s): %m"); return fd; - -shortcut: - fd 
= socket_address_listen_do(s, address, label); - if (fd < 0) - return log_address_error_errno(UNIT(s), address, fd, "Failed to create listening socket (%s): %m"); - - return fd; } DEFINE_TRIVIAL_CLEANUP_FUNC(Socket *, socket_close_fds); diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c index dff87f565ed..3ea1bd29c90 100644 --- a/src/shared/bus-unit-util.c +++ b/src/shared/bus-unit-util.c @@ -744,7 +744,7 @@ static int bus_append_execute_property(sd_bus_message *m, const char *field, con "UtmpIdentifier", "UtmpMode", "PAMName", "TTYPath", "WorkingDirectory", "RootDirectory", "SyslogIdentifier", "ProtectSystem", "ProtectHome", "SELinuxContext", "RootImage", - "RuntimeDirectoryPreserve", "Personality", "KeyringMode")) + "RuntimeDirectoryPreserve", "Personality", "KeyringMode", "NetworkNamespacePath")) return bus_append_string(m, field, eq);