From 7bd4bcf740aa0ee2ac9651dced2019c5b84526f8 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Mon, 16 Jul 2018 12:19:05 +0200 Subject: [PATCH 1/9] update TODO --- TODO | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/TODO b/TODO index 72b1dc7609..9bb01fd574 100644 --- a/TODO +++ b/TODO @@ -35,6 +35,8 @@ Features: course, and watching things asynchronously even more so, but it should be good enough for the idle detection logic at least. +* maybe extend .path units to expose fanotify() per-mount change events + * Add a "systemctl list-units --by-slice" mode or so, which rearranges the output of "systemctl list-units" slightly by showing the tree structure of the slices, and the units attached to them. @@ -69,10 +71,6 @@ Features: that our log messages could contain clickable links for example for unit files and suchlike we operate on. -* introduce a new SystemCallFilters= group called "@system-service" with a - sensible default set for system services, then make use of them in portable - profiles - * add support for "portablectl attach http://foobar.com/waaa.raw (i.e. importd integration) * add attach --enable and attach --now (for attach+enable+start) From 1e5057b904473696ae0d591d7555233adcb51fa4 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 17 Jul 2018 12:23:26 +0200 Subject: [PATCH 2/9] sd-bus: allow connecting to the pseudo-container ".host" machined exposes the pseudo-container ".host" as a reference to the host system, and this means "machinectl login .host" and "machinectl shell .host" get your a login/shell on the host. systemd-run currently doesn't allow that. Let's fix that, and make sd-bus understand ".host" as an alias for connecting to the host system. --- src/basic/util.c | 5 +++++ src/libsystemd/sd-bus/sd-bus.c | 4 ++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/basic/util.c b/src/basic/util.c index 2206c1b4ad..f951d641d7 100644 --- a/src/basic/util.c +++ b/src/basic/util.c @@ -230,6 +230,11 @@ int container_get_leader(const char *machine, pid_t *pid) { assert(machine); assert(pid); + if (streq(machine, ".host")) { + *pid = 1; + return 0; + } + if (!machine_name_is_valid(machine)) return -EINVAL; diff --git a/src/libsystemd/sd-bus/sd-bus.c b/src/libsystemd/sd-bus/sd-bus.c index a87f4625ec..4412fe152f 100644 --- a/src/libsystemd/sd-bus/sd-bus.c +++ b/src/libsystemd/sd-bus/sd-bus.c @@ -949,7 +949,7 @@ static int parse_container_unix_address(sd_bus *b, const char **p, char **guid) return -EINVAL; if (machine) { - if (!machine_name_is_valid(machine)) + if (!streq(machine, ".host") && !machine_name_is_valid(machine)) return -EINVAL; free_and_replace(b->machine, machine); @@ -1447,7 +1447,7 @@ _public_ int sd_bus_open_system_machine(sd_bus **ret, const char *machine) { assert_return(machine, -EINVAL); assert_return(ret, -EINVAL); - assert_return(machine_name_is_valid(machine), -EINVAL); + assert_return(streq(machine, ".host") || machine_name_is_valid(machine), -EINVAL); r = sd_bus_new(&b); if (r < 0) From a8c9b7a0fc0aa02666042543ff9a652aae3c9499 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 17 Jul 2018 12:24:50 +0200 Subject: [PATCH 3/9] sd-login: let's also make sd-login understand ".host" if sd-bus and machined grok it, then sd-login should grok it too. --- src/libsystemd/sd-login/sd-login.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/src/libsystemd/sd-login/sd-login.c b/src/libsystemd/sd-login/sd-login.c index 9dfba51cea..5940eccead 100644 --- a/src/libsystemd/sd-login/sd-login.c +++ b/src/libsystemd/sd-login/sd-login.c @@ -890,20 +890,27 @@ _public_ int sd_machine_get_class(const char *machine, char **class) { const char *p; int r; - assert_return(machine_name_is_valid(machine), -EINVAL); assert_return(class, -EINVAL); - p = strjoina("/run/systemd/machines/", machine); - r = parse_env_file(NULL, p, NEWLINE, "CLASS", &c, NULL); - if (r == -ENOENT) - return -ENXIO; - if (r < 0) - return r; - if (!c) - return -EIO; + if (streq(machine, ".host")) { + c = strdup("host"); + if (!c) + return -ENOMEM; + } else { + if (!machine_name_is_valid(machine)) + return -EINVAL; + + p = strjoina("/run/systemd/machines/", machine); + r = parse_env_file(NULL, p, NEWLINE, "CLASS", &c, NULL); + if (r == -ENOENT) + return -ENXIO; + if (r < 0) + return r; + if (!c) + return -EIO; + } *class = TAKE_PTR(c); - return 0; } From 25b583d7ffd699384435eba8e49f6ce927a83af0 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 5 Jul 2018 09:56:54 +0200 Subject: [PATCH 4/9] core: swap order of "n_storage_fds" and "n_socket_fds" parameters When process fd lists to pass to activated programs we always place the socket activation fds first, and the storage fds last. Irritatingly in almost all calls the "n_storage_fds" parameter (i.e. the number of storage fds to pass) came first so far, and the "n_socket_fds" parameter second. Let's clean this up, and specify the number of fds in the order the fds themselves are passed. (Also, let's fix one more case where "unsigned" was used to size an array, while we should use "size_t" instead.) --- src/core/execute.c | 14 +++++++------- src/core/execute.h | 2 +- src/core/service.c | 25 ++++++++++++++----------- 3 files changed, 22 insertions(+), 19 deletions(-) diff --git a/src/core/execute.c b/src/core/execute.c index a08e3105bf..8c5139dc02 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -148,11 +148,11 @@ static int shift_fds(int fds[], size_t n_fds) { return 0; } -static int flags_fds(const int fds[], size_t n_storage_fds, size_t n_socket_fds, bool nonblock) { +static int flags_fds(const int fds[], size_t n_socket_fds, size_t n_storage_fds, bool nonblock) { size_t i, n_fds; int r; - n_fds = n_storage_fds + n_socket_fds; + n_fds = n_socket_fds + n_storage_fds; if (n_fds <= 0) return 0; @@ -2725,8 +2725,8 @@ static int exec_child( int socket_fd, int named_iofds[3], int *fds, - size_t n_storage_fds, size_t n_socket_fds, + size_t n_storage_fds, char **files_env, int user_lookup_fd, int *exit_status) { @@ -3178,7 +3178,7 @@ static int exec_child( if (r >= 0) r = shift_fds(fds, n_fds); if (r >= 0) - r = flags_fds(fds, n_storage_fds, n_socket_fds, context->non_blocking); + r = flags_fds(fds, n_socket_fds, n_storage_fds, context->non_blocking); if (r < 0) { *exit_status = EXIT_FDS; return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m"); @@ -3456,7 +3456,7 @@ int exec_spawn(Unit *unit, assert(context); assert(ret); assert(params); - assert(params->fds || (params->n_storage_fds + params->n_socket_fds <= 0)); + assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0)); if (context->std_input == EXEC_INPUT_SOCKET || context->std_output == EXEC_OUTPUT_SOCKET || @@ -3476,8 +3476,8 @@ int exec_spawn(Unit *unit, } else { socket_fd = -1; fds = params->fds; - n_storage_fds = params->n_storage_fds; n_socket_fds = params->n_socket_fds; + n_storage_fds = params->n_storage_fds; } r = exec_context_named_iofds(context, params, named_iofds); @@ -3516,8 +3516,8 @@ int exec_spawn(Unit *unit, socket_fd, named_iofds, fds, - n_storage_fds, n_socket_fds, + n_storage_fds, files_env, unit->manager->user_lookup_fds[1], &exit_status); diff --git a/src/core/execute.h b/src/core/execute.h index de44085492..82d1f3ae88 100644 --- a/src/core/execute.h +++ b/src/core/execute.h @@ -297,8 +297,8 @@ struct ExecParameters { int *fds; char **fd_names; - size_t n_storage_fds; size_t n_socket_fds; + size_t n_storage_fds; ExecFlags flags; bool selinux_context_net:1; diff --git a/src/core/service.c b/src/core/service.c index db1356c417..fd6450328e 100644 --- a/src/core/service.c +++ b/src/core/service.c @@ -1178,21 +1178,23 @@ static int service_coldplug(Unit *u) { return 0; } -static int service_collect_fds(Service *s, - int **fds, - char ***fd_names, - unsigned *n_storage_fds, - unsigned *n_socket_fds) { +static int service_collect_fds( + Service *s, + int **fds, + char ***fd_names, + size_t *n_socket_fds, + size_t *n_storage_fds) { _cleanup_strv_free_ char **rfd_names = NULL; _cleanup_free_ int *rfds = NULL; - unsigned rn_socket_fds = 0, rn_storage_fds = 0; + size_t rn_socket_fds = 0, rn_storage_fds = 0; int r; assert(s); assert(fds); assert(fd_names); assert(n_socket_fds); + assert(n_storage_fds); if (s->socket_fd >= 0) { @@ -1256,7 +1258,7 @@ static int service_collect_fds(Service *s, if (s->n_fd_store > 0) { ServiceFDStore *fs; - unsigned n_fds; + size_t n_fds; char **nl; int *t; @@ -1325,9 +1327,10 @@ static int service_spawn( .stdin_fd = -1, .stdout_fd = -1, .stderr_fd = -1, + .exec_fd = -1, }; _cleanup_strv_free_ char **final_env = NULL, **our_env = NULL, **fd_names = NULL; - unsigned n_storage_fds = 0, n_socket_fds = 0, n_env = 0; + size_t n_socket_fds = 0, n_storage_fds = 0, n_env = 0; _cleanup_free_ int *fds = NULL; pid_t pid; int r; @@ -1353,11 +1356,11 @@ static int service_spawn( s->exec_context.std_output == EXEC_OUTPUT_SOCKET || s->exec_context.std_error == EXEC_OUTPUT_SOCKET) { - r = service_collect_fds(s, &fds, &fd_names, &n_storage_fds, &n_socket_fds); + r = service_collect_fds(s, &fds, &fd_names, &n_socket_fds, &n_storage_fds); if (r < 0) return r; - log_unit_debug(UNIT(s), "Passing %i fds to service", n_storage_fds + n_socket_fds); + log_unit_debug(UNIT(s), "Passing %zu fds to service", n_socket_fds + n_storage_fds); } r = service_arm_timer(s, usec_add(now(CLOCK_MONOTONIC), timeout)); @@ -1450,8 +1453,8 @@ static int service_spawn( exec_params.environment = final_env; exec_params.fds = fds; exec_params.fd_names = fd_names; - exec_params.n_storage_fds = n_storage_fds; exec_params.n_socket_fds = n_socket_fds; + exec_params.n_storage_fds = n_storage_fds; exec_params.watchdog_usec = s->watchdog_usec; exec_params.selinux_context_net = s->socket_fd_selinux_context_net; if (s->type == SERVICE_IDLE) From ce0d60a7c4e07c5bdfed9f076bd48752287f0777 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 5 Jul 2018 10:00:52 +0200 Subject: [PATCH 5/9] execute: use our usual syntax for defining bit masks --- src/core/execute.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/core/execute.h b/src/core/execute.h index 82d1f3ae88..aa6f1ccfda 100644 --- a/src/core/execute.h +++ b/src/core/execute.h @@ -86,10 +86,10 @@ struct ExecStatus { }; typedef enum ExecCommandFlags { - EXEC_COMMAND_IGNORE_FAILURE = 1, - EXEC_COMMAND_FULLY_PRIVILEGED = 2, - EXEC_COMMAND_NO_SETUID = 4, - EXEC_COMMAND_AMBIENT_MAGIC = 8, + EXEC_COMMAND_IGNORE_FAILURE = 1 << 0, + EXEC_COMMAND_FULLY_PRIVILEGED = 1 << 1, + EXEC_COMMAND_NO_SETUID = 1 << 2, + EXEC_COMMAND_AMBIENT_MAGIC = 1 << 3, } ExecCommandFlags; struct ExecCommand { From 5686391b006ee82d8a4559067ad9818e3e631247 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 17 Jul 2018 11:47:14 +0200 Subject: [PATCH 6/9] core: introduce new Type=exec service type Users are often surprised that "systemd-run" command lines like "systemd-run -p User=idontexist /bin/true" will return successfully, even though the logs show that the process couldn't be invoked, as the user "idontexist" doesn't exist. This is because Type=simple will only wait until fork() succeeded before returning start-up success. This patch adds a new service type Type=exec, which is very similar to Type=simple, but waits until the child process completed the execve() before returning success. It uses a pipe that has O_CLOEXEC set for this logic, so that the kernel automatically sends POLLHUP on it when the execve() succeeded but leaves the pipe open if not. This means PID 1 waits exactly until the execve() succeeded in the child, and not longer and not shorter, which is the desired functionality. Making use of this new functionality, the command line "systemd-run -p User=idontexist -p Type=exec /bin/true" will now fail, as expected. --- src/core/execute.c | 91 +++++++++++++++++++++--- src/core/execute.h | 3 + src/core/mount.c | 9 +-- src/core/service.c | 167 ++++++++++++++++++++++++++++++++++++++++++--- src/core/service.h | 4 ++ src/core/socket.c | 9 +-- src/core/swap.c | 1 + 7 files changed, 255 insertions(+), 29 deletions(-) diff --git a/src/core/execute.c b/src/core/execute.c index 8c5139dc02..0f36f6c825 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -2573,6 +2573,7 @@ static int close_remaining_fds( const DynamicCreds *dcreds, int user_lookup_fd, int socket_fd, + int exec_fd, int *fds, size_t n_fds) { size_t n_dont_close = 0; @@ -2589,6 +2590,8 @@ static int close_remaining_fds( if (socket_fd >= 0) dont_close[n_dont_close++] = socket_fd; + if (exec_fd >= 0) + dont_close[n_dont_close++] = exec_fd; if (n_fds > 0) { memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds); n_dont_close += n_fds; @@ -2732,9 +2735,10 @@ static int exec_child( int *exit_status) { _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **final_argv = NULL; - _cleanup_free_ char *home_buffer = NULL; + int *fds_with_exec_fd, n_fds_with_exec_fd, r, ngids = 0, exec_fd = -1; _cleanup_free_ gid_t *supplementary_gids = NULL; const char *username = NULL, *groupname = NULL; + _cleanup_free_ char *home_buffer = NULL; const char *home = NULL, *shell = NULL; dev_t journal_stream_dev = 0; ino_t journal_stream_ino = 0; @@ -2754,7 +2758,6 @@ static int exec_child( #endif uid_t uid = UID_INVALID; gid_t gid = GID_INVALID; - int r, ngids = 0; size_t n_fds; ExecDirectoryType dt; int secure_bits; @@ -2798,8 +2801,8 @@ static int exec_child( /* In case anything used libc syslog(), close this here, too */ closelog(); - n_fds = n_storage_fds + n_socket_fds; - r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, fds, n_fds); + n_fds = n_socket_fds + n_storage_fds; + r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, params->exec_fd, fds, n_fds); if (r < 0) { *exit_status = EXIT_FDS; return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m"); @@ -3172,9 +3175,45 @@ static int exec_child( } /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are - * more aggressive this time since socket_fd and the netns fds we don't need anymore. The custom endpoint fd - * was needed to upload the policy and can now be closed as well. */ - r = close_all_fds(fds, n_fds); + * more aggressive this time since socket_fd and the netns fds we don't need anymore. We do keep the exec_fd + * however if we have it as we want to keep it open until the final execve(). */ + + if (params->exec_fd >= 0) { + exec_fd = params->exec_fd; + + if (exec_fd < 3 + (int) n_fds) { + int moved_fd; + + /* Let's move the exec fd far up, so that it's outside of the fd range we want to pass to the + * process we are about to execute. */ + + moved_fd = fcntl(exec_fd, F_DUPFD_CLOEXEC, 3 + (int) n_fds); + if (moved_fd < 0) { + *exit_status = EXIT_FDS; + return log_unit_error_errno(unit, errno, "Couldn't move exec fd up: %m"); + } + + safe_close(exec_fd); + exec_fd = moved_fd; + } else { + /* This fd should be FD_CLOEXEC already, but let's make sure. */ + r = fd_cloexec(exec_fd, true); + if (r < 0) { + *exit_status = EXIT_FDS; + return log_unit_error_errno(unit, r, "Failed to make exec fd FD_CLOEXEC: %m"); + } + } + + fds_with_exec_fd = newa(int, n_fds + 1); + memcpy(fds_with_exec_fd, fds, n_fds * sizeof(int)); + fds_with_exec_fd[n_fds] = exec_fd; + n_fds_with_exec_fd = n_fds + 1; + } else { + fds_with_exec_fd = fds; + n_fds_with_exec_fd = n_fds; + } + + r = close_all_fds(fds_with_exec_fd, n_fds_with_exec_fd); if (r >= 0) r = shift_fds(fds, n_fds); if (r >= 0) @@ -3184,6 +3223,11 @@ static int exec_child( return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m"); } + /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off + * and at the right fd numbers. The are no other fds open, with one exception: the exec_fd if it is defined, + * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we + * came this far. */ + secure_bits = context->secure_bits; if (needs_sandboxing) { @@ -3414,10 +3458,35 @@ static int exec_child( LOG_UNIT_INVOCATION_ID(unit)); } - execve(command->path, final_argv, accum_env); + if (exec_fd >= 0) { + uint8_t hot = 1; - if (errno == ENOENT && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) { - log_struct_errno(LOG_INFO, errno, + /* We have finished with all our initializations. Let's now let the manager know that. From this point + * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */ + + if (write(exec_fd, &hot, sizeof(hot)) < 0) { + *exit_status = EXIT_EXEC; + return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m"); + } + } + + execve(command->path, final_argv, accum_env); + r = -errno; + + if (exec_fd >= 0) { + uint8_t hot = 0; + + /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager + * that POLLHUP on it no longer means execve() succeeded. */ + + if (write(exec_fd, &hot, sizeof(hot)) < 0) { + *exit_status = EXIT_EXEC; + return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m"); + } + } + + if (r == -ENOENT && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) { + log_struct_errno(LOG_INFO, r, "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR, LOG_UNIT_ID(unit), LOG_UNIT_INVOCATION_ID(unit), @@ -3428,7 +3497,7 @@ static int exec_child( } *exit_status = EXIT_EXEC; - return log_unit_error_errno(unit, errno, "Failed to execute command: %m"); + return log_unit_error_errno(unit, r, "Failed to execute command: %m"); } static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l); diff --git a/src/core/execute.h b/src/core/execute.h index aa6f1ccfda..cd5165c2d1 100644 --- a/src/core/execute.h +++ b/src/core/execute.h @@ -317,6 +317,9 @@ struct ExecParameters { int stdin_fd; int stdout_fd; int stderr_fd; + + /* An fd that is closed by the execve(), and thus will result in EOF when the execve() is done */ + int exec_fd; }; #include "unit.h" diff --git a/src/core/mount.c b/src/core/mount.c index 21437dad08..16229d4af1 100644 --- a/src/core/mount.c +++ b/src/core/mount.c @@ -747,10 +747,11 @@ static void mount_dump(Unit *u, FILE *f, const char *prefix) { static int mount_spawn(Mount *m, ExecCommand *c, pid_t *_pid) { ExecParameters exec_params = { - .flags = EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN, - .stdin_fd = -1, - .stdout_fd = -1, - .stderr_fd = -1, + .flags = EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN, + .stdin_fd = -1, + .stdout_fd = -1, + .stderr_fd = -1, + .exec_fd = -1, }; pid_t pid; int r; diff --git a/src/core/service.c b/src/core/service.c index fd6450328e..6abf87cb39 100644 --- a/src/core/service.c +++ b/src/core/service.c @@ -79,9 +79,10 @@ static const UnitActiveState state_translation_table_idle[_SERVICE_STATE_MAX] = [SERVICE_AUTO_RESTART] = UNIT_ACTIVATING }; -static int service_dispatch_io(sd_event_source *source, int fd, uint32_t events, void *userdata); +static int service_dispatch_inotify_io(sd_event_source *source, int fd, uint32_t events, void *userdata); static int service_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata); static int service_dispatch_watchdog(sd_event_source *source, usec_t usec, void *userdata); +static int service_dispatch_exec_io(sd_event_source *source, int fd, uint32_t events, void *userdata); static void service_enter_signal(Service *s, ServiceState state, ServiceResult f); static void service_enter_reload_by_notify(Service *s); @@ -389,6 +390,7 @@ static void service_done(Unit *u) { service_stop_watchdog(s); s->timer_event_source = sd_event_source_unref(s->timer_event_source); + s->exec_fd_event_source = sd_event_source_unref(s->exec_fd_event_source); service_release_resources(u); } @@ -1066,6 +1068,9 @@ static void service_set_state(Service *s, ServiceState state) { !(state == SERVICE_DEAD && UNIT(s)->job)) service_close_socket_fd(s); + if (state != SERVICE_START) + s->exec_fd_event_source = sd_event_source_unref(s->exec_fd_event_source); + if (!IN_SET(state, SERVICE_START_POST, SERVICE_RUNNING, SERVICE_RELOAD)) service_stop_watchdog(s); @@ -1296,6 +1301,63 @@ static int service_collect_fds( return 0; } +static int service_allocate_exec_fd_event_source( + Service *s, + int fd, + sd_event_source **ret_event_source) { + + _cleanup_(sd_event_source_unrefp) sd_event_source *source = NULL; + int r; + + assert(s); + assert(fd >= 0); + assert(ret_event_source); + + r = sd_event_add_io(UNIT(s)->manager->event, &source, fd, 0, service_dispatch_exec_io, s); + if (r < 0) + return log_unit_error_errno(UNIT(s), r, "Failed to allocate exec_fd event source: %m"); + + /* This is a bit lower priority than SIGCHLD, as that carries a lot more interesting failure information */ + + r = sd_event_source_set_priority(source, SD_EVENT_PRIORITY_NORMAL-3); + if (r < 0) + return log_unit_error_errno(UNIT(s), r, "Failed to adjust priority of exec_fd event source: %m"); + + (void) sd_event_source_set_description(source, "service event_fd"); + + r = sd_event_source_set_io_fd_own(source, true); + if (r < 0) + return log_unit_error_errno(UNIT(s), r, "Failed to pass ownership of fd to event source: %m"); + + *ret_event_source = TAKE_PTR(source); + return 0; +} + +static int service_allocate_exec_fd( + Service *s, + sd_event_source **ret_event_source, + int* ret_exec_fd) { + + _cleanup_close_pair_ int p[2] = { -1, -1 }; + int r; + + assert(s); + assert(ret_event_source); + assert(ret_exec_fd); + + if (pipe2(p, O_CLOEXEC|O_NONBLOCK) < 0) + return log_unit_error_errno(UNIT(s), errno, "Failed to allocate exec_fd pipe: %m"); + + r = service_allocate_exec_fd_event_source(s, p[0], ret_event_source); + if (r < 0) + return r; + + p[0] = -1; + *ret_exec_fd = TAKE_FD(p[1]); + + return 0; +} + static bool service_exec_needs_notify_socket(Service *s, ExecFlags flags) { assert(s); @@ -1330,7 +1392,9 @@ static int service_spawn( .exec_fd = -1, }; _cleanup_strv_free_ char **final_env = NULL, **our_env = NULL, **fd_names = NULL; + _cleanup_(sd_event_source_unrefp) sd_event_source *exec_fd_source = NULL; size_t n_socket_fds = 0, n_storage_fds = 0, n_env = 0; + _cleanup_close_ int exec_fd = -1; _cleanup_free_ int *fds = NULL; pid_t pid; int r; @@ -1363,6 +1427,14 @@ static int service_spawn( log_unit_debug(UNIT(s), "Passing %zu fds to service", n_socket_fds + n_storage_fds); } + if (!FLAGS_SET(flags, EXEC_IS_CONTROL) && s->type == SERVICE_EXEC) { + assert(!s->exec_fd_event_source); + + r = service_allocate_exec_fd(s, &exec_fd_source, &exec_fd); + if (r < 0) + return r; + } + r = service_arm_timer(s, usec_add(now(CLOCK_MONOTONIC), timeout)); if (r < 0) return r; @@ -1462,6 +1534,7 @@ static int service_spawn( exec_params.stdin_fd = s->stdin_fd; exec_params.stdout_fd = s->stdout_fd; exec_params.stderr_fd = s->stderr_fd; + exec_params.exec_fd = exec_fd; r = exec_spawn(UNIT(s), c, @@ -1473,6 +1546,9 @@ static int service_spawn( if (r < 0) return r; + s->exec_fd_event_source = TAKE_PTR(exec_fd_source); + s->exec_fd_hot = false; + r = unit_watch_pid(UNIT(s), pid); if (r < 0) /* FIXME: we need to do something here */ return r; @@ -1984,14 +2060,12 @@ static void service_enter_start(Service *s) { s->control_pid = pid; service_set_state(s, SERVICE_START); - } else if (IN_SET(s->type, SERVICE_ONESHOT, SERVICE_DBUS, SERVICE_NOTIFY)) { + } else if (IN_SET(s->type, SERVICE_ONESHOT, SERVICE_DBUS, SERVICE_NOTIFY, SERVICE_EXEC)) { - /* For oneshot services we wait until the start - * process exited, too, but it is our main process. */ + /* For oneshot services we wait until the start process exited, too, but it is our main process. */ - /* For D-Bus services we know the main pid right away, - * but wait for the bus name to appear on the - * bus. Notify services are similar. */ + /* For D-Bus services we know the main pid right away, but wait for the bus name to appear on the + * bus. 'notify' and 'exec' services are similar. */ service_set_main_pid(s, pid); service_set_state(s, SERVICE_START); @@ -2444,6 +2518,13 @@ static int service_serialize(Unit *u, FILE *f, FDSet *fds) { if (r < 0) return r; + if (s->exec_fd_event_source) { + r = unit_serialize_item_fd(u, f, fds, "exec-fd", sd_event_source_get_io_fd(s->exec_fd_event_source)); + if (r < 0) + return r; + unit_serialize_item(u, f, "exec-fd-hot", yes_no(s->exec_fd_hot)); + } + if (UNIT_ISSET(s->accept_socket)) { r = unit_serialize_item(u, f, "accept-socket", UNIT_DEREF(s->accept_socket)->id); if (r < 0) @@ -2777,6 +2858,18 @@ static int service_deserialize_item(Unit *u, const char *key, const char *value, s->stderr_fd = fdset_remove(fds, fd); s->exec_context.stdio_as_fds = true; } + } else if (streq(key, "exec-fd")) { + int fd; + + if (safe_atoi(value, &fd) < 0 || fd < 0 || !fdset_contains(fds, fd)) + log_unit_debug(u, "Failed to parse exec-fd value: %s", value); + else { + s->exec_fd_event_source = sd_event_source_unref(s->exec_fd_event_source); + + fd = fdset_remove(fds, fd); + if (service_allocate_exec_fd_event_source(s, fd, &s->exec_fd_event_source) < 0) + safe_close(fd); + } } else if (streq(key, "watchdog-override-usec")) { usec_t watchdog_override_usec; if (timestamp_deserialize(value, &watchdog_override_usec) < 0) @@ -2860,7 +2953,7 @@ static int service_watch_pid_file(Service *s) { log_unit_debug(UNIT(s), "Setting watch for PID file %s", s->pid_file_pathspec->path); - r = path_spec_watch(s->pid_file_pathspec, service_dispatch_io); + r = path_spec_watch(s->pid_file_pathspec, service_dispatch_inotify_io); if (r < 0) goto fail; @@ -2904,7 +2997,7 @@ static int service_demand_pid_file(Service *s) { return service_watch_pid_file(s); } -static int service_dispatch_io(sd_event_source *source, int fd, uint32_t events, void *userdata) { +static int service_dispatch_inotify_io(sd_event_source *source, int fd, uint32_t events, void *userdata) { PathSpec *p = userdata; Service *s; @@ -2937,6 +3030,59 @@ fail: return 0; } +static int service_dispatch_exec_io(sd_event_source *source, int fd, uint32_t events, void *userdata) { + Service *s = SERVICE(userdata); + + assert(s); + + log_unit_debug(UNIT(s), "got exec-fd event"); + + /* If Type=exec is set, we'll consider a service started successfully the instant we invoked execve() + * successfully for it. We implement this through a pipe() towards the child, which the kernel automatically + * closes for us due to O_CLOEXEC on execve() in the child, which then triggers EOF on the pipe in the + * parent. We need to be careful however, as there are other reasons that we might cause the child's side of + * the pipe to be closed (for example, a simple exit()). To deal with that we'll ignore EOFs on the pipe unless + * the child signalled us first that it is about to call the execve(). It does so by sending us a simple + * non-zero byte via the pipe. We also provide the child with a way to inform us in case execve() failed: if it + * sends a zero byte we'll ignore POLLHUP on the fd again. */ + + for (;;) { + uint8_t x; + ssize_t n; + + n = read(fd, &x, sizeof(x)); + if (n < 0) { + if (errno == EAGAIN) /* O_NONBLOCK in effect → everything queued has now been processed. */ + return 0; + + return log_unit_error_errno(UNIT(s), errno, "Failed to read from exec_fd: %m"); + } + if (n == 0) { /* EOF → the event we are waiting for */ + + s->exec_fd_event_source = sd_event_source_unref(s->exec_fd_event_source); + + if (s->exec_fd_hot) { /* Did the child tell us to expect EOF now? */ + log_unit_debug(UNIT(s), "Got EOF on exec-fd"); + + s->exec_fd_hot = false; + + /* Nice! This is what we have been waiting for. Transition to next state. */ + if (s->type == SERVICE_EXEC && s->state == SERVICE_START) + service_enter_start_post(s); + } else + log_unit_debug(UNIT(s), "Got EOF on exec-fd while it was disabled, ignoring."); + + return 0; + } + + /* A byte was read → this turns on/off the exec fd logic */ + assert(n == sizeof(x)); + s->exec_fd_hot = x; + } + + return 0; +} + static void service_notify_cgroup_empty_event(Unit *u) { Service *s = SERVICE(u); @@ -3846,7 +3992,8 @@ static const char* const service_type_table[_SERVICE_TYPE_MAX] = { [SERVICE_ONESHOT] = "oneshot", [SERVICE_DBUS] = "dbus", [SERVICE_NOTIFY] = "notify", - [SERVICE_IDLE] = "idle" + [SERVICE_IDLE] = "idle", + [SERVICE_EXEC] = "exec", }; DEFINE_STRING_TABLE_LOOKUP(service_type, ServiceType); diff --git a/src/core/service.h b/src/core/service.h index 9c06e91883..b53e4204e7 100644 --- a/src/core/service.h +++ b/src/core/service.h @@ -30,6 +30,7 @@ typedef enum ServiceType { SERVICE_DBUS, /* we fork and wait until a specific D-Bus name appears on the bus */ SERVICE_NOTIFY, /* we fork and wait until a daemon sends us a ready message with sd_notify() */ SERVICE_IDLE, /* much like simple, but delay exec() until all jobs are dispatched. */ + SERVICE_EXEC, /* we fork and wait until we execute exec() (this means our own setup is waited for) */ _SERVICE_TYPE_MAX, _SERVICE_TYPE_INVALID = -1 } ServiceType; @@ -165,6 +166,8 @@ struct Service { NotifyAccess notify_access; NotifyState notify_state; + sd_event_source *exec_fd_event_source; + ServiceFDStore *fd_store; size_t n_fd_store; unsigned n_fd_store_max; @@ -179,6 +182,7 @@ struct Service { unsigned n_restarts; bool flush_n_restarts; + bool exec_fd_hot; }; extern const UnitVTable service_vtable; diff --git a/src/core/socket.c b/src/core/socket.c index 56d32225c4..d488c64e91 100644 --- a/src/core/socket.c +++ b/src/core/socket.c @@ -1867,10 +1867,11 @@ static int socket_coldplug(Unit *u) { static int socket_spawn(Socket *s, ExecCommand *c, pid_t *_pid) { ExecParameters exec_params = { - .flags = EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN, - .stdin_fd = -1, - .stdout_fd = -1, - .stderr_fd = -1, + .flags = EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN, + .stdin_fd = -1, + .stdout_fd = -1, + .stderr_fd = -1, + .exec_fd = -1, }; pid_t pid; int r; diff --git a/src/core/swap.c b/src/core/swap.c index b78b1aa266..e01e61e56d 100644 --- a/src/core/swap.c +++ b/src/core/swap.c @@ -606,6 +606,7 @@ static int swap_spawn(Swap *s, ExecCommand *c, pid_t *_pid) { .stdin_fd = -1, .stdout_fd = -1, .stderr_fd = -1, + .exec_fd = -1, }; pid_t pid; int r; From 79905a246d645d21633f09f564b3672d5085a85c Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 17 Jul 2018 12:01:26 +0200 Subject: [PATCH 7/9] man: document the new Type=exec type And while we are at it, let's rearrange and extend the Type= documentation a bit. Let's make it an itemized list, and let's add a paragraph explaining which type best to use. --- man/systemd-run.xml | 10 +++ man/systemd.service.xml | 146 ++++++++++++++++++++++------------------ 2 files changed, 91 insertions(+), 65 deletions(-) diff --git a/man/systemd-run.xml b/man/systemd-run.xml index 149228ee02..6c15d2d837 100644 --- a/man/systemd-run.xml +++ b/man/systemd-run.xml @@ -83,6 +83,16 @@ COMMAND may be omitted. In this case, systemd-run creates only a .path, .socket, or .timer unit that triggers the specified unit. + + By default, services created with systemd-run default to the type, + see the description of Type= in + systemd.service5 for + details. Note that when this type is used the service manager (and thus the systemd-run command) + considers service start-up successful as soon as the fork() for the main service process + succeeded, i.e. before the execve() is invoked, and thus even if the specified command cannot + be started. Consider using the service type (i.e. ) to + ensure that systemd-run returns successfully only if the specified command line has been + successfully started. diff --git a/man/systemd.service.xml b/man/systemd.service.xml index f14a057280..0cd5385f9b 100644 --- a/man/systemd.service.xml +++ b/man/systemd.service.xml @@ -153,77 +153,93 @@ Type= - Configures the process start-up type for this - service unit. One of - , - , - , - , - or - . + + Configures the process start-up type for this service unit. One of , + , , , , + or : - If set to (the default if - neither Type= nor - BusName=, but ExecStart= - are specified), it is expected that the process configured - with ExecStart= is the main process of the - service. In this mode, if the process offers functionality to - other processes on the system, its communication channels - should be installed before the daemon is started up (e.g. - sockets set up by systemd, via socket activation), as systemd - will immediately proceed starting follow-up units. + + If set to (the default if ExecStart= is + specified but neither Type= nor BusName= are), the service manager + will consider the unit started immediately after the main service process has been forked off. It is + expected that the process configured with ExecStart= is the main process of the + service. In this mode, if the process offers functionality to other processes on the system, its + communication channels should be installed before the service is started up (e.g. sockets set up by + systemd, via socket activation), as the service manager will immediately proceed starting follow-up units, + right after creating the main service process, and before executing the service's binary. Note that this + means systemctl start command lines for services will report + success even if the service's binary cannot be invoked successfully (for example because the selected + User= doesn't exist, or the service binary is missing). - If set to , it is expected that - the process configured with ExecStart= will - call fork() as part of its start-up. The - parent process is expected to exit when start-up is complete - and all communication channels are set up. The child continues - to run as the main daemon process. This is the behavior of - traditional UNIX daemons. If this setting is used, it is - recommended to also use the PIDFile= - option, so that systemd can identify the main process of the - daemon. systemd will proceed with starting follow-up units as - soon as the parent process exits. + The type is similar to , but the service + manager will consider the unit started immediately after the main service binary has been executed. The service + manager will delay starting of follow-up units until that point. (Or in other words: + proceeds with further jobs right after fork() returns, while + will not proceed before both fork() and + execve() in the service process succeeded.) Note that this means systemctl + start command lines for services will report failure when the service's + binary cannot be invoked successfully (for example because the selected User= doesn't + exist, or the service binary is missing). - Behavior of is similar to - ; however, it is expected that the - process has to exit before systemd starts follow-up units. - RemainAfterExit= is particularly useful for - this type of service. This is the implied default if neither - Type= nor ExecStart= are - specified. + If set to , it is expected that the process configured with + ExecStart= will call fork() as part of its start-up. The parent + process is expected to exit when start-up is complete and all communication channels are set up. The child + continues to run as the main service process, and the service manager will consider the unit started when + the parent process exits. This is the behavior of traditional UNIX services. If this setting is used, it is + recommended to also use the PIDFile= option, so that systemd can reliably identify the + main process of the service. systemd will proceed with starting follow-up units as soon as the parent + process exits. - Behavior of is similar to - ; however, it is expected that the - daemon acquires a name on the D-Bus bus, as configured by - BusName=. systemd will proceed with - starting follow-up units after the D-Bus bus name has been - acquired. Service units with this option configured implicitly - gain dependencies on the dbus.socket - unit. This type is the default if BusName= - is specified. + Behavior of is similar to ; however, the + service manager will consider the unit started after the main process exits. It will then start follow-up + units. RemainAfterExit= is particularly useful for this type of + service. Type= is the implied default if neither + Type= nor ExecStart= are specified. - Behavior of is similar to - ; however, it is expected that the - daemon sends a notification message via - sd_notify3 - or an equivalent call when it has finished starting up. - systemd will proceed with starting follow-up units after this - notification message has been sent. If this option is used, - NotifyAccess= (see below) should be set to - open access to the notification socket provided by systemd. If - NotifyAccess= is missing or set to - , it will be forcibly set to - . Note that currently - Type= will not work - if used in combination with - PrivateNetwork=. + Behavior of is similar to ; however, it is + expected that the service acquires a name on the D-Bus bus, as configured by + BusName=. systemd will proceed with starting follow-up units after the D-Bus bus name + has been acquired. Service units with this option configured implicitly gain dependencies on the + dbus.socket unit. This type is the default if BusName= is + specified. - Behavior of is very similar to ; however, actual execution - of the service program is delayed until all active jobs are dispatched. This may be used to avoid interleaving - of output of shell services with the status output on the console. Note that this type is useful only to - improve console output, it is not useful as a general unit ordering tool, and the effect of this service type - is subject to a 5s time-out, after which the service program is invoked anyway. + Behavior of is similar to ; however, it is + expected that the service sends a notification message via + sd_notify3 or an + equivalent call when it has finished starting up. systemd will proceed with starting follow-up units after + this notification message has been sent. If this option is used, NotifyAccess= (see + below) should be set to open access to the notification socket provided by systemd. If + NotifyAccess= is missing or set to , it will be forcibly set to + . Note that currently Type= will not work if + used in combination with PrivateNetwork=. + + Behavior of is very similar to ; however, + actual execution of the service program is delayed until all active jobs are dispatched. This may be used + to avoid interleaving of output of shell services with the status output on the console. Note that this + type is useful only to improve console output, it is not useful as a general unit ordering tool, and the + effect of this service type is subject to a 5s time-out, after which the service program is invoked + anyway. + + + It is generally recommended to use Type= for long-running + services whenever possible, as it is the simplest and fastest option. However, as this service type won't + propagate service start-up failures and doesn't allow ordering of other units against completion of + initialization of the service (which for example is useful if clients need to connect to the service through + some form of IPC, and the IPC channel is only established by the service itself — in contrast to doing this + ahead of time through socket or bus activation or similar), it might not be sufficient for many cases. If so, + or (the latter only in case the service provides a D-Bus + interface) are the preferred options as they allow service program code to precisely schedule when to + consider the service started up successfully and when to proceed with follow-up units. The + service type requires explicit support in the service codebase (as + sd_notify() or an equivalent API needs to be invoked by the service at the appropriate + time) — if it's not supported, then is an alternative: it supports the traditional + UNIX service start-up protocol. Finally, might be an option for cases where it is + enough to ensure the service binary is invoked, and where the service binary itself executes no or little + initialization on its own (and its initialization is unlikely to fail). Note that using any type other than + possibly delays the boot process, as the service manager needs to wait for service + initialization to complete. It is hence recommended not to needlessly use any types other than + . (Also note it is generally not recommended to use or + for long-running services.) From 0e1f17561f5f6061ec5503de044298372ed7ca37 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 17 Jul 2018 12:35:12 +0200 Subject: [PATCH 8/9] test: add test for Type=exec --- test/TEST-23-TYPE-EXEC/Makefile | 4 +++ test/TEST-23-TYPE-EXEC/test.sh | 42 +++++++++++++++++++++++++++++ test/TEST-23-TYPE-EXEC/testsuite.sh | 28 +++++++++++++++++++ 3 files changed, 74 insertions(+) create mode 100644 test/TEST-23-TYPE-EXEC/Makefile create mode 100755 test/TEST-23-TYPE-EXEC/test.sh create mode 100755 test/TEST-23-TYPE-EXEC/testsuite.sh diff --git a/test/TEST-23-TYPE-EXEC/Makefile b/test/TEST-23-TYPE-EXEC/Makefile new file mode 100644 index 0000000000..34d7cc6cdf --- /dev/null +++ b/test/TEST-23-TYPE-EXEC/Makefile @@ -0,0 +1,4 @@ +BUILD_DIR=$(shell ../../tools/find-build-dir.sh) + +all setup clean run: + @basedir=../.. TEST_BASE_DIR=../ BUILD_DIR=$(BUILD_DIR) ./test.sh --$@ diff --git a/test/TEST-23-TYPE-EXEC/test.sh b/test/TEST-23-TYPE-EXEC/test.sh new file mode 100755 index 0000000000..bdcea239a7 --- /dev/null +++ b/test/TEST-23-TYPE-EXEC/test.sh @@ -0,0 +1,42 @@ +#!/bin/bash +# -*- mode: shell-script; indent-tabs-mode: nil; sh-basic-offset: 4; -*- +# ex: ts=8 sw=4 sts=4 et filetype=sh +set -e +TEST_DESCRIPTION="test Type=exec" + +. $TEST_BASE_DIR/test-functions + +test_setup() { + create_empty_image + mkdir -p $TESTDIR/root + mount ${LOOPDEV}p1 $TESTDIR/root + + ( + LOG_LEVEL=5 + eval $(udevadm info --export --query=env --name=${LOOPDEV}p2) + + setup_basic_environment + + # setup the testsuite service + cat >$initdir/etc/systemd/system/testsuite.service < /testok + +exit 0 From fcb975129693d2d82dafb839fa66b9ac027bb080 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 25 Jul 2018 20:36:11 +0200 Subject: [PATCH 9/9] NEWS: add entry about Type=exec and announce that systemd-run is going to default to it in 241 --- NEWS | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/NEWS b/NEWS index 537c4b6131..0d5b95a42a 100644 --- a/NEWS +++ b/NEWS @@ -1,5 +1,32 @@ systemd System and Service Manager +CHANGES WITH 240 in spe: + + * A new service type has been added: Type=exec. It's very similar to + Type=simple and ensures the service manager will wait for both fork() + and execve() of the main service binary to complete before proceeding + with follow-up units. This is primarily useful so that the manager + propagates any errors in the preparation phase of service execution + back to the job that requested the unit to be started. For example, + consider a service that has ExecStart= set to a file system binary + that doesn't exist. With Type=simple starting the unit would + typically succeed instantly, as only fork() has to complete + successfully and execve() is not waited for, and hence its failure is + seen "too late". With the new Type=exec service type starting the + unit will fail, as the execve() will be waited for and will fail, + which is then propagated back to the start job. + + NOTE: with the next release 241 of systemd we intend to change the + systemd-run tool to default to Type=exec for transient services + started by it. This should be mostly safe, but in specific corner + cases might result in problems, as the systemd-run tool will then + block on NSS calls (such as user name lookups due to User=) done + between the fork() and execve(), which under specific circumstances + might cause problems. It is recommended to specify "-p Type=simple" + explicitly in the few cases where this applies. For regular, + non-transient services (i.e. those defined with unit files on disk) + we will continue to default to Type=simple. + CHANGES WITH 239: * NETWORK INTERFACE DEVICE NAMING CHANGES: systemd-udevd's "net_id"