diff --git a/src/core/execute.c b/src/core/execute.c index 854e40ed6df..2356e96628f 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -3767,6 +3767,7 @@ static int compile_bind_mounts( static int compile_symlinks( const ExecContext *context, const ExecParameters *params, + bool setup_os_release_symlink, char ***ret_symlinks) { _cleanup_strv_free_ char **symlinks = NULL; @@ -3812,6 +3813,20 @@ static int compile_symlinks( } } + /* We make the host's os-release available via a symlink, so that we can copy it atomically + * and readers will never get a half-written version. Note that, while the paths specified here are + * absolute, when they are processed in namespace.c they will be made relative automatically, i.e.: + * 'os-release -> .os-release-stage/os-release' is what will be created. */ + if (setup_os_release_symlink) { + r = strv_extend(&symlinks, "/run/host/.os-release-stage/os-release"); + if (r < 0) + return r; + + r = strv_extend(&symlinks, "/run/host/os-release"); + if (r < 0) + return r; + } + *ret_symlinks = TAKE_PTR(symlinks); return 0; @@ -3984,11 +3999,11 @@ static int apply_mount_namespace( _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL, **read_write_paths_cleanup = NULL; _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL, - *extension_dir = NULL, *host_os_release = NULL; + *extension_dir = NULL, *host_os_release_stage = NULL; const char *root_dir = NULL, *root_image = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL; char **read_write_paths; NamespaceInfo ns_info; - bool needs_sandboxing; + bool needs_sandboxing, setup_os_release_symlink; BindMount *bind_mounts = NULL; size_t n_bind_mounts = 0; int r; @@ -4012,11 +4027,6 @@ static int apply_mount_namespace( if (r < 0) return r; - /* Symlinks for exec dirs are set up after other mounts, before they are made read-only. */ - r = compile_symlinks(context, params, &symlinks); - if (r < 0) - return r; - /* We need to make the pressure path writable even if /sys/fs/cgroups is made read-only, as the * service will need to write to it in order to start the notifications. */ if (context->protect_control_groups && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) { @@ -4081,6 +4091,12 @@ static int apply_mount_namespace( else ns_info = (NamespaceInfo) {}; + /* Symlinks (exec dirs, os-release) are set up after other mounts, before they are made read-only. */ + setup_os_release_symlink = ns_info.mount_apivfs && (root_dir || root_image); + r = compile_symlinks(context, params, setup_os_release_symlink, &symlinks); + if (r < 0) + return r; + if (context->mount_propagation_flag == MS_SHARED) log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring"); @@ -4107,9 +4123,9 @@ static int apply_mount_namespace( /* If running under a different root filesystem, propagate the host's os-release. We make a * copy rather than just bind mounting it, so that it can be updated on soft-reboot. */ - if (root_dir || root_image) { - host_os_release = strdup("/run/systemd/propagate/os-release"); - if (!host_os_release) + if (setup_os_release_symlink) { + host_os_release_stage = strdup("/run/systemd/propagate/.os-release-stage"); + if (!host_os_release_stage) return -ENOMEM; } } else { @@ -4118,8 +4134,10 @@ static int apply_mount_namespace( if (asprintf(&extension_dir, "/run/user/" UID_FMT "/systemd/unit-extensions", geteuid()) < 0) return -ENOMEM; - if (root_dir || root_image) { - if (asprintf(&host_os_release, "/run/user/" UID_FMT "/systemd/propagate/os-release", geteuid()) < 0) + if (setup_os_release_symlink) { + if (asprintf(&host_os_release_stage, + "/run/user/" UID_FMT "/systemd/propagate/.os-release-stage", + geteuid()) < 0) return -ENOMEM; } } @@ -4169,7 +4187,7 @@ static int apply_mount_namespace( incoming_dir, extension_dir, root_dir || root_image ? params->notify_socket : NULL, - host_os_release, + host_os_release_stage, error_path); /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports diff --git a/src/core/main.c b/src/core/main.c index e6932784d15..c09f922700e 100644 --- a/src/core/main.c +++ b/src/core/main.c @@ -1397,11 +1397,11 @@ static int setup_os_release(RuntimeScope scope) { } if (scope == RUNTIME_SCOPE_SYSTEM) { - os_release_dst = strdup("/run/systemd/propagate/os-release"); + os_release_dst = strdup("/run/systemd/propagate/.os-release-stage/os-release"); if (!os_release_dst) return log_oom_debug(); } else { - if (asprintf(&os_release_dst, "/run/user/" UID_FMT "/systemd/propagate/os-release", geteuid()) < 0) + if (asprintf(&os_release_dst, "/run/user/" UID_FMT "/systemd/propagate/.os-release-stage/os-release", geteuid()) < 0) return log_oom_debug(); } @@ -1409,7 +1409,7 @@ static int setup_os_release(RuntimeScope scope) { if (r < 0) return log_debug_errno(r, "Failed to create parent directory of %s, ignoring: %m", os_release_dst); - r = copy_file(os_release_src, os_release_dst, /* open_flags= */ 0, 0644, COPY_MAC_CREATE|COPY_TRUNCATE); + r = copy_file_atomic(os_release_src, os_release_dst, 0644, COPY_MAC_CREATE|COPY_REPLACE); if (r < 0) return log_debug_errno(r, "Failed to create %s, ignoring: %m", os_release_dst); diff --git a/src/core/namespace.c b/src/core/namespace.c index 32f9c45deaa..51b5aad9c90 100644 --- a/src/core/namespace.c +++ b/src/core/namespace.c @@ -1823,7 +1823,7 @@ static int apply_mounts( const NamespaceInfo *ns_info, MountEntry *mounts, size_t *n_mounts, - char **exec_dir_symlinks, + char **symlinks, char **error_path) { _cleanup_fclose_ FILE *proc_self_mountinfo = NULL; @@ -1891,12 +1891,12 @@ static int apply_mounts( } /* Now that all filesystems have been set up, but before the - * read-only switches are flipped, create the exec dirs symlinks. + * read-only switches are flipped, create the exec dirs and other symlinks. * Note that when /var/lib is not empty/tmpfs, these symlinks will already * exist, which means this will be a no-op. */ - r = create_symlinks_from_tuples(root, exec_dir_symlinks); + r = create_symlinks_from_tuples(root, symlinks); if (r < 0) - return log_debug_errno(r, "Failed to set up ExecDirectories symlinks inside mount namespace: %m"); + return log_debug_errno(r, "Failed to set up symlinks inside mount namespace: %m"); /* Create a deny list we can pass to bind_mount_recursive() */ deny_list = new(char*, (*n_mounts)+1); @@ -2006,7 +2006,7 @@ int setup_namespace( char** exec_paths, char** no_exec_paths, char** empty_directories, - char** exec_dir_symlinks, + char** symlinks, const BindMount *bind_mounts, size_t n_bind_mounts, const TemporaryFileSystem *temporary_filesystems, @@ -2028,7 +2028,7 @@ int setup_namespace( const char *incoming_dir, const char *extension_dir, const char *notify_socket, - const char *host_os_release, + const char *host_os_release_stage, char **error_path) { _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL; @@ -2156,7 +2156,7 @@ int setup_namespace( log_namespace, setup_propagate, notify_socket, - host_os_release); + host_os_release_stage); if (n_mounts > 0) { m = mounts = new0(MountEntry, n_mounts); @@ -2391,10 +2391,10 @@ int setup_namespace( .read_only = true, }; - if (host_os_release) + if (host_os_release_stage) *(m++) = (MountEntry) { - .path_const = "/run/host/os-release", - .source_const = host_os_release, + .path_const = "/run/host/.os-release-stage/", + .source_const = host_os_release_stage, .mode = BIND_MOUNT, .read_only = true, .ignore = true, /* Live copy, don't hard-fail if it goes missing */ @@ -2489,7 +2489,7 @@ int setup_namespace( (void) base_filesystem_create(root, UID_INVALID, GID_INVALID); /* Now make the magic happen */ - r = apply_mounts(root, mount_image_policy, extension_image_policy, ns_info, mounts, &n_mounts, exec_dir_symlinks, error_path); + r = apply_mounts(root, mount_image_policy, extension_image_policy, ns_info, mounts, &n_mounts, symlinks, error_path); if (r < 0) goto finish; diff --git a/src/core/namespace.h b/src/core/namespace.h index 44e8f097dac..b6132154c51 100644 --- a/src/core/namespace.h +++ b/src/core/namespace.h @@ -111,7 +111,7 @@ int setup_namespace( char **exec_paths, char **no_exec_paths, char **empty_directories, - char **exec_dir_symlinks, + char **symlinks, const BindMount *bind_mounts, size_t n_bind_mounts, const TemporaryFileSystem *temporary_filesystems, @@ -133,7 +133,7 @@ int setup_namespace( const char *incoming_dir, const char *extension_dir, const char *notify_socket, - const char *host_os_release, + const char *host_os_release_stage, char **error_path); #define RUN_SYSTEMD_EMPTY "/run/systemd/empty" diff --git a/test/units/testsuite-82.sh b/test/units/testsuite-82.sh index 7adee341c1e..0bbab330f4e 100755 --- a/test/units/testsuite-82.sh +++ b/test/units/testsuite-82.sh @@ -50,7 +50,7 @@ elif [ -f /run/testsuite82.touch2 ]; then # Test that we really are in the new overlayfs root fs read -r x /tmp/nextroot-lower/usr/lib/os-release echo MARKER=1 >>/tmp/nextroot-lower/usr/lib/os-release - cmp /etc/os-release /run/systemd/propagate/os-release + cmp /etc/os-release /run/systemd/propagate/.os-release-stage/os-release (! grep -q MARKER=1 /etc/os-release) mount -t overlay nextroot /run/nextroot -o lowerdir=/tmp/nextroot-lower:/,ro