mirror of
https://github.com/systemd/systemd.git
synced 2025-03-21 02:50:18 +03:00
core: stage /run/host/os-release with a symlink to avoid possible race condition
If someone reads /run/host/os-release at the exact same time it is being updated, and it is large enough, they might read a half-written file. This is very unlikely as os-release is typically small and very rarely changes, but it is not impossible. Bind mount a staging directory instead of the file, and symlink the file into into, so that we can do atomic file updates and close this gap. Atomic replacement creates a new inode, so existing bind mounts would continue to see the old file, and only new services would see the new file. The indirection via the directory allows to work around this, as the directory is fixed and never changes so the bind mount is always valid, and its content is shared with all existing services. Fixes https://github.com/systemd/systemd/issues/28794 Follow-up for 3f37a82545d461ab
This commit is contained in:
parent
409986ed3e
commit
663e27564f
@ -3767,6 +3767,7 @@ static int compile_bind_mounts(
|
||||
static int compile_symlinks(
|
||||
const ExecContext *context,
|
||||
const ExecParameters *params,
|
||||
bool setup_os_release_symlink,
|
||||
char ***ret_symlinks) {
|
||||
|
||||
_cleanup_strv_free_ char **symlinks = NULL;
|
||||
@ -3812,6 +3813,20 @@ static int compile_symlinks(
|
||||
}
|
||||
}
|
||||
|
||||
/* We make the host's os-release available via a symlink, so that we can copy it atomically
|
||||
* and readers will never get a half-written version. Note that, while the paths specified here are
|
||||
* absolute, when they are processed in namespace.c they will be made relative automatically, i.e.:
|
||||
* 'os-release -> .os-release-stage/os-release' is what will be created. */
|
||||
if (setup_os_release_symlink) {
|
||||
r = strv_extend(&symlinks, "/run/host/.os-release-stage/os-release");
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
r = strv_extend(&symlinks, "/run/host/os-release");
|
||||
if (r < 0)
|
||||
return r;
|
||||
}
|
||||
|
||||
*ret_symlinks = TAKE_PTR(symlinks);
|
||||
|
||||
return 0;
|
||||
@ -3984,11 +3999,11 @@ static int apply_mount_namespace(
|
||||
_cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL,
|
||||
**read_write_paths_cleanup = NULL;
|
||||
_cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL,
|
||||
*extension_dir = NULL, *host_os_release = NULL;
|
||||
*extension_dir = NULL, *host_os_release_stage = NULL;
|
||||
const char *root_dir = NULL, *root_image = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
|
||||
char **read_write_paths;
|
||||
NamespaceInfo ns_info;
|
||||
bool needs_sandboxing;
|
||||
bool needs_sandboxing, setup_os_release_symlink;
|
||||
BindMount *bind_mounts = NULL;
|
||||
size_t n_bind_mounts = 0;
|
||||
int r;
|
||||
@ -4012,11 +4027,6 @@ static int apply_mount_namespace(
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
/* Symlinks for exec dirs are set up after other mounts, before they are made read-only. */
|
||||
r = compile_symlinks(context, params, &symlinks);
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
/* We need to make the pressure path writable even if /sys/fs/cgroups is made read-only, as the
|
||||
* service will need to write to it in order to start the notifications. */
|
||||
if (context->protect_control_groups && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
|
||||
@ -4081,6 +4091,12 @@ static int apply_mount_namespace(
|
||||
else
|
||||
ns_info = (NamespaceInfo) {};
|
||||
|
||||
/* Symlinks (exec dirs, os-release) are set up after other mounts, before they are made read-only. */
|
||||
setup_os_release_symlink = ns_info.mount_apivfs && (root_dir || root_image);
|
||||
r = compile_symlinks(context, params, setup_os_release_symlink, &symlinks);
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
if (context->mount_propagation_flag == MS_SHARED)
|
||||
log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
|
||||
|
||||
@ -4107,9 +4123,9 @@ static int apply_mount_namespace(
|
||||
|
||||
/* If running under a different root filesystem, propagate the host's os-release. We make a
|
||||
* copy rather than just bind mounting it, so that it can be updated on soft-reboot. */
|
||||
if (root_dir || root_image) {
|
||||
host_os_release = strdup("/run/systemd/propagate/os-release");
|
||||
if (!host_os_release)
|
||||
if (setup_os_release_symlink) {
|
||||
host_os_release_stage = strdup("/run/systemd/propagate/.os-release-stage");
|
||||
if (!host_os_release_stage)
|
||||
return -ENOMEM;
|
||||
}
|
||||
} else {
|
||||
@ -4118,8 +4134,10 @@ static int apply_mount_namespace(
|
||||
if (asprintf(&extension_dir, "/run/user/" UID_FMT "/systemd/unit-extensions", geteuid()) < 0)
|
||||
return -ENOMEM;
|
||||
|
||||
if (root_dir || root_image) {
|
||||
if (asprintf(&host_os_release, "/run/user/" UID_FMT "/systemd/propagate/os-release", geteuid()) < 0)
|
||||
if (setup_os_release_symlink) {
|
||||
if (asprintf(&host_os_release_stage,
|
||||
"/run/user/" UID_FMT "/systemd/propagate/.os-release-stage",
|
||||
geteuid()) < 0)
|
||||
return -ENOMEM;
|
||||
}
|
||||
}
|
||||
@ -4169,7 +4187,7 @@ static int apply_mount_namespace(
|
||||
incoming_dir,
|
||||
extension_dir,
|
||||
root_dir || root_image ? params->notify_socket : NULL,
|
||||
host_os_release,
|
||||
host_os_release_stage,
|
||||
error_path);
|
||||
|
||||
/* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
|
||||
|
@ -1397,11 +1397,11 @@ static int setup_os_release(RuntimeScope scope) {
|
||||
}
|
||||
|
||||
if (scope == RUNTIME_SCOPE_SYSTEM) {
|
||||
os_release_dst = strdup("/run/systemd/propagate/os-release");
|
||||
os_release_dst = strdup("/run/systemd/propagate/.os-release-stage/os-release");
|
||||
if (!os_release_dst)
|
||||
return log_oom_debug();
|
||||
} else {
|
||||
if (asprintf(&os_release_dst, "/run/user/" UID_FMT "/systemd/propagate/os-release", geteuid()) < 0)
|
||||
if (asprintf(&os_release_dst, "/run/user/" UID_FMT "/systemd/propagate/.os-release-stage/os-release", geteuid()) < 0)
|
||||
return log_oom_debug();
|
||||
}
|
||||
|
||||
@ -1409,7 +1409,7 @@ static int setup_os_release(RuntimeScope scope) {
|
||||
if (r < 0)
|
||||
return log_debug_errno(r, "Failed to create parent directory of %s, ignoring: %m", os_release_dst);
|
||||
|
||||
r = copy_file(os_release_src, os_release_dst, /* open_flags= */ 0, 0644, COPY_MAC_CREATE|COPY_TRUNCATE);
|
||||
r = copy_file_atomic(os_release_src, os_release_dst, 0644, COPY_MAC_CREATE|COPY_REPLACE);
|
||||
if (r < 0)
|
||||
return log_debug_errno(r, "Failed to create %s, ignoring: %m", os_release_dst);
|
||||
|
||||
|
@ -1823,7 +1823,7 @@ static int apply_mounts(
|
||||
const NamespaceInfo *ns_info,
|
||||
MountEntry *mounts,
|
||||
size_t *n_mounts,
|
||||
char **exec_dir_symlinks,
|
||||
char **symlinks,
|
||||
char **error_path) {
|
||||
|
||||
_cleanup_fclose_ FILE *proc_self_mountinfo = NULL;
|
||||
@ -1891,12 +1891,12 @@ static int apply_mounts(
|
||||
}
|
||||
|
||||
/* Now that all filesystems have been set up, but before the
|
||||
* read-only switches are flipped, create the exec dirs symlinks.
|
||||
* read-only switches are flipped, create the exec dirs and other symlinks.
|
||||
* Note that when /var/lib is not empty/tmpfs, these symlinks will already
|
||||
* exist, which means this will be a no-op. */
|
||||
r = create_symlinks_from_tuples(root, exec_dir_symlinks);
|
||||
r = create_symlinks_from_tuples(root, symlinks);
|
||||
if (r < 0)
|
||||
return log_debug_errno(r, "Failed to set up ExecDirectories symlinks inside mount namespace: %m");
|
||||
return log_debug_errno(r, "Failed to set up symlinks inside mount namespace: %m");
|
||||
|
||||
/* Create a deny list we can pass to bind_mount_recursive() */
|
||||
deny_list = new(char*, (*n_mounts)+1);
|
||||
@ -2006,7 +2006,7 @@ int setup_namespace(
|
||||
char** exec_paths,
|
||||
char** no_exec_paths,
|
||||
char** empty_directories,
|
||||
char** exec_dir_symlinks,
|
||||
char** symlinks,
|
||||
const BindMount *bind_mounts,
|
||||
size_t n_bind_mounts,
|
||||
const TemporaryFileSystem *temporary_filesystems,
|
||||
@ -2028,7 +2028,7 @@ int setup_namespace(
|
||||
const char *incoming_dir,
|
||||
const char *extension_dir,
|
||||
const char *notify_socket,
|
||||
const char *host_os_release,
|
||||
const char *host_os_release_stage,
|
||||
char **error_path) {
|
||||
|
||||
_cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL;
|
||||
@ -2156,7 +2156,7 @@ int setup_namespace(
|
||||
log_namespace,
|
||||
setup_propagate,
|
||||
notify_socket,
|
||||
host_os_release);
|
||||
host_os_release_stage);
|
||||
|
||||
if (n_mounts > 0) {
|
||||
m = mounts = new0(MountEntry, n_mounts);
|
||||
@ -2391,10 +2391,10 @@ int setup_namespace(
|
||||
.read_only = true,
|
||||
};
|
||||
|
||||
if (host_os_release)
|
||||
if (host_os_release_stage)
|
||||
*(m++) = (MountEntry) {
|
||||
.path_const = "/run/host/os-release",
|
||||
.source_const = host_os_release,
|
||||
.path_const = "/run/host/.os-release-stage/",
|
||||
.source_const = host_os_release_stage,
|
||||
.mode = BIND_MOUNT,
|
||||
.read_only = true,
|
||||
.ignore = true, /* Live copy, don't hard-fail if it goes missing */
|
||||
@ -2489,7 +2489,7 @@ int setup_namespace(
|
||||
(void) base_filesystem_create(root, UID_INVALID, GID_INVALID);
|
||||
|
||||
/* Now make the magic happen */
|
||||
r = apply_mounts(root, mount_image_policy, extension_image_policy, ns_info, mounts, &n_mounts, exec_dir_symlinks, error_path);
|
||||
r = apply_mounts(root, mount_image_policy, extension_image_policy, ns_info, mounts, &n_mounts, symlinks, error_path);
|
||||
if (r < 0)
|
||||
goto finish;
|
||||
|
||||
|
@ -111,7 +111,7 @@ int setup_namespace(
|
||||
char **exec_paths,
|
||||
char **no_exec_paths,
|
||||
char **empty_directories,
|
||||
char **exec_dir_symlinks,
|
||||
char **symlinks,
|
||||
const BindMount *bind_mounts,
|
||||
size_t n_bind_mounts,
|
||||
const TemporaryFileSystem *temporary_filesystems,
|
||||
@ -133,7 +133,7 @@ int setup_namespace(
|
||||
const char *incoming_dir,
|
||||
const char *extension_dir,
|
||||
const char *notify_socket,
|
||||
const char *host_os_release,
|
||||
const char *host_os_release_stage,
|
||||
char **error_path);
|
||||
|
||||
#define RUN_SYSTEMD_EMPTY "/run/systemd/empty"
|
||||
|
@ -50,7 +50,7 @@ elif [ -f /run/testsuite82.touch2 ]; then
|
||||
# Test that we really are in the new overlayfs root fs
|
||||
read -r x </lower
|
||||
test "$x" = "miep"
|
||||
cmp /etc/os-release /run/systemd/propagate/os-release
|
||||
cmp /etc/os-release /run/systemd/propagate/.os-release-stage/os-release
|
||||
grep -q MARKER=1 /etc/os-release
|
||||
|
||||
# Switch back to the original root, away from the overlayfs
|
||||
@ -96,7 +96,7 @@ elif [ -f /run/testsuite82.touch ]; then
|
||||
mkdir -p /tmp/nextroot-lower/usr/lib
|
||||
grep ID /etc/os-release >/tmp/nextroot-lower/usr/lib/os-release
|
||||
echo MARKER=1 >>/tmp/nextroot-lower/usr/lib/os-release
|
||||
cmp /etc/os-release /run/systemd/propagate/os-release
|
||||
cmp /etc/os-release /run/systemd/propagate/.os-release-stage/os-release
|
||||
(! grep -q MARKER=1 /etc/os-release)
|
||||
|
||||
mount -t overlay nextroot /run/nextroot -o lowerdir=/tmp/nextroot-lower:/,ro
|
||||
|
Loading…
x
Reference in New Issue
Block a user