1
0
mirror of https://github.com/systemd/systemd.git synced 2024-12-22 17:35:35 +03:00

mount-util: use mount beneath to replace previous namespace mount

Instead of mounting over, do an atomic swap using mount beneath, if
available. This way assets can be mounted again and again (e.g.:
updates) without leaking mounts.
This commit is contained in:
Luca Boccassi 2023-09-29 01:50:15 +01:00 committed by Luca Boccassi
parent f5e6f3117c
commit 7c83d42ef8
9 changed files with 109 additions and 41 deletions

View File

@ -1310,11 +1310,16 @@ node /org/freedesktop/systemd1 {
<function>TryRestartUnit()</function> or <function>ReloadOrTryRestartUnit()</function> for the marked
units.</para>
<para><function>BindMountUnit()</function> can be used to bind mount new files or directories into
a running service mount namespace.</para>
<para><function>BindMountUnit()</function> can be used to bind mount new files or directories into a
running service mount namespace. If supported by the kernel (Linux 6.5 or newer), any prior mount on
the selected target will be atomically replaced by the new mount. If not supported, any prior mount
will be over-mounted instead, and will remain pinned but inaccessible.
</para>
<para><function>MountImageUnit()</function> can be used to mount new images into a running service
mount namespace.</para>
mount namespace. If supported by the kernel (Linux 6.5 or newer), any prior mount on the selected
target will be atomically replaced by the new mount. If not supported, any prior mount will be
over-mounted instead, and will remain pinned but inaccessible.</para>
<para><function>KillUnit()</function> may be used to kill (i.e. send a signal to) all processes of a
unit. It takes the unit <varname>name</varname>, an enum <varname>who</varname> and a UNIX

View File

@ -663,6 +663,10 @@ Jan 12 10:46:45 example.com bluetoothd[8900]: gatt-time-server: Input/output err
<option>ExecReload=</option>, <option>ExecStartPre=</option>, etc.) run in distinct namespaces.
</para>
<para>If supported by the kernel (Linux 6.5 or newer), any prior mount on the selected target will be
atomically replaced by the new mount. If not supported, any prior mount will be over-mounted instead,
and will remain pinned but inaccessible.</para>
<xi:include href="version-info.xml" xpointer="v248"/></listitem>
</varlistentry>
@ -693,6 +697,10 @@ Jan 12 10:46:45 example.com bluetoothd[8900]: gatt-time-server: Input/output err
<option>ExecReload=</option>, <option>ExecStartPre=</option>, etc.) run in distinct namespaces.
</para>
<para>If supported by the kernel (Linux 6.5 or newer), any prior mount on the selected target will be
atomically replaced by the new mount. If not supported, any prior mount will be over-mounted instead,
and will remain pinned but inaccessible.</para>
<para>Example:
<programlisting>systemctl mount-image foo.service /tmp/img.raw /var/lib/image root:ro,nosuid</programlisting>
<programlisting>systemctl mount-image --mkdir bar.service /tmp/img.raw /var/lib/baz/img</programlisting>

View File

@ -539,6 +539,10 @@ static inline int missing_open_tree(
/* ======================================================================= */
/* Fallback definition for build environments whose headers do not provide it: the move_mount() flag
 * that asks for the new mount to be placed beneath the mount currently on the target, rather than on
 * top of it. */
#ifndef MOVE_MOUNT_BENEATH
#define MOVE_MOUNT_BENEATH 0x00000200
#endif
#if !HAVE_MOVE_MOUNT
#ifndef MOVE_MOUNT_F_EMPTY_PATH

View File

@ -2007,8 +2007,12 @@ static int mount_partition(
if (m->fsmount_fd >= 0) {
/* Case #1: Attach existing fsmount fd to the file system */
if (move_mount(m->fsmount_fd, "", -EBADF, p, MOVE_MOUNT_F_EMPTY_PATH) < 0)
return -errno;
r = mount_exchange_graceful(
m->fsmount_fd,
p,
FLAGS_SET(flags, DISSECT_IMAGE_TRY_ATOMIC_MOUNT_EXCHANGE));
if (r < 0)
return log_debug_errno(r, "Failed to mount image on '%s': %m", p);
} else {
assert(node);

View File

@ -56,36 +56,37 @@ struct DissectedPartition {
})
/* Bit flags selecting how an image is dissected and mounted; the comment on each enumerator states
 * its effect. DISSECT_IMAGE_DISCARD_ANY and DISSECT_IMAGE_READ_ONLY are unions of other bits rather
 * than bits of their own.
 *
 * NOTE(review): bits 0–24 appear twice in this listing with identical values. This looks like the
 * before/after lines of a whitespace realignment shown without diff markers — confirm against the
 * real header, which can define each enumerator only once. */
typedef enum DissectImageFlags {
DISSECT_IMAGE_DEVICE_READ_ONLY = 1 << 0, /* Make device read-only */
DISSECT_IMAGE_DISCARD_ON_LOOP = 1 << 1, /* Turn on "discard" if on a loop device and file system supports it */
DISSECT_IMAGE_DISCARD = 1 << 2, /* Turn on "discard" if file system supports it, on all block devices */
DISSECT_IMAGE_DISCARD_ON_CRYPTO = 1 << 3, /* Turn on "discard" also on crypto devices */
DISSECT_IMAGE_DISCARD_ANY = DISSECT_IMAGE_DISCARD_ON_LOOP |
DISSECT_IMAGE_DISCARD |
DISSECT_IMAGE_DISCARD_ON_CRYPTO,
DISSECT_IMAGE_GPT_ONLY = 1 << 4, /* Only recognize images with GPT partition tables */
DISSECT_IMAGE_GENERIC_ROOT = 1 << 5, /* If no partition table or only single generic partition, assume it's the root fs */
DISSECT_IMAGE_MOUNT_ROOT_ONLY = 1 << 6, /* Mount only the root and /usr partitions */
DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY = 1 << 7, /* Mount only the non-root and non-/usr partitions */
DISSECT_IMAGE_VALIDATE_OS = 1 << 8, /* Refuse mounting images that aren't identifiable as OS images */
DISSECT_IMAGE_VALIDATE_OS_EXT = 1 << 9, /* Refuse mounting images that aren't identifiable as OS extension images */
DISSECT_IMAGE_RELAX_VAR_CHECK = 1 << 10, /* Don't insist that the UUID of /var is hashed from /etc/machine-id */
DISSECT_IMAGE_FSCK = 1 << 11, /* File system check the partition before mounting (no effect when combined with DISSECT_IMAGE_READ_ONLY) */
DISSECT_IMAGE_NO_PARTITION_TABLE = 1 << 12, /* Only recognize single file system images */
DISSECT_IMAGE_VERITY_SHARE = 1 << 13, /* When activating a verity device, reuse existing one if already open */
DISSECT_IMAGE_MKDIR = 1 << 14, /* Make top-level directory to mount right before mounting, if missing */
DISSECT_IMAGE_USR_NO_ROOT = 1 << 15, /* If no root fs is in the image, but /usr is, then allow this (so that we can mount the rootfs as tmpfs or so) */
DISSECT_IMAGE_REQUIRE_ROOT = 1 << 16, /* Don't accept disks without root partition (or at least /usr partition if DISSECT_IMAGE_USR_NO_ROOT is set) */
DISSECT_IMAGE_MOUNT_READ_ONLY = 1 << 17, /* Make mounts read-only */
DISSECT_IMAGE_READ_ONLY = DISSECT_IMAGE_DEVICE_READ_ONLY |
DISSECT_IMAGE_MOUNT_READ_ONLY,
DISSECT_IMAGE_GROWFS = 1 << 18, /* Grow file systems in partitions marked for that to the size of the partitions after mount */
DISSECT_IMAGE_MOUNT_IDMAPPED = 1 << 19, /* Mount mounts with kernel 5.12-style userns ID mapping, if file system type doesn't support uid=/gid= */
DISSECT_IMAGE_ADD_PARTITION_DEVICES = 1 << 20, /* Create partition devices via BLKPG_ADD_PARTITION */
DISSECT_IMAGE_PIN_PARTITION_DEVICES = 1 << 21, /* Open dissected partitions and decrypted partitions and pin them by fd */
DISSECT_IMAGE_RELAX_EXTENSION_CHECK = 1 << 22, /* Don't insist that the extension-release file name matches the image name */
DISSECT_IMAGE_DISKSEQ_DEVNODE = 1 << 23, /* Prefer /dev/disk/by-diskseq/… device nodes */
DISSECT_IMAGE_ALLOW_EMPTY = 1 << 24, /* Allow images in which no usable partition is present */
DISSECT_IMAGE_DEVICE_READ_ONLY = 1 << 0, /* Make device read-only */
DISSECT_IMAGE_DISCARD_ON_LOOP = 1 << 1, /* Turn on "discard" if on a loop device and file system supports it */
DISSECT_IMAGE_DISCARD = 1 << 2, /* Turn on "discard" if file system supports it, on all block devices */
DISSECT_IMAGE_DISCARD_ON_CRYPTO = 1 << 3, /* Turn on "discard" also on crypto devices */
DISSECT_IMAGE_DISCARD_ANY = DISSECT_IMAGE_DISCARD_ON_LOOP |
DISSECT_IMAGE_DISCARD |
DISSECT_IMAGE_DISCARD_ON_CRYPTO,
DISSECT_IMAGE_GPT_ONLY = 1 << 4, /* Only recognize images with GPT partition tables */
DISSECT_IMAGE_GENERIC_ROOT = 1 << 5, /* If no partition table or only single generic partition, assume it's the root fs */
DISSECT_IMAGE_MOUNT_ROOT_ONLY = 1 << 6, /* Mount only the root and /usr partitions */
DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY = 1 << 7, /* Mount only the non-root and non-/usr partitions */
DISSECT_IMAGE_VALIDATE_OS = 1 << 8, /* Refuse mounting images that aren't identifiable as OS images */
DISSECT_IMAGE_VALIDATE_OS_EXT = 1 << 9, /* Refuse mounting images that aren't identifiable as OS extension images */
DISSECT_IMAGE_RELAX_VAR_CHECK = 1 << 10, /* Don't insist that the UUID of /var is hashed from /etc/machine-id */
DISSECT_IMAGE_FSCK = 1 << 11, /* File system check the partition before mounting (no effect when combined with DISSECT_IMAGE_READ_ONLY) */
DISSECT_IMAGE_NO_PARTITION_TABLE = 1 << 12, /* Only recognize single file system images */
DISSECT_IMAGE_VERITY_SHARE = 1 << 13, /* When activating a verity device, reuse existing one if already open */
DISSECT_IMAGE_MKDIR = 1 << 14, /* Make top-level directory to mount right before mounting, if missing */
DISSECT_IMAGE_USR_NO_ROOT = 1 << 15, /* If no root fs is in the image, but /usr is, then allow this (so that we can mount the rootfs as tmpfs or so) */
DISSECT_IMAGE_REQUIRE_ROOT = 1 << 16, /* Don't accept disks without root partition (or at least /usr partition if DISSECT_IMAGE_USR_NO_ROOT is set) */
DISSECT_IMAGE_MOUNT_READ_ONLY = 1 << 17, /* Make mounts read-only */
DISSECT_IMAGE_READ_ONLY = DISSECT_IMAGE_DEVICE_READ_ONLY |
DISSECT_IMAGE_MOUNT_READ_ONLY,
DISSECT_IMAGE_GROWFS = 1 << 18, /* Grow file systems in partitions marked for that to the size of the partitions after mount */
DISSECT_IMAGE_MOUNT_IDMAPPED = 1 << 19, /* Mount mounts with kernel 5.12-style userns ID mapping, if file system type doesn't support uid=/gid= */
DISSECT_IMAGE_ADD_PARTITION_DEVICES = 1 << 20, /* Create partition devices via BLKPG_ADD_PARTITION */
DISSECT_IMAGE_PIN_PARTITION_DEVICES = 1 << 21, /* Open dissected partitions and decrypted partitions and pin them by fd */
DISSECT_IMAGE_RELAX_EXTENSION_CHECK = 1 << 22, /* Don't insist that the extension-release file name matches the image name */
DISSECT_IMAGE_DISKSEQ_DEVNODE = 1 << 23, /* Prefer /dev/disk/by-diskseq/… device nodes */
DISSECT_IMAGE_ALLOW_EMPTY = 1 << 24, /* Allow images in which no usable partition is present */
DISSECT_IMAGE_TRY_ATOMIC_MOUNT_EXCHANGE = 1 << 25, /* Try to mount the image beneath the specified mountpoint, rather than on top of it, and then umount the top */
} DissectImageFlags;
struct DissectedImage {

View File

@ -730,6 +730,45 @@ int umount_verbose(
return 0;
}
/* Attaches the detached mount referenced by fsmount_fd at dest.
 *
 * When mount_beneath is set, the new mount is first slotted in underneath whatever is currently
 * mounted on dest (MOVE_MOUNT_BENEATH). If that succeeds, the previous mount — which now sits on top —
 * is detached, so the mount is replaced atomically. This also works when there are submounts further
 * down the tree. Mount propagation is allowed, but only for layouts that don't end up propagating the
 * new mount on top of the mount stack.
 *
 * If mounting beneath is not supported (it needs kernel v6.5 or newer), or if nothing is mounted on
 * dest, the kernel answers with -EINVAL and we fall back to a regular mount on top of dest.
 *
 * Returns the result of the last operation attempted: the (errno-converted) move_mount() result, or
 * the umount_verbose() result when the old top mount was removed. */
int mount_exchange_graceful(int fsmount_fd, const char *dest, bool mount_beneath) {
        unsigned int move_flags = MOVE_MOUNT_F_EMPTY_PATH;
        int ret;

        assert(fsmount_fd >= 0);
        assert(dest);

        if (mount_beneath)
                move_flags |= MOVE_MOUNT_BENEATH;

        ret = RET_NERRNO(move_mount(fsmount_fd, /* from_path= */ "", /* to_fd= */ -EBADF, dest, move_flags));

        /* Plain overmount requested: nothing to clean up and nothing to fall back to. */
        if (!mount_beneath)
                return ret;

        /* Mounting beneath worked, so drop the old mount that is now on top. */
        if (ret >= 0)
                return umount_verbose(LOG_DEBUG, dest, UMOUNT_NOFOLLOW|MNT_DETACH);

        /* Only -EINVAL signals "cannot mount beneath here"; any other error is final. */
        if (ret != -EINVAL)
                return ret;

        log_debug_errno(ret, "Failed to mount beneath '%s', falling back to overmount", dest);

        return RET_NERRNO(move_mount(fsmount_fd, /* from_path= */ "", /* to_fd= */ -EBADF, dest, MOVE_MOUNT_F_EMPTY_PATH));
}
int mount_option_mangle(
const char *options,
unsigned long mount_flags,
@ -1155,7 +1194,7 @@ static int mount_in_namespace(
(void) mkdir_parents(dest, 0755);
if (img) {
DissectImageFlags f = 0;
DissectImageFlags f = DISSECT_IMAGE_TRY_ATOMIC_MOUNT_EXCHANGE;
if (make_file_or_directory)
f |= DISSECT_IMAGE_MKDIR;
@ -1174,11 +1213,7 @@ static int mount_in_namespace(
if (make_file_or_directory)
(void) make_mount_point_inode_from_stat(&st, dest, 0700);
r = RET_NERRNO(move_mount(new_mount_fd,
"",
-EBADF,
dest,
MOVE_MOUNT_F_EMPTY_PATH));
r = mount_exchange_graceful(new_mount_fd, dest, /* mount_beneath= */ true);
}
if (r < 0) {
(void) write(errno_pipe_fd[1], &r, sizeof(r));

View File

@ -68,6 +68,8 @@ int umount_verbose(
const char *where,
int flags);
/* Attaches the mount referred to by fsmount_fd at dest. If mount_beneath is true, first tries to place
 * it beneath the mount currently on dest and then unmounts that old mount from the top; if the kernel
 * rejects this with -EINVAL, falls back to mounting on top of dest. */
int mount_exchange_graceful(int fsmount_fd, const char *dest, bool mount_beneath);
int mount_option_mangle(
const char *options,
unsigned long mount_flags,

View File

@ -23,8 +23,12 @@ systemctl start testsuite-23-namespaced.service
# Ensure that inaccessible paths aren't bypassed by the runtime setup,
(! systemctl bind --mkdir testsuite-23-namespaced.service /run/testsuite-23-marker-fixed /run/inaccessible/testfile-marker-fixed)
echo "MARKER_WRONG" >/run/testsuite-23-marker-wrong
echo "MARKER_RUNTIME" >/run/testsuite-23-marker-runtime
# Mount twice to exercise mount-beneath: on kernel 6.5+ the second mount replaces the first, on older kernels it just overmounts it
systemctl bind --mkdir testsuite-23-namespaced.service /run/testsuite-23-marker-wrong /tmp/testfile-marker-runtime
test "$(systemctl show -P SubState testsuite-23-namespaced.service)" = "running"
systemctl bind --mkdir testsuite-23-namespaced.service /run/testsuite-23-marker-runtime /tmp/testfile-marker-runtime
timeout 10 bash -xec 'while [[ "$(systemctl show -P SubState testsuite-23-namespaced.service)" == running ]]; do sleep .5; done'

View File

@ -363,6 +363,11 @@ ExecStart=/bin/sh -c ' \\
EOF
systemctl start testservice-50d.service
# Mount twice to exercise mount-beneath: on kernel 6.5+ the second mount replaces the first, on older kernels it just overmounts it
mkdir -p /tmp/wrong/foo
mksquashfs /tmp/wrong/foo /tmp/wrong.raw
systemctl mount-image --mkdir testservice-50d.service /tmp/wrong.raw /tmp/img
test "$(systemctl show -P SubState testservice-50d.service)" = "running"
systemctl mount-image --mkdir testservice-50d.service "${image}.raw" /tmp/img root:nosuid
while systemctl show -P SubState testservice-50d.service | grep -q running