1
1
mirror of https://github.com/systemd/systemd-stable.git synced 2024-12-25 23:21:33 +03:00

shared: add new safe_fork flag FORK_PRIVATE_TMP

If the flag is set, we mount /tmp/ in a way that is suitable for generators and
other quick jobs.

Unfortunately I had to move some code from shared/mount-util.c to
basic/mountpoint-util.c. The functions that are moved are very thin wrappers
around mount(2), so this doesn't actually change much in the code split between
libbasic and libshared.

Overmounting /tmp/ without a private mount namespace would change /tmp/ on the
host as well, so assert that FORK_NEW_MOUNTNS is set whenever the flag is used.
This commit is contained in:
Zbigniew Jędrzejewski-Szmek 2022-12-14 13:40:53 +01:00
parent 7c76e1812d
commit 61ef30515b
7 changed files with 100 additions and 86 deletions

View File

@ -524,6 +524,52 @@ int dev_is_devtmpfs(void) {
return false;
}
/* Mounts 'source' on the inode referenced by 'target_fd'. The target is handed to mount() via the path
 * produced by FORMAT_PROC_FD_PATH() for that descriptor.
 *
 * Returns 0 on success, or a negative errno-style value on failure. If mount() fails with ENOENT the
 * result is -ENOSYS when proc_mounted() returns 0, and -ENOENT otherwise. */
int mount_fd(const char *source,
             int target_fd,
             const char *filesystemtype,
             unsigned long mountflags,
             const void *data) {

        if (mount(source, FORMAT_PROC_FD_PATH(target_fd), filesystemtype, mountflags, data) >= 0)
                return 0;

        /* Anything but ENOENT is unambiguous, pass it through unchanged. */
        if (errno != ENOENT)
                return -errno;

        /* ENOENT is ambiguous: either the source does not exist, or /proc/ is not mounted. Tell the two
         * apart so that callers can generate a better error message. */
        return proc_mounted() == 0 ? -ENOSYS : -ENOENT;
}
int mount_nofollow(
const char *source,
const char *target,
const char *filesystemtype,
unsigned long mountflags,
const void *data) {
_cleanup_close_ int fd = -1;
/* In almost all cases we want to manipulate the mount table without following symlinks, hence
* mount_nofollow() is usually the way to go. The only exceptions are environments where /proc/ is
* not available yet, since we need /proc/self/fd/ for this logic to work. i.e. during the early
* initialization of namespacing/container stuff where /proc is not yet mounted (and maybe even the
* fs to mount) we can only use traditional mount() directly.
*
* Note that this disables following only for the final component of the target, i.e symlinks within
* the path of the target are honoured, as are symlinks in the source path everywhere. */
fd = open(target, O_PATH|O_CLOEXEC|O_NOFOLLOW);
if (fd < 0)
return -errno;
return mount_fd(source, fd, filesystemtype, mountflags, data);
}
const char *mount_propagation_flags_to_string(unsigned long flags) {
switch (flags & (MS_SHARED|MS_SLAVE|MS_PRIVATE)) {

View File

@ -5,6 +5,36 @@
#include <stdbool.h>
#include <sys/types.h>
/* tmpfs mount option fragments. Every value starts with a comma, so it can be appended to another
 * option string via string literal concatenation, e.g. "mode=01777" TMPFS_LIMITS_RUN. */

/* The limit used for /dev itself. 4MB should be enough since device nodes and symlinks don't
 * consume any space and udev isn't supposed to create regular files either. There's no limit on the
 * max number of inodes since such a limit is hard to guess, especially on large storage array
 * systems. */
#define TMPFS_LIMITS_DEV ",size=4m"
/* The limit used for /dev in private namespaces. 4MB for contents of regular files. The number of
 * inodes should be relatively low in private namespaces, but for now use a 64k limit. */
#define TMPFS_LIMITS_PRIVATE_DEV ",size=4m,nr_inodes=64k"
/* Very little use expected, if any */
#define TMPFS_LIMITS_EMPTY_OR_ALMOST ",size=4m,nr_inodes=1k"
#define TMPFS_LIMITS_SYS TMPFS_LIMITS_EMPTY_OR_ALMOST
#define TMPFS_LIMITS_SYS_FS_CGROUP TMPFS_LIMITS_EMPTY_OR_ALMOST
/* On an extremely small device with only 256MB of RAM, 20% of RAM should be enough for the re-execution of
 * PID1 because 16MB of free space is required. */
#define TMPFS_LIMITS_RUN ",size=20%,nr_inodes=800k"
/* The limit used for various nested tmpfs mounts, in particular for guests started by systemd-nspawn.
 * 10% of RAM (using 16GB of RAM as a baseline) translates to 400k inodes (assuming 4k each) and 25%
 * translates to 1M inodes.
 * (On the host, /tmp is configured through a .mount unit file.) */
#define NESTED_TMPFS_LIMITS ",size=10%,nr_inodes=400k"
/* More space for volatile root and /var; the ROOTFS and VOLATILE_STATE limits are aliases of the /var one */
#define TMPFS_LIMITS_VAR ",size=25%,nr_inodes=1m"
#define TMPFS_LIMITS_ROOTFS TMPFS_LIMITS_VAR
#define TMPFS_LIMITS_VOLATILE_STATE TMPFS_LIMITS_VAR
int name_to_handle_at_loop(int fd, const char *path, struct file_handle **ret_handle, int *ret_mnt_id, int flags);
int path_get_mnt_id(const char *path, int *ret);
@ -22,5 +52,8 @@ bool fstype_can_uid_gid(const char *fstype);
int dev_is_devtmpfs(void);
/* Mounts 'source' on the inode referenced by 'target_fd'. Returns 0 on success, negative errno-style
 * value on failure (-ENOSYS if mount() fails with ENOENT and proc_mounted() returns 0). */
int mount_fd(const char *source, int target_fd, const char *filesystemtype, unsigned long mountflags, const void *data);
/* Opens 'target' with O_PATH|O_NOFOLLOW and passes the resulting fd to mount_fd(), so that a symlink in
 * the final component of 'target' is not followed. */
int mount_nofollow(const char *source, const char *target, const char *filesystemtype, unsigned long mountflags, const void *data);
const char *mount_propagation_flags_to_string(unsigned long flags);
int mount_propagation_flags_from_string(const char *name, unsigned long *ret);

View File

@ -36,6 +36,7 @@
#include "memory-util.h"
#include "missing_sched.h"
#include "missing_syscall.h"
#include "mountpoint-util.h"
#include "namespace-util.h"
#include "nulstr-util.h"
#include "parse-util.h"
@ -1252,15 +1253,26 @@ int safe_fork_full(
}
if (FLAGS_SET(flags, FORK_NEW_MOUNTNS | FORK_MOUNTNS_SLAVE)) {
/* Optionally, make sure we never propagate mounts to the host. */
if (mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL) < 0) {
log_full_errno(prio, errno, "Failed to remount root directory as MS_SLAVE: %m");
_exit(EXIT_FAILURE);
}
}
if (FLAGS_SET(flags, FORK_PRIVATE_TMP)) {
assert(FLAGS_SET(flags, FORK_NEW_MOUNTNS));
/* Optionally, overmount new tmpfs instance on /tmp/. */
r = mount_nofollow("tmpfs", "/tmp", "tmpfs",
MS_NOSUID|MS_NODEV,
"mode=01777" TMPFS_LIMITS_RUN);
if (r < 0) {
log_full_errno(prio, r, "Failed to overmount /tmp/: %m");
_exit(EXIT_FAILURE);
}
}
if (flags & FORK_CLOSE_ALL_FDS) {
/* Close the logs here in case it got reopened above, as close_all_fds() would close them for us */
log_close();

View File

@ -147,11 +147,12 @@ typedef enum ForkFlags {
FORK_WAIT = 1 << 7, /* Wait until child exited */
FORK_NEW_MOUNTNS = 1 << 8, /* Run child in its own mount namespace */
FORK_MOUNTNS_SLAVE = 1 << 9, /* Make child's mount namespace MS_SLAVE */
FORK_RLIMIT_NOFILE_SAFE = 1 << 10, /* Set RLIMIT_NOFILE soft limit to 1K for select() compat */
FORK_STDOUT_TO_STDERR = 1 << 11, /* Make stdout a copy of stderr */
FORK_FLUSH_STDIO = 1 << 12, /* fflush() stdout (and stderr) before forking */
FORK_NEW_USERNS = 1 << 13, /* Run child in its own user namespace */
FORK_CLOEXEC_OFF = 1 << 14, /* In the child: turn off O_CLOEXEC on all fds in except_fds[] */
FORK_PRIVATE_TMP = 1 << 10, /* Mount new /tmp/ in the child (combine with FORK_NEW_MOUNTNS!) */
FORK_RLIMIT_NOFILE_SAFE = 1 << 11, /* Set RLIMIT_NOFILE soft limit to 1K for select() compat */
FORK_STDOUT_TO_STDERR = 1 << 12, /* Make stdout a copy of stderr */
FORK_FLUSH_STDIO = 1 << 13, /* fflush() stdout (and stderr) before forking */
FORK_NEW_USERNS = 1 << 14, /* Run child in its own user namespace */
FORK_CLOEXEC_OFF = 1 << 15, /* In the child: turn off O_CLOEXEC on all fds in except_fds[] */
} ForkFlags;
int safe_fork_full(const char *name, const int except_fds[], size_t n_except_fds, ForkFlags flags, pid_t *ret_pid);

View File

@ -42,52 +42,6 @@
#include "tmpfile-util.h"
#include "user-util.h"
/* Mounts 'source' on the inode referenced by 'target_fd'. The target is handed to mount() via the path
 * produced by FORMAT_PROC_FD_PATH() for that descriptor.
 *
 * Returns 0 on success, or a negative errno-style value on failure. If mount() fails with ENOENT the
 * result is -ENOSYS when proc_mounted() returns 0, and -ENOENT otherwise. */
int mount_fd(const char *source,
             int target_fd,
             const char *filesystemtype,
             unsigned long mountflags,
             const void *data) {

        if (mount(source, FORMAT_PROC_FD_PATH(target_fd), filesystemtype, mountflags, data) >= 0)
                return 0;

        /* Anything but ENOENT is unambiguous, pass it through unchanged. */
        if (errno != ENOENT)
                return -errno;

        /* ENOENT is ambiguous: either the source does not exist, or /proc/ is not mounted. Tell the two
         * apart so that callers can generate a better error message. */
        return proc_mounted() == 0 ? -ENOSYS : -ENOENT;
}
int mount_nofollow(
const char *source,
const char *target,
const char *filesystemtype,
unsigned long mountflags,
const void *data) {
_cleanup_close_ int fd = -1;
/* In almost all cases we want to manipulate the mount table without following symlinks, hence
* mount_nofollow() is usually the way to go. The only exceptions are environments where /proc/ is
* not available yet, since we need /proc/self/fd/ for this logic to work. i.e. during the early
* initialization of namespacing/container stuff where /proc is not yet mounted (and maybe even the
* fs to mount) we can only use traditional mount() directly.
*
* Note that this disables following only for the final component of the target, i.e symlinks within
* the path of the target are honoured, as are symlinks in the source path everywhere. */
fd = open(target, O_PATH|O_CLOEXEC|O_NOFOLLOW);
if (fd < 0)
return -errno;
return mount_fd(source, fd, filesystemtype, mountflags, data);
}
int umount_recursive(const char *prefix, int flags) {
int n = 0, r;
bool again;

View File

@ -25,39 +25,6 @@ const char* mount_attr_propagation_type_to_string(MountAttrPropagationType t) _c
MountAttrPropagationType mount_attr_propagation_type_from_string(const char *s) _pure_;
unsigned int mount_attr_propagation_type_to_flag(MountAttrPropagationType t);
/* tmpfs mount option fragments. Every value starts with a comma, so it can be appended to another
 * option string via string literal concatenation, e.g. "mode=01777" TMPFS_LIMITS_RUN. */

/* The limit used for /dev itself. 4MB should be enough since device nodes and symlinks don't
 * consume any space and udev isn't supposed to create regular files either. There's no limit on the
 * max number of inodes since such a limit is hard to guess, especially on large storage array
 * systems. */
#define TMPFS_LIMITS_DEV ",size=4m"
/* The limit used for /dev in private namespaces. 4MB for contents of regular files. The number of
 * inodes should be relatively low in private namespaces, but for now use a 64k limit. */
#define TMPFS_LIMITS_PRIVATE_DEV ",size=4m,nr_inodes=64k"
/* Very little use expected, if any */
#define TMPFS_LIMITS_EMPTY_OR_ALMOST ",size=4m,nr_inodes=1k"
#define TMPFS_LIMITS_SYS TMPFS_LIMITS_EMPTY_OR_ALMOST
#define TMPFS_LIMITS_SYS_FS_CGROUP TMPFS_LIMITS_EMPTY_OR_ALMOST
/* On an extremely small device with only 256MB of RAM, 20% of RAM should be enough for the re-execution of
 * PID1 because 16MB of free space is required. */
#define TMPFS_LIMITS_RUN ",size=20%,nr_inodes=800k"
/* The limit used for various nested tmpfs mounts, in particular for guests started by systemd-nspawn.
 * 10% of RAM (using 16GB of RAM as a baseline) translates to 400k inodes (assuming 4k each) and 25%
 * translates to 1M inodes.
 * (On the host, /tmp is configured through a .mount unit file.) */
#define NESTED_TMPFS_LIMITS ",size=10%,nr_inodes=400k"
/* More space for volatile root and /var; the ROOTFS and VOLATILE_STATE limits are aliases of the /var one */
#define TMPFS_LIMITS_VAR ",size=25%,nr_inodes=1m"
#define TMPFS_LIMITS_ROOTFS TMPFS_LIMITS_VAR
#define TMPFS_LIMITS_VOLATILE_STATE TMPFS_LIMITS_VAR
/* Mounts 'source' on the inode referenced by 'target_fd'. Returns 0 on success, negative errno-style
 * value on failure (-ENOSYS if mount() fails with ENOENT and proc_mounted() returns 0). */
int mount_fd(const char *source, int target_fd, const char *filesystemtype, unsigned long mountflags, const void *data);
/* Opens 'target' with O_PATH|O_NOFOLLOW and passes the resulting fd to mount_fd(), so that a symlink in
 * the final component of 'target' is not followed. */
int mount_nofollow(const char *source, const char *target, const char *filesystemtype, unsigned long mountflags, const void *data);
int repeat_unmount(const char *path, int flags);
int umount_recursive(const char *target, int flags);

View File

@ -11,6 +11,7 @@
#include "missing_mount.h"
#include "mkdir.h"
#include "mount-util.h"
#include "mountpoint-util.h"
#include "namespace-util.h"
#include "path-util.h"
#include "process-util.h"