From 4a4654e0241fbeabecb8587fd3520b6b39264b9c Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 19 Oct 2021 14:56:49 +0200 Subject: [PATCH] nspawn: add --suppress-sync=yes mode for turning sync() and friends into NOPs via seccomp This is supposed to be used by package/image builders such as mkosi to speed up building, since it allows us to suppress sync() inside a container. This does what Debian's eatmydata tool does, but for a container, and via seccomp (instead of LD_PRELOAD). --- docs/ENVIRONMENT.md | 6 ++ man/systemd-nspawn.xml | 18 ++++++ man/systemd.nspawn.xml | 10 +++ shell-completion/bash/systemd-nspawn | 2 +- src/nspawn/nspawn-gperf.gperf | 1 + src/nspawn/nspawn-settings.h | 8 ++- src/nspawn/nspawn.c | 30 ++++++++- src/shared/seccomp-util.c | 95 ++++++++++++++++++++++++++++ src/shared/seccomp-util.h | 2 + 9 files changed, 167 insertions(+), 5 deletions(-) diff --git a/docs/ENVIRONMENT.md b/docs/ENVIRONMENT.md index 9a824820da..328934cd17 100644 --- a/docs/ENVIRONMENT.md +++ b/docs/ENVIRONMENT.md @@ -138,6 +138,12 @@ All tools: * `$SYSTEMD_NSPAWN_TMPFS_TMP=0` — if set, do not overmount `/tmp/` in the container with a tmpfs, but leave the directory from the image in place. +* `$SYSTEMD_SUPPRESS_SYNC=1` — if set, all disk synchronization syscalls are + turned into no-ops for the container payload (e.g. `sync()`, `fsync()`, `syncfs()`, …) + and the `O_SYNC`/`O_DSYNC` flags are made unavailable to `open()` and + friends. This is equivalent to passing `--suppress-sync=yes` on the + `systemd-nspawn` command line. + `systemd-logind`: * `$SYSTEMD_BYPASS_HIBERNATION_MEMORY_CHECK=1` — if set, report that diff --git a/man/systemd-nspawn.xml b/man/systemd-nspawn.xml index e84ac6ae42..aec0b0e129 100644 --- a/man/systemd-nspawn.xml +++ b/man/systemd-nspawn.xml @@ -570,6 +570,24 @@ before sending its own to systemd. For more details about notifications see sd_notify(3). + + + --suppress-sync= + + Expects a boolean argument. 
If true, turns off any form of on-disk file system + synchronization for the container payload. This means all system calls such as sync(2), + fsync(), syncfs(), … will execute no operation, and the + O_SYNC/O_DSYNC flags to open(2) and + related calls will be made unavailable. This is potentially dangerous, as assumed data integrity + guarantees to the container payload are not actually enforced (i.e. data assumed to have been written + to disk might be lost if the system is shut down abnormally). However, this can dramatically improve + container runtime performance – as long as these guarantees are not required or desirable, for + example because any data written by the container is of temporary, redundant nature, or just an + intermediary artifact that will be further processed and finalized by a later step in a + pipeline. Defaults to false. + diff --git a/man/systemd.nspawn.xml b/man/systemd.nspawn.xml index dc0e2f9fd2..bb9bf4b5d9 100644 --- a/man/systemd.nspawn.xml +++ b/man/systemd.nspawn.xml @@ -365,6 +365,16 @@ details. + + SuppressSync= + + Configures whether to suppress disk synchronization for the container payload. This + is equivalent to the --suppress-sync= command line switch, and takes the same + parameter. See + systemd-nspawn(1) + for details. 
+ + diff --git a/shell-completion/bash/systemd-nspawn b/shell-completion/bash/systemd-nspawn index f367c7d14c..3b6d65d96a 100644 --- a/shell-completion/bash/systemd-nspawn +++ b/shell-completion/bash/systemd-nspawn @@ -63,7 +63,7 @@ _systemd_nspawn() { local -A OPTS=( [STANDALONE]='-h --help --version --private-network -b --boot --read-only -q --quiet --share-system - --keep-unit -n --network-veth -j -x --ephemeral -a --as-pid2 -U' + --keep-unit -n --network-veth -j -x --ephemeral -a --as-pid2 -U --suppress-sync=yes' [ARG]='-D --directory -u --user --uuid --capability --drop-capability --link-journal --bind --bind-ro -M --machine -S --slice -E --setenv -Z --selinux-context -L --selinux-apifs-context --register --network-interface --network-bridge --personality -i --image --tmpfs diff --git a/src/nspawn/nspawn-gperf.gperf b/src/nspawn/nspawn-gperf.gperf index ea15e27148..4af00c8d95 100644 --- a/src/nspawn/nspawn-gperf.gperf +++ b/src/nspawn/nspawn-gperf.gperf @@ -59,6 +59,7 @@ Exec.CPUAffinity, config_parse_cpu_affinity, 0, 0 Exec.ResolvConf, config_parse_resolv_conf, 0, offsetof(Settings, resolv_conf) Exec.LinkJournal, config_parse_link_journal, 0, 0 Exec.Timezone, config_parse_timezone, 0, offsetof(Settings, timezone) +Exec.SuppressSync, config_parse_bool, 0, offsetof(Settings, suppress_sync) Files.ReadOnly, config_parse_tristate, 0, offsetof(Settings, read_only) Files.Volatile, config_parse_volatile_mode, 0, offsetof(Settings, volatile_mode) Files.Bind, config_parse_bind, 0, 0 diff --git a/src/nspawn/nspawn-settings.h b/src/nspawn/nspawn-settings.h index 939e1c757b..1b3ace5f8f 100644 --- a/src/nspawn/nspawn-settings.h +++ b/src/nspawn/nspawn-settings.h @@ -127,9 +127,10 @@ typedef enum SettingsMask { SETTING_CONSOLE_MODE = UINT64_C(1) << 29, SETTING_CREDENTIALS = UINT64_C(1) << 30, SETTING_BIND_USER = UINT64_C(1) << 31, - SETTING_RLIMIT_FIRST = UINT64_C(1) << 32, /* we define one bit per resource limit here */ - SETTING_RLIMIT_LAST = UINT64_C(1) << (32 + 
_RLIMIT_MAX - 1), - _SETTINGS_MASK_ALL = (UINT64_C(1) << (32 + _RLIMIT_MAX)) -1, + SETTING_SUPPRESS_SYNC = UINT64_C(1) << 32, + SETTING_RLIMIT_FIRST = UINT64_C(1) << 33, /* we define one bit per resource limit here */ + SETTING_RLIMIT_LAST = UINT64_C(1) << (33 + _RLIMIT_MAX - 1), + _SETTINGS_MASK_ALL = (UINT64_C(1) << (33 + _RLIMIT_MAX)) -1, _SETTING_FORCE_ENUM_WIDTH = UINT64_MAX } SettingsMask; @@ -189,6 +190,7 @@ typedef struct Settings { LinkJournal link_journal; bool link_journal_try; TimezoneMode timezone; + bool suppress_sync; /* [Files] */ int read_only; diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index 7b767fb296..7dbc84369b 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -229,6 +229,7 @@ static ConsoleMode arg_console_mode = _CONSOLE_MODE_INVALID; static Credential *arg_credentials = NULL; static size_t arg_n_credentials = 0; static char **arg_bind_user = NULL; +static bool arg_suppress_sync = false; STATIC_DESTRUCTOR_REGISTER(arg_directory, freep); STATIC_DESTRUCTOR_REGISTER(arg_template, freep); @@ -342,7 +343,9 @@ static int help(void) { " -E --setenv=NAME[=VALUE] Pass an environment variable to PID 1\n" " -u --user=USER Run the command under specified user or UID\n" " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n" - " --notify-ready=BOOLEAN Receive notifications from the child init process\n\n" + " --notify-ready=BOOLEAN Receive notifications from the child init process\n" + " --suppress-sync=BOOLEAN\n" + " Suppress any form of disk data synchronization\n\n" "%3$sSystem Identity:%4$s\n" " -M --machine=NAME Set the machine name for the container\n" " --hostname=NAME Override the hostname for the container\n" @@ -654,6 +657,12 @@ static int parse_environment(void) { if (e) arg_container_service_name = e; + r = getenv_bool("SYSTEMD_SUPPRESS_SYNC"); + if (r >= 0) + arg_suppress_sync = r; + else if (r != -ENXIO) + log_debug_errno(r, "Failed to parse $SYSTEMD_SUPPRESS_SYNC, ignoring: %m"); + return 
detect_unified_cgroup_hierarchy_from_environment(); } @@ -713,6 +722,7 @@ static int parse_argv(int argc, char *argv[]) { ARG_SET_CREDENTIAL, ARG_LOAD_CREDENTIAL, ARG_BIND_USER, + ARG_SUPPRESS_SYNC, }; static const struct option options[] = { @@ -785,6 +795,7 @@ static int parse_argv(int argc, char *argv[]) { { "set-credential", required_argument, NULL, ARG_SET_CREDENTIAL }, { "load-credential", required_argument, NULL, ARG_LOAD_CREDENTIAL }, { "bind-user", required_argument, NULL, ARG_BIND_USER }, + { "suppress-sync", required_argument, NULL, ARG_SUPPRESS_SYNC }, {} }; @@ -1668,6 +1679,14 @@ static int parse_argv(int argc, char *argv[]) { arg_settings_mask |= SETTING_BIND_USER; break; + case ARG_SUPPRESS_SYNC: + r = parse_boolean_argument("--suppress-sync=", optarg, &arg_suppress_sync); + if (r < 0) + return r; + + arg_settings_mask |= SETTING_SUPPRESS_SYNC; + break; + case '?': return -EINVAL; @@ -3385,6 +3404,12 @@ static int inner_child( return r; } + if (arg_suppress_sync) { + r = seccomp_suppress_sync(); + if (r < 0) + log_debug_errno(r, "Failed to install sync() suppression seccomp filter, ignoring: %m"); + } + #if HAVE_SELINUX if (arg_selinux_context) if (setexeccon(arg_selinux_context) < 0) @@ -4552,6 +4577,9 @@ static int merge_settings(Settings *settings, const char *path) { arg_console_mode = settings->console_mode; } + if ((arg_settings_mask & SETTING_SUPPRESS_SYNC) == 0) + arg_suppress_sync = settings->suppress_sync; + /* The following properties can only be set through the OCI settings logic, not from the command line, hence we * don't consult arg_settings_mask for them. 
*/ diff --git a/src/shared/seccomp-util.c b/src/shared/seccomp-util.c index 31d6b542c0..ff90af538b 100644 --- a/src/shared/seccomp-util.c +++ b/src/shared/seccomp-util.c @@ -2205,3 +2205,98 @@ int parse_syscall_and_errno(const char *in, char **name, int *error) { return 0; } + +static int block_open_flag(scmp_filter_ctx seccomp, int flag) { + bool any = false; + int r; + + /* Blocks open() with the specified flag, where flag is O_SYNC or so. This makes these calls return + * EINVAL, in the hope the client code will retry without O_SYNC then. + * + * One rule is added for open() (only compiled in when SCMP_SYS(open) is positive) and one for + * openat(); each compares the flags argument, masked with the given flag, against the flag itself. + * Where __SNR_openat2 is defined, openat2() is made to fail with ENOSYS regardless of its arguments. + * A rule that cannot be added is logged at debug level only. + * + * Returns 0 if at least one rule could be added, otherwise the result of the last attempt. */ + +#if SCMP_SYS(open) > 0 + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EINVAL), + SCMP_SYS(open), + 1, + SCMP_A1(SCMP_CMP_MASKED_EQ, flag, flag)); + if (r < 0) + log_debug_errno(r, "Failed to add filter for open: %m"); + else + any = true; +#endif + + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(EINVAL), + SCMP_SYS(openat), + 1, + SCMP_A2(SCMP_CMP_MASKED_EQ, flag, flag)); + if (r < 0) + log_debug_errno(r, "Failed to add filter for openat: %m"); + else + any = true; + +#if defined(__SNR_openat2) + /* The new openat2() system call can't be filtered sensibly, see above. */ + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(ENOSYS), + SCMP_SYS(openat2), + 0); + if (r < 0) + log_debug_errno(r, "Failed to add filter for openat2: %m"); + else + any = true; +#endif + + return any ? 
0 : r; +} + +int seccomp_suppress_sync(void) { + uint32_t arch; + int r; + + /* This is mostly identical to SystemCallFilter=~@sync:0, but simpler to use, and separately + * manageable, and also masks O_SYNC/O_DSYNC. + * + * For every architecture iterated by SECCOMP_FOREACH_LOCAL_ARCH() a separate filter with default + * action SCMP_ACT_ALLOW is built and loaded. Rules that cannot be added are logged at debug level + * and skipped, and the results of block_open_flag() are deliberately discarded. + * + * Returns the result of seccomp_init_for_arch() if it is negative, or the result of seccomp_load() + * if ERRNO_IS_SECCOMP_FATAL() matches it; any other load failure is logged at debug level and that + * architecture is skipped. Returns 0 otherwise. */ + + SECCOMP_FOREACH_LOCAL_ARCH(arch) { + _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL; + const char *c; + + r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW); + if (r < 0) + return r; + + NULSTR_FOREACH(c, syscall_filter_sets[SYSCALL_FILTER_SET_SYNC].value) { + int id; + + id = seccomp_syscall_resolve_name(c); + if (id == __NR_SCMP_ERROR) { + log_debug("System call %s is not known, ignoring.", c); + continue; + } + + r = seccomp_rule_add_exact( + seccomp, + SCMP_ACT_ERRNO(0), /* success → we want this to be a NOP after all */ + id, + 0); + if (r < 0) + log_debug_errno(r, "Failed to add filter for system call %s, ignoring: %m", c); + } + + (void) block_open_flag(seccomp, O_SYNC); +#if O_DSYNC != O_SYNC /* a second set of rules is only added where the two flags differ in value */ + (void) block_open_flag(seccomp, O_DSYNC); +#endif + + r = seccomp_load(seccomp); + if (ERRNO_IS_SECCOMP_FATAL(r)) + return r; + if (r < 0) + log_debug_errno(r, "Failed to apply sync() suppression for architecture %s, skipping: %m", seccomp_arch_to_string(arch)); + } + + return 0; +} diff --git a/src/shared/seccomp-util.h b/src/shared/seccomp-util.h index b3d25c9f3f..4f4bc48431 100644 --- a/src/shared/seccomp-util.h +++ b/src/shared/seccomp-util.h @@ -150,3 +150,5 @@ static inline const char *seccomp_errno_or_action_to_string(int num) { } int parse_syscall_and_errno(const char *in, char **name, int *error); + +int seccomp_suppress_sync(void);