mirror of
https://github.com/systemd/systemd-stable.git
synced 2025-01-11 05:17:44 +03:00
nspawn: add --suppress-sync=yes mode for turning sync() and friends into NOPs via seccomp
This is supposed to be used by package/image builders such as mkosi to speed up building, since it allows us to suppress sync() inside a container. This does what Debian's eatmydata tool does, but for a container, and via seccomp (instead of LD_PRELOAD).
This commit is contained in:
parent
231c7645ca
commit
4a4654e024
@ -138,6 +138,12 @@ All tools:
|
|||||||
* `$SYSTEMD_NSPAWN_TMPFS_TMP=0` — if set, do not overmount `/tmp/` in the
|
* `$SYSTEMD_NSPAWN_TMPFS_TMP=0` — if set, do not overmount `/tmp/` in the
|
||||||
container with a tmpfs, but leave the directory from the image in place.
|
container with a tmpfs, but leave the directory from the image in place.
|
||||||
|
|
||||||
|
* `$SYSTEMD_SUPPRESS_SYNC=1` — if set, all disk synchronization syscalls are
|
||||||
|
turned into NOPs for the container payload (e.g. `sync()`, `fsync()`, `syncfs()`, …)
|
||||||
|
and the `O_SYNC`/`O_DSYNC` flags are made unavailable to `open()` and
|
||||||
|
friends. This is equivalent to passing `--suppress-sync=yes` on the
|
||||||
|
`systemd-nspawn` command line.
|
||||||
|
|
||||||
`systemd-logind`:
|
`systemd-logind`:
|
||||||
|
|
||||||
* `$SYSTEMD_BYPASS_HIBERNATION_MEMORY_CHECK=1` — if set, report that
|
* `$SYSTEMD_BYPASS_HIBERNATION_MEMORY_CHECK=1` — if set, report that
|
||||||
|
@ -570,6 +570,24 @@
|
|||||||
before sending its own to systemd. For more details about notifications
|
before sending its own to systemd. For more details about notifications
|
||||||
see <citerefentry><refentrytitle>sd_notify</refentrytitle><manvolnum>3</manvolnum></citerefentry>.</para></listitem>
|
see <citerefentry><refentrytitle>sd_notify</refentrytitle><manvolnum>3</manvolnum></citerefentry>.</para></listitem>
|
||||||
</varlistentry>
|
</varlistentry>
|
||||||
|
|
||||||
|
<varlistentry>
|
||||||
|
<term><option>--suppress-sync=</option></term>
|
||||||
|
|
||||||
|
<listitem><para>Expects a boolean argument. If true, turns off any form of on-disk file system
|
||||||
|
synchronization for the container payload. This means all system calls such as <citerefentry
|
||||||
|
project='man-pages'><refentrytitle>sync</refentrytitle><manvolnum>2</manvolnum></citerefentry>,
|
||||||
|
<function>fsync()</function>, <function>syncfs()</function>, … will execute no operation, and the
|
||||||
|
<constant>O_SYNC</constant>/<constant>O_DSYNC</constant> flags to <citerefentry
|
||||||
|
project='man-pages'><refentrytitle>open</refentrytitle><manvolnum>2</manvolnum></citerefentry> and
|
||||||
|
related calls will be made unavailable. This is potentially dangerous, as assumed data integrity
|
||||||
|
guarantees to the container payload are not actually enforced (i.e. data assumed to have been written
|
||||||
|
to disk might be lost if the system is shut down abnormally). However, this can dramatically improve
|
||||||
|
container runtime performance – as long as these guarantees are not required or desirable, for
|
||||||
|
example because any data written by the container is of temporary, redundant nature, or just an
|
||||||
|
intermediary artifact that will be further processed and finalized by a later step in a
|
||||||
|
pipeline. Defaults to false.</para></listitem>
|
||||||
|
</varlistentry>
|
||||||
</variablelist>
|
</variablelist>
|
||||||
|
|
||||||
</refsect2><refsect2>
|
</refsect2><refsect2>
|
||||||
|
@ -365,6 +365,16 @@
|
|||||||
details.</para></listitem>
|
details.</para></listitem>
|
||||||
</varlistentry>
|
</varlistentry>
|
||||||
|
|
||||||
|
<varlistentry>
|
||||||
|
<term><varname>SuppressSync=</varname></term>
|
||||||
|
|
||||||
|
<listitem><para>Configures whether to suppress disk synchronization for the container payload. This
|
||||||
|
is equivalent to the <option>--suppress-sync=</option> command line switch, and takes the same
|
||||||
|
parameter. See
|
||||||
|
<citerefentry><refentrytitle>systemd-nspawn</refentrytitle><manvolnum>1</manvolnum></citerefentry>
|
||||||
|
for details.</para></listitem>
|
||||||
|
</varlistentry>
|
||||||
|
|
||||||
</variablelist>
|
</variablelist>
|
||||||
</refsect1>
|
</refsect1>
|
||||||
|
|
||||||
|
@ -63,7 +63,7 @@ _systemd_nspawn() {
|
|||||||
|
|
||||||
local -A OPTS=(
|
local -A OPTS=(
|
||||||
[STANDALONE]='-h --help --version --private-network -b --boot --read-only -q --quiet --share-system
|
[STANDALONE]='-h --help --version --private-network -b --boot --read-only -q --quiet --share-system
|
||||||
--keep-unit -n --network-veth -j -x --ephemeral -a --as-pid2 -U'
|
--keep-unit -n --network-veth -j -x --ephemeral -a --as-pid2 -U --suppress-sync=yes'
|
||||||
[ARG]='-D --directory -u --user --uuid --capability --drop-capability --link-journal --bind --bind-ro
|
[ARG]='-D --directory -u --user --uuid --capability --drop-capability --link-journal --bind --bind-ro
|
||||||
-M --machine -S --slice -E --setenv -Z --selinux-context -L --selinux-apifs-context
|
-M --machine -S --slice -E --setenv -Z --selinux-context -L --selinux-apifs-context
|
||||||
--register --network-interface --network-bridge --personality -i --image --tmpfs
|
--register --network-interface --network-bridge --personality -i --image --tmpfs
|
||||||
|
@ -59,6 +59,7 @@ Exec.CPUAffinity, config_parse_cpu_affinity, 0, 0
|
|||||||
Exec.ResolvConf, config_parse_resolv_conf, 0, offsetof(Settings, resolv_conf)
|
Exec.ResolvConf, config_parse_resolv_conf, 0, offsetof(Settings, resolv_conf)
|
||||||
Exec.LinkJournal, config_parse_link_journal, 0, 0
|
Exec.LinkJournal, config_parse_link_journal, 0, 0
|
||||||
Exec.Timezone, config_parse_timezone, 0, offsetof(Settings, timezone)
|
Exec.Timezone, config_parse_timezone, 0, offsetof(Settings, timezone)
|
||||||
|
Exec.SuppressSync, config_parse_bool, 0, offsetof(Settings, suppress_sync)
|
||||||
Files.ReadOnly, config_parse_tristate, 0, offsetof(Settings, read_only)
|
Files.ReadOnly, config_parse_tristate, 0, offsetof(Settings, read_only)
|
||||||
Files.Volatile, config_parse_volatile_mode, 0, offsetof(Settings, volatile_mode)
|
Files.Volatile, config_parse_volatile_mode, 0, offsetof(Settings, volatile_mode)
|
||||||
Files.Bind, config_parse_bind, 0, 0
|
Files.Bind, config_parse_bind, 0, 0
|
||||||
|
@ -127,9 +127,10 @@ typedef enum SettingsMask {
|
|||||||
SETTING_CONSOLE_MODE = UINT64_C(1) << 29,
|
SETTING_CONSOLE_MODE = UINT64_C(1) << 29,
|
||||||
SETTING_CREDENTIALS = UINT64_C(1) << 30,
|
SETTING_CREDENTIALS = UINT64_C(1) << 30,
|
||||||
SETTING_BIND_USER = UINT64_C(1) << 31,
|
SETTING_BIND_USER = UINT64_C(1) << 31,
|
||||||
SETTING_RLIMIT_FIRST = UINT64_C(1) << 32, /* we define one bit per resource limit here */
|
SETTING_SUPPRESS_SYNC = UINT64_C(1) << 32,
|
||||||
SETTING_RLIMIT_LAST = UINT64_C(1) << (32 + _RLIMIT_MAX - 1),
|
SETTING_RLIMIT_FIRST = UINT64_C(1) << 33, /* we define one bit per resource limit here */
|
||||||
_SETTINGS_MASK_ALL = (UINT64_C(1) << (32 + _RLIMIT_MAX)) -1,
|
SETTING_RLIMIT_LAST = UINT64_C(1) << (33 + _RLIMIT_MAX - 1),
|
||||||
|
_SETTINGS_MASK_ALL = (UINT64_C(1) << (33 + _RLIMIT_MAX)) -1,
|
||||||
_SETTING_FORCE_ENUM_WIDTH = UINT64_MAX
|
_SETTING_FORCE_ENUM_WIDTH = UINT64_MAX
|
||||||
} SettingsMask;
|
} SettingsMask;
|
||||||
|
|
||||||
@ -189,6 +190,7 @@ typedef struct Settings {
|
|||||||
LinkJournal link_journal;
|
LinkJournal link_journal;
|
||||||
bool link_journal_try;
|
bool link_journal_try;
|
||||||
TimezoneMode timezone;
|
TimezoneMode timezone;
|
||||||
|
bool suppress_sync;
|
||||||
|
|
||||||
/* [Files] */
|
/* [Files] */
|
||||||
int read_only;
|
int read_only;
|
||||||
|
@ -229,6 +229,7 @@ static ConsoleMode arg_console_mode = _CONSOLE_MODE_INVALID;
|
|||||||
static Credential *arg_credentials = NULL;
|
static Credential *arg_credentials = NULL;
|
||||||
static size_t arg_n_credentials = 0;
|
static size_t arg_n_credentials = 0;
|
||||||
static char **arg_bind_user = NULL;
|
static char **arg_bind_user = NULL;
|
||||||
|
static bool arg_suppress_sync = false;
|
||||||
|
|
||||||
STATIC_DESTRUCTOR_REGISTER(arg_directory, freep);
|
STATIC_DESTRUCTOR_REGISTER(arg_directory, freep);
|
||||||
STATIC_DESTRUCTOR_REGISTER(arg_template, freep);
|
STATIC_DESTRUCTOR_REGISTER(arg_template, freep);
|
||||||
@ -342,7 +343,9 @@ static int help(void) {
|
|||||||
" -E --setenv=NAME[=VALUE] Pass an environment variable to PID 1\n"
|
" -E --setenv=NAME[=VALUE] Pass an environment variable to PID 1\n"
|
||||||
" -u --user=USER Run the command under specified user or UID\n"
|
" -u --user=USER Run the command under specified user or UID\n"
|
||||||
" --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
|
" --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
|
||||||
" --notify-ready=BOOLEAN Receive notifications from the child init process\n\n"
|
" --notify-ready=BOOLEAN Receive notifications from the child init process\n"
|
||||||
|
" --suppress-sync=BOOLEAN\n"
|
||||||
|
" Suppress any form of disk data synchronization\n\n"
|
||||||
"%3$sSystem Identity:%4$s\n"
|
"%3$sSystem Identity:%4$s\n"
|
||||||
" -M --machine=NAME Set the machine name for the container\n"
|
" -M --machine=NAME Set the machine name for the container\n"
|
||||||
" --hostname=NAME Override the hostname for the container\n"
|
" --hostname=NAME Override the hostname for the container\n"
|
||||||
@ -654,6 +657,12 @@ static int parse_environment(void) {
|
|||||||
if (e)
|
if (e)
|
||||||
arg_container_service_name = e;
|
arg_container_service_name = e;
|
||||||
|
|
||||||
|
r = getenv_bool("SYSTEMD_SUPPRESS_SYNC");
|
||||||
|
if (r >= 0)
|
||||||
|
arg_suppress_sync = r;
|
||||||
|
else if (r != -ENXIO)
|
||||||
|
log_debug_errno(r, "Failed to parse $SYSTEMD_SUPPRESS_SYNC, ignoring: %m");
|
||||||
|
|
||||||
return detect_unified_cgroup_hierarchy_from_environment();
|
return detect_unified_cgroup_hierarchy_from_environment();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -713,6 +722,7 @@ static int parse_argv(int argc, char *argv[]) {
|
|||||||
ARG_SET_CREDENTIAL,
|
ARG_SET_CREDENTIAL,
|
||||||
ARG_LOAD_CREDENTIAL,
|
ARG_LOAD_CREDENTIAL,
|
||||||
ARG_BIND_USER,
|
ARG_BIND_USER,
|
||||||
|
ARG_SUPPRESS_SYNC,
|
||||||
};
|
};
|
||||||
|
|
||||||
static const struct option options[] = {
|
static const struct option options[] = {
|
||||||
@ -785,6 +795,7 @@ static int parse_argv(int argc, char *argv[]) {
|
|||||||
{ "set-credential", required_argument, NULL, ARG_SET_CREDENTIAL },
|
{ "set-credential", required_argument, NULL, ARG_SET_CREDENTIAL },
|
||||||
{ "load-credential", required_argument, NULL, ARG_LOAD_CREDENTIAL },
|
{ "load-credential", required_argument, NULL, ARG_LOAD_CREDENTIAL },
|
||||||
{ "bind-user", required_argument, NULL, ARG_BIND_USER },
|
{ "bind-user", required_argument, NULL, ARG_BIND_USER },
|
||||||
|
{ "suppress-sync", required_argument, NULL, ARG_SUPPRESS_SYNC },
|
||||||
{}
|
{}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -1668,6 +1679,14 @@ static int parse_argv(int argc, char *argv[]) {
|
|||||||
arg_settings_mask |= SETTING_BIND_USER;
|
arg_settings_mask |= SETTING_BIND_USER;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case ARG_SUPPRESS_SYNC:
|
||||||
|
r = parse_boolean_argument("--suppress-sync=", optarg, &arg_suppress_sync);
|
||||||
|
if (r < 0)
|
||||||
|
return r;
|
||||||
|
|
||||||
|
arg_settings_mask |= SETTING_SUPPRESS_SYNC;
|
||||||
|
break;
|
||||||
|
|
||||||
case '?':
|
case '?':
|
||||||
return -EINVAL;
|
return -EINVAL;
|
||||||
|
|
||||||
@ -3385,6 +3404,12 @@ static int inner_child(
|
|||||||
return r;
|
return r;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (arg_suppress_sync) {
|
||||||
|
r = seccomp_suppress_sync();
|
||||||
|
if (r < 0)
|
||||||
|
log_debug_errno(r, "Failed to install sync() suppression seccomp filter, ignoring: %m");
|
||||||
|
}
|
||||||
|
|
||||||
#if HAVE_SELINUX
|
#if HAVE_SELINUX
|
||||||
if (arg_selinux_context)
|
if (arg_selinux_context)
|
||||||
if (setexeccon(arg_selinux_context) < 0)
|
if (setexeccon(arg_selinux_context) < 0)
|
||||||
@ -4552,6 +4577,9 @@ static int merge_settings(Settings *settings, const char *path) {
|
|||||||
arg_console_mode = settings->console_mode;
|
arg_console_mode = settings->console_mode;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if ((arg_settings_mask & SETTING_SUPPRESS_SYNC) == 0)
|
||||||
|
arg_suppress_sync = settings->suppress_sync;
|
||||||
|
|
||||||
/* The following properties can only be set through the OCI settings logic, not from the command line, hence we
|
/* The following properties can only be set through the OCI settings logic, not from the command line, hence we
|
||||||
* don't consult arg_settings_mask for them. */
|
* don't consult arg_settings_mask for them. */
|
||||||
|
|
||||||
|
@ -2205,3 +2205,98 @@ int parse_syscall_and_errno(const char *in, char **name, int *error) {
|
|||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int block_open_flag(scmp_filter_ctx seccomp, int flag) {
        unsigned n_installed = 0;
        int rc;

        /* Adds rules to the given filter context that make the open() family of system calls fail with
         * EINVAL whenever the bits of 'flag' (O_SYNC or similar) are set in their flags argument. The hope
         * is that callers will then simply retry without the flag.
         *
         * Every rule is added on a best-effort basis: a failure is only logged at debug level. Returns 0
         * if at least one rule could be added, and otherwise the error code of the last attempt. */

#if SCMP_SYS(open) > 0
        /* Only compiled in where SCMP_SYS(open) is positive. open() carries its flags in argument 1. */
        rc = seccomp_rule_add_exact(
                        seccomp,
                        SCMP_ACT_ERRNO(EINVAL),
                        SCMP_SYS(open),
                        1,
                        SCMP_A1(SCMP_CMP_MASKED_EQ, flag, flag));
        if (rc >= 0)
                n_installed++;
        else
                log_debug_errno(rc, "Failed to add filter for open: %m");
#endif

        /* openat() carries its flags in argument 2. */
        rc = seccomp_rule_add_exact(
                        seccomp,
                        SCMP_ACT_ERRNO(EINVAL),
                        SCMP_SYS(openat),
                        1,
                        SCMP_A2(SCMP_CMP_MASKED_EQ, flag, flag));
        if (rc >= 0)
                n_installed++;
        else
                log_debug_errno(rc, "Failed to add filter for openat: %m");

#if defined(__SNR_openat2)
        /* openat2() cannot be filtered sensibly by flag, hence refuse the call as a whole with ENOSYS
         * (no argument matching here, i.e. the rule applies unconditionally). */
        rc = seccomp_rule_add_exact(
                        seccomp,
                        SCMP_ACT_ERRNO(ENOSYS),
                        SCMP_SYS(openat2),
                        0);
        if (rc >= 0)
                n_installed++;
        else
                log_debug_errno(rc, "Failed to add filter for openat2: %m");
#endif

        if (n_installed > 0)
                return 0;

        return rc;
}
|
||||||
|
|
||||||
|
int seccomp_suppress_sync(void) {
|
||||||
|
uint32_t arch;
|
||||||
|
int r;
|
||||||
|
|
||||||
|
/* This is mostly identical to SystemCallFilter=~@sync:0, but simpler to use, and separately
|
||||||
|
* manageable, and also masks O_SYNC/O_DSYNC */
|
||||||
|
|
||||||
|
SECCOMP_FOREACH_LOCAL_ARCH(arch) {
|
||||||
|
_cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
|
||||||
|
const char *c;
|
||||||
|
|
||||||
|
r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
|
||||||
|
if (r < 0)
|
||||||
|
return r;
|
||||||
|
|
||||||
|
NULSTR_FOREACH(c, syscall_filter_sets[SYSCALL_FILTER_SET_SYNC].value) {
|
||||||
|
int id;
|
||||||
|
|
||||||
|
id = seccomp_syscall_resolve_name(c);
|
||||||
|
if (id == __NR_SCMP_ERROR) {
|
||||||
|
log_debug("System call %s is not known, ignoring.", c);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
r = seccomp_rule_add_exact(
|
||||||
|
seccomp,
|
||||||
|
SCMP_ACT_ERRNO(0), /* success → we want this to be a NOP after all */
|
||||||
|
id,
|
||||||
|
0);
|
||||||
|
if (r < 0)
|
||||||
|
log_debug_errno(r, "Failed to add filter for system call %s, ignoring: %m", c);
|
||||||
|
}
|
||||||
|
|
||||||
|
(void) block_open_flag(seccomp, O_SYNC);
|
||||||
|
#if O_DSYNC != O_SYNC
|
||||||
|
(void) block_open_flag(seccomp, O_DSYNC);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
r = seccomp_load(seccomp);
|
||||||
|
if (ERRNO_IS_SECCOMP_FATAL(r))
|
||||||
|
return r;
|
||||||
|
if (r < 0)
|
||||||
|
log_debug_errno(r, "Failed to apply sync() suppression for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
@ -150,3 +150,5 @@ static inline const char *seccomp_errno_or_action_to_string(int num) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
int parse_syscall_and_errno(const char *in, char **name, int *error);
|
int parse_syscall_and_errno(const char *in, char **name, int *error);
|
||||||
|
|
||||||
|
int seccomp_suppress_sync(void);
|
||||||
|
Loading…
Reference in New Issue
Block a user