mirror of
https://github.com/systemd/systemd.git
synced 2025-01-23 02:04:32 +03:00
Merge pull request #9153 from poettering/private-mounts
introduce PrivateMounts= setting and clean up documentation for MountFlags=
This commit is contained in:
commit
70127be805
9
NEWS
9
NEWS
@ -285,6 +285,15 @@ CHANGES WITH 239 in spe:
|
||||
query the default, built-in $PATH PID 1 will pass to the services it
|
||||
manages.
|
||||
|
||||
* A new unit file setting PrivateMounts= has been added. It's a boolean
|
||||
option. If enabled the unit's processes are invoked in their own file
|
||||
system namespace. Note that this behaviour is also implied if any
|
||||
other file system namespacing options (such as PrivateTmp=,
|
||||
PrivateDevices=, ProtectSystem=, …) are used. This option is hence
|
||||
primarily useful for services that do not use any of the other file
|
||||
system namespacing options. One such service is systemd-udevd.service
|
||||
where this is now used by default.
|
||||
|
||||
Contributions from: Adam Duskett, Alan Jenkins, Alessandro Casale,
|
||||
Alexander Kurtz, Alex Gartrell, Anssi Hannula, Antique, Arnaud
|
||||
Rebillout, Brian J. Murrell, Bruno Vernay, Chris Lesiak, Christian
|
||||
|
@ -1277,28 +1277,69 @@ RestrictNamespaces=~cgroup net</programlisting>
|
||||
stopped. This setting is implied if <varname>DynamicUser=</varname> is set.</para></listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><varname>PrivateMounts=</varname></term>
|
||||
|
||||
<listitem><para>Takes a boolean parameter. If set, the processes of this unit will be run in their own private
|
||||
file system (mount) namespace with all mount propagation from the processes towards the host's main file system
|
||||
namespace turned off. This means any file system mount points established or removed by the unit's processes
|
||||
will be private to them and not be visible to the host. However, file system mount points established or
|
||||
removed on the host will be propagated to the unit's processes. See <citerefentry
|
||||
project='man-pages'><refentrytitle>mount_namespaces</refentrytitle><manvolnum>7</manvolnum></citerefentry> for
|
||||
details on file system namespaces. Defaults to off.</para>
|
||||
|
||||
<para>When turned on, this executes three operations for each invoked process: a new
|
||||
<constant>CLONE_NEWNS</constant> namespace is created, after which all existing mounts are remounted to
|
||||
<constant>MS_SLAVE</constant> to disable propagation from the unit's processes to the host (but leaving
|
||||
propagation in the opposite direction in effect). Finally, the mounts are remounted again to the propagation
|
||||
mode configured with <varname>MountFlags=</varname>, see below.</para>
|
||||
|
||||
<para>File system namespaces are set up individually for each process forked off by the service manager. Mounts
|
||||
established in the namespace of the process created by <varname>ExecStartPre=</varname> will hence be cleaned
|
||||
up automatically as soon as that process exits and will not be available to subsequent processes forked off for
|
||||
<varname>ExecStart=</varname> (and similar applies to the various other commands configured for
|
||||
units). Similarly, <varname>JoinsNamespaceOf=</varname> does not permit sharing kernel mount namespaces between
|
||||
units, it only enables sharing of the <filename>/tmp/</filename> and <filename>/var/tmp/</filename>
|
||||
directories.</para>
|
||||
|
||||
<para>Other file system namespace unit settings —
|
||||
<varname>PrivateTmp=</varname>, <varname>PrivateDevices=</varname>, <varname>ProtectSystem=</varname>,
|
||||
<varname>ProtectHome=</varname>, <varname>ReadOnlyPaths=</varname>, <varname>InaccessiblePaths=</varname>,
|
||||
<varname>ReadWritePaths=</varname>, … — also enable file system namespacing in a fashion equivalent to this
|
||||
option. Hence it is primarily useful to explicitly request this behaviour if none of the other settings are
|
||||
used.</para></listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><varname>MountFlags=</varname></term>
|
||||
|
||||
<listitem><para>Takes a mount propagation flag: <option>shared</option>, <option>slave</option> or
|
||||
<option>private</option>, which control whether mounts in the file system namespace set up for this unit's
|
||||
processes will receive or propagate mounts and unmounts. See <citerefentry
|
||||
project='man-pages'><refentrytitle>mount</refentrytitle><manvolnum>2</manvolnum></citerefentry> for
|
||||
details. Defaults to <option>shared</option>. Use <option>shared</option> to ensure that mounts and unmounts
|
||||
are propagated from systemd's namespace to the service's namespace and vice versa. Use <option>slave</option>
|
||||
to run processes so that none of their mounts and unmounts will propagate to the host. Use
|
||||
<option>private</option> to also ensure that no mounts and unmounts from the host will propagate into the unit
|
||||
processes' namespace. If this is set to <option>slave</option> or <option>private</option>, any mounts created
|
||||
by spawned processes will be unmounted after the completion of the current command line of
|
||||
<varname>ExecStartPre=</varname>, <varname>ExecStartPost=</varname>, <varname>ExecStart=</varname>, and
|
||||
<varname>ExecStopPost=</varname>. Note that <option>slave</option> means that file systems mounted on the host
|
||||
might stay mounted continuously in the unit's namespace, and thus keep the device busy. Note that the file
|
||||
system namespace related options (<varname>PrivateTmp=</varname>, <varname>PrivateDevices=</varname>,
|
||||
<varname>ProtectSystem=</varname>, <varname>ProtectHome=</varname>, <varname>ProtectKernelTunables=</varname>,
|
||||
<varname>ProtectControlGroups=</varname>, <varname>ReadOnlyPaths=</varname>,
|
||||
<varname>InaccessiblePaths=</varname>, <varname>ReadWritePaths=</varname>) require that mount and unmount
|
||||
propagation from the unit's file system namespace is disabled, and hence downgrade <option>shared</option> to
|
||||
<option>slave</option>. </para></listitem>
|
||||
<listitem><para>Takes a mount propagation setting: <option>shared</option>, <option>slave</option> or
|
||||
<option>private</option>, which controls whether file system mount points in the file system namespaces set up
|
||||
for this unit's processes will receive or propagate mounts and unmounts from other file system namespaces. See
|
||||
<citerefentry project='man-pages'><refentrytitle>mount</refentrytitle><manvolnum>2</manvolnum></citerefentry>
|
||||
for details on mount propagation, and the three propagation flags in particular.</para>
|
||||
|
||||
<para>This setting only controls the <emphasis>final</emphasis> propagation setting in effect on all mount
|
||||
points of the file system namespace created for each process of this unit. Other file system namespacing unit
|
||||
settings (see the discussion in <varname>PrivateMounts=</varname> above) will implicitly disable mount and
|
||||
unmount propagation from the unit's processes towards the host by changing the propagation setting of all mount
|
||||
points in the unit's file system namespace to <option>slave</option> first. Setting this option to
|
||||
<option>shared</option> does not reestablish propagation in that case. Conversely, if this option is set, but
|
||||
no other file system namespace setting is used, then new file system namespaces will be created for the unit's
|
||||
processes and this propagation flag will be applied right away to all mounts within it, without the
|
||||
intermediary application of <option>slave</option>.</para>
|
||||
|
||||
<para>If not set – but file system namespaces are enabled through another file system namespace unit setting –
|
||||
<option>shared</option> mount propagation is used, but — as mentioned — as <option>slave</option> is applied
|
||||
first, propagation from the unit's processes to the host is still turned off.</para>
|
||||
|
||||
<para>It is not recommended to use <option>private</option> mount propagation for units, as this means
|
||||
temporary mounts (such as removable media) of the host will stay mounted and thus indefinitely busy in forked
|
||||
off processes, as unmount propagation events won't be received by the file system namespace of the unit.</para>
|
||||
|
||||
<para>Usually, it is best to leave this setting unmodified, and use higher level file system namespacing
|
||||
options instead, in particular <varname>PrivateMounts=</varname>, see above.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
</variablelist>
|
||||
|
@ -744,6 +744,7 @@ const sd_bus_vtable bus_exec_vtable[] = {
|
||||
SD_BUS_PROPERTY("ProtectControlGroups", "b", bus_property_get_bool, offsetof(ExecContext, protect_control_groups), SD_BUS_VTABLE_PROPERTY_CONST),
|
||||
SD_BUS_PROPERTY("PrivateNetwork", "b", bus_property_get_bool, offsetof(ExecContext, private_network), SD_BUS_VTABLE_PROPERTY_CONST),
|
||||
SD_BUS_PROPERTY("PrivateUsers", "b", bus_property_get_bool, offsetof(ExecContext, private_users), SD_BUS_VTABLE_PROPERTY_CONST),
|
||||
SD_BUS_PROPERTY("PrivateMounts", "b", bus_property_get_bool, offsetof(ExecContext, private_mounts), SD_BUS_VTABLE_PROPERTY_CONST),
|
||||
SD_BUS_PROPERTY("ProtectHome", "s", property_get_protect_home, offsetof(ExecContext, protect_home), SD_BUS_VTABLE_PROPERTY_CONST),
|
||||
SD_BUS_PROPERTY("ProtectSystem", "s", property_get_protect_system, offsetof(ExecContext, protect_system), SD_BUS_VTABLE_PROPERTY_CONST),
|
||||
SD_BUS_PROPERTY("SameProcessGroup", "b", bus_property_get_bool, offsetof(ExecContext, same_pgrp), SD_BUS_VTABLE_PROPERTY_CONST),
|
||||
@ -1110,6 +1111,9 @@ int bus_exec_context_set_transient_property(
|
||||
if (streq(name, "PrivateDevices"))
|
||||
return bus_set_transient_bool(u, name, &c->private_devices, message, flags, error);
|
||||
|
||||
if (streq(name, "PrivateMounts"))
|
||||
return bus_set_transient_bool(u, name, &c->private_mounts, message, flags, error);
|
||||
|
||||
if (streq(name, "PrivateNetwork"))
|
||||
return bus_set_transient_bool(u, name, &c->private_network, message, flags, error);
|
||||
|
||||
|
@ -1780,6 +1780,7 @@ static bool exec_needs_mount_namespace(
|
||||
return true;
|
||||
|
||||
if (context->private_devices ||
|
||||
context->private_mounts ||
|
||||
context->protect_system != PROTECT_SYSTEM_NO ||
|
||||
context->protect_home != PROTECT_HOME_NO ||
|
||||
context->protect_kernel_tunables ||
|
||||
@ -2312,7 +2313,7 @@ static int apply_mount_namespace(
|
||||
_cleanup_strv_free_ char **empty_directories = NULL;
|
||||
char *tmp = NULL, *var = NULL;
|
||||
const char *root_dir = NULL, *root_image = NULL;
|
||||
NamespaceInfo ns_info = {};
|
||||
NamespaceInfo ns_info;
|
||||
bool needs_sandboxing;
|
||||
BindMount *bind_mounts = NULL;
|
||||
size_t n_bind_mounts = 0;
|
||||
@ -2342,16 +2343,7 @@ static int apply_mount_namespace(
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
/*
|
||||
* If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
|
||||
* sandbox info, otherwise enforce it, don't ignore protected paths and
|
||||
* fail if we are enable to apply the sandbox inside the mount namespace.
|
||||
*/
|
||||
if (!context->dynamic_user && root_dir)
|
||||
ns_info.ignore_protect_paths = true;
|
||||
|
||||
needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
|
||||
|
||||
if (needs_sandboxing)
|
||||
ns_info = (NamespaceInfo) {
|
||||
.ignore_protect_paths = false,
|
||||
@ -2360,7 +2352,19 @@ static int apply_mount_namespace(
|
||||
.protect_kernel_tunables = context->protect_kernel_tunables,
|
||||
.protect_kernel_modules = context->protect_kernel_modules,
|
||||
.mount_apivfs = context->mount_apivfs,
|
||||
.private_mounts = context->private_mounts,
|
||||
};
|
||||
else if (!context->dynamic_user && root_dir)
|
||||
/*
|
||||
* If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
|
||||
* sandbox info, otherwise enforce it, don't ignore protected paths and
|
||||
* fail if we are unable to apply the sandbox inside the mount namespace.
|
||||
*/
|
||||
ns_info = (NamespaceInfo) {
|
||||
.ignore_protect_paths = true,
|
||||
};
|
||||
else
|
||||
ns_info = (NamespaceInfo) {};
|
||||
|
||||
r = setup_namespace(root_dir, root_image,
|
||||
&ns_info, context->read_write_paths,
|
||||
|
@ -228,6 +228,7 @@ struct ExecContext {
|
||||
bool private_network;
|
||||
bool private_devices;
|
||||
bool private_users;
|
||||
bool private_mounts;
|
||||
ProtectSystem protect_system;
|
||||
ProtectHome protect_home;
|
||||
bool protect_kernel_tunables;
|
||||
|
@ -114,6 +114,7 @@ $1.ProtectKernelModules, config_parse_bool, 0,
|
||||
$1.ProtectControlGroups, config_parse_bool, 0, offsetof($1, exec_context.protect_control_groups)
|
||||
$1.PrivateNetwork, config_parse_bool, 0, offsetof($1, exec_context.private_network)
|
||||
$1.PrivateUsers, config_parse_bool, 0, offsetof($1, exec_context.private_users)
|
||||
$1.PrivateMounts, config_parse_bool, 0, offsetof($1, exec_context.private_mounts)
|
||||
$1.ProtectSystem, config_parse_protect_system, 0, offsetof($1, exec_context.protect_system)
|
||||
$1.ProtectHome, config_parse_protect_home, 0, offsetof($1, exec_context.protect_home)
|
||||
$1.MountFlags, config_parse_exec_mount_flags, 0, offsetof($1, exec_context.mount_flags)
|
||||
|
@ -1133,9 +1133,9 @@ int setup_namespace(
|
||||
_cleanup_free_ void *root_hash = NULL;
|
||||
MountEntry *m, *mounts = NULL;
|
||||
size_t root_hash_size = 0;
|
||||
bool make_slave = false;
|
||||
const char *root;
|
||||
size_t n_mounts;
|
||||
bool make_slave;
|
||||
bool require_prefix = false;
|
||||
int r = 0;
|
||||
|
||||
@ -1200,8 +1200,7 @@ int setup_namespace(
|
||||
protect_home, protect_system);
|
||||
|
||||
/* Set mount slave mode */
|
||||
if (root || n_mounts > 0)
|
||||
make_slave = true;
|
||||
make_slave = root || n_mounts > 0 || ns_info->private_mounts;
|
||||
|
||||
if (n_mounts > 0) {
|
||||
m = mounts = (MountEntry *) alloca0(n_mounts * sizeof(MountEntry));
|
||||
|
@ -50,6 +50,7 @@ typedef enum ProtectSystem {
|
||||
struct NamespaceInfo {
|
||||
bool ignore_protect_paths:1;
|
||||
bool private_dev:1;
|
||||
bool private_mounts:1;
|
||||
bool protect_control_groups:1;
|
||||
bool protect_kernel_tunables:1;
|
||||
bool protect_kernel_modules:1;
|
||||
|
@ -699,7 +699,7 @@ static int bus_append_execute_property(sd_bus_message *m, const char *field, con
|
||||
if (STR_IN_SET(field,
|
||||
"IgnoreSIGPIPE", "TTYVHangup", "TTYReset", "TTYVTDisallocate",
|
||||
"PrivateTmp", "PrivateDevices", "PrivateNetwork", "PrivateUsers",
|
||||
"NoNewPrivileges", "SyslogLevelPrefix",
|
||||
"PrivateMounts", "NoNewPrivileges", "SyslogLevelPrefix",
|
||||
"MemoryDenyWriteExecute", "RestrictRealtime", "DynamicUser", "RemoveIPC",
|
||||
"ProtectKernelTunables", "ProtectKernelModules", "ProtectControlGroups",
|
||||
"MountAPIVFS", "CPUSchedulingResetOnFork", "LockPersonality"))
|
||||
|
@ -25,7 +25,7 @@ ExecStart=@rootlibexecdir@/systemd-udevd
|
||||
KillMode=mixed
|
||||
WatchdogSec=3min
|
||||
TasksMax=infinity
|
||||
MountFlags=slave
|
||||
PrivateMounts=yes
|
||||
MemoryDenyWriteExecute=yes
|
||||
RestrictRealtime=yes
|
||||
RestrictAddressFamilies=AF_UNIX AF_NETLINK AF_INET AF_INET6
|
||||
|
Loading…
x
Reference in New Issue
Block a user