mirror of
https://github.com/systemd/systemd-stable.git
synced 2025-01-11 05:17:44 +03:00
core: add a per-unit setting MountAPIVFS= for mounting /dev, /proc, /sys in conjunction with RootDirectory=
This adds a boolean unit file setting MountAPIVFS=. If set, the three main API VFS mounts will be mounted for the service. This only has an effect in conjunction with RootDirectory=, which it makes a ton more useful. (This is basically the /dev + /proc + /sys mounting code posted in the original #4727, but rebased on current git, and with the automatic logic replaced by explicit logic controlled by a unit file setting.)
This commit is contained in:
parent
1eb7e08e20
commit
5d997827e2
@ -132,8 +132,22 @@
|
|||||||
the <function>chroot()</function> jail. Note that setting this parameter might result in additional
|
the <function>chroot()</function> jail. Note that setting this parameter might result in additional
|
||||||
dependencies to be added to the unit (see above).</para>
|
dependencies to be added to the unit (see above).</para>
|
||||||
|
|
||||||
<para>The <varname>PrivateUsers=</varname> setting is particularly useful in conjunction with
|
<para>The <varname>MountAPIVFS=</varname> and <varname>PrivateUsers=</varname> settings are particularly useful
|
||||||
<varname>RootDirectory=</varname>. For details, see below.</para></listitem>
|
in conjunction with <varname>RootDirectory=</varname>. For details, see below.</para></listitem>
|
||||||
|
</varlistentry>
|
||||||
|
|
||||||
|
<varlistentry>
|
||||||
|
<term><varname>MountAPIVFS=</varname></term>
|
||||||
|
|
||||||
|
<listitem><para>Takes a boolean argument. If on, a private mount namespace for the unit's processes is created
|
||||||
|
and the API file systems <filename>/proc</filename>, <filename>/sys</filename> and <filename>/dev</filename>
|
||||||
|
will be mounted inside of it, unless they are already mounted. Note that this option has no effect unless used
|
||||||
|
in conjunction with <varname>RootDirectory=</varname> as these three mounts are generally mounted in the host
|
||||||
|
anyway, and unless the root directory is changed the private mount namespace will be a 1:1 copy of the host's,
|
||||||
|
and include these three mounts. Note that the <filename>/dev</filename> file system of the host is bind mounted
|
||||||
|
if this option is used without <varname>PrivateDevices=</varname>. To run the service with a private, minimal
|
||||||
|
version of <filename>/dev/</filename>, combine this option with
|
||||||
|
<varname>PrivateDevices=</varname>.</para></listitem>
|
||||||
</varlistentry>
|
</varlistentry>
|
||||||
|
|
||||||
<varlistentry>
|
<varlistentry>
|
||||||
|
@ -828,6 +828,7 @@ const sd_bus_vtable bus_exec_vtable[] = {
|
|||||||
SD_BUS_PROPERTY("RestrictNamespaces", "t", bus_property_get_ulong, offsetof(ExecContext, restrict_namespaces), SD_BUS_VTABLE_PROPERTY_CONST),
|
SD_BUS_PROPERTY("RestrictNamespaces", "t", bus_property_get_ulong, offsetof(ExecContext, restrict_namespaces), SD_BUS_VTABLE_PROPERTY_CONST),
|
||||||
SD_BUS_PROPERTY("BindPaths", "a(ssbt)", property_get_bind_paths, 0, SD_BUS_VTABLE_PROPERTY_CONST),
|
SD_BUS_PROPERTY("BindPaths", "a(ssbt)", property_get_bind_paths, 0, SD_BUS_VTABLE_PROPERTY_CONST),
|
||||||
SD_BUS_PROPERTY("BindReadOnlyPaths", "a(ssbt)", property_get_bind_paths, 0, SD_BUS_VTABLE_PROPERTY_CONST),
|
SD_BUS_PROPERTY("BindReadOnlyPaths", "a(ssbt)", property_get_bind_paths, 0, SD_BUS_VTABLE_PROPERTY_CONST),
|
||||||
|
SD_BUS_PROPERTY("MountAPIVFS", "b", bus_property_get_bool, offsetof(ExecContext, mount_apivfs), SD_BUS_VTABLE_PROPERTY_CONST),
|
||||||
SD_BUS_VTABLE_END
|
SD_BUS_VTABLE_END
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -1207,7 +1208,7 @@ int bus_exec_context_set_transient_property(
|
|||||||
"PrivateTmp", "PrivateDevices", "PrivateNetwork", "PrivateUsers",
|
"PrivateTmp", "PrivateDevices", "PrivateNetwork", "PrivateUsers",
|
||||||
"NoNewPrivileges", "SyslogLevelPrefix", "MemoryDenyWriteExecute",
|
"NoNewPrivileges", "SyslogLevelPrefix", "MemoryDenyWriteExecute",
|
||||||
"RestrictRealtime", "DynamicUser", "RemoveIPC", "ProtectKernelTunables",
|
"RestrictRealtime", "DynamicUser", "RemoveIPC", "ProtectKernelTunables",
|
||||||
"ProtectKernelModules", "ProtectControlGroups")) {
|
"ProtectKernelModules", "ProtectControlGroups", "MountAPIVFS")) {
|
||||||
int b;
|
int b;
|
||||||
|
|
||||||
r = sd_bus_message_read(message, "b", &b);
|
r = sd_bus_message_read(message, "b", &b);
|
||||||
@ -1247,6 +1248,8 @@ int bus_exec_context_set_transient_property(
|
|||||||
c->protect_kernel_modules = b;
|
c->protect_kernel_modules = b;
|
||||||
else if (streq(name, "ProtectControlGroups"))
|
else if (streq(name, "ProtectControlGroups"))
|
||||||
c->protect_control_groups = b;
|
c->protect_control_groups = b;
|
||||||
|
else if (streq(name, "MountAPIVFS"))
|
||||||
|
c->mount_apivfs = b;
|
||||||
|
|
||||||
unit_write_drop_in_private_format(u, mode, name, "%s=%s", name, yes_no(b));
|
unit_write_drop_in_private_format(u, mode, name, "%s=%s", name, yes_no(b));
|
||||||
}
|
}
|
||||||
|
@ -1662,6 +1662,9 @@ static bool exec_needs_mount_namespace(
|
|||||||
context->protect_control_groups)
|
context->protect_control_groups)
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
|
if (context->mount_apivfs)
|
||||||
|
return true;
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1942,6 +1945,7 @@ static int apply_mount_namespace(Unit *u, const ExecContext *context,
|
|||||||
.protect_control_groups = context->protect_control_groups,
|
.protect_control_groups = context->protect_control_groups,
|
||||||
.protect_kernel_tunables = context->protect_kernel_tunables,
|
.protect_kernel_tunables = context->protect_kernel_tunables,
|
||||||
.protect_kernel_modules = context->protect_kernel_modules,
|
.protect_kernel_modules = context->protect_kernel_modules,
|
||||||
|
.mount_apivfs = context->mount_apivfs,
|
||||||
};
|
};
|
||||||
|
|
||||||
assert(context);
|
assert(context);
|
||||||
@ -3294,6 +3298,7 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
|
|||||||
"%sPrivateUsers: %s\n"
|
"%sPrivateUsers: %s\n"
|
||||||
"%sProtectHome: %s\n"
|
"%sProtectHome: %s\n"
|
||||||
"%sProtectSystem: %s\n"
|
"%sProtectSystem: %s\n"
|
||||||
|
"%sMountAPIVFS: %s\n"
|
||||||
"%sIgnoreSIGPIPE: %s\n"
|
"%sIgnoreSIGPIPE: %s\n"
|
||||||
"%sMemoryDenyWriteExecute: %s\n"
|
"%sMemoryDenyWriteExecute: %s\n"
|
||||||
"%sRestrictRealtime: %s\n",
|
"%sRestrictRealtime: %s\n",
|
||||||
@ -3310,6 +3315,7 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
|
|||||||
prefix, yes_no(c->private_users),
|
prefix, yes_no(c->private_users),
|
||||||
prefix, protect_home_to_string(c->protect_home),
|
prefix, protect_home_to_string(c->protect_home),
|
||||||
prefix, protect_system_to_string(c->protect_system),
|
prefix, protect_system_to_string(c->protect_system),
|
||||||
|
prefix, yes_no(c->mount_apivfs),
|
||||||
prefix, yes_no(c->ignore_sigpipe),
|
prefix, yes_no(c->ignore_sigpipe),
|
||||||
prefix, yes_no(c->memory_deny_write_execute),
|
prefix, yes_no(c->memory_deny_write_execute),
|
||||||
prefix, yes_no(c->restrict_realtime));
|
prefix, yes_no(c->restrict_realtime));
|
||||||
|
@ -183,6 +183,7 @@ struct ExecContext {
|
|||||||
bool protect_kernel_tunables;
|
bool protect_kernel_tunables;
|
||||||
bool protect_kernel_modules;
|
bool protect_kernel_modules;
|
||||||
bool protect_control_groups;
|
bool protect_control_groups;
|
||||||
|
bool mount_apivfs;
|
||||||
|
|
||||||
bool no_new_privileges;
|
bool no_new_privileges;
|
||||||
|
|
||||||
|
@ -101,6 +101,7 @@ $1.PrivateUsers, config_parse_bool, 0,
|
|||||||
$1.ProtectSystem, config_parse_protect_system, 0, offsetof($1, exec_context)
|
$1.ProtectSystem, config_parse_protect_system, 0, offsetof($1, exec_context)
|
||||||
$1.ProtectHome, config_parse_protect_home, 0, offsetof($1, exec_context)
|
$1.ProtectHome, config_parse_protect_home, 0, offsetof($1, exec_context)
|
||||||
$1.MountFlags, config_parse_exec_mount_flags, 0, offsetof($1, exec_context)
|
$1.MountFlags, config_parse_exec_mount_flags, 0, offsetof($1, exec_context)
|
||||||
|
$1.MountAPIVFS, config_parse_bool, 0, offsetof($1, exec_context.mount_apivfs)
|
||||||
$1.Personality, config_parse_personality, 0, offsetof($1, exec_context.personality)
|
$1.Personality, config_parse_personality, 0, offsetof($1, exec_context.personality)
|
||||||
$1.RuntimeDirectoryMode, config_parse_mode, 0, offsetof($1, exec_context.runtime_directory_mode)
|
$1.RuntimeDirectoryMode, config_parse_mode, 0, offsetof($1, exec_context.runtime_directory_mode)
|
||||||
$1.RuntimeDirectory, config_parse_runtime_directory, 0, offsetof($1, exec_context.runtime_directory)
|
$1.RuntimeDirectory, config_parse_runtime_directory, 0, offsetof($1, exec_context.runtime_directory)
|
||||||
|
@ -52,10 +52,13 @@ typedef enum MountMode {
|
|||||||
INACCESSIBLE,
|
INACCESSIBLE,
|
||||||
BIND_MOUNT,
|
BIND_MOUNT,
|
||||||
BIND_MOUNT_RECURSIVE,
|
BIND_MOUNT_RECURSIVE,
|
||||||
READONLY,
|
|
||||||
PRIVATE_TMP,
|
PRIVATE_TMP,
|
||||||
PRIVATE_VAR_TMP,
|
PRIVATE_VAR_TMP,
|
||||||
PRIVATE_DEV,
|
PRIVATE_DEV,
|
||||||
|
BIND_DEV,
|
||||||
|
SYSFS,
|
||||||
|
PROCFS,
|
||||||
|
READONLY,
|
||||||
READWRITE,
|
READWRITE,
|
||||||
} MountMode;
|
} MountMode;
|
||||||
|
|
||||||
@ -70,13 +73,13 @@ typedef struct MountEntry {
|
|||||||
char *source_malloc;
|
char *source_malloc;
|
||||||
} MountEntry;
|
} MountEntry;
|
||||||
|
|
||||||
/*
|
/* If MountAPIVFS= is used, let's mount /proc, /dev and /sys into it, but only as a fallback if the user hasn't mounted
|
||||||
* The following Protect tables are to protect paths and mark some of them
|
* something there already. These mounts are hence overridden by any other explicitly configured mounts. */
|
||||||
* READONLY, in case a path is covered by an option from another table, then
|
static const MountEntry apivfs_table[] = {
|
||||||
* it is marked READWRITE in the current one, and the more restrictive mode is
|
{ "/proc", PROCFS, false },
|
||||||
* applied from that other table. This way all options can be combined in a
|
{ "/dev", BIND_DEV, false },
|
||||||
* safe and comprehensible way for users.
|
{ "/sys", SYSFS, false },
|
||||||
*/
|
};
|
||||||
|
|
||||||
/* ProtectKernelTunables= option and the related filesystem APIs */
|
/* ProtectKernelTunables= option and the related filesystem APIs */
|
||||||
static const MountEntry protect_kernel_tunables_table[] = {
|
static const MountEntry protect_kernel_tunables_table[] = {
|
||||||
@ -465,7 +468,7 @@ static void drop_outside_root(const char *root_directory, MountEntry *m, unsigne
|
|||||||
*n = t - m;
|
*n = t - m;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int mount_dev(MountEntry *m) {
|
static int mount_private_dev(MountEntry *m) {
|
||||||
static const char devnodes[] =
|
static const char devnodes[] =
|
||||||
"/dev/null\0"
|
"/dev/null\0"
|
||||||
"/dev/zero\0"
|
"/dev/zero\0"
|
||||||
@ -604,6 +607,62 @@ fail:
|
|||||||
return r;
|
return r;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int mount_bind_dev(MountEntry *m) {
|
||||||
|
int r;
|
||||||
|
|
||||||
|
assert(m);
|
||||||
|
|
||||||
|
/* Implements the little brother of mount_private_dev(): simply bind mounts the host's /dev into the service's
|
||||||
|
* /dev. This is only used when RootDirectory= is set. */
|
||||||
|
|
||||||
|
r = path_is_mount_point(mount_entry_path(m), NULL, 0);
|
||||||
|
if (r < 0)
|
||||||
|
return log_debug_errno(r, "Unable to determine whether /dev is already mounted: %m");
|
||||||
|
if (r > 0) /* make this a NOP if /dev is already a mount point */
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
if (mount("/dev", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0)
|
||||||
|
return log_debug_errno(errno, "Failed to bind mount %s: %m", mount_entry_path(m));
|
||||||
|
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int mount_sysfs(MountEntry *m) {
|
||||||
|
int r;
|
||||||
|
|
||||||
|
assert(m);
|
||||||
|
|
||||||
|
r = path_is_mount_point(mount_entry_path(m), NULL, 0);
|
||||||
|
if (r < 0)
|
||||||
|
return log_debug_errno(r, "Unable to determine whether /sys is already mounted: %m");
|
||||||
|
if (r > 0) /* make this a NOP if /sys is already a mount point */
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
/* Bind mount the host's version so that we get all child mounts of it, too. */
|
||||||
|
if (mount("/sys", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0)
|
||||||
|
return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
|
||||||
|
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int mount_procfs(MountEntry *m) {
|
||||||
|
int r;
|
||||||
|
|
||||||
|
assert(m);
|
||||||
|
|
||||||
|
r = path_is_mount_point(mount_entry_path(m), NULL, 0);
|
||||||
|
if (r < 0)
|
||||||
|
return log_debug_errno(r, "Unable to determine whether /proc is already mounted: %m");
|
||||||
|
if (r > 0) /* make this a NOP if /proc is already a mount point */
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
/* Mount a new instance, so that we get the one that matches our user namespace, if we are running in one */
|
||||||
|
if (mount("proc", mount_entry_path(m), "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) < 0)
|
||||||
|
return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
|
||||||
|
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
static int mount_entry_chase(
|
static int mount_entry_chase(
|
||||||
const char *root_directory,
|
const char *root_directory,
|
||||||
MountEntry *m,
|
MountEntry *m,
|
||||||
@ -691,6 +750,7 @@ static int apply_mount(
|
|||||||
|
|
||||||
case BIND_MOUNT_RECURSIVE:
|
case BIND_MOUNT_RECURSIVE:
|
||||||
/* Also chase the source mount */
|
/* Also chase the source mount */
|
||||||
|
|
||||||
r = mount_entry_chase(root_directory, m, mount_entry_source(m), &m->source_malloc);
|
r = mount_entry_chase(root_directory, m, mount_entry_source(m), &m->source_malloc);
|
||||||
if (r <= 0)
|
if (r <= 0)
|
||||||
return r;
|
return r;
|
||||||
@ -707,7 +767,16 @@ static int apply_mount(
|
|||||||
break;
|
break;
|
||||||
|
|
||||||
case PRIVATE_DEV:
|
case PRIVATE_DEV:
|
||||||
return mount_dev(m);
|
return mount_private_dev(m);
|
||||||
|
|
||||||
|
case BIND_DEV:
|
||||||
|
return mount_bind_dev(m);
|
||||||
|
|
||||||
|
case SYSFS:
|
||||||
|
return mount_sysfs(m);
|
||||||
|
|
||||||
|
case PROCFS:
|
||||||
|
return mount_procfs(m);
|
||||||
|
|
||||||
default:
|
default:
|
||||||
assert_not_reached("Unknown mode");
|
assert_not_reached("Unknown mode");
|
||||||
@ -729,7 +798,7 @@ static int make_read_only(MountEntry *m, char **blacklist) {
|
|||||||
|
|
||||||
if (mount_entry_read_only(m))
|
if (mount_entry_read_only(m))
|
||||||
r = bind_remount_recursive(mount_entry_path(m), true, blacklist);
|
r = bind_remount_recursive(mount_entry_path(m), true, blacklist);
|
||||||
else if (m->mode == PRIVATE_DEV) { /* Can be readonly but the submounts can't*/
|
else if (m->mode == PRIVATE_DEV) { /* Superblock can be readonly but the submounts can't*/
|
||||||
if (mount(NULL, mount_entry_path(m), NULL, MS_REMOUNT|DEV_MOUNT_OPTIONS|MS_RDONLY, NULL) < 0)
|
if (mount(NULL, mount_entry_path(m), NULL, MS_REMOUNT|DEV_MOUNT_OPTIONS|MS_RDONLY, NULL) < 0)
|
||||||
r = -errno;
|
r = -errno;
|
||||||
} else
|
} else
|
||||||
@ -745,6 +814,17 @@ static int make_read_only(MountEntry *m, char **blacklist) {
|
|||||||
return r;
|
return r;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static bool namespace_info_mount_apivfs(const NameSpaceInfo *ns_info) {
|
||||||
|
assert(ns_info);
|
||||||
|
|
||||||
|
/* ProtectControlGroups= and ProtectKernelTunables= imply MountAPIVFS=, since to protect the API VFS mounts,
|
||||||
|
* they need to be around in the first place... */
|
||||||
|
|
||||||
|
return ns_info->mount_apivfs ||
|
||||||
|
ns_info->protect_control_groups ||
|
||||||
|
ns_info->protect_kernel_tunables;
|
||||||
|
}
|
||||||
|
|
||||||
static unsigned namespace_calculate_mounts(
|
static unsigned namespace_calculate_mounts(
|
||||||
const NameSpaceInfo *ns_info,
|
const NameSpaceInfo *ns_info,
|
||||||
char** read_write_paths,
|
char** read_write_paths,
|
||||||
@ -781,7 +861,8 @@ static unsigned namespace_calculate_mounts(
|
|||||||
(ns_info->protect_kernel_tunables ? ELEMENTSOF(protect_kernel_tunables_table) : 0) +
|
(ns_info->protect_kernel_tunables ? ELEMENTSOF(protect_kernel_tunables_table) : 0) +
|
||||||
(ns_info->protect_control_groups ? 1 : 0) +
|
(ns_info->protect_control_groups ? 1 : 0) +
|
||||||
(ns_info->protect_kernel_modules ? ELEMENTSOF(protect_kernel_modules_table) : 0) +
|
(ns_info->protect_kernel_modules ? ELEMENTSOF(protect_kernel_modules_table) : 0) +
|
||||||
protect_home_cnt + protect_system_cnt;
|
protect_home_cnt + protect_system_cnt +
|
||||||
|
(namespace_info_mount_apivfs(ns_info) ? ELEMENTSOF(apivfs_table) : 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
int setup_namespace(
|
int setup_namespace(
|
||||||
@ -885,6 +966,12 @@ int setup_namespace(
|
|||||||
if (r < 0)
|
if (r < 0)
|
||||||
goto finish;
|
goto finish;
|
||||||
|
|
||||||
|
if (namespace_info_mount_apivfs(ns_info)) {
|
||||||
|
r = append_static_mounts(&m, apivfs_table, ELEMENTSOF(apivfs_table), ns_info->ignore_protect_paths);
|
||||||
|
if (r < 0)
|
||||||
|
goto finish;
|
||||||
|
}
|
||||||
|
|
||||||
assert(mounts + n_mounts == m);
|
assert(mounts + n_mounts == m);
|
||||||
|
|
||||||
/* Prepend the root directory where that's necessary */
|
/* Prepend the root directory where that's necessary */
|
||||||
|
@ -50,6 +50,7 @@ struct NameSpaceInfo {
|
|||||||
bool protect_control_groups:1;
|
bool protect_control_groups:1;
|
||||||
bool protect_kernel_tunables:1;
|
bool protect_kernel_tunables:1;
|
||||||
bool protect_kernel_modules:1;
|
bool protect_kernel_modules:1;
|
||||||
|
bool mount_apivfs:1;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct BindMount {
|
struct BindMount {
|
||||||
|
@ -208,7 +208,7 @@ int bus_append_unit_property_assignment(sd_bus_message *m, const char *assignmen
|
|||||||
"PrivateTmp", "PrivateDevices", "PrivateNetwork", "PrivateUsers", "NoNewPrivileges",
|
"PrivateTmp", "PrivateDevices", "PrivateNetwork", "PrivateUsers", "NoNewPrivileges",
|
||||||
"SyslogLevelPrefix", "Delegate", "RemainAfterElapse", "MemoryDenyWriteExecute",
|
"SyslogLevelPrefix", "Delegate", "RemainAfterElapse", "MemoryDenyWriteExecute",
|
||||||
"RestrictRealtime", "DynamicUser", "RemoveIPC", "ProtectKernelTunables",
|
"RestrictRealtime", "DynamicUser", "RemoveIPC", "ProtectKernelTunables",
|
||||||
"ProtectKernelModules", "ProtectControlGroups")) {
|
"ProtectKernelModules", "ProtectControlGroups", "MountAPIVFS")) {
|
||||||
|
|
||||||
r = parse_boolean(eq);
|
r = parse_boolean(eq);
|
||||||
if (r < 0)
|
if (r < 0)
|
||||||
|
Loading…
Reference in New Issue
Block a user