1
0
mirror of https://github.com/systemd/systemd.git synced 2024-10-26 17:27:41 +03:00

cgroup: Add support for ProtectControlGroups= private and strict

This commit adds two settings private and strict to
the ProtectControlGroups= property. Private will unshare the cgroup
namespace and mount a read-write private cgroup2 filesystem at /sys/fs/cgroup.
Strict does the same except the mount is read-only. Since the unit is
running in a cgroup namespace, the new root of /sys/fs/cgroup is the unit's
own cgroup.

We also add a new dbus property ProtectControlGroupsEx which accepts strings
instead of boolean. This will allow users to use private/strict via dbus
and systemd-run in addition to service files.

Note that private and strict fall back to no and yes respectively if the kernel
doesn't support cgroup namespaces or the system is not using the unified hierarchy.

Fixes: #34634
This commit is contained in:
Ryan Wilson 2024-10-18 11:41:09 -07:00
parent c3bda20737
commit 836e29c5b2
10 changed files with 262 additions and 24 deletions

View File

@ -3251,6 +3251,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b ProtectControlGroups = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s ProtectControlGroupsEx = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b PrivateNetwork = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b PrivateUsers = ...;
@ -3868,8 +3870,6 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
<!--property ProtectKernelLogs is not documented!-->
<!--property ProtectControlGroups is not documented!-->
<!--property PrivateNetwork is not documented!-->
<!--property PrivateUsers is not documented!-->
@ -4572,6 +4572,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
<variablelist class="dbus-property" generated="True" extra-ref="ProtectControlGroups"/>
<variablelist class="dbus-property" generated="True" extra-ref="ProtectControlGroupsEx"/>
<variablelist class="dbus-property" generated="True" extra-ref="PrivateNetwork"/>
<variablelist class="dbus-property" generated="True" extra-ref="PrivateUsers"/>
@ -4858,6 +4860,12 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
unit file setting <varname>ManagedOOMMemoryPressureDurationSec=</varname> listed in
<citerefentry><refentrytitle>systemd.resource-control</refentrytitle><manvolnum>5</manvolnum></citerefentry>.
Note the time unit is expressed in <literal>μs</literal>.</para>
<para><varname>ProtectControlGroupsEx</varname> exposes the value of the
unit file setting <varname>ProtectControlGroups=</varname> listed in
<citerefentry><refentrytitle>systemd.exec</refentrytitle><manvolnum>5</manvolnum></citerefentry>.
Unlike the boolean <varname>ProtectControlGroups</varname>, <varname>ProtectControlGroupsEx</varname>
is a string, and can hence also carry the <literal>private</literal> and <literal>strict</literal> values.</para>
</refsect2>
</refsect1>
@ -5415,6 +5423,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b ProtectControlGroups = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s ProtectControlGroupsEx = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b PrivateNetwork = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b PrivateUsers = ...;
@ -6044,8 +6054,6 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
<!--property ProtectKernelLogs is not documented!-->
<!--property ProtectControlGroups is not documented!-->
<!--property PrivateNetwork is not documented!-->
<!--property PrivateUsers is not documented!-->
@ -6720,6 +6728,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
<variablelist class="dbus-property" generated="True" extra-ref="ProtectControlGroups"/>
<variablelist class="dbus-property" generated="True" extra-ref="ProtectControlGroupsEx"/>
<variablelist class="dbus-property" generated="True" extra-ref="PrivateNetwork"/>
<variablelist class="dbus-property" generated="True" extra-ref="PrivateUsers"/>
@ -7416,6 +7426,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b ProtectControlGroups = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s ProtectControlGroupsEx = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b PrivateNetwork = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b PrivateUsers = ...;
@ -7971,8 +7983,6 @@ node /org/freedesktop/systemd1/unit/home_2emount {
<!--property ProtectKernelLogs is not documented!-->
<!--property ProtectControlGroups is not documented!-->
<!--property PrivateNetwork is not documented!-->
<!--property PrivateUsers is not documented!-->
@ -8559,6 +8569,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
<variablelist class="dbus-property" generated="True" extra-ref="ProtectControlGroups"/>
<variablelist class="dbus-property" generated="True" extra-ref="ProtectControlGroupsEx"/>
<variablelist class="dbus-property" generated="True" extra-ref="PrivateNetwork"/>
<variablelist class="dbus-property" generated="True" extra-ref="PrivateUsers"/>
@ -9384,6 +9396,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b ProtectControlGroups = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s ProtectControlGroupsEx = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b PrivateNetwork = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b PrivateUsers = ...;
@ -9925,8 +9939,6 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
<!--property ProtectKernelLogs is not documented!-->
<!--property ProtectControlGroups is not documented!-->
<!--property PrivateNetwork is not documented!-->
<!--property PrivateUsers is not documented!-->
@ -10499,6 +10511,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
<variablelist class="dbus-property" generated="True" extra-ref="ProtectControlGroups"/>
<variablelist class="dbus-property" generated="True" extra-ref="ProtectControlGroupsEx"/>
<variablelist class="dbus-property" generated="True" extra-ref="PrivateNetwork"/>
<variablelist class="dbus-property" generated="True" extra-ref="PrivateUsers"/>
@ -12262,7 +12276,8 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
<varname>ImportCredentialEx</varname>,
<varname>ExtraFileDescriptorNames</varname>,
<varname>ManagedOOMMemoryPressureDurationUSec</varname>,
<varname>BindLogSockets</varname>, and
<varname>BindLogSockets</varname>,
<varname>ProtectControlGroupsEx</varname>, and
<varname>PrivateUsersEx</varname> were added in version 257.</para>
</refsect2>
<refsect2>

View File

@ -2117,14 +2117,22 @@ BindReadOnlyPaths=/var/lib/systemd</programlisting>
<varlistentry>
<term><varname>ProtectControlGroups=</varname></term>
<listitem><para>Takes a boolean argument. If true, the Linux Control Groups (<citerefentry
project='man-pages'><refentrytitle>cgroups</refentrytitle><manvolnum>7</manvolnum></citerefentry>) hierarchies
<listitem><para>Takes a boolean argument or the special values <literal>private</literal> or
<literal>strict</literal>. If true, the Linux Control Groups (<citerefentry project='man-pages'>
<refentrytitle>cgroups</refentrytitle><manvolnum>7</manvolnum></citerefentry>) hierarchies
accessible through <filename>/sys/fs/cgroup/</filename> will be made read-only to all processes of the
unit. Except for container managers no services should require write access to the control groups hierarchies;
it is hence recommended to turn this on for most services. For this setting the same restrictions regarding
mount propagation and privileges apply as for <varname>ReadOnlyPaths=</varname> and related calls, see
above. Defaults to off. If <varname>ProtectControlGroups=</varname> is set, <varname>MountAPIVFS=yes</varname>
is implied.</para>
unit. If set to <literal>private</literal>, the unit will run in a cgroup namespace with a private
writable mount of <filename>/sys/fs/cgroup/</filename>. If set to <literal>strict</literal>, the unit
will run in a cgroup namespace with a private read-only mount of <filename>/sys/fs/cgroup/</filename>.
Note <literal>private</literal> and <literal>strict</literal> are downgraded to false and true
respectively unless the system is using the unified control group hierarchy and the kernel supports
cgroup namespaces.</para>
<para>Except for container managers no services should require write access to the control groups hierarchies;
it is hence recommended to set <varname>ProtectControlGroups=</varname> to true or <literal>strict</literal>
for most services. For this setting the same restrictions regarding mount propagation and privileges apply
as for <varname>ReadOnlyPaths=</varname> and related calls, see above. Defaults to off.
If <varname>ProtectControlGroups=</varname> is set, <varname>MountAPIVFS=yes</varname> is implied.</para>
<xi:include href="system-only.xml" xpointer="singular"/>

View File

@ -61,6 +61,7 @@ static BUS_DEFINE_PROPERTY_GET2(property_get_ioprio_priority, "i", ExecContext,
static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_empty_string, "s", NULL);
static BUS_DEFINE_PROPERTY_GET_REF(property_get_private_tmp_ex, "s", PrivateTmp, private_tmp_to_string);
static BUS_DEFINE_PROPERTY_GET_REF(property_get_private_users_ex, "s", PrivateUsers, private_users_to_string);
static BUS_DEFINE_PROPERTY_GET_REF(property_get_protect_control_groups_ex, "s", ProtectControlGroups, protect_control_groups_to_string);
static BUS_DEFINE_PROPERTY_GET_REF(property_get_syslog_level, "i", int, LOG_PRI);
static BUS_DEFINE_PROPERTY_GET_REF(property_get_syslog_facility, "i", int, LOG_FAC);
static BUS_DEFINE_PROPERTY_GET(property_get_cpu_affinity_from_numa, "b", ExecContext, exec_context_get_cpu_affinity_from_numa);
@ -1179,6 +1180,7 @@ const sd_bus_vtable bus_exec_vtable[] = {
SD_BUS_PROPERTY("ProtectKernelModules", "b", bus_property_get_bool, offsetof(ExecContext, protect_kernel_modules), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("ProtectKernelLogs", "b", bus_property_get_bool, offsetof(ExecContext, protect_kernel_logs), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("ProtectControlGroups", "b", property_get_protect_control_groups, offsetof(ExecContext, protect_control_groups), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("ProtectControlGroupsEx", "s", property_get_protect_control_groups_ex, offsetof(ExecContext, protect_control_groups), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("PrivateNetwork", "b", bus_property_get_bool, offsetof(ExecContext, private_network), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("PrivateUsers", "b", property_get_private_users, offsetof(ExecContext, private_users), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("PrivateUsersEx", "s", property_get_private_users_ex, offsetof(ExecContext, private_users), SD_BUS_VTABLE_PROPERTY_CONST),
@ -1939,6 +1941,27 @@ int bus_exec_context_set_transient_property(
return 1;
}
if (streq(name, "ProtectControlGroupsEx")) {
const char *s;
ProtectControlGroups t;
r = sd_bus_message_read(message, "s", &s);
if (r < 0)
return r;
t = protect_control_groups_from_string(s);
if (t < 0)
return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid %s setting: %s", name, s);
if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
c->protect_control_groups = t;
(void) unit_write_settingf(u, flags, name, "ProtectControlGroups=%s",
protect_control_groups_to_string(c->protect_control_groups));
}
return 1;
}
if (streq(name, "PrivateDevices"))
return bus_set_transient_bool(u, name, &c->private_devices, message, flags, error);

View File

@ -3082,7 +3082,7 @@ static int apply_mount_namespace(
/* We need to make the pressure path writable even if /sys/fs/cgroups is made read-only, as the
* service will need to write to it in order to start the notifications. */
if (context->protect_control_groups != PROTECT_CONTROL_GROUPS_NO && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
if (exec_is_cgroup_mount_read_only(context, params) && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
read_write_paths_cleanup = strv_copy(context->read_write_paths);
if (!read_write_paths_cleanup)
return -ENOMEM;
@ -3226,7 +3226,7 @@ static int apply_mount_namespace(
* sandbox inside the mount namespace. */
.ignore_protect_paths = !needs_sandboxing && !context->dynamic_user && root_dir,
.protect_control_groups = needs_sandboxing ? context->protect_control_groups : PROTECT_CONTROL_GROUPS_NO,
.protect_control_groups = needs_sandboxing ? get_adjusted_protect_control_groups(context, params) : PROTECT_CONTROL_GROUPS_NO,
.protect_kernel_tunables = needs_sandboxing && context->protect_kernel_tunables,
.protect_kernel_modules = needs_sandboxing && context->protect_kernel_modules,
.protect_kernel_logs = needs_sandboxing && context->protect_kernel_logs,
@ -3870,7 +3870,7 @@ static bool exec_context_need_unprivileged_private_users(
context->protect_kernel_tunables ||
context->protect_kernel_modules ||
context->protect_kernel_logs ||
context->protect_control_groups != PROTECT_CONTROL_GROUPS_NO ||
exec_needs_cgroup_mount(context, params) ||
context->protect_clock ||
context->protect_hostname ||
!strv_isempty(context->read_write_paths) ||
@ -4564,6 +4564,10 @@ int exec_invoke(
}
}
/* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
* from it. */
needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
if (params->cgroup_path) {
/* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
* this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
@ -4607,6 +4611,18 @@ int exec_invoke(
"Failed to adjust ownership of '%s', ignoring: %m", memory_pressure_path);
memory_pressure_path = mfree(memory_pressure_path);
}
/* First we use the current cgroup path to chmod and chown the memory pressure path, then pass the path relative
* to the cgroup namespace to environment variables and mounts. If chown/chmod fails, we should not pass memory
* pressure path environment variable or read-write mount to the unit. This is why we check if
* memory_pressure_path != NULL in the conditional below. */
if (memory_pressure_path && needs_sandboxing && exec_needs_cgroup_namespace(context, params)) {
memory_pressure_path = mfree(memory_pressure_path);
r = cg_get_path("memory", "", "memory.pressure", &memory_pressure_path);
if (r < 0) {
*exit_status = EXIT_MEMORY;
return log_oom();
}
}
} else if (cgroup_context->memory_pressure_watch == CGROUP_PRESSURE_WATCH_OFF) {
memory_pressure_path = strdup("/dev/null"); /* /dev/null is explicit indicator for turning of memory pressure watch */
if (!memory_pressure_path) {
@ -4693,10 +4709,6 @@ int exec_invoke(
return log_exec_error_errno(context, params, r, "Failed to set up kernel keyring: %m");
}
/* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
* from it. */
needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
/* We need the ambient capability hack, if the caller asked us to apply it and the command is marked
* for it, and the kernel doesn't actually support ambient caps. */
needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
@ -4837,6 +4849,14 @@ int exec_invoke(
log_exec_warning(context, params, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
}
if (needs_sandboxing && exec_needs_cgroup_namespace(context, params)) {
        /* unshare() returns -1 on failure and reports the actual reason via errno. Hence log errno
         * rather than the raw return value, which would always be decoded as EPERM. */
        if (unshare(CLONE_NEWCGROUP) < 0) {
                *exit_status = EXIT_NAMESPACE;
                return log_exec_error_errno(context, params, errno, "Failed to set up cgroup namespacing: %m");
        }
}
if (needs_mount_namespace) {
_cleanup_free_ char *error_path = NULL;

View File

@ -210,6 +210,64 @@ bool exec_needs_ipc_namespace(const ExecContext *context) {
return context->private_ipc || context->ipc_namespace_path;
}
/* Tells whether a cgroup namespace can be set up for this unit: the unified cgroup hierarchy must be in
 * use and the kernel must support cgroup namespaces. Whenever the answer is "no" (or cannot be determined)
 * a warning is logged via the log_once_*() helpers and false is returned. */
static bool can_apply_cgroup_namespace(const ExecContext *context, const ExecParameters *params) {
        int unified;

        unified = cg_all_unified();
        if (unified < 0) {
                /* Could not figure out the hierarchy version, treat this like "not usable". */
                log_once_exec_warning_errno(context, params, unified, "Failed to determine cgroup hierarchy version: %m");
                return false;
        } else if (unified == 0) {
                log_once_exec_warning(context, params, "ProtectControlGroups=%s is configured, but the unified cgroups hierarchy is not set up, ignoring namespace setup.",
                                      protect_control_groups_to_string(context->protect_control_groups));
                return false;
        }

        if (ns_type_supported(NAMESPACE_CGROUP))
                return true;

        log_once_exec_warning(context, params, "ProtectControlGroups=%s is configured, but the kernel does not support cgroup namespaces, ignoring namespace setup.",
                              protect_control_groups_to_string(context->protect_control_groups));
        return false;
}
/* Returns true for the two ProtectControlGroups= modes that run the unit in its own cgroup namespace,
 * i.e. "private" and "strict". */
static inline bool needs_cgroup_namespace(ProtectControlGroups i) {
return IN_SET(i, PROTECT_CONTROL_GROUPS_PRIVATE, PROTECT_CONTROL_GROUPS_STRICT);
}
/* Returns the ProtectControlGroups= mode that is actually going to be applied for this unit.
 *
 * If a cgroup namespace is configured via ProtectControlGroups=private or strict but we can't actually
 * use cgroup namespaces, either from not having the unified hierarchy or from lacking kernel support, we
 * ignore the namespace part of the setting and do not unshare the namespace: "private" gets downgraded
 * to "no" and "strict" to "yes". This ensures that strict always gets a read-only mount of
 * /sys/fs/cgroup. All other configured values are returned unmodified. */
ProtectControlGroups get_adjusted_protect_control_groups(const ExecContext *context, const ExecParameters *params) {
        assert(context);

        /* Only probe for namespace support (which may log a warning) if the configured mode needs it. */
        if (!needs_cgroup_namespace(context->protect_control_groups) ||
            can_apply_cgroup_namespace(context, params))
                return context->protect_control_groups;

        /* Only "private" and "strict" can reach this point, see needs_cgroup_namespace(). */
        return context->protect_control_groups == PROTECT_CONTROL_GROUPS_PRIVATE ?
                PROTECT_CONTROL_GROUPS_NO : PROTECT_CONTROL_GROUPS_YES;
}
/* Returns true if the unit shall run in its own cgroup namespace, i.e. if the adjusted
 * ProtectControlGroups= mode is still "private" or "strict". */
bool exec_needs_cgroup_namespace(const ExecContext *context, const ExecParameters *params) {
assert(context);
return needs_cgroup_namespace(get_adjusted_protect_control_groups(context, params));
}
/* Returns true if the adjusted ProtectControlGroups= mode is anything other than "no". */
bool exec_needs_cgroup_mount(const ExecContext *context, const ExecParameters *params) {
assert(context);
return get_adjusted_protect_control_groups(context, params) != PROTECT_CONTROL_GROUPS_NO;
}
/* Returns true if the adjusted ProtectControlGroups= mode is one of the read-only ones, i.e. "yes" or
 * "strict". "private" (writable mount) and "no" yield false. */
bool exec_is_cgroup_mount_read_only(const ExecContext *context, const ExecParameters *params) {
assert(context);
return IN_SET(get_adjusted_protect_control_groups(context, params), PROTECT_CONTROL_GROUPS_YES, PROTECT_CONTROL_GROUPS_STRICT);
}
bool exec_needs_mount_namespace(
const ExecContext *context,
const ExecParameters *params,
@ -259,7 +317,7 @@ bool exec_needs_mount_namespace(
context->protect_kernel_tunables ||
context->protect_kernel_modules ||
context->protect_kernel_logs ||
context->protect_control_groups != PROTECT_CONTROL_GROUPS_NO ||
exec_needs_cgroup_mount(context, params) ||
context->protect_proc != PROTECT_PROC_DEFAULT ||
context->proc_subset != PROC_SUBSET_ALL ||
exec_needs_ipc_namespace(context))

View File

@ -616,6 +616,11 @@ bool exec_needs_mount_namespace(const ExecContext *context, const ExecParameters
bool exec_needs_network_namespace(const ExecContext *context);
bool exec_needs_ipc_namespace(const ExecContext *context);
ProtectControlGroups get_adjusted_protect_control_groups(const ExecContext *context, const ExecParameters *params);
bool exec_needs_cgroup_namespace(const ExecContext *context, const ExecParameters *params);
bool exec_needs_cgroup_mount(const ExecContext *context, const ExecParameters *params);
bool exec_is_cgroup_mount_read_only(const ExecContext *context, const ExecParameters *params);
/* These logging macros do the same logging as those in unit.h, but using ExecContext and ExecParameters
* instead of the unit object, so that it can be used in the sd-executor context (where the unit object is
* not available). */

View File

@ -65,6 +65,7 @@ typedef enum MountMode {
MOUNT_PRIVATE_SYSFS,
MOUNT_BIND_SYSFS,
MOUNT_PROCFS,
MOUNT_PRIVATE_CGROUP2FS,
MOUNT_READ_ONLY,
MOUNT_READ_WRITE,
MOUNT_NOEXEC,
@ -204,6 +205,16 @@ static const MountEntry protect_control_groups_yes_table[] = {
{ "/sys/fs/cgroup", MOUNT_READ_ONLY, false },
};
/* ProtectControlGroups=private table: a private cgroup2 mount on /sys/fs/cgroup that is writable
 * (read_only = false), with nosuid and noexec set */
static const MountEntry protect_control_groups_private_table[] = {
{ "/sys/fs/cgroup", MOUNT_PRIVATE_CGROUP2FS, false, .read_only = false, .nosuid = true, .noexec = true },
};
/* ProtectControlGroups=strict table: same private cgroup2 mount on /sys/fs/cgroup as for "private",
 * but read-only (read_only = true), with nosuid and noexec set */
static const MountEntry protect_control_groups_strict_table[] = {
{ "/sys/fs/cgroup", MOUNT_PRIVATE_CGROUP2FS, false, .read_only = true, .nosuid = true, .noexec = true },
};
/* ProtectSystem=yes table */
static const MountEntry protect_system_yes_table[] = {
{ "/usr", MOUNT_READ_ONLY, false },
@ -252,6 +263,7 @@ static const char * const mount_mode_table[_MOUNT_MODE_MAX] = {
[MOUNT_EMPTY_DIR] = "empty-dir",
[MOUNT_PRIVATE_SYSFS] = "private-sysfs",
[MOUNT_BIND_SYSFS] = "bind-sysfs",
[MOUNT_PRIVATE_CGROUP2FS] = "private-cgroup2fs",
[MOUNT_PROCFS] = "procfs",
[MOUNT_READ_ONLY] = "read-only",
[MOUNT_READ_WRITE] = "read-write",
@ -743,6 +755,12 @@ static int append_protect_control_groups(MountList *ml, ProtectControlGroups pro
case PROTECT_CONTROL_GROUPS_YES:
return append_static_mounts(ml, protect_control_groups_yes_table, ELEMENTSOF(protect_control_groups_yes_table), ignore_protect);
case PROTECT_CONTROL_GROUPS_PRIVATE:
return append_static_mounts(ml, protect_control_groups_private_table, ELEMENTSOF(protect_control_groups_private_table), ignore_protect);
case PROTECT_CONTROL_GROUPS_STRICT:
return append_static_mounts(ml, protect_control_groups_strict_table, ELEMENTSOF(protect_control_groups_strict_table), ignore_protect);
default:
assert_not_reached();
}
@ -1339,6 +1357,12 @@ static int mount_private_sysfs(const MountEntry *m, const NamespaceParameters *p
return mount_private_apivfs("sysfs", mount_entry_path(m), "/sys", /* opts = */ NULL, p->runtime_scope);
}
/* Handler for MOUNT_PRIVATE_CGROUP2FS entries: mounts a "cgroup2" file system on the entry's path by
 * delegating to mount_private_apivfs(), passing /sys/fs/cgroup as the API VFS location, no extra mount
 * options and the runtime scope from the namespace parameters. Returns whatever
 * mount_private_apivfs() returns. */
static int mount_private_cgroup2fs(const MountEntry *m, const NamespaceParameters *p) {
assert(m);
assert(p);
return mount_private_apivfs("cgroup2", mount_entry_path(m), "/sys/fs/cgroup", /* opts = */ NULL, p->runtime_scope);
}
static int mount_procfs(const MountEntry *m, const NamespaceParameters *p) {
_cleanup_free_ char *opts = NULL;
@ -1784,6 +1808,9 @@ static int apply_one_mount(
case MOUNT_PROCFS:
return mount_procfs(m, p);
case MOUNT_PRIVATE_CGROUP2FS:
return mount_private_cgroup2fs(m, p);
case MOUNT_RUN:
return mount_run(m);
@ -3212,6 +3239,8 @@ DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(protect_system, ProtectSystem, PROTECT_S
/* Maps each ProtectControlGroups enum value to its string form. */
static const char *const protect_control_groups_table[_PROTECT_CONTROL_GROUPS_MAX] = {
[PROTECT_CONTROL_GROUPS_NO] = "no",
[PROTECT_CONTROL_GROUPS_YES] = "yes",
[PROTECT_CONTROL_GROUPS_PRIVATE] = "private",
[PROTECT_CONTROL_GROUPS_STRICT] = "strict",
};
DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(protect_control_groups, ProtectControlGroups, PROTECT_CONTROL_GROUPS_YES);

View File

@ -72,6 +72,8 @@ typedef enum PrivateUsers {
/* Modes of the ProtectControlGroups= setting. */
typedef enum ProtectControlGroups {
/* No mount entries are added for /sys/fs/cgroup */
PROTECT_CONTROL_GROUPS_NO,
/* /sys/fs/cgroup is made read-only */
PROTECT_CONTROL_GROUPS_YES,
/* Own cgroup namespace plus a private, writable cgroup2 mount on /sys/fs/cgroup */
PROTECT_CONTROL_GROUPS_PRIVATE,
/* Own cgroup namespace plus a private, read-only cgroup2 mount on /sys/fs/cgroup */
PROTECT_CONTROL_GROUPS_STRICT,
_PROTECT_CONTROL_GROUPS_MAX,
_PROTECT_CONTROL_GROUPS_INVALID = -EINVAL,
} ProtectControlGroups;

View File

@ -1047,6 +1047,7 @@ static int bus_append_execute_property(sd_bus_message *m, const char *field, con
"ProtectHome",
"PrivateTmpEx",
"PrivateUsersEx",
"ProtectControlGroupsEx",
"SELinuxContext",
"RootImage",
"RootVerity",

View File

@ -0,0 +1,77 @@
#!/usr/bin/env bash
# SPDX-License-Identifier: LGPL-2.1-or-later
# shellcheck disable=SC2016
set -eux
set -o pipefail
# shellcheck source=test/units/test-control.sh
. "$(dirname "$0")"/test-control.sh
# shellcheck source=test/units/util.sh
. "$(dirname "$0")"/util.sh
# Slice and unit name used for every transient unit started by this test
SLICE="system.slice"
UNIT="test-07-protect-control-groups"
# Mount options expected for /sys/fs/cgroup in the "mount" output inside the unit
READ_ONLY_MOUNT_FLAG="ro"
READ_WRITE_MOUNT_FLAG="rw"
# Cleanup handler: stops the test unit and resets failed unit state.
at_exit() {
# Cleanup is best-effort: do not abort if the unit is already gone
set +e
systemctl stop "$UNIT"
systemctl reset-failed
}
trap at_exit EXIT
ROOT_CGROUP_NS=$(readlink /proc/self/ns/cgroup)
# Runs a series of transient units with the given ProtectControlGroupsEx= value and verifies the cgroup
# namespace, the cgroup path, $MEMORY_PRESSURE_WATCH, the /sys/fs/cgroup mount options and the D-Bus
# properties seen for the unit.
#
# Globals:   SLICE, UNIT, ROOT_CGROUP_NS (read)
# Arguments: $1 - value to set ProtectControlGroupsEx= to (also the expected property value)
#            $2 - expected value of the boolean ProtectControlGroups property
#            $3 - "true" if the unit is expected to run in its own cgroup namespace
#            $4 - mount option ("ro" or "rw") expected on /sys/fs/cgroup inside the unit
test_basic() {
    local protect_control_groups_ex="$1"
    local protect_control_groups="$2"
    local in_cgroup_ns="$3"
    local mount_flag="$4"
    local ns_cmp_op unit_cgroup memory_pressure_watch

    if [[ $in_cgroup_ns == true ]]; then
        ns_cmp_op="!="
        unit_cgroup="0::/"
        memory_pressure_watch="/sys/fs/cgroup/memory.pressure"
    else
        ns_cmp_op="=="
        unit_cgroup="0::/$SLICE/$UNIT.service"
        memory_pressure_watch="/sys/fs/cgroup/$SLICE/$UNIT.service/memory.pressure"
    fi

    # Compare cgroup namespace to root namespace
    systemd-run -p "ProtectControlGroupsEx=$protect_control_groups_ex" --slice "$SLICE" --unit "$UNIT" --wait bash -xec "test \"\$(readlink /proc/self/ns/cgroup)\" $ns_cmp_op \"$ROOT_CGROUP_NS\""
    # Verify unit cgroup
    systemd-run -p "ProtectControlGroupsEx=$protect_control_groups_ex" --slice "$SLICE" --unit "$UNIT" --wait bash -xec "test \"\$(cat /proc/self/cgroup)\" == \"$unit_cgroup\""
    # Verify memory pressure watch points to correct file
    systemd-run -p "ProtectControlGroupsEx=$protect_control_groups_ex" --slice "$SLICE" --unit "$UNIT" --wait bash -xec "test \"\$MEMORY_PRESSURE_WATCH\" == \"$memory_pressure_watch\""
    # Verify /sys/fs/cgroup mount is read-only or read-write. The flag has to be matched as a complete
    # mount option, i.e. delimited by "(", "," or ")": a plain substring match for "ro" would always
    # succeed, because "ro" is contained in "cgroup".
    systemd-run -p "ProtectControlGroupsEx=$protect_control_groups_ex" --slice "$SLICE" --unit "$UNIT" --wait bash -xec "mount | grep '/sys/fs/cgroup ' | grep -Eq '[(,]${mount_flag}[,)]'"

    # Verify dbus properties
    systemd-run -p "ProtectControlGroupsEx=$protect_control_groups_ex" --slice "$SLICE" --unit "$UNIT" --remain-after-exit bash -xec "true"
    assert_eq "$(systemctl show -P ProtectControlGroupsEx "$UNIT")" "$protect_control_groups_ex"
    assert_eq "$(systemctl show -P ProtectControlGroups "$UNIT")" "$protect_control_groups"
    systemctl stop "$UNIT"
}
# Each test case calls test_basic with: ProtectControlGroupsEx= value, expected boolean
# ProtectControlGroups property value, whether a separate cgroup namespace is expected, and the
# mount option expected on /sys/fs/cgroup.

# "no": no cgroup namespace, /sys/fs/cgroup writable
testcase_basic_no() {
test_basic "no" "no" false "$READ_WRITE_MOUNT_FLAG"
}
# "yes": no cgroup namespace, /sys/fs/cgroup read-only
testcase_basic_yes() {
test_basic "yes" "yes" false "$READ_ONLY_MOUNT_FLAG"
}
# "private": own cgroup namespace, /sys/fs/cgroup writable; the boolean property reports "yes"
testcase_basic_private() {
test_basic "private" "yes" true "$READ_WRITE_MOUNT_FLAG"
}
# "strict": own cgroup namespace, /sys/fs/cgroup read-only; the boolean property reports "yes"
testcase_basic_strict() {
test_basic "strict" "yes" true "$READ_ONLY_MOUNT_FLAG"
}
run_testcases