mirror of
https://github.com/systemd/systemd-stable.git
synced 2025-01-11 05:17:44 +03:00
core: add two new service settings ProtectKernelTunables= and ProtectControlGroups=
If enabled, these will block write access to /sys, /proc/sys and /sys/fs/cgroup.
This commit is contained in:
parent
72246c2a65
commit
59eeb84ba6
@ -1059,6 +1059,26 @@
|
||||
Defaults to off.</para></listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><varname>ProtectKernelTunables=</varname></term>
|
||||
|
||||
<listitem><para>Takes a boolean argument. If true, kernel variables accessible through
|
||||
<filename>/proc/sys</filename> and <filename>/sys</filename> will be made read-only to all processes of the
|
||||
unit. Usually, tunable kernel variables should only be written at boot-time, with the
|
||||
<citerefentry><refentrytitle>sysctl.d</refentrytitle><manvolnum>5</manvolnum></citerefentry> mechanism. Almost
|
||||
no services need to write to these at runtime; it is hence recommended to turn this on for most
|
||||
services. Defaults to off.</para></listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><varname>ProtectControlGroups=</varname></term>
|
||||
|
||||
<listitem><para>Takes a boolean argument. If true, the Linux Control Groups ("cgroups") hierarchies accessible
|
||||
through <filename>/sys/fs/cgroup</filename> will be made read-only to all processes of the unit. Except for
|
||||
container managers no services should require write access to the control groups hierarchies; it is hence
|
||||
recommended to turn this on for most services. Defaults to off.</para></listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><varname>MountFlags=</varname></term>
|
||||
|
||||
|
@ -707,6 +707,8 @@ const sd_bus_vtable bus_exec_vtable[] = {
|
||||
SD_BUS_PROPERTY("MountFlags", "t", bus_property_get_ulong, offsetof(ExecContext, mount_flags), SD_BUS_VTABLE_PROPERTY_CONST),
|
||||
SD_BUS_PROPERTY("PrivateTmp", "b", bus_property_get_bool, offsetof(ExecContext, private_tmp), SD_BUS_VTABLE_PROPERTY_CONST),
|
||||
SD_BUS_PROPERTY("PrivateDevices", "b", bus_property_get_bool, offsetof(ExecContext, private_devices), SD_BUS_VTABLE_PROPERTY_CONST),
|
||||
SD_BUS_PROPERTY("ProtectKernelTunables", "b", bus_property_get_bool, offsetof(ExecContext, protect_kernel_tunables), SD_BUS_VTABLE_PROPERTY_CONST),
|
||||
SD_BUS_PROPERTY("ProtectControlGroups", "b", bus_property_get_bool, offsetof(ExecContext, protect_control_groups), SD_BUS_VTABLE_PROPERTY_CONST),
|
||||
SD_BUS_PROPERTY("PrivateNetwork", "b", bus_property_get_bool, offsetof(ExecContext, private_network), SD_BUS_VTABLE_PROPERTY_CONST),
|
||||
SD_BUS_PROPERTY("PrivateUsers", "b", bus_property_get_bool, offsetof(ExecContext, private_users), SD_BUS_VTABLE_PROPERTY_CONST),
|
||||
SD_BUS_PROPERTY("ProtectHome", "s", bus_property_get_protect_home, offsetof(ExecContext, protect_home), SD_BUS_VTABLE_PROPERTY_CONST),
|
||||
@ -1072,7 +1074,8 @@ int bus_exec_context_set_transient_property(
|
||||
"IgnoreSIGPIPE", "TTYVHangup", "TTYReset",
|
||||
"PrivateTmp", "PrivateDevices", "PrivateNetwork", "PrivateUsers",
|
||||
"NoNewPrivileges", "SyslogLevelPrefix", "MemoryDenyWriteExecute",
|
||||
"RestrictRealtime", "DynamicUser", "RemoveIPC")) {
|
||||
"RestrictRealtime", "DynamicUser", "RemoveIPC", "ProtectKernelTunables",
|
||||
"ProtectControlGroups")) {
|
||||
int b;
|
||||
|
||||
r = sd_bus_message_read(message, "b", &b);
|
||||
@ -1106,6 +1109,10 @@ int bus_exec_context_set_transient_property(
|
||||
c->dynamic_user = b;
|
||||
else if (streq(name, "RemoveIPC"))
|
||||
c->remove_ipc = b;
|
||||
else if (streq(name, "ProtectKernelTunables"))
|
||||
c->protect_kernel_tunables = b;
|
||||
else if (streq(name, "ProtectControlGroups"))
|
||||
c->protect_control_groups = b;
|
||||
|
||||
unit_write_drop_in_private_format(u, mode, name, "%s=%s", name, yes_no(b));
|
||||
}
|
||||
|
@ -1383,6 +1383,45 @@ finish:
|
||||
return r;
|
||||
}
|
||||
|
||||
/* Installs a seccomp filter for ProtectKernelTunables= that lets every system call through except the
 * legacy _sysctl() call, which is made to fail with EPERM.
 *
 * Returns 0 without installing anything when skip_seccomp_unavailable() asks us to skip, -ENOMEM when the
 * filter context cannot be allocated, and otherwise the result of the last libseccomp/helper call that
 * was executed (negative on failure). */
static int apply_protect_sysctl(Unit *u, const ExecContext *c) {
        scmp_filter_ctx *filter;
        int r;

        assert(c);

        /* Many distributions already compile the kernel without the legacy sysctl() system call, but
         * where it is still available we block it explicitly. */

        if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
                return 0;

        /* Default action: allow everything that is not matched by a rule below. */
        filter = seccomp_init(SCMP_ACT_ALLOW);
        if (!filter)
                return -ENOMEM;

        /* Each step only runs if all previous ones succeeded; the first failure falls straight through
         * to the release below with its error code preserved in r. */
        r = seccomp_add_secondary_archs(filter);

        if (r >= 0)
                r = seccomp_rule_add(
                                filter,
                                SCMP_ACT_ERRNO(EPERM),
                                SCMP_SYS(_sysctl),
                                0);

        /* Set the filter's NNP control attribute to 0 before loading it. */
        if (r >= 0)
                r = seccomp_attr_set(filter, SCMP_FLTATR_CTL_NNP, 0);

        if (r >= 0)
                r = seccomp_load(filter);

        seccomp_release(filter);
        return r;
}
|
||||
|
||||
#endif
|
||||
|
||||
static void do_idle_pipe_dance(int idle_pipe[4]) {
|
||||
@ -1589,7 +1628,9 @@ static bool exec_needs_mount_namespace(
|
||||
|
||||
if (context->private_devices ||
|
||||
context->protect_system != PROTECT_SYSTEM_NO ||
|
||||
context->protect_home != PROTECT_HOME_NO)
|
||||
context->protect_home != PROTECT_HOME_NO ||
|
||||
context->protect_kernel_tunables ||
|
||||
context->protect_control_groups)
|
||||
return true;
|
||||
|
||||
return false;
|
||||
@ -1804,6 +1845,37 @@ static int close_remaining_fds(
|
||||
return close_all_fds(dont_close, n_dont_close);
|
||||
}
|
||||
|
||||
static bool context_has_address_families(const ExecContext *c) {
|
||||
assert(c);
|
||||
|
||||
return c->address_families_whitelist ||
|
||||
!set_isempty(c->address_families);
|
||||
}
|
||||
|
||||
static bool context_has_syscall_filters(const ExecContext *c) {
|
||||
assert(c);
|
||||
|
||||
return c->syscall_whitelist ||
|
||||
!set_isempty(c->syscall_filter) ||
|
||||
!set_isempty(c->syscall_archs);
|
||||
}
|
||||
|
||||
static bool context_has_no_new_privileges(const ExecContext *c) {
|
||||
assert(c);
|
||||
|
||||
if (c->no_new_privileges)
|
||||
return true;
|
||||
|
||||
if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
|
||||
return false;
|
||||
|
||||
return context_has_address_families(c) || /* we need NNP if we have any form of seccomp and are unprivileged */
|
||||
c->memory_deny_write_execute ||
|
||||
c->restrict_realtime ||
|
||||
c->protect_kernel_tunables ||
|
||||
context_has_syscall_filters(c);
|
||||
}
|
||||
|
||||
static int send_user_lookup(
|
||||
Unit *unit,
|
||||
int user_lookup_fd,
|
||||
@ -2255,6 +2327,8 @@ static int exec_child(
|
||||
tmp,
|
||||
var,
|
||||
context->private_devices,
|
||||
context->protect_kernel_tunables,
|
||||
context->protect_control_groups,
|
||||
context->protect_home,
|
||||
context->protect_system,
|
||||
context->mount_flags);
|
||||
@ -2343,11 +2417,6 @@ static int exec_child(
|
||||
|
||||
if ((params->flags & EXEC_APPLY_PERMISSIONS) && !command->privileged) {
|
||||
|
||||
bool use_address_families = context->address_families_whitelist ||
|
||||
!set_isempty(context->address_families);
|
||||
bool use_syscall_filter = context->syscall_whitelist ||
|
||||
!set_isempty(context->syscall_filter) ||
|
||||
!set_isempty(context->syscall_archs);
|
||||
int secure_bits = context->secure_bits;
|
||||
|
||||
for (i = 0; i < _RLIMIT_MAX; i++) {
|
||||
@ -2424,15 +2493,14 @@ static int exec_child(
|
||||
return -errno;
|
||||
}
|
||||
|
||||
if (context->no_new_privileges ||
|
||||
(!have_effective_cap(CAP_SYS_ADMIN) && (use_address_families || context->memory_deny_write_execute || context->restrict_realtime || use_syscall_filter)))
|
||||
if (context_has_no_new_privileges(context))
|
||||
if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
|
||||
*exit_status = EXIT_NO_NEW_PRIVILEGES;
|
||||
return -errno;
|
||||
}
|
||||
|
||||
#ifdef HAVE_SECCOMP
|
||||
if (use_address_families) {
|
||||
if (context_has_address_families(context)) {
|
||||
r = apply_address_families(unit, context);
|
||||
if (r < 0) {
|
||||
*exit_status = EXIT_ADDRESS_FAMILIES;
|
||||
@ -2456,7 +2524,15 @@ static int exec_child(
|
||||
}
|
||||
}
|
||||
|
||||
if (use_syscall_filter) {
|
||||
if (context->protect_kernel_tunables) {
|
||||
r = apply_protect_sysctl(unit, context);
|
||||
if (r < 0) {
|
||||
*exit_status = EXIT_SECCOMP;
|
||||
return r;
|
||||
}
|
||||
}
|
||||
|
||||
if (context_has_syscall_filters(context)) {
|
||||
r = apply_seccomp(unit, context);
|
||||
if (r < 0) {
|
||||
*exit_status = EXIT_SECCOMP;
|
||||
@ -2888,6 +2964,8 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
|
||||
"%sNonBlocking: %s\n"
|
||||
"%sPrivateTmp: %s\n"
|
||||
"%sPrivateDevices: %s\n"
|
||||
"%sProtectKernelTunables: %s\n"
|
||||
"%sProtectControlGroups: %s\n"
|
||||
"%sPrivateNetwork: %s\n"
|
||||
"%sPrivateUsers: %s\n"
|
||||
"%sProtectHome: %s\n"
|
||||
@ -2901,6 +2979,8 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
|
||||
prefix, yes_no(c->non_blocking),
|
||||
prefix, yes_no(c->private_tmp),
|
||||
prefix, yes_no(c->private_devices),
|
||||
prefix, yes_no(c->protect_kernel_tunables),
|
||||
prefix, yes_no(c->protect_control_groups),
|
||||
prefix, yes_no(c->private_network),
|
||||
prefix, yes_no(c->private_users),
|
||||
prefix, protect_home_to_string(c->protect_home),
|
||||
|
@ -174,6 +174,8 @@ struct ExecContext {
|
||||
bool private_users;
|
||||
ProtectSystem protect_system;
|
||||
ProtectHome protect_home;
|
||||
bool protect_kernel_tunables;
|
||||
bool protect_control_groups;
|
||||
|
||||
bool no_new_privileges;
|
||||
|
||||
|
@ -89,6 +89,8 @@ $1.ReadOnlyPaths, config_parse_namespace_path_strv, 0,
|
||||
$1.InaccessiblePaths, config_parse_namespace_path_strv, 0, offsetof($1, exec_context.inaccessible_paths)
|
||||
$1.PrivateTmp, config_parse_bool, 0, offsetof($1, exec_context.private_tmp)
|
||||
$1.PrivateDevices, config_parse_bool, 0, offsetof($1, exec_context.private_devices)
|
||||
$1.ProtectKernelTunables, config_parse_bool, 0, offsetof($1, exec_context.protect_kernel_tunables)
|
||||
$1.ProtectControlGroups, config_parse_bool, 0, offsetof($1, exec_context.protect_control_groups)
|
||||
$1.PrivateNetwork, config_parse_bool, 0, offsetof($1, exec_context.private_network)
|
||||
$1.PrivateUsers, config_parse_bool, 0, offsetof($1, exec_context.private_users)
|
||||
$1.ProtectSystem, config_parse_protect_system, 0, offsetof($1, exec_context)
|
||||
|
@ -53,7 +53,7 @@ typedef enum MountMode {
|
||||
PRIVATE_TMP,
|
||||
PRIVATE_VAR_TMP,
|
||||
PRIVATE_DEV,
|
||||
READWRITE
|
||||
READWRITE,
|
||||
} MountMode;
|
||||
|
||||
typedef struct BindMount {
|
||||
@ -366,6 +366,8 @@ int setup_namespace(
|
||||
const char* tmp_dir,
|
||||
const char* var_tmp_dir,
|
||||
bool private_dev,
|
||||
bool protect_sysctl,
|
||||
bool protect_cgroups,
|
||||
ProtectHome protect_home,
|
||||
ProtectSystem protect_system,
|
||||
unsigned long mount_flags) {
|
||||
@ -385,6 +387,8 @@ int setup_namespace(
|
||||
strv_length(read_only_paths) +
|
||||
strv_length(inaccessible_paths) +
|
||||
private_dev +
|
||||
(protect_sysctl ? 3 : 0) +
|
||||
(protect_cgroups != protect_sysctl) +
|
||||
(protect_home != PROTECT_HOME_NO ? 3 : 0) +
|
||||
(protect_system != PROTECT_SYSTEM_NO ? 2 : 0) +
|
||||
(protect_system == PROTECT_SYSTEM_FULL ? 1 : 0);
|
||||
@ -421,6 +425,27 @@ int setup_namespace(
|
||||
m++;
|
||||
}
|
||||
|
||||
if (protect_sysctl) {
|
||||
m->path = prefix_roota(root_directory, "/proc/sys");
|
||||
m->mode = READONLY;
|
||||
m++;
|
||||
|
||||
m->path = prefix_roota(root_directory, "/proc/sysrq-trigger");
|
||||
m->mode = READONLY;
|
||||
m->ignore = true; /* Not always compiled into the kernel */
|
||||
m++;
|
||||
|
||||
m->path = prefix_roota(root_directory, "/sys");
|
||||
m->mode = READONLY;
|
||||
m++;
|
||||
}
|
||||
|
||||
if (protect_cgroups != protect_sysctl) {
|
||||
m->path = prefix_roota(root_directory, "/sys/fs/cgroup");
|
||||
m->mode = protect_cgroups ? READONLY : READWRITE;
|
||||
m++;
|
||||
}
|
||||
|
||||
if (protect_home != PROTECT_HOME_NO) {
|
||||
const char *home_dir, *run_user_dir, *root_dir;
|
||||
|
||||
@ -505,9 +530,12 @@ int setup_namespace(
|
||||
|
||||
fail:
|
||||
if (n > 0) {
|
||||
for (m = mounts; m < mounts + n; ++m)
|
||||
if (m->done)
|
||||
(void) umount2(m->path, MNT_DETACH);
|
||||
for (m = mounts; m < mounts + n; ++m) {
|
||||
if (!m->done)
|
||||
continue;
|
||||
|
||||
(void) umount2(m->path, MNT_DETACH);
|
||||
}
|
||||
}
|
||||
|
||||
return r;
|
||||
|
@ -46,6 +46,8 @@ int setup_namespace(const char *chroot,
|
||||
const char *tmp_dir,
|
||||
const char *var_tmp_dir,
|
||||
bool private_dev,
|
||||
bool protect_sysctl,
|
||||
bool protect_cgroups,
|
||||
ProtectHome protect_home,
|
||||
ProtectSystem protect_system,
|
||||
unsigned long mount_flags);
|
||||
|
@ -204,7 +204,7 @@ int bus_append_unit_property_assignment(sd_bus_message *m, const char *assignmen
|
||||
"IgnoreSIGPIPE", "TTYVHangup", "TTYReset", "RemainAfterExit",
|
||||
"PrivateTmp", "PrivateDevices", "PrivateNetwork", "PrivateUsers", "NoNewPrivileges",
|
||||
"SyslogLevelPrefix", "Delegate", "RemainAfterElapse", "MemoryDenyWriteExecute",
|
||||
"RestrictRealtime", "DynamicUser", "RemoveIPC")) {
|
||||
"RestrictRealtime", "DynamicUser", "RemoveIPC", "ProtectKernelTunables", "ProtectControlGroups")) {
|
||||
|
||||
r = parse_boolean(eq);
|
||||
if (r < 0)
|
||||
|
@ -69,6 +69,8 @@ int main(int argc, char *argv[]) {
|
||||
tmp_dir,
|
||||
var_tmp_dir,
|
||||
true,
|
||||
true,
|
||||
true,
|
||||
PROTECT_HOME_NO,
|
||||
PROTECT_SYSTEM_NO,
|
||||
0);
|
||||
|
Loading…
Reference in New Issue
Block a user