mirror of
https://github.com/systemd/systemd-stable.git
synced 2025-02-03 13:47:04 +03:00
core: introduce ProtectSystem=strict
Let's tighten our sandbox a bit more: with this change ProtectSystem= gains a new setting "strict". If set, the entire directory tree of the system is mounted read-only, but the API file systems /proc, /dev, /sys are excluded (they may be managed with PrivateDevices= and ProtectKernelTunables=). Also, /home and /root are excluded as those are left for ProtectHome= to manage. In this mode, all "real" file systems (i.e. non-API file systems) are mounted read-only, and specific directories may only be excluded via ReadWriteDirectories=, thus implementing an effective whitelist instead of blacklist of writable directories. While we are at it, also add /efi to the list of paths always affected by ProtectSystem=. This is a follow-up for b52a109ad38cd37b660ccd5394ff5c171a5e5355 which added /efi as alternative for /boot. Our namespacing logic should respect that too.
This commit is contained in:
parent
160cfdbed3
commit
3f815163ff
@ -1020,22 +1020,23 @@
|
||||
<varlistentry>
|
||||
<term><varname>ProtectSystem=</varname></term>
|
||||
|
||||
<listitem><para>Takes a boolean argument or
|
||||
<literal>full</literal>. If true, mounts the
|
||||
<filename>/usr</filename> and <filename>/boot</filename>
|
||||
directories read-only for processes invoked by this unit. If
|
||||
set to <literal>full</literal>, the <filename>/etc</filename>
|
||||
directory is mounted read-only, too. This setting ensures that
|
||||
any modification of the vendor-supplied operating system (and
|
||||
optionally its configuration) is prohibited for the service.
|
||||
It is recommended to enable this setting for all long-running
|
||||
services, unless they are involved with system updates or need
|
||||
to modify the operating system in other ways. Note however
|
||||
that processes retaining the CAP_SYS_ADMIN capability can undo
|
||||
the effect of this setting. This setting is hence particularly
|
||||
useful for daemons which have this capability removed, for
|
||||
example with <varname>CapabilityBoundingSet=</varname>.
|
||||
Defaults to off.</para></listitem>
|
||||
<listitem><para>Takes a boolean argument or the special values <literal>full</literal> or
|
||||
<literal>strict</literal>. If true, mounts the <filename>/usr</filename> and <filename>/boot</filename>
|
||||
directories read-only for processes invoked by this unit. If set to <literal>full</literal>, the
|
||||
<filename>/etc</filename> directory is mounted read-only, too. If set to <literal>strict</literal> the entire
|
||||
file system hierarchy is mounted read-only, except for the API file system subtrees <filename>/dev</filename>,
|
||||
<filename>/proc</filename> and <filename>/sys</filename> (protect these directories using
|
||||
<varname>PrivateDevices=</varname>, <varname>ProtectKernelTunables=</varname>,
|
||||
<varname>ProtectControlGroups=</varname>). This setting ensures that any modification of the vendor-supplied
|
||||
operating system (and optionally its configuration, and local mounts) is prohibited for the service. It is
|
||||
recommended to enable this setting for all long-running services, unless they are involved with system updates
|
||||
or need to modify the operating system in other ways. If this option is used,
|
||||
<varname>ReadWritePaths=</varname> may be used to exclude specific directories from being made read-only. Note
|
||||
that processes retaining the <constant>CAP_SYS_ADMIN</constant> capability (and with no system call filter that
|
||||
prohibits mount-related system calls applied) can undo the effect of this setting. This setting is hence
|
||||
particularly useful for daemons which either have the <literal>@mount</literal> set filtered using
|
||||
<varname>SystemCallFilter=</varname>, or have the <constant>CAP_SYS_ADMIN</constant> capability removed, for
|
||||
example with <varname>CapabilityBoundingSet=</varname>. Defaults to off.</para></listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
|
@ -472,9 +472,11 @@ int setup_namespace(
|
||||
private_dev +
|
||||
(protect_sysctl ? 3 : 0) +
|
||||
(protect_cgroups != protect_sysctl) +
|
||||
(protect_home != PROTECT_HOME_NO ? 3 : 0) +
|
||||
(protect_system != PROTECT_SYSTEM_NO ? 2 : 0) +
|
||||
(protect_system == PROTECT_SYSTEM_FULL ? 1 : 0);
|
||||
(protect_home != PROTECT_HOME_NO || protect_system == PROTECT_SYSTEM_STRICT ? 3 : 0) +
|
||||
(protect_system == PROTECT_SYSTEM_STRICT ?
|
||||
(2 + !private_dev + !protect_sysctl) :
|
||||
((protect_system != PROTECT_SYSTEM_NO ? 3 : 0) +
|
||||
(protect_system == PROTECT_SYSTEM_FULL ? 1 : 0)));
|
||||
|
||||
if (n > 0) {
|
||||
m = mounts = (BindMount *) alloca0(n * sizeof(BindMount));
|
||||
@ -529,9 +531,13 @@ int setup_namespace(
|
||||
m++;
|
||||
}
|
||||
|
||||
if (protect_home != PROTECT_HOME_NO) {
|
||||
if (protect_home != PROTECT_HOME_NO || protect_system == PROTECT_SYSTEM_STRICT) {
|
||||
const char *home_dir, *run_user_dir, *root_dir;
|
||||
|
||||
/* If protection of $HOME and $XDG_RUNTIME_DIR is requested, then go for it. If we are in
|
||||
* strict system protection mode, then also add entries for these directories, but mark them
|
||||
* writable. This is because we want ProtectHome= and ProtectSystem= to be fully orthogonal. */
|
||||
|
||||
home_dir = prefix_roota(root_directory, "/home");
|
||||
home_dir = strjoina("-", home_dir);
|
||||
run_user_dir = prefix_roota(root_directory, "/run/user");
|
||||
@ -540,22 +546,53 @@ int setup_namespace(
|
||||
root_dir = strjoina("-", root_dir);
|
||||
|
||||
r = append_mounts(&m, STRV_MAKE(home_dir, run_user_dir, root_dir),
|
||||
protect_home == PROTECT_HOME_READ_ONLY ? READONLY : INACCESSIBLE);
|
||||
protect_home == PROTECT_HOME_READ_ONLY ? READONLY :
|
||||
protect_home == PROTECT_HOME_YES ? INACCESSIBLE : READWRITE);
|
||||
if (r < 0)
|
||||
return r;
|
||||
}
|
||||
|
||||
if (protect_system != PROTECT_SYSTEM_NO) {
|
||||
const char *usr_dir, *boot_dir, *etc_dir;
|
||||
if (protect_system == PROTECT_SYSTEM_STRICT) {
|
||||
/* In strict mode, we mount everything read-only, except for /proc, /dev, /sys which are the
|
||||
* kernel API VFS, which are left writable, but PrivateDevices= + ProtectKernelTunables=
|
||||
* protect those, and these options should be fully orthogonal. (And of course /home and
|
||||
* friends are also left writable, as ProtectHome= shall manage those, orthogonally, see
|
||||
* above). */
|
||||
|
||||
m->path = prefix_roota(root_directory, "/");
|
||||
m->mode = READONLY;
|
||||
m++;
|
||||
|
||||
m->path = prefix_roota(root_directory, "/proc");
|
||||
m->mode = READWRITE;
|
||||
m++;
|
||||
|
||||
if (!private_dev) {
|
||||
m->path = prefix_roota(root_directory, "/dev");
|
||||
m->mode = READWRITE;
|
||||
m++;
|
||||
}
|
||||
if (!protect_sysctl) {
|
||||
m->path = prefix_roota(root_directory, "/sys");
|
||||
m->mode = READWRITE;
|
||||
m++;
|
||||
}
|
||||
|
||||
} else if (protect_system != PROTECT_SYSTEM_NO) {
|
||||
const char *usr_dir, *boot_dir, *efi_dir, *etc_dir;
|
||||
|
||||
/* In any other mode we simply mark the relevant three directories read-only. */
|
||||
|
||||
usr_dir = prefix_roota(root_directory, "/usr");
|
||||
boot_dir = prefix_roota(root_directory, "/boot");
|
||||
boot_dir = strjoina("-", boot_dir);
|
||||
efi_dir = prefix_roota(root_directory, "/efi");
|
||||
efi_dir = strjoina("-", efi_dir);
|
||||
etc_dir = prefix_roota(root_directory, "/etc");
|
||||
|
||||
r = append_mounts(&m, protect_system == PROTECT_SYSTEM_FULL
|
||||
? STRV_MAKE(usr_dir, boot_dir, etc_dir)
|
||||
: STRV_MAKE(usr_dir, boot_dir), READONLY);
|
||||
? STRV_MAKE(usr_dir, boot_dir, efi_dir, etc_dir)
|
||||
: STRV_MAKE(usr_dir, boot_dir, efi_dir), READONLY);
|
||||
if (r < 0)
|
||||
return r;
|
||||
}
|
||||
@ -780,6 +817,7 @@ static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
|
||||
[PROTECT_SYSTEM_NO] = "no",
|
||||
[PROTECT_SYSTEM_YES] = "yes",
|
||||
[PROTECT_SYSTEM_FULL] = "full",
|
||||
[PROTECT_SYSTEM_STRICT] = "strict",
|
||||
};
|
||||
|
||||
DEFINE_STRING_TABLE_LOOKUP(protect_system, ProtectSystem);
|
||||
|
@ -35,6 +35,7 @@ typedef enum ProtectSystem {
|
||||
PROTECT_SYSTEM_NO,
|
||||
PROTECT_SYSTEM_YES,
|
||||
PROTECT_SYSTEM_FULL,
|
||||
PROTECT_SYSTEM_STRICT,
|
||||
_PROTECT_SYSTEM_MAX,
|
||||
_PROTECT_SYSTEM_INVALID = -1
|
||||
} ProtectSystem;
|
||||
|
Loading…
x
Reference in New Issue
Block a user