From ffc1ec73b3a271d8ed59b3aef18cc5b42ec76579 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 29 Nov 2023 18:52:28 +0100 Subject: [PATCH] pid1: add ProtectSystem= as system-wide configuration, and default it to true in the initrd This adds a new ProtectSystem= setting that mirrors the option of the same of services, but in a more restrictive way. If enabled will remount /usr/ to read-only, very early at boot. Takes a special value "auto" (which is the default) which is equivalent to true in the initrd, and false otherwise. Unlike the per-service option we don't support full/strict modes, but the door is open to eventually support that too if it makes sense. It's not entirely trivial though as we have very little mounted this early, and hence the mechanism might not apply 1:1. Hence in this PR is a conservative first step. My primary goal with this is to lock down initrds a bit, since they conceptually are mostly immutable, but they are unpacked into a mutable tmpfs. let's tighten the screws a bit on that, and at least make /usr/ immutable. This is particularly nice on USIs (i.e. Unified System Images, that pack a whole OS into a UKI without transitioning out of it), such as diskomator. --- man/systemd-system.conf.xml | 14 +++++++ src/core/main.c | 75 ++++++++++++++++++++++++++++++++++++- src/core/system.conf.in | 1 + 3 files changed, 89 insertions(+), 1 deletion(-) diff --git a/man/systemd-system.conf.xml b/man/systemd-system.conf.xml index 3c06b65f935..0546283b285 100644 --- a/man/systemd-system.conf.xml +++ b/man/systemd-system.conf.xml @@ -289,6 +289,20 @@ + + ProtectSystem= + + Takes a boolean argument or the string auto. If set to true this + will remount /usr/ read-only. If set to auto (the default) + and running in an initrd equivalent to true, otherwise false. This implements a restricted subset of + the per-unit setting of the same name, see + systemd.exec5 for + details: currently, the full or struct values are not + supported. + + + + SystemCallArchitectures= diff --git a/src/core/main.c b/src/core/main.c index 2ac59dabf5a..dc166452a00 100644 --- a/src/core/main.c +++ b/src/core/main.c @@ -68,6 +68,7 @@ #include "manager-serialize.h" #include "mkdir-label.h" #include "mount-setup.h" +#include "mount-util.h" #include "os-util.h" #include "pager.h" #include "parse-argument.h" @@ -140,6 +141,7 @@ static char **arg_default_environment; static char **arg_manager_environment; static uint64_t arg_capability_bounding_set; static bool arg_no_new_privs; +static int arg_protect_system; static nsec_t arg_timer_slack_nsec; static Set* arg_syscall_archs; static FILE* arg_serialization; @@ -610,6 +612,43 @@ static int config_parse_oom_score_adjust( return 0; } +static int config_parse_protect_system_pid1( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + int *v = ASSERT_PTR(data), r; + + /* This is modelled after the per-service ProtectSystem= setting, but a bit more restricted on one + * hand, and more automatic in another. i.e. we currently only support yes/no (not "strict" or + * "full"). And we will enable this automatically for the initrd unless configured otherwise. + * + * We might extend this later to match more closely what the per-service ProtectSystem= can do, but + * this is not trivial, due to ordering constraints: besides /usr/ we don't really have much mounted + * at the moment we enable this logic. */ + + if (isempty(rvalue) || streq(rvalue, "auto")) { + *v = -1; + return 0; + } + + r = parse_boolean(rvalue); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse ProtectSystem= argument '%s', ignoring: %m", rvalue); + return 0; + } + + *v = r; + return 0; +} + static int parse_config_file(void) { const ConfigTableItem items[] = { { "Manager", "LogLevel", config_parse_level2, 0, NULL }, @@ -637,6 +676,7 @@ static int parse_config_file(void) { { "Manager", "RuntimeWatchdogPreGovernor", config_parse_string, CONFIG_PARSE_STRING_SAFE, &arg_watchdog_pretimeout_governor }, { "Manager", "CapabilityBoundingSet", config_parse_capability_set, 0, &arg_capability_bounding_set }, { "Manager", "NoNewPrivileges", config_parse_bool, 0, &arg_no_new_privs }, + { "Manager", "ProtectSystem", config_parse_protect_system_pid1, 0, &arg_protect_system }, #if HAVE_SECCOMP { "Manager", "SystemCallArchitectures", config_parse_syscall_archs, 0, &arg_syscall_archs }, #else @@ -1684,6 +1724,35 @@ static void initialize_core_pattern(bool skip_setup) { arg_early_core_pattern); } +static void apply_protect_system(bool skip_setup) { + int r; + + if (skip_setup || getpid_cached() != 1 || arg_protect_system == 0) + return; + + if (arg_protect_system < 0 && !in_initrd()) { + log_debug("ProtectSystem=auto selected, but not running in an initrd, skipping."); + return; + } + + r = make_mount_point("/usr"); + if (r < 0) { + log_warning_errno(r, "Failed to make /usr/ a mount point, ignoring: %m"); + return; + } + + if (mount_nofollow_verbose( + LOG_WARNING, + /* what= */ NULL, + "/usr", + /* fstype= */ NULL, + MS_BIND|MS_REMOUNT|MS_RDONLY, + /* options= */ NULL) < 0) + return; + + log_info("Successfully made /usr/ read-only."); +} + static void update_cpu_affinity(bool skip_setup) { _cleanup_free_ char *mask = NULL; @@ -2531,6 +2600,7 @@ static void reset_arguments(void) { arg_capability_bounding_set = CAP_MASK_UNSET; arg_no_new_privs = false; + arg_protect_system = -1; arg_timer_slack_nsec = NSEC_INFINITY; arg_syscall_archs = set_free(arg_syscall_archs); @@ -3040,9 +3110,12 @@ int main(int argc, char *argv[]) { cmdline_take_random_seed(); } - /* A core pattern might have been specified via the cmdline. */ + /* A core pattern might have been specified via the cmdline. */ initialize_core_pattern(skip_setup); + /* Make /usr/ read-only */ + apply_protect_system(skip_setup); + /* Close logging fds, in order not to confuse collecting passed fds and terminal logic below */ log_close(); diff --git a/src/core/system.conf.in b/src/core/system.conf.in index 05eb6812700..9b89a6aa77d 100644 --- a/src/core/system.conf.in +++ b/src/core/system.conf.in @@ -39,6 +39,7 @@ #WatchdogDevice= #CapabilityBoundingSet= #NoNewPrivileges=no +#ProtectSystem=auto #SystemCallArchitectures= #TimerSlackNSec= #StatusUnitFormat={{STATUS_UNIT_FORMAT_DEFAULT_STR}}