From 1a298a206c5dfe03c6cc9e690e1a81719c25c20c Mon Sep 17 00:00:00 2001 From: Lennart Poettering <lennart@poettering.net> Date: Wed, 5 May 2021 10:45:48 +0200 Subject: [PATCH 1/4] user-record: optionally, allow parsing empty user record JSON objects --- src/shared/user-record.c | 2 +- src/shared/user-record.h | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/shared/user-record.c b/src/shared/user-record.c index bc35edd729..d82b4d3636 100644 --- a/src/shared/user-record.c +++ b/src/shared/user-record.c @@ -1552,7 +1552,7 @@ int user_group_record_mangle( if (FLAGS_SET(load_flags, USER_RECORD_REQUIRE_REGULAR) && !FLAGS_SET(m, USER_RECORD_REGULAR)) return json_log(v, json_flags, SYNTHETIC_ERRNO(EBADMSG), "Record lacks basic identity fields, which are required."); - if (m == 0) + if (!FLAGS_SET(load_flags, USER_RECORD_EMPTY_OK) && m == 0) return json_log(v, json_flags, SYNTHETIC_ERRNO(EBADMSG), "Record is empty."); if (w) diff --git a/src/shared/user-record.h b/src/shared/user-record.h index 623f7bc9e4..66dceecfdd 100644 --- a/src/shared/user-record.h +++ b/src/shared/user-record.h @@ -169,6 +169,9 @@ typedef enum UserRecordLoadFlags { /* Whether to ignore errors and load what we can */ USER_RECORD_PERMISSIVE = 1U << 29, + + /* Whether an empty record is OK */ + USER_RECORD_EMPTY_OK = 1U << 30, } UserRecordLoadFlags; static inline UserRecordLoadFlags USER_RECORD_REQUIRE(UserRecordMask m) { From 91181e075be46e9c919315f2e8f903a963754cb2 Mon Sep 17 00:00:00 2001 From: Lennart Poettering <lennart@poettering.net> Date: Wed, 5 May 2021 12:29:01 +0200 Subject: [PATCH 2/4] nspawn: export userns_mkdir() + userns_lchown() so that it can be used elsewhere in nspawn --- src/nspawn/meson.build | 1 + src/nspawn/nspawn.c | 5 +++-- src/nspawn/nspawn.h | 7 +++++++ 3 files changed, 11 insertions(+), 2 deletions(-) create mode 100644 src/nspawn/nspawn.h diff --git a/src/nspawn/meson.build b/src/nspawn/meson.build index 172ded43c1..a0e051ed32 100644 --- 
a/src/nspawn/meson.build +++ b/src/nspawn/meson.build @@ -26,6 +26,7 @@ libnspawn_core_sources = files(''' nspawn-setuid.h nspawn-stub-pid1.c nspawn-stub-pid1.h + nspawn.h '''.split()) nspawn_gperf_c = custom_target( diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index a8441bf8e0..9dbe2af5d9 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -76,6 +76,7 @@ #include "nspawn-settings.h" #include "nspawn-setuid.h" #include "nspawn-stub-pid1.h" +#include "nspawn.h" #include "nulstr-util.h" #include "os-util.h" #include "pager.h" @@ -1818,7 +1819,7 @@ static int verify_arguments(void) { return 0; } -static int userns_lchown(const char *p, uid_t uid, gid_t gid) { +int userns_lchown(const char *p, uid_t uid, gid_t gid) { assert(p); if (arg_userns_mode == USER_NAMESPACE_NO) @@ -1847,7 +1848,7 @@ static int userns_lchown(const char *p, uid_t uid, gid_t gid) { return 0; } -static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) { +int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) { const char *q; int r; diff --git a/src/nspawn/nspawn.h b/src/nspawn/nspawn.h new file mode 100644 index 0000000000..27fb0b44eb --- /dev/null +++ b/src/nspawn/nspawn.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include <sys/types.h> + +int userns_lchown(const char *p, uid_t uid, gid_t gid); +int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid); From 2f8930449079403b26c9164b8eeac78d5af2c8df Mon Sep 17 00:00:00 2001 From: Lennart Poettering <lennart@poettering.net> Date: Wed, 5 May 2021 12:45:22 +0200 Subject: [PATCH 3/4] nspawn: add new --bind-user= option for binding a host user into the container MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This new option does three things for a host user specified via --bind-user=: 1. 
Bind mount the home directory from the host directory into /run/host/home/<username> 2. Install an additional user namespace UID/GID mapping, mapping the host UID/GID of the host user to an unused one from the container in the range 60514…60577. 3. Synthesize a user/group record for the user/group under the same name as on the host, with minimized information, and the UID/GID set to the mapped UID/GID. This data is written to /run/host/userdb/ where nss-systemd will pick it up. This should make sharing users and home directories from host into the container pretty seamless, under some conditions: 1. User namespacing must be used. 2. The host UID/GID of the user/group cannot be in the range assigned to the container (kernel already refuses this, as this would mean two host UIDs/GIDs might end up being mapped to the same container UID/GID). 3. There's a free UID/GID in the aforementioned range in the container, and the name of the user/group is not used in the container. 4. Container payload is new enough to include an nss-systemd version that picks up records from /run/host/userdb/ --- src/nspawn/meson.build | 2 + src/nspawn/nspawn-bind-user.c | 479 ++++++++++++++++++++++++++++++++++ src/nspawn/nspawn-bind-user.h | 29 ++ src/nspawn/nspawn-gperf.gperf | 1 + src/nspawn/nspawn-settings.c | 49 ++++ src/nspawn/nspawn-settings.h | 9 +- src/nspawn/nspawn.c | 173 +++++++++++- 7 files changed, 731 insertions(+), 11 deletions(-) create mode 100644 src/nspawn/nspawn-bind-user.c create mode 100644 src/nspawn/nspawn-bind-user.h diff --git a/src/nspawn/meson.build b/src/nspawn/meson.build index a0e051ed32..d465b3d804 100644 --- a/src/nspawn/meson.build +++ b/src/nspawn/meson.build @@ -1,6 +1,8 @@ # SPDX-License-Identifier: LGPL-2.1-or-later libnspawn_core_sources = files(''' + nspawn-bind-user.c + nspawn-bind-user.h nspawn-cgroup.c nspawn-cgroup.h nspawn-creds.c diff --git a/src/nspawn/nspawn-bind-user.c b/src/nspawn/nspawn-bind-user.c new file mode 100644 index 0000000000..ebf7d4d917
--- /dev/null +++ b/src/nspawn/nspawn-bind-user.c @@ -0,0 +1,479 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "fd-util.h" +#include "fileio.h" +#include "format-util.h" +#include "fs-util.h" +#include "nspawn-bind-user.h" +#include "nspawn.h" +#include "path-util.h" +#include "user-util.h" +#include "userdb.h" + +#define MAP_UID_START 60514 +#define MAP_UID_END 60577 + +static int check_etc_passwd_collisions( + const char *directory, + const char *name, + uid_t uid) { + + _cleanup_fclose_ FILE *f = NULL; + int r; + + assert(directory); + assert(name || uid_is_valid(uid)); + + r = chase_symlinks_and_fopen_unlocked("/etc/passwd", directory, CHASE_PREFIX_ROOT, "re", &f, NULL); + if (r == -ENOENT) + return 0; /* no user database? then no user, hence no collision */ + if (r < 0) + return log_error_errno(r, "Failed to open /etc/passwd of container: %m"); + + for (;;) { + struct passwd *pw; + + r = fgetpwent_sane(f, &pw); + if (r < 0) + return log_error_errno(r, "Failed to iterate through /etc/passwd of container: %m"); + if (r == 0) /* EOF */ + return 0; /* no collision */ + + if (name && streq_ptr(pw->pw_name, name)) + return 1; /* name collision */ + if (uid_is_valid(uid) && pw->pw_uid == uid) + return 1; /* UID collision */ + } +} + +static int check_etc_group_collisions( + const char *directory, + const char *name, + gid_t gid) { + + _cleanup_fclose_ FILE *f = NULL; + int r; + + assert(directory); + assert(name || gid_is_valid(gid)); + + r = chase_symlinks_and_fopen_unlocked("/etc/group", directory, CHASE_PREFIX_ROOT, "re", &f, NULL); + if (r == -ENOENT) + return 0; /* no group database? 
then no group, hence no collision */ + if (r < 0) + return log_error_errno(r, "Failed to open /etc/group of container: %m"); + + for (;;) { + struct group *gr; + + r = fgetgrent_sane(f, &gr); + if (r < 0) + return log_error_errno(r, "Failed to iterate through /etc/group of container: %m"); + if (r == 0) + return 0; /* no collision */ + + if (name && streq_ptr(gr->gr_name, name)) + return 1; /* name collision */ + if (gid_is_valid(gid) && gr->gr_gid == gid) + return 1; /* gid collision */ + } +} + +static int convert_user( + const char *directory, + UserRecord *u, + GroupRecord *g, + uid_t allocate_uid, + UserRecord **ret_converted_user, + GroupRecord **ret_converted_group) { + + _cleanup_(group_record_unrefp) GroupRecord *converted_group = NULL; + _cleanup_(user_record_unrefp) UserRecord *converted_user = NULL; + _cleanup_free_ char *h = NULL; + JsonVariant *p, *hp = NULL; + int r; + + assert(u); + assert(g); + assert(u->gid == g->gid); + + r = check_etc_passwd_collisions(directory, u->user_name, UID_INVALID); + if (r < 0) + return r; + if (r > 0) + return log_error_errno(SYNTHETIC_ERRNO(EBUSY), + "Sorry, the user '%s' already exists in the container.", u->user_name); + + r = check_etc_group_collisions(directory, g->group_name, GID_INVALID); + if (r < 0) + return r; + if (r > 0) + return log_error_errno(SYNTHETIC_ERRNO(EBUSY), + "Sorry, the group '%s' already exists in the container.", g->group_name); + + h = path_join("/run/host/home/", u->user_name); + if (!h) + return log_oom(); + + /* Acquire the source hashed password array as-is, so that it retains the JSON_VARIANT_SENSITIVE flag */ + p = json_variant_by_key(u->json, "privileged"); + if (p) + hp = json_variant_by_key(p, "hashedPassword"); + + r = user_record_build( + &converted_user, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("userName", JSON_BUILD_STRING(u->user_name)), + JSON_BUILD_PAIR("uid", JSON_BUILD_UNSIGNED(allocate_uid)), + JSON_BUILD_PAIR("gid", JSON_BUILD_UNSIGNED(allocate_uid)), + 
JSON_BUILD_PAIR_CONDITION(u->disposition >= 0, "disposition", JSON_BUILD_STRING(user_disposition_to_string(u->disposition))), + JSON_BUILD_PAIR("homeDirectory", JSON_BUILD_STRING(h)), + JSON_BUILD_PAIR("service", JSON_BUILD_STRING("io.systemd.NSpawn")), + JSON_BUILD_PAIR_CONDITION(!strv_isempty(u->hashed_password), "privileged", JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("hashedPassword", JSON_BUILD_VARIANT(hp)))))); + if (r < 0) + return log_error_errno(r, "Failed to build container user record: %m"); + + r = group_record_build( + &converted_group, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR("groupName", JSON_BUILD_STRING(g->group_name)), + JSON_BUILD_PAIR("gid", JSON_BUILD_UNSIGNED(allocate_uid)), + JSON_BUILD_PAIR_CONDITION(g->disposition >= 0, "disposition", JSON_BUILD_STRING(user_disposition_to_string(g->disposition))), + JSON_BUILD_PAIR("service", JSON_BUILD_STRING("io.systemd.NSpawn")))); + if (r < 0) + return log_error_errno(r, "Failed to build container group record: %m"); + + *ret_converted_user = TAKE_PTR(converted_user); + *ret_converted_group = TAKE_PTR(converted_group); + + return 0; +} + +static int find_free_uid(const char *directory, uid_t max_uid, uid_t *current_uid) { + int r; + + assert(directory); + assert(current_uid); + + for (;; (*current_uid) ++) { + if (*current_uid > MAP_UID_END || *current_uid > max_uid) + return log_error_errno( + SYNTHETIC_ERRNO(EBUSY), + "No suitable available UID in range " UID_FMT "…" UID_FMT " in container detected, can't map user.", + MAP_UID_START, MAP_UID_END); + + r = check_etc_passwd_collisions(directory, NULL, *current_uid); + if (r < 0) + return r; + if (r > 0) /* already used */ + continue; + + /* We want to use the UID also as GID, hence check for it in /etc/group too */ + r = check_etc_group_collisions(directory, NULL, (gid_t) *current_uid); + if (r < 0) + return r; + if (r == 0) /* free! yay! 
*/ + return 0; + } +} + +BindUserContext* bind_user_context_free(BindUserContext *c) { + if (!c) + return NULL; + + assert(c->n_data == 0 || c->data); + + for (size_t i = 0; i < c->n_data; i++) { + user_record_unref(c->data[i].host_user); + group_record_unref(c->data[i].host_group); + user_record_unref(c->data[i].payload_user); + group_record_unref(c->data[i].payload_group); + } + + return mfree(c); +} + +int bind_user_prepare( + const char *directory, + char **bind_user, + uid_t uid_shift, + uid_t uid_range, + CustomMount **custom_mounts, + size_t *n_custom_mounts, + BindUserContext **ret) { + + _cleanup_(bind_user_context_freep) BindUserContext *c = NULL; + uid_t current_uid = MAP_UID_START; + size_t n_allocated = 0; + char **n; + int r; + + assert(custom_mounts); + assert(n_custom_mounts); + assert(ret); + + /* This resolves the users specified in 'bind_user', generates a minimalized JSON user + group record + * for it to stick in the container, allocates a UID/GID for it, and updates the custom mount table, + * to include an appropriate bind mount mapping. + * + * This extends the passed custom_mounts/n_custom_mounts with the home directories, and allocates a + * new BindUserContext for the user records */ + + if (strv_isempty(bind_user)) { + *ret = NULL; + return 0; + } + + c = new0(BindUserContext, 1); + if (!c) + return log_oom(); + + STRV_FOREACH(n, bind_user) { + _cleanup_(user_record_unrefp) UserRecord *u = NULL, *cu = NULL; + _cleanup_(group_record_unrefp) GroupRecord *g = NULL, *cg = NULL; + _cleanup_free_ char *sm = NULL, *sd = NULL; + CustomMount *cm; + + r = userdb_by_name(*n, USERDB_DONT_SYNTHESIZE, &u); + if (r < 0) + return log_error_errno(r, "Failed to resolve user '%s': %m", *n); + + /* For now, let's refuse mapping the root/nobody users explicitly. The records we generate + * are strictly additive, nss-systemd is typically placed last in /etc/nsswitch.conf. Thus + * even if we wanted, we couldn't override the root or nobody user records. 
Note we also + * check for name conflicts in /etc/passwd + /etc/group later on, which would usually filter + * out root/nobody too, hence these checks might appear redundant — but they actually are + * not, as we want to support environments where /etc/passwd and /etc/group are non-existent, + * and the user/group databases fully synthesized at runtime. Moreover, the name of the + * user/group name of the "nobody" account differs between distros, hence a check by numeric + * UID is safer. */ + if (u->uid == 0 || streq(u->user_name, "root")) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Mapping 'root' user not supported, sorry."); + if (u->uid == UID_NOBODY || STR_IN_SET(u->user_name, NOBODY_USER_NAME, "nobody")) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Mapping 'nobody' user not supported, sorry."); + + if (u->uid >= uid_shift && u->uid < uid_shift + uid_range) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "UID of user '%s' to map is already in container UID range, refusing.", u->user_name); + + r = groupdb_by_gid(u->gid, USERDB_DONT_SYNTHESIZE, &g); + if (r < 0) + return log_error_errno(r, "Failed to resolve group of user '%s': %m", u->user_name); + + if (g->gid >= uid_shift && g->gid < uid_shift + uid_range) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "GID of group '%s' to map is already in container GID range, refusing.", g->group_name); + + /* We want to synthesize exactly one user + group from the host into the container. This only + * makes sense if the user on the host has its own private group. We can't reasonably check + * this, so we just check of the name of user and group match. + * + * One of these days we might want to support users in a shared/common group too, but it's + * not clear to me how this would have to be mapped, precisely given that the common group + * probably already exists in the container. 
*/ + if (!streq(u->user_name, g->group_name)) + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "Sorry, mapping users without private groups is currently not supported."); + + r = find_free_uid(directory, uid_range, &current_uid); + if (r < 0) + return r; + + r = convert_user(directory, u, g, current_uid, &cu, &cg); + if (r < 0) + return r; + + if (!GREEDY_REALLOC(c->data, n_allocated, c->n_data + 1)) + return log_oom(); + + sm = strdup(u->home_directory); + if (!sm) + return log_oom(); + + sd = strdup(cu->home_directory); + if (!sd) + return log_oom(); + + cm = reallocarray(*custom_mounts, sizeof(CustomMount), *n_custom_mounts + 1); + if (!cm) + return log_oom(); + + *custom_mounts = cm; + + (*custom_mounts)[(*n_custom_mounts)++] = (CustomMount) { + .type = CUSTOM_MOUNT_BIND, + .source = TAKE_PTR(sm), + .destination = TAKE_PTR(sd), + }; + + c->data[c->n_data++] = (BindUserData) { + .host_user = TAKE_PTR(u), + .host_group = TAKE_PTR(g), + .payload_user = TAKE_PTR(cu), + .payload_group = TAKE_PTR(cg), + }; + + current_uid++; + } + + *ret = TAKE_PTR(c); + return 1; +} + +static int write_and_symlink( + const char *root, + JsonVariant *v, + const char *name, + uid_t uid, + const char *suffix, + WriteStringFileFlags extra_flags) { + + _cleanup_free_ char *j = NULL, *f = NULL, *p = NULL, *q = NULL; + int r; + + assert(root); + assert(v); + assert(name); + assert(uid_is_valid(uid)); + assert(suffix); + + r = json_variant_format(v, JSON_FORMAT_NEWLINE, &j); + if (r < 0) + return log_error_errno(r, "Failed to format user record JSON: %m"); + + f = strjoin(name, suffix); + if (!f) + return log_oom(); + + p = path_join(root, "/run/host/userdb/", f); + if (!p) + return log_oom(); + + if (asprintf(&q, "%s/run/host/userdb/" UID_FMT "%s", root, uid, suffix) < 0) + return log_oom(); + + if (symlink(f, q) < 0) + return log_error_errno(errno, "Failed to create symlink '%s': %m", q); + + r = userns_lchown(q, 0, 0); + if (r < 0) + return log_error_errno(r, "Failed to adjust access
mode of '%s': %m", q); + + r = write_string_file(p, j, WRITE_STRING_FILE_CREATE|extra_flags); + if (r < 0) + return log_error_errno(r, "Failed to write %s: %m", p); + + r = userns_lchown(p, 0, 0); + if (r < 0) + return log_error_errno(r, "Failed to adjust access mode of '%s': %m", p); + + return 0; +} + +int bind_user_setup( + const BindUserContext *c, + const char *root) { + + static const UserRecordLoadFlags strip_flags = /* Removes privileged info */ + USER_RECORD_REQUIRE_REGULAR| + USER_RECORD_STRIP_PRIVILEGED| + USER_RECORD_ALLOW_PER_MACHINE| + USER_RECORD_ALLOW_BINDING| + USER_RECORD_ALLOW_SIGNATURE; + static const UserRecordLoadFlags shadow_flags = /* Extracts privileged info */ + USER_RECORD_STRIP_REGULAR| + USER_RECORD_ALLOW_PRIVILEGED| + USER_RECORD_STRIP_PER_MACHINE| + USER_RECORD_STRIP_BINDING| + USER_RECORD_STRIP_SIGNATURE| + USER_RECORD_EMPTY_OK; + int r; + + assert(root); + + if (!c || c->n_data == 0) + return 0; + + r = userns_mkdir(root, "/run/host", 0755, 0, 0); + if (r < 0) + return log_error_errno(r, "Failed to create /run/host: %m"); + + r = userns_mkdir(root, "/run/host/home", 0755, 0, 0); + if (r < 0) + return log_error_errno(r, "Failed to create /run/host/home: %m"); + + r = userns_mkdir(root, "/run/host/userdb", 0755, 0, 0); + if (r < 0) + return log_error_errno(r, "Failed to create /run/host/userdb: %m"); + + for (size_t i = 0; i < c->n_data; i++) { + _cleanup_(group_record_unrefp) GroupRecord *stripped_group = NULL, *shadow_group = NULL; + _cleanup_(user_record_unrefp) UserRecord *stripped_user = NULL, *shadow_user = NULL; + const BindUserData *d = c->data + i; + + /* First, write shadow (i.e.
privileged) data for group record */ + r = group_record_clone(d->payload_group, shadow_flags, &shadow_group); + if (r < 0) + return log_error_errno(r, "Failed to extract privileged information from group record: %m"); + + if (!json_variant_is_blank_object(shadow_group->json)) { + r = write_and_symlink( + root, + shadow_group->json, + d->payload_group->group_name, + d->payload_group->gid, + ".group-privileged", + WRITE_STRING_FILE_MODE_0600); + if (r < 0) + return r; + } + + /* Second, write main part of group record. */ + r = group_record_clone(d->payload_group, strip_flags, &stripped_group); + if (r < 0) + return log_error_errno(r, "Failed to strip privileged information from group record: %m"); + + r = write_and_symlink( + root, + stripped_group->json, + d->payload_group->group_name, + d->payload_group->gid, + ".group", + 0); + if (r < 0) + return r; + + /* Third, write out user shadow data. i.e. extract privileged info from user record */ + r = user_record_clone(d->payload_user, shadow_flags, &shadow_user); + if (r < 0) + return log_error_errno(r, "Failed to extract privileged information from user record: %m"); + + if (!json_variant_is_blank_object(shadow_user->json)) { + r = write_and_symlink( + root, + shadow_user->json, + d->payload_user->user_name, + d->payload_user->uid, + ".user-privileged", + WRITE_STRING_FILE_MODE_0600); + if (r < 0) + return r; + } + + /* Finally write out the main part of the user record */ + r = user_record_clone(d->payload_user, strip_flags, &stripped_user); + if (r < 0) + return log_error_errno(r, "Failed to strip privileged information from user record: %m"); + + r = write_and_symlink( + root, + stripped_user->json, + d->payload_user->user_name, + d->payload_user->uid, + ".user", + 0); + if (r < 0) + return r; + } + + return 1; +} diff --git a/src/nspawn/nspawn-bind-user.h b/src/nspawn/nspawn-bind-user.h new file mode 100644 index 0000000000..4352ce0ab2 --- /dev/null +++ b/src/nspawn/nspawn-bind-user.h @@ -0,0 +1,29 @@ +/* 
SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "user-record.h" +#include "group-record.h" +#include "nspawn-mount.h" + +typedef struct BindUserData { + /* The host's user/group records */ + UserRecord *host_user; + GroupRecord *host_group; + + /* The mapped records to place into the container */ + UserRecord *payload_user; + GroupRecord *payload_group; +} BindUserData; + +typedef struct BindUserContext { + BindUserData *data; + size_t n_data; +} BindUserContext; + +BindUserContext* bind_user_context_free(BindUserContext *c); + +DEFINE_TRIVIAL_CLEANUP_FUNC(BindUserContext*, bind_user_context_free); + +int bind_user_prepare(const char *directory, char **bind_user, uid_t uid_shift, uid_t uid_range, CustomMount **custom_mounts, size_t *n_custom_mounts, BindUserContext **ret); + +int bind_user_setup(const BindUserContext *c, const char *root); diff --git a/src/nspawn/nspawn-gperf.gperf b/src/nspawn/nspawn-gperf.gperf index 67a3682689..ea15e27148 100644 --- a/src/nspawn/nspawn-gperf.gperf +++ b/src/nspawn/nspawn-gperf.gperf @@ -69,6 +69,7 @@ Files.Overlay, config_parse_overlay, 0, 0 Files.OverlayReadOnly, config_parse_overlay, 1, 0 Files.PrivateUsersChown, config_parse_userns_chown, 0, offsetof(Settings, userns_ownership) Files.PrivateUsersOwnership, config_parse_userns_ownership, 0, offsetof(Settings, userns_ownership) +Files.BindUser, config_parse_bind_user, 0, offsetof(Settings, bind_user) Network.Private, config_parse_tristate, 0, offsetof(Settings, private_network) Network.Interface, config_parse_strv, 0, offsetof(Settings, network_interfaces) Network.MACVLAN, config_parse_strv, 0, offsetof(Settings, network_macvlan) diff --git a/src/nspawn/nspawn-settings.c b/src/nspawn/nspawn-settings.c index 55b8c4375f..3847fe4ec4 100644 --- a/src/nspawn/nspawn-settings.c +++ b/src/nspawn/nspawn-settings.c @@ -132,6 +132,7 @@ Settings* settings_free(Settings *s) { rlimit_free_all(s->rlimit); free(s->hostname); cpu_set_reset(&s->cpu_set); + 
strv_free(s->bind_user); strv_free(s->network_interfaces); strv_free(s->network_macvlan); @@ -907,3 +908,51 @@ int config_parse_userns_chown( *ownership = r ? USER_NAMESPACE_OWNERSHIP_CHOWN : USER_NAMESPACE_OWNERSHIP_OFF; return 0; } + +int config_parse_bind_user( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + char ***bind_user = data; + int r; + + assert(rvalue); + assert(bind_user); + + if (isempty(rvalue)) { + *bind_user = strv_free(*bind_user); + return 0; + } + + for (const char* p = rvalue;;) { + _cleanup_free_ char *word = NULL; + + r = extract_first_word(&p, &word, NULL, 0); + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse BindUser= list, ignoring: %s", rvalue); + return 0; + } + if (r == 0) + break; + + if (!valid_user_group_name(word, 0)) { + log_syntax(unit, LOG_WARNING, filename, line, 0, "User name '%s' not valid, ignoring.", word); + return 0; + } + + if (strv_consume(bind_user, TAKE_PTR(word)) < 0) + return log_oom(); + } + + return 0; +} diff --git a/src/nspawn/nspawn-settings.h b/src/nspawn/nspawn-settings.h index c0ad0741ab..939e1c757b 100644 --- a/src/nspawn/nspawn-settings.h +++ b/src/nspawn/nspawn-settings.h @@ -126,9 +126,10 @@ typedef enum SettingsMask { SETTING_CLONE_NS_FLAGS = UINT64_C(1) << 28, SETTING_CONSOLE_MODE = UINT64_C(1) << 29, SETTING_CREDENTIALS = UINT64_C(1) << 30, - SETTING_RLIMIT_FIRST = UINT64_C(1) << 31, /* we define one bit per resource limit here */ - SETTING_RLIMIT_LAST = UINT64_C(1) << (31 + _RLIMIT_MAX - 1), - _SETTINGS_MASK_ALL = (UINT64_C(1) << (31 + _RLIMIT_MAX)) -1, + SETTING_BIND_USER = UINT64_C(1) << 31, + SETTING_RLIMIT_FIRST = UINT64_C(1) << 32, /* we define one bit per resource limit here */ + SETTING_RLIMIT_LAST = UINT64_C(1) << (32 + _RLIMIT_MAX - 1), + _SETTINGS_MASK_ALL = 
(UINT64_C(1) << (32 + _RLIMIT_MAX)) -1, _SETTING_FORCE_ENUM_WIDTH = UINT64_MAX } SettingsMask; @@ -195,6 +196,7 @@ typedef struct Settings { CustomMount *custom_mounts; size_t n_custom_mounts; UserNamespaceOwnership userns_ownership; + char **bind_user; /* [Network] */ int private_network; @@ -266,6 +268,7 @@ CONFIG_PARSER_PROTOTYPE(config_parse_link_journal); CONFIG_PARSER_PROTOTYPE(config_parse_timezone); CONFIG_PARSER_PROTOTYPE(config_parse_userns_chown); CONFIG_PARSER_PROTOTYPE(config_parse_userns_ownership); +CONFIG_PARSER_PROTOTYPE(config_parse_bind_user); const char *resolv_conf_mode_to_string(ResolvConfMode a) _const_; ResolvConfMode resolv_conf_mode_from_string(const char *s) _pure_; diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index 9dbe2af5d9..21aa4f246f 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -63,6 +63,7 @@ #include "mountpoint-util.h" #include "namespace-util.h" #include "netlink-util.h" +#include "nspawn-bind-user.h" #include "nspawn-cgroup.h" #include "nspawn-creds.h" #include "nspawn-def.h" @@ -226,6 +227,7 @@ static char **arg_sysctl = NULL; static ConsoleMode arg_console_mode = _CONSOLE_MODE_INVALID; static Credential *arg_credentials = NULL; static size_t arg_n_credentials = 0; +static char **arg_bind_user = NULL; STATIC_DESTRUCTOR_REGISTER(arg_directory, freep); STATIC_DESTRUCTOR_REGISTER(arg_template, freep); @@ -258,6 +260,7 @@ STATIC_DESTRUCTOR_REGISTER(arg_seccomp, seccomp_releasep); #endif STATIC_DESTRUCTOR_REGISTER(arg_cpu_set, cpu_set_reset); STATIC_DESTRUCTOR_REGISTER(arg_sysctl, strv_freep); +STATIC_DESTRUCTOR_REGISTER(arg_bind_user, strv_freep); static int handle_arg_console(const char *arg) { if (streq(arg, "help")) { @@ -423,7 +426,8 @@ static int help(void) { " Create an overlay mount from the host to \n" " the container\n" " --overlay-ro=PATH[:PATH...]:PATH\n" - " Similar, but creates a read-only overlay mount\n\n" + " Similar, but creates a read-only overlay mount\n" + " --bind-user=NAME Bind user 
from host to container\n\n" "%3$sInput/Output:%4$s\n" " --console=MODE Select how stdin/stdout/stderr and /dev/console are\n" " set up for the container.\n" @@ -707,6 +711,7 @@ static int parse_argv(int argc, char *argv[]) { ARG_NO_PAGER, ARG_SET_CREDENTIAL, ARG_LOAD_CREDENTIAL, + ARG_BIND_USER, }; static const struct option options[] = { @@ -778,6 +783,7 @@ static int parse_argv(int argc, char *argv[]) { { "no-pager", no_argument, NULL, ARG_NO_PAGER }, { "set-credential", required_argument, NULL, ARG_SET_CREDENTIAL }, { "load-credential", required_argument, NULL, ARG_LOAD_CREDENTIAL }, + { "bind-user", required_argument, NULL, ARG_BIND_USER }, {} }; @@ -1656,6 +1662,16 @@ static int parse_argv(int argc, char *argv[]) { break; } + case ARG_BIND_USER: + if (!valid_user_group_name(optarg, 0)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid user name to bind: %s", optarg); + + if (strv_extend(&arg_bind_user, optarg) < 0) + return log_oom(); + + arg_settings_mask |= SETTING_BIND_USER; + break; + case '?': return -EINVAL; @@ -1812,6 +1828,12 @@ static int verify_arguments(void) { return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= setting is not useful for boot mode."); } + if (arg_userns_mode == USER_NAMESPACE_NO && !strv_isempty(arg_bind_user)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--bind-user= requires --private-users"); + + /* Drop duplicate --bind-user= entries */ + strv_uniq(arg_bind_user); + r = custom_mount_check_all(); if (r < 0) return r; @@ -3569,6 +3591,7 @@ static int outer_child( FDSet *fds, int netns_fd) { + _cleanup_(bind_user_context_freep) BindUserContext *bind_user_context = NULL; _cleanup_strv_free_ char **os_release_pairs = NULL; _cleanup_close_ int fd = -1; bool idmap = false; @@ -3716,6 +3739,36 @@ static int outer_child( if (r < 0) return r; + r = bind_user_prepare( + directory, + arg_bind_user, + arg_uid_shift, + arg_uid_range, + &arg_custom_mounts, &arg_n_custom_mounts, + &bind_user_context); + if (r < 0) 
+                return r;
+
+        if (arg_userns_mode != USER_NAMESPACE_NO && bind_user_context) {
+                /* Send the user maps we determined to the parent, so that it installs it in our user namespace UID map table */
+
+                for (size_t i = 0; i < bind_user_context->n_data; i++) {
+                        uid_t map[] = {
+                                bind_user_context->data[i].payload_user->uid,
+                                bind_user_context->data[i].host_user->uid,
+                                (uid_t) bind_user_context->data[i].payload_group->gid,
+                                (uid_t) bind_user_context->data[i].host_group->gid,
+                        };
+
+                        l = send(uid_shift_socket, map, sizeof(map), MSG_NOSIGNAL);
+                        if (l < 0)
+                                return log_error_errno(errno, "Failed to send user UID map: %m");
+                        if (l != sizeof(map))
+                                return log_error_errno(SYNTHETIC_ERRNO(EIO),
+                                                       "Short write while sending user UID map.");
+                }
+        }
+
         r = mount_custom(
                         directory,
                         arg_custom_mounts,
@@ -3832,6 +3885,10 @@ static int outer_child(
         if (r < 0)
                 return r;
 
+        r = bind_user_setup(bind_user_context, directory);
+        if (r < 0)
+                return r;
+
         r = mount_custom(
                         directory,
                         arg_custom_mounts,
@@ -4012,21 +4069,96 @@ static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
         }
 }
 
-static int setup_uid_map(pid_t pid) {
-        char uid_map[STRLEN("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
+static int add_one_uid_map(
+                char **p,
+                uid_t container_uid,
+                uid_t host_uid,
+                uid_t range) {
+        /* Appends one "<container ID> <host ID> <count>\n" line to *p; returns strextendf()'s result. */
+        return strextendf(p,
+                       UID_FMT " " UID_FMT " " UID_FMT "\n",
+                       container_uid, host_uid, range);
+}
+
+static int make_uid_map_string(
+                const uid_t bind_user_uid[],
+                size_t n_bind_user_uid,
+                size_t offset,
+                char **ret) {
+        /* Builds in *ret the map text later written to uid_map (offset=0) or gid_map (offset=2); 0 or -errno. */
+        _cleanup_free_ char *s = NULL;
+        uid_t previous_uid = 0;
+        int r;
+
+        assert(n_bind_user_uid == 0 || bind_user_uid);
+        assert(offset == 0 || offset == 2); /* used to switch between UID and GID map */
+        assert(ret);
+
+        /* The bind_user_uid[] array holds one quadruplet of uid_t values per --bind-user= entry, in the
+         * order payload UID, host UID, payload GID, host GID. Hence entry i starts at index i*4. */
+
+        for (size_t i = 0; i < n_bind_user_uid; i++) {
+                uid_t payload_uid = bind_user_uid[i*4+offset],
+                        host_uid = bind_user_uid[i*4+offset+1];
+
+                assert(previous_uid <= payload_uid); /* relies on entries being ordered by ascending payload ID */
+                assert(payload_uid < arg_uid_range);
+
+                /* Add a range to close the gap to previous entry */
+                if (payload_uid > previous_uid) {
+                        r = add_one_uid_map(&s, previous_uid, arg_uid_shift + previous_uid, payload_uid - previous_uid);
+                        if (r < 0)
+                                return r;
+                }
+
+                /* Map this specific user */
+                r = add_one_uid_map(&s, payload_uid, host_uid, 1);
+                if (r < 0)
+                        return r;
+
+                previous_uid = payload_uid + 1;
+        }
+
+        /* And add a range to close the gap to finish the range */
+        if (arg_uid_range > previous_uid) {
+                r = add_one_uid_map(&s, previous_uid, arg_uid_shift + previous_uid, arg_uid_range - previous_uid);
+                if (r < 0)
+                        return r;
+        }
+
+        assert(s);
+
+        *ret = TAKE_PTR(s);
+        return 0;
+}
+
+static int setup_uid_map(
+                pid_t pid,
+                const uid_t bind_user_uid[],
+                size_t n_bind_user_uid) {
+
+        char uid_map[STRLEN("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1];
+        _cleanup_free_ char *s = NULL;
         int r;
 
         assert(pid > 1);
 
+        /* Build the UID map string */
+        if (make_uid_map_string(bind_user_uid, n_bind_user_uid, 0, &s) < 0) /* offset=0 contains the UID pair */
+                return log_oom();
+
         xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
-        xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
-        r = write_string_file(uid_map, line, WRITE_STRING_FILE_DISABLE_BUFFER);
+        r = write_string_file(uid_map, s, WRITE_STRING_FILE_DISABLE_BUFFER);
         if (r < 0)
                 return log_error_errno(r, "Failed to write UID map: %m");
 
-        /* We always assign the same UID and GID ranges */
+        /* And now build the GID map string */
+        s = mfree(s);
+        if (make_uid_map_string(bind_user_uid, n_bind_user_uid, 2, &s) < 0) /* offset=2 contains the GID pair */
+                return log_oom();
+
         xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
-        r = write_string_file(uid_map, line, WRITE_STRING_FILE_DISABLE_BUFFER);
+
r = write_string_file(uid_map, s, WRITE_STRING_FILE_DISABLE_BUFFER); if (r < 0) return log_error_errno(r, "Failed to write GID map: %m"); @@ -4302,6 +4434,9 @@ static int merge_settings(Settings *settings, const char *path) { } } + if ((arg_settings_mask & SETTING_BIND_USER) == 0) + strv_free_and_replace(arg_bind_user, settings->bind_user); + if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0) arg_notify_ready = settings->notify_ready; @@ -4568,6 +4703,8 @@ static int run_container( _cleanup_(pty_forward_freep) PTYForward *forward = NULL; _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + _cleanup_free_ uid_t *bind_user_uid = NULL; + size_t n_bind_user_uid = 0; ContainerStatus container_status = 0; int ifi = 0, r; ssize_t l; @@ -4723,6 +4860,26 @@ static int run_container( if (l != sizeof arg_uid_shift) return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short write while writing UID shift."); } + + n_bind_user_uid = strv_length(arg_bind_user); + if (n_bind_user_uid > 0) { + /* Right after the UID shift, we'll receive the list of UID mappings for the + * --bind-user= logic. Always a quadruplet of payload and host UID + GID. */ + + bind_user_uid = new(uid_t, n_bind_user_uid*4); + if (!bind_user_uid) + return log_oom(); + + for (size_t i = 0; i < n_bind_user_uid; i++) { + l = recv(uid_shift_socket_pair[0], bind_user_uid + i*4, sizeof(uid_t)*4, 0); + if (l < 0) + return log_error_errno(errno, "Failed to read user UID map pair: %m"); + if (l != sizeof(uid_t)*4) + return log_full_errno(l == 0 ? 
LOG_DEBUG : LOG_WARNING, + SYNTHETIC_ERRNO(EIO), + "Short read while reading bind user UID pairs."); + } + } } if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) { @@ -4768,7 +4925,7 @@ static int run_container( if (!barrier_place_and_sync(&barrier)) /* #1 */ return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early."); - r = setup_uid_map(*pid); + r = setup_uid_map(*pid, bind_user_uid, n_bind_user_uid); if (r < 0) return r; From a06c9ac27782f21cd0eaf4078f4588b4f8cd2585 Mon Sep 17 00:00:00 2001 From: Lennart Poettering <lennart@poettering.net> Date: Fri, 7 May 2021 11:44:26 +0200 Subject: [PATCH 4/4] man: document new nspawn --bind-user= feature --- docs/UIDS-GIDS.md | 5 ++-- man/systemd-nspawn.xml | 52 ++++++++++++++++++++++++++++++++++++++++++ man/systemd.nspawn.xml | 10 ++++++++ 3 files changed, 65 insertions(+), 2 deletions(-) diff --git a/docs/UIDS-GIDS.md b/docs/UIDS-GIDS.md index e289a9b68e..5342ccd166 100644 --- a/docs/UIDS-GIDS.md +++ b/docs/UIDS-GIDS.md @@ -241,8 +241,9 @@ the artifacts the container manager persistently leaves in the system. 
| 5 | `tty` group | `systemd` | `/etc/passwd` | | 6…999 | System users | Distributions | `/etc/passwd` | | 1000…60000 | Regular users | Distributions | `/etc/passwd` + LDAP/NIS/… | -| 60001…60513 | Human Users (homed) | `systemd` | `nss-systemd` | -| 60514…61183 | Unused | | | +| 60001…60513 | Human users (homed) | `systemd` | `nss-systemd` | +| 60514…60577 | Host users mapped into containers | `systemd` | `systemd-nspawn` | +| 60578…61183 | Unused | | | | 61184…65519 | Dynamic service users | `systemd` | `nss-systemd` | | 65520…65533 | Unused | | | | 65534 | `nobody` user | Linux | `/etc/passwd` + `nss-systemd` | diff --git a/man/systemd-nspawn.xml b/man/systemd-nspawn.xml index 403636545a..e929d32f62 100644 --- a/man/systemd-nspawn.xml +++ b/man/systemd-nspawn.xml @@ -1352,6 +1352,58 @@ After=sys-subsystem-net-devices-ens1.device</programlisting> make them read-only, using <option>--bind-ro=</option>.</para></listitem> </varlistentry> + <varlistentry> + <term><option>--bind-user=</option></term> + + <listitem><para>Binds the home directory of the specified user on the host into the container. Takes + the name of an existing user on the host as argument. May be used multiple times to bind multiple + users into the container. This does three things:</para> + + <orderedlist> + <listitem><para>The user's home directory is bind mounted from the host into + <filename>/run/host/home/</filename>.</para></listitem> + + <listitem><para>An additional UID/GID mapping is added that maps the host user's UID/GID to a + container UID/GID, allocated from the 60514…60577 range.</para></listitem> + + <listitem><para>A JSON user and group record is generated in <filename>/run/userdb/</filename> that + describes the mapped user. It contains a minimized representation of the host's user record, + adjusted to the UID/GID and home directory path assigned to the user in the container. 
The + <citerefentry><refentrytitle>nss-systemd</refentrytitle><manvolnum>8</manvolnum></citerefentry> + glibc NSS module will pick up these records from there and make them available in the container's + user/group databases.</para></listitem> + </orderedlist> + + <para>The combination of the three operations above ensures that it is possible to log into the + host's user account inside the container as if it were local to the container. The user is only mapped + transiently, while the container is running, and the mapping itself does not result in persistent + changes to the container (except maybe for generated log messages at login time, and similar). Note + that in particular the UID/GID assignment in the container is not made persistent. If the user is + mapped transiently, it is best to not allow the user to make persistent changes to the container. If + the user leaves files or directories owned by the user, and those UIDs/GIDs are recycled during later + container invocations (possibly with a different <option>--bind-user=</option> mapping), those files + and directories will be accessible to the "new" user.</para> + + <para>The user/group record mapping only works if the container contains systemd 249 or newer, with + <command>nss-systemd</command> properly configured in <filename>nsswitch.conf</filename>. See + <citerefentry><refentrytitle>nss-systemd</refentrytitle><manvolnum>8</manvolnum></citerefentry> for + details.</para> + + <para>Note that the user record propagated from the host into the container will contain the UNIX + password hash of the user, so that seamless logins in the container are possible. If the container is + less trusted than the host, it is hence important to use a strong UNIX password hash function + (e.g. yescrypt or similar, with the <literal>$y$</literal> hash prefix).</para> + + <para>When binding a user from the host into the container, checks are executed to ensure that the + username is not yet known in the container. 
Moreover, it is checked that the UID/GID allocated for it + is not currently defined in the user/group databases of the container. Both checks directly access + the container's <filename>/etc/passwd</filename> and <filename>/etc/group</filename>, and thus might + not detect existing accounts in other databases.</para> + + <para>This operation is only supported in combination with + <option>--private-users=</option>/<option>-U</option>.</para></listitem> + </varlistentry> + <varlistentry> <term><option>--inaccessible=</option></term> diff --git a/man/systemd.nspawn.xml b/man/systemd.nspawn.xml index 186616b6ad..7ba8e361b4 100644 --- a/man/systemd.nspawn.xml +++ b/man/systemd.nspawn.xml @@ -415,6 +415,16 @@ is privileged (see above).</para></listitem> </varlistentry> + <varlistentry> + <term><varname>BindUser=</varname></term> + + <listitem><para>Binds a user from the host into the container. This option is equivalent to the + command line switch <option>--bind-user=</option>, see + <citerefentry><refentrytitle>systemd-nspawn</refentrytitle><manvolnum>1</manvolnum></citerefentry> + for details about the specific options supported. This setting is privileged (see + above).</para></listitem> + </varlistentry> + <varlistentry> <term><varname>TemporaryFileSystem=</varname></term>