From 6402d5c628f1872a4874508bbe975aaac1cc747b Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 21 Apr 2016 12:47:36 +0200 Subject: [PATCH 01/10] util: copy_file_range() returns EBADF when used on a tty In nspawn we invoke copy_bytes() on a TTY fd. copy_file_range() returns EBADF on a TTY and this error is considered fatal by copy_bytes() so far. Correct that, so that nspawn's copy_bytes() operation works again. This is a follow-up for a44202e98b638024c45e50ad404c7069c7835c04. --- src/basic/copy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/basic/copy.c b/src/basic/copy.c index e2db4be9ff4..03487a6878d 100644 --- a/src/basic/copy.c +++ b/src/basic/copy.c @@ -102,7 +102,7 @@ int copy_bytes(int fdf, int fdt, uint64_t max_bytes, bool try_reflink) { if (try_cfr) { n = try_copy_file_range(fdf, NULL, fdt, NULL, m, 0u); if (n < 0) { - if (!IN_SET(n, -EINVAL, -ENOSYS, -EXDEV)) + if (!IN_SET(n, -EINVAL, -ENOSYS, -EXDEV, -EBADF)) return n; try_cfr = false; From 7336138eedf1c9b09b432428c4cccc2da25ab9e0 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 20 Apr 2016 22:53:39 +0200 Subject: [PATCH 02/10] nspawn: optionally fix up OS tree uid/gids for userns This adds a new --private-userns-chown switch that may be used in combination with --private-userns. If it is passed a recursive chmod() operation is run on the OS tree, fixing all file owner UID/GIDs to the right ranges. This should make user namespacing pretty workable, as the OS trees don't need to be prepared manually anymore. --- .gitignore | 1 + Makefile.am | 13 ++ src/nspawn/nspawn-patch-uid.c | 417 ++++++++++++++++++++++++++++++++++ src/nspawn/nspawn-patch-uid.h | 23 ++ src/nspawn/nspawn.c | 47 +++- src/nspawn/test-patch-uid.c | 61 +++++ 6 files changed, 560 insertions(+), 2 deletions(-) create mode 100644 src/nspawn/nspawn-patch-uid.c create mode 100644 src/nspawn/nspawn-patch-uid.h create mode 100644 src/nspawn/test-patch-uid.c diff --git a/.gitignore b/.gitignore index 02ba86ef6f7..c7eb14452db 100644 --- a/.gitignore +++ b/.gitignore @@ -240,6 +240,7 @@ /test-ns /test-nss /test-parse-util +/test-patch-uid /test-path /test-path-lookup /test-path-util diff --git a/Makefile.am b/Makefile.am index 0f475c6d095..b323de55c6a 100644 --- a/Makefile.am +++ b/Makefile.am @@ -3021,6 +3021,8 @@ systemd_nspawn_SOURCES = \ src/nspawn/nspawn-setuid.h \ src/nspawn/nspawn-stub-pid1.c \ src/nspawn/nspawn-stub-pid1.h \ + src/nspawn/nspawn-patch-uid.c \ + src/nspawn/nspawn-patch-uid.h \ src/core/mount-setup.c \ src/core/mount-setup.h \ src/core/loopback-setup.c \ @@ -3048,6 +3050,17 @@ systemd_nspawn_LDADD += \ libfirewall.la endif +test_patch_uid_SOURCES = \ + src/nspawn/nspawn-patch-uid.c \ + src/nspawn/nspawn-patch-uid.h \ + src/nspawn/test-patch-uid.c + +test_patch_uid_LDADD = \ + libshared.la + +manual_tests += \ + test-patch-uid + # ------------------------------------------------------------------------------ systemd_run_SOURCES = \ src/run/run.c diff --git a/src/nspawn/nspawn-patch-uid.c b/src/nspawn/nspawn-patch-uid.c new file mode 100644 index 00000000000..f53164fbbf6 --- /dev/null +++ b/src/nspawn/nspawn-patch-uid.c @@ -0,0 +1,417 @@ +/*** + This file is part of systemd. + + Copyright 2016 Lennart Poettering + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see . +***/ + +#include +#ifdef HAVE_ACL +#include +#endif +#include +#include + +#include "acl-util.h" +#include "dirent-util.h" +#include "fd-util.h" +#include "nspawn-patch-uid.h" +#include "stdio-util.h" +#include "string-util.h" +#include "strv.h" +#include "user-util.h" + +#ifdef HAVE_ACL + +static int get_acl(int fd, const char *name, acl_type_t type, acl_t *ret) { + char procfs_path[strlen("/proc/self/fd/") + DECIMAL_STR_MAX(int) + 1]; + acl_t acl; + + assert(fd >= 0); + assert(ret); + + if (name) { + _cleanup_close_ int child_fd = -1; + + child_fd = openat(fd, name, O_PATH|O_CLOEXEC|O_NOFOLLOW); + if (child_fd < 0) + return -errno; + + xsprintf(procfs_path, "/proc/self/fd/%i", child_fd); + acl = acl_get_file(procfs_path, type); + } else if (type == ACL_TYPE_ACCESS) + acl = acl_get_fd(fd); + else { + xsprintf(procfs_path, "/proc/self/fd/%i", fd); + acl = acl_get_file(procfs_path, type); + } + if (!acl) + return -errno; + + *ret = acl; + return 0; +} + +static int set_acl(int fd, const char *name, acl_type_t type, acl_t acl) { + char procfs_path[strlen("/proc/self/fd/") + DECIMAL_STR_MAX(int) + 1]; + int r; + + assert(fd >= 0); + assert(acl); + + if (name) { + _cleanup_close_ int child_fd = -1; + + child_fd = openat(fd, name, O_PATH|O_CLOEXEC|O_NOFOLLOW); + if (child_fd < 0) + return -errno; + + xsprintf(procfs_path, "/proc/self/fd/%i", child_fd); + r = acl_set_file(procfs_path, type, acl); + } else if (type == ACL_TYPE_ACCESS) + r = acl_set_fd(fd, acl); + else { + xsprintf(procfs_path, "/proc/self/fd/%i", fd); + r = acl_set_file(procfs_path, type, acl); + } + if (r < 0) + return -errno; + + return 0; +} + +static int shift_acl(acl_t acl, uid_t shift, acl_t *ret) { + _cleanup_(acl_freep) acl_t copy = NULL; + acl_entry_t i; + int r; + + assert(acl); + assert(ret); + + r = acl_get_entry(acl, ACL_FIRST_ENTRY, &i); + if (r < 0) + return -errno; + while (r > 0) { + uid_t *old_uid, new_uid; + bool modify = false; + acl_tag_t tag; + + if (acl_get_tag_type(i, &tag) < 0) + return -errno; + + if (IN_SET(tag, ACL_USER, ACL_GROUP)) { + + /* We don't distuingish here between uid_t and gid_t, let's make sure the compiler checks that + * this is actually OK */ + assert_cc(sizeof(uid_t) == sizeof(gid_t)); + + old_uid = acl_get_qualifier(i); + if (!old_uid) + return -errno; + + new_uid = shift | (*old_uid & UINT32_C(0xFFFF)); + if (!uid_is_valid(new_uid)) + return -EINVAL; + + modify = new_uid != *old_uid; + if (modify && !copy) { + int n; + + /* There's no copy of the ACL yet? if so, let's create one, and start the loop from the + * beginning, so that we copy all entries, starting from the first, this time. */ + + n = acl_entries(acl); + if (n < 0) + return -errno; + + copy = acl_init(n); + if (!copy) + return -errno; + + /* Seek back to the beginning */ + r = acl_get_entry(acl, ACL_FIRST_ENTRY, &i); + if (r < 0) + return -errno; + continue; + } + } + + if (copy) { + acl_entry_t new_entry; + + if (acl_create_entry(©, &new_entry) < 0) + return -errno; + + if (acl_copy_entry(new_entry, i) < 0) + return -errno; + + if (modify) + if (acl_set_qualifier(new_entry, &new_uid) < 0) + return -errno; + } + + r = acl_get_entry(acl, ACL_NEXT_ENTRY, &i); + if (r < 0) + return -errno; + } + + *ret = copy; + copy = NULL; + + return !!*ret; +} + +static int patch_acls(int fd, const char *name, const struct stat *st, uid_t shift) { + _cleanup_(acl_freep) acl_t acl = NULL, shifted = NULL; + bool changed = false; + int r; + + assert(fd >= 0); + assert(st); + + /* ACLs are not supported on symlinks, there's no point in trying */ + if (S_ISLNK(st->st_mode)) + return 0; + + r = get_acl(fd, name, ACL_TYPE_ACCESS, &acl); + if (r == -EOPNOTSUPP) + return 0; + if (r < 0) + return r; + + r = shift_acl(acl, shift, &shifted); + if (r < 0) + return r; + if (r > 0) { + r = set_acl(fd, name, ACL_TYPE_ACCESS, shifted); + if (r < 0) + return r; + + changed = true; + } + + if (S_ISDIR(st->st_mode)) { + acl_free(acl); + acl_free(shifted); + + acl = shifted = NULL; + + r = get_acl(fd, name, ACL_TYPE_DEFAULT, &acl); + if (r < 0) + return r; + + r = shift_acl(acl, shift, &shifted); + if (r < 0) + return r; + if (r > 0) { + r = set_acl(fd, name, ACL_TYPE_DEFAULT, shifted); + if (r < 0) + return r; + + changed = true; + } + } + + return changed; +} + +#else + +static int patch_acls(int fd, const char *name, const struct stat *st, uid_t shift) { + return 0; +} + +#endif + +static int patch_fd(int fd, const char *name, const struct stat *st, uid_t shift) { + uid_t new_uid; + gid_t new_gid; + bool changed = false; + int r; + + assert(fd >= 0); + assert(st); + + new_uid = shift | (st->st_uid & UINT32_C(0xFFFF)); + new_gid = (gid_t) shift | (st->st_gid & UINT32_C(0xFFFF)); + + if (!uid_is_valid(new_uid) || !gid_is_valid(new_gid)) + return -EINVAL; + + if (st->st_uid != new_uid || st->st_gid != new_gid) { + if (name) + r = fchownat(fd, name, new_uid, new_gid, AT_SYMLINK_NOFOLLOW); + else + r = fchown(fd, new_uid, new_gid); + if (r < 0) + return -errno; + + /* The Linux kernel alters the mode in some cases of chown(). Let's undo this. */ + if (name && !S_ISLNK(st->st_mode)) + r = fchmodat(fd, name, st->st_mode, 0); + else + r = fchmod(fd, st->st_mode); + if (r < 0) + return -errno; + + changed = true; + } + + r = patch_acls(fd, name, st, shift); + if (r < 0) + return r; + + return r > 0 || changed; +} + +static int recurse_fd(int fd, bool donate_fd, const struct stat *st, uid_t shift) { + bool changed = false; + int r; + + assert(fd >= 0); + + r = patch_fd(fd, NULL, st, shift); + if (r < 0) + goto finish; + + if (S_ISDIR(st->st_mode)) { + _cleanup_closedir_ DIR *d = NULL; + struct dirent *de; + + if (!donate_fd) { + int copy; + + copy = fcntl(fd, F_DUPFD_CLOEXEC, 3); + if (copy < 0) + return -errno; + + fd = copy; + donate_fd = true; + } + + d = fdopendir(fd); + if (!d) { + r = -errno; + goto finish; + } + fd = -1; + + FOREACH_DIRENT_ALL(de, d, r = -errno; goto finish) { + struct stat fst; + + if (STR_IN_SET(de->d_name, ".", "..")) + continue; + + if (fstatat(dirfd(d), de->d_name, &fst, AT_SYMLINK_NOFOLLOW) < 0) { + r = -errno; + goto finish; + } + + if (S_ISDIR(fst.st_mode)) { + int subdir_fd; + + subdir_fd = openat(dirfd(d), de->d_name, O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW|O_NOATIME); + if (subdir_fd < 0) { + r = -errno; + goto finish; + + } + + r = recurse_fd(subdir_fd, true, &fst, shift); + if (r < 0) + goto finish; + if (r > 0) + changed = true; + + } else { + r = patch_fd(dirfd(d), de->d_name, &fst, shift); + if (r < 0) + goto finish; + if (r > 0) + changed = true; + } + } + } + + r = changed; + +finish: + if (donate_fd) + safe_close(fd); + + return r; +} + +static int fd_patch_uid_internal(int fd, bool donate_fd, uid_t shift, uid_t range) { + struct stat st; + int r; + + assert(fd >= 0); + + /* Recursively adjusts the UID/GIDs of all files of a directory tree. This is used to automatically fix up an + * OS tree to the used user namespace UID range. Note that this automatic adjustment only works for UID ranges + * following the concept that the upper 16bit of a UID identify the container, and the lower 16bit are the actual + * UID within the container. */ + + if ((shift & 0xFFFF) != 0) { + /* We only support containers where the shift starts at a 2^16 boundary */ + r = -EOPNOTSUPP; + goto finish; + } + + if (range != 0x10000) { + /* We only support containers with 16bit UID ranges for the patching logic */ + r = -EOPNOTSUPP; + goto finish; + } + + if (fstat(fd, &st) < 0) { + r = -errno; + goto finish; + } + + if ((uint32_t) st.st_uid >> 16 != (uint32_t) st.st_gid >> 16) { + /* We only support containers where the uid/gid container ID match */ + r = -EBADE; + goto finish; + } + + /* Try to detect if the range is already right. Of course, this a pretty drastic optimization, as we assume + * that if the top-level dir has the right upper 16bit assigned, then everything below will have too... */ + if (((uint32_t) (st.st_uid ^ shift) >> 16) == 0) + return 0; + + return recurse_fd(fd, donate_fd, &st, shift); + +finish: + if (donate_fd) + safe_close(fd); + + return r; +} + +int fd_patch_uid(int fd, uid_t shift, uid_t range) { + return fd_patch_uid_internal(fd, false, shift, range); +} + +int path_patch_uid(const char *path, uid_t shift, uid_t range) { + int fd; + + fd = open(path, O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW|O_NOATIME); + if (fd < 0) + return -errno; + + return fd_patch_uid_internal(fd, true, shift, range); +} diff --git a/src/nspawn/nspawn-patch-uid.h b/src/nspawn/nspawn-patch-uid.h new file mode 100644 index 00000000000..55d09900164 --- /dev/null +++ b/src/nspawn/nspawn-patch-uid.h @@ -0,0 +1,23 @@ +/*** + This file is part of systemd. + + Copyright 2016 Lennart Poettering + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see . +***/ + +#include + +int fd_patch_uid(int fd, uid_t shift, uid_t range); +int path_patch_uid(const char *path, uid_t shift, uid_t range); diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index e1d37d383a2..a3190545fa7 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -75,6 +75,7 @@ #include "nspawn-expose-ports.h" #include "nspawn-mount.h" #include "nspawn-network.h" +#include "nspawn-patch-uid.h" #include "nspawn-register.h" #include "nspawn-settings.h" #include "nspawn-setuid.h" @@ -175,6 +176,7 @@ static ExposePort *arg_expose_ports = NULL; static char **arg_property = NULL; static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U; static bool arg_userns = false; +static bool arg_userns_chown = false; static int arg_kill_signal = 0; static bool arg_unified_cgroup_hierarchy = false; static SettingsMask arg_settings_mask = 0; @@ -204,6 +206,7 @@ static void help(void) { " --property=NAME=VALUE Set scope unit property\n" " --private-users[=UIDBASE[:NUIDS]]\n" " Run within user namespace\n" + " --private-user-chown Adjust OS tree file ownership for private user range\n" " --private-network Disable network in container\n" " --network-interface=INTERFACE\n" " Assign an existing network interface to the\n" @@ -349,6 +352,7 @@ static int parse_argv(int argc, char *argv[]) { ARG_KILL_SIGNAL, ARG_SETTINGS, ARG_CHDIR, + ARG_PRIVATE_USERS_CHOWN, }; static const struct option options[] = { @@ -392,6 +396,7 @@ static int parse_argv(int argc, char *argv[]) { { "port", required_argument, NULL, 'p' }, { "property", required_argument, NULL, ARG_PROPERTY }, { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS }, + { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN}, { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL }, { "settings", required_argument, NULL, ARG_SETTINGS }, { "chdir", required_argument, NULL, ARG_CHDIR }, @@ -825,6 +830,10 @@ static int parse_argv(int argc, char *argv[]) { arg_userns = true; break; + case ARG_PRIVATE_USERS_CHOWN: + arg_userns_chown = true; + break; + case ARG_KILL_SIGNAL: arg_kill_signal = signal_from_string_try_harder(optarg); if (arg_kill_signal < 0) { @@ -933,8 +942,15 @@ static int parse_argv(int argc, char *argv[]) { return -EINVAL; } - if (arg_userns && access("/proc/self/uid_map", F_OK) < 0) - return log_error_errno(EOPNOTSUPP, "--private-users= is not supported, kernel compiled without user namespace support."); + if (arg_userns && access("/proc/self/uid_map", F_OK) < 0) { + log_error("--private-users= is not supported, kernel compiled without user namespace support."); + return -EOPNOTSUPP; + } + + if (arg_userns_chown && arg_read_only) { + log_error("--read-only and --private-users-chown may not be combined."); + return -EINVAL; + } if (argc > optind) { arg_parameters = strv_copy(argv + optind); @@ -2218,6 +2234,29 @@ static int setup_machine_id(const char *directory) { return 0; } +static int recursive_chown(const char *directory, uid_t shift, uid_t range) { + int r; + + assert(directory); + + if (!arg_userns || !arg_userns_chown) + return 0; + + r = path_patch_uid(directory, arg_uid_shift, arg_uid_range); + if (r == -EOPNOTSUPP) + return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16."); + if (r == -EBADE) + return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match."); + if (r < 0) + return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m"); + if (r == 0) + log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation."); + else + log_debug("Patched directory tree to match UID/GID range."); + + return r; +} + static int mount_devices( const char *where, const char *root_device, bool root_device_rw, @@ -2763,6 +2802,10 @@ static int outer_child( if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0) return log_error_errno(errno, "Failed to make bind mount: %m"); + r = recursive_chown(directory, arg_uid_shift, arg_uid_range); + if (r < 0) + return r; + r = setup_volatile(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context); if (r < 0) return r; diff --git a/src/nspawn/test-patch-uid.c b/src/nspawn/test-patch-uid.c new file mode 100644 index 00000000000..11c53217885 --- /dev/null +++ b/src/nspawn/test-patch-uid.c @@ -0,0 +1,61 @@ +/*** + This file is part of systemd. + + Copyright 2016 Lennart Poettering + + systemd is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. + + systemd is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with systemd; If not, see . +***/ + +#include + +#include "log.h" +#include "nspawn-patch-uid.h" +#include "user-util.h" +#include "util.h" + +int main(int argc, char *argv[]) { + uid_t shift, range; + int r; + + log_set_max_level(LOG_DEBUG); + log_parse_environment(); + log_open(); + + if (argc != 4) { + log_error("Expected PATH SHIFT RANGE parameters."); + return EXIT_FAILURE; + } + + r = parse_uid(argv[2], &shift); + if (r < 0) { + log_error_errno(r, "Failed to parse UID shift %s.", argv[2]); + return EXIT_FAILURE; + } + + r = parse_gid(argv[3], &range); + if (r < 0) { + log_error_errno(r, "Failed to parse UID range %s.", argv[3]); + return EXIT_FAILURE; + } + + r = path_patch_uid(argv[1], shift, range); + if (r < 0) { + log_error_errno(r, "Failed to patch directory tree: %m"); + return EXIT_FAILURE; + } + + log_info("Changed: %s", yes_no(r)); + + return EXIT_SUCCESS; +} From 0e7ac7515f2fe0782f4062bb223904e2748b535d Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 22 Apr 2016 11:28:09 +0200 Subject: [PATCH 03/10] nspawn: optionally, automatically allocate a UID/GID range for userns containers This adds the new value "pick" to --private-users=. When specified a new UID/GID range of 65536 users is automatically and randomly allocated from the host range 0x00080000-0xDFFF0000 and used for the container. The setting implies --private-users-chown, so that container directory is recursively chown()ed to the newly allocated UID/GID range, if that's necessary. As an optimization before picking a randomized UID/GID the UID of the container's root directory is used as starting point and used if currently not used otherwise. To protect against using the same UID/GID range multiple times a few mechanisms are in place: - The first and the last UID and GID of the range are checked with getpwuid() and getgrgid(). If an entry already exists a different range is picked. Note that by "last" UID the user 65534 is used, as 65535 is the 16bit (uid_t) -1. - A lock file for the range is taken in /run/systemd/nspawn-uid/. Since the ranges are taken in a non-overlapping fashion, and always start on 64K boundaries this allows us to maintain a single lock file for each range that can be randomly picked. This protects nspawn from picking the same range in two parallel instances. - If possible the /etc/passwd lock file is taken while a new range is selected until the container is up. This means adduser/addgroup should safely avoid the range as long as nss-mymachines is used, since the allocated range will then show up in the user database. The UID/GID range nspawn picks from is compiled in and not configurable at the moment. That should probably stay that way, since we already provide ways how users can pick their own ranges manually if they don't like the automatic logic. The new --private-users=pick logic makes user namespacing pretty useful now, as it relieves the user from managing UID/GID ranges. --- src/nspawn/nspawn.c | 224 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 188 insertions(+), 36 deletions(-) diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index a3190545fa7..c330456ff98 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -22,7 +22,9 @@ #endif #include #include +#include #include +#include #include #ifdef HAVE_SECCOMP #include @@ -102,6 +104,11 @@ #include "user-util.h" #include "util.h" +/* Note that devpts's gid= parameter parses GIDs as signed values, hence we stay away from the upper half of the 32bit + * UID range here */ +#define UID_SHIFT_PICK_MIN ((uid_t) UINT32_C(0x00080000)) +#define UID_SHIFT_PICK_MAX ((uid_t) UINT32_C(0x6FFF0000)) + typedef enum ContainerStatus { CONTAINER_TERMINATED, CONTAINER_REBOOTED @@ -174,9 +181,10 @@ static char *arg_image = NULL; static VolatileMode arg_volatile_mode = VOLATILE_NO; static ExposePort *arg_expose_ports = NULL; static char **arg_property = NULL; -static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U; static bool arg_userns = false; static bool arg_userns_chown = false; +static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U; +static bool arg_uid_shift_pick = false; static int arg_kill_signal = 0; static bool arg_unified_cgroup_hierarchy = false; static SettingsMask arg_settings_mask = 0; @@ -275,9 +283,15 @@ static int custom_mounts_prepare(void) { for (i = 0; i < arg_n_custom_mounts; i++) { CustomMount *m = &arg_custom_mounts[i]; - if (arg_userns && arg_uid_shift == UID_INVALID && path_equal(m->destination, "/")) { - log_error("--private-users with automatic UID shift may not be combined with custom root mounts."); - return -EINVAL; + if (path_equal(m->destination, "/") && arg_userns) { + + if (arg_userns_chown) { + log_error("--private-users-chown may not be combined with custom root mounts."); + return -EINVAL; + } else if (arg_uid_shift == UID_INVALID) { + log_error("--private-users with automatic UID shift may not be combined with custom root mounts."); + return -EINVAL; + } } if (m->type != CUSTOM_MOUNT_OVERLAY) @@ -806,25 +820,37 @@ static int parse_argv(int argc, char *argv[]) { _cleanup_free_ char *buffer = NULL; const char *range, *shift; - range = strchr(optarg, ':'); - if (range) { - buffer = strndup(optarg, range - optarg); - if (!buffer) - return log_oom(); - shift = buffer; + if (streq(optarg, "pick")) { + arg_uid_shift = UID_INVALID; + arg_uid_range = 0x10000U; + arg_uid_shift_pick = true; + } else { + range = strchr(optarg, ':'); + if (range) { + buffer = strndup(optarg, range - optarg); + if (!buffer) + return log_oom(); + shift = buffer; - range++; - if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) { - log_error("Failed to parse UID range: %s", range); + range++; + if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) { + log_error("Failed to parse UID range: %s", range); + return -EINVAL; + } + } else + shift = optarg; + + if (parse_uid(shift, &arg_uid_shift) < 0) { + log_error("Failed to parse UID: %s", optarg); return -EINVAL; } - } else - shift = optarg; - if (parse_uid(shift, &arg_uid_shift) < 0) { - log_error("Failed to parse UID: %s", optarg); - return -EINVAL; + arg_uid_shift_pick = false; } + } else { + arg_uid_shift = UID_INVALID; + arg_uid_range = 0x10000U; + arg_uid_shift_pick = false; } arg_userns = true; @@ -902,6 +928,9 @@ static int parse_argv(int argc, char *argv[]) { if (arg_share_system) arg_register = false; + if (arg_uid_shift_pick) + arg_userns_chown = true; + if (arg_start_mode != START_PID1 && arg_share_system) { log_error("--boot and --share-system may not be combined."); return -EINVAL; @@ -2501,7 +2530,6 @@ static int determine_uid_shift(const char *directory) { return -EINVAL; } - log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range); return 0; } @@ -2789,6 +2817,7 @@ static int outer_child( return r; if (arg_userns) { + /* Let the parent know which UID shift we read from the image */ l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL); if (l < 0) return log_error_errno(errno, "Failed to send UID shift: %m"); @@ -2796,6 +2825,22 @@ static int outer_child( log_error("Short write while sending UID shift."); return -EIO; } + + if (arg_uid_shift_pick) { + /* When we are supposed to pick the UID shift, the parent will check now whether the UID shift + * we just read from the image is available. If yes, it will send the UID shift back to us, if + * not it will pick a different one, and send it back to us. */ + + l = recv(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0); + if (l < 0) + return log_error_errno(errno, "Failed to recv UID shift: %m"); + if (l != sizeof(arg_uid_shift)) { + log_error("Short read while recieving UID shift."); + return -EIO; + } + } + + log_info("Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range); } /* Turn directory into bind mount */ @@ -2925,6 +2970,61 @@ static int outer_child( return 0; } +static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) { + unsigned n_tries = 100; + uid_t candidate; + int r; + + assert(shift); + assert(ret_lock_file); + assert(arg_uid_shift_pick); + assert(arg_uid_range == 0x10000U); + + candidate = *shift; + + (void) mkdir("/run/systemd/nspawn-uid", 0755); + + for (;;) { + char lock_path[strlen("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1]; + _cleanup_release_lock_file_ LockFile lf = LOCK_FILE_INIT; + + if (--n_tries <= 0) + return -EBUSY; + + if (candidate < UID_SHIFT_PICK_MIN || candidate > UID_SHIFT_PICK_MAX) + goto next; + if ((candidate & UINT32_C(0xFFFF)) != 0) + goto next; + + xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, candidate); + r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lf); + if (r == -EBUSY) /* Range already taken by another nspawn instance */ + goto next; + if (r < 0) + return r; + + /* Make some superficial checks whether the range is currently known in the user database */ + if (getpwuid(candidate)) + goto next; + if (getpwuid(candidate + UINT32_C(0xFFFE))) + goto next; + if (getgrgid(candidate)) + goto next; + if (getgrgid(candidate + UINT32_C(0xFFFE))) + goto next; + + *ret_lock_file = lf; + lf = (struct LockFile) LOCK_FILE_INIT; + *shift = candidate; + return 0; + + next: + random_bytes(&candidate, sizeof(candidate)); + candidate = (candidate % (UID_SHIFT_PICK_MAX - UID_SHIFT_PICK_MIN)) + UID_SHIFT_PICK_MIN; + candidate &= (uid_t) UINT32_C(0xFFFF0000); + } +} + static int setup_uid_map(pid_t pid) { char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1]; int r; @@ -3394,20 +3494,42 @@ int main(int argc, char *argv[]) { } for (;;) { - _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 }, - pid_socket_pair[2] = { -1, -1 }, uuid_socket_pair[2] = { -1, -1 }, uid_shift_socket_pair[2] = { -1, -1 }; - ContainerStatus container_status; - _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL; static const struct sigaction sa = { .sa_handler = nop_signal_handler, .sa_flags = SA_NOCLDSTOP, }; - int ifi = 0; - ssize_t l; + + _cleanup_release_lock_file_ LockFile uid_shift_lock = LOCK_FILE_INIT; + _cleanup_close_ int etc_passwd_lock = -1; + _cleanup_close_pair_ int + kmsg_socket_pair[2] = { -1, -1 }, + rtnl_socket_pair[2] = { -1, -1 }, + pid_socket_pair[2] = { -1, -1 }, + uuid_socket_pair[2] = { -1, -1 }, + uid_shift_socket_pair[2] = { -1, -1 }; + _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL; _cleanup_(sd_event_unrefp) sd_event *event = NULL; _cleanup_(pty_forward_freep) PTYForward *forward = NULL; _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; + ContainerStatus container_status; char last_char = 0; + int ifi = 0; + ssize_t l; + + if (arg_uid_shift_pick) { + /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely + * check with getpwuid() if the specific user already exists. Note that /etc might be + * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we + * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are + * really just an extra safety net. We kinda assume that the UID range we allocate from is + * really ours. */ + + etc_passwd_lock = take_etc_passwd_lock(NULL); + if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS) { + log_error_errno(r, "Failed to take /etc/passwd lock: %m"); + goto finish; + } + } r = barrier_create(&barrier); if (r < 0) { @@ -3511,6 +3633,43 @@ int main(int argc, char *argv[]) { uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]); uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]); + if (arg_userns) { + /* The child just let us know the UID shift it might have read from the image. */ + l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0); + if (l < 0) { + r = log_error_errno(errno, "Failed to read UID shift: %m"); + goto finish; + } + if (l != sizeof(arg_uid_shift)) { + log_error("Short read while reading UID shift."); + r = EIO; + goto finish; + } + + if (arg_uid_shift_pick) { + /* If we are supposed to pick the UID shift, let's try to use the shift read from the + * image, but if that's already in use, pick a new one, and report back to the child, + * which one we now picked. */ + + r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock); + if (r < 0) { + log_error_errno(r, "Failed to pick suitable UID/GID range: %m"); + goto finish; + } + + l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL); + if (l < 0) { + r = log_error_errno(errno, "Failed to send UID shift: %m"); + goto finish; + } + if (l != sizeof(arg_uid_shift)) { + log_error("Short write while writing UID shift."); + r = -EIO; + goto finish; + } + } + } + /* Wait for the outer child. */ r = wait_for_terminate_and_warn("namespace helper", pid, NULL); if (r < 0) @@ -3554,17 +3713,6 @@ int main(int argc, char *argv[]) { goto finish; } - l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0); - if (l < 0) { - r = log_error_errno(errno, "Failed to read UID shift: %m"); - goto finish; - } - if (l != sizeof(arg_uid_shift)) { - log_error("Short read while reading UID shift."); - r = EIO; - goto finish; - } - r = setup_uid_map(pid); if (r < 0) goto finish; @@ -3662,6 +3810,10 @@ int main(int argc, char *argv[]) { goto finish; } + /* At this point we have made use of the UID we picked, and thus nss-mymachines will make them appear + * in getpwuid(), thus we can release the /etc/passwd lock. */ + etc_passwd_lock = safe_close(etc_passwd_lock); + sd_notifyf(false, "READY=1\n" "STATUS=Container running.\n" From 19aac838fc3b7bcaed272f19a0bec3962eef7418 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 22 Apr 2016 11:47:35 +0200 Subject: [PATCH 04/10] nspawn: add -U as shortcut for --private-users=pick Given that user namespacing is pretty useful now, let's add a shortcut command line switch for the logic. --- src/nspawn/nspawn.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index c330456ff98..3e32f59f757 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -212,9 +212,10 @@ static void help(void) { " --uuid=UUID Set a specific machine UUID for the container\n" " -S --slice=SLICE Place the container in the specified slice\n" " --property=NAME=VALUE Set scope unit property\n" + " -U --private-users=pick Run within user namespace, pick UID/GID range automatically\n" " --private-users[=UIDBASE[:NUIDS]]\n" - " Run within user namespace\n" - " --private-user-chown Adjust OS tree file ownership for private user range\n" + " Run within user namespace, user configured UID/GID range\n" + " --private-user-chown Adjust OS tree file ownership for private UID/GID range\n" " --private-network Disable network in container\n" " --network-interface=INTERFACE\n" " Assign an existing network interface to the\n" @@ -425,7 +426,7 @@ static int parse_argv(int argc, char *argv[]) { assert(argc >= 0); assert(argv); - while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:n", options, NULL)) >= 0) + while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nU", options, NULL)) >= 0) switch (c) { @@ -860,6 +861,14 @@ static int parse_argv(int argc, char *argv[]) { arg_userns_chown = true; break; + case 'U': + arg_userns = true; + arg_userns_chown = true; + arg_uid_shift = UID_INVALID; + arg_uid_range = 0x10000U; + arg_uid_shift_pick = true; + break; + case ARG_KILL_SIGNAL: arg_kill_signal = signal_from_string_try_harder(optarg); if (arg_kill_signal < 0) { From 0de7accea9e0f7b696f108a43bff7e454fa8f016 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 22 Apr 2016 13:02:53 +0200 Subject: [PATCH 05/10] nspawn: allow configuration of user namespaces in .nspawn files In order to implement this we change the bool arg_userns into an enum UserNamespaceMode, which can take one of NO, PICK or FIXED, and replace the arg_uid_range_pick bool with it. --- src/nspawn/nspawn-gperf.gperf | 4 +- src/nspawn/nspawn-settings.c | 86 ++++++++++++++++ src/nspawn/nspawn-settings.h | 15 ++- src/nspawn/nspawn.c | 179 +++++++++++++++++++++++----------- 4 files changed, 223 insertions(+), 61 deletions(-) diff --git a/src/nspawn/nspawn-gperf.gperf b/src/nspawn/nspawn-gperf.gperf index 116655cdd25..34e1310e296 100644 --- a/src/nspawn/nspawn-gperf.gperf +++ b/src/nspawn/nspawn-gperf.gperf @@ -16,7 +16,7 @@ struct ConfigPerfItem; %includes %% Exec.Boot, config_parse_boot, 0, 0 -Exec.ProcessTwo, config_parse_pid2, 0, 0, +Exec.ProcessTwo, config_parse_pid2, 0, 0 Exec.Parameters, config_parse_strv, 0, offsetof(Settings, parameters) Exec.Environment, config_parse_strv, 0, offsetof(Settings, environment) Exec.User, config_parse_string, 0, offsetof(Settings, user) @@ -26,11 +26,13 @@ Exec.KillSignal, config_parse_signal, 0, offsetof(Settings, Exec.Personality, config_parse_personality, 0, offsetof(Settings, personality) Exec.MachineID, config_parse_id128, 0, offsetof(Settings, machine_id) Exec.WorkingDirectory, config_parse_path, 0, offsetof(Settings, working_directory) +Exec.PrivateUsers, config_parse_private_users, 0, 0 Files.ReadOnly, config_parse_tristate, 0, offsetof(Settings, read_only) Files.Volatile, config_parse_volatile_mode, 0, offsetof(Settings, volatile_mode) Files.Bind, config_parse_bind, 0, 0 Files.BindReadOnly, config_parse_bind, 1, 0 Files.TemporaryFileSystem, config_parse_tmpfs, 0, 0 +Files.PrivateUsersChown, config_parse_tristate, 0, offsetof(Settings, userns_chown) Network.Private, config_parse_tristate, 0, offsetof(Settings, private_network) Network.Interface, config_parse_strv, 0, offsetof(Settings, network_interfaces) Network.MACVLAN, config_parse_strv, 0, offsetof(Settings, network_macvlan) diff --git a/src/nspawn/nspawn-settings.c b/src/nspawn/nspawn-settings.c index 4fb00546981..b98a79fd094 100644 --- a/src/nspawn/nspawn-settings.c +++ b/src/nspawn/nspawn-settings.c @@ -25,7 +25,9 @@ #include "parse-util.h" #include "process-util.h" #include "strv.h" +#include "user-util.h" #include "util.h" +#include "string-util.h" int settings_load(FILE *f, const char *path, Settings **ret) { _cleanup_(settings_freep) Settings *s = NULL; @@ -40,9 +42,13 @@ int settings_load(FILE *f, const char *path, Settings **ret) { s->start_mode = _START_MODE_INVALID; s->personality = PERSONALITY_INVALID; + s->userns_mode = _USER_NAMESPACE_MODE_INVALID; + s->uid_shift = UID_INVALID; + s->uid_range = UID_INVALID; s->read_only = -1; s->volatile_mode = _VOLATILE_MODE_INVALID; + s->userns_chown = -1; s->private_network = -1; s->network_veth = -1; @@ -59,6 +65,16 @@ int settings_load(FILE *f, const char *path, Settings **ret) { if (r < 0) return r; + /* Make sure that if userns_mode is set, userns_chown is set to something appropriate, and vice versa. Either + * both fields shall be initialized or neither. */ + if (s->userns_mode == USER_NAMESPACE_PICK) + s->userns_chown = true; + else if (s->userns_mode != _USER_NAMESPACE_MODE_INVALID && s->userns_chown < 0) + s->userns_chown = false; + + if (s->userns_chown >= 0 && s->userns_mode == _USER_NAMESPACE_MODE_INVALID) + s->userns_mode = USER_NAMESPACE_NO; + *ret = s; s = NULL; @@ -392,3 +408,73 @@ conflict: log_syntax(unit, LOG_ERR, filename, line, r, "Conflicting Boot= or ProcessTwo= setting found. Ignoring."); return 0; } + +int config_parse_private_users( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + Settings *settings = data; + int r; + + assert(filename); + assert(lvalue); + assert(rvalue); + + r = parse_boolean(rvalue); + if (r == 0) { + /* no: User namespacing off */ + settings->userns_mode = USER_NAMESPACE_NO; + settings->uid_shift = UID_INVALID; + settings->uid_range = UINT32_C(0x10000); + } else if (r > 0) { + /* yes: User namespacing on, UID range is read from root dir */ + settings->userns_mode = USER_NAMESPACE_FIXED; + settings->uid_shift = UID_INVALID; + settings->uid_range = UINT32_C(0x10000); + } else if (streq(rvalue, "pick")) { + /* pick: User namespacing on, UID range is picked randomly */ + settings->userns_mode = USER_NAMESPACE_PICK; + settings->uid_shift = UID_INVALID; + settings->uid_range = UINT32_C(0x10000); + } else { + const char *range, *shift; + uid_t sh, rn; + + /* anything else: User namespacing on, UID range is explicitly configured */ + + range = strchr(rvalue, ':'); + if (range) { + shift = strndupa(rvalue, range - rvalue); + range++; + + r = safe_atou32(range, &rn); + if (r < 0 || rn <= 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "UID/GID range invalid, ignoring: %s", range); + return 0; + } + } else { + shift = rvalue; + rn = UINT32_C(0x10000); + } + + r = parse_uid(shift, &sh); + if (r < 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "UID/GID shift invalid, ignoring: %s", range); + return 0; + } + + settings->userns_mode = USER_NAMESPACE_FIXED; + settings->uid_shift = sh; + settings->uid_range = rn; + } + + return 0; +} diff --git a/src/nspawn/nspawn-settings.h b/src/nspawn/nspawn-settings.h index a017405cd98..e12e91b886d 100644 --- a/src/nspawn/nspawn-settings.h +++ b/src/nspawn/nspawn-settings.h @@ -33,6 +33,14 @@ typedef enum StartMode { _START_MODE_INVALID = -1 } StartMode; +typedef enum UserNamespaceMode { + USER_NAMESPACE_NO, + USER_NAMESPACE_FIXED, + USER_NAMESPACE_PICK, + _USER_NAMESPACE_MODE_MAX, + _USER_NAMESPACE_MODE_INVALID = -1, +} UserNamespaceMode; + typedef enum SettingsMask { SETTING_START_MODE = 1 << 0, SETTING_ENVIRONMENT = 1 << 1, @@ -47,7 +55,8 @@ typedef enum SettingsMask { SETTING_VOLATILE_MODE = 1 << 10, SETTING_CUSTOM_MOUNTS = 1 << 11, SETTING_WORKING_DIRECTORY = 1 << 12, - _SETTINGS_MASK_ALL = (1 << 13) -1 + SETTING_USERNS = 1 << 13, + _SETTINGS_MASK_ALL = (1 << 14) -1 } SettingsMask; typedef struct Settings { @@ -62,12 +71,15 @@ typedef struct Settings { unsigned long personality; sd_id128_t machine_id; char *working_directory; + UserNamespaceMode userns_mode; + uid_t uid_shift, uid_range; /* [Image] */ int read_only; VolatileMode volatile_mode; CustomMount *custom_mounts; unsigned n_custom_mounts; + int userns_chown; /* [Network] */ int private_network; @@ -99,3 +111,4 @@ int config_parse_tmpfs(const char *unit, const char *filename, unsigned line, co int config_parse_veth_extra(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); int config_parse_boot(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); int config_parse_pid2(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); +int config_parse_private_users(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index 3e32f59f757..40e3d5a3fe3 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -181,10 +181,9 @@ static char *arg_image = NULL; static VolatileMode arg_volatile_mode = VOLATILE_NO; static ExposePort *arg_expose_ports = NULL; static char **arg_property = NULL; -static bool arg_userns = false; -static bool arg_userns_chown = false; +static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO; static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U; -static bool arg_uid_shift_pick = false; +static bool arg_userns_chown = false; static int arg_kill_signal = 0; static bool arg_unified_cgroup_hierarchy = false; static SettingsMask arg_settings_mask = 0; @@ -284,7 +283,7 @@ static int custom_mounts_prepare(void) { for (i = 0; i < arg_n_custom_mounts; i++) { CustomMount *m = &arg_custom_mounts[i]; - if (path_equal(m->destination, "/") && arg_userns) { + if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) { if (arg_userns_chown) { log_error("--private-users-chown may not be combined with custom root mounts."); @@ -817,56 +816,67 @@ static int parse_argv(int argc, char *argv[]) { break; case ARG_PRIVATE_USERS: - if (optarg) { + + r = optarg ? parse_boolean(optarg) : 1; + if (r == 0) { + /* no: User namespacing off */ + arg_userns_mode = USER_NAMESPACE_NO; + arg_uid_shift = UID_INVALID; + arg_uid_range = UINT32_C(0x10000); + } else if (r > 0) { + /* yes: User namespacing on, UID range is read from root dir */ + arg_userns_mode = USER_NAMESPACE_FIXED; + arg_uid_shift = UID_INVALID; + arg_uid_range = UINT32_C(0x10000); + } else if (streq(optarg, "pick")) { + /* pick: User namespacing on, UID range is picked randomly */ + arg_userns_mode = USER_NAMESPACE_PICK; + arg_uid_shift = UID_INVALID; + arg_uid_range = UINT32_C(0x10000); + } else { _cleanup_free_ char *buffer = NULL; const char *range, *shift; - if (streq(optarg, "pick")) { - arg_uid_shift = UID_INVALID; - arg_uid_range = 0x10000U; - arg_uid_shift_pick = true; - } else { - range = strchr(optarg, ':'); - if (range) { - buffer = strndup(optarg, range - optarg); - if (!buffer) - return log_oom(); - shift = buffer; + /* anything else: User namespacing on, UID range is explicitly configured */ - range++; - if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) { - log_error("Failed to parse UID range: %s", range); - return -EINVAL; - } - } else - shift = optarg; + range = strchr(optarg, ':'); + if (range) { + buffer = strndup(optarg, range - optarg); + if (!buffer) + return log_oom(); + shift = buffer; - if (parse_uid(shift, &arg_uid_shift) < 0) { - log_error("Failed to parse UID: %s", optarg); + range++; + if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) { + log_error("Failed to parse UID range: %s", range); return -EINVAL; } + } else + shift = optarg; - arg_uid_shift_pick = false; + if (parse_uid(shift, &arg_uid_shift) < 0) { + log_error("Failed to parse UID: %s", optarg); + return -EINVAL; } - } else { - arg_uid_shift = UID_INVALID; - arg_uid_range = 0x10000U; - arg_uid_shift_pick = false; + + arg_userns_mode = USER_NAMESPACE_FIXED; } - arg_userns = true; + arg_settings_mask |= SETTING_USERNS; + break; + + case 'U': + arg_userns_mode = USER_NAMESPACE_PICK; + arg_uid_shift = UID_INVALID; + arg_uid_range = UINT32_C(0x10000); + + arg_settings_mask |= SETTING_USERNS; break; case ARG_PRIVATE_USERS_CHOWN: arg_userns_chown = true; - break; - case 'U': - arg_userns = true; - arg_userns_chown = true; - arg_uid_shift = UID_INVALID; - arg_uid_range = 0x10000U; - arg_uid_shift_pick = true; + arg_settings_mask |= SETTING_USERNS; break; case ARG_KILL_SIGNAL: @@ -937,7 +947,7 @@ static int parse_argv(int argc, char *argv[]) { if (arg_share_system) arg_register = false; - if (arg_uid_shift_pick) + if (arg_userns_mode == USER_NAMESPACE_PICK) arg_userns_chown = true; if (arg_start_mode != START_PID1 && arg_share_system) { @@ -980,7 +990,7 @@ static int parse_argv(int argc, char *argv[]) { return -EINVAL; } - if (arg_userns && access("/proc/self/uid_map", F_OK) < 0) { + if (arg_userns_mode != USER_NAMESPACE_NO && access("/proc/self/uid_map", F_OK) < 0) { log_error("--private-users= is not supported, kernel compiled without user namespace support."); return -EOPNOTSUPP; } @@ -1047,7 +1057,7 @@ static int verify_arguments(void) { static int userns_lchown(const char *p, uid_t uid, gid_t gid) { assert(p); - if (!arg_userns) + if (arg_userns_mode == USER_NAMESPACE_NO) return 0; if (uid == UID_INVALID && gid == GID_INVALID) @@ -2277,7 +2287,7 @@ static int recursive_chown(const char *directory, uid_t shift, uid_t range) { assert(directory); - if (!arg_userns || !arg_userns_chown) + if (arg_userns_mode == USER_NAMESPACE_NO || !arg_userns_chown) return 0; r = path_patch_uid(directory, arg_uid_shift, arg_uid_range); @@ -2512,7 +2522,7 @@ static int determine_names(void) { static int determine_uid_shift(const char *directory) { int r; - if (!arg_userns) { + if (arg_userns_mode == USER_NAMESPACE_NO) { arg_uid_shift = 0; return 0; } @@ -2575,7 +2585,7 @@ static int inner_child( cg_unified_flush(); - if (arg_userns) { + if (arg_userns_mode != USER_NAMESPACE_NO) { /* Tell the parent, that it now can write the UID map. */ (void) barrier_place(barrier); /* #1 */ @@ -2586,7 +2596,14 @@ static int inner_child( } } - r = mount_all(NULL, arg_userns, true, arg_uid_shift, arg_private_network, arg_uid_range, arg_selinux_apifs_context); + r = mount_all(NULL, + arg_userns_mode != USER_NAMESPACE_NO, + true, + arg_private_network, + arg_uid_shift, + arg_uid_range, + arg_selinux_apifs_context); + if (r < 0) return r; @@ -2825,7 +2842,7 @@ static int outer_child( if (r < 0) return r; - if (arg_userns) { + if (arg_userns_mode != USER_NAMESPACE_NO) { /* Let the parent know which UID shift we read from the image */ l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL); if (l < 0) @@ -2835,7 +2852,7 @@ static int outer_child( return -EIO; } - if (arg_uid_shift_pick) { + if (arg_userns_mode == USER_NAMESPACE_PICK) { /* When we are supposed to pick the UID shift, the parent will check now whether the UID shift * we just read from the image is available. If yes, it will send the UID shift back to us, if * not it will pick a different one, and send it back to us. */ @@ -2860,11 +2877,23 @@ static int outer_child( if (r < 0) return r; - r = setup_volatile(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context); + r = setup_volatile( + directory, + arg_volatile_mode, + arg_userns_mode != USER_NAMESPACE_NO, + arg_uid_shift, + arg_uid_range, + arg_selinux_context); if (r < 0) return r; - r = setup_volatile_state(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context); + r = setup_volatile_state( + directory, + arg_volatile_mode, + arg_userns_mode != USER_NAMESPACE_NO, + arg_uid_shift, + arg_uid_range, + arg_selinux_context); if (r < 0) return r; @@ -2878,7 +2907,13 @@ static int outer_child( return log_error_errno(r, "Failed to make tree read-only: %m"); } - r = mount_all(directory, arg_userns, false, arg_private_network, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context); + r = mount_all(directory, + arg_userns_mode != USER_NAMESPACE_NO, + false, + arg_private_network, + arg_uid_shift, + arg_uid_range, + arg_selinux_apifs_context); if (r < 0) return r; @@ -2920,11 +2955,24 @@ static int outer_child( if (r < 0) return r; - r = mount_custom(directory, arg_custom_mounts, arg_n_custom_mounts, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context); + r = mount_custom( + directory, + arg_custom_mounts, + arg_n_custom_mounts, + arg_userns_mode != USER_NAMESPACE_NO, + arg_uid_shift, + arg_uid_range, + arg_selinux_apifs_context); if (r < 0) return r; - r = mount_cgroups(directory, arg_unified_cgroup_hierarchy, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context); + r = mount_cgroups( + directory, + arg_unified_cgroup_hierarchy, + arg_userns_mode != USER_NAMESPACE_NO, + arg_uid_shift, + arg_uid_range, + arg_selinux_apifs_context); if (r < 0) return r; @@ -2935,7 +2983,7 @@ static int outer_child( pid = raw_clone(SIGCHLD|CLONE_NEWNS| (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) | (arg_private_network ? CLONE_NEWNET : 0) | - (arg_userns ? CLONE_NEWUSER : 0), + (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0), NULL); if (pid < 0) return log_error_errno(errno, "Failed to fork inner child: %m"); @@ -2986,7 +3034,7 @@ static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) { assert(shift); assert(ret_lock_file); - assert(arg_uid_shift_pick); + assert(arg_userns_mode == USER_NAMESPACE_PICK); assert(arg_uid_range == 0x10000U); candidate = *shift; @@ -3265,6 +3313,19 @@ static int load_settings(void) { } } + if ((arg_settings_mask & SETTING_USERNS) == 0 && + settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) { + + if (!arg_settings_trusted) + log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", p); + else { + arg_userns_mode = settings->userns_mode; + arg_uid_shift = settings->uid_shift; + arg_uid_range = settings->uid_range; + arg_userns_chown = settings->userns_chown; + } + } + return 0; } @@ -3525,7 +3586,7 @@ int main(int argc, char *argv[]) { int ifi = 0; ssize_t l; - if (arg_uid_shift_pick) { + if (arg_userns_mode == USER_NAMESPACE_PICK) { /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely * check with getpwuid() if the specific user already exists. Note that /etc might be * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we @@ -3566,7 +3627,7 @@ int main(int argc, char *argv[]) { goto finish; } - if (arg_userns) + if (arg_userns_mode != USER_NAMESPACE_NO) if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) { r = log_error_errno(errno, "Failed to create uid shift socket pair: %m"); goto finish; @@ -3642,7 +3703,7 @@ int main(int argc, char *argv[]) { uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]); uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]); - if (arg_userns) { + if (arg_userns_mode != USER_NAMESPACE_NO) { /* The child just let us know the UID shift it might have read from the image. */ l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0); if (l < 0) { @@ -3655,7 +3716,7 @@ int main(int argc, char *argv[]) { goto finish; } - if (arg_uid_shift_pick) { + if (arg_userns_mode == USER_NAMESPACE_PICK) { /* If we are supposed to pick the UID shift, let's try to use the shift read from the * image, but if that's already in use, pick a new one, and report back to the child, * which one we now picked. */ @@ -3715,7 +3776,7 @@ int main(int argc, char *argv[]) { log_debug("Init process invoked as PID " PID_FMT, pid); - if (arg_userns) { + if (arg_userns_mode != USER_NAMESPACE_NO) { if (!barrier_place_and_sync(&barrier)) { /* #1 */ log_error("Child died too early."); r = -ESRCH; From d2e5535f9d945db3ca20ed264e1fe1ce99707bba Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 22 Apr 2016 13:46:23 +0200 Subject: [PATCH 06/10] man: document the new user namespacing options --- man/systemd-nspawn.xml | 93 ++++++++++++++++++++++++++++++------------ man/systemd.nspawn.xml | 18 ++++++++ 2 files changed, 84 insertions(+), 27 deletions(-) diff --git a/man/systemd-nspawn.xml b/man/systemd-nspawn.xml index a0376ed3e00..ea0c6562f88 100644 --- a/man/systemd-nspawn.xml +++ b/man/systemd-nspawn.xml @@ -387,38 +387,77 @@ - Enables user namespacing. If enabled, the - container will run with its own private set of Unix user and - group ids (UIDs and GIDs). Takes none, one or two - colon-separated parameters: the first parameter specifies the - first host UID to assign to the container, the second - parameter specifies the number of host UIDs to assign to the - container. If the second parameter is omitted, 65536 UIDs are - assigned. If the first parameter is also omitted (and hence - no parameter passed at all), the first UID assigned to the - container is read from the owner of the root directory of the - container's directory tree. By default, no user namespacing is - applied. + Controls user namespacing. If enabled, the container will run with its own private set of UNIX + user and group ids (UIDs and GIDs). This involves mapping the private UIDs/GIDs used in the container (starting + with the container's root user 0 and up) to a range of UIDs/GIDs on the host that are not used for other + purposes (usually in the range beyond the host's UID/GID 65536). The parameter may be specified as follows: - Note that user namespacing currently requires OS trees - that are prepared for the UID shift that is being applied: - UIDs and GIDs used for file ownership or in file ACL entries - must be shifted to the container UID base that is - used during container runtime. + + The value no turns off user namespacing. This is the default. - It is recommended to assign at least 65536 UIDs to each - container, so that the usable UID range in the container - covers 16 bit. For best security, do not assign overlapping UID - ranges to multiple containers. It is hence a good idea to use - the upper 16 bit of the host 32-bit UIDs as container - identifier, while the lower 16 bit encode the container UID - used. + The value yes (or the omission of a parameter) turns on user + namespacing. The UID/GID range to use is determined automatically from the file ownership of the root + directory of the container's directory tree. To use this option, make sure to prepare the directory tree in + advance, and ensure that all files and directories in it are owned by UIDs/GIDs in the range you'd like to + use. Also, make sure that used file ACLs exclusively reference UIDs/GIDs in the appropriate range. If this + mode is used the number of UIDs/GIDs assigned to the container for use is 65536, and the UID/GID of the + root directory must be a multiple of 65536. - When user namespaces are used, the GID range assigned to - each container is always chosen identical to the UID - range. + The value "pick" turns on user namespacing. In this case the UID/GID range is automatically + chosen. As first step, the file owner of the root directory of the container's directory tree is read, and it + is checked that it is currently not used by the system otherwise (in particular, that no other container is + using it). If this check is successful, the UID/GID range determined this way is used, similar to the + behaviour if "yes" is specified. If the check is not successful (and thus the UID/GID range indicated in the + root directory's file owner is already used elsewhere) a new – currently unused – UID/GID range of 65536 + UIDs/GIDs is randomly chosen between the host UID/GIDs of 524288 and 1878982656, always starting at a + multiple of 65536. This setting implies (see below), which has the + effect that the files and directories in the container's directory tree will be owned by the appropriate + users of the range picked. Using this option makes user namespace behaviour fully automatic. Note that the + first invocation of a previously unused container image might result in picking a new UID/GID range for it, + and thus in the (possibly expensive) file ownership adjustment operation. However, subsequent invocations of + the container will be cheap (unless of course the picked UID/GID range is assigned to a different use by + then). + + Finally if one or two colon-separated numeric parameters are specified, user namespacing is + turned on, too. The first parameter specifies the first host UID/GID to assign to the container, the second + parameter specifies the number of host UIDs/GIDs to assign to the container. If the second parameter is + omitted, 65536 UIDs/GIDs are assigned. + + + It is recommended to assign at least 65536 UIDs/GIDs to each container, so that the usable UID/GID range in the + container covers 16 bit. For best security, do not assign overlapping UID/GID ranges to multiple containers. It is + hence a good idea to use the upper 16 bit of the host 32-bit UIDs/GIDs as container identifier, while the lower 16 + bit encode the container UID/GID used. This is in fact the behaviour enforced by the + option. + + When user namespaces are used, the GID range assigned to each container is always chosen identical to the + UID range. + + In most cases, using is the recommended option as it enhances + container security massively and operates fully automatically in most cases. + + Note that the picked UID/GID range is not written to /etc/passwd or + /etc/group. In fact, the allocation of the range is not stored persistently anywhere, + except in the file ownership of the files and directories of the container. + + + + Equivalent to . + + + + + + If specified, all files and directories in the container's directory tree will adjusted so that + they are owned to the appropriate UIDs/GIDs selected for the container (see above). This operation is + potentially expensive, as it involves descending and iterating through the full directory tree of the + container. Besides actual file ownership, file ACLs are adjusted as well. + + This option is implied if is used. This option has no effect if + user namespacing is not used. + diff --git a/man/systemd.nspawn.xml b/man/systemd.nspawn.xml index ce900a5db12..15360078ef2 100644 --- a/man/systemd.nspawn.xml +++ b/man/systemd.nspawn.xml @@ -251,6 +251,14 @@ command line switch. This option is privileged (see above). + + + PrivateUsers= + + Configures support for usernamespacing. This is equivalent to the + command line switch, and takes the same options. This option is privileged + (see above). + @@ -314,6 +322,16 @@ for details about the specific options supported. This setting is privileged (see above). + + + PrivateUsersChown= + + Configures whether the ownership of the files and directories in the container tree shall be + adjusted to the UID/GID range used, if necessary and user namespacing is enabled. This is equivalent to the + command line switch. This option is privileged (see + above). + + From ccabee0d6465f06b9d339cf449fd8eea0db13373 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 22 Apr 2016 14:10:09 +0200 Subject: [PATCH 07/10] nspawn: make -U a tiny bit smarter With this change -U will turn on user namespacing only if the kernel actually supports it and otherwise gracefully degrade to non-userns mode. --- man/systemd-nspawn.xml | 4 +++- src/basic/user-util.h | 5 +++++ src/nspawn/nspawn.c | 13 ++++++++----- 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/man/systemd-nspawn.xml b/man/systemd-nspawn.xml index ea0c6562f88..bd688a0ee19 100644 --- a/man/systemd-nspawn.xml +++ b/man/systemd-nspawn.xml @@ -444,7 +444,9 @@ - Equivalent to . + If the kernel supports the user namespaces feature, equivalent to + , otherwise equivalent to + . diff --git a/src/basic/user-util.h b/src/basic/user-util.h index c23f1d485d5..8026eca3f47 100644 --- a/src/basic/user-util.h +++ b/src/basic/user-util.h @@ -21,6 +21,7 @@ #include #include +#include bool uid_is_valid(uid_t uid); @@ -63,3 +64,7 @@ int take_etc_passwd_lock(const char *root); #define PTR_TO_GID(p) ((gid_t) (((uintptr_t) (p))-1)) #define GID_TO_PTR(u) ((void*) (((uintptr_t) (u))+1)) + +static inline bool userns_supported(void) { + return access("/proc/self/uid_map", F_OK) >= 0; +} diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index 40e3d5a3fe3..c8a7ec71a32 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -866,11 +866,14 @@ static int parse_argv(int argc, char *argv[]) { break; case 'U': - arg_userns_mode = USER_NAMESPACE_PICK; - arg_uid_shift = UID_INVALID; - arg_uid_range = UINT32_C(0x10000); + if (userns_supported()) { + arg_userns_mode = USER_NAMESPACE_PICK; + arg_uid_shift = UID_INVALID; + arg_uid_range = UINT32_C(0x10000); + + arg_settings_mask |= SETTING_USERNS; + } - arg_settings_mask |= SETTING_USERNS; break; case ARG_PRIVATE_USERS_CHOWN: @@ -990,7 +993,7 @@ static int parse_argv(int argc, char *argv[]) { return -EINVAL; } - if (arg_userns_mode != USER_NAMESPACE_NO && access("/proc/self/uid_map", F_OK) < 0) { + if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported()) { log_error("--private-users= is not supported, kernel compiled without user namespace support."); return -EOPNOTSUPP; } From af88764ff8fb64ff0762b0efd8856439c3e77988 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 22 Apr 2016 14:12:27 +0200 Subject: [PATCH 08/10] units: turn on user namespace by default in systemd-nspawn@.service Now that user namespacing is supported in a pretty automatic way, actually turn it on by default if the systemd-nspawn@.service template is used. --- units/systemd-nspawn@.service.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/units/systemd-nspawn@.service.in b/units/systemd-nspawn@.service.in index 1927c4d4853..ea28941507d 100644 --- a/units/systemd-nspawn@.service.in +++ b/units/systemd-nspawn@.service.in @@ -13,7 +13,7 @@ Before=machines.target After=network.target [Service] -ExecStart=@bindir@/systemd-nspawn --quiet --keep-unit --boot --link-journal=try-guest --network-veth --settings=override --machine=%i +ExecStart=@bindir@/systemd-nspawn --quiet --keep-unit --boot --link-journal=try-guest --network-veth -U --settings=override --machine=%i KillMode=mixed Type=notify RestartForceExitStatus=133 From 88cd066e11aef5dd73b563c1753ad8bf4dfd9f62 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 22 Apr 2016 18:10:16 +0200 Subject: [PATCH 09/10] nspawn: don't try to patch UIDs/GIDs of procfs and suchlike --- src/basic/missing.h | 4 ++++ src/nspawn/nspawn-patch-uid.c | 44 +++++++++++++++++++++++++++++++++-- 2 files changed, 46 insertions(+), 2 deletions(-) diff --git a/src/basic/missing.h b/src/basic/missing.h index 6616f0b7205..b389e94cf71 100644 --- a/src/basic/missing.h +++ b/src/basic/missing.h @@ -445,6 +445,10 @@ struct btrfs_ioctl_quota_ctl_args { #define TMPFS_MAGIC 0x01021994 #endif +#ifndef MQUEUE_MAGIC +#define MQUEUE_MAGIC 0x19800202 +#endif + #ifndef MS_MOVE #define MS_MOVE 8192 #endif diff --git a/src/nspawn/nspawn-patch-uid.c b/src/nspawn/nspawn-patch-uid.c index f53164fbbf6..429c45a3a77 100644 --- a/src/nspawn/nspawn-patch-uid.c +++ b/src/nspawn/nspawn-patch-uid.c @@ -18,16 +18,20 @@ ***/ #include +#include #ifdef HAVE_ACL #include #endif #include +#include #include #include "acl-util.h" #include "dirent-util.h" #include "fd-util.h" +#include "missing.h" #include "nspawn-patch-uid.h" +#include "stat-util.h" #include "stdio-util.h" #include "string-util.h" #include "strv.h" @@ -276,12 +280,46 @@ static int patch_fd(int fd, const char *name, const struct stat *st, uid_t shift return r > 0 || changed; } +static int is_procfs_sysfs_or_suchlike(int fd) { + struct statfs sfs; + + assert(fd >= 0); + + if (fstatfs(fd, &sfs) < 0) + return -errno; + + return F_TYPE_EQUAL(sfs.f_type, BINFMTFS_MAGIC) || + F_TYPE_EQUAL(sfs.f_type, CGROUP_SUPER_MAGIC) || + F_TYPE_EQUAL(sfs.f_type, CGROUP2_SUPER_MAGIC) || + F_TYPE_EQUAL(sfs.f_type, DEBUGFS_MAGIC) || + F_TYPE_EQUAL(sfs.f_type, DEVPTS_SUPER_MAGIC) || + F_TYPE_EQUAL(sfs.f_type, EFIVARFS_MAGIC) || + F_TYPE_EQUAL(sfs.f_type, HUGETLBFS_MAGIC) || + F_TYPE_EQUAL(sfs.f_type, MQUEUE_MAGIC) || + F_TYPE_EQUAL(sfs.f_type, PROC_SUPER_MAGIC) || + F_TYPE_EQUAL(sfs.f_type, PSTOREFS_MAGIC) || + F_TYPE_EQUAL(sfs.f_type, SELINUX_MAGIC) || + F_TYPE_EQUAL(sfs.f_type, SMACK_MAGIC) || + F_TYPE_EQUAL(sfs.f_type, SYSFS_MAGIC); +} + static int recurse_fd(int fd, bool donate_fd, const struct stat *st, uid_t shift) { bool changed = false; int r; assert(fd >= 0); + /* We generally want to permit crossing of mount boundaries when patching the UIDs/GIDs. However, we + * probably shouldn't do this for /proc and /sys if that is already mounted into place. Hence, let's + * stop the recursion when we hit a procfs or sysfs file system. */ + r = is_procfs_sysfs_or_suchlike(fd); + if (r < 0) + goto finish; + if (r > 0) { + r = 0; /* don't recurse */ + goto finish; + } + r = patch_fd(fd, NULL, st, shift); if (r < 0) goto finish; @@ -294,8 +332,10 @@ static int recurse_fd(int fd, bool donate_fd, const struct stat *st, uid_t shift int copy; copy = fcntl(fd, F_DUPFD_CLOEXEC, 3); - if (copy < 0) - return -errno; + if (copy < 0) { + r = -errno; + goto finish; + } fd = copy; donate_fd = true; From 4aeb20f5aaec25ef969989b64d37377913b2a1ef Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Mon, 25 Apr 2016 12:48:05 +0200 Subject: [PATCH 10/10] nspawn: when readjusting UID/GID ownership of OS trees, skip read-only subtrees This should allow tools like rkt to pre-mount read-only subtrees in the OS tree, without breaking the patching code. Note that the code will still fail, if the top-level directory is already read-only. --- src/basic/fd-util.c | 10 ++++++++++ src/basic/fd-util.h | 2 ++ src/nspawn/nspawn-patch-uid.c | 18 +++++++++++++++--- 3 files changed, 27 insertions(+), 3 deletions(-) diff --git a/src/basic/fd-util.c b/src/basic/fd-util.c index ec9560cd077..3d46d708c7b 100644 --- a/src/basic/fd-util.c +++ b/src/basic/fd-util.c @@ -25,11 +25,13 @@ #include #include "fd-util.h" +#include "fs-util.h" #include "macro.h" #include "missing.h" #include "parse-util.h" #include "path-util.h" #include "socket-util.h" +#include "stdio-util.h" #include "util.h" int close_nointr(int fd) { @@ -356,3 +358,11 @@ bool fdname_is_valid(const char *s) { return p - s < 256; } + +int fd_get_path(int fd, char **ret) { + char procfs_path[strlen("/proc/self/fd/") + DECIMAL_STR_MAX(int)]; + + xsprintf(procfs_path, "/proc/self/fd/%i", fd); + + return readlink_malloc(procfs_path, ret); +} diff --git a/src/basic/fd-util.h b/src/basic/fd-util.h index 44528c6e35f..b86e41698ac 100644 --- a/src/basic/fd-util.h +++ b/src/basic/fd-util.h @@ -72,6 +72,8 @@ void cmsg_close_all(struct msghdr *mh); bool fdname_is_valid(const char *s); +int fd_get_path(int fd, char **ret); + /* Hint: ENETUNREACH happens if we try to connect to "non-existing" special IP addresses, such as ::5 */ #define ERRNO_IS_DISCONNECT(r) \ IN_SET(r, ENOTCONN, ECONNRESET, ECONNREFUSED, ECONNABORTED, EPIPE, ENETUNREACH) diff --git a/src/nspawn/nspawn-patch-uid.c b/src/nspawn/nspawn-patch-uid.c index 429c45a3a77..c7382d412d4 100644 --- a/src/nspawn/nspawn-patch-uid.c +++ b/src/nspawn/nspawn-patch-uid.c @@ -303,7 +303,7 @@ static int is_procfs_sysfs_or_suchlike(int fd) { F_TYPE_EQUAL(sfs.f_type, SYSFS_MAGIC); } -static int recurse_fd(int fd, bool donate_fd, const struct stat *st, uid_t shift) { +static int recurse_fd(int fd, bool donate_fd, const struct stat *st, uid_t shift, bool is_toplevel) { bool changed = false; int r; @@ -321,6 +321,18 @@ static int recurse_fd(int fd, bool donate_fd, const struct stat *st, uid_t shift } r = patch_fd(fd, NULL, st, shift); + if (r == -EROFS) { + _cleanup_free_ char *name = NULL; + + if (!is_toplevel) { + /* When we hit a ready-only subtree we simply skip it, but log about it. */ + (void) fd_get_path(fd, &name); + log_debug("Skippping read-only file or directory %s.", strna(name)); + r = 0; + } + + goto finish; + } if (r < 0) goto finish; @@ -369,7 +381,7 @@ static int recurse_fd(int fd, bool donate_fd, const struct stat *st, uid_t shift } - r = recurse_fd(subdir_fd, true, &fst, shift); + r = recurse_fd(subdir_fd, true, &fst, shift, false); if (r < 0) goto finish; if (r > 0) @@ -433,7 +445,7 @@ static int fd_patch_uid_internal(int fd, bool donate_fd, uid_t shift, uid_t rang if (((uint32_t) (st.st_uid ^ shift) >> 16) == 0) return 0; - return recurse_fd(fd, donate_fd, &st, shift); + return recurse_fd(fd, donate_fd, &st, shift, true); finish: if (donate_fd)