1
0
mirror of https://github.com/systemd/systemd.git synced 2024-11-01 17:51:22 +03:00

nspawn: make recursive chown()ing logic safe for being aborted in the middle

We currently use the ownership of the top-level directory as a hint
whether we need to descend into the whole tree to chown() it recursively
or not. This is problematic with the previous chown()ing algorithm, as
when descending into the tree we'd first chown() and then descend
further down, which meant that the top-level directory would be chowned
first, and an aborted recursive chowning would appear on the next
invocation as successful, even though it was not. Let's reshuffle things
a bit, to make the re-chown()ing safe regarding interruptions:

a) We chown() the dir we are looking at last, and descend into all its
   children first. That way we know that if the top-level dir is
   properly owned everything inside of it is properly owned too.

b) Before starting a chown()ing operation, we mark the top-level
   directory as owned by a special "busy" UID range, which we can use to
   recognize whether a tree was fully chowned: if it is marked as busy,
   it's definitely not fully chowned, as the busy ownership will only be
   fixed as final step of the chowning.

Fixes: #6292
This commit is contained in:
Lennart Poettering 2017-11-16 19:09:32 +01:00
parent 14f8ccc755
commit 3603efdea5
4 changed files with 121 additions and 63 deletions

View File

@ -1,25 +1,26 @@
systemd_nspawn_sources = files('''
nspawn.c
nspawn-settings.c
nspawn-settings.h
nspawn-cgroup.c
nspawn-cgroup.h
nspawn-def.h
nspawn-expose-ports.c
nspawn-expose-ports.h
nspawn-mount.c
nspawn-mount.h
nspawn-network.c
nspawn-network.h
nspawn-expose-ports.c
nspawn-expose-ports.h
nspawn-cgroup.c
nspawn-cgroup.h
nspawn-seccomp.c
nspawn-seccomp.h
nspawn-patch-uid.c
nspawn-patch-uid.h
nspawn-register.c
nspawn-register.h
nspawn-seccomp.c
nspawn-seccomp.h
nspawn-settings.c
nspawn-settings.h
nspawn-setuid.c
nspawn-setuid.h
nspawn-stub-pid1.c
nspawn-stub-pid1.h
nspawn-patch-uid.c
nspawn-patch-uid.h
nspawn.c
'''.split())
nspawn_gperf_c = custom_target(

33
src/nspawn/nspawn-def.h Normal file
View File

@ -0,0 +1,33 @@
#pragma once
/***
This file is part of systemd.
Copyright 2017 Lennart Poettering
systemd is free software; you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 2.1 of the License, or
(at your option) any later version.
systemd is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/
#include <sys/types.h>
/* Note that devpts's gid= parameter parses GIDs as signed values, hence we stay away from the upper half of the 32bit
* UID range here. We leave a bit of room at the lower end and a lot of room at the upper end, so that other subsystems
* may have their own allocation ranges too. */
#define UID_SHIFT_PICK_MIN ((uid_t) UINT32_C(0x00080000))
#define UID_SHIFT_PICK_MAX ((uid_t) UINT32_C(0x6FFF0000))
/* While we are chown()ing a directory tree, we set the top-level UID base to this "busy" base, so that we can always
* recognize trees we were chown()ing recursively and got interrupted in. UID_BUSY_MASK selects the upper 16 bits of
* a UID/GID (the "base"); an ID belongs to the busy range if (id & UID_BUSY_MASK) == UID_BUSY_BASE. Note that the
* busy base lies outside of the UID_SHIFT_PICK_MIN…UID_SHIFT_PICK_MAX range above. */
#define UID_BUSY_BASE ((uid_t) UINT32_C(0xFFFE0000))
#define UID_BUSY_MASK ((uid_t) UINT32_C(0xFFFF0000))

View File

@ -23,13 +23,16 @@
#include <sys/acl.h>
#endif
#include <sys/stat.h>
#include <sys/statvfs.h>
#include <sys/vfs.h>
#include <unistd.h>
#include "acl-util.h"
#include "dirent-util.h"
#include "fd-util.h"
#include "fs-util.h"
#include "missing.h"
#include "nspawn-def.h"
#include "nspawn-patch-uid.h"
#include "stat-util.h"
#include "stdio-util.h"
@ -289,42 +292,44 @@ static int patch_fd(int fd, const char *name, const struct stat *st, uid_t shift
* user namespaces, however their inodes may relate to host resources or only
* valid in the global user namespace, therefore no patching should be applied.
*/
static int is_fs_fully_userns_compatible(int fd) {
static int is_fs_fully_userns_compatible(const struct statfs *sfs) {
assert(sfs);
return F_TYPE_EQUAL(sfs->f_type, BINFMTFS_MAGIC) ||
F_TYPE_EQUAL(sfs->f_type, CGROUP_SUPER_MAGIC) ||
F_TYPE_EQUAL(sfs->f_type, CGROUP2_SUPER_MAGIC) ||
F_TYPE_EQUAL(sfs->f_type, DEBUGFS_MAGIC) ||
F_TYPE_EQUAL(sfs->f_type, DEVPTS_SUPER_MAGIC) ||
F_TYPE_EQUAL(sfs->f_type, EFIVARFS_MAGIC) ||
F_TYPE_EQUAL(sfs->f_type, HUGETLBFS_MAGIC) ||
F_TYPE_EQUAL(sfs->f_type, MQUEUE_MAGIC) ||
F_TYPE_EQUAL(sfs->f_type, PROC_SUPER_MAGIC) ||
F_TYPE_EQUAL(sfs->f_type, PSTOREFS_MAGIC) ||
F_TYPE_EQUAL(sfs->f_type, SELINUX_MAGIC) ||
F_TYPE_EQUAL(sfs->f_type, SMACK_MAGIC) ||
F_TYPE_EQUAL(sfs->f_type, SECURITYFS_MAGIC) ||
F_TYPE_EQUAL(sfs->f_type, BPF_FS_MAGIC) ||
F_TYPE_EQUAL(sfs->f_type, TRACEFS_MAGIC) ||
F_TYPE_EQUAL(sfs->f_type, SYSFS_MAGIC);
}
static int recurse_fd(int fd, bool donate_fd, const struct stat *st, uid_t shift, bool is_toplevel) {
_cleanup_closedir_ DIR *d = NULL;
bool changed = false;
struct statfs sfs;
int r;
assert(fd >= 0);
if (fstatfs(fd, &sfs) < 0)
return -errno;
return F_TYPE_EQUAL(sfs.f_type, BINFMTFS_MAGIC) ||
F_TYPE_EQUAL(sfs.f_type, CGROUP_SUPER_MAGIC) ||
F_TYPE_EQUAL(sfs.f_type, CGROUP2_SUPER_MAGIC) ||
F_TYPE_EQUAL(sfs.f_type, DEBUGFS_MAGIC) ||
F_TYPE_EQUAL(sfs.f_type, DEVPTS_SUPER_MAGIC) ||
F_TYPE_EQUAL(sfs.f_type, EFIVARFS_MAGIC) ||
F_TYPE_EQUAL(sfs.f_type, HUGETLBFS_MAGIC) ||
F_TYPE_EQUAL(sfs.f_type, MQUEUE_MAGIC) ||
F_TYPE_EQUAL(sfs.f_type, PROC_SUPER_MAGIC) ||
F_TYPE_EQUAL(sfs.f_type, PSTOREFS_MAGIC) ||
F_TYPE_EQUAL(sfs.f_type, SELINUX_MAGIC) ||
F_TYPE_EQUAL(sfs.f_type, SMACK_MAGIC) ||
F_TYPE_EQUAL(sfs.f_type, SECURITYFS_MAGIC) ||
F_TYPE_EQUAL(sfs.f_type, BPF_FS_MAGIC) ||
F_TYPE_EQUAL(sfs.f_type, TRACEFS_MAGIC) ||
F_TYPE_EQUAL(sfs.f_type, SYSFS_MAGIC);
}
/* We generally want to permit crossing of mount boundaries when patching the UIDs/GIDs. However, we probably
* shouldn't do this for /proc and /sys if that is already mounted into place. Hence, let's stop the recursion
* when we hit procfs, sysfs or some other special file systems. */
static int recurse_fd(int fd, bool donate_fd, const struct stat *st, uid_t shift, bool is_toplevel) {
bool changed = false;
int r;
assert(fd >= 0);
/* We generally want to permit crossing of mount boundaries when patching the UIDs/GIDs. However, we
* probably shouldn't do this for /proc and /sys if that is already mounted into place. Hence, let's
* stop the recursion when we hit procfs, sysfs or some other special file systems. */
r = is_fs_fully_userns_compatible(fd);
r = is_fs_fully_userns_compatible(&sfs);
if (r < 0)
goto finish;
if (r > 0) {
@ -332,26 +337,12 @@ static int recurse_fd(int fd, bool donate_fd, const struct stat *st, uid_t shift
goto finish;
}
r = patch_fd(fd, NULL, st, shift);
if (r == -EROFS) {
_cleanup_free_ char *name = NULL;
if (!is_toplevel) {
/* When we hit a ready-only subtree we simply skip it, but log about it. */
(void) fd_get_path(fd, &name);
log_debug("Skippping read-only file or directory %s.", strna(name));
r = 0;
}
goto finish;
}
if (r < 0)
goto finish;
if (r > 0)
changed = true;
/* Also, if we hit a read-only file system, then don't bother, skip the whole subtree */
if ((sfs.f_flags & ST_RDONLY) ||
access_fd(fd, W_OK) == -EROFS)
goto read_only;
if (S_ISDIR(st->st_mode)) {
_cleanup_closedir_ DIR *d = NULL;
struct dirent *de;
if (!donate_fd) {
@ -411,7 +402,27 @@ static int recurse_fd(int fd, bool donate_fd, const struct stat *st, uid_t shift
}
}
/* After we descended, also patch the directory itself. It's key to do this in this order so that the top-level
* directory is patched as very last object in the tree, so that we can use it as quick indicator whether the
* tree is properly chown()ed already. */
r = patch_fd(d ? dirfd(d) : fd, NULL, st, shift);
if (r == -EROFS)
goto read_only;
if (r > 0)
changed = true;
r = changed;
goto finish;
read_only:
if (!is_toplevel) {
_cleanup_free_ char *name = NULL;
/* When we hit a read-only subtree we simply skip it, but log about it. */
(void) fd_get_path(fd, &name);
log_debug("Skippping read-only file or directory %s.", strna(name));
r = changed;
}
finish:
if (donate_fd)
@ -437,6 +448,11 @@ static int fd_patch_uid_internal(int fd, bool donate_fd, uid_t shift, uid_t rang
goto finish;
}
if (shift == UID_BUSY_BASE) {
r = -EINVAL;
goto finish;
}
if (range != 0x10000) {
/* We only support containers with 16bit UID ranges for the patching logic */
r = -EOPNOTSUPP;
@ -459,6 +475,19 @@ static int fd_patch_uid_internal(int fd, bool donate_fd, uid_t shift, uid_t rang
if (((uint32_t) (st.st_uid ^ shift) >> 16) == 0)
return 0;
/* Before we start recursively chowning, mark the top-level dir as "busy" by chowning it to the "busy"
* range. Should we be interrupted in the middle of our work, we'll see it owned by this user and will start
* chown()ing it again, unconditionally, as the busy UID is not a valid UID we'd ever pick for ourselves. */
if ((st.st_uid & UID_BUSY_MASK) != UID_BUSY_BASE) {
if (fchown(fd,
UID_BUSY_BASE | (st.st_uid & ~UID_BUSY_MASK),
(gid_t) UID_BUSY_BASE | (st.st_gid & ~(gid_t) UID_BUSY_MASK)) < 0) {
r = -errno;
goto finish;
}
}
return recurse_fd(fd, donate_fd, &st, shift, true);
finish:

View File

@ -77,6 +77,7 @@
#include "mount-util.h"
#include "netlink-util.h"
#include "nspawn-cgroup.h"
#include "nspawn-def.h"
#include "nspawn-expose-ports.h"
#include "nspawn-mount.h"
#include "nspawn-network.h"
@ -106,12 +107,6 @@
#include "user-util.h"
#include "util.h"
/* Note that devpts's gid= parameter parses GIDs as signed values, hence we stay away from the upper half of the 32bit
* UID range here. We leave a bit of room at the lower end and a lot of room at the upper end, so that other subsystems
* may have their own allocation ranges too. */
#define UID_SHIFT_PICK_MIN ((uid_t) UINT32_C(0x00080000))
#define UID_SHIFT_PICK_MAX ((uid_t) UINT32_C(0x6FFF0000))
/* nspawn is listening on the socket at the path in the constant nspawn_notify_socket_path
* nspawn_notify_socket_path is relative to the container
* the init process in the container pid can send messages to nspawn following the sd_notify(3) protocol */