From 994a6364d2dfcf5fa11ec26e81752fbe842428aa Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Thu, 16 Nov 2017 18:05:42 +0100
Subject: [PATCH 1/4] man: document how nspawn's --bind= and --private-users
 interact

Fixes: #5900
---
 man/systemd-nspawn.xml | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/man/systemd-nspawn.xml b/man/systemd-nspawn.xml
index 98ce1529de..1ef6567e48 100644
--- a/man/systemd-nspawn.xml
+++ b/man/systemd-nspawn.xml
@@ -806,7 +806,13 @@
         <option>norbind</option> are allowed, controlling whether to create a recursive or a regular bind
         mount. Defaults to "rbind". Backslash escapes are interpreted, so <literal>\:</literal> may be used to embed
         colons in either path.  This option may be specified multiple times for creating multiple independent bind
-        mount points. The <option>--bind-ro=</option> option creates read-only bind mounts.</para></listitem>
+        mount points. The <option>--bind-ro=</option> option creates read-only bind mounts.</para>
+
+        <para>Note that when this option is used in combination with <option>--private-users</option>, the resulting
+        mount points will be owned by the <constant>nobody</constant> user. That's because the mount and its files and
+        directories continue to be owned by the relevant host users and groups, which do not exist in the container,
+        and thus show up under the wildcard UID 65534 (nobody). If such bind mounts are created, it is recommended to
+        make them read-only, using <option>--bind-ro=</option>.</para></listitem>
       </varlistentry>
 
       <varlistentry>

From 57a4359ee0bf65c16336e8157f0f9d2b084ddac3 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Thu, 16 Nov 2017 18:56:25 +0100
Subject: [PATCH 2/4] fs-util: add access_fd() which is like access() but for
 fds

Linux doesn't have faccess(), hence let's emulate it. Linux has access()
and faccessat() but neither allows checking the access rights of an fd
passed in directly.
---
 src/basic/fs-util.c     | 16 +++++++++++++++-
 src/basic/fs-util.h     |  2 ++
 src/test/test-fs-util.c | 27 +++++++++++++++++++++++++++
 3 files changed, 44 insertions(+), 1 deletion(-)

diff --git a/src/basic/fs-util.c b/src/basic/fs-util.c
index 946f779a32..ff8b4e8747 100644
--- a/src/basic/fs-util.c
+++ b/src/basic/fs-util.c
@@ -104,7 +104,6 @@ int rmdir_parents(const char *path, const char *stop) {
         return 0;
 }
 
-
 int rename_noreplace(int olddirfd, const char *oldpath, int newdirfd, const char *newpath) {
         struct stat buf;
         int ret;
@@ -809,3 +808,18 @@ int chase_symlinks(const char *path, const char *original_root, unsigned flags,
 
         return exists;
 }
+
+int access_fd(int fd, int mode) {
+        char p[strlen("/proc/self/fd/") + DECIMAL_STR_MAX(fd) + 1];
+        int r;
+
+        /* Like access() but operates on an already open fd */
+
+        xsprintf(p, "/proc/self/fd/%i", fd);
+
+        r = access(p, mode);
+        if (r < 0)
+                r = -errno;
+
+        return r;
+}
diff --git a/src/basic/fs-util.h b/src/basic/fs-util.h
index d3342d5cda..9849522f5a 100644
--- a/src/basic/fs-util.h
+++ b/src/basic/fs-util.h
@@ -98,3 +98,5 @@ static inline void unlink_and_free(char *p) {
         free(p);
 }
 DEFINE_TRIVIAL_CLEANUP_FUNC(char*, unlink_and_free);
+
+int access_fd(int fd, int mode);
diff --git a/src/test/test-fs-util.c b/src/test/test-fs-util.c
index a5871a25b0..e8fe2b3ff1 100644
--- a/src/test/test-fs-util.c
+++ b/src/test/test-fs-util.c
@@ -315,6 +315,32 @@ static void test_dot_or_dot_dot(void) {
         assert_se(!dot_or_dot_dot("..foo"));
 }
 
+static void test_access_fd(void) {
+        _cleanup_(rmdir_and_freep) char *p = NULL;
+        _cleanup_close_ int fd = -1;
+
+        assert_se(mkdtemp_malloc("/tmp/access-fd.XXXXXX", &p) >= 0);
+
+        fd = open(p, O_RDONLY|O_DIRECTORY|O_CLOEXEC);
+        assert_se(fd >= 0);
+
+        assert_se(access_fd(fd, R_OK) >= 0);
+        assert_se(access_fd(fd, F_OK) >= 0);
+        assert_se(access_fd(fd, W_OK) >= 0);
+
+        assert_se(fchmod(fd, 0000) >= 0);
+
+        assert_se(access_fd(fd, F_OK) >= 0);
+
+        if (geteuid() == 0) {
+                assert_se(access_fd(fd, R_OK) >= 0);
+                assert_se(access_fd(fd, W_OK) >= 0);
+        } else {
+                assert_se(access_fd(fd, R_OK) == -EACCES);
+                assert_se(access_fd(fd, W_OK) == -EACCES);
+        }
+}
+
 int main(int argc, char *argv[]) {
         test_unlink_noerrno();
         test_get_files_in_directory();
@@ -322,6 +348,7 @@ int main(int argc, char *argv[]) {
         test_var_tmp();
         test_chase_symlinks();
         test_dot_or_dot_dot();
+        test_access_fd();
 
         return 0;
 }

From 14f8ccc755e396bbcc7500f2f6157c4ba28305ef Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Thu, 16 Nov 2017 18:58:18 +0100
Subject: [PATCH 3/4] nspawn: add missing #pragma once to header file

---
 src/nspawn/nspawn-patch-uid.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/nspawn/nspawn-patch-uid.h b/src/nspawn/nspawn-patch-uid.h
index 55d0990016..ff6a7b7647 100644
--- a/src/nspawn/nspawn-patch-uid.h
+++ b/src/nspawn/nspawn-patch-uid.h
@@ -1,3 +1,5 @@
+#pragma once
+
 /***
   This file is part of systemd.
 

From 3603efdea5f87e28604c4cdb02d298e392b7e3a5 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Thu, 16 Nov 2017 19:09:32 +0100
Subject: [PATCH 4/4] nspawn: make recursive chown()ing logic safe for being
 aborted in the middle

We currently use the ownership of the top-level directory as a hint
whether we need to descent into the whole tree to chown() it recursively
or not. This is problematic with the previous chown()ing algorithm, as
when descending into the tree we'd first chown() and then descend
further down, which meant that the top-level directory would be chowned
first, and an aborted recursive chowning would appear on the next
invocation as successful, even though it was not. Let's reshuffle things
a bit, to make the re-chown()ing safe regarding interruptions:

a) We chown() the dir we are looking at last, and descent into all its
   children first. That way we know that if the top-level dir is
   properly owned everything inside of it is properly owned too.

b) Before starting a chown()ing operation, we mark the top-level
   directory as owned by a special "busy" UID range, which we can use to
   recognize whether a tree was fully chowned: if it is marked as busy,
   it's definitely not fully chowned, as the busy ownership will only be
   fixed as final step of the chowning.

Fixes: #6292
---
 src/nspawn/meson.build        |  23 +++----
 src/nspawn/nspawn-def.h       |  33 ++++++++++
 src/nspawn/nspawn-patch-uid.c | 121 +++++++++++++++++++++-------------
 src/nspawn/nspawn.c           |   7 +-
 4 files changed, 121 insertions(+), 63 deletions(-)
 create mode 100644 src/nspawn/nspawn-def.h

diff --git a/src/nspawn/meson.build b/src/nspawn/meson.build
index b6ac6006ab..7f54ebc6db 100644
--- a/src/nspawn/meson.build
+++ b/src/nspawn/meson.build
@@ -1,25 +1,26 @@
 systemd_nspawn_sources = files('''
-        nspawn.c
-        nspawn-settings.c
-        nspawn-settings.h
+        nspawn-cgroup.c
+        nspawn-cgroup.h
+        nspawn-def.h
+        nspawn-expose-ports.c
+        nspawn-expose-ports.h
         nspawn-mount.c
         nspawn-mount.h
         nspawn-network.c
         nspawn-network.h
-        nspawn-expose-ports.c
-        nspawn-expose-ports.h
-        nspawn-cgroup.c
-        nspawn-cgroup.h
-        nspawn-seccomp.c
-        nspawn-seccomp.h
+        nspawn-patch-uid.c
+        nspawn-patch-uid.h
         nspawn-register.c
         nspawn-register.h
+        nspawn-seccomp.c
+        nspawn-seccomp.h
+        nspawn-settings.c
+        nspawn-settings.h
         nspawn-setuid.c
         nspawn-setuid.h
         nspawn-stub-pid1.c
         nspawn-stub-pid1.h
-        nspawn-patch-uid.c
-        nspawn-patch-uid.h
+        nspawn.c
 '''.split())
 
 nspawn_gperf_c = custom_target(
diff --git a/src/nspawn/nspawn-def.h b/src/nspawn/nspawn-def.h
new file mode 100644
index 0000000000..fc3c94064c
--- /dev/null
+++ b/src/nspawn/nspawn-def.h
@@ -0,0 +1,33 @@
+#pragma once
+
+/***
+  This file is part of systemd.
+
+  Copyright 2017 Lennart Poettering
+
+  systemd is free software; you can redistribute it and/or modify it
+  under the terms of the GNU Lesser General Public License as published by
+  the Free Software Foundation; either version 2.1 of the License, or
+  (at your option) any later version.
+
+  systemd is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public License
+  along with systemd; If not, see <http://www.gnu.org/licenses/>.
+***/
+
+#include <sys/types.h>
+
+/* Note that devpts's gid= parameter parses GIDs as signed values, hence we stay away from the upper half of the 32bit
+ * UID range here. We leave a bit of room at the lower end and a lot of room at the upper end, so that other subsystems
+ * may have their own allocation ranges too. */
+#define UID_SHIFT_PICK_MIN ((uid_t) UINT32_C(0x00080000))
+#define UID_SHIFT_PICK_MAX ((uid_t) UINT32_C(0x6FFF0000))
+
+/* While we are chmod()ing a directory tree, we set the top-level UID base to this "busy" base, so that we can always
+ * recognize trees we are were chmod()ing recursively and got interrupted in */
+#define UID_BUSY_BASE ((uid_t) UINT32_C(0xFFFE0000))
+#define UID_BUSY_MASK ((uid_t) UINT32_C(0xFFFF0000))
diff --git a/src/nspawn/nspawn-patch-uid.c b/src/nspawn/nspawn-patch-uid.c
index 063fdb1053..7857b453d6 100644
--- a/src/nspawn/nspawn-patch-uid.c
+++ b/src/nspawn/nspawn-patch-uid.c
@@ -23,13 +23,16 @@
 #include <sys/acl.h>
 #endif
 #include <sys/stat.h>
+#include <sys/statvfs.h>
 #include <sys/vfs.h>
 #include <unistd.h>
 
 #include "acl-util.h"
 #include "dirent-util.h"
 #include "fd-util.h"
+#include "fs-util.h"
 #include "missing.h"
+#include "nspawn-def.h"
 #include "nspawn-patch-uid.h"
 #include "stat-util.h"
 #include "stdio-util.h"
@@ -289,42 +292,44 @@ static int patch_fd(int fd, const char *name, const struct stat *st, uid_t shift
  * user namespaces, however their inodes may relate to host resources or only
  * valid in the global user namespace, therefore no patching should be applied.
  */
-static int is_fs_fully_userns_compatible(int fd) {
+static int is_fs_fully_userns_compatible(const struct statfs *sfs) {
+
+        assert(sfs);
+
+        return F_TYPE_EQUAL(sfs->f_type, BINFMTFS_MAGIC) ||
+               F_TYPE_EQUAL(sfs->f_type, CGROUP_SUPER_MAGIC) ||
+               F_TYPE_EQUAL(sfs->f_type, CGROUP2_SUPER_MAGIC) ||
+               F_TYPE_EQUAL(sfs->f_type, DEBUGFS_MAGIC) ||
+               F_TYPE_EQUAL(sfs->f_type, DEVPTS_SUPER_MAGIC) ||
+               F_TYPE_EQUAL(sfs->f_type, EFIVARFS_MAGIC) ||
+               F_TYPE_EQUAL(sfs->f_type, HUGETLBFS_MAGIC) ||
+               F_TYPE_EQUAL(sfs->f_type, MQUEUE_MAGIC) ||
+               F_TYPE_EQUAL(sfs->f_type, PROC_SUPER_MAGIC) ||
+               F_TYPE_EQUAL(sfs->f_type, PSTOREFS_MAGIC) ||
+               F_TYPE_EQUAL(sfs->f_type, SELINUX_MAGIC) ||
+               F_TYPE_EQUAL(sfs->f_type, SMACK_MAGIC) ||
+               F_TYPE_EQUAL(sfs->f_type, SECURITYFS_MAGIC) ||
+               F_TYPE_EQUAL(sfs->f_type, BPF_FS_MAGIC) ||
+               F_TYPE_EQUAL(sfs->f_type, TRACEFS_MAGIC) ||
+               F_TYPE_EQUAL(sfs->f_type, SYSFS_MAGIC);
+}
+
+static int recurse_fd(int fd, bool donate_fd, const struct stat *st, uid_t shift, bool is_toplevel) {
+        _cleanup_closedir_ DIR *d = NULL;
+        bool changed = false;
         struct statfs sfs;
+        int r;
 
         assert(fd >= 0);
 
         if (fstatfs(fd, &sfs) < 0)
                 return -errno;
 
-        return F_TYPE_EQUAL(sfs.f_type, BINFMTFS_MAGIC) ||
-               F_TYPE_EQUAL(sfs.f_type, CGROUP_SUPER_MAGIC) ||
-               F_TYPE_EQUAL(sfs.f_type, CGROUP2_SUPER_MAGIC) ||
-               F_TYPE_EQUAL(sfs.f_type, DEBUGFS_MAGIC) ||
-               F_TYPE_EQUAL(sfs.f_type, DEVPTS_SUPER_MAGIC) ||
-               F_TYPE_EQUAL(sfs.f_type, EFIVARFS_MAGIC) ||
-               F_TYPE_EQUAL(sfs.f_type, HUGETLBFS_MAGIC) ||
-               F_TYPE_EQUAL(sfs.f_type, MQUEUE_MAGIC) ||
-               F_TYPE_EQUAL(sfs.f_type, PROC_SUPER_MAGIC) ||
-               F_TYPE_EQUAL(sfs.f_type, PSTOREFS_MAGIC) ||
-               F_TYPE_EQUAL(sfs.f_type, SELINUX_MAGIC) ||
-               F_TYPE_EQUAL(sfs.f_type, SMACK_MAGIC) ||
-               F_TYPE_EQUAL(sfs.f_type, SECURITYFS_MAGIC) ||
-               F_TYPE_EQUAL(sfs.f_type, BPF_FS_MAGIC) ||
-               F_TYPE_EQUAL(sfs.f_type, TRACEFS_MAGIC) ||
-               F_TYPE_EQUAL(sfs.f_type, SYSFS_MAGIC);
-}
+        /* We generally want to permit crossing of mount boundaries when patching the UIDs/GIDs. However, we probably
+         * shouldn't do this for /proc and /sys if that is already mounted into place. Hence, let's stop the recursion
+         * when we hit procfs, sysfs or some other special file systems. */
 
-static int recurse_fd(int fd, bool donate_fd, const struct stat *st, uid_t shift, bool is_toplevel) {
-        bool changed = false;
-        int r;
-
-        assert(fd >= 0);
-
-        /* We generally want to permit crossing of mount boundaries when patching the UIDs/GIDs. However, we
-         * probably shouldn't do this for /proc and /sys if that is already mounted into place. Hence, let's
-         * stop the recursion when we hit procfs, sysfs or some other special file systems. */
-        r = is_fs_fully_userns_compatible(fd);
+        r = is_fs_fully_userns_compatible(&sfs);
         if (r < 0)
                 goto finish;
         if (r > 0) {
@@ -332,26 +337,12 @@ static int recurse_fd(int fd, bool donate_fd, const struct stat *st, uid_t shift
                 goto finish;
         }
 
-        r = patch_fd(fd, NULL, st, shift);
-        if (r == -EROFS) {
-                _cleanup_free_ char *name = NULL;
-
-                if (!is_toplevel) {
-                        /* When we hit a ready-only subtree we simply skip it, but log about it. */
-                        (void) fd_get_path(fd, &name);
-                        log_debug("Skippping read-only file or directory %s.", strna(name));
-                        r = 0;
-                }
-
-                goto finish;
-        }
-        if (r < 0)
-                goto finish;
-        if (r > 0)
-                changed = true;
+        /* Also, if we hit a read-only file system, then don't bother, skip the whole subtree */
+        if ((sfs.f_flags & ST_RDONLY) ||
+            access_fd(fd, W_OK) == -EROFS)
+                goto read_only;
 
         if (S_ISDIR(st->st_mode)) {
-                _cleanup_closedir_ DIR *d = NULL;
                 struct dirent *de;
 
                 if (!donate_fd) {
@@ -411,7 +402,27 @@ static int recurse_fd(int fd, bool donate_fd, const struct stat *st, uid_t shift
                 }
         }
 
+        /* After we descended, also patch the directory itself. It's key to do this in this order so that the top-level
+         * directory is patched as very last object in the tree, so that we can use it as quick indicator whether the
+         * tree is properly chown()ed already. */
+        r = patch_fd(d ? dirfd(d) : fd, NULL, st, shift);
+        if (r == -EROFS)
+                goto read_only;
+        if (r > 0)
+                changed = true;
+
         r = changed;
+        goto finish;
+
+read_only:
+        if (!is_toplevel) {
+                _cleanup_free_ char *name = NULL;
+
+                /* When we hit a ready-only subtree we simply skip it, but log about it. */
+                (void) fd_get_path(fd, &name);
+                log_debug("Skippping read-only file or directory %s.", strna(name));
+                r = changed;
+        }
 
 finish:
         if (donate_fd)
@@ -437,6 +448,11 @@ static int fd_patch_uid_internal(int fd, bool donate_fd, uid_t shift, uid_t rang
                 goto finish;
         }
 
+        if (shift == UID_BUSY_BASE) {
+                r = -EINVAL;
+                goto finish;
+        }
+
         if (range != 0x10000) {
                 /* We only support containers with 16bit UID ranges for the patching logic */
                 r = -EOPNOTSUPP;
@@ -459,6 +475,19 @@ static int fd_patch_uid_internal(int fd, bool donate_fd, uid_t shift, uid_t rang
         if (((uint32_t) (st.st_uid ^ shift) >> 16) == 0)
                 return 0;
 
+        /* Before we start recursively chowning, mark the top-level dir as "busy" by chowning it to the "busy"
+         * range. Should we be interrupted in the middle of our work, we'll see it owned by this user and will start
+         * chown()ing it again, unconditionally, as the busy UID is not a valid UID we'd everpick for ourselves. */
+
+        if ((st.st_uid & UID_BUSY_MASK) != UID_BUSY_BASE) {
+                if (fchown(fd,
+                           UID_BUSY_BASE | (st.st_uid & ~UID_BUSY_MASK),
+                           (gid_t) UID_BUSY_BASE | (st.st_gid & ~(gid_t) UID_BUSY_MASK)) < 0) {
+                        r = -errno;
+                        goto finish;
+                }
+        }
+
         return recurse_fd(fd, donate_fd, &st, shift, true);
 
 finish:
diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c
index de72c37452..e946bf80b6 100644
--- a/src/nspawn/nspawn.c
+++ b/src/nspawn/nspawn.c
@@ -77,6 +77,7 @@
 #include "mount-util.h"
 #include "netlink-util.h"
 #include "nspawn-cgroup.h"
+#include "nspawn-def.h"
 #include "nspawn-expose-ports.h"
 #include "nspawn-mount.h"
 #include "nspawn-network.h"
@@ -106,12 +107,6 @@
 #include "user-util.h"
 #include "util.h"
 
-/* Note that devpts's gid= parameter parses GIDs as signed values, hence we stay away from the upper half of the 32bit
- * UID range here. We leave a bit of room at the lower end and a lot of room at the upper end, so that other subsystems
- * may have their own allocation ranges too. */
-#define UID_SHIFT_PICK_MIN ((uid_t) UINT32_C(0x00080000))
-#define UID_SHIFT_PICK_MAX ((uid_t) UINT32_C(0x6FFF0000))
-
 /* nspawn is listening on the socket at the path in the constant nspawn_notify_socket_path
  * nspawn_notify_socket_path is relative to the container
  * the init process in the container pid can send messages to nspawn following the sd_notify(3) protocol */