nspawn: make /dev/fuse available in containers when FUSE is userns-safe

outer_child() calls should_enable_fuse(), which uses get_fuse_version() to
open /dev/fuse, hand the fd to make_fsmount() and read the first request,
expected to be FUSE_INIT, to learn the kernel's FUSE protocol version.
FUSE is enabled only for version 7.27 or later. The boolean result is sent
over fd_outer_socket; run_container() receives it and sets the new
*_ENABLE_FUSE flag for register_machine() / allocate_scope(), which makes
append_machine_properties() add a "/dev/fuse rw" DeviceAllow entry.
copy_devnodes() adds "fuse" to its device node list only when enabled.
The unit template gains a matching DeviceAllow= line, and two test cases
check that /dev/fuse can be opened inside the container.

diff --git a/src/nspawn/nspawn-register.c b/src/nspawn/nspawn-register.c
index 855172c09c3..52f73844681 100644
--- a/src/nspawn/nspawn-register.c
+++ b/src/nspawn/nspawn-register.c
@@ -15,6 +15,7 @@
 
 static int append_machine_properties(
                 sd_bus_message *m,
+                bool enable_fuse,
                 CustomMount *mounts,
                 unsigned n_mounts,
                 int kill_signal,
@@ -40,6 +41,12 @@ static int append_machine_properties(
                                   "char-pts", "rw");
         if (r < 0)
                 return bus_log_create_error(r);
+        if (enable_fuse) {
+                r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 1,
+                                          "/dev/fuse", "rw");
+                if (r < 0)
+                        return bus_log_create_error(r);
+        }
 
         for (j = 0; j < n_mounts; j++) {
                 CustomMount *cm = mounts + j;
@@ -200,6 +207,7 @@ int register_machine(
 
                 r = append_machine_properties(
                                 m,
+                                FLAGS_SET(flags, REGISTER_MACHINE_ENABLE_FUSE),
                                 mounts,
                                 n_mounts,
                                 kill_signal,
@@ -320,6 +328,7 @@ int allocate_scope(
 
         r = append_machine_properties(
                         m,
+                        FLAGS_SET(flags, ALLOCATE_SCOPE_ENABLE_FUSE),
                         mounts,
                         n_mounts,
                         kill_signal,
diff --git a/src/nspawn/nspawn-register.h b/src/nspawn/nspawn-register.h
index 0effb40aa01..5e187e33bb8 100644
--- a/src/nspawn/nspawn-register.h
+++ b/src/nspawn/nspawn-register.h
@@ -9,7 +9,8 @@
 #include "nspawn-settings.h"
 
 typedef enum RegisterMachineFlags {
-        REGISTER_MACHINE_KEEP_UNIT = 1 << 0,
+        REGISTER_MACHINE_KEEP_UNIT   = 1 << 0,
+        REGISTER_MACHINE_ENABLE_FUSE = 1 << 1,
 } RegisterMachineFlags;
 
 int register_machine(
@@ -31,6 +32,7 @@ int unregister_machine(sd_bus *bus, const char *machine_name);
 
 typedef enum AllocateScopeFlags {
         ALLOCATE_SCOPE_ALLOW_PIDFD = 1 << 0,
+        ALLOCATE_SCOPE_ENABLE_FUSE = 1 << 1,
 } AllocateScopeFlags;
 
 int allocate_scope(
diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c
index 8a26333364c..f8bcf26b58d 100644
--- a/src/nspawn/nspawn.c
+++ b/src/nspawn/nspawn.c
@@ -2,6 +2,7 @@
 
 #include <errno.h>
 #include <getopt.h>
+#include <linux/fuse.h>
 #include <linux/loop.h>
 #if HAVE_SELINUX
 #include <selinux/selinux.h>
@@ -2147,7 +2148,85 @@ static int setup_boot_id(void) {
         return mount_nofollow_verbose(LOG_ERR, NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
 }
 
-static int copy_devnodes(const char *dest) {
+static int get_fuse_version(uint32_t *ret_major, uint32_t *ret_minor) {
+        /* Must be called with mount privileges, either via arg_privileged or by being uid=0 in new
+         * CLONE_NEWUSER/CLONE_NEWNS namespaces. This is true when called from outer_child(). */
+        ssize_t n;
+        _cleanup_close_ int fuse_fd = -EBADF, mnt_fd = -EBADF;
+        _cleanup_free_ char *opts = NULL;
+        union {
+                char unstructured[FUSE_MIN_READ_BUFFER];
+                struct {
+                        struct fuse_in_header header;
+                        /* Don't use `struct fuse_init_in` because a newer fuse.h might give
+                         * us a bigger struct than what an older kernel actually gives us, and that would
+                         * break our .header.len check. */
+                        struct {
+                                uint32_t major;
+                                uint32_t minor;
+                        } body;
+                } structured;
+        } request;
+
+        assert(ret_major);
+        assert(ret_minor);
+
+        /* Get a FUSE handle. */
+        fuse_fd = open("/dev/fuse", O_CLOEXEC|O_RDWR);
+        if (fuse_fd < 0)
+                return log_debug_errno(errno, "Failed to open /dev/fuse: %m");
+        if (asprintf(&opts, "fd=%i,rootmode=40000,user_id=0,group_id=0", fuse_fd) < 0)
+                return log_oom_debug();
+        mnt_fd = make_fsmount(LOG_DEBUG, "nspawn-fuse", "fuse.nspawn", 0, opts, -EBADF);
+        if (mnt_fd < 0)
+                return mnt_fd;
+
+        /* Read a request from the FUSE handle. */
+        n = read(fuse_fd, &request.unstructured, sizeof request);
+        if (n < 0)
+                return log_debug_errno(errno, "Failed to read /dev/fuse: %m");
+        if ((size_t) n < sizeof request.structured.header ||
+            (size_t) n < request.structured.header.len)
+                return log_debug_errno(SYNTHETIC_ERRNO(EIO), "Failed to read /dev/fuse: Short read");
+
+        /* Assume that the request is a FUSE_INIT request, and return the version information from it. */
+        if (request.structured.header.opcode != FUSE_INIT)
+                return log_debug_errno(SYNTHETIC_ERRNO(EIO), "Initial request from /dev/fuse should have opcode=%i (FUSE_INIT), but has opcode=%"PRIu32,
+                                       FUSE_INIT, request.structured.header.opcode);
+        if (request.structured.header.len < sizeof request.structured)
+                return log_debug_errno(SYNTHETIC_ERRNO(EIO), "Initial FUSE_INIT request from /dev/fuse is too short");
+        *ret_major = request.structured.body.major;
+        *ret_minor = request.structured.body.minor;
+        return 0;
+}
+
+static bool should_enable_fuse(void) {
+        uint32_t fuse_major, fuse_minor;
+        int r;
+
+        r = get_fuse_version(&fuse_major, &fuse_minor);
+        if (r < 0) {
+                if (ERRNO_IS_NEG_DEVICE_ABSENT(r))
+                        log_debug_errno(r, "Disabling FUSE: FUSE appears to be disabled on the host: %m");
+                else if (r == -ENOSYS)
+                        log_debug_errno(r, "Disabling FUSE: Kernel does not support the fsopen() family of syscalls: %m");
+                else
+                        log_warning_errno(r, "Disabling FUSE: Failed to determine FUSE version: %m");
+                return false;
+        }
+
+        /* FUSE is only userns-safe in FUSE version 7.27 and later.
+         * https://github.com/torvalds/linux/commit/da315f6e03988a7127680bbc26e1028991b899b8 */
+        if (fuse_major < 7 || (fuse_major == 7 && fuse_minor < 27)) {
+                log_debug("Disabling FUSE: FUSE version %" PRIu32 ".%" PRIu32 " is too old to support user namespaces",
+                          fuse_major, fuse_minor);
+                return false;
+        }
+
+        return true;
+}
+
+static int copy_devnodes(const char *dest, bool enable_fuse) {
         _cleanup_strv_free_ char **devnodes = NULL;
         int r = 0;
 
@@ -2159,6 +2238,7 @@ static int copy_devnodes(const char *dest) {
                             "random",
                             "urandom",
                             "tty",
+                            STRV_IFNOTNULL(enable_fuse ? "fuse" : NULL),
                             "net/tun");
         if (!devnodes)
                 return log_oom();
@@ -3807,7 +3887,7 @@ static int outer_child(
         _cleanup_(bind_user_context_freep) BindUserContext *bind_user_context = NULL;
         _cleanup_strv_free_ char **os_release_pairs = NULL;
         _cleanup_close_ int fd = -EBADF, mntns_fd = -EBADF;
-        bool idmap = false;
+        bool idmap = false, enable_fuse;
         const char *p;
         pid_t pid;
         ssize_t l;
@@ -4090,7 +4170,12 @@ static int outer_child(
         if (r < 0)
                 return r;
 
-        r = copy_devnodes(directory);
+        enable_fuse = should_enable_fuse();
+        l = send(fd_outer_socket, &enable_fuse, sizeof enable_fuse, 0);
+        if (l < 0)
+                return log_error_errno(errno, "Failed to send whether to enable FUSE: %m");
+
+        r = copy_devnodes(directory, enable_fuse);
         if (r < 0)
                 return r;
 
@@ -5048,6 +5133,7 @@ static int run_container(
         ssize_t l;
         sigset_t mask_chld;
         _cleanup_close_ int child_netns_fd = -EBADF;
+        bool enable_fuse;
 
         assert_se(sigemptyset(&mask_chld) == 0);
         assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
@@ -5234,6 +5320,12 @@ static int run_container(
                                                l, l == 0 ? " The child is most likely dead." : "");
         }
 
+        l = recv(fd_outer_socket_pair[0], &enable_fuse, sizeof enable_fuse, 0);
+        if (l < 0)
+                return log_error_errno(errno, "Failed to read whether to enable FUSE: %m");
+        if (l != sizeof enable_fuse)
+                return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading whether to enable FUSE.");
+
         /* Wait for the outer child. */
         r = wait_for_terminate_and_check("(sd-namespace)", *pid, WAIT_LOG_ABNORMAL);
         if (r < 0)
@@ -5386,6 +5478,7 @@ static int run_container(
         if (arg_register) {
                 RegisterMachineFlags flags = 0;
                 SET_FLAG(flags, REGISTER_MACHINE_KEEP_UNIT, arg_keep_unit);
+                SET_FLAG(flags, REGISTER_MACHINE_ENABLE_FUSE, enable_fuse);
                 r = register_machine(
                                 bus,
                                 arg_machine,
@@ -5406,6 +5499,7 @@ static int run_container(
 
         } else if (!arg_keep_unit) {
                 AllocateScopeFlags flags = ALLOCATE_SCOPE_ALLOW_PIDFD;
+                SET_FLAG(flags, ALLOCATE_SCOPE_ENABLE_FUSE, enable_fuse);
                 r = allocate_scope(
                                 bus,
                                 arg_machine,
diff --git a/test/units/TEST-13-NSPAWN.nspawn.sh b/test/units/TEST-13-NSPAWN.nspawn.sh
index b0408e55b7e..655e53128ba 100755
--- a/test/units/TEST-13-NSPAWN.nspawn.sh
+++ b/test/units/TEST-13-NSPAWN.nspawn.sh
@@ -1112,4 +1112,75 @@ testcase_unpriv() {
     echo hello | cmp "$tmpdir/stdout.txt" -
 }
 
+testcase_fuse() {
+    if [[ "$(cat <>/dev/fuse 2>&1)" != 'cat: -: Operation not permitted' ]]; then
+        echo "FUSE is not supported, skipping the test..."
+        return 0
+    fi
+
+    # Assume that the tests are running on a kernel that is new enough for FUSE
+    # to have user-namespace support; and so we should expect that nspawn
+    # enables FUSE.  This test does not validate that the version check
+    # disables FUSE on old kernels.
+
+    local root
+
+    root="$(mktemp -d /var/lib/machines/TEST-13-NSPAWN.fuse.XXX)"
+    create_dummy_container "$root"
+
+    # To avoid adding any complex dependencies to the test, we simply check
+    # that /dev/fuse can be opened for reading and writing (O_RDWR), but that
+    # actually reading from it fails with EPERM.  This can be done with a
+    # simple Bash script: run `cat <>/dev/fuse` and if the EPERM error message
+    # comes from "bash" then we know it couldn't be opened, while if it comes
+    # from "cat" then we know that it was opened but not read.  If we are able
+    # to read from the file, then this indicates that it's not a real FUSE
+    # device (which requires us to mount a type="fuse" filesystem with the
+    # option string "fd=${num}" for /dev/fuse FD before reading from it will
+    # return anything other than EPERM); if this happens then most likely
+    # nspawn didn't create the file at all and Bash "<>" simply created a new
+    # normal file.
+    #
+    #    "cat: -: Operation not permitted"                   # pass the test; opened but not read
+    #    "bash: line 1: /dev/fuse: Operation not permitted"  # fail the test; could not open
+    #    ""                                                  # fail the test; reading worked
+    [[ "$(systemd-nspawn --pipe --directory="$root" \
+              bash -c 'cat <>/dev/fuse' 2>&1)" == 'cat: -: Operation not permitted' ]]
+
+    rm -fr "$root"
+}
+
+testcase_unpriv_fuse() {
+    # Same as above, but for unprivileged operation.
+
+    if [[ "$(cat <>/dev/fuse 2>&1)" != 'cat: -: Operation not permitted' ]]; then
+        echo "FUSE is not supported, skipping the test..."
+        return 0
+    fi
+    if ! can_do_rootless_nspawn; then
+        echo "Skipping rootless test..."
+        return 0
+    fi
+
+    local tmpdir name
+    tmpdir="$(mktemp -d /var/tmp/TEST-13-NSPAWN.unpriv-fuse.XXX)"
+    # $name must be such that len("ns-$(id -u testuser)-nspawn-${name}-65535")
+    # <= 31, or nsresourced will reject the request for a namespace.
+    # Therefore; len($name) <= 10 bytes.
+    name="ufuse-${tmpdir##*.}"
+    trap 'rm -fr ${tmpdir@Q} || true; rm -f /run/verity.d/test-13-nspawn-${name@Q} || true' RETURN ERR
+    create_dummy_ddi "$tmpdir" "$name"
+    chown --recursive testuser: "$tmpdir"
+
+    [[ "$(systemd-run \
+              --pipe \
+              --uid=testuser \
+              --property=Delegate=yes \
+              --setenv=SYSTEMD_LOG_LEVEL \
+              --setenv=SYSTEMD_LOG_TARGET \
+              -- \
+              systemd-nspawn --pipe --private-network --register=no --keep-unit --image="$tmpdir/$name.raw" \
+                  bash -c 'cat <>/dev/fuse' 2>&1)" == *'cat: -: Operation not permitted' ]]
+}
+
 run_testcases
diff --git a/units/systemd-nspawn@.service.in b/units/systemd-nspawn@.service.in
index ff66d4090ac..c2f21c6cbba 100644
--- a/units/systemd-nspawn@.service.in
+++ b/units/systemd-nspawn@.service.in
@@ -30,12 +30,16 @@ CoredumpReceive=yes
 TasksMax=16384
 {{SERVICE_WATCHDOG}}
 
-{# Enforce a strict device policy, similar to the one nspawn configures when it
- # allocates its own scope unit. Make sure to keep these policies in sync if you
- # change them! #}
+{# Enforce a strict device policy, similar to the one nspawn configures (in
+ # nspawn-register.c:append_machine_properties()) when it allocates its own
+ # scope unit. Make sure to keep these policies in sync if you change them! #}
 DevicePolicy=closed
 DeviceAllow=/dev/net/tun rwm
 DeviceAllow=char-pts rw
+{# /dev/fuse gets 'm' here even though it doesn't in nspawn-register.c, since
+ # efedb6b0f3 (nspawn: refuse to bind mount device node from host when
+ # --private-users= is specified, 2024-09-05) #}
+DeviceAllow=/dev/fuse rwm
 
 # nspawn itself needs access to /dev/loop-control and /dev/loop, to implement
 # the --image= option. Add these here, too.