1
0
mirror of https://github.com/systemd/systemd.git synced 2025-01-09 01:18:19 +03:00

nspawn: enable FUSE in containers

Linux kernel v4.18 (2018-08-12) added user-namespace support to FUSE, and
bumped the FUSE version to 7.27 (see: da315f6e0398 (Merge tag
'fuse-update-4.18' of
git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/fuse, Linus Torvalds,
2018-06-07)).  This means that on such kernels it is safe to enable FUSE in
nspawn containers.

In outer_child(), before calling copy_devnodes(), check the FUSE version to
decide whether to enable (>=7.27) or disable (<7.27) FUSE in the container.  We
look at the FUSE version instead of the kernel version in order to enable FUSE
support on older-versioned kernels that may have the mentioned patchset
backported ([as requested by @poettering][1]).  However, I am not sure that
this is safe; user-namespace support is not a documented part of the FUSE
protocol, which is what FUSE_KERNEL_VERSION/FUSE_KERNEL_MINOR_VERSION are meant
to capture.  While the same patchset
 - added FUSE_ABORT_ERROR (which is all that the 7.27 version bump
   is documented as including),
 - bumped FUSE_KERNEL_MINOR_VERSION from 26 to 27, and
 - added user-namespace support
these 3 things are not inseparable; it is conceivable to me that a backport
could include the first 2 of those things and exclude the 3rd; perhaps it would
be safer to check the kernel version.

Do note that our get_fuse_version() function uses the fsopen() family of
syscalls, which were not added until Linux kernel v5.2 (2019-07-07); so if
nothing has been backported, then the minimum kernel version for FUSE-in-nspawn
is actually v5.2, not v4.18.

Pass whether or not to enable FUSE to copy_devnodes(); have copy_devnodes()
copy in /dev/fuse if enabled.

Pass whether or not to enable FUSE back over fd_outer_socket to run_container()
so that it can pass that to append_machine_properties() (via either
register_machine() or allocate_scope()); have append_machine_properties()
append "DeviceAllow=/dev/fuse rw" if enabled.

For testing, simply check that /dev/fuse can be opened for reading and writing,
but that actually reading from it fails with EPERM.  The test assumes that if
FUSE is supported (/dev/fuse exists), then the testsuite is running on a kernel
with FUSE >= 7.27; I am unsure how to go about writing a test that validates
that the version check disables FUSE on old kernels.

[1]: https://github.com/systemd/systemd/issues/17607#issuecomment-745418835

Closes #17607
This commit is contained in:
Luke T. Shumaker 2024-08-21 17:29:10 -06:00
parent f1bf6054ce
commit dc3223919f
5 changed files with 187 additions and 7 deletions

View File

@ -15,6 +15,7 @@
static int append_machine_properties(
sd_bus_message *m,
bool enable_fuse,
CustomMount *mounts,
unsigned n_mounts,
int kill_signal,
@ -40,6 +41,12 @@ static int append_machine_properties(
"char-pts", "rw");
if (r < 0)
return bus_log_create_error(r);
if (enable_fuse) {
r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 1,
"/dev/fuse", "rw");
if (r < 0)
return bus_log_create_error(r);
}
for (j = 0; j < n_mounts; j++) {
CustomMount *cm = mounts + j;
@ -200,6 +207,7 @@ int register_machine(
r = append_machine_properties(
m,
FLAGS_SET(flags, REGISTER_MACHINE_ENABLE_FUSE),
mounts,
n_mounts,
kill_signal,
@ -320,6 +328,7 @@ int allocate_scope(
r = append_machine_properties(
m,
FLAGS_SET(flags, ALLOCATE_SCOPE_ENABLE_FUSE),
mounts,
n_mounts,
kill_signal,

View File

@ -10,6 +10,7 @@
/* Bit flags accepted by register_machine(). */
typedef enum RegisterMachineFlags {
/* Set by nspawn from its arg_keep_unit setting.
 * NOTE(review): the exact effect inside register_machine() is not visible here — confirm. */
REGISTER_MACHINE_KEEP_UNIT = 1 << 0,
/* Forwarded to append_machine_properties() as enable_fuse, which then adds
 * "DeviceAllow=/dev/fuse rw" to the properties being sent. */
REGISTER_MACHINE_ENABLE_FUSE = 1 << 1,
} RegisterMachineFlags;
int register_machine(
@ -31,6 +32,7 @@ int unregister_machine(sd_bus *bus, const char *machine_name);
/* Bit flags accepted by allocate_scope(). */
typedef enum AllocateScopeFlags {
/* NOTE(review): presumably permits identifying the payload process via a pidfd; the
 * implementation is not visible here — confirm. */
ALLOCATE_SCOPE_ALLOW_PIDFD = 1 << 0,
/* Forwarded to append_machine_properties() as enable_fuse, which then adds
 * "DeviceAllow=/dev/fuse rw" to the properties being sent. */
ALLOCATE_SCOPE_ENABLE_FUSE = 1 << 1,
} AllocateScopeFlags;
int allocate_scope(

View File

@ -2,6 +2,7 @@
#include <errno.h>
#include <getopt.h>
#include <linux/fuse.h>
#include <linux/loop.h>
#if HAVE_SELINUX
#include <selinux/selinux.h>
@ -2147,7 +2148,85 @@ static int setup_boot_id(void) {
return mount_nofollow_verbose(LOG_ERR, NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
}
static int copy_devnodes(const char *dest) {
static int get_fuse_version(uint32_t *ret_major, uint32_t *ret_minor) {
/* Must be called with mount privileges, either via arg_privileged or by being uid=0 in new
* CLONE_NEWUSER/CLONE_NEWNS namespaces. This is true when called from outer_child(). */
ssize_t n;
_cleanup_close_ int fuse_fd = -EBADF, mnt_fd = -EBADF;
_cleanup_free_ char *opts = NULL;
union {
char unstructured[FUSE_MIN_READ_BUFFER];
struct {
struct fuse_in_header header;
/* Don't use <linux/fuse.h>:`struct fuse_init_in` because a newer fuse.h might give
* us a bigger struct than what an older kernel actually gives us, and that would
* break our .header.len check. */
struct {
uint32_t major;
uint32_t minor;
} body;
} structured;
} request;
assert(ret_major);
assert(ret_minor);
/* Get a FUSE handle. */
fuse_fd = open("/dev/fuse", O_CLOEXEC|O_RDWR);
if (fuse_fd < 0)
return log_debug_errno(errno, "Failed to open /dev/fuse: %m");
if (asprintf(&opts, "fd=%i,rootmode=40000,user_id=0,group_id=0", fuse_fd) < 0)
return log_oom_debug();
mnt_fd = make_fsmount(LOG_DEBUG, "nspawn-fuse", "fuse.nspawn", 0, opts, -EBADF);
if (mnt_fd < 0)
return mnt_fd;
/* Read a request from the FUSE handle. */
n = read(fuse_fd, &request.unstructured, sizeof request);
if (n < 0)
return log_debug_errno(errno, "Failed to read /dev/fuse: %m");
if ((size_t) n < sizeof request.structured.header ||
(size_t) n < request.structured.header.len)
return log_debug_errno(SYNTHETIC_ERRNO(EIO), "Failed to read /dev/fuse: Short read");
/* Assume that the request is a FUSE_INIT request, and return the version information from it. */
if (request.structured.header.opcode != FUSE_INIT)
return log_debug_errno(SYNTHETIC_ERRNO(EIO), "Initial request from /dev/fuse should have opcode=%i (FUSE_INIT), but has opcode=%"PRIu32,
FUSE_INIT, request.structured.header.opcode);
if (request.structured.header.len < sizeof request.structured)
return log_debug_errno(SYNTHETIC_ERRNO(EIO), "Initial FUSE_INIT request from /dev/fuse is too short");
*ret_major = request.structured.body.major;
*ret_minor = request.structured.body.minor;
return 0;
}
/* Decides whether /dev/fuse should be made available inside the container.
 *
 * Queries the kernel's FUSE protocol version through get_fuse_version(). Returns true only if the
 * version is 7.27 or newer; returns false if the version is older or could not be determined. The
 * reason for disabling is always logged. */
static bool should_enable_fuse(void) {
        uint32_t major, minor;
        int r;

        r = get_fuse_version(&major, &minor);
        if (r >= 0) {
                /* FUSE is only userns-safe in FUSE version 7.27 and later.
                 * https://github.com/torvalds/linux/commit/da315f6e03988a7127680bbc26e1028991b899b8 */
                if (major > 7 || (major == 7 && minor >= 27))
                        return true;

                log_debug("Disabling FUSE: FUSE version %" PRIu32 ".%" PRIu32 " is too old to support user namespaces",
                          major, minor);
                return false;
        }

        /* The version probe failed. A missing device or missing syscalls is only logged at debug
         * level; any other failure is logged as a warning. */
        if (ERRNO_IS_NEG_DEVICE_ABSENT(r))
                log_debug_errno(r, "Disabling FUSE: FUSE appears to be disabled on the host: %m");
        else if (r == -ENOSYS)
                log_debug_errno(r, "Disabling FUSE: Kernel does not support the fsopen() family of syscalls: %m");
        else
                log_warning_errno(r, "Disabling FUSE: Failed to determine FUSE version: %m");

        return false;
}
static int copy_devnodes(const char *dest, bool enable_fuse) {
_cleanup_strv_free_ char **devnodes = NULL;
int r = 0;
@ -2159,6 +2238,7 @@ static int copy_devnodes(const char *dest) {
"random",
"urandom",
"tty",
STRV_IFNOTNULL(enable_fuse ? "fuse" : NULL),
"net/tun");
if (!devnodes)
return log_oom();
@ -3807,7 +3887,7 @@ static int outer_child(
_cleanup_(bind_user_context_freep) BindUserContext *bind_user_context = NULL;
_cleanup_strv_free_ char **os_release_pairs = NULL;
_cleanup_close_ int fd = -EBADF, mntns_fd = -EBADF;
bool idmap = false;
bool idmap = false, enable_fuse;
const char *p;
pid_t pid;
ssize_t l;
@ -4090,7 +4170,12 @@ static int outer_child(
if (r < 0)
return r;
r = copy_devnodes(directory);
enable_fuse = should_enable_fuse();
l = send(fd_outer_socket, &enable_fuse, sizeof enable_fuse, 0);
if (l < 0)
return log_error_errno(errno, "Failed to send whether to enable FUSE: %m");
r = copy_devnodes(directory, enable_fuse);
if (r < 0)
return r;
@ -5048,6 +5133,7 @@ static int run_container(
ssize_t l;
sigset_t mask_chld;
_cleanup_close_ int child_netns_fd = -EBADF;
bool enable_fuse;
assert_se(sigemptyset(&mask_chld) == 0);
assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
@ -5234,6 +5320,12 @@ static int run_container(
l, l == 0 ? " The child is most likely dead." : "");
}
l = recv(fd_outer_socket_pair[0], &enable_fuse, sizeof enable_fuse, 0);
if (l < 0)
return log_error_errno(errno, "Failed to read whether to enable FUSE: %m");
if (l != sizeof enable_fuse)
return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading whether to enable FUSE.");
/* Wait for the outer child. */
r = wait_for_terminate_and_check("(sd-namespace)", *pid, WAIT_LOG_ABNORMAL);
if (r < 0)
@ -5386,6 +5478,7 @@ static int run_container(
if (arg_register) {
RegisterMachineFlags flags = 0;
SET_FLAG(flags, REGISTER_MACHINE_KEEP_UNIT, arg_keep_unit);
SET_FLAG(flags, REGISTER_MACHINE_ENABLE_FUSE, enable_fuse);
r = register_machine(
bus,
arg_machine,
@ -5406,6 +5499,7 @@ static int run_container(
} else if (!arg_keep_unit) {
AllocateScopeFlags flags = ALLOCATE_SCOPE_ALLOW_PIDFD;
SET_FLAG(flags, ALLOCATE_SCOPE_ENABLE_FUSE, enable_fuse);
r = allocate_scope(
bus,
arg_machine,

View File

@ -1112,4 +1112,75 @@ testcase_unpriv() {
echo hello | cmp "$tmpdir/stdout.txt" -
}
# Check that systemd-nspawn, run on a directory tree, provides a /dev/fuse node
# inside the container that can be opened read-write.
testcase_fuse() {
# Probe the host first, using the same check that is later run inside the
# container: any output other than cat's EPERM message is treated as "FUSE
# not supported" and the test is skipped.
if [[ "$(cat <>/dev/fuse 2>&1)" != 'cat: -: Operation not permitted' ]]; then
echo "FUSE is not supported, skipping the test..."
return 0
fi
# Assume that the tests are running on a kernel that is new enough for FUSE
# to have user-namespace support; and so we should expect that nspawn
# enables FUSE. This test does not validate that the version check
# disables FUSE on old kernels.
local root
root="$(mktemp -d /var/lib/machines/TEST-13-NSPAWN.fuse.XXX)"
create_dummy_container "$root"
# To avoid adding any complex dependencies to the test, we simply check
# that /dev/fuse can be opened for reading and writing (O_RDWR), but that
# actually reading from it fails with EPERM. This can be done with a
# simple Bash script: run `cat <>/dev/fuse` and if the EPERM error message
# comes from "bash" then we know it couldn't be opened, while if it comes
# from "cat" then we know that it was opened but not read. If we are able
# to read from the file, then this indicates that it's not a real FUSE
# device (which requires us to mount a type="fuse" filesystem with the
# option string "fd=${num}" for /dev/fuse FD before reading from it will
# return anything other than EPERM); if this happens then most likely
# nspawn didn't create the file at all and Bash "<>" simply created a new
# normal file.
#
# "cat: -: Operation not permitted" # pass the test; opened but not read
# "bash: line 1: /dev/fuse: Operation not permitted" # fail the test; could not open
# "" # fail the test; reading worked
#
# NOTE(review): this comparison is not the last command of the function, so
# a mismatch only fails the test if the script runs with errexit — confirm.
[[ "$(systemd-nspawn --pipe --directory="$root" \
bash -c 'cat <>/dev/fuse' 2>&1)" == 'cat: -: Operation not permitted' ]]
rm -fr "$root"
}
# Check that /dev/fuse can be opened read-write inside a container that
# systemd-nspawn starts from a disk image while running as "testuser".
testcase_unpriv_fuse() {
# Same as above, but for unprivileged operation.
# Skip unless the host passes the same open-but-cannot-read probe.
if [[ "$(cat <>/dev/fuse 2>&1)" != 'cat: -: Operation not permitted' ]]; then
echo "FUSE is not supported, skipping the test..."
return 0
fi
if ! can_do_rootless_nspawn; then
echo "Skipping rootless test..."
return 0
fi
local tmpdir name
tmpdir="$(mktemp -d /var/tmp/TEST-13-NSPAWN.unpriv-fuse.XXX)"
# $name must be such that len("ns-$(id -u testuser)-nspawn-${name}-65535")
# <= 31, or nsresourced will reject the request for a namespace.
# Therefore; len($name) <= 10 bytes.
# The suffix is whatever mktemp substituted for "XXX" in $tmpdir.
name="ufuse-${tmpdir##*.}"
# Clean up the temporary directory and the verity.d entry whenever the
# function returns or a command fails.
trap 'rm -fr ${tmpdir@Q} || true; rm -f /run/verity.d/test-13-nspawn-${name@Q} || true' RETURN ERR
create_dummy_ddi "$tmpdir" "$name"
chown --recursive testuser: "$tmpdir"
# Unlike the privileged test, the expected message is matched only as a
# suffix of the captured output (leading "*").
# NOTE(review): presumably because systemd-run/nspawn may print additional
# lines before it — confirm.
[[ "$(systemd-run \
--pipe \
--uid=testuser \
--property=Delegate=yes \
--setenv=SYSTEMD_LOG_LEVEL \
--setenv=SYSTEMD_LOG_TARGET \
-- \
systemd-nspawn --pipe --private-network --register=no --keep-unit --image="$tmpdir/$name.raw" \
bash -c 'cat <>/dev/fuse' 2>&1)" == *'cat: -: Operation not permitted' ]]
}
run_testcases

View File

@ -30,12 +30,16 @@ CoredumpReceive=yes
TasksMax=16384
{{SERVICE_WATCHDOG}}
{# Enforce a strict device policy, similar to the one nspawn configures when it
# allocates its own scope unit. Make sure to keep these policies in sync if you
# change them! #}
{# Enforce a strict device policy, similar to the one nspawn configures (in
# nspawn-register.c:append_machine_properties()) when it allocates its own
# scope unit. Make sure to keep these policies in sync if you change them! #}
DevicePolicy=closed
DeviceAllow=/dev/net/tun rwm
DeviceAllow=char-pts rw
{# /dev/fuse gets 'm' here even though it doesn't in nspawn-register.c, since
# efedb6b0f3 (nspawn: refuse to bind mount device node from host when
# --private-users= is specified, 2024-09-05) #}
DeviceAllow=/dev/fuse rwm
# nspawn itself needs access to /dev/loop-control and /dev/loop, to implement
# the --image= option. Add these here, too.