mirror of
https://github.com/systemd/systemd.git
synced 2025-01-09 01:18:19 +03:00
nspawn: enable FUSE in containers
Linux kernel v4.18 (2018-08-12) added user-namespace support to FUSE, and bumped the FUSE version to 7.27 (see: da315f6e0398 (Merge tag 'fuse-update-4.18' of git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/fuse, Linus Torvalds, 2018-06-07)). This means that on such kernels it is safe to enable FUSE in nspawn containers. In outer_child(), before calling copy_devnodes(), check the FUSE version to decide whether to enable (>=7.27) or disable (<7.27) FUSE in the container. We look at the FUSE version instead of the kernel version in order to enable FUSE support on older-versioned kernels that may have the mentioned patchset backported ([as requested by @poettering][1]). However, I am not sure that this is safe; user-namespace support is not a documented part of the FUSE protocol, which is what FUSE_KERNEL_VERSION/FUSE_KERNEL_MINOR_VERSION are meant to capture. While the same patchset - added FUSE_ABORT_ERROR (which is all that the 7.27 version bump is documented as including), - bumped FUSE_KERNEL_MINOR_VERSION from 26 to 27, and - added user-namespace support, these 3 things are not inseparable; it is conceivable to me that a backport could include the first 2 of those things and exclude the 3rd; perhaps it would be safer to check the kernel version. Do note that our get_fuse_version() function uses the fsopen() family of syscalls, which were not added until Linux kernel v5.2 (2019-07-07); so if nothing has been backported, then the minimum kernel version for FUSE-in-nspawn is actually v5.2, not v4.18. Pass whether or not to enable FUSE to copy_devnodes(); have copy_devnodes() copy in /dev/fuse if enabled. Pass whether or not to enable FUSE back over fd_outer_socket to run_container() so that it can pass that to append_machine_properties() (via either register_machine() or allocate_scope()); have append_machine_properties() append "DeviceAllow=/dev/fuse rw" if enabled.
For testing, simply check that /dev/fuse can be opened for reading and writing, but that actually reading from it fails with EPERM. The test assumes that if FUSE is supported (/dev/fuse exists), then the testsuite is running on a kernel with FUSE >= 7.27; I am unsure how to go about writing a test that validates that the version check disables FUSE on old kernels. [1]: https://github.com/systemd/systemd/issues/17607#issuecomment-745418835 Closes #17607
This commit is contained in:
parent
f1bf6054ce
commit
dc3223919f
@ -15,6 +15,7 @@
|
||||
|
||||
static int append_machine_properties(
|
||||
sd_bus_message *m,
|
||||
bool enable_fuse,
|
||||
CustomMount *mounts,
|
||||
unsigned n_mounts,
|
||||
int kill_signal,
|
||||
@ -40,6 +41,12 @@ static int append_machine_properties(
|
||||
"char-pts", "rw");
|
||||
if (r < 0)
|
||||
return bus_log_create_error(r);
|
||||
if (enable_fuse) {
|
||||
r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 1,
|
||||
"/dev/fuse", "rw");
|
||||
if (r < 0)
|
||||
return bus_log_create_error(r);
|
||||
}
|
||||
|
||||
for (j = 0; j < n_mounts; j++) {
|
||||
CustomMount *cm = mounts + j;
|
||||
@ -200,6 +207,7 @@ int register_machine(
|
||||
|
||||
r = append_machine_properties(
|
||||
m,
|
||||
FLAGS_SET(flags, REGISTER_MACHINE_ENABLE_FUSE),
|
||||
mounts,
|
||||
n_mounts,
|
||||
kill_signal,
|
||||
@ -320,6 +328,7 @@ int allocate_scope(
|
||||
|
||||
r = append_machine_properties(
|
||||
m,
|
||||
FLAGS_SET(flags, ALLOCATE_SCOPE_ENABLE_FUSE),
|
||||
mounts,
|
||||
n_mounts,
|
||||
kill_signal,
|
||||
|
@ -9,7 +9,8 @@
|
||||
#include "nspawn-settings.h"
|
||||
|
||||
/* Bit flags taken by register_machine(). Each enumerator occupies its own bit so that they can be
 * OR-ed together and tested with FLAGS_SET(). */
typedef enum RegisterMachineFlags {
        /* Set by run_container() from arg_keep_unit. */
        REGISTER_MACHINE_KEEP_UNIT   = 1 << 0,
        /* Forwarded to append_machine_properties() as enable_fuse, which then adds
         * "DeviceAllow=/dev/fuse rw" to the unit properties. */
        REGISTER_MACHINE_ENABLE_FUSE = 1 << 1,
} RegisterMachineFlags;
|
||||
|
||||
int register_machine(
|
||||
@ -31,6 +32,7 @@ int unregister_machine(sd_bus *bus, const char *machine_name);
|
||||
|
||||
/* Bit flags taken by allocate_scope(). Each enumerator occupies its own bit so that they can be
 * OR-ed together and tested with FLAGS_SET(). */
typedef enum AllocateScopeFlags {
        /* run_container() always starts from this flag when it allocates a scope. */
        ALLOCATE_SCOPE_ALLOW_PIDFD = 1 << 0,

        /* Forwarded to append_machine_properties() as enable_fuse, which then adds
         * "DeviceAllow=/dev/fuse rw" to the unit properties. */
        ALLOCATE_SCOPE_ENABLE_FUSE = 1 << 1,
} AllocateScopeFlags;
|
||||
|
||||
int allocate_scope(
|
||||
|
@ -2,6 +2,7 @@
|
||||
|
||||
#include <errno.h>
|
||||
#include <getopt.h>
|
||||
#include <linux/fuse.h>
|
||||
#include <linux/loop.h>
|
||||
#if HAVE_SELINUX
|
||||
#include <selinux/selinux.h>
|
||||
@ -2147,7 +2148,85 @@ static int setup_boot_id(void) {
|
||||
return mount_nofollow_verbose(LOG_ERR, NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
|
||||
}
|
||||
|
||||
static int copy_devnodes(const char *dest) {
|
||||
static int get_fuse_version(uint32_t *ret_major, uint32_t *ret_minor) {
|
||||
/* Must be called with mount privileges, either via arg_privileged or by being uid=0 in new
|
||||
* CLONE_NEWUSER/CLONE_NEWNS namespaces. This is true when called from outer_child(). */
|
||||
ssize_t n;
|
||||
_cleanup_close_ int fuse_fd = -EBADF, mnt_fd = -EBADF;
|
||||
_cleanup_free_ char *opts = NULL;
|
||||
union {
|
||||
char unstructured[FUSE_MIN_READ_BUFFER];
|
||||
struct {
|
||||
struct fuse_in_header header;
|
||||
/* Don't use <linux/fuse.h>:`struct fuse_init_in` because a newer fuse.h might give
|
||||
* us a bigger struct than what an older kernel actually gives us, and that would
|
||||
* break our .header.len check. */
|
||||
struct {
|
||||
uint32_t major;
|
||||
uint32_t minor;
|
||||
} body;
|
||||
} structured;
|
||||
} request;
|
||||
|
||||
assert(ret_major);
|
||||
assert(ret_minor);
|
||||
|
||||
/* Get a FUSE handle. */
|
||||
fuse_fd = open("/dev/fuse", O_CLOEXEC|O_RDWR);
|
||||
if (fuse_fd < 0)
|
||||
return log_debug_errno(errno, "Failed to open /dev/fuse: %m");
|
||||
if (asprintf(&opts, "fd=%i,rootmode=40000,user_id=0,group_id=0", fuse_fd) < 0)
|
||||
return log_oom_debug();
|
||||
mnt_fd = make_fsmount(LOG_DEBUG, "nspawn-fuse", "fuse.nspawn", 0, opts, -EBADF);
|
||||
if (mnt_fd < 0)
|
||||
return mnt_fd;
|
||||
|
||||
/* Read a request from the FUSE handle. */
|
||||
n = read(fuse_fd, &request.unstructured, sizeof request);
|
||||
if (n < 0)
|
||||
return log_debug_errno(errno, "Failed to read /dev/fuse: %m");
|
||||
if ((size_t) n < sizeof request.structured.header ||
|
||||
(size_t) n < request.structured.header.len)
|
||||
return log_debug_errno(SYNTHETIC_ERRNO(EIO), "Failed to read /dev/fuse: Short read");
|
||||
|
||||
/* Assume that the request is a FUSE_INIT request, and return the version information from it. */
|
||||
if (request.structured.header.opcode != FUSE_INIT)
|
||||
return log_debug_errno(SYNTHETIC_ERRNO(EIO), "Initial request from /dev/fuse should have opcode=%i (FUSE_INIT), but has opcode=%"PRIu32,
|
||||
FUSE_INIT, request.structured.header.opcode);
|
||||
if (request.structured.header.len < sizeof request.structured)
|
||||
return log_debug_errno(SYNTHETIC_ERRNO(EIO), "Initial FUSE_INIT request from /dev/fuse is too short");
|
||||
*ret_major = request.structured.body.major;
|
||||
*ret_minor = request.structured.body.minor;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Decides whether FUSE should be made available inside the container.
 *
 * Returns true only if get_fuse_version() succeeds and reports FUSE protocol 7.27 or newer. Every
 * other outcome returns false after logging the reason: at debug level when the device is absent,
 * when get_fuse_version() fails with -ENOSYS, or when the protocol is too old, and at warning level
 * for any other failure. */
static bool should_enable_fuse(void) {
        uint32_t ver_major, ver_minor;
        int r;

        r = get_fuse_version(&ver_major, &ver_minor);
        if (r < 0) {
                /* Distinguish the expected "not available" cases from genuine failures so that only
                 * the latter are logged loudly. */
                if (ERRNO_IS_NEG_DEVICE_ABSENT(r))
                        log_debug_errno(r, "Disabling FUSE: FUSE appears to be disabled on the host: %m");
                else if (r == -ENOSYS)
                        log_debug_errno(r, "Disabling FUSE: Kernel does not support the fsopen() family of syscalls: %m");
                else
                        log_warning_errno(r, "Disabling FUSE: Failed to determine FUSE version: %m");

                return false;
        }

        /* FUSE is only userns-safe in FUSE version 7.27 and later.
         * https://github.com/torvalds/linux/commit/da315f6e03988a7127680bbc26e1028991b899b8 */
        if (ver_major > 7 || (ver_major == 7 && ver_minor >= 27))
                return true;

        log_debug("Disabling FUSE: FUSE version %" PRIu32 ".%" PRIu32 " is too old to support user namespaces",
                  ver_major, ver_minor);
        return false;
}
|
||||
|
||||
static int copy_devnodes(const char *dest, bool enable_fuse) {
|
||||
_cleanup_strv_free_ char **devnodes = NULL;
|
||||
int r = 0;
|
||||
|
||||
@ -2159,6 +2238,7 @@ static int copy_devnodes(const char *dest) {
|
||||
"random",
|
||||
"urandom",
|
||||
"tty",
|
||||
STRV_IFNOTNULL(enable_fuse ? "fuse" : NULL),
|
||||
"net/tun");
|
||||
if (!devnodes)
|
||||
return log_oom();
|
||||
@ -3807,7 +3887,7 @@ static int outer_child(
|
||||
_cleanup_(bind_user_context_freep) BindUserContext *bind_user_context = NULL;
|
||||
_cleanup_strv_free_ char **os_release_pairs = NULL;
|
||||
_cleanup_close_ int fd = -EBADF, mntns_fd = -EBADF;
|
||||
bool idmap = false;
|
||||
bool idmap = false, enable_fuse;
|
||||
const char *p;
|
||||
pid_t pid;
|
||||
ssize_t l;
|
||||
@ -4090,7 +4170,12 @@ static int outer_child(
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
r = copy_devnodes(directory);
|
||||
enable_fuse = should_enable_fuse();
|
||||
l = send(fd_outer_socket, &enable_fuse, sizeof enable_fuse, 0);
|
||||
if (l < 0)
|
||||
return log_error_errno(errno, "Failed to send whether to enable FUSE: %m");
|
||||
|
||||
r = copy_devnodes(directory, enable_fuse);
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
@ -5048,6 +5133,7 @@ static int run_container(
|
||||
ssize_t l;
|
||||
sigset_t mask_chld;
|
||||
_cleanup_close_ int child_netns_fd = -EBADF;
|
||||
bool enable_fuse;
|
||||
|
||||
assert_se(sigemptyset(&mask_chld) == 0);
|
||||
assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
|
||||
@ -5234,6 +5320,12 @@ static int run_container(
|
||||
l, l == 0 ? " The child is most likely dead." : "");
|
||||
}
|
||||
|
||||
l = recv(fd_outer_socket_pair[0], &enable_fuse, sizeof enable_fuse, 0);
|
||||
if (l < 0)
|
||||
return log_error_errno(errno, "Failed to read whether to enable FUSE: %m");
|
||||
if (l != sizeof enable_fuse)
|
||||
return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading whether to enable FUSE.");
|
||||
|
||||
/* Wait for the outer child. */
|
||||
r = wait_for_terminate_and_check("(sd-namespace)", *pid, WAIT_LOG_ABNORMAL);
|
||||
if (r < 0)
|
||||
@ -5386,6 +5478,7 @@ static int run_container(
|
||||
if (arg_register) {
|
||||
RegisterMachineFlags flags = 0;
|
||||
SET_FLAG(flags, REGISTER_MACHINE_KEEP_UNIT, arg_keep_unit);
|
||||
SET_FLAG(flags, REGISTER_MACHINE_ENABLE_FUSE, enable_fuse);
|
||||
r = register_machine(
|
||||
bus,
|
||||
arg_machine,
|
||||
@ -5406,6 +5499,7 @@ static int run_container(
|
||||
|
||||
} else if (!arg_keep_unit) {
|
||||
AllocateScopeFlags flags = ALLOCATE_SCOPE_ALLOW_PIDFD;
|
||||
SET_FLAG(flags, ALLOCATE_SCOPE_ENABLE_FUSE, enable_fuse);
|
||||
r = allocate_scope(
|
||||
bus,
|
||||
arg_machine,
|
||||
|
@ -1112,4 +1112,75 @@ testcase_unpriv() {
|
||||
echo hello | cmp "$tmpdir/stdout.txt" -
|
||||
}
|
||||
|
||||
testcase_fuse() {
    # Checks that a container started by systemd-nspawn gets a usable /dev/fuse.

    # Probe the host first and skip if FUSE is unavailable. Test for a character
    # device before touching the path: Bash's "<>" redirection creates the file
    # if it does not exist, so probing a missing node would leave a stray
    # regular file behind at /dev/fuse.
    if [[ ! -c /dev/fuse ]] || [[ "$(cat <>/dev/fuse 2>&1)" != 'cat: -: Operation not permitted' ]]; then
        echo "FUSE is not supported, skipping the test..."
        return 0
    fi

    # Assume that the tests are running on a kernel that is new enough for FUSE
    # to have user-namespace support; and so we should expect that nspawn
    # enables FUSE. This test does not validate that the version check
    # disables FUSE on old kernels.

    local root

    root="$(mktemp -d /var/lib/machines/TEST-13-NSPAWN.fuse.XXX)"
    create_dummy_container "$root"

    # To avoid adding any complex dependencies to the test, we simply check
    # that /dev/fuse can be opened for reading and writing (O_RDWR), but that
    # actually reading from it fails with EPERM. This can be done with a
    # simple Bash script: run `cat <>/dev/fuse` and if the EPERM error message
    # comes from "bash" then we know it couldn't be opened, while if it comes
    # from "cat" then we know that it was opened but not read. If we are able
    # to read from the file, then this indicates that it's not a real FUSE
    # device (which requires us to mount a type="fuse" filesystem with the
    # option string "fd=${num}" for /dev/fuse FD before reading from it will
    # return anything other than EPERM); if this happens then most likely
    # nspawn didn't create the file at all and Bash "<>" simply created a new
    # normal file.
    #
    #    "cat: -: Operation not permitted"                   # pass the test; opened but not read
    #    "bash: line 1: /dev/fuse: Operation not permitted"  # fail the test; could not open
    #    ""                                                  # fail the test; reading worked
    [[ "$(systemd-nspawn --pipe --directory="$root" \
              bash -c 'cat <>/dev/fuse' 2>&1)" == 'cat: -: Operation not permitted' ]]

    rm -fr "$root"
}
|
||||
|
||||
testcase_unpriv_fuse() {
    # Checks that a container started by an unprivileged systemd-nspawn (run as
    # testuser via systemd-run) gets a /dev/fuse that can be opened O_RDWR but
    # fails with EPERM when read before a FUSE filesystem is mounted on it.

    # Probe the host first and skip if FUSE is unavailable. Test for a character
    # device before touching the path: Bash's "<>" redirection creates the file
    # if it does not exist, so probing a missing node would leave a stray
    # regular file behind at /dev/fuse.
    if [[ ! -c /dev/fuse ]] || [[ "$(cat <>/dev/fuse 2>&1)" != 'cat: -: Operation not permitted' ]]; then
        echo "FUSE is not supported, skipping the test..."
        return 0
    fi
    if ! can_do_rootless_nspawn; then
        echo "Skipping rootless test..."
        return 0
    fi

    local tmpdir name
    tmpdir="$(mktemp -d /var/tmp/TEST-13-NSPAWN.unpriv-fuse.XXX)"
    # $name must be such that len("ns-$(id -u testuser)-nspawn-${name}-65535")
    # <= 31, or nsresourced will reject the request for a namespace.
    # Therefore; len($name) <= 10 bytes.
    name="ufuse-${tmpdir##*.}"
    # Clean up the image directory and the verity key on both success and failure.
    trap 'rm -fr ${tmpdir@Q} || true; rm -f /run/verity.d/test-13-nspawn-${name@Q} || true' RETURN ERR
    create_dummy_ddi "$tmpdir" "$name"
    chown --recursive testuser: "$tmpdir"

    # The match is a suffix glob rather than an exact comparison, so any output
    # captured ahead of cat's error message is tolerated.
    [[ "$(systemd-run \
              --pipe \
              --uid=testuser \
              --property=Delegate=yes \
              --setenv=SYSTEMD_LOG_LEVEL \
              --setenv=SYSTEMD_LOG_TARGET \
              -- \
              systemd-nspawn --pipe --private-network --register=no --keep-unit --image="$tmpdir/$name.raw" \
                  bash -c 'cat <>/dev/fuse' 2>&1)" == *'cat: -: Operation not permitted' ]]
}
|
||||
|
||||
run_testcases
|
||||
|
@ -30,12 +30,16 @@ CoredumpReceive=yes
|
||||
TasksMax=16384
|
||||
{{SERVICE_WATCHDOG}}
|
||||
|
||||
{# Enforce a strict device policy, similar to the one nspawn configures when it
|
||||
# allocates its own scope unit. Make sure to keep these policies in sync if you
|
||||
# change them! #}
|
||||
{# Enforce a strict device policy, similar to the one nspawn configures (in
|
||||
# nspawn-register.c:append_machine_properties()) when it allocates its own
|
||||
# scope unit. Make sure to keep these policies in sync if you change them! #}
|
||||
DevicePolicy=closed
|
||||
DeviceAllow=/dev/net/tun rwm
|
||||
DeviceAllow=char-pts rw
|
||||
{# /dev/fuse gets 'm' here even though it doesn't in nspawn-register.c, since
|
||||
# efedb6b0f3 (nspawn: refuse to bind mount device node from host when
|
||||
# --private-users= is specified, 2024-09-05) #}
|
||||
DeviceAllow=/dev/fuse rwm
|
||||
|
||||
# nspawn itself needs access to /dev/loop-control and /dev/loop, to implement
|
||||
# the --image= option. Add these here, too.
|
||||
|
Loading…
Reference in New Issue
Block a user