1
0
mirror of https://github.com/systemd/systemd.git synced 2025-03-23 10:50:16 +03:00

nspawn: unconditionally enable FUSE and use FOREACH_ARRAY() (#36407)

- FUSE is unconditionally enabled in the container, as our kernel base
line (v5.4) supports userns-safe FUSE, which is supported since v4.18.
- Create /dev/net/tun only when it is accessible.
- Replaces several loops with FOREACH_ARRAY().
This commit is contained in:
Yu Watanabe 2025-02-19 02:09:58 +09:00 committed by GitHub
commit 134bb7bfdf
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 74 additions and 211 deletions

View File

@ -181,13 +181,11 @@ BindUserContext* bind_user_context_free(BindUserContext *c) {
if (!c)
return NULL;
assert(c->n_data == 0 || c->data);
for (size_t i = 0; i < c->n_data; i++) {
user_record_unref(c->data[i].host_user);
group_record_unref(c->data[i].host_group);
user_record_unref(c->data[i].payload_user);
group_record_unref(c->data[i].payload_group);
FOREACH_ARRAY(d, c->data, c->n_data) {
user_record_unref(d->host_user);
group_record_unref(d->host_group);
user_record_unref(d->payload_user);
group_record_unref(d->payload_group);
}
return mfree(c);
@ -400,10 +398,9 @@ int bind_user_setup(
if (r < 0)
return log_error_errno(r, "Failed to create /run/host/userdb: %m");
for (size_t i = 0; i < c->n_data; i++) {
FOREACH_ARRAY(d, c->data, c->n_data) {
_cleanup_(group_record_unrefp) GroupRecord *stripped_group = NULL, *shadow_group = NULL;
_cleanup_(user_record_unrefp) UserRecord *stripped_user = NULL, *shadow_user = NULL;
const BindUserData *d = c->data + i;
/* First, write shadow (i.e. privileged) data for group record */
r = group_record_clone(d->payload_group, shadow_flags, &shadow_group);

View File

@ -48,9 +48,7 @@ CustomMount* custom_mount_add(CustomMount **l, size_t *n, CustomMountType t) {
}
void custom_mount_free_all(CustomMount *l, size_t n) {
for (size_t i = 0; i < n; i++) {
CustomMount *m = l + i;
FOREACH_ARRAY(m, l, n) {
free(m->source);
free(m->destination);
free(m->options);
@ -1024,9 +1022,7 @@ int mount_custom(
assert(dest);
for (size_t i = 0; i < n; i++) {
CustomMount *m = mounts + i;
FOREACH_ARRAY(m, mounts, n) {
if (FLAGS_SET(mount_settings, MOUNT_IN_USERNS) != m->in_userns)
continue;
@ -1070,8 +1066,8 @@ int mount_custom(
}
bool has_custom_root_mount(const CustomMount *mounts, size_t n) {
for (size_t i = 0; i < n; i++)
if (path_equal(mounts[i].destination, "/"))
FOREACH_ARRAY(m, mounts, n)
if (path_equal(m->destination, "/"))
return true;
return false;

View File

@ -919,21 +919,17 @@ struct device_data {
static int oci_cgroup_device_access(const char *name, sd_json_variant *v, sd_json_dispatch_flags_t flags, void *userdata) {
struct device_data *d = ASSERT_PTR(userdata);
bool r = false, w = false, m = false;
const char *s;
size_t i;
assert_se(s = sd_json_variant_string(v));
for (i = 0; s[i]; i++)
if (s[i] == 'r')
for (const char *s = ASSERT_PTR(sd_json_variant_string(v)); *s; s++)
if (*s == 'r')
r = true;
else if (s[i] == 'w')
else if (*s == 'w')
w = true;
else if (s[i] == 'm')
else if (*s == 'm')
m = true;
else
return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL),
"Unknown device access character '%c'.", s[i]);
"Unknown device access character '%c'.", *s);
d->r = r;
d->w = w;
@ -945,7 +941,7 @@ static int oci_cgroup_device_access(const char *name, sd_json_variant *v, sd_jso
static int oci_cgroup_devices(const char *name, sd_json_variant *v, sd_json_dispatch_flags_t flags, void *userdata) {
_cleanup_free_ struct device_data *list = NULL;
Settings *s = ASSERT_PTR(userdata);
size_t n_list = 0, i;
size_t n_list = 0;
bool noop = false;
sd_json_variant *e;
int r;
@ -1036,43 +1032,43 @@ static int oci_cgroup_devices(const char *name, sd_json_variant *v, sd_json_disp
if (r < 0)
return bus_log_create_error(r);
for (i = 0; i < n_list; i++) {
FOREACH_ARRAY(d, list, n_list) {
_cleanup_free_ char *pattern = NULL;
char access[4];
size_t n = 0;
if (list[i].minor == UINT_MAX) {
if (d->minor == UINT_MAX) {
const char *t;
if (list[i].type == S_IFBLK)
if (d->type == S_IFBLK)
t = "block";
else {
assert(list[i].type == S_IFCHR);
assert(d->type == S_IFCHR);
t = "char";
}
if (list[i].major == UINT_MAX) {
if (d->major == UINT_MAX) {
pattern = strjoin(t, "-*");
if (!pattern)
return log_oom();
} else {
if (asprintf(&pattern, "%s-%u", t, list[i].major) < 0)
if (asprintf(&pattern, "%s-%u", t, d->major) < 0)
return log_oom();
}
} else {
assert(list[i].major != UINT_MAX); /* If a minor is specified, then a major also needs to be specified */
assert(d->major != UINT_MAX); /* If a minor is specified, then a major also needs to be specified */
r = device_path_make_major_minor(list[i].type, makedev(list[i].major, list[i].minor), &pattern);
r = device_path_make_major_minor(d->type, makedev(d->major, d->minor), &pattern);
if (r < 0)
return log_oom();
}
if (list[i].r)
if (d->r)
access[n++] = 'r';
if (list[i].w)
if (d->w)
access[n++] = 'w';
if (list[i].m)
if (d->m)
access[n++] = 'm';
access[n] = 0;

View File

@ -15,13 +15,11 @@
static int append_machine_properties(
sd_bus_message *m,
bool enable_fuse,
CustomMount *mounts,
unsigned n_mounts,
int kill_signal,
bool coredump_receive) {
unsigned j;
int r;
assert(m);
@ -32,25 +30,19 @@ static int append_machine_properties(
/* If you make changes here, also make sure to update systemd-nspawn@.service, to keep the device
* policies in sync regardless if we are run with or without the --keep-unit switch. */
r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 2,
/* Allow the container to access and create the API device nodes, so that
* PrivateDevices= in the container can work fine */
r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 3,
/* Allow the container to access and create the API device node, so that
* PrivateDevices= in the container can work fine. */
"/dev/net/tun", "rwm",
/* Allow the container access to ptys. However, do not permit the container
/* Allow the container to access ptys. However, do not permit the container
* to ever create these device nodes. */
"char-pts", "rw");
"char-pts", "rw",
/* Allow the container to access and create the FUSE API device node. */
"/dev/fuse", "rwm");
if (r < 0)
return bus_log_create_error(r);
if (enable_fuse) {
r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 1,
"/dev/fuse", "rwm");
if (r < 0)
return bus_log_create_error(r);
}
for (j = 0; j < n_mounts; j++) {
CustomMount *cm = mounts + j;
FOREACH_ARRAY(cm, mounts, n_mounts) {
if (cm->type != CUSTOM_MOUNT_BIND)
continue;
@ -207,7 +199,6 @@ int register_machine(
r = append_machine_properties(
m,
FLAGS_SET(flags, REGISTER_MACHINE_ENABLE_FUSE),
mounts,
n_mounts,
kill_signal,
@ -328,7 +319,6 @@ int allocate_scope(
r = append_machine_properties(
m,
FLAGS_SET(flags, ALLOCATE_SCOPE_ENABLE_FUSE),
mounts,
n_mounts,
kill_signal,

View File

@ -10,7 +10,6 @@
typedef enum RegisterMachineFlags {
REGISTER_MACHINE_KEEP_UNIT = 1 << 0,
REGISTER_MACHINE_ENABLE_FUSE = 1 << 1,
} RegisterMachineFlags;
int register_machine(
@ -32,7 +31,6 @@ int unregister_machine(sd_bus *bus, const char *machine_name);
typedef enum AllocateScopeFlags {
ALLOCATE_SCOPE_ALLOW_PIDFD = 1 << 0,
ALLOCATE_SCOPE_ENABLE_FUSE = 1 << 1,
} AllocateScopeFlags;
int allocate_scope(

View File

@ -471,11 +471,7 @@ static int help(void) {
}
static int custom_mount_check_all(void) {
size_t i;
for (i = 0; i < arg_n_custom_mounts; i++) {
CustomMount *m = &arg_custom_mounts[i];
FOREACH_ARRAY(m, arg_custom_mounts, arg_n_custom_mounts)
if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
if (arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_OFF)
return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
@ -485,7 +481,6 @@ static int custom_mount_check_all(void) {
return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
"--private-users with automatic UID shift may not be combined with custom root mounts.");
}
}
return 0;
}
@ -2175,85 +2170,6 @@ static int setup_boot_id(void) {
return mount_nofollow_verbose(LOG_ERR, NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
}
static int get_fuse_version(uint32_t *ret_major, uint32_t *ret_minor) {
/* Must be called with mount privileges, either via arg_privileged or by being uid=0 in new
* CLONE_NEWUSER/CLONE_NEWNS namespaces. This is true when called from outer_child(). */
ssize_t n;
_cleanup_close_ int fuse_fd = -EBADF, mnt_fd = -EBADF;
_cleanup_free_ char *opts = NULL;
union {
char unstructured[FUSE_MIN_READ_BUFFER];
struct {
struct fuse_in_header header;
/* Don't use <linux/fuse.h>:`struct fuse_init_in` because a newer fuse.h might give
* us a bigger struct than what an older kernel actually gives us, and that would
* break our .header.len check. */
struct {
uint32_t major;
uint32_t minor;
} body;
} structured;
} request;
assert(ret_major);
assert(ret_minor);
/* Get a FUSE handle. */
fuse_fd = open("/dev/fuse", O_CLOEXEC|O_RDWR);
if (fuse_fd < 0)
return log_debug_errno(errno, "Failed to open /dev/fuse: %m");
if (asprintf(&opts, "fd=%i,rootmode=40000,user_id=0,group_id=0", fuse_fd) < 0)
return log_oom_debug();
mnt_fd = make_fsmount(LOG_DEBUG, "nspawn-fuse", "fuse.nspawn", 0, opts, -EBADF);
if (mnt_fd < 0)
return mnt_fd;
/* Read a request from the FUSE handle. */
n = read(fuse_fd, &request.unstructured, sizeof request);
if (n < 0)
return log_debug_errno(errno, "Failed to read /dev/fuse: %m");
if ((size_t) n < sizeof request.structured.header ||
(size_t) n < request.structured.header.len)
return log_debug_errno(SYNTHETIC_ERRNO(EIO), "Failed to read /dev/fuse: Short read");
/* Assume that the request is a FUSE_INIT request, and return the version information from it. */
if (request.structured.header.opcode != FUSE_INIT)
return log_debug_errno(SYNTHETIC_ERRNO(EIO), "Initial request from /dev/fuse should have opcode=%i (FUSE_INIT), but has opcode=%"PRIu32,
FUSE_INIT, request.structured.header.opcode);
if (request.structured.header.len < sizeof request.structured)
return log_debug_errno(SYNTHETIC_ERRNO(EIO), "Initial FUSE_INIT request from /dev/fuse is too short");
*ret_major = request.structured.body.major;
*ret_minor = request.structured.body.minor;
return 0;
}
static bool should_enable_fuse(void) {
uint32_t fuse_major, fuse_minor;
int r;
r = get_fuse_version(&fuse_major, &fuse_minor);
if (r < 0) {
if (ERRNO_IS_NEG_DEVICE_ABSENT(r))
log_debug_errno(r, "Disabling FUSE: FUSE appears to be disabled on the host: %m");
else if (ERRNO_IS_NEG_NOT_SUPPORTED(r))
log_debug_errno(r, "Disabling FUSE: Kernel does not support the fsopen() family of syscalls: %m");
else
log_full_errno(ERRNO_IS_NEG_PRIVILEGE(r) ? LOG_DEBUG : LOG_WARNING, r,
"Disabling FUSE: Failed to determine FUSE version: %m");
return false;
}
/* FUSE is only userns-safe in FUSE version 7.27 and later.
* https://github.com/torvalds/linux/commit/da315f6e03988a7127680bbc26e1028991b899b8 */
if (fuse_major < 7 || (fuse_major == 7 && fuse_minor < 27)) {
log_debug("Disabling FUSE: FUSE version %" PRIu32 ".%" PRIu32 " is too old to support user namespaces",
fuse_major, fuse_minor);
return false;
}
return true;
}
static int bind_mount_devnode(const char *from, const char *to) {
int r;
@ -2273,7 +2189,7 @@ static int bind_mount_devnode(const char *from, const char *to) {
return 0;
}
static int copy_devnode_one(const char *dest, const char *node, bool ignore_mknod_failure) {
static int copy_devnode_one(const char *dest, const char *node, bool check) {
int r;
assert(dest);
@ -2285,6 +2201,19 @@ static int copy_devnode_one(const char *dest, const char *node, bool ignore_mkno
if (!from)
return log_oom();
if (check) {
/* If 'check' is true, create /dev/fuse only when it is accessible. The check is necessary,
* as some custom service units that invoke systemd-nspawn may enable DevicePolicy= without
* DeviceAllow= for the device node. */
_cleanup_close_ int fd = open(from, O_CLOEXEC|O_RDWR);
if (fd < 0) {
log_debug_errno(errno,
"Failed to open %s, skipping creation of the device node in the container, ignoring: %m",
from);
return 0;
}
}
_cleanup_free_ char *to = path_join(dest, from);
if (!to)
return log_oom();
@ -2319,10 +2248,6 @@ static int copy_devnode_one(const char *dest, const char *node, bool ignore_mkno
/* If arg_uid_shift != 0, then we cannot fall back to use bind mount. */
if (!(arg_userns_mode == USER_NAMESPACE_NO ||
(arg_userns_mode == USER_NAMESPACE_FIXED && arg_uid_shift == 0))) {
if (ignore_mknod_failure) {
log_debug_errno(r, "Failed to mknod(%s), ignoring: %m", to);
return 0;
}
if (arg_userns_mode != USER_NAMESPACE_MANAGED || !ERRNO_IS_NEG_PRIVILEGE(r))
return log_error_errno(r, "Failed to mknod(%s): %m", to);
@ -2331,14 +2256,9 @@ static int copy_devnode_one(const char *dest, const char *node, bool ignore_mkno
}
/* Some systems abusively restrict mknod but allow bind mounts. */
if (bind_mount_devnode(from, to) < 0) {
if (bind_mount_devnode(from, to) < 0)
/* use the original error code. */
if (ignore_mknod_failure) {
log_debug_errno(r, "Both mknod() and bind mount %s failed, ignoring: %m", to);
return 0;
}
return log_error_errno(r, "Both mknod() and bind mount %s failed: %m", to);
}
} else {
/* mknod() succeeds, chown() it if necessary. */
r = userns_lchown(to, 0, 0);
@ -2372,30 +2292,25 @@ static int copy_devnode_one(const char *dest, const char *node, bool ignore_mkno
return 0;
}
static int copy_devnodes(const char *dest, bool enable_fuse) {
static int copy_devnodes(const char *dest) {
int r = 0;
assert(dest);
/* Required basic device nodes. */
FOREACH_STRING(node, "null", "zero", "full", "random", "urandom", "tty") {
r = copy_devnode_one(dest, node, /* ignore_mknod_failure = */ false);
r = copy_devnode_one(dest, node, /* check = */ false);
if (r < 0)
return r;
}
if (enable_fuse) {
r = copy_devnode_one(dest, "fuse", /* ignore_mknod_failure = */ false);
/* Optional device nodes. */
FOREACH_STRING(node, "fuse", "net/tun") {
r = copy_devnode_one(dest, node, /* check = */ true);
if (r < 0)
return r;
}
/* We unconditionally try to create /dev/net/tun, but let's ignore failure if --private-network is
* unspecified. The failure can be triggered when e.g. DevicePolicy= is set, but DeviceAllow= does
* not contains the device node, and --private-users=pick is specified. */
r = copy_devnode_one(dest, "net/tun", /* ignore_mknod_failure = */ !arg_private_network);
if (r < 0)
return r;
return 0;
}
@ -3975,7 +3890,7 @@ static int outer_child(
_cleanup_(bind_user_context_freep) BindUserContext *bind_user_context = NULL;
_cleanup_strv_free_ char **os_release_pairs = NULL;
bool idmap = false, enable_fuse;
bool idmap = false;
const char *p;
pid_t pid;
ssize_t l;
@ -4154,12 +4069,12 @@ static int outer_child(
/* Send the user maps we determined to the parent, so that it installs it in our user
* namespace UID map table */
for (size_t i = 0; i < bind_user_context->n_data; i++) {
FOREACH_ARRAY(d, bind_user_context->data, bind_user_context->n_data) {
uid_t map[] = {
bind_user_context->data[i].payload_user->uid,
bind_user_context->data[i].host_user->uid,
(uid_t) bind_user_context->data[i].payload_group->gid,
(uid_t) bind_user_context->data[i].host_group->gid,
d->payload_user->uid,
d->host_user->uid,
(uid_t) d->payload_group->gid,
(uid_t) d->host_group->gid,
};
l = send(fd_outer_socket, map, sizeof(map), MSG_NOSIGNAL);
@ -4319,12 +4234,7 @@ static int outer_child(
if (r < 0)
return r;
enable_fuse = should_enable_fuse();
l = send(fd_outer_socket, &enable_fuse, sizeof enable_fuse, 0);
if (l < 0)
return log_error_errno(errno, "Failed to send whether to enable FUSE: %m");
r = copy_devnodes(directory, enable_fuse);
r = copy_devnodes(directory);
if (r < 0)
return r;
@ -5280,7 +5190,6 @@ static int run_container(
ssize_t l;
sigset_t mask_chld;
_cleanup_close_ int child_netns_fd = -EBADF;
bool enable_fuse;
assert_se(sigemptyset(&mask_chld) == 0);
assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
@ -5464,12 +5373,6 @@ static int run_container(
l, l == 0 ? " The child is most likely dead." : "");
}
l = recv(fd_outer_socket_pair[0], &enable_fuse, sizeof enable_fuse, 0);
if (l < 0)
return log_error_errno(errno, "Failed to read whether to enable FUSE: %m");
if (l != sizeof enable_fuse)
return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading whether to enable FUSE.");
/* Wait for the outer child. */
r = wait_for_terminate_and_check("(sd-namespace)", *pid, WAIT_LOG_ABNORMAL);
if (r < 0)
@ -5624,7 +5527,6 @@ static int run_container(
if (arg_register) {
RegisterMachineFlags flags = 0;
SET_FLAG(flags, REGISTER_MACHINE_KEEP_UNIT, arg_keep_unit);
SET_FLAG(flags, REGISTER_MACHINE_ENABLE_FUSE, enable_fuse);
r = register_machine(
bus,
arg_machine,
@ -5645,7 +5547,6 @@ static int run_container(
} else if (!arg_keep_unit) {
AllocateScopeFlags flags = ALLOCATE_SCOPE_ALLOW_PIDFD;
SET_FLAG(flags, ALLOCATE_SCOPE_ENABLE_FUSE, enable_fuse);
r = allocate_scope(
bus,
arg_machine,

View File

@ -1231,31 +1231,16 @@ testcase_unpriv_fuse() {
}
test_tun() {
local expect=${1?}
local exists=${2?}
local command command_exists command_not_exists
shift 2
command_exists='[[ -c /dev/net/tun ]]; [[ "$(stat /dev/net/tun --format=%u)" == 0 ]]; [[ "$(stat /dev/net/tun --format=%g)" == 0 ]]'
command_not_exists='[[ ! -e /dev/net/tun ]]'
if [[ "$exists" == 0 ]]; then
command="$command_not_exists"
else
command="$command_exists"
fi
systemd-nspawn "$@" bash -xec "$command_exists"
systemd-nspawn "$@" bash -xec '[[ -c /dev/net/tun ]]; [[ "$(stat /dev/net/tun --format=%u)" == 0 ]]; [[ "$(stat /dev/net/tun --format=%g)" == 0 ]]'
# check if the owner of the host device is unchanged, see issue #34243.
[[ "$(stat /dev/net/tun --format=%u)" == 0 ]]
[[ "$(stat /dev/net/tun --format=%g)" == 0 ]]
# Without DeviceAllow= for /dev/net/tun, see issue #35116.
assert_rc \
"$expect" \
systemd-run --wait -p Environment=SYSTEMD_LOG_LEVEL=debug -p DevicePolicy=closed -p DeviceAllow="char-pts rw" \
systemd-nspawn "$@" bash -xec "$command"
systemd-run \
--wait -p Environment=SYSTEMD_LOG_LEVEL=debug -p DevicePolicy=closed -p DeviceAllow="char-pts rw" \
systemd-nspawn "$@" bash -xec '[[ ! -e /dev/net/tun ]]'
[[ "$(stat /dev/net/tun --format=%u)" == 0 ]]
[[ "$(stat /dev/net/tun --format=%g)" == 0 ]]
@ -1272,12 +1257,12 @@ testcase_dev_net_tun() {
root="$(mktemp -d /var/lib/machines/TEST-13-NSPAWN.tun.XXX)"
create_dummy_container "$root"
test_tun 0 1 --ephemeral --directory="$root" --private-users=no
test_tun 0 1 --ephemeral --directory="$root" --private-users=yes
test_tun 0 0 --ephemeral --directory="$root" --private-users=pick
test_tun 0 1 --ephemeral --directory="$root" --private-users=no --private-network
test_tun 0 1 --ephemeral --directory="$root" --private-users=yes --private-network
test_tun 1 0 --ephemeral --directory="$root" --private-users=pick --private-network
test_tun --ephemeral --directory="$root" --private-users=no
test_tun --ephemeral --directory="$root" --private-users=yes
test_tun --ephemeral --directory="$root" --private-users=pick
test_tun --ephemeral --directory="$root" --private-users=no --private-network
test_tun --ephemeral --directory="$root" --private-users=yes --private-network
test_tun --ephemeral --directory="$root" --private-users=pick --private-network
rm -fr "$root"
}