mirror of
https://github.com/systemd/systemd.git
synced 2024-10-27 10:25:37 +03:00
Merge pull request #16543 from poettering/nspawn-run-host
nspawn: /run/host/ tweaks
This commit is contained in:
commit
5b14956385
@ -172,6 +172,13 @@ manager, please consider supporting the following interfaces.
|
||||
unit they created for their container. That's private property of systemd,
|
||||
and no other code should modify it.
|
||||
|
||||
6. systemd running inside the container can report when boot-up is complete
|
||||
using the usual `sd_notify()` protocol that is also used when a service
|
||||
wants to tell the service manager about readiness. A container manager can
|
||||
set the `$NOTIFY_SOCKET` environment variable to a suitable socket path to
|
||||
make use of this functionality. (Also see information about
|
||||
`/run/host/notify` below.)
|
||||
|
||||
## Networking
|
||||
|
||||
1. Inside of a container, if a `veth` link is named `host0`, `systemd-networkd`
|
||||
@ -189,6 +196,62 @@ manager, please consider supporting the following interfaces.
|
||||
devices, for example hashed out of the container names. That way it is more
|
||||
likely that DHCP and IPv4LL will acquire stable addresses.
|
||||
|
||||
## The `/run/host/` Hierarchy
|
||||
|
||||
Container managers may place certain resources the manager wants to provide to
|
||||
the container payload below the `/run/host/` hierarchy. This hierarchy should
|
||||
be mostly immutable (possibly some subdirs might be writable, but the top-level
|
||||
hierarchy — and probably most subdirs — should be read-only to the
|
||||
container). Note that this hierarchy is used by various container managers, and
|
||||
care should be taken to avoid naming conflicts. `systemd` (and in particular
|
||||
`systemd-nspawn`) use the hierarchy for the following resources:
|
||||
|
||||
1. The `/run/host/incoming/` directory mount point is configured for `MS_SLAVE`
|
||||
mount propagation with the host, and is used as an intermediary location for
|
||||
mounts to establish in the container, for the implementation of `machinectl
|
||||
bind`. Container payload should usually not directly interact with this
|
||||
directory: it's used by code outside the container to insert mounts inside
|
||||
it only, and is mostly an internal vehicle to achieve this. Other container
|
||||
managers that want to implement similar functionality might consider using
|
||||
the same directory.
|
||||
|
||||
2. The `/run/host/inaccessible/` directory may be set up by the container
|
||||
manager to include six file nodes: `reg`, `dir`, `fifo`, `sock`, `chr`,
|
||||
`blk`. These nodes correspond with the six types of file nodes Linux knows
|
||||
(with the exception of symlinks). Each node should be of the specific type
|
||||
and have an all zero access mode, i.e. be inaccessible. The two device node
|
||||
types should have major and minor of zero (which are unallocated devices on
|
||||
Linux). These nodes are used as mount source for implementing the
|
||||
`InaccessiblePath=` setting of unit files, i.e. file nodes to mask this way
|
||||
are overmounted with these "inaccessible" inodes, guaranteeing that the file
|
||||
node type does not change this way but the nodes still become
|
||||
inaccessible. Note that systemd when run as PID 1 in the container payload
|
||||
will create these nodes on its own if not passed in by the container
|
||||
manager. However, in that case it likely lacks the privileges to create the
|
||||
character and block device nodes (there are fallbacks for this case).
|
||||
|
||||
3. The `/run/host/notify` path is a good choice to place the `sd_notify()`
|
||||
socket in, that may be used for the container's PID 1 to report to the
|
||||
container manager when boot-up is complete. The path used for this doesn't
|
||||
matter much as it is communicated via the `$NOTIFY_SOCKET` environment
|
||||
variable, following the usual protocol for this, however it's a suitable, and
|
||||
recommended place for this socket in case ready notification is desired.
|
||||
|
||||
4. The `/run/host/os-release` file contains the `/etc/os-release` file of the
|
||||
host, i.e. may be used by the container payload to gather limited
|
||||
information about the host environment, on top of what `uname -a` reports.
|
||||
|
||||
5. The `/run/host/container-manager` file may be used to pass the same
|
||||
information as the `$container` environment variable (see above), i.e. a
|
||||
short string identifying the container manager implementation. This file
|
||||
should be newline terminated. Passing this information via this file has the
|
||||
benefit that payload code can easily access it, even when running
|
||||
unprivileged without access to the container PID1's environment block.
|
||||
|
||||
6. The `/run/host/container-uuid` file may be used to pass the same information
|
||||
as the `$container_uuid` environment variable (see above). This file should
|
||||
be newline terminated.
|
||||
|
||||
## What You Shouldn't Do
|
||||
|
||||
1. Do not drop `CAP_MKNOD` from the container. `PrivateDevices=` is a commonly
|
||||
|
@ -491,6 +491,16 @@ int detect_container(void) {
|
||||
}
|
||||
}
|
||||
|
||||
/* The container manager might have placed this in the /run/host hierarchy for us, which is best
|
||||
* because it can be consumed just like that, without special privileges. */
|
||||
r = read_one_line_file("/run/host/container-manager", &m);
|
||||
if (r > 0) {
|
||||
e = m;
|
||||
goto translate_name;
|
||||
}
|
||||
if (!IN_SET(r, -ENOENT, 0))
|
||||
return log_debug_errno(r, "Failed to read /run/systemd/container: %m");
|
||||
|
||||
if (getpid_cached() == 1) {
|
||||
/* If we are PID 1 we can just check our own environment variable, and that's authoritative.
|
||||
* We distinguish three cases:
|
||||
|
@ -32,6 +32,7 @@
|
||||
#include "dbus-manager.h"
|
||||
#include "dbus.h"
|
||||
#include "def.h"
|
||||
#include "dev-setup.h"
|
||||
#include "efi-random.h"
|
||||
#include "efivars.h"
|
||||
#include "emergency-action.h"
|
||||
@ -53,6 +54,7 @@
|
||||
#include "loopback-setup.h"
|
||||
#include "machine-id-setup.h"
|
||||
#include "manager.h"
|
||||
#include "mkdir.h"
|
||||
#include "mount-setup.h"
|
||||
#include "os-util.h"
|
||||
#include "pager.h"
|
||||
@ -2073,6 +2075,20 @@ static int initialize_runtime(
|
||||
if (r < 0)
|
||||
log_warning_errno(r, "Failed to set watchdog device to %s, ignoring: %m", arg_watchdog_device);
|
||||
}
|
||||
} else {
|
||||
_cleanup_free_ char *p = NULL;
|
||||
|
||||
/* Create the runtime directory and place the inaccessible device nodes there, if we run in
|
||||
* user mode. In system mode mount_setup() already did that. */
|
||||
|
||||
r = xdg_user_runtime_dir(&p, "/systemd");
|
||||
if (r < 0) {
|
||||
*ret_error_message = "$XDG_RUNTIME_DIR is not set";
|
||||
return log_emergency_errno(r, "Failed to determine $XDG_RUNTIME_DIR path: %m");
|
||||
}
|
||||
|
||||
(void) mkdir_p(p, 0755);
|
||||
(void) make_inaccessible_nodes(p, UID_INVALID, GID_INVALID);
|
||||
}
|
||||
|
||||
if (arg_timer_slack_nsec != NSEC_INFINITY)
|
||||
|
@ -538,8 +538,17 @@ int mount_setup(bool loaded_policy, bool leave_propagation) {
|
||||
(void) mkdir_label("/run/systemd/system", 0755);
|
||||
|
||||
/* Also create /run/systemd/inaccessible nodes, so that we always have something to mount
|
||||
* inaccessible nodes from. */
|
||||
(void) make_inaccessible_nodes(NULL, UID_INVALID, GID_INVALID);
|
||||
* inaccessible nodes from. If we run in a container the host might have created these for us already
|
||||
* in /run/host/inaccessible/. Use those if we can, since that way we likely get access to block/char
|
||||
* device nodes that are inaccessible, and if userns is used to nodes that are on mounts owned by a
|
||||
* userns outside the container and thus nicely read-only and not remountable. */
|
||||
if (access("/run/host/inaccessible/", F_OK) < 0) {
|
||||
if (errno != ENOENT)
|
||||
log_debug_errno(errno, "Failed to check if /run/host/inaccessible exists, ignoring: %m");
|
||||
|
||||
(void) make_inaccessible_nodes("/run/systemd", UID_INVALID, GID_INVALID);
|
||||
} else
|
||||
(void) symlink("../host/inaccessible", "/run/systemd/inaccessible");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -88,7 +88,8 @@ static int user_mkdir_runtime_path(
|
||||
goto fail;
|
||||
}
|
||||
|
||||
log_debug_errno(errno, "Failed to mount per-user tmpfs directory %s.\n"
|
||||
log_debug_errno(errno,
|
||||
"Failed to mount per-user tmpfs directory %s.\n"
|
||||
"Assuming containerized execution, ignoring: %m", runtime_path);
|
||||
|
||||
r = chmod_and_chown(runtime_path, 0700, uid, gid);
|
||||
@ -103,8 +104,6 @@ static int user_mkdir_runtime_path(
|
||||
log_warning_errno(r, "Failed to fix label of \"%s\", ignoring: %m", runtime_path);
|
||||
}
|
||||
|
||||
/* Set up inaccessible nodes now so they're available if we decide to use them with user namespaces. */
|
||||
(void) make_inaccessible_nodes(runtime_path, uid, gid);
|
||||
return 0;
|
||||
|
||||
fail:
|
||||
|
@ -978,9 +978,8 @@ int bus_machine_method_bind_mount(sd_bus_message *message, void *userdata, sd_bu
|
||||
goto finish;
|
||||
}
|
||||
if (r == 0) {
|
||||
const char *mount_inside;
|
||||
const char *mount_inside, *q;
|
||||
int mntfd;
|
||||
const char *q;
|
||||
|
||||
errno_pipe_fd[0] = safe_close(errno_pipe_fd[0]);
|
||||
|
||||
@ -1001,12 +1000,11 @@ int bus_machine_method_bind_mount(sd_bus_message *message, void *userdata, sd_bu
|
||||
(void) mkdir_p(dest, 0755);
|
||||
else {
|
||||
(void) mkdir_parents(dest, 0755);
|
||||
safe_close(open(dest, O_CREAT|O_EXCL|O_WRONLY|O_CLOEXEC|O_NOCTTY, 0600));
|
||||
(void) mknod(dest, S_IFREG|0600, 0);
|
||||
}
|
||||
}
|
||||
|
||||
/* Fifth, move the mount to the right place inside */
|
||||
mount_inside = strjoina("/run/systemd/nspawn/incoming/", basename(mount_outside));
|
||||
mount_inside = strjoina("/run/host/incoming/", basename(mount_outside));
|
||||
if (mount(mount_inside, dest, NULL, MS_MOVE, NULL) < 0) {
|
||||
r = log_error_errno(errno, "Failed to mount: %m");
|
||||
goto child_fail;
|
||||
|
@ -101,10 +101,8 @@
|
||||
#include "user-util.h"
|
||||
#include "util.h"
|
||||
|
||||
/* nspawn is listening on the socket at the path in the constant nspawn_notify_socket_path
|
||||
* nspawn_notify_socket_path is relative to the container
|
||||
* the init process in the container pid can send messages to nspawn following the sd_notify(3) protocol */
|
||||
#define NSPAWN_NOTIFY_SOCKET_PATH "/run/systemd/nspawn/notify"
|
||||
/* The notify socket inside the container it can use to talk to nspawn using the sd_notify(3) protocol */
|
||||
#define NSPAWN_NOTIFY_SOCKET_PATH "/run/host/notify"
|
||||
|
||||
#define EXIT_FORCE_RESTART 133
|
||||
|
||||
@ -2517,19 +2515,15 @@ static int setup_propagate(const char *root) {
|
||||
p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
|
||||
(void) mkdir_p(p, 0600);
|
||||
|
||||
r = userns_mkdir(root, "/run/systemd", 0755, 0, 0);
|
||||
r = userns_mkdir(root, "/run/host", 0755, 0, 0);
|
||||
if (r < 0)
|
||||
return log_error_errno(r, "Failed to create /run/systemd: %m");
|
||||
return log_error_errno(r, "Failed to create /run/host: %m");
|
||||
|
||||
r = userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0);
|
||||
r = userns_mkdir(root, "/run/host/incoming", 0600, 0, 0);
|
||||
if (r < 0)
|
||||
return log_error_errno(r, "Failed to create /run/systemd/nspawn: %m");
|
||||
return log_error_errno(r, "Failed to create /run/host/incoming: %m");
|
||||
|
||||
r = userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0);
|
||||
if (r < 0)
|
||||
return log_error_errno(r, "Failed to create /run/systemd/nspawn/incoming: %m");
|
||||
|
||||
q = prefix_roota(root, "/run/systemd/nspawn/incoming");
|
||||
q = prefix_roota(root, "/run/host/incoming");
|
||||
r = mount_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL);
|
||||
if (r < 0)
|
||||
return r;
|
||||
@ -2538,8 +2532,7 @@ static int setup_propagate(const char *root) {
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
/* machined will MS_MOVE into that directory, and that's only
|
||||
* supported for non-shared mounts. */
|
||||
/* machined will MS_MOVE into that directory, and that's only supported for non-shared mounts. */
|
||||
return mount_verbose(LOG_ERR, NULL, q, NULL, MS_SLAVE, NULL);
|
||||
}
|
||||
|
||||
@ -3278,7 +3271,7 @@ static int inner_child(
|
||||
return log_error_errno(errno, "execv(%s) failed: %m", exec_target);
|
||||
}
|
||||
|
||||
static int setup_sd_notify_child(void) {
|
||||
static int setup_notify_child(void) {
|
||||
_cleanup_close_ int fd = -1;
|
||||
union sockaddr_union sa = {
|
||||
.un.sun_family = AF_UNIX,
|
||||
@ -3530,7 +3523,7 @@ static int outer_child(
|
||||
|
||||
(void) dev_setup(directory, arg_uid_shift, arg_uid_shift);
|
||||
|
||||
p = prefix_roota(directory, "/run");
|
||||
p = prefix_roota(directory, "/run/host");
|
||||
(void) make_inaccessible_nodes(p, arg_uid_shift, arg_uid_shift);
|
||||
|
||||
r = setup_pts(directory);
|
||||
@ -3571,6 +3564,14 @@ static int outer_child(
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
/* The same stuff as the $container env var, but nicely readable for the entire payload */
|
||||
p = prefix_roota(directory, "/run/host/container-manager");
|
||||
(void) write_string_file(p, arg_container_service_name, WRITE_STRING_FILE_CREATE);
|
||||
|
||||
/* The same stuff as the $container_uuid env var */
|
||||
p = prefix_roota(directory, "/run/host/container-uuid");
|
||||
(void) write_string_filef(p, WRITE_STRING_FILE_CREATE, SD_ID128_UUID_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid));
|
||||
|
||||
if (!arg_use_cgns) {
|
||||
r = mount_cgroups(
|
||||
directory,
|
||||
@ -3588,7 +3589,7 @@ static int outer_child(
|
||||
if (r < 0)
|
||||
return log_error_errno(r, "Failed to move root directory: %m");
|
||||
|
||||
fd = setup_sd_notify_child();
|
||||
fd = setup_notify_child();
|
||||
if (fd < 0)
|
||||
return fd;
|
||||
|
||||
@ -3801,7 +3802,7 @@ static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t r
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int setup_sd_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid, sd_event_source **notify_event_source) {
|
||||
static int setup_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid, sd_event_source **notify_event_source) {
|
||||
int r;
|
||||
|
||||
r = sd_event_add_io(event, notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
|
||||
@ -4632,7 +4633,7 @@ static int run_container(
|
||||
return log_error_errno(r, "Failed to attach bus to event loop: %m");
|
||||
}
|
||||
|
||||
r = setup_sd_notify_parent(event, notify_socket, PID_TO_PTR(*pid), &notify_event_source);
|
||||
r = setup_notify_parent(event, notify_socket, PID_TO_PTR(*pid), &notify_event_source);
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
|
@ -57,7 +57,7 @@ int dev_setup(const char *prefix, uid_t uid, gid_t gid) {
|
||||
}
|
||||
|
||||
int make_inaccessible_nodes(
|
||||
const char *runtime_dir,
|
||||
const char *parent_dir,
|
||||
uid_t uid,
|
||||
gid_t gid) {
|
||||
|
||||
@ -65,28 +65,26 @@ int make_inaccessible_nodes(
|
||||
const char *name;
|
||||
mode_t mode;
|
||||
} table[] = {
|
||||
{ "/systemd", S_IFDIR | 0755 },
|
||||
{ "/systemd/inaccessible", S_IFDIR | 0000 },
|
||||
{ "/systemd/inaccessible/reg", S_IFREG | 0000 },
|
||||
{ "/systemd/inaccessible/dir", S_IFDIR | 0000 },
|
||||
{ "/systemd/inaccessible/fifo", S_IFIFO | 0000 },
|
||||
{ "/systemd/inaccessible/sock", S_IFSOCK | 0000 },
|
||||
{ "inaccessible", S_IFDIR | 0755 },
|
||||
{ "inaccessible/reg", S_IFREG | 0000 },
|
||||
{ "inaccessible/dir", S_IFDIR | 0000 },
|
||||
{ "inaccessible/fifo", S_IFIFO | 0000 },
|
||||
{ "inaccessible/sock", S_IFSOCK | 0000 },
|
||||
|
||||
/* The following two are likely to fail if we lack the privs for it (for example in a userns
|
||||
* environment, if CAP_MKNOD is missing, or if a device node policy prohibits major/minor of 0
|
||||
* device nodes to be created). But that's entirely fine. Consumers of these files should carry
|
||||
* fallback to use a different node then, for example <root>/inaccessible/sock, which is close
|
||||
* enough in behaviour and semantics for most uses. */
|
||||
{ "/systemd/inaccessible/chr", S_IFCHR | 0000 },
|
||||
{ "/systemd/inaccessible/blk", S_IFBLK | 0000 },
|
||||
{ "inaccessible/chr", S_IFCHR | 0000 },
|
||||
{ "inaccessible/blk", S_IFBLK | 0000 },
|
||||
};
|
||||
|
||||
_cleanup_umask_ mode_t u;
|
||||
size_t i;
|
||||
int r;
|
||||
|
||||
if (!runtime_dir)
|
||||
runtime_dir = "/run";
|
||||
if (!parent_dir)
|
||||
parent_dir = "/run/systemd";
|
||||
|
||||
u = umask(0000);
|
||||
|
||||
@ -95,10 +93,10 @@ int make_inaccessible_nodes(
|
||||
* to lock down these nodes as much as we can, but otherwise try to match them as closely as possible with the
|
||||
* underlying file, i.e. in the best case we offer the same node type as the underlying node. */
|
||||
|
||||
for (i = 0; i < ELEMENTSOF(table); i++) {
|
||||
for (size_t i = 0; i < ELEMENTSOF(table); i++) {
|
||||
_cleanup_free_ char *path = NULL;
|
||||
|
||||
path = path_join(runtime_dir, table[i].name);
|
||||
path = path_join(parent_dir, table[i].name);
|
||||
if (!path)
|
||||
return log_oom();
|
||||
|
||||
@ -107,8 +105,7 @@ int make_inaccessible_nodes(
|
||||
else
|
||||
r = mknod_label(path, table[i].mode, makedev(0, 0));
|
||||
if (r < 0) {
|
||||
if (r != -EEXIST)
|
||||
log_debug_errno(r, "Failed to create '%s', ignoring: %m", path);
|
||||
log_debug_errno(r, "Failed to create '%s', ignoring: %m", path);
|
||||
continue;
|
||||
}
|
||||
|
||||
|
@ -5,4 +5,4 @@
|
||||
|
||||
int dev_setup(const char *prefix, uid_t uid, gid_t gid);
|
||||
|
||||
int make_inaccessible_nodes(const char *root, uid_t uid, gid_t gid);
|
||||
int make_inaccessible_nodes(const char *parent_dir, uid_t uid, gid_t gid);
|
||||
|
@ -3,6 +3,7 @@
|
||||
#include "capability-util.h"
|
||||
#include "dev-setup.h"
|
||||
#include "fs-util.h"
|
||||
#include "mkdir.h"
|
||||
#include "path-util.h"
|
||||
#include "rm-rf.h"
|
||||
#include "tmpfile-util.h"
|
||||
@ -17,8 +18,8 @@ int main(int argc, char *argv[]) {
|
||||
|
||||
assert_se(mkdtemp_malloc("/tmp/test-dev-setupXXXXXX", &p) >= 0);
|
||||
|
||||
f = prefix_roota(p, "/run");
|
||||
assert_se(mkdir(f, 0755) >= 0);
|
||||
f = prefix_roota(p, "/run/systemd");
|
||||
assert_se(mkdir_p(f, 0755) >= 0);
|
||||
|
||||
assert_se(make_inaccessible_nodes(f, 1, 1) >= 0);
|
||||
|
||||
|
@ -60,7 +60,7 @@ function check_norbind {
|
||||
|
||||
function check_notification_socket {
|
||||
# https://github.com/systemd/systemd/issues/4944
|
||||
local _cmd='echo a | $(busybox which nc) -U -u -w 1 /run/systemd/nspawn/notify'
|
||||
local _cmd='echo a | $(busybox which nc) -U -u -w 1 /run/host/notify'
|
||||
# /testsuite-13.nc-container is prepared by test.sh
|
||||
systemd-nspawn $SUSE_OPTS--register=no -D /testsuite-13.nc-container /bin/sh -x -c "$_cmd"
|
||||
systemd-nspawn $SUSE_OPTS--register=no -D /testsuite-13.nc-container -U /bin/sh -x -c "$_cmd"
|
||||
|
Loading…
Reference in New Issue
Block a user