mirror of
https://github.com/systemd/systemd.git
synced 2024-12-22 17:35:35 +03:00
Merge pull request #25918 from bluca/smbios_sd_notify
Support AF_VSOCK in sd_notify and pick up notify_socket from creds
This commit is contained in:
commit
7122aee5ab
12
TODO
12
TODO
@ -552,10 +552,6 @@ Features:
|
||||
* sd-boot should look for information what to boot in SMBIOS, too, so that VM
|
||||
managers can tell sd-boot what to boot into and suchlike
|
||||
|
||||
* PID 1 should look for an SMBIOS variable that encodes an AF_VSOCK address it
|
||||
should send sd_notify() ready notifications to. That way a VMM can boot up a
|
||||
system, and generically know when it finished booting.
|
||||
|
||||
* add "systemd-sysext identify" verb, that you can point on any file in /usr/
|
||||
and that determines from which overlayfs layer it originates, which image, and with
|
||||
what it was signed.
|
||||
@ -777,13 +773,7 @@ Features:
|
||||
don't query this unnecessarily in entirely uninitialized
|
||||
containers. (i.e. containers with empty /etc).
|
||||
|
||||
* beef up sd_notify() to support AV_VSOCK in $NOTIFY_SOCKET, so that VM
|
||||
managers can get ready notifications from VMs, just like container managers
|
||||
from their payload. Also pick up address from qemu/fw_cfg if set there.
|
||||
(which has benefits, given SecureBoot and kernel cmdline are not necessarily
|
||||
friends.)
|
||||
|
||||
* mirroring this: maybe support binding to AV_VSOCK in Type=notify services,
|
||||
* sd_notify/vsock: maybe support binding to AF_VSOCK in Type=notify services,
|
||||
then passing $NOTIFY_SOCKET and $NOTIFY_GUESTCID with PID1's cid (typically
|
||||
fixed to "2", i.e. the official host cid) and the expected guest cid, for the
|
||||
two sides of the channel. The latter env var could then be used in an
|
||||
|
@ -330,6 +330,18 @@ systemd-run -p LoadCredential=mycred -P --wait systemd-creds cat mycred
|
||||
|
||||
Various services shipped with `systemd` consume credentials for tweaking behaviour:
|
||||
|
||||
* [`systemd(1)`](https://www.freedesktop.org/software/systemd/man/systemd.html)
|
||||
(i.e. PID1, the system manager) will look for the credential `vmm.notify_socket`
|
||||
and will use it to send a `READY=1` datagram when the system has finished
|
||||
booting. This is useful for hypervisors/VMMs or other processes on the host
|
||||
to receive a notification via VSOCK when a virtual machine has finished booting.
|
||||
Note that in case the hypervisor does not support `SOCK_DGRAM` over `AF_VSOCK`,
|
||||
`SOCK_SEQPACKET` will be tried instead. The credential payload should be in the
|
||||
form: `vsock:<CID>:<PORT>`. `<CID>` must be given explicitly: an empty CID is parsed as
|
||||
`VMADDR_CID_ANY`, which `sd_notify()` rejects. Also note that this requires
|
||||
support for VHOST to be built into both the guest and the host kernels, and the
|
||||
kernel modules to be loaded.
|
||||
|
||||
* [`systemd-sysusers(8)`](https://www.freedesktop.org/software/systemd/man/systemd-sysusers.html)
|
||||
will look for the credentials `passwd.hashed-password.<username>`,
|
||||
`passwd.plaintext-password.<username>` and `passwd.shell.<username>` to
|
||||
@ -382,7 +394,8 @@ qemu-system-x86_64 \
|
||||
```
|
||||
|
||||
This boots the specified disk image via qemu, provisioning public key SSH access
|
||||
for the root user from the caller's key:
|
||||
for the root user from the caller's key, and sends a notification when booting
|
||||
has finished to a process on the host:
|
||||
|
||||
```
|
||||
qemu-system-x86_64 \
|
||||
@ -396,8 +409,18 @@ qemu-system-x86_64 \
|
||||
-drive if=none,id=hd,file=test.raw,format=raw \
|
||||
-device virtio-scsi-pci,id=scsi \
|
||||
-device scsi-hd,drive=hd,bootindex=1 \
|
||||
-device vhost-vsock-pci,id=vhost-vsock-pci0,guest-cid=42 \
|
||||
-smbios type=11,value=io.systemd.credential:vmm.notify_socket=vsock:2:1234 \
|
||||
-smbios type=11,value=io.systemd.credential.binary:tmpfiles.extra=$(echo "f~ /root/.ssh/authorized_keys 700 root root - $(ssh-add -L | base64 -w 0)" | base64 -w 0)
|
||||
```
|
||||
|
||||
A process on the host can listen for the notification, for example:
|
||||
|
||||
```
|
||||
$ socat - VSOCK-LISTEN:1234,socktype=5
|
||||
READY=1
|
||||
```
|
||||
|
||||
## Relevant Paths
|
||||
|
||||
From *service* perspective the runtime path to find loaded credentials in is
|
||||
|
@ -368,13 +368,26 @@
|
||||
<xi:include href="libsystemd-pkgconfig.xml" xpointer="pkgconfig-text"/>
|
||||
|
||||
<para>These functions send a single datagram with the
|
||||
state string as payload to the <constant>AF_UNIX</constant> socket
|
||||
referenced in the <varname>$NOTIFY_SOCKET</varname> environment
|
||||
variable. If the first character of
|
||||
<varname>$NOTIFY_SOCKET</varname> is <literal>@</literal>, the
|
||||
string is understood as Linux abstract namespace socket. The
|
||||
datagram is accompanied by the process credentials of the sending
|
||||
service, using SCM_CREDENTIALS.</para>
|
||||
state string as payload to the socket referenced in the
|
||||
<varname>$NOTIFY_SOCKET</varname> environment variable. If the
|
||||
first character of <varname>$NOTIFY_SOCKET</varname> is
|
||||
<literal>/</literal> or <literal>@</literal>, the string is understood
|
||||
as an <constant>AF_UNIX</constant> or Linux abstract namespace socket
|
||||
(respectively), and in both cases the datagram is accompanied by the
|
||||
process credentials of the sending service, using SCM_CREDENTIALS. If
|
||||
the string starts with <literal>vsock:</literal> then the string is
|
||||
understood as an <constant>AF_VSOCK</constant> address, which is useful
|
||||
for hypervisors/VMMs or other processes on the host to receive a
|
||||
notification when a virtual machine has finished booting. Note that in
|
||||
case the hypervisor does not support <constant>SOCK_DGRAM</constant>
|
||||
over <constant>AF_VSOCK</constant>, <constant>SOCK_SEQPACKET</constant>
|
||||
will be used instead. The address should be in the form:
|
||||
<literal>vsock:CID:PORT</literal>. Note that unlike other uses of vsock,
|
||||
the CID is mandatory and cannot be <literal>VMADDR_CID_ANY</literal>.
|
||||
Note that PID1 will send the VSOCK packets from a privileged port
|
||||
(i.e.: lower than 1024), as an attempt to address concerns that unprivileged
|
||||
processes in the guest might try to send malicious notifications to the
|
||||
host, driving it to make destructive decisions based on them.</para>
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
|
@ -199,6 +199,24 @@
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><varname>vmm.notify_socket</varname></term>
|
||||
<listitem>
|
||||
<para>This credential is parsed looking for an <constant>AF_VSOCK</constant> or
|
||||
<constant>AF_UNIX</constant> address where to send a <constant>READY=1</constant>
|
||||
notification datagram when the system has finished booting. See:
|
||||
<citerefentry><refentrytitle>sd_notify</refentrytitle><manvolnum>3</manvolnum></citerefentry>.
|
||||
This is useful for hypervisors/VMMs or other processes on the host
|
||||
to receive a notification via VSOCK when a virtual machine has finished booting.
|
||||
Note that in case the hypervisor does not support <constant>SOCK_DGRAM</constant>
|
||||
over <constant>AF_VSOCK</constant>, <constant>SOCK_SEQPACKET</constant> will be
|
||||
tried instead. The credential payload for <constant>AF_VSOCK</constant> should be
|
||||
in the form: <literal>vsock:CID:PORT</literal>, where <literal>CID</literal> is
|
||||
mandatory: an empty CID is parsed as <constant>VMADDR_CID_ANY</constant>, which
|
||||
<function>sd_notify()</function> rejects.</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
</variablelist>
|
||||
</refsect1>
|
||||
|
||||
|
@ -1472,3 +1472,71 @@ int connect_unix_path(int fd, int dir_fd, const char *path) {
|
||||
|
||||
return RET_NERRNO(connect(fd, &sa.sa, salen));
|
||||
}
|
||||
|
||||
/* Parses an AF_UNIX address string into *ret_address.
 *
 * Only strings whose first character is '/' or '@' are accepted; anything else yields -EPROTO so
 * that the caller can try a different address family parser. On success 0 is returned and the
 * size field is set to the (non-negative) value returned by sockaddr_un_set_path(); its negative
 * return values are passed through unchanged. */
int socket_address_parse_unix(SocketAddress *ret_address, const char *s) {
        struct sockaddr_un sun_addr;
        int len;

        assert(ret_address);
        assert(s);

        /* Not ours: leave it to the next parser in the chain. */
        if (*s != '/' && *s != '@')
                return -EPROTO;

        len = sockaddr_un_set_path(&sun_addr, s);
        if (len < 0)
                return len;

        *ret_address = (SocketAddress) {
                .sockaddr.un = sun_addr,
                .size = len,
        };

        return 0;
}
|
||||
|
||||
/* Parses an AF_VSOCK address in "vsock:CID:PORT" notation into *ret_address.
 *
 * Returns -EPROTO if the string does not carry the "vsock:" prefix (so that the caller can try a
 * different parser), -EINVAL if the CID/port separator is missing, -ENOMEM on allocation failure,
 * or whatever safe_atou() returns if the port or CID cannot be parsed. An empty CID is accepted
 * here and stored as VMADDR_CID_ANY. Returns 0 on success. */
int socket_address_parse_vsock(SocketAddress *ret_address, const char *s) {
        _cleanup_free_ char *cid_str = NULL;
        char *sep, *rest;
        unsigned cid, port;
        int r;

        assert(ret_address);
        assert(s);

        rest = startswith(s, "vsock:");
        if (!rest)
                return -EPROTO;

        /* The first ':' after the prefix separates the CID from the port. */
        sep = strchr(rest, ':');
        if (!sep)
                return -EINVAL;

        r = safe_atou(sep + 1, &port);
        if (r < 0)
                return r;

        cid_str = strndup(rest, sep - rest);
        if (!cid_str)
                return -ENOMEM;

        /* "vsock::PORT" means "any CID"; only parse the CID if one was actually given. */
        cid = VMADDR_CID_ANY;
        if (!isempty(cid_str)) {
                r = safe_atou(cid_str, &cid);
                if (r < 0)
                        return r;
        }

        *ret_address = (SocketAddress) {
                .sockaddr.vm = {
                        .svm_cid = cid,
                        .svm_family = AF_VSOCK,
                        .svm_port = port,
                },
                .size = sizeof(struct sockaddr_vm),
        };

        return 0;
}
|
||||
|
@ -336,3 +336,9 @@ int socket_get_mtu(int fd, int af, size_t *ret);
|
||||
#define UCRED_INVALID { .pid = 0, .uid = UID_INVALID, .gid = GID_INVALID }
|
||||
|
||||
int connect_unix_path(int fd, int dir_fd, const char *path);
|
||||
|
||||
/* Parses AF_UNIX and AF_VSOCK addresses. AF_INET[6] require some netlink calls, so it cannot be in
|
||||
* src/basic/ and is done from 'socket_local_address' from src/shared/. Return -EPROTO in case of
|
||||
* protocol mismatch. */
|
||||
int socket_address_parse_unix(SocketAddress *ret_address, const char *s);
|
||||
int socket_address_parse_vsock(SocketAddress *ret_address, const char *s);
|
||||
|
@ -713,5 +713,18 @@ int import_credentials(void) {
|
||||
r = q;
|
||||
}
|
||||
|
||||
if (r >= 0) {
|
||||
_cleanup_free_ char *address = NULL;
|
||||
|
||||
r = read_credential("vmm.notify_socket", (void **)&address, /* ret_size= */ NULL);
|
||||
if (r < 0 && !IN_SET(r, -ENOENT, -ENXIO))
|
||||
log_warning_errno(r, "Failed to read 'vmm.notify_socket' credential, ignoring: %m");
|
||||
else if (r >= 0 && !isempty(address)) {
|
||||
r = setenv("NOTIFY_SOCKET", address, /* replace= */ 1);
|
||||
if (r < 0)
|
||||
log_warning_errno(errno, "Failed to set $NOTIFY_SOCKET environment variable, ignoring: %m");
|
||||
}
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
@ -433,6 +433,23 @@ _public_ int sd_is_mq(int fd, const char *path) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int vsock_bind_privileged_port(int fd) {
|
||||
union sockaddr_union sa = {
|
||||
.vm.svm_family = AF_VSOCK,
|
||||
.vm.svm_cid = VMADDR_CID_ANY,
|
||||
.vm.svm_port = 1023,
|
||||
};
|
||||
int r;
|
||||
|
||||
assert(fd >= 0);
|
||||
|
||||
do
|
||||
r = RET_NERRNO(bind(fd, &sa.sa, sizeof(sa.vm)));
|
||||
while (r == -EADDRINUSE && --sa.vm.svm_port > 0);
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
_public_ int sd_pid_notify_with_fds(
|
||||
pid_t pid,
|
||||
int unset_environment,
|
||||
@ -440,12 +457,12 @@ _public_ int sd_pid_notify_with_fds(
|
||||
const int *fds,
|
||||
unsigned n_fds) {
|
||||
|
||||
union sockaddr_union sockaddr;
|
||||
SocketAddress address;
|
||||
struct iovec iovec;
|
||||
struct msghdr msghdr = {
|
||||
.msg_iov = &iovec,
|
||||
.msg_iovlen = 1,
|
||||
.msg_name = &sockaddr,
|
||||
.msg_name = &address.sockaddr,
|
||||
};
|
||||
_cleanup_close_ int fd = -EBADF;
|
||||
struct cmsghdr *cmsg = NULL;
|
||||
@ -467,17 +484,53 @@ _public_ int sd_pid_notify_with_fds(
|
||||
if (!e)
|
||||
return 0;
|
||||
|
||||
r = sockaddr_un_set_path(&sockaddr.un, e);
|
||||
/* Allow AF_UNIX and AF_VSOCK, reject the rest. */
|
||||
r = socket_address_parse_unix(&address, e);
|
||||
if (r == -EPROTO)
|
||||
r = socket_address_parse_vsock(&address, e);
|
||||
if (r < 0)
|
||||
goto finish;
|
||||
msghdr.msg_namelen = r;
|
||||
msghdr.msg_namelen = address.size;
|
||||
|
||||
fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0);
|
||||
/* If we didn't get an address (which is a normal pattern when specifying VSOCK tuples) error out,
|
||||
* we always require a specific CID. */
|
||||
if (address.sockaddr.vm.svm_family == AF_VSOCK && address.sockaddr.vm.svm_cid == VMADDR_CID_ANY) {
|
||||
r = -EINVAL;
|
||||
goto finish;
|
||||
}
|
||||
|
||||
/* At the time of writing QEMU does not yet support AF_VSOCK + SOCK_DGRAM and returns
|
||||
* ENODEV. Fallback to SOCK_SEQPACKET in that case. */
|
||||
fd = socket(address.sockaddr.sa.sa_family, SOCK_DGRAM|SOCK_CLOEXEC, 0);
|
||||
if (fd < 0) {
|
||||
if (!(ERRNO_IS_NOT_SUPPORTED(errno) || errno == ENODEV) || address.sockaddr.sa.sa_family != AF_VSOCK) {
|
||||
r = -errno;
|
||||
goto finish;
|
||||
}
|
||||
|
||||
fd = socket(address.sockaddr.sa.sa_family, SOCK_SEQPACKET|SOCK_CLOEXEC, 0);
|
||||
if (fd < 0) {
|
||||
r = -errno;
|
||||
goto finish;
|
||||
}
|
||||
|
||||
r = vsock_bind_privileged_port(fd);
|
||||
if (r < 0 && !ERRNO_IS_PRIVILEGE(r))
|
||||
goto finish;
|
||||
|
||||
if (connect(fd, &address.sockaddr.sa, address.size) < 0) {
|
||||
r = -errno;
|
||||
goto finish;
|
||||
}
|
||||
|
||||
msghdr.msg_name = NULL;
|
||||
msghdr.msg_namelen = 0;
|
||||
} else if (address.sockaddr.sa.sa_family == AF_VSOCK) {
|
||||
r = vsock_bind_privileged_port(fd);
|
||||
if (r < 0 && !ERRNO_IS_PRIVILEGE(r))
|
||||
goto finish;
|
||||
}
|
||||
|
||||
(void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
|
||||
|
||||
iovec = IOVEC_MAKE_STRING(state);
|
||||
|
@ -17,63 +17,18 @@
|
||||
#include "string-util.h"
|
||||
|
||||
int socket_address_parse(SocketAddress *a, const char *s) {
|
||||
_cleanup_free_ char *n = NULL;
|
||||
char *e;
|
||||
uint16_t port;
|
||||
int r;
|
||||
|
||||
assert(a);
|
||||
assert(s);
|
||||
|
||||
if (IN_SET(*s, '/', '@')) {
|
||||
/* AF_UNIX socket */
|
||||
struct sockaddr_un un;
|
||||
|
||||
r = sockaddr_un_set_path(&un, s);
|
||||
if (r < 0)
|
||||
r = socket_address_parse_unix(a, s);
|
||||
if (r == -EPROTO)
|
||||
r = socket_address_parse_vsock(a, s);
|
||||
if (r != -EPROTO)
|
||||
return r;
|
||||
|
||||
*a = (SocketAddress) {
|
||||
.sockaddr.un = un,
|
||||
.size = r,
|
||||
};
|
||||
|
||||
} else if (startswith(s, "vsock:")) {
|
||||
/* AF_VSOCK socket in vsock:cid:port notation */
|
||||
const char *cid_start = s + STRLEN("vsock:");
|
||||
unsigned port, cid;
|
||||
|
||||
e = strchr(cid_start, ':');
|
||||
if (!e)
|
||||
return -EINVAL;
|
||||
|
||||
r = safe_atou(e+1, &port);
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
n = strndup(cid_start, e - cid_start);
|
||||
if (!n)
|
||||
return -ENOMEM;
|
||||
|
||||
if (isempty(n))
|
||||
cid = VMADDR_CID_ANY;
|
||||
else {
|
||||
r = safe_atou(n, &cid);
|
||||
if (r < 0)
|
||||
return r;
|
||||
}
|
||||
|
||||
*a = (SocketAddress) {
|
||||
.sockaddr.vm = {
|
||||
.svm_cid = cid,
|
||||
.svm_family = AF_VSOCK,
|
||||
.svm_port = port,
|
||||
},
|
||||
.size = sizeof(struct sockaddr_vm),
|
||||
};
|
||||
|
||||
} else {
|
||||
uint16_t port;
|
||||
|
||||
r = parse_ip_port(s, &port);
|
||||
if (r == -ERANGE)
|
||||
return r; /* Valid port syntax, but the numerical value is wrong for a port. */
|
||||
@ -131,7 +86,6 @@ int socket_address_parse(SocketAddress *a, const char *s) {
|
||||
else
|
||||
assert_not_reached();
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user