1
0
mirror of https://github.com/systemd/systemd.git synced 2025-03-24 14:50:17 +03:00

Add PrivatePIDs= (continued) (#34940)

This commit is contained in:
Luca Boccassi 2024-11-05 18:42:28 +00:00 committed by GitHub
commit 7af37f3a90
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
23 changed files with 748 additions and 40 deletions

5
NEWS
View File

@ -254,6 +254,11 @@ CHANGES WITH 257 in spe:
the "nobody" user to the dynamic user, rather than via recursive
chown()ing.
* A new service property PrivatePIDs= has been added that runs executed
processes as PID 1 - the init process - within their own PID namespace.
PrivatePIDs= also mounts /proc/ so only processes within the new PID
namespace are visible.
systemd-udevd:
* udev rules now set 'uaccess' for /dev/udmabuf, giving locally

View File

@ -3263,6 +3263,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b PrivateIPC = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s PrivatePIDs = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s ProtectHome = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s ProtectSystem = '...';
@ -4584,6 +4586,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
<variablelist class="dbus-property" generated="True" extra-ref="PrivateIPC"/>
<variablelist class="dbus-property" generated="True" extra-ref="PrivatePIDs"/>
<variablelist class="dbus-property" generated="True" extra-ref="ProtectHome"/>
<variablelist class="dbus-property" generated="True" extra-ref="ProtectSystem"/>
@ -4870,6 +4874,11 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
<citerefentry><refentrytitle>systemd.exec</refentrytitle><manvolnum>5</manvolnum></citerefentry>.
Unlike boolean <varname>ProtectControlGroups</varname>, <varname>ProtectControlGroupsEx</varname>
is a string type.</para>
<para><varname>PrivatePIDs</varname> implements the destination parameter of the
unit file setting <varname>PrivatePIDs=</varname> listed in
<citerefentry><refentrytitle>systemd.exec</refentrytitle><manvolnum>5</manvolnum></citerefentry>.
Note that <varname>PrivatePIDs</varname> is a string type to allow adding more values in the future.</para>
</refsect2>
</refsect1>
@ -5439,6 +5448,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b PrivateIPC = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s PrivatePIDs = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s ProtectHome = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s ProtectSystem = '...';
@ -6744,6 +6755,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
<variablelist class="dbus-property" generated="True" extra-ref="PrivateIPC"/>
<variablelist class="dbus-property" generated="True" extra-ref="PrivatePIDs"/>
<variablelist class="dbus-property" generated="True" extra-ref="ProtectHome"/>
<variablelist class="dbus-property" generated="True" extra-ref="ProtectSystem"/>
@ -7442,6 +7455,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b PrivateIPC = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s PrivatePIDs = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s ProtectHome = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s ProtectSystem = '...';
@ -8585,6 +8600,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
<variablelist class="dbus-property" generated="True" extra-ref="PrivateIPC"/>
<variablelist class="dbus-property" generated="True" extra-ref="PrivatePIDs"/>
<variablelist class="dbus-property" generated="True" extra-ref="ProtectHome"/>
<variablelist class="dbus-property" generated="True" extra-ref="ProtectSystem"/>
@ -9412,6 +9429,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b PrivateIPC = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s PrivatePIDs = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s ProtectHome = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s ProtectSystem = '...';
@ -10527,6 +10546,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
<variablelist class="dbus-property" generated="True" extra-ref="PrivateIPC"/>
<variablelist class="dbus-property" generated="True" extra-ref="PrivatePIDs"/>
<variablelist class="dbus-property" generated="True" extra-ref="ProtectHome"/>
<variablelist class="dbus-property" generated="True" extra-ref="ProtectSystem"/>
@ -12281,8 +12302,9 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
<varname>ExtraFileDescriptorNames</varname>,
<varname>ManagedOOMMemoryPressureDurationUSec</varname>,
<varname>BindLogSockets</varname>,
<varname>ProtectControlGroupsEx</varname>, and
<varname>PrivateUsersEx</varname> were added in version 257.</para>
<varname>ProtectControlGroupsEx</varname>,
<varname>PrivateUsersEx</varname>, and
<varname>PrivatePIDs</varname> were added in version 257.</para>
</refsect2>
<refsect2>
<title>Socket Unit Objects</title>
@ -12323,8 +12345,9 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
<varname>ImportCredentialEx</varname>,
<varname>BindLogSockets</varname>,
<varname>PrivateUsersEx</varname>,
<varname>ManagedOOMMemoryPressureDurationUSec</varname>, and
<varname>ProtectControlGroupsEx</varname> were added in version 257.</para>
<varname>ManagedOOMMemoryPressureDurationUSec</varname>,
<varname>ProtectControlGroupsEx</varname>, and
<varname>PrivatePIDs</varname> were added in version 257.</para>
</refsect2>
<refsect2>
<title>Mount Unit Objects</title>
@ -12362,8 +12385,9 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
<varname>ImportCredentialEx</varname>,
<varname>BindLogSockets</varname>,
<varname>PrivateUsersEx</varname>,
<varname>ManagedOOMMemoryPressureDurationUSec</varname>, and
<varname>ProtectControlGroupsEx</varname> were added in version 257.</para>
<varname>ManagedOOMMemoryPressureDurationUSec</varname>,
<varname>ProtectControlGroupsEx</varname>, and
<varname>PrivatePIDs</varname> were added in version 257.</para>
</refsect2>
<refsect2>
<title>Swap Unit Objects</title>
@ -12401,8 +12425,9 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
<varname>ImportCredentialEx</varname>,
<varname>BindLogSockets</varname>,
<varname>PrivateUsersEx</varname>,
<varname>ManagedOOMMemoryPressureDurationUSec</varname>, and
<varname>ProtectControlGroupsEx</varname> were added in version 257.</para>
<varname>ManagedOOMMemoryPressureDurationUSec</varname>,
<varname>ProtectControlGroupsEx</varname>, and
<varname>PrivatePIDs</varname> were added in version 257.</para>
</refsect2>
<refsect2>
<title>Slice Unit Objects</title>

View File

@ -1976,6 +1976,30 @@ BindReadOnlyPaths=/var/lib/systemd</programlisting>
</listitem>
</varlistentry>
<varlistentry>
<term><varname>PrivatePIDs=</varname></term>
<listitem><para>Takes a boolean argument. Defaults to false. If enabled, sets up a new PID namespace
for the executed processes. Each executed process is now PID 1 - the init process - in the new namespace.
<filename>/proc/</filename> is mounted such that only processes in the PID namespace are visible.
If <varname>PrivatePIDs=</varname> is set, <varname>MountAPIVFS=yes</varname> is implied.</para>
<para><varname>PrivatePIDs=</varname> is only supported for service units. This setting is not supported
with <varname>Type=forking</varname> since the kernel will kill all processes in the PID namespace if
the init process terminates.</para>
<para>This setting will be ignored if the kernel does not support PID namespaces.</para>
<para>Note that unprivileged user services (i.e. services run by the per-user instance of the service manager)
will fail with <varname>PrivatePIDs=yes</varname> if <filename>/proc/</filename> is masked
(e.g. <filename>/proc/kmsg</filename> is over-mounted with <constant>tmpfs</constant> like
<citerefentry><refentrytitle>systemd-nspawn</refentrytitle><manvolnum>1</manvolnum></citerefentry> does).
This is due to a kernel restriction that does not allow unprivileged user namespaces to mount a less
restrictive instance of <filename>/proc/</filename>.</para>
<xi:include href="version-info.xml" xpointer="v257"/></listitem>
</varlistentry>
<varlistentry>
<term><varname>PrivateUsers=</varname></term>

View File

@ -1521,11 +1521,12 @@ int safe_fork_full(
}
}
if ((flags & (FORK_NEW_MOUNTNS|FORK_NEW_USERNS|FORK_NEW_NETNS)) != 0)
if ((flags & (FORK_NEW_MOUNTNS|FORK_NEW_USERNS|FORK_NEW_NETNS|FORK_NEW_PIDNS)) != 0)
pid = raw_clone(SIGCHLD|
(FLAGS_SET(flags, FORK_NEW_MOUNTNS) ? CLONE_NEWNS : 0) |
(FLAGS_SET(flags, FORK_NEW_USERNS) ? CLONE_NEWUSER : 0) |
(FLAGS_SET(flags, FORK_NEW_NETNS) ? CLONE_NEWNET : 0));
(FLAGS_SET(flags, FORK_NEW_NETNS) ? CLONE_NEWNET : 0) |
(FLAGS_SET(flags, FORK_NEW_PIDNS) ? CLONE_NEWPID : 0));
else
pid = fork();
if (pid < 0)

View File

@ -166,7 +166,7 @@ int must_be_root(void);
pid_t clone_with_nested_stack(int (*fn)(void *), int flags, void *userdata);
/* 💣 Note that FORK_NEW_USERNS, FORK_NEW_MOUNTNS, or FORK_NEW_NETNS should not be called in threaded
/* 💣 Note that FORK_NEW_USERNS, FORK_NEW_MOUNTNS, FORK_NEW_NETNS or FORK_NEW_PIDNS should not be called in threaded
* programs, because they cause us to use raw_clone() which does not synchronize the glibc malloc() locks,
* and thus will cause deadlocks if the parent uses threads and the child does memory allocations. Hence: if
* the parent is threaded these flags may not be used. These flags cannot be used if the parent uses threads
@ -181,18 +181,19 @@ typedef enum ForkFlags {
FORK_REOPEN_LOG = 1 << 6, /* Reopen log connection */
FORK_LOG = 1 << 7, /* Log above LOG_DEBUG log level about failures */
FORK_WAIT = 1 << 8, /* Wait until child exited */
FORK_NEW_MOUNTNS = 1 << 9, /* Run child in its own mount namespace 💣 DO NOT USE IN THREADED PROGRAMS! 💣 */
FORK_MOUNTNS_SLAVE = 1 << 10, /* Make child's mount namespace MS_SLAVE */
FORK_PRIVATE_TMP = 1 << 11, /* Mount new /tmp/ in the child (combine with FORK_NEW_MOUNTNS!) */
FORK_RLIMIT_NOFILE_SAFE = 1 << 12, /* Set RLIMIT_NOFILE soft limit to 1K for select() compat */
FORK_STDOUT_TO_STDERR = 1 << 13, /* Make stdout a copy of stderr */
FORK_FLUSH_STDIO = 1 << 14, /* fflush() stdout (and stderr) before forking */
FORK_NEW_USERNS = 1 << 15, /* Run child in its own user namespace 💣 DO NOT USE IN THREADED PROGRAMS! 💣 */
FORK_CLOEXEC_OFF = 1 << 16, /* In the child: turn off O_CLOEXEC on all fds in except_fds[] */
FORK_KEEP_NOTIFY_SOCKET = 1 << 17, /* Unless this specified, $NOTIFY_SOCKET will be unset. */
FORK_DETACH = 1 << 18, /* Double fork if needed to ensure PID1/subreaper is parent */
FORK_NEW_NETNS = 1 << 19, /* Run child in its own network namespace 💣 DO NOT USE IN THREADED PROGRAMS! 💣 */
FORK_PACK_FDS = 1 << 20, /* Rearrange the passed FDs to be FD 3,4,5,etc. Updates the array in place (combine with FORK_CLOSE_ALL_FDS!) */
FORK_MOUNTNS_SLAVE = 1 << 9, /* Make child's mount namespace MS_SLAVE */
FORK_PRIVATE_TMP = 1 << 10, /* Mount new /tmp/ in the child (combine with FORK_NEW_MOUNTNS!) */
FORK_RLIMIT_NOFILE_SAFE = 1 << 11, /* Set RLIMIT_NOFILE soft limit to 1K for select() compat */
FORK_STDOUT_TO_STDERR = 1 << 12, /* Make stdout a copy of stderr */
FORK_FLUSH_STDIO = 1 << 13, /* fflush() stdout (and stderr) before forking */
FORK_CLOEXEC_OFF = 1 << 14, /* In the child: turn off O_CLOEXEC on all fds in except_fds[] */
FORK_KEEP_NOTIFY_SOCKET = 1 << 15, /* Unless this specified, $NOTIFY_SOCKET will be unset. */
FORK_DETACH = 1 << 16, /* Double fork if needed to ensure PID1/subreaper is parent */
FORK_PACK_FDS = 1 << 17, /* Rearrange the passed FDs to be FD 3,4,5,etc. Updates the array in place (combine with FORK_CLOSE_ALL_FDS!) */
FORK_NEW_MOUNTNS = 1 << 18, /* Run child in its own mount namespace 💣 DO NOT USE IN THREADED PROGRAMS! 💣 */
FORK_NEW_USERNS = 1 << 19, /* Run child in its own user namespace 💣 DO NOT USE IN THREADED PROGRAMS! 💣 */
FORK_NEW_NETNS = 1 << 20, /* Run child in its own network namespace 💣 DO NOT USE IN THREADED PROGRAMS! 💣 */
FORK_NEW_PIDNS = 1 << 21, /* Run child in its own PID namespace 💣 DO NOT USE IN THREADED PROGRAMS! 💣 */
} ForkFlags;
int safe_fork_full(

View File

@ -63,6 +63,7 @@ static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_empty_string, "s", NULL);
static BUS_DEFINE_PROPERTY_GET_REF(property_get_private_tmp_ex, "s", PrivateTmp, private_tmp_to_string);
static BUS_DEFINE_PROPERTY_GET_REF(property_get_private_users_ex, "s", PrivateUsers, private_users_to_string);
static BUS_DEFINE_PROPERTY_GET_REF(property_get_protect_control_groups_ex, "s", ProtectControlGroups, protect_control_groups_to_string);
static BUS_DEFINE_PROPERTY_GET_REF(property_get_private_pids, "s", PrivatePIDs, private_pids_to_string);
static BUS_DEFINE_PROPERTY_GET_REF(property_get_syslog_level, "i", int, LOG_PRI);
static BUS_DEFINE_PROPERTY_GET_REF(property_get_syslog_facility, "i", int, LOG_FAC);
static BUS_DEFINE_PROPERTY_GET(property_get_cpu_affinity_from_numa, "b", ExecContext, exec_context_get_cpu_affinity_from_numa);
@ -1194,6 +1195,7 @@ const sd_bus_vtable bus_exec_vtable[] = {
SD_BUS_PROPERTY("PrivateUsersEx", "s", property_get_private_users_ex, offsetof(ExecContext, private_users), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("PrivateMounts", "b", bus_property_get_tristate, offsetof(ExecContext, private_mounts), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("PrivateIPC", "b", bus_property_get_bool, offsetof(ExecContext, private_ipc), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("PrivatePIDs", "s", property_get_private_pids, offsetof(ExecContext, private_pids), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("ProtectHome", "s", property_get_protect_home, offsetof(ExecContext, protect_home), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("ProtectSystem", "s", property_get_protect_system, offsetof(ExecContext, protect_system), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("SameProcessGroup", "b", bus_property_get_bool, offsetof(ExecContext, same_pgrp), SD_BUS_VTABLE_PROPERTY_CONST),
@ -1970,6 +1972,27 @@ int bus_exec_context_set_transient_property(
return 1;
}
if (streq(name, "PrivatePIDs")) {
const char *s;
PrivatePIDs t;
r = sd_bus_message_read(message, "s", &s);
if (r < 0)
return r;
t = private_pids_from_string(s);
if (t < 0)
return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid %s setting: %s", name, s);
if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
c->private_pids = t;
(void) unit_write_settingf(u, flags, name, "%s=%s",
name, private_pids_to_string(c->private_pids));
}
return 1;
}
if (streq(name, "PrivateDevices"))
return bus_set_transient_bool(u, name, &c->private_devices, message, flags, error);

View File

@ -2175,14 +2175,14 @@ static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogi
fd = open(a, O_WRONLY|O_CLOEXEC);
if (fd < 0) {
if (errno != ENOENT) {
r = -errno;
r = log_debug_errno(errno, "Failed to open %s: %m", a);
goto child_fail;
}
/* If the file is missing the kernel is too old, let's continue anyway. */
} else {
if (write(fd, "deny\n", 5) < 0) {
r = -errno;
r = log_debug_errno(errno, "Failed to write \"deny\" to %s: %m", a);
goto child_fail;
}
@ -2193,11 +2193,11 @@ static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogi
a = procfs_file_alloca(ppid, "gid_map");
fd = open(a, O_WRONLY|O_CLOEXEC);
if (fd < 0) {
r = -errno;
r = log_debug_errno(errno, "Failed to open %s: %m", a);
goto child_fail;
}
if (write(fd, gid_map, strlen(gid_map)) < 0) {
r = -errno;
r = log_debug_errno(errno, "Failed to write GID map to %s: %m", a);
goto child_fail;
}
fd = safe_close(fd);
@ -2206,11 +2206,11 @@ static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogi
a = procfs_file_alloca(ppid, "uid_map");
fd = open(a, O_WRONLY|O_CLOEXEC);
if (fd < 0) {
r = -errno;
r = log_debug_errno(errno, "Failed to open %s: %m", a);
goto child_fail;
}
if (write(fd, uid_map, strlen(uid_map)) < 0) {
r = -errno;
r = log_debug_errno(errno, "Failed to write UID map to %s: %m", a);
goto child_fail;
}
@ -2224,7 +2224,7 @@ static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogi
errno_pipe[1] = safe_close(errno_pipe[1]);
if (unshare(CLONE_NEWUSER) < 0)
return -errno;
return log_debug_errno(errno, "Failed to unshare user namespace: %m");
/* Let the child know that the namespace is ready now */
if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
@ -2251,6 +2251,130 @@ static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogi
return 1;
}
/* Probes whether a fresh procfs instance can be mounted by a process that lives in its own mount and
 * PID namespace.
 *
 * Returns 1 if the test mount succeeded, 0 if it was refused for lack of privileges, and a negative
 * errno-style value on any other failure. */
static int can_mount_proc(const ExecContext *c, ExecParameters *p) {
        _cleanup_close_pair_ int errno_pipe[2] = EBADF_PAIR;
        _cleanup_(sigkill_waitp) pid_t pid = 0;
        ssize_t n_read;
        int r;

        assert(c);
        assert(p);

        /* When invoked from an unprivileged user manager while /proc/ is masked (e.g. /proc/kmsg is
         * over-mounted with tmpfs, as systemd-nspawn does), mounting /proc/ fails with EPERM: the kernel
         * does not let unprivileged user namespaces mount a less restrictive instance of /proc. */

        /* The child uses this pipe to hand a proper error code back to us if its mount attempt fails. */
        if (pipe2(errno_pipe, O_CLOEXEC) < 0)
                return log_exec_debug_errno(c, p, errno, "Failed to create pipe for communicating with child process (sd-proc-check): %m");

        /* Spawn the probe in its own mount and PID namespace. Note safe_fork() already remounts / as
         * SLAVE with FORK_MOUNTNS_SLAVE. */
        r = safe_fork("(sd-proc-check)",
                      FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGKILL|FORK_NEW_MOUNTNS|FORK_MOUNTNS_SLAVE|FORK_NEW_PIDNS,
                      &pid);
        if (r < 0)
                return log_exec_debug_errno(c, p, r, "Failed to fork child process (sd-proc-check): %m");
        if (r == 0) {
                /* Child: only the write end of the pipe is needed here. */
                errno_pipe[0] = safe_close(errno_pipe[0]);

                /* Attempt the mount on top of /dev/shm/. Nothing to unmount afterwards: the mount
                 * namespace goes away together with this process. */
                r = mount_follow_verbose(LOG_DEBUG, "proc", "/dev/shm/", "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
                if (r >= 0)
                        _exit(EXIT_SUCCESS);

                (void) write(errno_pipe[1], &r, sizeof(r));
                _exit(EXIT_FAILURE);
        }

        /* Parent: drop the write end so that read() below sees EOF once the child is done. */
        errno_pipe[1] = safe_close(errno_pipe[1]);

        /* Pick up the error code, if the child sent one. */
        n_read = read(errno_pipe[0], &r, sizeof(r));
        if (n_read < 0)
                return log_exec_debug_errno(c, p, errno, "Failed to read errno from pipe with child process (sd-proc-check): %m");
        if (n_read == sizeof(r)) {
                /* A complete error code arrived. A privilege error is the expected "cannot mount"
                 * answer; everything else is passed up as failure. */
                if (ERRNO_IS_NEG_PRIVILEGE(r))
                        return 0;

                return r < 0 ? r : -EIO;
        }
        if (n_read != 0) /* Success is signalled by EOF, i.e. zero bytes; a short read is bogus. */
                return -EIO;

        r = wait_for_terminate_and_check("(sd-proc-check)", TAKE_PID(pid), 0 /* flags= */);
        if (r < 0)
                return log_exec_debug_errno(c, p, r, "Failed to wait for (sd-proc-check) child process to terminate: %m");
        if (r != EXIT_SUCCESS) /* Anything odd about the child's exit is treated as fatal, too. */
                return log_exec_debug_errno(c, p, SYNTHETIC_ERRNO(EIO), "Child process (sd-proc-check) exited with unexpected exit status '%d'.", r);

        return 1;
}
/* Forks the calling process into a new PID namespace and reports the new child to the manager.
 *
 * The forking (parent) side sends the child's pidref over p->pidref_transport_fd, relays the result of
 * that send to the child through a pipe, and then exits. Only the child returns from this function:
 * with the value relayed by the parent on success, or with a negative errno-style value on failure. */
static int setup_private_pids(const ExecContext *c, ExecParameters *p) {
        _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
        _cleanup_close_pair_ int errno_pipe[2] = EBADF_PAIR;
        ssize_t n_read;
        int r, send_result;

        assert(c);
        assert(p);
        assert(p->pidref_transport_fd >= 0);

        /* Only the first process created after unsharing a PID namespace becomes PID 1 inside of it,
         * hence we need to fork after unsharing. The parent passes the child's pidref to the manager and
         * exits, while the child carries on with the remainder of exec_invoke() and eventually executes
         * the actual payload. */

        /* This pipe lets the parent tell the child whether handing the pidref to the manager worked. */
        if (pipe2(errno_pipe, O_CLOEXEC) < 0)
                return log_exec_debug_errno(c, p, errno, "Failed to create pipe for communicating with parent process: %m");

        r = pidref_safe_fork("(sd-pidns-child)", FORK_NEW_PIDNS, &pidref);
        if (r < 0)
                return log_exec_debug_errno(c, p, r, "Failed to fork child into new pid namespace: %m");
        if (r > 0) {
                /* Parent: only the write end of the pipe is needed here. */
                errno_pipe[0] = safe_close(errno_pipe[0]);

                /* Pass the child's pidref on to the manager. If PIDFD is not supported, only the child
                 * PID is sent. The server then uses the child PID to set the new exec main process. */
                send_result = send_one_fd_iov(
                                p->pidref_transport_fd,
                                pidref.fd,
                                &IOVEC_MAKE(&pidref.pid, sizeof(pidref.pid)),
                                /*iovlen=*/ 1,
                                /*flags=*/ 0);

                /* Relay the outcome to the child. */
                (void) write(errno_pipe[1], &send_result, sizeof(send_result));

                /* Leave right here, so that the destructors in exec_invoke run only once, in the child,
                 * as some of them have external effects. The main codepaths continue in the child. */
                _exit(send_result < 0 ? EXIT_FAILURE : EXIT_SUCCESS);
        }

        /* Child: release the pipe's write end and the transport socket, neither is used from here on. */
        errno_pipe[1] = safe_close(errno_pipe[1]);
        p->pidref_transport_fd = safe_close(p->pidref_transport_fd);

        /* Fetch the parent's verdict. A child cannot wait for its parent, which is why the parent always
         * writes a code, even on success. */
        n_read = read(errno_pipe[0], &r, sizeof(r));
        if (n_read < 0)
                return log_exec_debug_errno(c, p, errno, "Failed to read errno from pipe with parent process: %m");
        if (n_read != sizeof(r))
                return log_exec_debug_errno(c, p, SYNTHETIC_ERRNO(EIO), "Failed to read enough bytes from pipe with parent process");
        if (r < 0)
                return log_exec_debug_errno(c, p, r, "Failed to send child pidref to manager: %m");

        /* NOTE! We get here in the child process only. */
        return r;
}
static int create_many_symlinks(const char *root, const char *source, char **symlinks) {
_cleanup_free_ char *src_abs = NULL;
int r;
@ -3301,6 +3425,7 @@ static int apply_mount_namespace(
.private_dev = needs_sandboxing && context->private_devices,
.private_network = needs_sandboxing && exec_needs_network_namespace(context),
.private_ipc = needs_sandboxing && exec_needs_ipc_namespace(context),
.private_pids = needs_sandboxing && exec_needs_pid_namespace(context) ? context->private_pids : PRIVATE_PIDS_NO,
.private_tmp = needs_sandboxing ? context->private_tmp : false,
.mount_apivfs = needs_sandboxing && exec_context_get_effective_mount_apivfs(context),
@ -3573,7 +3698,7 @@ static int close_remaining_fds(
const int *fds, size_t n_fds) {
size_t n_dont_close = 0;
int dont_close[n_fds + 16];
int dont_close[n_fds + 17];
assert(params);
@ -3612,6 +3737,9 @@ static int close_remaining_fds(
if (params->handoff_timestamp_fd >= 0)
dont_close[n_dont_close++] = params->handoff_timestamp_fd;
if (params->pidref_transport_fd >= 0)
dont_close[n_dont_close++] = params->pidref_transport_fd;
assert(n_dont_close <= ELEMENTSOF(dont_close));
return close_all_fds(dont_close, n_dont_close);
@ -3934,6 +4062,7 @@ static bool exec_context_need_unprivileged_private_users(
!strv_isempty(context->extension_directories) ||
context->protect_system != PROTECT_SYSTEM_NO ||
context->protect_home != PROTECT_HOME_NO ||
exec_needs_pid_namespace(context) ||
context->protect_kernel_tunables ||
context->protect_kernel_modules ||
context->protect_kernel_logs ||
@ -4139,6 +4268,7 @@ int exec_invoke(
needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */
needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */
bool keep_seccomp_privileges = false;
bool has_cap_sys_admin = false;
#if HAVE_SELINUX
_cleanup_free_ char *mac_selinux_context_net = NULL;
bool use_selinux = false;
@ -4790,6 +4920,9 @@ int exec_invoke(
uint64_t capability_ambient_set = context->capability_ambient_set;
/* Check CAP_SYS_ADMIN before we enter user namespace to see if we can mount /proc even though it's masked. */
has_cap_sys_admin = have_effective_cap(CAP_SYS_ADMIN) > 0;
if (needs_sandboxing) {
/* MAC enablement checks need to be done before a new mount ns is created, as they rely on
* /sys being present. The actual MAC context application will happen later, as late as
@ -4924,6 +5057,40 @@ int exec_invoke(
}
}
/* Unshare a new PID namespace before setting up mounts, to ensure /proc/ is mounted with only the
 * processes of the PID namespace visible. Note PrivatePIDs=yes implies MountAPIVFS=yes so we'll always
 * ensure procfs is remounted. */
if (needs_sandboxing && exec_needs_pid_namespace(context)) {
        if (params->pidref_transport_fd < 0) {
                *exit_status = EXIT_NAMESPACE;
                /* 'r' is unrelated to this check and holds whatever an earlier call left in it, hence
                 * report a synthetic error so that we log something meaningful and reliably return a
                 * negative value. */
                return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(ENOTCONN), "PidRef socket is not set up: %m");
        }

        /* If we had CAP_SYS_ADMIN prior to joining the user namespace, then we are privileged and don't need
         * to check if we can mount /proc/.
         *
         * We need to check prior to entering the user namespace because if we're running unprivileged or in a
         * system without CAP_SYS_ADMIN, then we can have CAP_SYS_ADMIN in the current user namespace but not
         * once we unshare a mount namespace. */
        r = has_cap_sys_admin ? 1 : can_mount_proc(context, params);
        if (r < 0) {
                *exit_status = EXIT_NAMESPACE;
                return log_exec_error_errno(context, params, r, "Failed to detect if /proc/ can be remounted: %m");
        }
        if (r == 0) {
                *exit_status = EXIT_NAMESPACE;
                return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EPERM),
                                            "PrivatePIDs=yes is configured, but /proc/ cannot be re-mounted due to lack of privileges, refusing.");
        }

        r = setup_private_pids(context, params);
        if (r < 0) {
                *exit_status = EXIT_NAMESPACE;
                return log_exec_error_errno(context, params, r, "Failed to set up pid namespace: %m");
        }
}

/* If PrivatePIDs=yes is configured, we're now running as PID 1 in a PID namespace! */
if (needs_mount_namespace) {
_cleanup_free_ char *error_path = NULL;

View File

@ -1391,6 +1391,10 @@ static int exec_parameters_serialize(const ExecParameters *p, const ExecContext
if (r < 0)
return r;
r = serialize_fd(f, fds, "exec-parameters-pidref-transport-fd", p->pidref_transport_fd);
if (r < 0)
return r;
if (c && exec_context_restrict_filesystems_set(c)) {
r = serialize_fd(f, fds, "exec-parameters-bpf-outer-map-fd", p->bpf_restrict_fs_map_fd);
if (r < 0)
@ -1660,6 +1664,14 @@ static int exec_parameters_deserialize(ExecParameters *p, FILE *f, FDSet *fds) {
continue;
close_and_replace(p->handoff_timestamp_fd, fd);
} else if ((val = startswith(l, "exec-parameters-pidref-transport-fd="))) {
int fd;
fd = deserialize_fd(fds, val);
if (fd < 0)
continue;
close_and_replace(p->pidref_transport_fd, fd);
} else if ((val = startswith(l, "exec-parameters-bpf-outer-map-fd="))) {
int fd;
@ -1926,6 +1938,10 @@ static int exec_context_serialize(const ExecContext *c, FILE *f) {
if (r < 0)
return r;
r = serialize_item(f, "exec-context-private-pids", private_pids_to_string(c->private_pids));
if (r < 0)
return r;
r = serialize_bool_elide(f, "exec-context-remove-ipc", c->remove_ipc);
if (r < 0)
return r;
@ -2813,6 +2829,10 @@ static int exec_context_deserialize(ExecContext *c, FILE *f) {
if (r < 0)
return r;
c->private_ipc = r;
} else if ((val = startswith(l, "exec-context-private-pids="))) {
c->private_pids = private_pids_from_string(val);
if (c->private_pids < 0)
return -EINVAL;
} else if ((val = startswith(l, "exec-context-remove-ipc="))) {
r = parse_boolean(val);
if (r < 0)

View File

@ -254,6 +254,12 @@ bool exec_is_cgroup_mount_read_only(const ExecContext *context, const ExecParame
return IN_SET(exec_get_protect_control_groups(context, params), PROTECT_CONTROL_GROUPS_YES, PROTECT_CONTROL_GROUPS_STRICT);
}
/* Tells whether the given execution context asks for a PID namespace of its own: true only if
 * private_pids is set to something other than PRIVATE_PIDS_NO and ns_type_supported() reports
 * NAMESPACE_PID as supported. */
bool exec_needs_pid_namespace(const ExecContext *context) {
        assert(context);

        /* Not requested at all? Then don't bother checking for namespace support. */
        if (context->private_pids == PRIVATE_PIDS_NO)
                return false;

        return ns_type_supported(NAMESPACE_PID);
}
bool exec_needs_mount_namespace(
const ExecContext *context,
const ExecParameters *params,
@ -306,7 +312,8 @@ bool exec_needs_mount_namespace(
exec_needs_cgroup_mount(context, params) ||
context->protect_proc != PROTECT_PROC_DEFAULT ||
context->proc_subset != PROC_SUBSET_ALL ||
exec_needs_ipc_namespace(context))
exec_needs_ipc_namespace(context) ||
exec_needs_pid_namespace(context))
return true;
if (context->root_directory) {
@ -1026,6 +1033,7 @@ void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
"%sProtectControlGroups: %s\n"
"%sPrivateNetwork: %s\n"
"%sPrivateUsers: %s\n"
"%sPrivatePIDs: %s\n"
"%sProtectHome: %s\n"
"%sProtectSystem: %s\n"
"%sMountAPIVFS: %s\n"
@ -1052,6 +1060,7 @@ void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
prefix, protect_control_groups_to_string(c->protect_control_groups),
prefix, yes_no(c->private_network),
prefix, private_users_to_string(c->private_users),
prefix, private_pids_to_string(c->private_pids),
prefix, protect_home_to_string(c->protect_home),
prefix, protect_system_to_string(c->protect_system),
prefix, yes_no(exec_context_get_effective_mount_apivfs(c)),

View File

@ -335,6 +335,7 @@ struct ExecContext {
ProtectControlGroups protect_control_groups;
ProtectSystem protect_system;
ProtectHome protect_home;
PrivatePIDs private_pids;
bool protect_hostname;
bool dynamic_user;
@ -465,6 +466,7 @@ struct ExecParameters {
char **files_env;
int user_lookup_fd;
int handoff_timestamp_fd;
int pidref_transport_fd;
int bpf_restrict_fs_map_fd;
@ -486,6 +488,7 @@ struct ExecParameters {
.bpf_restrict_fs_map_fd = -EBADF, \
.user_lookup_fd = -EBADF, \
.handoff_timestamp_fd = -EBADF, \
.pidref_transport_fd = -EBADF, \
}
#include "unit.h"
@ -623,6 +626,7 @@ ExecDirectoryType exec_resource_type_from_string(const char *s) _pure_;
bool exec_needs_mount_namespace(const ExecContext *context, const ExecParameters *params, const ExecRuntime *runtime);
bool exec_needs_network_namespace(const ExecContext *context);
bool exec_needs_ipc_namespace(const ExecContext *context);
bool exec_needs_pid_namespace(const ExecContext *context);
ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context, const ExecParameters *params);
bool exec_needs_cgroup_namespace(const ExecContext *context, const ExecParameters *params);

View File

@ -133,6 +133,7 @@
{{type}}.PrivateUsers, config_parse_private_users, 0, offsetof({{type}}, exec_context.private_users)
{{type}}.PrivateMounts, config_parse_tristate, 0, offsetof({{type}}, exec_context.private_mounts)
{{type}}.PrivateIPC, config_parse_bool, 0, offsetof({{type}}, exec_context.private_ipc)
{{type}}.PrivatePIDs, config_parse_private_pids, 0, offsetof({{type}}, exec_context.private_pids)
{{type}}.ProtectSystem, config_parse_protect_system, 0, offsetof({{type}}, exec_context.protect_system)
{{type}}.ProtectHome, config_parse_protect_home, 0, offsetof({{type}}, exec_context.protect_home)
{{type}}.MountFlags, config_parse_exec_mount_propagation_flag, 0, offsetof({{type}}, exec_context.mount_propagation_flag)

View File

@ -135,6 +135,7 @@ DEFINE_CONFIG_PARSE_ENUM(config_parse_protect_proc, protect_proc, ProtectProc);
DEFINE_CONFIG_PARSE_ENUM(config_parse_proc_subset, proc_subset, ProcSubset);
DEFINE_CONFIG_PARSE_ENUM(config_parse_private_tmp, private_tmp, PrivateTmp);
DEFINE_CONFIG_PARSE_ENUM(config_parse_private_users, private_users, PrivateUsers);
DEFINE_CONFIG_PARSE_ENUM(config_parse_private_pids, private_pids, PrivatePIDs);
DEFINE_CONFIG_PARSE_ENUM(config_parse_protect_control_groups, protect_control_groups, ProtectControlGroups);
DEFINE_CONFIG_PARSE_ENUM(config_parse_exec_utmp_mode, exec_utmp_mode, ExecUtmpMode);
DEFINE_CONFIG_PARSE_ENUM(config_parse_job_mode, job_mode, JobMode);

View File

@ -114,6 +114,7 @@ CONFIG_PARSER_PROTOTYPE(config_parse_namespace_path_strv);
CONFIG_PARSER_PROTOTYPE(config_parse_temporary_filesystems);
CONFIG_PARSER_PROTOTYPE(config_parse_private_tmp);
CONFIG_PARSER_PROTOTYPE(config_parse_private_users);
CONFIG_PARSER_PROTOTYPE(config_parse_private_pids);
CONFIG_PARSER_PROTOTYPE(config_parse_protect_control_groups);
CONFIG_PARSER_PROTOTYPE(config_parse_cpu_quota);
CONFIG_PARSER_PROTOTYPE(config_parse_allowed_cpuset);

View File

@ -126,6 +126,7 @@ static int manager_dispatch_time_change_fd(sd_event_source *source, int fd, uint
static int manager_dispatch_idle_pipe_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata);
static int manager_dispatch_user_lookup_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata);
static int manager_dispatch_handoff_timestamp_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata);
static int manager_dispatch_pidref_transport_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata);
static int manager_dispatch_jobs_in_progress(sd_event_source *source, usec_t usec, void *userdata);
static int manager_dispatch_run_queue(sd_event_source *source, void *userdata);
static int manager_dispatch_sigchld(sd_event_source *source, void *userdata);
@ -913,6 +914,7 @@ int manager_new(RuntimeScope runtime_scope, ManagerTestRunFlags test_run_flags,
.signal_fd = -EBADF,
.user_lookup_fds = EBADF_PAIR,
.handoff_timestamp_fds = EBADF_PAIR,
.pidref_transport_fds = EBADF_PAIR,
.private_listen_fd = -EBADF,
.dev_autofs_fd = -EBADF,
.cgroup_inotify_fd = -EBADF,
@ -1309,6 +1311,55 @@ static int manager_setup_handoff_timestamp_fd(Manager *m) {
return 0;
}
/* Ensures the pidref transport socket pair exists and that its receiving end is hooked into the
 * manager's event loop. Safe to call repeatedly: an already allocated socket pair and an already
 * registered event source are left alone. Returns 0 on success, or a negative errno-style value
 * (after logging) on failure. */
static int manager_setup_pidref_transport_fd(Manager *m) {
        int r;

        assert(m);

        /* The socket pair carries parent and child pidrefs back to us when the executor unshares a
         * PID namespace and forks once more for PrivatePIDs=yes. */

        if (m->pidref_transport_fds[0] < 0) {
                int *pair = m->pidref_transport_fds;

                /* No usable receiving end: drop whatever is left of the old setup and start over. */
                m->pidref_event_source = sd_event_source_disable_unref(m->pidref_event_source);
                safe_close_pair(pair);

                if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, pair) < 0)
                        return log_error_errno(errno, "Failed to allocate pidref socket: %m");

                /* A roomy receive buffer so that senders never have to block on us. */
                (void) fd_increase_rxbuf(pair[0], MANAGER_SOCKET_RCVBUF_SIZE);

                /* Sender credentials are mandatory ... */
                r = setsockopt_int(pair[0], SOL_SOCKET, SO_PASSCRED, true);
                if (r < 0)
                        return log_error_errno(r, "Failed to enable SO_PASSCRED for pidref socket: %m");

                /* ... whereas a sender pidfd is merely nice to have, hence failures are not fatal. */
                r = setsockopt_int(pair[0], SOL_SOCKET, SO_PASSPIDFD, true);
                if (r < 0) {
                        if (ERRNO_IS_NEG_NOT_SUPPORTED(r))
                                log_debug("SO_PASSPIDFD is not supported for pidref socket, ignoring.");
                        else
                                log_warning_errno(r, "Failed to enable SO_PASSPIDFD for pidref socket, ignoring: %m");
                }

                /* Only the receiving side becomes O_NONBLOCK, the sending side stays as it is. */
                r = fd_nonblock(pair[0], true);
                if (r < 0)
                        return log_error_errno(r, "Failed to make pidref socket O_NONBLOCK: %m");
        }

        if (m->pidref_event_source)
                return 0; /* Already watching the socket, nothing left to do. */

        r = sd_event_add_io(m->event, &m->pidref_event_source, m->pidref_transport_fds[0], EPOLLIN, manager_dispatch_pidref_transport_fd, m);
        if (r < 0)
                return log_error_errno(r, "Failed to allocate pidref event source: %m");

        r = sd_event_source_set_priority(m->pidref_event_source, EVENT_PRIORITY_PIDREF);
        if (r < 0)
                return log_error_errno(r, "Failed to set priority of pidref event source: %m");

        (void) sd_event_source_set_description(m->pidref_event_source, "pidref");

        return 0;
}
static unsigned manager_dispatch_cleanup_queue(Manager *m) {
Unit *u;
unsigned n = 0;
@ -1724,6 +1775,7 @@ Manager* manager_free(Manager *m) {
sd_event_source_unref(m->run_queue_event_source);
sd_event_source_unref(m->user_lookup_event_source);
sd_event_source_unref(m->handoff_timestamp_event_source);
sd_event_source_unref(m->pidref_event_source);
sd_event_source_unref(m->memory_pressure_event_source);
safe_close(m->signal_fd);
@ -1731,6 +1783,7 @@ Manager* manager_free(Manager *m) {
safe_close(m->cgroups_agent_fd);
safe_close_pair(m->user_lookup_fds);
safe_close_pair(m->handoff_timestamp_fds);
safe_close_pair(m->pidref_transport_fds);
manager_close_ask_password(m);
@ -2077,6 +2130,11 @@ int manager_startup(Manager *m, FILE *serialization, FDSet *fds, const char *roo
/* This shouldn't fail, except if things are really broken. */
return r;
r = manager_setup_pidref_transport_fd(m);
if (r < 0)
/* This shouldn't fail, except if things are really broken. */
return r;
/* Connect to the bus if we are good for it */
manager_setup_bus(m);
@ -3747,6 +3805,7 @@ int manager_reload(Manager *m) {
(void) manager_setup_cgroups_agent(m);
(void) manager_setup_user_lookup_fd(m);
(void) manager_setup_handoff_timestamp_fd(m);
(void) manager_setup_pidref_transport_fd(m);
/* Third, fire things up! */
manager_coldplug(m);
@ -5002,6 +5061,142 @@ static int manager_dispatch_handoff_timestamp_fd(sd_event_source *source, int fd
return 0;
}
/* Event loop callback for the receiving end of the pidref transport socket.
 *
 * Reads one datagram, validates it, turns the sender ("parent") and the process it announces
 * ("child") into PidRefs, looks up the unit owning either of them via its cgroup and hands both
 * PidRefs to that unit's notify_pidref() vtable hook.
 *
 * Malformed or uninteresting messages are logged and dropped (return 0) so that the event source
 * stays enabled. Only a hard receive error is propagated as a negative errno-style value. */
static int manager_dispatch_pidref_transport_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
        Manager *m = ASSERT_PTR(userdata);
        _cleanup_(pidref_done) PidRef child_pidref = PIDREF_NULL, parent_pidref = PIDREF_NULL;
        _cleanup_close_ int child_pidfd = -EBADF, parent_pidfd = -EBADF;
        struct ucred *ucred = NULL;
        CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred)) + CMSG_SPACE(sizeof(int)) * 2) control;
        pid_t child_pid;
        struct msghdr msghdr = {
                .msg_iov = &IOVEC_MAKE(&child_pid, sizeof(child_pid)),
                .msg_iovlen = 1,
                .msg_control = &control,
                .msg_controllen = sizeof(control),
        };
        struct cmsghdr *cmsg;
        ssize_t n;
        int r;

        assert(source);

        /* Server expects:
         * - Parent PID in ucreds enabled via SO_PASSCRED
         * - Parent PIDFD in SCM_PIDFD message enabled via SO_PASSPIDFD
         * - Child PIDFD in SCM_RIGHTS in message body
         * - Child PID in message IOV
         *
         * SO_PASSPIDFD may not be supported by the kernel so we fall back to using parent PID from ucreds
         * and accept some raciness. */
        n = recvmsg_safe(m->pidref_transport_fds[0], &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC|MSG_TRUNC);
        if (ERRNO_IS_NEG_TRANSIENT(n))
                return 0; /* Spurious wakeup, try again */
        if (n == -ECHRNG) {
                log_warning_errno(n, "Got message with truncated control data (unexpected fds sent?), ignoring.");
                return 0;
        }
        if (n == -EXFULL) {
                log_warning_errno(n, "Got message with truncated payload data, ignoring.");
                return 0;
        }
        if (n < 0)
                return log_error_errno(n, "Failed to receive pidref message: %m");

        /* Collect the control data first, before any further validation: the fds we received are
         * only closed automatically once they are stored in the _cleanup_close_ variables, hence
         * bailing out earlier would leak them. */
        CMSG_FOREACH(cmsg, &msghdr) {
                if (cmsg->cmsg_level != SOL_SOCKET)
                        continue;

                if (cmsg->cmsg_type == SCM_CREDENTIALS && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred))) {
                        assert(!ucred);
                        ucred = CMSG_TYPED_DATA(cmsg, struct ucred);
                } else if (cmsg->cmsg_type == SCM_PIDFD) {
                        assert(parent_pidfd < 0);
                        parent_pidfd = *CMSG_TYPED_DATA(cmsg, int);
                } else if (cmsg->cmsg_type == SCM_RIGHTS) {
                        /* NOTE(review): only the first fd of the SCM_RIGHTS payload is picked up here;
                         * confirm that senders never attach more than one. */
                        assert(child_pidfd < 0);
                        child_pidfd = *CMSG_TYPED_DATA(cmsg, int);
                }
        }

        if (n != sizeof(child_pid)) {
                log_warning("Got pidref message of unexpected size %zi (expected %zu), ignoring.", n, sizeof(child_pid));
                return 0;
        }

        /* Verify and set parent pidref. */
        if (!ucred || !pid_is_valid(ucred->pid)) {
                log_warning("Received pidref message without valid credentials. Ignoring.");
                return 0;
        }

        /* Need to handle kernels without SO_PASSPIDFD where SCM_PIDFD will not be set. */
        if (parent_pidfd >= 0)
                r = pidref_set_pidfd_consume(&parent_pidref, TAKE_FD(parent_pidfd));
        else
                r = pidref_set_pid(&parent_pidref, ucred->pid);
        if (r < 0) {
                if (r == -ESRCH)
                        log_debug_errno(r, "PidRef parent process died before message is processed. Ignoring.");
                else
                        log_warning_errno(r, "Failed to pin pidref parent process, ignoring message: %m");
                return 0;
        }

        /* The pidfd and the credentials must describe the very same sender. */
        if (parent_pidref.pid != ucred->pid) {
                assert(parent_pidref.fd >= 0);
                log_warning("Got SCM_PIDFD for parent process " PID_FMT " but got SCM_CREDENTIALS for parent process " PID_FMT ". Ignoring.",
                            parent_pidref.pid, ucred->pid);
                return 0;
        }

        /* Verify and set child pidref. */
        if (!pid_is_valid(child_pid)) {
                log_warning("Received pidref message without valid child PID. Ignoring.");
                return 0;
        }

        /* Need to handle kernels without PIDFD support. */
        if (child_pidfd >= 0)
                r = pidref_set_pidfd_consume(&child_pidref, TAKE_FD(child_pidfd));
        else
                r = pidref_set_pid(&child_pidref, child_pid);
        if (r < 0) {
                if (r == -ESRCH)
                        log_debug_errno(r, "PidRef child process died before message is processed. Ignoring.");
                else
                        log_warning_errno(r, "Failed to pin pidref child process, ignoring message: %m");
                return 0;
        }

        /* Likewise, the pidfd passed for the child must match the PID in the payload. */
        if (child_pidref.pid != child_pid) {
                assert(child_pidref.fd >= 0);
                log_warning("Got SCM_RIGHTS for child process " PID_FMT " but PID in IOV message is " PID_FMT ". Ignoring.",
                            child_pidref.pid, child_pid);
                return 0;
        }

        log_debug("Got pidref event with parent PID " PID_FMT " and child PID " PID_FMT ".", parent_pidref.pid, child_pidref.pid);

        /* Try finding cgroup of parent process. But if parent process exited and we're not using PIDFD, this could return NULL.
         * Then fall back to finding cgroup of the child process. */
        Unit *u = manager_get_unit_by_pidref_cgroup(m, &parent_pidref);
        if (!u)
                u = manager_get_unit_by_pidref_cgroup(m, &child_pidref);
        if (!u) {
                log_debug("Got pidref for parent process " PID_FMT " and child process " PID_FMT " we are not interested in, ignoring.", parent_pidref.pid, child_pidref.pid);
                return 0;
        }

        if (!UNIT_VTABLE(u)->notify_pidref) {
                log_unit_warning(u, "Received pidref event from unexpected unit type '%s'.", unit_type_to_string(u->type));
                return 0;
        }

        UNIT_VTABLE(u)->notify_pidref(u, &parent_pidref, &child_pidref);

        return 0;
}
void manager_ref_console(Manager *m) {
assert(m);

View File

@ -289,6 +289,9 @@ struct Manager {
int handoff_timestamp_fds[2];
sd_event_source *handoff_timestamp_event_source;
int pidref_transport_fds[2];
sd_event_source *pidref_event_source;
RuntimeScope runtime_scope;
LookupPaths lookup_paths;
@ -678,12 +681,13 @@ void unit_defaults_done(UnitDefaults *defaults);
enum {
/* most important … */
EVENT_PRIORITY_USER_LOOKUP = SD_EVENT_PRIORITY_NORMAL-11,
EVENT_PRIORITY_MOUNT_TABLE = SD_EVENT_PRIORITY_NORMAL-10,
EVENT_PRIORITY_SWAP_TABLE = SD_EVENT_PRIORITY_NORMAL-10,
EVENT_PRIORITY_CGROUP_AGENT = SD_EVENT_PRIORITY_NORMAL-9, /* cgroupv1 */
EVENT_PRIORITY_CGROUP_INOTIFY = SD_EVENT_PRIORITY_NORMAL-9, /* cgroupv2 */
EVENT_PRIORITY_CGROUP_OOM = SD_EVENT_PRIORITY_NORMAL-8,
EVENT_PRIORITY_USER_LOOKUP = SD_EVENT_PRIORITY_NORMAL-12,
EVENT_PRIORITY_MOUNT_TABLE = SD_EVENT_PRIORITY_NORMAL-11,
EVENT_PRIORITY_SWAP_TABLE = SD_EVENT_PRIORITY_NORMAL-11,
EVENT_PRIORITY_CGROUP_AGENT = SD_EVENT_PRIORITY_NORMAL-10, /* cgroupv1 */
EVENT_PRIORITY_CGROUP_INOTIFY = SD_EVENT_PRIORITY_NORMAL-10, /* cgroupv2 */
EVENT_PRIORITY_CGROUP_OOM = SD_EVENT_PRIORITY_NORMAL-9,
EVENT_PRIORITY_PIDREF = SD_EVENT_PRIORITY_NORMAL-8,
EVENT_PRIORITY_HANDOFF_TIMESTAMP = SD_EVENT_PRIORITY_NORMAL-7,
EVENT_PRIORITY_EXEC_FD = SD_EVENT_PRIORITY_NORMAL-6,
EVENT_PRIORITY_NOTIFY = SD_EVENT_PRIORITY_NORMAL-5,

View File

@ -2061,7 +2061,8 @@ static bool namespace_parameters_mount_apivfs(const NamespaceParameters *p) {
p->protect_control_groups != PROTECT_CONTROL_GROUPS_NO ||
p->protect_kernel_tunables ||
p->protect_proc != PROTECT_PROC_DEFAULT ||
p->proc_subset != PROC_SUBSET_ALL;
p->proc_subset != PROC_SUBSET_ALL ||
p->private_pids != PRIVATE_PIDS_NO;
}
/* Walk all mount entries and dropping any unused mounts. This affects all
@ -3366,3 +3367,10 @@ static const char* const private_users_table[_PRIVATE_USERS_MAX] = {
};
DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(private_users, PrivateUsers, PRIVATE_USERS_SELF);
/* String representations of the PrivatePIDs enum values, indexed by enum value. */
static const char* const private_pids_table[_PRIVATE_PIDS_MAX] = {
[PRIVATE_PIDS_NO] = "no",
[PRIVATE_PIDS_YES] = "yes",
};
/* Generates the private_pids_to_string()/private_pids_from_string() lookup pair for the table above.
 * NOTE(review): presumably the third argument is the value that boolean "true" spellings are
 * parsed to - confirm against the macro definition. */
DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(private_pids, PrivatePIDs, PRIVATE_PIDS_YES);

View File

@ -78,6 +78,13 @@ typedef enum ProtectControlGroups {
_PROTECT_CONTROL_GROUPS_INVALID = -EINVAL,
} ProtectControlGroups;
/* Values accepted by the PrivatePIDs= setting. */
typedef enum PrivatePIDs {
/* Spelled "no" in the string table. */
PRIVATE_PIDS_NO,
/* Spelled "yes" in the string table. */
PRIVATE_PIDS_YES,
/* Number of regular values; used to size the string table. */
_PRIVATE_PIDS_MAX,
/* Sentinel for an invalid value, deliberately a negative errno. */
_PRIVATE_PIDS_INVALID = -EINVAL,
} PrivatePIDs;
struct BindMount {
char *source;
char *destination;
@ -182,6 +189,7 @@ struct NamespaceParameters {
ProtectProc protect_proc;
ProcSubset proc_subset;
PrivateTmp private_tmp;
PrivatePIDs private_pids;
};
int setup_namespace(const NamespaceParameters *p, char **reterr_path);
@ -225,6 +233,9 @@ PrivateUsers private_users_from_string(const char *s) _pure_;
const char* protect_control_groups_to_string(ProtectControlGroups i) _const_;
ProtectControlGroups protect_control_groups_from_string(const char *s) _pure_;
const char* private_pids_to_string(PrivatePIDs i) _const_;
PrivatePIDs private_pids_from_string(const char *s) _pure_;
void bind_mount_free_many(BindMount *b, size_t n);
int bind_mount_add(BindMount **b, size_t *n, const BindMount *item);

View File

@ -710,6 +710,9 @@ static int service_verify(Service *s) {
if (s->type == SERVICE_DBUS && !s->bus_name)
return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Service is of type D-Bus but no D-Bus service name has been specified. Refusing.");
if (s->type == SERVICE_FORKING && exec_needs_pid_namespace(&s->exec_context))
return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Service of Type=forking does not support PrivatePIDs=yes. Refusing.");
if (s->usb_function_descriptors && !s->usb_function_strings)
log_unit_warning(UNIT(s), "Service has USBFunctionDescriptors= setting, but no USBFunctionStrings=. Ignoring.");
@ -4908,6 +4911,35 @@ static void service_handoff_timestamp(
unit_add_to_dbus_queue(u);
}
/* notify_pidref() hook for service units: invoked when a process of this unit reports the child it
 * forked off. If the reporting parent is our current main or control process, the child takes over
 * that role and is watched from now on; reports from any other process are ignored. */
static void service_notify_pidref(Unit *u, PidRef *parent_pidref, PidRef *child_pidref) {
        Service *s = ASSERT_PTR(SERVICE(u));
        int r;

        assert(pidref_is_set(parent_pidref));
        assert(pidref_is_set(child_pidref));

        /* The main process takes precedence; the control process is only considered otherwise. */
        bool from_main = pidref_equal(&s->main_pid, parent_pidref);
        bool from_control = !from_main && pidref_equal(&s->control_pid, parent_pidref);

        if (!from_main && !from_control) {
                log_unit_debug(u, "Parent process " PID_FMT " does not match main or control processes, ignoring.", parent_pidref->pid);
                return;
        }

        if (from_main) {
                r = service_set_main_pidref(s, TAKE_PIDREF(*child_pidref), /* start_timestamp = */ NULL);
                if (r < 0) {
                        log_unit_warning_errno(u, r, "Failed to set new main pid: %m");
                        return;
                }

                /* The child is PID 1 of a fresh PID namespace, so it can only ever belong to this unit. */
                r = unit_watch_pidref(u, &s->main_pid, /* exclusive= */ true);
                if (r < 0)
                        log_unit_warning_errno(u, r, "Failed to watch new main PID " PID_FMT ": %m", s->main_pid.pid);
        } else {
                /* Stop tracking the old control process before adopting the new one. */
                service_unwatch_control_pid(s);
                s->control_pid = TAKE_PIDREF(*child_pidref);

                r = unit_watch_pidref(u, &s->control_pid, /* exclusive= */ true);
                if (r < 0)
                        log_unit_warning_errno(u, r, "Failed to watch new control PID " PID_FMT ": %m", s->control_pid.pid);
        }

        unit_add_to_dbus_queue(u);
}
static int service_get_timeout(Unit *u, usec_t *timeout) {
Service *s = ASSERT_PTR(SERVICE(u));
uint64_t t;
@ -5638,6 +5670,7 @@ const UnitVTable service_vtable = {
.notify_cgroup_oom = service_notify_cgroup_oom_event,
.notify_message = service_notify_message,
.notify_handoff_timestamp = service_handoff_timestamp,
.notify_pidref = service_notify_pidref,
.main_pid = service_main_pid,
.control_pid = service_control_pid,

View File

@ -4237,6 +4237,9 @@ static int unit_verify_contexts(const Unit *u) {
exec_needs_mount_namespace(ec, /* params = */ NULL, /* runtime = */ NULL))
return log_unit_error_errno(u, SYNTHETIC_ERRNO(ENOEXEC), "WorkingDirectory= may not be below /proc/, /sys/ or /dev/ when using mount namespacing. Refusing.");
if (exec_needs_pid_namespace(ec) && !UNIT_VTABLE(u)->notify_pidref)
return log_unit_error_errno(u, SYNTHETIC_ERRNO(ENOEXEC), "PrivatePIDs= setting is only supported for service units. Refusing.");
const KillContext *kc = unit_get_kill_context(u);
if (ec->pam_name && kc && !IN_SET(kc->kill_mode, KILL_CONTROL_GROUP, KILL_MIXED))
@ -5402,6 +5405,8 @@ int unit_set_exec_params(Unit *u, ExecParameters *p) {
p->user_lookup_fd = u->manager->user_lookup_fds[1];
p->handoff_timestamp_fd = u->manager->handoff_timestamp_fds[1];
if (UNIT_VTABLE(u)->notify_pidref)
p->pidref_transport_fd = u->manager->pidref_transport_fds[1];
p->cgroup_id = crt ? crt->cgroup_id : 0;
p->invocation_id = u->invocation_id;

View File

@ -640,6 +640,9 @@ typedef struct UnitVTable {
/* Called whenever we learn a handoff timestamp */
void (*notify_handoff_timestamp)(Unit *u, const struct ucred *ucred, const dual_timestamp *ts);
/* Called whenever we learn about a child process */
void (*notify_pidref)(Unit *u, PidRef *parent_pidref, PidRef *child_pidref);
/* Called whenever a name this Unit registered for comes or goes away. */
void (*bus_name_owner_change)(Unit *u, const char *new_owner);

View File

@ -1061,7 +1061,8 @@ static int bus_append_execute_property(sd_bus_message *m, const char *field, con
"LogNamespace",
"RootImagePolicy",
"MountImagePolicy",
"ExtensionImagePolicy"))
"ExtensionImagePolicy",
"PrivatePIDs"))
return bus_append_string(m, field, eq);
if (STR_IN_SET(field, "IgnoreSIGPIPE",

View File

@ -6,12 +6,17 @@ TEST_DESCRIPTION="Tests for core PID1 functionality"
# for testing PrivateNetwork=yes
NSPAWN_ARGUMENTS="--capability=CAP_NET_ADMIN"
# for testing PrivatePIDs=yes
TEST_INSTALL_VERITY_MINIMAL=1
# shellcheck source=test/test-functions
. "${TEST_BASE_DIR:?}/test-functions"
# Installs the extra tools and images this test needs into the test image.
test_append_files() {
    local squashfs_tool

    image_install logger socat

    # Both squashfs tools come along with the minimal verity images installed below.
    for squashfs_tool in mksquashfs unsquashfs; do
        inst_binary "$squashfs_tool"
    done

    install_verity_minimal
}
do_test "$@"

View File

@ -0,0 +1,161 @@
#!/usr/bin/env bash
# SPDX-License-Identifier: LGPL-2.1-or-later
# shellcheck disable=SC2016
# Test cases for the PrivatePIDs= service setting.
# Abort on the first failing command or unset variable, and trace every command.
set -eux
# Let a pipeline fail if any of its stages fails, not only the last one.
set -o pipefail
# shellcheck source=test/units/test-control.sh
. "$(dirname "$0")"/test-control.sh
# shellcheck source=test/units/util.sh
. "$(dirname "$0")"/util.sh
# Record whether /proc/scsi already was a mount point before the test touched anything: at_exit()
# and testcase_unpriv() only mount/unmount a tmpfs there themselves when this is "no".
HAS_EXISTING_SCSI_MOUNT=no
if findmnt --mountpoint /proc/scsi; then
HAS_EXISTING_SCSI_MOUNT=yes
fi
# EXIT trap handler: best-effort removal of everything the test cases may have left behind.
at_exit() {
    # Every step below is allowed to fail, since most leftovers will not exist on a clean run.
    set +e

    # Drop the mounts first. The tmpfs on /proc/scsi is only ours if nothing was mounted
    # there when the test started.
    [[ "$HAS_EXISTING_SCSI_MOUNT" == "no" ]] && umount /proc/scsi
    umount /tmp/TEST-07-PID1-private-pids-proc

    # Then remove the scratch directories and files.
    rm -rf /tmp/TEST-07-PID1-private-pids-proc \
           /tmp/TEST-07-PID1-private-pids-services \
           /tmp/TEST-07-PID1-private-pids-root

    # Make sure the long-running test service is gone ...
    systemctl kill --signal=KILL TEST-07-PID1-private-pid.service

    # ... and forget about any transient units that ended up failed.
    systemctl reset-failed
}
trap at_exit EXIT
# Basic behaviour of PrivatePIDs=yes: the service runs as PID 1 of its own PID namespace, sees
# only its own processes, gets a fresh /proc/, and the manager tracks the right main PID.
testcase_basic() {
    # The executed process must be PID 1 inside the new namespace
    assert_eq "$(systemd-run -p PrivatePIDs=yes --wait --pipe readlink /proc/self)" "1"

    # ... and it must be the only process visible in there
    assert_eq "$(systemd-run -p PrivatePIDs=yes --wait --pipe ps aux --no-heading | wc -l)" "1"

    # /proc/ must be mounted with the expected options
    systemd-run -p PrivatePIDs=yes --wait --pipe \
        bash -xec '[[ "$$(findmnt --mountpoint /proc --noheadings -o VFS-OPTIONS)" =~ rw ]];
                   [[ "$$(findmnt --mountpoint /proc --noheadings -o VFS-OPTIONS)" =~ nosuid ]];
                   [[ "$$(findmnt --mountpoint /proc --noheadings -o VFS-OPTIONS)" =~ nodev ]];
                   [[ "$$(findmnt --mountpoint /proc --noheadings -o VFS-OPTIONS)" =~ noexec ]];'

    # The main PID known to the manager must be the process living inside the namespace
    systemd-run -p PrivatePIDs=yes --remain-after-exit --unit TEST-07-PID1-private-pid sleep infinity

    # Spawning the service races against the actual exec of the child process, hence give
    # ExecMainPID some time to be populated with its final value
    sleep 2

    pid="$(systemctl show -P ExecMainPID TEST-07-PID1-private-pid.service)"
    kill -KILL "$pid"

    # Killing that PID must take the whole service down, and must be accounted as such
    timeout 10s bash -xec 'while [[ "$(systemctl show -P SubState TEST-07-PID1-private-pid.service)" != "failed" ]]; do sleep .5; done'
    assert_eq "$(systemctl show -P Result TEST-07-PID1-private-pid.service)" "signal"
    assert_eq "$(systemctl show -P ExecMainStatus TEST-07-PID1-private-pid.service)" "9"

    systemctl reset-failed
}
# Unit file verification: PrivatePIDs=yes is accepted for Type=oneshot but refused for Type=forking.
testcase_analyze() {
    local services_dir=/tmp/TEST-07-PID1-private-pids-services

    mkdir -p "$services_dir"

    # A service type that is compatible with PrivatePIDs=yes
    printf '%s\n' \
        '[Service]' \
        'ExecStart=echo hello' \
        'PrivatePIDs=yes' \
        'Type=oneshot' >"$services_dir/oneshot-valid.service"

    # Type=forking is not compatible with PrivatePIDs=yes
    printf '%s\n' \
        '[Service]' \
        'ExecStart=echo hello' \
        'PrivatePIDs=yes' \
        'Type=forking' >"$services_dir/forking-invalid.service"

    systemd-analyze --recursive-errors=no verify "$services_dir/oneshot-valid.service"
    (! systemd-analyze --recursive-errors=no verify "$services_dir/forking-invalid.service")

    rm -rf "$services_dir"
}
# Combines PrivatePIDs=yes with a broad set of other sandboxing settings to make sure they can all
# be applied together. The service runs inside a root directory unpacked from the minimal test
# image and succeeds if it can find the expected marker in the image's /etc/os-release.
testcase_multiple_features() {
    unsquashfs -no-xattrs -d /tmp/TEST-07-PID1-private-pids-root /usr/share/minimal_0.raw

    # Every continued line ends in " \": without the blank before the backslash the argument
    # would be glued to the next line whenever that line does not start with whitespace.
    systemd-run \
        -p PrivatePIDs=yes \
        -p RootDirectory=/tmp/TEST-07-PID1-private-pids-root \
        -p ProcSubset=pid \
        -p BindReadOnlyPaths=/usr/share \
        -p NoNewPrivileges=yes \
        -p ProtectSystem=strict \
        -p User=testuser \
        -p Group=testuser \
        -p RuntimeDirectory=abc \
        -p StateDirectory=qed \
        -p InaccessiblePaths=/usr/include \
        -p TemporaryFileSystem=/home \
        -p PrivateTmp=yes \
        -p PrivateDevices=yes \
        -p PrivateNetwork=yes \
        -p PrivateUsersEx=self \
        -p PrivateIPC=yes \
        -p ProtectHostname=yes \
        -p ProtectClock=yes \
        -p ProtectKernelTunables=yes \
        -p ProtectKernelModules=yes \
        -p ProtectKernelLogs=yes \
        -p ProtectControlGroupsEx=private \
        -p LockPersonality=yes \
        -p Environment=ABC=QED \
        --wait \
        --pipe \
        grep MARKER=1 /etc/os-release

    rm -rf /tmp/TEST-07-PID1-private-pids-root
}
# PrivatePIDs=yes for units of an unprivileged per-user manager: works as long as an unmasked
# /proc/ instance is visible, and fails (rather than silently degrading) when there is none.
testcase_unpriv() {
    local scratch_proc=/tmp/TEST-07-PID1-private-pids-proc

    if [[ ! -f /usr/lib/systemd/user/dbus.socket && ! -f /etc/systemd/user/dbus.socket ]]; then
        echo "Per-user instances are not supported, skipping unprivileged PrivatePIDs=yes test"
        return 0
    fi

    if [[ "$(sysctl -ne kernel.apparmor_restrict_unprivileged_userns)" -eq 1 ]]; then
        echo "Cannot create unprivileged user namespaces, skipping unprivileged PrivatePIDs=yes test"
        return 0
    fi

    # Unprivileged user namespaces are not allowed by the kernel to mount an instance of /proc/ that
    # is less restrictive than what they can already see. Hence, if /proc/ is masked (systemd-nspawn
    # for example over-mounts /proc/kmsg with a tmpfs), mounting a new /proc/ fails and the host's
    # /proc/ stays visible. To make the test work in a VM as well as in nspawn, provide a fresh,
    # unmasked proc instance on a temporary directory, which satisfies that kernel restriction.
    mkdir -p "$scratch_proc"
    mount -t proc proc "$scratch_proc"

    # An unprivileged user must be able to unshare the PID namespace and get /proc/ mounted properly.
    assert_eq "$(runas testuser systemd-run --wait --user --pipe -p PrivatePIDs=yes readlink /proc/self)" "1"
    assert_eq "$(runas testuser systemd-run --wait --user --pipe -p PrivatePIDs=yes ps aux --no-heading | wc -l)" "1"

    umount "$scratch_proc"
    rm -rf "$scratch_proc"

    # Now the masked case: the unit has to fail, because PrivatePIDs=yes has no graceful fallback.
    if [[ "$HAS_EXISTING_SCSI_MOUNT" == "no" ]]; then
        mount -t tmpfs tmpfs /proc/scsi
    fi

    (! runas testuser systemd-run --wait --user --pipe -p PrivatePIDs=yes true)

    if [[ "$HAS_EXISTING_SCSI_MOUNT" == "no" ]]; then
        umount /proc/scsi
    fi
}
run_testcases