mirror of
https://github.com/systemd/systemd.git
synced 2025-03-24 14:50:17 +03:00
Add PrivatePIDs= (continued) (#34940)
This commit is contained in:
commit
7af37f3a90
5
NEWS
5
NEWS
@ -254,6 +254,11 @@ CHANGES WITH 257 in spe:
|
||||
the "nobody" user to the dynamic user, rather than via recursive
|
||||
chown()ing.
|
||||
|
||||
* A new service property PrivatePIDs= has been added that runs executed
|
||||
processes as PID 1 - the init process - within their own PID namespace.
|
||||
PrivatePIDs= also mounts /proc/ so only processes within the new PID
|
||||
namespace are visible.
|
||||
|
||||
systemd-udevd:
|
||||
|
||||
* udev rules now set 'uaccess' for /dev/udmabuf, giving locally
|
||||
|
@ -3263,6 +3263,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
|
||||
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
|
||||
readonly b PrivateIPC = ...;
|
||||
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
|
||||
readonly s PrivatePIDs = '...';
|
||||
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
|
||||
readonly s ProtectHome = '...';
|
||||
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
|
||||
readonly s ProtectSystem = '...';
|
||||
@ -4584,6 +4586,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
|
||||
|
||||
<variablelist class="dbus-property" generated="True" extra-ref="PrivateIPC"/>
|
||||
|
||||
<variablelist class="dbus-property" generated="True" extra-ref="PrivatePIDs"/>
|
||||
|
||||
<variablelist class="dbus-property" generated="True" extra-ref="ProtectHome"/>
|
||||
|
||||
<variablelist class="dbus-property" generated="True" extra-ref="ProtectSystem"/>
|
||||
@ -4870,6 +4874,11 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
|
||||
<citerefentry><refentrytitle>systemd.exec</refentrytitle><manvolnum>5</manvolnum></citerefentry>.
|
||||
Unlike boolean <varname>ProtectControlGroups</varname>, <varname>ProtectControlGroupsEx</varname>
|
||||
is a string type.</para>
|
||||
|
||||
<para><varname>PrivatePIDs</varname> implements the destination parameter of the
|
||||
unit file setting <varname>PrivatePIDs=</varname> listed in
|
||||
<citerefentry><refentrytitle>systemd.exec</refentrytitle><manvolnum>5</manvolnum></citerefentry>.
|
||||
Note that <varname>PrivatePIDs</varname> is a string type to allow adding more values in the future.</para>
|
||||
</refsect2>
|
||||
</refsect1>
|
||||
|
||||
@ -5439,6 +5448,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
|
||||
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
|
||||
readonly b PrivateIPC = ...;
|
||||
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
|
||||
readonly s PrivatePIDs = '...';
|
||||
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
|
||||
readonly s ProtectHome = '...';
|
||||
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
|
||||
readonly s ProtectSystem = '...';
|
||||
@ -6744,6 +6755,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
|
||||
|
||||
<variablelist class="dbus-property" generated="True" extra-ref="PrivateIPC"/>
|
||||
|
||||
<variablelist class="dbus-property" generated="True" extra-ref="PrivatePIDs"/>
|
||||
|
||||
<variablelist class="dbus-property" generated="True" extra-ref="ProtectHome"/>
|
||||
|
||||
<variablelist class="dbus-property" generated="True" extra-ref="ProtectSystem"/>
|
||||
@ -7442,6 +7455,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
|
||||
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
|
||||
readonly b PrivateIPC = ...;
|
||||
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
|
||||
readonly s PrivatePIDs = '...';
|
||||
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
|
||||
readonly s ProtectHome = '...';
|
||||
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
|
||||
readonly s ProtectSystem = '...';
|
||||
@ -8585,6 +8600,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
|
||||
|
||||
<variablelist class="dbus-property" generated="True" extra-ref="PrivateIPC"/>
|
||||
|
||||
<variablelist class="dbus-property" generated="True" extra-ref="PrivatePIDs"/>
|
||||
|
||||
<variablelist class="dbus-property" generated="True" extra-ref="ProtectHome"/>
|
||||
|
||||
<variablelist class="dbus-property" generated="True" extra-ref="ProtectSystem"/>
|
||||
@ -9412,6 +9429,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
|
||||
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
|
||||
readonly b PrivateIPC = ...;
|
||||
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
|
||||
readonly s PrivatePIDs = '...';
|
||||
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
|
||||
readonly s ProtectHome = '...';
|
||||
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
|
||||
readonly s ProtectSystem = '...';
|
||||
@ -10527,6 +10546,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
|
||||
|
||||
<variablelist class="dbus-property" generated="True" extra-ref="PrivateIPC"/>
|
||||
|
||||
<variablelist class="dbus-property" generated="True" extra-ref="PrivatePIDs"/>
|
||||
|
||||
<variablelist class="dbus-property" generated="True" extra-ref="ProtectHome"/>
|
||||
|
||||
<variablelist class="dbus-property" generated="True" extra-ref="ProtectSystem"/>
|
||||
@ -12281,8 +12302,9 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
|
||||
<varname>ExtraFileDescriptorNames</varname>,
|
||||
<varname>ManagedOOMMemoryPressureDurationUSec</varname>,
|
||||
<varname>BindLogSockets</varname>,
|
||||
<varname>ProtectControlGroupsEx</varname>, and
|
||||
<varname>PrivateUsersEx</varname> were added in version 257.</para>
|
||||
<varname>ProtectControlGroupsEx</varname>,
|
||||
<varname>PrivateUsersEx</varname>, and
|
||||
<varname>PrivatePIDs</varname> were added in version 257.</para>
|
||||
</refsect2>
|
||||
<refsect2>
|
||||
<title>Socket Unit Objects</title>
|
||||
@ -12323,8 +12345,9 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
|
||||
<varname>ImportCredentialEx</varname>,
|
||||
<varname>BindLogSockets</varname>,
|
||||
<varname>PrivateUsersEx</varname>,
|
||||
<varname>ManagedOOMMemoryPressureDurationUSec</varname>, and
|
||||
<varname>ProtectControlGroupsEx</varname> were added in version 257.</para>
|
||||
<varname>ManagedOOMMemoryPressureDurationUSec</varname>,
|
||||
<varname>ProtectControlGroupsEx</varname>, and
|
||||
<varname>PrivatePIDs</varname> were added in version 257.</para>
|
||||
</refsect2>
|
||||
<refsect2>
|
||||
<title>Mount Unit Objects</title>
|
||||
@ -12362,8 +12385,9 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
|
||||
<varname>ImportCredentialEx</varname>,
|
||||
<varname>BindLogSockets</varname>,
|
||||
<varname>PrivateUsersEx</varname>,
|
||||
<varname>ManagedOOMMemoryPressureDurationUSec</varname>, and
|
||||
<varname>ProtectControlGroupsEx</varname> were added in version 257.</para>
|
||||
<varname>ManagedOOMMemoryPressureDurationUSec</varname>,
|
||||
<varname>ProtectControlGroupsEx</varname>, and
|
||||
<varname>PrivatePIDs</varname> were added in version 257.</para>
|
||||
</refsect2>
|
||||
<refsect2>
|
||||
<title>Swap Unit Objects</title>
|
||||
@ -12401,8 +12425,9 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
|
||||
<varname>ImportCredentialEx</varname>,
|
||||
<varname>BindLogSockets</varname>,
|
||||
<varname>PrivateUsersEx</varname>,
|
||||
<varname>ManagedOOMMemoryPressureDurationUSec</varname>, and
|
||||
<varname>ProtectControlGroupsEx</varname> were added in version 257.</para>
|
||||
<varname>ManagedOOMMemoryPressureDurationUSec</varname>,
|
||||
<varname>ProtectControlGroupsEx</varname>, and
|
||||
<varname>PrivatePIDs</varname> were added in version 257.</para>
|
||||
</refsect2>
|
||||
<refsect2>
|
||||
<title>Slice Unit Objects</title>
|
||||
|
@ -1976,6 +1976,30 @@ BindReadOnlyPaths=/var/lib/systemd</programlisting>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><varname>PrivatePIDs=</varname></term>
|
||||
|
||||
<listitem><para>Takes a boolean argument. Defaults to false. If enabled, sets up a new PID namespace
|
||||
for the executed processes. Each executed process is now PID 1 - the init process - in the new namespace.
|
||||
<filename>/proc/</filename> is mounted such that only processes in the PID namespace are visible.
|
||||
If <varname>PrivatePIDs=</varname> is set, <varname>MountAPIVFS=yes</varname> is implied.</para>
|
||||
|
||||
<para><varname>PrivatePIDs=</varname> is only supported for service units. This setting is not supported
|
||||
with <varname>Type=forking</varname> since the kernel will kill all processes in the PID namespace if
|
||||
the init process terminates.</para>
|
||||
|
||||
<para>This setting will be ignored if the kernel does not support PID namespaces.</para>
|
||||
|
||||
<para>Note that unprivileged user services (i.e. services run by the per-user instance of the service manager)
|
||||
will fail with <varname>PrivatePIDs=yes</varname> if <filename>/proc/</filename> is masked
|
||||
(i.e. <filename>/proc/kmsg</filename> is over-mounted with <constant>tmpfs</constant> like
|
||||
<citerefentry><refentrytitle>systemd-nspawn</refentrytitle><manvolnum>1</manvolnum></citerefentry> does).
|
||||
This is due to a kernel restriction not allowing unprivileged user namespaces to mount a less restrictive
|
||||
instance of <filename>/proc/</filename>.</para>
|
||||
|
||||
<xi:include href="version-info.xml" xpointer="v257"/></listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><varname>PrivateUsers=</varname></term>
|
||||
|
||||
|
@ -1521,11 +1521,12 @@ int safe_fork_full(
|
||||
}
|
||||
}
|
||||
|
||||
if ((flags & (FORK_NEW_MOUNTNS|FORK_NEW_USERNS|FORK_NEW_NETNS)) != 0)
|
||||
if ((flags & (FORK_NEW_MOUNTNS|FORK_NEW_USERNS|FORK_NEW_NETNS|FORK_NEW_PIDNS)) != 0)
|
||||
pid = raw_clone(SIGCHLD|
|
||||
(FLAGS_SET(flags, FORK_NEW_MOUNTNS) ? CLONE_NEWNS : 0) |
|
||||
(FLAGS_SET(flags, FORK_NEW_USERNS) ? CLONE_NEWUSER : 0) |
|
||||
(FLAGS_SET(flags, FORK_NEW_NETNS) ? CLONE_NEWNET : 0));
|
||||
(FLAGS_SET(flags, FORK_NEW_NETNS) ? CLONE_NEWNET : 0) |
|
||||
(FLAGS_SET(flags, FORK_NEW_PIDNS) ? CLONE_NEWPID : 0));
|
||||
else
|
||||
pid = fork();
|
||||
if (pid < 0)
|
||||
|
@ -166,7 +166,7 @@ int must_be_root(void);
|
||||
|
||||
pid_t clone_with_nested_stack(int (*fn)(void *), int flags, void *userdata);
|
||||
|
||||
/* 💣 Note that FORK_NEW_USERNS, FORK_NEW_MOUNTNS, or FORK_NEW_NETNS should not be called in threaded
|
||||
/* 💣 Note that FORK_NEW_USERNS, FORK_NEW_MOUNTNS, FORK_NEW_NETNS or FORK_NEW_PIDNS should not be used in threaded
|
||||
* programs, because they cause us to use raw_clone() which does not synchronize the glibc malloc() locks,
|
||||
* and thus will cause deadlocks if the parent uses threads and the child does memory allocations. Hence: if
|
||||
* the parent is threaded these flags may not be used. These flags cannot be used if the parent uses threads
|
||||
@ -181,18 +181,19 @@ typedef enum ForkFlags {
|
||||
FORK_REOPEN_LOG = 1 << 6, /* Reopen log connection */
|
||||
FORK_LOG = 1 << 7, /* Log above LOG_DEBUG log level about failures */
|
||||
FORK_WAIT = 1 << 8, /* Wait until child exited */
|
||||
FORK_NEW_MOUNTNS = 1 << 9, /* Run child in its own mount namespace 💣 DO NOT USE IN THREADED PROGRAMS! 💣 */
|
||||
FORK_MOUNTNS_SLAVE = 1 << 10, /* Make child's mount namespace MS_SLAVE */
|
||||
FORK_PRIVATE_TMP = 1 << 11, /* Mount new /tmp/ in the child (combine with FORK_NEW_MOUNTNS!) */
|
||||
FORK_RLIMIT_NOFILE_SAFE = 1 << 12, /* Set RLIMIT_NOFILE soft limit to 1K for select() compat */
|
||||
FORK_STDOUT_TO_STDERR = 1 << 13, /* Make stdout a copy of stderr */
|
||||
FORK_FLUSH_STDIO = 1 << 14, /* fflush() stdout (and stderr) before forking */
|
||||
FORK_NEW_USERNS = 1 << 15, /* Run child in its own user namespace 💣 DO NOT USE IN THREADED PROGRAMS! 💣 */
|
||||
FORK_CLOEXEC_OFF = 1 << 16, /* In the child: turn off O_CLOEXEC on all fds in except_fds[] */
|
||||
FORK_KEEP_NOTIFY_SOCKET = 1 << 17, /* Unless this specified, $NOTIFY_SOCKET will be unset. */
|
||||
FORK_DETACH = 1 << 18, /* Double fork if needed to ensure PID1/subreaper is parent */
|
||||
FORK_NEW_NETNS = 1 << 19, /* Run child in its own network namespace 💣 DO NOT USE IN THREADED PROGRAMS! 💣 */
|
||||
FORK_PACK_FDS = 1 << 20, /* Rearrange the passed FDs to be FD 3,4,5,etc. Updates the array in place (combine with FORK_CLOSE_ALL_FDS!) */
|
||||
FORK_MOUNTNS_SLAVE = 1 << 9, /* Make child's mount namespace MS_SLAVE */
|
||||
FORK_PRIVATE_TMP = 1 << 10, /* Mount new /tmp/ in the child (combine with FORK_NEW_MOUNTNS!) */
|
||||
FORK_RLIMIT_NOFILE_SAFE = 1 << 11, /* Set RLIMIT_NOFILE soft limit to 1K for select() compat */
|
||||
FORK_STDOUT_TO_STDERR = 1 << 12, /* Make stdout a copy of stderr */
|
||||
FORK_FLUSH_STDIO = 1 << 13, /* fflush() stdout (and stderr) before forking */
|
||||
FORK_CLOEXEC_OFF = 1 << 14, /* In the child: turn off O_CLOEXEC on all fds in except_fds[] */
|
||||
FORK_KEEP_NOTIFY_SOCKET = 1 << 15, /* Unless this is specified, $NOTIFY_SOCKET will be unset. */
|
||||
FORK_DETACH = 1 << 16, /* Double fork if needed to ensure PID1/subreaper is parent */
|
||||
FORK_PACK_FDS = 1 << 17, /* Rearrange the passed FDs to be FD 3,4,5,etc. Updates the array in place (combine with FORK_CLOSE_ALL_FDS!) */
|
||||
FORK_NEW_MOUNTNS = 1 << 18, /* Run child in its own mount namespace 💣 DO NOT USE IN THREADED PROGRAMS! 💣 */
|
||||
FORK_NEW_USERNS = 1 << 19, /* Run child in its own user namespace 💣 DO NOT USE IN THREADED PROGRAMS! 💣 */
|
||||
FORK_NEW_NETNS = 1 << 20, /* Run child in its own network namespace 💣 DO NOT USE IN THREADED PROGRAMS! 💣 */
|
||||
FORK_NEW_PIDNS = 1 << 21, /* Run child in its own PID namespace 💣 DO NOT USE IN THREADED PROGRAMS! 💣 */
|
||||
} ForkFlags;
|
||||
|
||||
int safe_fork_full(
|
||||
|
@ -63,6 +63,7 @@ static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_empty_string, "s", NULL);
|
||||
static BUS_DEFINE_PROPERTY_GET_REF(property_get_private_tmp_ex, "s", PrivateTmp, private_tmp_to_string);
|
||||
static BUS_DEFINE_PROPERTY_GET_REF(property_get_private_users_ex, "s", PrivateUsers, private_users_to_string);
|
||||
static BUS_DEFINE_PROPERTY_GET_REF(property_get_protect_control_groups_ex, "s", ProtectControlGroups, protect_control_groups_to_string);
|
||||
static BUS_DEFINE_PROPERTY_GET_REF(property_get_private_pids, "s", PrivatePIDs, private_pids_to_string);
|
||||
static BUS_DEFINE_PROPERTY_GET_REF(property_get_syslog_level, "i", int, LOG_PRI);
|
||||
static BUS_DEFINE_PROPERTY_GET_REF(property_get_syslog_facility, "i", int, LOG_FAC);
|
||||
static BUS_DEFINE_PROPERTY_GET(property_get_cpu_affinity_from_numa, "b", ExecContext, exec_context_get_cpu_affinity_from_numa);
|
||||
@ -1194,6 +1195,7 @@ const sd_bus_vtable bus_exec_vtable[] = {
|
||||
SD_BUS_PROPERTY("PrivateUsersEx", "s", property_get_private_users_ex, offsetof(ExecContext, private_users), SD_BUS_VTABLE_PROPERTY_CONST),
|
||||
SD_BUS_PROPERTY("PrivateMounts", "b", bus_property_get_tristate, offsetof(ExecContext, private_mounts), SD_BUS_VTABLE_PROPERTY_CONST),
|
||||
SD_BUS_PROPERTY("PrivateIPC", "b", bus_property_get_bool, offsetof(ExecContext, private_ipc), SD_BUS_VTABLE_PROPERTY_CONST),
|
||||
SD_BUS_PROPERTY("PrivatePIDs", "s", property_get_private_pids, offsetof(ExecContext, private_pids), SD_BUS_VTABLE_PROPERTY_CONST),
|
||||
SD_BUS_PROPERTY("ProtectHome", "s", property_get_protect_home, offsetof(ExecContext, protect_home), SD_BUS_VTABLE_PROPERTY_CONST),
|
||||
SD_BUS_PROPERTY("ProtectSystem", "s", property_get_protect_system, offsetof(ExecContext, protect_system), SD_BUS_VTABLE_PROPERTY_CONST),
|
||||
SD_BUS_PROPERTY("SameProcessGroup", "b", bus_property_get_bool, offsetof(ExecContext, same_pgrp), SD_BUS_VTABLE_PROPERTY_CONST),
|
||||
@ -1970,6 +1972,27 @@ int bus_exec_context_set_transient_property(
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (streq(name, "PrivatePIDs")) {
|
||||
const char *s;
|
||||
PrivatePIDs t;
|
||||
|
||||
r = sd_bus_message_read(message, "s", &s);
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
t = private_pids_from_string(s);
|
||||
if (t < 0)
|
||||
return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid %s setting: %s", name, s);
|
||||
|
||||
if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
|
||||
c->private_pids = t;
|
||||
(void) unit_write_settingf(u, flags, name, "%s=%s",
|
||||
name, private_pids_to_string(c->private_pids));
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (streq(name, "PrivateDevices"))
|
||||
return bus_set_transient_bool(u, name, &c->private_devices, message, flags, error);
|
||||
|
||||
|
@ -2175,14 +2175,14 @@ static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogi
|
||||
fd = open(a, O_WRONLY|O_CLOEXEC);
|
||||
if (fd < 0) {
|
||||
if (errno != ENOENT) {
|
||||
r = -errno;
|
||||
r = log_debug_errno(errno, "Failed to open %s: %m", a);
|
||||
goto child_fail;
|
||||
}
|
||||
|
||||
/* If the file is missing the kernel is too old, let's continue anyway. */
|
||||
} else {
|
||||
if (write(fd, "deny\n", 5) < 0) {
|
||||
r = -errno;
|
||||
r = log_debug_errno(errno, "Failed to write \"deny\" to %s: %m", a);
|
||||
goto child_fail;
|
||||
}
|
||||
|
||||
@ -2193,11 +2193,11 @@ static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogi
|
||||
a = procfs_file_alloca(ppid, "gid_map");
|
||||
fd = open(a, O_WRONLY|O_CLOEXEC);
|
||||
if (fd < 0) {
|
||||
r = -errno;
|
||||
r = log_debug_errno(errno, "Failed to open %s: %m", a);
|
||||
goto child_fail;
|
||||
}
|
||||
if (write(fd, gid_map, strlen(gid_map)) < 0) {
|
||||
r = -errno;
|
||||
r = log_debug_errno(errno, "Failed to write GID map to %s: %m", a);
|
||||
goto child_fail;
|
||||
}
|
||||
fd = safe_close(fd);
|
||||
@ -2206,11 +2206,11 @@ static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogi
|
||||
a = procfs_file_alloca(ppid, "uid_map");
|
||||
fd = open(a, O_WRONLY|O_CLOEXEC);
|
||||
if (fd < 0) {
|
||||
r = -errno;
|
||||
r = log_debug_errno(errno, "Failed to open %s: %m", a);
|
||||
goto child_fail;
|
||||
}
|
||||
if (write(fd, uid_map, strlen(uid_map)) < 0) {
|
||||
r = -errno;
|
||||
r = log_debug_errno(errno, "Failed to write UID map to %s: %m", a);
|
||||
goto child_fail;
|
||||
}
|
||||
|
||||
@ -2224,7 +2224,7 @@ static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogi
|
||||
errno_pipe[1] = safe_close(errno_pipe[1]);
|
||||
|
||||
if (unshare(CLONE_NEWUSER) < 0)
|
||||
return -errno;
|
||||
return log_debug_errno(errno, "Failed to unshare user namespace: %m");
|
||||
|
||||
/* Let the child know that the namespace is ready now */
|
||||
if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
|
||||
@ -2251,6 +2251,130 @@ static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogi
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int can_mount_proc(const ExecContext *c, ExecParameters *p) {
|
||||
_cleanup_close_pair_ int errno_pipe[2] = EBADF_PAIR;
|
||||
_cleanup_(sigkill_waitp) pid_t pid = 0;
|
||||
ssize_t n;
|
||||
int r;
|
||||
|
||||
assert(c);
|
||||
assert(p);
|
||||
|
||||
/* If running via unprivileged user manager and /proc/ is masked (e.g. /proc/kmsg is over-mounted with tmpfs
|
||||
* like systemd-nspawn does), then mounting /proc/ will fail with EPERM. This is due to a kernel restriction
|
||||
* where unprivileged user namespaces cannot mount a less restrictive instance of /proc. */
|
||||
|
||||
/* Create a communication channel so that the child can tell the parent a proper error code in case it
|
||||
* failed. */
|
||||
if (pipe2(errno_pipe, O_CLOEXEC) < 0)
|
||||
return log_exec_debug_errno(c, p, errno, "Failed to create pipe for communicating with child process (sd-proc-check): %m");
|
||||
|
||||
/* Fork a child process into its own mount and PID namespace. Note safe_fork() already remounts / as SLAVE
|
||||
* with FORK_MOUNTNS_SLAVE. */
|
||||
r = safe_fork("(sd-proc-check)",
|
||||
FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGKILL|FORK_NEW_MOUNTNS|FORK_MOUNTNS_SLAVE|FORK_NEW_PIDNS, &pid);
|
||||
if (r < 0)
|
||||
return log_exec_debug_errno(c, p, r, "Failed to fork child process (sd-proc-check): %m");
|
||||
if (r == 0) {
|
||||
errno_pipe[0] = safe_close(errno_pipe[0]);
|
||||
|
||||
/* Try mounting /proc on /dev/shm/. No need to clean up the mount since the mount
|
||||
* namespace will be cleaned up once the process exits. */
|
||||
r = mount_follow_verbose(LOG_DEBUG, "proc", "/dev/shm/", "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
|
||||
if (r < 0) {
|
||||
(void) write(errno_pipe[1], &r, sizeof(r));
|
||||
_exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
_exit(EXIT_SUCCESS);
|
||||
}
|
||||
|
||||
errno_pipe[1] = safe_close(errno_pipe[1]);
|
||||
|
||||
/* Try to read an error code from the child */
|
||||
n = read(errno_pipe[0], &r, sizeof(r));
|
||||
if (n < 0)
|
||||
return log_exec_debug_errno(c, p, errno, "Failed to read errno from pipe with child process (sd-proc-check): %m");
|
||||
if (n == sizeof(r)) { /* an error code was sent to us */
|
||||
/* This is the expected case where proc cannot be mounted due to permissions. */
|
||||
if (ERRNO_IS_NEG_PRIVILEGE(r))
|
||||
return 0;
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
return -EIO;
|
||||
}
|
||||
if (n != 0) /* on success we should have read 0 bytes */
|
||||
return -EIO;
|
||||
|
||||
r = wait_for_terminate_and_check("(sd-proc-check)", TAKE_PID(pid), 0 /* flags= */);
|
||||
if (r < 0)
|
||||
return log_exec_debug_errno(c, p, r, "Failed to wait for (sd-proc-check) child process to terminate: %m");
|
||||
if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
|
||||
return log_exec_debug_errno(c, p, SYNTHETIC_ERRNO(EIO), "Child process (sd-proc-check) exited with unexpected exit status '%d'.", r);
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int setup_private_pids(const ExecContext *c, ExecParameters *p) {
|
||||
_cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
|
||||
_cleanup_close_pair_ int errno_pipe[2] = EBADF_PAIR;
|
||||
ssize_t n;
|
||||
int r, q;
|
||||
|
||||
assert(c);
|
||||
assert(p);
|
||||
assert(p->pidref_transport_fd >= 0);
|
||||
|
||||
/* The first process created after unsharing a pid namespace becomes PID 1 in the pid namespace, so
|
||||
* we have to fork after unsharing the pid namespace to become PID 1. The parent sends the child
|
||||
* pidref to the manager and exits while the child process continues with the rest of exec_invoke()
|
||||
* and finally executes the actual payload. */
|
||||
|
||||
/* Create a communication channel so that the parent can tell the child a proper error code in case it
|
||||
* failed to send child pidref to the manager. */
|
||||
if (pipe2(errno_pipe, O_CLOEXEC) < 0)
|
||||
return log_exec_debug_errno(c, p, errno, "Failed to create pipe for communicating with parent process: %m");
|
||||
|
||||
r = pidref_safe_fork("(sd-pidns-child)", FORK_NEW_PIDNS, &pidref);
|
||||
if (r < 0)
|
||||
return log_exec_debug_errno(c, p, r, "Failed to fork child into new pid namespace: %m");
|
||||
if (r > 0) {
|
||||
errno_pipe[0] = safe_close(errno_pipe[0]);
|
||||
|
||||
/* In the parent process, we send the child pidref to the manager and exit.
|
||||
* If PIDFD is not supported, only the child PID is sent. The server then
|
||||
* uses the child PID to set the new exec main process. */
|
||||
q = send_one_fd_iov(
|
||||
p->pidref_transport_fd,
|
||||
pidref.fd,
|
||||
&IOVEC_MAKE(&pidref.pid, sizeof(pidref.pid)),
|
||||
/*iovlen=*/ 1,
|
||||
/*flags=*/ 0);
|
||||
/* Send error code to child process. */
|
||||
(void) write(errno_pipe[1], &q, sizeof(q));
|
||||
/* Exit here so we only go through the destructors in exec_invoke only once - in the child - as
|
||||
* some destructors have external effects. The main codepaths continue in the child process. */
|
||||
_exit(q < 0 ? EXIT_FAILURE : EXIT_SUCCESS);
|
||||
}
|
||||
|
||||
errno_pipe[1] = safe_close(errno_pipe[1]);
|
||||
p->pidref_transport_fd = safe_close(p->pidref_transport_fd);
|
||||
|
||||
/* Try to read an error code from the parent. Note a child process cannot wait for the parent so we always
|
||||
* receive an errno even on success. */
|
||||
n = read(errno_pipe[0], &r, sizeof(r));
|
||||
if (n < 0)
|
||||
return log_exec_debug_errno(c, p, errno, "Failed to read errno from pipe with parent process: %m");
|
||||
if (n != sizeof(r))
|
||||
return log_exec_debug_errno(c, p, SYNTHETIC_ERRNO(EIO), "Failed to read enough bytes from pipe with parent process");
|
||||
if (r < 0)
|
||||
return log_exec_debug_errno(c, p, r, "Failed to send child pidref to manager: %m");
|
||||
|
||||
/* NOTE! This function returns in the child process only. */
|
||||
return r;
|
||||
}
|
||||
|
||||
static int create_many_symlinks(const char *root, const char *source, char **symlinks) {
|
||||
_cleanup_free_ char *src_abs = NULL;
|
||||
int r;
|
||||
@ -3301,6 +3425,7 @@ static int apply_mount_namespace(
|
||||
.private_dev = needs_sandboxing && context->private_devices,
|
||||
.private_network = needs_sandboxing && exec_needs_network_namespace(context),
|
||||
.private_ipc = needs_sandboxing && exec_needs_ipc_namespace(context),
|
||||
.private_pids = needs_sandboxing && exec_needs_pid_namespace(context) ? context->private_pids : PRIVATE_PIDS_NO,
|
||||
.private_tmp = needs_sandboxing ? context->private_tmp : false,
|
||||
|
||||
.mount_apivfs = needs_sandboxing && exec_context_get_effective_mount_apivfs(context),
|
||||
@ -3573,7 +3698,7 @@ static int close_remaining_fds(
|
||||
const int *fds, size_t n_fds) {
|
||||
|
||||
size_t n_dont_close = 0;
|
||||
int dont_close[n_fds + 16];
|
||||
int dont_close[n_fds + 17];
|
||||
|
||||
assert(params);
|
||||
|
||||
@ -3612,6 +3737,9 @@ static int close_remaining_fds(
|
||||
if (params->handoff_timestamp_fd >= 0)
|
||||
dont_close[n_dont_close++] = params->handoff_timestamp_fd;
|
||||
|
||||
if (params->pidref_transport_fd >= 0)
|
||||
dont_close[n_dont_close++] = params->pidref_transport_fd;
|
||||
|
||||
assert(n_dont_close <= ELEMENTSOF(dont_close));
|
||||
|
||||
return close_all_fds(dont_close, n_dont_close);
|
||||
@ -3934,6 +4062,7 @@ static bool exec_context_need_unprivileged_private_users(
|
||||
!strv_isempty(context->extension_directories) ||
|
||||
context->protect_system != PROTECT_SYSTEM_NO ||
|
||||
context->protect_home != PROTECT_HOME_NO ||
|
||||
exec_needs_pid_namespace(context) ||
|
||||
context->protect_kernel_tunables ||
|
||||
context->protect_kernel_modules ||
|
||||
context->protect_kernel_logs ||
|
||||
@ -4139,6 +4268,7 @@ int exec_invoke(
|
||||
needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */
|
||||
needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */
|
||||
bool keep_seccomp_privileges = false;
|
||||
bool has_cap_sys_admin = false;
|
||||
#if HAVE_SELINUX
|
||||
_cleanup_free_ char *mac_selinux_context_net = NULL;
|
||||
bool use_selinux = false;
|
||||
@ -4790,6 +4920,9 @@ int exec_invoke(
|
||||
|
||||
uint64_t capability_ambient_set = context->capability_ambient_set;
|
||||
|
||||
/* Check CAP_SYS_ADMIN before we enter user namespace to see if we can mount /proc even though it is masked. */
|
||||
has_cap_sys_admin = have_effective_cap(CAP_SYS_ADMIN) > 0;
|
||||
|
||||
if (needs_sandboxing) {
|
||||
/* MAC enablement checks need to be done before a new mount ns is created, as they rely on
|
||||
* /sys being present. The actual MAC context application will happen later, as late as
|
||||
@ -4924,6 +5057,40 @@ int exec_invoke(
|
||||
}
|
||||
}
|
||||
|
||||
/* Unshare a new PID namespace before setting up mounts to ensure /proc/ is mounted with only processes in PID namespace visible.
|
||||
* Note PrivatePIDs=yes implies MountAPIVFS=yes so we'll always ensure procfs is remounted. */
|
||||
if (needs_sandboxing && exec_needs_pid_namespace(context)) {
|
||||
if (params->pidref_transport_fd < 0) {
|
||||
*exit_status = EXIT_NAMESPACE;
|
||||
return log_exec_error_errno(context, params, r, "PidRef socket is not set up: %m");
|
||||
}
|
||||
|
||||
/* If we had CAP_SYS_ADMIN prior to joining the user namespace, then we are privileged and don't need
|
||||
* to check if we can mount /proc/.
|
||||
*
|
||||
* We need to check prior to entering the user namespace because if we're running unprivileged or in a
|
||||
* system without CAP_SYS_ADMIN, then we can have CAP_SYS_ADMIN in the current user namespace but not
|
||||
* once we unshare a mount namespace. */
|
||||
r = has_cap_sys_admin ? 1 : can_mount_proc(context, params);
|
||||
if (r < 0) {
|
||||
*exit_status = EXIT_NAMESPACE;
|
||||
return log_exec_error_errno(context, params, r, "Failed to detect if /proc/ can be remounted: %m");
|
||||
}
|
||||
if (r == 0) {
|
||||
*exit_status = EXIT_NAMESPACE;
|
||||
return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EPERM),
|
||||
"PrivatePIDs=yes is configured, but /proc/ cannot be re-mounted due to lack of privileges, refusing.");
|
||||
}
|
||||
|
||||
r = setup_private_pids(context, params);
|
||||
if (r < 0) {
|
||||
*exit_status = EXIT_NAMESPACE;
|
||||
return log_exec_error_errno(context, params, r, "Failed to set up pid namespace: %m");
|
||||
}
|
||||
}
|
||||
|
||||
/* If PrivatePIDs=yes is configured, we're now running as pid 1 in a pid namespace! */
|
||||
|
||||
if (needs_mount_namespace) {
|
||||
_cleanup_free_ char *error_path = NULL;
|
||||
|
||||
|
@ -1391,6 +1391,10 @@ static int exec_parameters_serialize(const ExecParameters *p, const ExecContext
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
r = serialize_fd(f, fds, "exec-parameters-pidref-transport-fd", p->pidref_transport_fd);
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
if (c && exec_context_restrict_filesystems_set(c)) {
|
||||
r = serialize_fd(f, fds, "exec-parameters-bpf-outer-map-fd", p->bpf_restrict_fs_map_fd);
|
||||
if (r < 0)
|
||||
@ -1660,6 +1664,14 @@ static int exec_parameters_deserialize(ExecParameters *p, FILE *f, FDSet *fds) {
|
||||
continue;
|
||||
|
||||
close_and_replace(p->handoff_timestamp_fd, fd);
|
||||
} else if ((val = startswith(l, "exec-parameters-pidref-transport-fd="))) {
|
||||
int fd;
|
||||
|
||||
fd = deserialize_fd(fds, val);
|
||||
if (fd < 0)
|
||||
continue;
|
||||
|
||||
close_and_replace(p->pidref_transport_fd, fd);
|
||||
} else if ((val = startswith(l, "exec-parameters-bpf-outer-map-fd="))) {
|
||||
int fd;
|
||||
|
||||
@ -1926,6 +1938,10 @@ static int exec_context_serialize(const ExecContext *c, FILE *f) {
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
r = serialize_item(f, "exec-context-private-pids", private_pids_to_string(c->private_pids));
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
r = serialize_bool_elide(f, "exec-context-remove-ipc", c->remove_ipc);
|
||||
if (r < 0)
|
||||
return r;
|
||||
@ -2813,6 +2829,10 @@ static int exec_context_deserialize(ExecContext *c, FILE *f) {
|
||||
if (r < 0)
|
||||
return r;
|
||||
c->private_ipc = r;
|
||||
} else if ((val = startswith(l, "exec-context-private-pids="))) {
|
||||
c->private_pids = private_pids_from_string(val);
|
||||
if (c->private_pids < 0)
|
||||
return -EINVAL;
|
||||
} else if ((val = startswith(l, "exec-context-remove-ipc="))) {
|
||||
r = parse_boolean(val);
|
||||
if (r < 0)
|
||||
|
@ -254,6 +254,12 @@ bool exec_is_cgroup_mount_read_only(const ExecContext *context, const ExecParame
|
||||
return IN_SET(exec_get_protect_control_groups(context, params), PROTECT_CONTROL_GROUPS_YES, PROTECT_CONTROL_GROUPS_STRICT);
|
||||
}
|
||||
|
||||
bool exec_needs_pid_namespace(const ExecContext *context) {
|
||||
assert(context);
|
||||
|
||||
return context->private_pids != PRIVATE_PIDS_NO && ns_type_supported(NAMESPACE_PID);
|
||||
}
|
||||
|
||||
bool exec_needs_mount_namespace(
|
||||
const ExecContext *context,
|
||||
const ExecParameters *params,
|
||||
@ -306,7 +312,8 @@ bool exec_needs_mount_namespace(
|
||||
exec_needs_cgroup_mount(context, params) ||
|
||||
context->protect_proc != PROTECT_PROC_DEFAULT ||
|
||||
context->proc_subset != PROC_SUBSET_ALL ||
|
||||
exec_needs_ipc_namespace(context))
|
||||
exec_needs_ipc_namespace(context) ||
|
||||
exec_needs_pid_namespace(context))
|
||||
return true;
|
||||
|
||||
if (context->root_directory) {
|
||||
@ -1026,6 +1033,7 @@ void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
|
||||
"%sProtectControlGroups: %s\n"
|
||||
"%sPrivateNetwork: %s\n"
|
||||
"%sPrivateUsers: %s\n"
|
||||
"%sPrivatePIDs: %s\n"
|
||||
"%sProtectHome: %s\n"
|
||||
"%sProtectSystem: %s\n"
|
||||
"%sMountAPIVFS: %s\n"
|
||||
@ -1052,6 +1060,7 @@ void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
|
||||
prefix, protect_control_groups_to_string(c->protect_control_groups),
|
||||
prefix, yes_no(c->private_network),
|
||||
prefix, private_users_to_string(c->private_users),
|
||||
prefix, private_pids_to_string(c->private_pids),
|
||||
prefix, protect_home_to_string(c->protect_home),
|
||||
prefix, protect_system_to_string(c->protect_system),
|
||||
prefix, yes_no(exec_context_get_effective_mount_apivfs(c)),
|
||||
|
@ -335,6 +335,7 @@ struct ExecContext {
|
||||
ProtectControlGroups protect_control_groups;
|
||||
ProtectSystem protect_system;
|
||||
ProtectHome protect_home;
|
||||
PrivatePIDs private_pids;
|
||||
bool protect_hostname;
|
||||
|
||||
bool dynamic_user;
|
||||
@ -465,6 +466,7 @@ struct ExecParameters {
|
||||
char **files_env;
|
||||
int user_lookup_fd;
|
||||
int handoff_timestamp_fd;
|
||||
int pidref_transport_fd;
|
||||
|
||||
int bpf_restrict_fs_map_fd;
|
||||
|
||||
@ -486,6 +488,7 @@ struct ExecParameters {
|
||||
.bpf_restrict_fs_map_fd = -EBADF, \
|
||||
.user_lookup_fd = -EBADF, \
|
||||
.handoff_timestamp_fd = -EBADF, \
|
||||
.pidref_transport_fd = -EBADF, \
|
||||
}
|
||||
|
||||
#include "unit.h"
|
||||
@ -623,6 +626,7 @@ ExecDirectoryType exec_resource_type_from_string(const char *s) _pure_;
|
||||
bool exec_needs_mount_namespace(const ExecContext *context, const ExecParameters *params, const ExecRuntime *runtime);
|
||||
bool exec_needs_network_namespace(const ExecContext *context);
|
||||
bool exec_needs_ipc_namespace(const ExecContext *context);
|
||||
bool exec_needs_pid_namespace(const ExecContext *context);
|
||||
|
||||
ProtectControlGroups exec_get_protect_control_groups(const ExecContext *context, const ExecParameters *params);
|
||||
bool exec_needs_cgroup_namespace(const ExecContext *context, const ExecParameters *params);
|
||||
|
@ -133,6 +133,7 @@
|
||||
{{type}}.PrivateUsers, config_parse_private_users, 0, offsetof({{type}}, exec_context.private_users)
|
||||
{{type}}.PrivateMounts, config_parse_tristate, 0, offsetof({{type}}, exec_context.private_mounts)
|
||||
{{type}}.PrivateIPC, config_parse_bool, 0, offsetof({{type}}, exec_context.private_ipc)
|
||||
{{type}}.PrivatePIDs, config_parse_private_pids, 0, offsetof({{type}}, exec_context.private_pids)
|
||||
{{type}}.ProtectSystem, config_parse_protect_system, 0, offsetof({{type}}, exec_context.protect_system)
|
||||
{{type}}.ProtectHome, config_parse_protect_home, 0, offsetof({{type}}, exec_context.protect_home)
|
||||
{{type}}.MountFlags, config_parse_exec_mount_propagation_flag, 0, offsetof({{type}}, exec_context.mount_propagation_flag)
|
||||
|
@ -135,6 +135,7 @@ DEFINE_CONFIG_PARSE_ENUM(config_parse_protect_proc, protect_proc, ProtectProc);
|
||||
DEFINE_CONFIG_PARSE_ENUM(config_parse_proc_subset, proc_subset, ProcSubset);
|
||||
DEFINE_CONFIG_PARSE_ENUM(config_parse_private_tmp, private_tmp, PrivateTmp);
|
||||
DEFINE_CONFIG_PARSE_ENUM(config_parse_private_users, private_users, PrivateUsers);
|
||||
DEFINE_CONFIG_PARSE_ENUM(config_parse_private_pids, private_pids, PrivatePIDs);
|
||||
DEFINE_CONFIG_PARSE_ENUM(config_parse_protect_control_groups, protect_control_groups, ProtectControlGroups);
|
||||
DEFINE_CONFIG_PARSE_ENUM(config_parse_exec_utmp_mode, exec_utmp_mode, ExecUtmpMode);
|
||||
DEFINE_CONFIG_PARSE_ENUM(config_parse_job_mode, job_mode, JobMode);
|
||||
|
@ -114,6 +114,7 @@ CONFIG_PARSER_PROTOTYPE(config_parse_namespace_path_strv);
|
||||
CONFIG_PARSER_PROTOTYPE(config_parse_temporary_filesystems);
|
||||
CONFIG_PARSER_PROTOTYPE(config_parse_private_tmp);
|
||||
CONFIG_PARSER_PROTOTYPE(config_parse_private_users);
|
||||
CONFIG_PARSER_PROTOTYPE(config_parse_private_pids);
|
||||
CONFIG_PARSER_PROTOTYPE(config_parse_protect_control_groups);
|
||||
CONFIG_PARSER_PROTOTYPE(config_parse_cpu_quota);
|
||||
CONFIG_PARSER_PROTOTYPE(config_parse_allowed_cpuset);
|
||||
|
@ -126,6 +126,7 @@ static int manager_dispatch_time_change_fd(sd_event_source *source, int fd, uint
|
||||
static int manager_dispatch_idle_pipe_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata);
|
||||
static int manager_dispatch_user_lookup_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata);
|
||||
static int manager_dispatch_handoff_timestamp_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata);
|
||||
static int manager_dispatch_pidref_transport_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata);
|
||||
static int manager_dispatch_jobs_in_progress(sd_event_source *source, usec_t usec, void *userdata);
|
||||
static int manager_dispatch_run_queue(sd_event_source *source, void *userdata);
|
||||
static int manager_dispatch_sigchld(sd_event_source *source, void *userdata);
|
||||
@ -913,6 +914,7 @@ int manager_new(RuntimeScope runtime_scope, ManagerTestRunFlags test_run_flags,
|
||||
.signal_fd = -EBADF,
|
||||
.user_lookup_fds = EBADF_PAIR,
|
||||
.handoff_timestamp_fds = EBADF_PAIR,
|
||||
.pidref_transport_fds = EBADF_PAIR,
|
||||
.private_listen_fd = -EBADF,
|
||||
.dev_autofs_fd = -EBADF,
|
||||
.cgroup_inotify_fd = -EBADF,
|
||||
@ -1309,6 +1311,55 @@ static int manager_setup_handoff_timestamp_fd(Manager *m) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Sets up the socket pair used for passing parent and child pidrefs back when the executor unshares
 * a PID namespace and forks again when using PrivatePIDs=yes, and hooks the receiving end into the
 * manager's event loop.
 *
 * Safe to call repeatedly: an already open socket pair and an already allocated event source are
 * reused. Returns 0 on success and a negative errno-style value on failure. If configuring a freshly
 * created socket pair fails, the pair is closed again so that a later call starts from scratch
 * instead of reusing a half-configured socket. */
static int manager_setup_pidref_transport_fd(Manager *m) {
        int r;

        assert(m);

        if (m->pidref_transport_fds[0] < 0) {
                /* No usable receiving socket: drop any stale event source before creating a new pair. */
                m->pidref_event_source = sd_event_source_disable_unref(m->pidref_event_source);
                safe_close_pair(m->pidref_transport_fds);

                if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, m->pidref_transport_fds) < 0)
                        return log_error_errno(errno, "Failed to allocate pidref socket: %m");

                /* Make sure children never have to block */
                (void) fd_increase_rxbuf(m->pidref_transport_fds[0], MANAGER_SOCKET_RCVBUF_SIZE);

                /* The dispatcher rejects messages that carry no credentials, hence this one is mandatory. */
                r = setsockopt_int(m->pidref_transport_fds[0], SOL_SOCKET, SO_PASSCRED, true);
                if (r < 0) {
                        r = log_error_errno(r, "Failed to enable SO_PASSCRED for pidref socket: %m");
                        goto fail;
                }

                /* SO_PASSPIDFD is optional: without it the dispatcher falls back to the PID from the credentials. */
                r = setsockopt_int(m->pidref_transport_fds[0], SOL_SOCKET, SO_PASSPIDFD, true);
                if (ERRNO_IS_NEG_NOT_SUPPORTED(r))
                        log_debug("SO_PASSPIDFD is not supported for pidref socket, ignoring.");
                else if (r < 0)
                        log_warning_errno(r, "Failed to enable SO_PASSPIDFD for pidref socket, ignoring: %m");

                /* Mark the receiving socket as O_NONBLOCK (but leave sending side as-is) */
                r = fd_nonblock(m->pidref_transport_fds[0], true);
                if (r < 0) {
                        r = log_error_errno(r, "Failed to make pidref socket O_NONBLOCK: %m");
                        goto fail;
                }
        }

        if (!m->pidref_event_source) {
                r = sd_event_add_io(m->event, &m->pidref_event_source, m->pidref_transport_fds[0], EPOLLIN, manager_dispatch_pidref_transport_fd, m);
                if (r < 0)
                        return log_error_errno(r, "Failed to allocate pidref event source: %m");

                r = sd_event_source_set_priority(m->pidref_event_source, EVENT_PRIORITY_PIDREF);
                if (r < 0)
                        return log_error_errno(r, "Failed to set priority of pidref event source: %m");

                (void) sd_event_source_set_description(m->pidref_event_source, "pidref");
        }

        return 0;

fail:
        /* Do not keep a socket pair around that lacks part of its configuration: the "< 0" check above
         * would otherwise skip the configuration on the next invocation. */
        safe_close_pair(m->pidref_transport_fds);
        return r;
}
|
||||
|
||||
static unsigned manager_dispatch_cleanup_queue(Manager *m) {
|
||||
Unit *u;
|
||||
unsigned n = 0;
|
||||
@ -1724,6 +1775,7 @@ Manager* manager_free(Manager *m) {
|
||||
sd_event_source_unref(m->run_queue_event_source);
|
||||
sd_event_source_unref(m->user_lookup_event_source);
|
||||
sd_event_source_unref(m->handoff_timestamp_event_source);
|
||||
sd_event_source_unref(m->pidref_event_source);
|
||||
sd_event_source_unref(m->memory_pressure_event_source);
|
||||
|
||||
safe_close(m->signal_fd);
|
||||
@ -1731,6 +1783,7 @@ Manager* manager_free(Manager *m) {
|
||||
safe_close(m->cgroups_agent_fd);
|
||||
safe_close_pair(m->user_lookup_fds);
|
||||
safe_close_pair(m->handoff_timestamp_fds);
|
||||
safe_close_pair(m->pidref_transport_fds);
|
||||
|
||||
manager_close_ask_password(m);
|
||||
|
||||
@ -2077,6 +2130,11 @@ int manager_startup(Manager *m, FILE *serialization, FDSet *fds, const char *roo
|
||||
/* This shouldn't fail, except if things are really broken. */
|
||||
return r;
|
||||
|
||||
r = manager_setup_pidref_transport_fd(m);
|
||||
if (r < 0)
|
||||
/* This shouldn't fail, except if things are really broken. */
|
||||
return r;
|
||||
|
||||
/* Connect to the bus if we are good for it */
|
||||
manager_setup_bus(m);
|
||||
|
||||
@ -3747,6 +3805,7 @@ int manager_reload(Manager *m) {
|
||||
(void) manager_setup_cgroups_agent(m);
|
||||
(void) manager_setup_user_lookup_fd(m);
|
||||
(void) manager_setup_handoff_timestamp_fd(m);
|
||||
(void) manager_setup_pidref_transport_fd(m);
|
||||
|
||||
/* Third, fire things up! */
|
||||
manager_coldplug(m);
|
||||
@ -5002,6 +5061,142 @@ static int manager_dispatch_handoff_timestamp_fd(sd_event_source *source, int fd
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Event loop callback for the receiving end of the pidref transport socket. Reads one datagram,
 * derives a PidRef for the sending (parent) process and one for the child process it announces, looks
 * up the unit owning them via their cgroup and forwards both to the unit type's notify_pidref() hook.
 *
 * Malformed or uninteresting messages are logged and dropped (return 0). A negative errno-style value
 * is returned only if receiving itself fails with a non-transient error. */
static int manager_dispatch_pidref_transport_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
        Manager *m = ASSERT_PTR(userdata);
        _cleanup_(pidref_done) PidRef child_pidref = PIDREF_NULL, parent_pidref = PIDREF_NULL;
        _cleanup_close_ int child_pidfd = -EBADF, parent_pidfd = -EBADF;
        struct ucred *ucred = NULL;
        /* Room for one SCM_CREDENTIALS message plus two single-fd messages (SCM_PIDFD and SCM_RIGHTS). */
        CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred)) + CMSG_SPACE(sizeof(int)) * 2) control;
        pid_t child_pid;
        struct msghdr msghdr = {
                .msg_iov = &IOVEC_MAKE(&child_pid, sizeof(child_pid)),
                .msg_iovlen = 1,
                .msg_control = &control,
                .msg_controllen = sizeof(control),
        };
        struct cmsghdr *cmsg;
        ssize_t n;
        int r;

        assert(source);

        /* Server expects:
         * - Parent PID in ucreds enabled via SO_PASSCRED
         * - Parent PIDFD in SCM_PIDFD message enabled via SO_PASSPIDFD
         * - Child PIDFD in SCM_RIGHTS control message
         * - Child PID in message IOV
         *
         * SO_PASSPIDFD may not be supported by the kernel so we fall back to using parent PID from ucreds
         * and accept some raciness. */
        n = recvmsg_safe(m->pidref_transport_fds[0], &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC|MSG_TRUNC);
        if (ERRNO_IS_NEG_TRANSIENT(n))
                return 0; /* Spurious wakeup, try again */
        if (n == -ECHRNG) {
                log_warning_errno(n, "Got message with truncated control data (unexpected fds sent?), ignoring.");
                return 0;
        }
        if (n == -EXFULL) {
                log_warning_errno(n, "Got message with truncated payload data, ignoring.");
                return 0;
        }
        if (n < 0)
                return log_error_errno(n, "Failed to receive pidref message: %m");

        if (n != sizeof(child_pid)) {
                log_warning("Got pidref message of unexpected size %zi (expected %zu), ignoring.", n, sizeof(child_pid));
                return 0;
        }

        CMSG_FOREACH(cmsg, &msghdr) {
                if (cmsg->cmsg_level != SOL_SOCKET)
                        continue;

                if (cmsg->cmsg_type == SCM_CREDENTIALS && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred))) {
                        assert(!ucred);
                        ucred = CMSG_TYPED_DATA(cmsg, struct ucred);
                } else if (cmsg->cmsg_type == SCM_PIDFD) {
                        assert(parent_pidfd < 0);
                        parent_pidfd = *CMSG_TYPED_DATA(cmsg, int);
                } else if (cmsg->cmsg_type == SCM_RIGHTS) {
                        assert(child_pidfd < 0);
                        child_pidfd = *CMSG_TYPED_DATA(cmsg, int);
                }
        }

        /* Verify and set parent pidref. */
        if (!ucred || !pid_is_valid(ucred->pid)) {
                log_warning("Received pidref message without valid credentials. Ignoring.");
                return 0;
        }

        /* Need to handle kernels without SO_PASSPIDFD where SCM_PIDFD will not be set. */
        if (parent_pidfd >= 0)
                r = pidref_set_pidfd_consume(&parent_pidref, TAKE_FD(parent_pidfd));
        else
                r = pidref_set_pid(&parent_pidref, ucred->pid);
        if (r < 0) {
                if (r == -ESRCH)
                        log_debug_errno(r, "PidRef parent process died before message is processed. Ignoring.");
                else
                        log_warning_errno(r, "Failed to pin pidref parent process, ignoring message: %m");
                return 0;
        }

        /* The PID behind the pidfd and the PID in the credentials must refer to the same sender. */
        if (parent_pidref.pid != ucred->pid) {
                assert(parent_pidref.fd >= 0);
                log_warning("Got SCM_PIDFD for parent process " PID_FMT " but got SCM_CREDENTIALS for parent process " PID_FMT ". Ignoring.",
                            parent_pidref.pid, ucred->pid);
                return 0;
        }

        /* Verify and set child pidref. */
        if (!pid_is_valid(child_pid)) {
                log_warning("Received pidref message without valid child PID. Ignoring.");
                return 0;
        }

        /* Need to handle kernels without PIDFD support. */
        if (child_pidfd >= 0)
                r = pidref_set_pidfd_consume(&child_pidref, TAKE_FD(child_pidfd));
        else
                r = pidref_set_pid(&child_pidref, child_pid);
        if (r < 0) {
                if (r == -ESRCH)
                        log_debug_errno(r, "PidRef child process died before message is processed. Ignoring.");
                else
                        log_warning_errno(r, "Failed to pin pidref child process, ignoring message: %m");
                return 0;
        }

        /* Likewise, the pidfd passed for the child must match the PID announced in the payload. */
        if (child_pidref.pid != child_pid) {
                assert(child_pidref.fd >= 0);
                log_warning("Got SCM_RIGHTS for child process " PID_FMT " but PID in IOV message is " PID_FMT ". Ignoring.",
                            child_pidref.pid, child_pid);
                return 0;
        }

        log_debug("Got pidref event with parent PID " PID_FMT " and child PID " PID_FMT ".", parent_pidref.pid, child_pidref.pid);

        /* Try finding cgroup of parent process. But if parent process exited and we're not using PIDFD, this could return NULL.
         * Then fall back to finding cgroup of the child process. */
        Unit *u = manager_get_unit_by_pidref_cgroup(m, &parent_pidref);
        if (!u)
                u = manager_get_unit_by_pidref_cgroup(m, &child_pidref);
        if (!u) {
                log_debug("Got pidref for parent process " PID_FMT " and child process " PID_FMT " we are not interested in, ignoring.", parent_pidref.pid, child_pidref.pid);
                return 0;
        }

        if (!UNIT_VTABLE(u)->notify_pidref) {
                log_unit_warning(u, "Received pidref event from unexpected unit type '%s'.", unit_type_to_string(u->type));
                return 0;
        }

        UNIT_VTABLE(u)->notify_pidref(u, &parent_pidref, &child_pidref);

        return 0;
}
|
||||
|
||||
void manager_ref_console(Manager *m) {
|
||||
assert(m);
|
||||
|
||||
|
@ -289,6 +289,9 @@ struct Manager {
|
||||
int handoff_timestamp_fds[2];
|
||||
sd_event_source *handoff_timestamp_event_source;
|
||||
|
||||
int pidref_transport_fds[2];
|
||||
sd_event_source *pidref_event_source;
|
||||
|
||||
RuntimeScope runtime_scope;
|
||||
|
||||
LookupPaths lookup_paths;
|
||||
@ -678,12 +681,13 @@ void unit_defaults_done(UnitDefaults *defaults);
|
||||
|
||||
enum {
|
||||
/* most important … */
|
||||
EVENT_PRIORITY_USER_LOOKUP = SD_EVENT_PRIORITY_NORMAL-11,
|
||||
EVENT_PRIORITY_MOUNT_TABLE = SD_EVENT_PRIORITY_NORMAL-10,
|
||||
EVENT_PRIORITY_SWAP_TABLE = SD_EVENT_PRIORITY_NORMAL-10,
|
||||
EVENT_PRIORITY_CGROUP_AGENT = SD_EVENT_PRIORITY_NORMAL-9, /* cgroupv1 */
|
||||
EVENT_PRIORITY_CGROUP_INOTIFY = SD_EVENT_PRIORITY_NORMAL-9, /* cgroupv2 */
|
||||
EVENT_PRIORITY_CGROUP_OOM = SD_EVENT_PRIORITY_NORMAL-8,
|
||||
EVENT_PRIORITY_USER_LOOKUP = SD_EVENT_PRIORITY_NORMAL-12,
|
||||
EVENT_PRIORITY_MOUNT_TABLE = SD_EVENT_PRIORITY_NORMAL-11,
|
||||
EVENT_PRIORITY_SWAP_TABLE = SD_EVENT_PRIORITY_NORMAL-11,
|
||||
EVENT_PRIORITY_CGROUP_AGENT = SD_EVENT_PRIORITY_NORMAL-10, /* cgroupv1 */
|
||||
EVENT_PRIORITY_CGROUP_INOTIFY = SD_EVENT_PRIORITY_NORMAL-10, /* cgroupv2 */
|
||||
EVENT_PRIORITY_CGROUP_OOM = SD_EVENT_PRIORITY_NORMAL-9,
|
||||
EVENT_PRIORITY_PIDREF = SD_EVENT_PRIORITY_NORMAL-8,
|
||||
EVENT_PRIORITY_HANDOFF_TIMESTAMP = SD_EVENT_PRIORITY_NORMAL-7,
|
||||
EVENT_PRIORITY_EXEC_FD = SD_EVENT_PRIORITY_NORMAL-6,
|
||||
EVENT_PRIORITY_NOTIFY = SD_EVENT_PRIORITY_NORMAL-5,
|
||||
|
@ -2061,7 +2061,8 @@ static bool namespace_parameters_mount_apivfs(const NamespaceParameters *p) {
|
||||
p->protect_control_groups != PROTECT_CONTROL_GROUPS_NO ||
|
||||
p->protect_kernel_tunables ||
|
||||
p->protect_proc != PROTECT_PROC_DEFAULT ||
|
||||
p->proc_subset != PROC_SUBSET_ALL;
|
||||
p->proc_subset != PROC_SUBSET_ALL ||
|
||||
p->private_pids != PRIVATE_PIDS_NO;
|
||||
}
|
||||
|
||||
/* Walk all mount entries and dropping any unused mounts. This affects all
|
||||
@ -3366,3 +3367,10 @@ static const char* const private_users_table[_PRIVATE_USERS_MAX] = {
|
||||
};
|
||||
|
||||
DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(private_users, PrivateUsers, PRIVATE_USERS_SELF);
|
||||
|
||||
static const char* const private_pids_table[_PRIVATE_PIDS_MAX] = {
|
||||
[PRIVATE_PIDS_NO] = "no",
|
||||
[PRIVATE_PIDS_YES] = "yes",
|
||||
};
|
||||
|
||||
DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(private_pids, PrivatePIDs, PRIVATE_PIDS_YES);
|
||||
|
@ -78,6 +78,13 @@ typedef enum ProtectControlGroups {
|
||||
_PROTECT_CONTROL_GROUPS_INVALID = -EINVAL,
|
||||
} ProtectControlGroups;
|
||||
|
||||
typedef enum PrivatePIDs {
|
||||
PRIVATE_PIDS_NO,
|
||||
PRIVATE_PIDS_YES,
|
||||
_PRIVATE_PIDS_MAX,
|
||||
_PRIVATE_PIDS_INVALID = -EINVAL,
|
||||
} PrivatePIDs;
|
||||
|
||||
struct BindMount {
|
||||
char *source;
|
||||
char *destination;
|
||||
@ -182,6 +189,7 @@ struct NamespaceParameters {
|
||||
ProtectProc protect_proc;
|
||||
ProcSubset proc_subset;
|
||||
PrivateTmp private_tmp;
|
||||
PrivatePIDs private_pids;
|
||||
};
|
||||
|
||||
int setup_namespace(const NamespaceParameters *p, char **reterr_path);
|
||||
@ -225,6 +233,9 @@ PrivateUsers private_users_from_string(const char *s) _pure_;
|
||||
const char* protect_control_groups_to_string(ProtectControlGroups i) _const_;
|
||||
ProtectControlGroups protect_control_groups_from_string(const char *s) _pure_;
|
||||
|
||||
const char* private_pids_to_string(PrivatePIDs i) _const_;
|
||||
PrivatePIDs private_pids_from_string(const char *s) _pure_;
|
||||
|
||||
void bind_mount_free_many(BindMount *b, size_t n);
|
||||
int bind_mount_add(BindMount **b, size_t *n, const BindMount *item);
|
||||
|
||||
|
@ -710,6 +710,9 @@ static int service_verify(Service *s) {
|
||||
if (s->type == SERVICE_DBUS && !s->bus_name)
|
||||
return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Service is of type D-Bus but no D-Bus service name has been specified. Refusing.");
|
||||
|
||||
if (s->type == SERVICE_FORKING && exec_needs_pid_namespace(&s->exec_context))
|
||||
return log_unit_error_errno(UNIT(s), SYNTHETIC_ERRNO(ENOEXEC), "Service of Type=forking does not support PrivatePIDs=yes. Refusing.");
|
||||
|
||||
if (s->usb_function_descriptors && !s->usb_function_strings)
|
||||
log_unit_warning(UNIT(s), "Service has USBFunctionDescriptors= setting, but no USBFunctionStrings=. Ignoring.");
|
||||
|
||||
@ -4908,6 +4911,35 @@ static void service_handoff_timestamp(
|
||||
unit_add_to_dbus_queue(u);
|
||||
}
|
||||
|
||||
/* UnitVTable.notify_pidref() implementation for services. If the announcing parent process is the
 * service's current main or control process, the announced child takes over that role and is watched
 * exclusively; otherwise the notification is ignored. */
static void service_notify_pidref(Unit *u, PidRef *parent_pidref, PidRef *child_pidref) {
        Service *s = ASSERT_PTR(SERVICE(u));
        bool parent_is_main;
        int r;

        assert(pidref_is_set(parent_pidref));
        assert(pidref_is_set(child_pidref));

        parent_is_main = pidref_equal(&s->main_pid, parent_pidref);

        /* Bail out early if the sender is neither of the two processes we track. */
        if (!parent_is_main && !pidref_equal(&s->control_pid, parent_pidref)) {
                log_unit_debug(u, "Parent process " PID_FMT " does not match main or control processes, ignoring.", parent_pidref->pid);
                return;
        }

        if (parent_is_main) {
                r = service_set_main_pidref(s, TAKE_PIDREF(*child_pidref), /* start_timestamp = */ NULL);
                if (r < 0) {
                        log_unit_warning_errno(u, r, "Failed to set new main pid: %m");
                        return;
                }

                /* Since the child process is PID 1 in a new PID namespace, it must be exclusive to this unit. */
                r = unit_watch_pidref(u, &s->main_pid, /* exclusive= */ true);
                if (r < 0)
                        log_unit_warning_errno(u, r, "Failed to watch new main PID " PID_FMT ": %m", s->main_pid.pid);
        } else {
                /* Stop watching the old control process before the child's pidref replaces it. */
                service_unwatch_control_pid(s);
                s->control_pid = TAKE_PIDREF(*child_pidref);

                r = unit_watch_pidref(u, &s->control_pid, /* exclusive= */ true);
                if (r < 0)
                        log_unit_warning_errno(u, r, "Failed to watch new control PID " PID_FMT ": %m", s->control_pid.pid);
        }

        unit_add_to_dbus_queue(u);
}
|
||||
|
||||
static int service_get_timeout(Unit *u, usec_t *timeout) {
|
||||
Service *s = ASSERT_PTR(SERVICE(u));
|
||||
uint64_t t;
|
||||
@ -5638,6 +5670,7 @@ const UnitVTable service_vtable = {
|
||||
.notify_cgroup_oom = service_notify_cgroup_oom_event,
|
||||
.notify_message = service_notify_message,
|
||||
.notify_handoff_timestamp = service_handoff_timestamp,
|
||||
.notify_pidref = service_notify_pidref,
|
||||
|
||||
.main_pid = service_main_pid,
|
||||
.control_pid = service_control_pid,
|
||||
|
@ -4237,6 +4237,9 @@ static int unit_verify_contexts(const Unit *u) {
|
||||
exec_needs_mount_namespace(ec, /* params = */ NULL, /* runtime = */ NULL))
|
||||
return log_unit_error_errno(u, SYNTHETIC_ERRNO(ENOEXEC), "WorkingDirectory= may not be below /proc/, /sys/ or /dev/ when using mount namespacing. Refusing.");
|
||||
|
||||
if (exec_needs_pid_namespace(ec) && !UNIT_VTABLE(u)->notify_pidref)
|
||||
return log_unit_error_errno(u, SYNTHETIC_ERRNO(ENOEXEC), "PrivatePIDs= setting is only supported for service units. Refusing.");
|
||||
|
||||
const KillContext *kc = unit_get_kill_context(u);
|
||||
|
||||
if (ec->pam_name && kc && !IN_SET(kc->kill_mode, KILL_CONTROL_GROUP, KILL_MIXED))
|
||||
@ -5402,6 +5405,8 @@ int unit_set_exec_params(Unit *u, ExecParameters *p) {
|
||||
|
||||
p->user_lookup_fd = u->manager->user_lookup_fds[1];
|
||||
p->handoff_timestamp_fd = u->manager->handoff_timestamp_fds[1];
|
||||
if (UNIT_VTABLE(u)->notify_pidref)
|
||||
p->pidref_transport_fd = u->manager->pidref_transport_fds[1];
|
||||
|
||||
p->cgroup_id = crt ? crt->cgroup_id : 0;
|
||||
p->invocation_id = u->invocation_id;
|
||||
|
@ -640,6 +640,9 @@ typedef struct UnitVTable {
|
||||
/* Called whenever we learn a handoff timestamp */
|
||||
void (*notify_handoff_timestamp)(Unit *u, const struct ucred *ucred, const dual_timestamp *ts);
|
||||
|
||||
/* Called whenever we learn about a child process */
|
||||
void (*notify_pidref)(Unit *u, PidRef *parent_pidref, PidRef *child_pidref);
|
||||
|
||||
/* Called whenever a name this Unit registered for comes or goes away. */
|
||||
void (*bus_name_owner_change)(Unit *u, const char *new_owner);
|
||||
|
||||
|
@ -1061,7 +1061,8 @@ static int bus_append_execute_property(sd_bus_message *m, const char *field, con
|
||||
"LogNamespace",
|
||||
"RootImagePolicy",
|
||||
"MountImagePolicy",
|
||||
"ExtensionImagePolicy"))
|
||||
"ExtensionImagePolicy",
|
||||
"PrivatePIDs"))
|
||||
return bus_append_string(m, field, eq);
|
||||
|
||||
if (STR_IN_SET(field, "IgnoreSIGPIPE",
|
||||
|
@ -6,12 +6,17 @@ TEST_DESCRIPTION="Tests for core PID1 functionality"
|
||||
|
||||
# for testing PrivateNetwork=yes
|
||||
NSPAWN_ARGUMENTS="--capability=CAP_NET_ADMIN"
|
||||
# for testing PrivatePIDs=yes
|
||||
TEST_INSTALL_VERITY_MINIMAL=1
|
||||
|
||||
# shellcheck source=test/test-functions
|
||||
. "${TEST_BASE_DIR:?}/test-functions"
|
||||
|
||||
test_append_files() {
|
||||
image_install logger socat
|
||||
inst_binary mksquashfs
|
||||
inst_binary unsquashfs
|
||||
install_verity_minimal
|
||||
}
|
||||
|
||||
do_test "$@"
|
||||
|
161
test/units/TEST-07-PID1.private-pids.sh
Executable file
161
test/units/TEST-07-PID1.private-pids.sh
Executable file
@ -0,0 +1,161 @@
|
||||
#!/usr/bin/env bash
|
||||
# SPDX-License-Identifier: LGPL-2.1-or-later
|
||||
# shellcheck disable=SC2016
|
||||
set -eux
|
||||
set -o pipefail
|
||||
|
||||
# shellcheck source=test/units/test-control.sh
|
||||
. "$(dirname "$0")"/test-control.sh
|
||||
# shellcheck source=test/units/util.sh
|
||||
. "$(dirname "$0")"/util.sh
|
||||
|
||||
# Remember whether something is already mounted on /proc/scsi, so that the test only ever adds and
# removes its own over-mount there.
if findmnt --mountpoint /proc/scsi; then
    HAS_EXISTING_SCSI_MOUNT=yes
else
    HAS_EXISTING_SCSI_MOUNT=no
fi
|
||||
|
||||
# EXIT trap: best-effort removal of everything the testcases may have left behind.
at_exit() {
    # Every step below may legitimately fail (nothing mounted, unit not running, ...)
    set +e

    # Unmount any file systems; /proc/scsi only if the mount there is ours
    [[ "$HAS_EXISTING_SCSI_MOUNT" == "no" ]] && umount /proc/scsi
    umount /tmp/TEST-07-PID1-private-pids-proc

    # Remove the temporary mount point and any test files
    local leftover
    for leftover in \
        /tmp/TEST-07-PID1-private-pids-proc \
        /tmp/TEST-07-PID1-private-pids-services \
        /tmp/TEST-07-PID1-private-pids-root; do
        rm -rf "$leftover"
    done

    # Stop any test services
    systemctl kill --signal=KILL TEST-07-PID1-private-pid.service
    # Remove any failed transient units
    systemctl reset-failed
}
|
||||
|
||||
trap at_exit EXIT
|
||||
|
||||
# Basic checks: the service payload is PID 1 of its own PID namespace, sees a private /proc, and the
# manager tracks the namespaced process as the main PID.
testcase_basic() {
    local unit_name=TEST-07-PID1-private-pid
    local unit="$unit_name.service"

    # Verify the current process is PID 1 in the new namespace
    assert_eq "$(systemd-run -p PrivatePIDs=yes --wait --pipe readlink /proc/self)" "1"
    # Verify we are the only process in the new namespace
    assert_eq "$(systemd-run -p PrivatePIDs=yes --wait --pipe ps aux --no-heading | wc -l)" "1"
    # Verify the procfs mount options
    systemd-run -p PrivatePIDs=yes --wait --pipe \
        bash -xec '[[ "$$(findmnt --mountpoint /proc --noheadings -o VFS-OPTIONS)" =~ rw ]];
                   [[ "$$(findmnt --mountpoint /proc --noheadings -o VFS-OPTIONS)" =~ nosuid ]];
                   [[ "$$(findmnt --mountpoint /proc --noheadings -o VFS-OPTIONS)" =~ nodev ]];
                   [[ "$$(findmnt --mountpoint /proc --noheadings -o VFS-OPTIONS)" =~ noexec ]];'

    # Verify the main PID is correct
    systemd-run -p PrivatePIDs=yes --remain-after-exit --unit "$unit_name" sleep infinity
    # Wait for ExecMainPID to be correctly populated as there might be a race between spawning the
    # service and the actual exec of the child process
    sleep 2
    pid=$(systemctl show "$unit" -p ExecMainPID --value)
    # Killing the reported main PID must be accounted to the unit as a signal death of its main process
    kill -9 "$pid"
    timeout 10s bash -xec 'while [[ "$(systemctl show -P SubState TEST-07-PID1-private-pid.service)" != "failed" ]]; do sleep .5; done'
    assert_eq "$(systemctl show -P Result "$unit")" "signal"
    assert_eq "$(systemctl show -P ExecMainStatus "$unit")" "9"
    systemctl reset-failed
}
|
||||
|
||||
# Unit verification: PrivatePIDs=yes is accepted for Type=oneshot and refused for Type=forking.
testcase_analyze() {
    local services_dir=/tmp/TEST-07-PID1-private-pids-services

    mkdir -p "$services_dir"

    # Verify other services are compatible with PrivatePIDs=yes
    cat >"$services_dir/oneshot-valid.service" <<EOF
[Service]
ExecStart=echo hello
PrivatePIDs=yes
Type=oneshot
EOF

    # Verify Type=forking services are not compatible with PrivatePIDs=yes
    cat >"$services_dir/forking-invalid.service" <<EOF
[Service]
ExecStart=echo hello
PrivatePIDs=yes
Type=forking
EOF

    systemd-analyze --recursive-errors=no verify "$services_dir/oneshot-valid.service"
    (! systemd-analyze --recursive-errors=no verify "$services_dir/forking-invalid.service")

    rm -rf "$services_dir"
}
|
||||
|
||||
# Combination test: PrivatePIDs=yes together with a large set of other sandboxing options, running
# from an unpacked minimal root image. Passes if the payload can read the image's os-release marker.
testcase_multiple_features() {
    unsquashfs -no-xattrs -d /tmp/TEST-07-PID1-private-pids-root /usr/share/minimal_0.raw

    # Every continuation backslash is preceded by a space, so that no argument depends on the next
    # line's indentation to stay separate from the following option.
    systemd-run \
        -p PrivatePIDs=yes \
        -p RootDirectory=/tmp/TEST-07-PID1-private-pids-root \
        -p ProcSubset=pid \
        -p BindReadOnlyPaths=/usr/share \
        -p NoNewPrivileges=yes \
        -p ProtectSystem=strict \
        -p User=testuser \
        -p Group=testuser \
        -p RuntimeDirectory=abc \
        -p StateDirectory=qed \
        -p InaccessiblePaths=/usr/include \
        -p TemporaryFileSystem=/home \
        -p PrivateTmp=yes \
        -p PrivateDevices=yes \
        -p PrivateNetwork=yes \
        -p PrivateUsersEx=self \
        -p PrivateIPC=yes \
        -p ProtectHostname=yes \
        -p ProtectClock=yes \
        -p ProtectKernelTunables=yes \
        -p ProtectKernelModules=yes \
        -p ProtectKernelLogs=yes \
        -p ProtectControlGroupsEx=private \
        -p LockPersonality=yes \
        -p Environment=ABC=QED \
        --wait \
        --pipe \
        grep MARKER=1 /etc/os-release

    rm -rf /tmp/TEST-07-PID1-private-pids-root
}
|
||||
|
||||
# Unprivileged (user manager) test: PrivatePIDs=yes works when an unmasked procfs instance exists, and
# fails when /proc/ is partially over-mounted.
testcase_unpriv() {
    local unmasked_proc=/tmp/TEST-07-PID1-private-pids-proc

    if [[ ! -f /usr/lib/systemd/user/dbus.socket && ! -f /etc/systemd/user/dbus.socket ]]; then
        echo "Per-user instances are not supported, skipping unprivileged PrivatePIDs=yes test"
        return 0
    fi

    if [[ "$(sysctl -ne kernel.apparmor_restrict_unprivileged_userns)" -eq 1 ]]; then
        echo "Cannot create unprivileged user namespaces, skipping unprivileged PrivatePIDs=yes test"
        return 0
    fi

    # The kernel has a restriction for unprivileged user namespaces where they cannot mount a less restrictive
    # instance of /proc/. So if /proc/ is masked (e.g. /proc/kmsg is over-mounted with tmpfs as systemd-nspawn does),
    # then mounting a new /proc/ will fail and we will still see the host's /proc/. Thus, to allow tests to run in
    # a VM or nspawn, we mount a new proc on a temporary directory with no masking to bypass this kernel restriction.
    mkdir -p "$unmasked_proc"
    mount -t proc proc "$unmasked_proc"

    # Verify running as unprivileged user can unshare PID namespace and mounts /proc properly.
    assert_eq "$(runas testuser systemd-run --wait --user --pipe -p PrivatePIDs=yes readlink /proc/self)" "1"
    assert_eq "$(runas testuser systemd-run --wait --user --pipe -p PrivatePIDs=yes ps aux --no-heading | wc -l)" "1"

    umount "$unmasked_proc"
    rm -rf "$unmasked_proc"

    # Now verify the behavior with masking - units should fail as PrivatePIDs=yes has no graceful fallback.
    # Only add (and later remove) the over-mount if /proc/scsi was not a mount point to begin with.
    if [[ "$HAS_EXISTING_SCSI_MOUNT" == "no" ]]; then
        mount -t tmpfs tmpfs /proc/scsi
    fi

    (! runas testuser systemd-run --wait --user --pipe -p PrivatePIDs=yes true)

    if [[ "$HAS_EXISTING_SCSI_MOUNT" == "no" ]]; then
        umount /proc/scsi
    fi
}
|
||||
|
||||
run_testcases
|
Loading…
x
Reference in New Issue
Block a user