mirror of
https://github.com/systemd/systemd-stable.git
synced 2024-10-27 01:55:32 +03:00
Merge pull request #3884 from poettering/private-users
This commit is contained in:
commit
d87a2ef782
10
TODO
10
TODO
@ -56,11 +56,10 @@ Features:
|
||||
|
||||
* ProtectKeyRing= to take keyring calls away
|
||||
|
||||
* PrivateUsers= which maps all user ids except root and the one specified
|
||||
in User= to nobody
|
||||
|
||||
* ProtectControlGroups= which mounts all of /sys/fs/cgroup read-only
|
||||
|
||||
* RemoveKeyRing= to remove all keyring entries of the specified user
|
||||
|
||||
* Add DataDirectory=, CacheDirectory= and LogDirectory= to match
|
||||
RuntimeDirectory=, and create it as necessary when starting a service, owned by the right user.
|
||||
|
||||
@ -80,6 +79,11 @@ Features:
|
||||
* expose the "privileged" flag of ExecCommand on the bus, and open it up to
|
||||
transient units
|
||||
|
||||
* in nss-systemd, if we run inside of RootDirectory= with PrivateUsers= set,
|
||||
find a way to map the User=/Group= of the service to the right name. This way
|
||||
a user/group for a service only has to exist on the host for the right
|
||||
mapping to work.
|
||||
|
||||
* allow attaching additional journald log fields to cgroups
|
||||
|
||||
* rework fopen_temporary() to make use of open_tmpfile_linkable() (problem: the
|
||||
|
@ -107,36 +107,29 @@
|
||||
<varlistentry>
|
||||
<term><varname>WorkingDirectory=</varname></term>
|
||||
|
||||
<listitem><para>Takes a directory path relative to the service's root
|
||||
directory specified by <varname>RootDirectory=</varname>, or the
|
||||
special value <literal>~</literal>. Sets the working directory
|
||||
for executed processes. If set to <literal>~</literal>, the
|
||||
home directory of the user specified in
|
||||
<varname>User=</varname> is used. If not set, defaults to the
|
||||
root directory when systemd is running as a system instance
|
||||
and the respective user's home directory if run as user. If
|
||||
the setting is prefixed with the <literal>-</literal>
|
||||
character, a missing working directory is not considered
|
||||
fatal. If <varname>RootDirectory=</varname> is not set, then
|
||||
<varname>WorkingDirectory=</varname> is relative to the root of
|
||||
the system running the service manager.
|
||||
Note that setting this parameter might result in
|
||||
additional dependencies to be added to the unit (see
|
||||
above).</para></listitem>
|
||||
<listitem><para>Takes a directory path relative to the service's root directory specified by
|
||||
<varname>RootDirectory=</varname>, or the special value <literal>~</literal>. Sets the working directory for
|
||||
executed processes. If set to <literal>~</literal>, the home directory of the user specified in
|
||||
<varname>User=</varname> is used. If not set, defaults to the root directory when systemd is running as a
|
||||
system instance and the respective user's home directory if run as user. If the setting is prefixed with the
|
||||
<literal>-</literal> character, a missing working directory is not considered fatal. If
|
||||
<varname>RootDirectory=</varname> is not set, then <varname>WorkingDirectory=</varname> is relative to the root
|
||||
of the system running the service manager. Note that setting this parameter might result in additional
|
||||
dependencies to be added to the unit (see above).</para></listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><varname>RootDirectory=</varname></term>
|
||||
|
||||
<listitem><para>Takes a directory path relative to the host's root directory
|
||||
(i.e. the root of the system running the service manager). Sets the
|
||||
root directory for executed processes, with the <citerefentry
|
||||
project='man-pages'><refentrytitle>chroot</refentrytitle><manvolnum>2</manvolnum></citerefentry>
|
||||
system call. If this is used, it must be ensured that the
|
||||
process binary and all its auxiliary files are available in
|
||||
the <function>chroot()</function> jail. Note that setting this
|
||||
parameter might result in additional dependencies to be added
|
||||
to the unit (see above).</para></listitem>
|
||||
<listitem><para>Takes a directory path relative to the host's root directory (i.e. the root of the system
|
||||
running the service manager). Sets the root directory for executed processes, with the <citerefentry
|
||||
project='man-pages'><refentrytitle>chroot</refentrytitle><manvolnum>2</manvolnum></citerefentry> system
|
||||
call. If this is used, it must be ensured that the process binary and all its auxiliary files are available in
|
||||
the <function>chroot()</function> jail. Note that setting this parameter might result in additional
|
||||
dependencies to be added to the unit (see above).</para>
|
||||
|
||||
<para>The <varname>PrivateUsers=</varname> setting is particularly useful in conjunction with
|
||||
<varname>RootDirectory=</varname>. For details, see below.</para></listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
@ -998,6 +991,28 @@
|
||||
accessible).</para></listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><varname>PrivateUsers=</varname></term>
|
||||
|
||||
<listitem><para>Takes a boolean argument. If true, sets up a new user namespace for the executed processes and
|
||||
configures a minimal user and group mapping, that maps the <literal>root</literal> user and group as well as
|
||||
the unit's own user and group to themselves and everything else to the <literal>nobody</literal> user and
|
||||
group. This is useful to securely detach the user and group databases used by the unit from the rest of the
|
||||
system, and thus to create an effective sandbox environment. All files, directories, processes, IPC objects and
|
||||
other resources owned by users/groups not equalling <literal>root</literal> or the unit's own will stay visible
|
||||
from within the unit but appear owned by the <literal>nobody</literal> user and group. If this mode is enabled,
|
||||
all unit processes are run without privileges in the host user namespace (regardless of whether the unit's own
|
||||
user/group is <literal>root</literal> or not). Specifically this means that the process will have zero process
|
||||
capabilities on the host's user namespace, but full capabilities within the service's user namespace. Settings
|
||||
such as <varname>CapabilityBoundingSet=</varname> will affect only the latter, and there's no way to acquire
|
||||
additional capabilities in the host's user namespace. Defaults to off.</para>
|
||||
|
||||
<para>This setting is particularly useful in conjunction with <varname>RootDirectory=</varname>, as the need to
|
||||
synchronize the user and group databases in the root directory and on the host is reduced, as the only users
|
||||
and groups who need to be matched are <literal>root</literal>, <literal>nobody</literal> and the unit's own
|
||||
user and group.</para></listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><varname>ProtectSystem=</varname></term>
|
||||
|
||||
|
@ -705,8 +705,9 @@ const sd_bus_vtable bus_exec_vtable[] = {
|
||||
SD_BUS_PROPERTY("InaccessiblePaths", "as", NULL, offsetof(ExecContext, inaccessible_paths), SD_BUS_VTABLE_PROPERTY_CONST),
|
||||
SD_BUS_PROPERTY("MountFlags", "t", bus_property_get_ulong, offsetof(ExecContext, mount_flags), SD_BUS_VTABLE_PROPERTY_CONST),
|
||||
SD_BUS_PROPERTY("PrivateTmp", "b", bus_property_get_bool, offsetof(ExecContext, private_tmp), SD_BUS_VTABLE_PROPERTY_CONST),
|
||||
SD_BUS_PROPERTY("PrivateNetwork", "b", bus_property_get_bool, offsetof(ExecContext, private_network), SD_BUS_VTABLE_PROPERTY_CONST),
|
||||
SD_BUS_PROPERTY("PrivateDevices", "b", bus_property_get_bool, offsetof(ExecContext, private_devices), SD_BUS_VTABLE_PROPERTY_CONST),
|
||||
SD_BUS_PROPERTY("PrivateNetwork", "b", bus_property_get_bool, offsetof(ExecContext, private_network), SD_BUS_VTABLE_PROPERTY_CONST),
|
||||
SD_BUS_PROPERTY("PrivateUsers", "b", bus_property_get_bool, offsetof(ExecContext, private_users), SD_BUS_VTABLE_PROPERTY_CONST),
|
||||
SD_BUS_PROPERTY("ProtectHome", "s", bus_property_get_protect_home, offsetof(ExecContext, protect_home), SD_BUS_VTABLE_PROPERTY_CONST),
|
||||
SD_BUS_PROPERTY("ProtectSystem", "s", bus_property_get_protect_system, offsetof(ExecContext, protect_system), SD_BUS_VTABLE_PROPERTY_CONST),
|
||||
SD_BUS_PROPERTY("SameProcessGroup", "b", bus_property_get_bool, offsetof(ExecContext, same_pgrp), SD_BUS_VTABLE_PROPERTY_CONST),
|
||||
@ -1068,7 +1069,7 @@ int bus_exec_context_set_transient_property(
|
||||
|
||||
} else if (STR_IN_SET(name,
|
||||
"IgnoreSIGPIPE", "TTYVHangup", "TTYReset",
|
||||
"PrivateTmp", "PrivateDevices", "PrivateNetwork",
|
||||
"PrivateTmp", "PrivateDevices", "PrivateNetwork", "PrivateUsers",
|
||||
"NoNewPrivileges", "SyslogLevelPrefix", "MemoryDenyWriteExecute",
|
||||
"RestrictRealtime", "DynamicUser")) {
|
||||
int b;
|
||||
@ -1090,6 +1091,8 @@ int bus_exec_context_set_transient_property(
|
||||
c->private_devices = b;
|
||||
else if (streq(name, "PrivateNetwork"))
|
||||
c->private_network = b;
|
||||
else if (streq(name, "PrivateUsers"))
|
||||
c->private_users = b;
|
||||
else if (streq(name, "NoNewPrivileges"))
|
||||
c->no_new_privileges = b;
|
||||
else if (streq(name, "SyslogLevelPrefix"))
|
||||
|
@ -25,6 +25,7 @@
|
||||
#include <signal.h>
|
||||
#include <string.h>
|
||||
#include <sys/capability.h>
|
||||
#include <sys/eventfd.h>
|
||||
#include <sys/mman.h>
|
||||
#include <sys/personality.h>
|
||||
#include <sys/prctl.h>
|
||||
@ -1552,6 +1553,159 @@ static bool exec_needs_mount_namespace(
|
||||
return false;
|
||||
}
|
||||
|
||||
static int setup_private_users(uid_t uid, gid_t gid) {
|
||||
_cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
|
||||
_cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
|
||||
_cleanup_close_ int unshare_ready_fd = -1;
|
||||
_cleanup_(sigkill_waitp) pid_t pid = 0;
|
||||
uint64_t c = 1;
|
||||
siginfo_t si;
|
||||
ssize_t n;
|
||||
int r;
|
||||
|
||||
/* Set up a user namespace and map root to root, the selected UID/GID to itself, and everything else to
|
||||
* nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
|
||||
* we however lack after opening the user namespace. To work around this we fork() a temporary child process,
|
||||
* which waits for the parent to create the new user namespace while staying in the original namespace. The
|
||||
* child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
|
||||
* continues execution normally. */
|
||||
|
||||
if (uid != 0 && uid_is_valid(uid))
|
||||
asprintf(&uid_map,
|
||||
"0 0 1\n" /* Map root → root */
|
||||
UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */
|
||||
uid, uid); /* The case where the above is the same */
|
||||
else
|
||||
uid_map = strdup("0 0 1\n");
|
||||
if (!uid_map)
|
||||
return -ENOMEM;
|
||||
|
||||
if (gid != 0 && gid_is_valid(gid))
|
||||
asprintf(&gid_map,
|
||||
"0 0 1\n" /* Map root → root */
|
||||
GID_FMT " " GID_FMT " 1\n", /* Map $GID → $GID */
|
||||
gid, gid);
|
||||
else
|
||||
gid_map = strdup("0 0 1\n"); /* The case where the above is the same */
|
||||
if (!gid_map)
|
||||
return -ENOMEM;
|
||||
|
||||
/* Create a communication channel so that the parent can tell the child when it finished creating the user
|
||||
* namespace. */
|
||||
unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
|
||||
if (unshare_ready_fd < 0)
|
||||
return -errno;
|
||||
|
||||
/* Create a communication channel so that the child can tell the parent a proper error code in case it
|
||||
* failed. */
|
||||
if (pipe2(errno_pipe, O_CLOEXEC) < 0)
|
||||
return -errno;
|
||||
|
||||
pid = fork();
|
||||
if (pid < 0)
|
||||
return -errno;
|
||||
|
||||
if (pid == 0) {
|
||||
_cleanup_close_ int fd = -1;
|
||||
const char *a;
|
||||
pid_t ppid;
|
||||
|
||||
/* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
|
||||
* here, after the parent opened its own user namespace. */
|
||||
|
||||
ppid = getppid();
|
||||
errno_pipe[0] = safe_close(errno_pipe[0]);
|
||||
|
||||
/* Wait until the parent unshared the user namespace */
|
||||
if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
|
||||
r = -errno;
|
||||
goto child_fail;
|
||||
}
|
||||
|
||||
/* Disable the setgroups() system call in the child user namespace, for good. */
|
||||
a = procfs_file_alloca(ppid, "setgroups");
|
||||
fd = open(a, O_WRONLY|O_CLOEXEC);
|
||||
if (fd < 0) {
|
||||
if (errno != ENOENT) {
|
||||
r = -errno;
|
||||
goto child_fail;
|
||||
}
|
||||
|
||||
/* If the file is missing the kernel is too old, let's continue anyway. */
|
||||
} else {
|
||||
if (write(fd, "deny\n", 5) < 0) {
|
||||
r = -errno;
|
||||
goto child_fail;
|
||||
}
|
||||
|
||||
fd = safe_close(fd);
|
||||
}
|
||||
|
||||
/* First write the GID map */
|
||||
a = procfs_file_alloca(ppid, "gid_map");
|
||||
fd = open(a, O_WRONLY|O_CLOEXEC);
|
||||
if (fd < 0) {
|
||||
r = -errno;
|
||||
goto child_fail;
|
||||
}
|
||||
if (write(fd, gid_map, strlen(gid_map)) < 0) {
|
||||
r = -errno;
|
||||
goto child_fail;
|
||||
}
|
||||
fd = safe_close(fd);
|
||||
|
||||
/* The write the UID map */
|
||||
a = procfs_file_alloca(ppid, "uid_map");
|
||||
fd = open(a, O_WRONLY|O_CLOEXEC);
|
||||
if (fd < 0) {
|
||||
r = -errno;
|
||||
goto child_fail;
|
||||
}
|
||||
if (write(fd, uid_map, strlen(uid_map)) < 0) {
|
||||
r = -errno;
|
||||
goto child_fail;
|
||||
}
|
||||
|
||||
_exit(EXIT_SUCCESS);
|
||||
|
||||
child_fail:
|
||||
(void) write(errno_pipe[1], &r, sizeof(r));
|
||||
_exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
errno_pipe[1] = safe_close(errno_pipe[1]);
|
||||
|
||||
if (unshare(CLONE_NEWUSER) < 0)
|
||||
return -errno;
|
||||
|
||||
/* Let the child know that the namespace is ready now */
|
||||
if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
|
||||
return -errno;
|
||||
|
||||
/* Try to read an error code from the child */
|
||||
n = read(errno_pipe[0], &r, sizeof(r));
|
||||
if (n < 0)
|
||||
return -errno;
|
||||
if (n == sizeof(r)) { /* an error code was sent to us */
|
||||
if (r < 0)
|
||||
return r;
|
||||
return -EIO;
|
||||
}
|
||||
if (n != 0) /* on success we should have read 0 bytes */
|
||||
return -EIO;
|
||||
|
||||
r = wait_for_terminate(pid, &si);
|
||||
if (r < 0)
|
||||
return r;
|
||||
pid = 0;
|
||||
|
||||
/* If something strange happened with the child, let's consider this fatal, too */
|
||||
if (si.si_code != CLD_EXITED || si.si_status != 0)
|
||||
return -EIO;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void append_socket_pair(int *array, unsigned *n, int pair[2]) {
|
||||
assert(array);
|
||||
assert(n);
|
||||
@ -2079,6 +2233,14 @@ static int exec_child(
|
||||
}
|
||||
#endif
|
||||
|
||||
if ((params->flags & EXEC_APPLY_PERMISSIONS) && context->private_users) {
|
||||
r = setup_private_users(uid, gid);
|
||||
if (r < 0) {
|
||||
*exit_status = EXIT_USER;
|
||||
return r;
|
||||
}
|
||||
}
|
||||
|
||||
/* We repeat the fd closing here, to make sure that
|
||||
* nothing is leaked from the PAM modules. Note that
|
||||
* we are more aggressive this time since socket_fd
|
||||
@ -2640,8 +2802,9 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
|
||||
"%sRootDirectory: %s\n"
|
||||
"%sNonBlocking: %s\n"
|
||||
"%sPrivateTmp: %s\n"
|
||||
"%sPrivateNetwork: %s\n"
|
||||
"%sPrivateDevices: %s\n"
|
||||
"%sPrivateNetwork: %s\n"
|
||||
"%sPrivateUsers: %s\n"
|
||||
"%sProtectHome: %s\n"
|
||||
"%sProtectSystem: %s\n"
|
||||
"%sIgnoreSIGPIPE: %s\n"
|
||||
@ -2652,8 +2815,9 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
|
||||
prefix, c->root_directory ? c->root_directory : "/",
|
||||
prefix, yes_no(c->non_blocking),
|
||||
prefix, yes_no(c->private_tmp),
|
||||
prefix, yes_no(c->private_network),
|
||||
prefix, yes_no(c->private_devices),
|
||||
prefix, yes_no(c->private_network),
|
||||
prefix, yes_no(c->private_users),
|
||||
prefix, protect_home_to_string(c->protect_home),
|
||||
prefix, protect_system_to_string(c->protect_system),
|
||||
prefix, yes_no(c->ignore_sigpipe),
|
||||
|
@ -171,6 +171,7 @@ struct ExecContext {
|
||||
bool private_tmp;
|
||||
bool private_network;
|
||||
bool private_devices;
|
||||
bool private_users;
|
||||
ProtectSystem protect_system;
|
||||
ProtectHome protect_home;
|
||||
|
||||
|
@ -88,8 +88,9 @@ $1.ReadWritePaths, config_parse_namespace_path_strv, 0,
|
||||
$1.ReadOnlyPaths, config_parse_namespace_path_strv, 0, offsetof($1, exec_context.read_only_paths)
|
||||
$1.InaccessiblePaths, config_parse_namespace_path_strv, 0, offsetof($1, exec_context.inaccessible_paths)
|
||||
$1.PrivateTmp, config_parse_bool, 0, offsetof($1, exec_context.private_tmp)
|
||||
$1.PrivateNetwork, config_parse_bool, 0, offsetof($1, exec_context.private_network)
|
||||
$1.PrivateDevices, config_parse_bool, 0, offsetof($1, exec_context.private_devices)
|
||||
$1.PrivateNetwork, config_parse_bool, 0, offsetof($1, exec_context.private_network)
|
||||
$1.PrivateUsers, config_parse_bool, 0, offsetof($1, exec_context.private_users)
|
||||
$1.ProtectSystem, config_parse_protect_system, 0, offsetof($1, exec_context)
|
||||
$1.ProtectHome, config_parse_protect_home, 0, offsetof($1, exec_context)
|
||||
$1.MountFlags, config_parse_exec_mount_flags, 0, offsetof($1, exec_context)
|
||||
|
@ -202,7 +202,7 @@ int bus_append_unit_property_assignment(sd_bus_message *m, const char *assignmen
|
||||
"CPUAccounting", "MemoryAccounting", "IOAccounting", "BlockIOAccounting", "TasksAccounting",
|
||||
"SendSIGHUP", "SendSIGKILL", "WakeSystem", "DefaultDependencies",
|
||||
"IgnoreSIGPIPE", "TTYVHangup", "TTYReset", "RemainAfterExit",
|
||||
"PrivateTmp", "PrivateDevices", "PrivateNetwork", "NoNewPrivileges",
|
||||
"PrivateTmp", "PrivateDevices", "PrivateNetwork", "PrivateUsers", "NoNewPrivileges",
|
||||
"SyslogLevelPrefix", "Delegate", "RemainAfterElapse", "MemoryDenyWriteExecute",
|
||||
"RestrictRealtime", "DynamicUser")) {
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user