2019-05-19 15:08:55 +03:00
// SPDX-License-Identifier: GPL-2.0-only
2005-04-17 02:20:36 +04:00
/*
* The " user cache " .
*
* ( C ) Copyright 1991 - 2000 Linus Torvalds
*
* We have a per - user structure to keep track of how many
* processes , files etc the user has claimed , in order to be
* able to have per - user limits for system resources .
*/
# include <linux/init.h>
# include <linux/sched.h>
# include <linux/slab.h>
# include <linux/bitops.h>
# include <linux/key.h>
2017-02-08 20:51:30 +03:00
# include <linux/sched/user.h>
2006-01-25 17:23:07 +03:00
# include <linux/interrupt.h>
2011-05-23 22:51:41 +04:00
# include <linux/export.h>
2007-07-16 10:40:59 +04:00
# include <linux/user_namespace.h>
binfmt_misc: enable sandboxed mounts
Enable unprivileged sandboxes to create their own binfmt_misc mounts.
This is based on Laurent's work in [1] but has been significantly
reworked to fix various issues we identified in earlier versions.
While binfmt_misc can currently only be mounted in the initial user
namespace, binary types registered in this binfmt_misc instance are
available to all sandboxes (Either by having them installed in the
sandbox or by registering the binary type with the F flag causing the
interpreter to be opened right away). So binfmt_misc binary types are
already delegated to sandboxes implicitly.
However, while a sandbox has access to all registered binary types in
binfmt_misc a sandbox cannot currently register its own binary types
in binfmt_misc. This has prevented various use-cases some of which were
already outlined in [1] but we have a range of issues associated with
this (cf. [3]-[5] below which are just a small sample).
Extend binfmt_misc to be mountable in non-initial user namespaces.
Similar to other filesystem such as nfsd, mqueue, and sunrpc we use
keyed superblock management. The key determines whether we need to
create a new superblock or can reuse an already existing one. We use the
user namespace of the mount as key. This means a new binfmt_misc
superblock is created once per user namespace creation. Subsequent
mounts of binfmt_misc in the same user namespace will mount the same
binfmt_misc instance. We explicitly do not create a new binfmt_misc
superblock on every binfmt_misc mount as the semantics for
load_misc_binary() line up with the keying model. This also allows us to
retrieve the relevant binfmt_misc instance based on the caller's user
namespace which can be done in a simple (bounded to 32 levels) loop.
Similar to the current binfmt_misc semantics allowing access to the
binary types in the initial binfmt_misc instance we do allow sandboxes
access to their parent's binfmt_misc mounts if they do not have created
a separate binfmt_misc instance.
Overall, this will unblock the use-cases mentioned below and in general
will also allow to support and harden execution of another
architecture's binaries in tight sandboxes. For instance, using the
unshare binary it possible to start a chroot of another architecture and
configure the binfmt_misc interpreter without being root to run the
binaries in this chroot and without requiring the host to modify its
binary type handlers.
Henning had already posted a few experiments in the cover letter at [1].
But here's an additional example where an unprivileged container
registers qemu-user-static binary handlers for various binary types in
its separate binfmt_misc mount and is then seamlessly able to start
containers with a different architecture without affecting the host:
root [lxc monitor] /var/snap/lxd/common/lxd/containers f1
1000000 \_ /sbin/init
1000000 \_ /lib/systemd/systemd-journald
1000000 \_ /lib/systemd/systemd-udevd
1000100 \_ /lib/systemd/systemd-networkd
1000101 \_ /lib/systemd/systemd-resolved
1000000 \_ /usr/sbin/cron -f
1000103 \_ /usr/bin/dbus-daemon --system --address=systemd: --nofork --nopidfile --systemd-activation --syslog-only
1000000 \_ /usr/bin/python3 /usr/bin/networkd-dispatcher --run-startup-triggers
1000104 \_ /usr/sbin/rsyslogd -n -iNONE
1000000 \_ /lib/systemd/systemd-logind
1000000 \_ /sbin/agetty -o -p -- \u --noclear --keep-baud console 115200,38400,9600 vt220
1000107 \_ dnsmasq --conf-file=/dev/null -u lxc-dnsmasq --strict-order --bind-interfaces --pid-file=/run/lxc/dnsmasq.pid --liste
1000000 \_ [lxc monitor] /var/lib/lxc f1-s390x
1100000 \_ /usr/bin/qemu-s390x-static /sbin/init
1100000 \_ /usr/bin/qemu-s390x-static /lib/systemd/systemd-journald
1100000 \_ /usr/bin/qemu-s390x-static /usr/sbin/cron -f
1100103 \_ /usr/bin/qemu-s390x-static /usr/bin/dbus-daemon --system --address=systemd: --nofork --nopidfile --systemd-ac
1100000 \_ /usr/bin/qemu-s390x-static /usr/bin/python3 /usr/bin/networkd-dispatcher --run-startup-triggers
1100104 \_ /usr/bin/qemu-s390x-static /usr/sbin/rsyslogd -n -iNONE
1100000 \_ /usr/bin/qemu-s390x-static /lib/systemd/systemd-logind
1100000 \_ /usr/bin/qemu-s390x-static /sbin/agetty -o -p -- \u --noclear --keep-baud console 115200,38400,9600 vt220
1100000 \_ /usr/bin/qemu-s390x-static /sbin/agetty -o -p -- \u --noclear --keep-baud pts/0 115200,38400,9600 vt220
1100000 \_ /usr/bin/qemu-s390x-static /sbin/agetty -o -p -- \u --noclear --keep-baud pts/1 115200,38400,9600 vt220
1100000 \_ /usr/bin/qemu-s390x-static /sbin/agetty -o -p -- \u --noclear --keep-baud pts/2 115200,38400,9600 vt220
1100000 \_ /usr/bin/qemu-s390x-static /sbin/agetty -o -p -- \u --noclear --keep-baud pts/3 115200,38400,9600 vt220
1100000 \_ /usr/bin/qemu-s390x-static /lib/systemd/systemd-udevd
[1]: https://lore.kernel.org/all/20191216091220.465626-1-laurent@vivier.eu
[2]: https://discuss.linuxcontainers.org/t/binfmt-misc-permission-denied
[3]: https://discuss.linuxcontainers.org/t/lxd-binfmt-support-for-qemu-static-interpreters
[4]: https://discuss.linuxcontainers.org/t/3-1-0-binfmt-support-service-in-unprivileged-guest-requires-write-access-on-hosts-proc-sys-fs-binfmt-misc
[5]: https://discuss.linuxcontainers.org/t/qemu-user-static-not-working-4-11
Link: https://lore.kernel.org/r/20191216091220.465626-2-laurent@vivier.eu (origin)
Link: https://lore.kernel.org/r/20211028103114.2849140-2-brauner@kernel.org (v1)
Cc: Sargun Dhillon <sargun@sargun.me>
Cc: Serge Hallyn <serge@hallyn.com>
Cc: Jann Horn <jannh@google.com>
Cc: Henning Schild <henning.schild@siemens.com>
Cc: Andrei Vagin <avagin@gmail.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Laurent Vivier <laurent@vivier.eu>
Cc: linux-fsdevel@vger.kernel.org
Signed-off-by: Laurent Vivier <laurent@vivier.eu>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
---
/* v2 */
- Serge Hallyn <serge@hallyn.com>:
- Use GFP_KERNEL_ACCOUNT for userspace triggered allocations when a
new binary type handler is registered.
- Christian Brauner <christian.brauner@ubuntu.com>:
- Switch authorship to me. I refused to do that earlier even though
Laurent said I should do so because I think it's genuinely bad form.
But by now I have changed so many things that it'd be unfair to
blame Laurent for any potential bugs in here.
- Add more comments that explain what's going on.
- Rename functions while changing them to better reflect what they are
doing to make the code easier to understand.
- In the first version when a specific binary type handler was removed
either through a write to the entry's file or all binary type
handlers were removed by a write to the binfmt_misc mount's status
file all cleanup work happened during inode eviction.
That includes removal of the relevant entries from entry list. While
that works fine I disliked that model after thinking about it for a
bit. Because it means that there was a window were someone has
already removed a or all binary handlers but they could still be
safely reached from load_misc_binary() when it has managed to take
the read_lock() on the entries list while inode eviction was already
happening. Again, that perfectly benign but it's cleaner to remove
the binary handler from the list immediately meaning that ones the
write to then entry's file or the binfmt_misc status file returns
the binary type cannot be executed anymore. That gives stronger
guarantees to the user.
2021-10-28 13:31:14 +03:00
# include <linux/binfmts.h>
2013-04-12 04:50:06 +04:00
# include <linux/proc_ns.h>
2005-04-17 02:20:36 +04:00
binfmt_misc: enable sandboxed mounts
Enable unprivileged sandboxes to create their own binfmt_misc mounts.
This is based on Laurent's work in [1] but has been significantly
reworked to fix various issues we identified in earlier versions.
While binfmt_misc can currently only be mounted in the initial user
namespace, binary types registered in this binfmt_misc instance are
available to all sandboxes (Either by having them installed in the
sandbox or by registering the binary type with the F flag causing the
interpreter to be opened right away). So binfmt_misc binary types are
already delegated to sandboxes implicitly.
However, while a sandbox has access to all registered binary types in
binfmt_misc a sandbox cannot currently register its own binary types
in binfmt_misc. This has prevented various use-cases some of which were
already outlined in [1] but we have a range of issues associated with
this (cf. [3]-[5] below which are just a small sample).
Extend binfmt_misc to be mountable in non-initial user namespaces.
Similar to other filesystem such as nfsd, mqueue, and sunrpc we use
keyed superblock management. The key determines whether we need to
create a new superblock or can reuse an already existing one. We use the
user namespace of the mount as key. This means a new binfmt_misc
superblock is created once per user namespace creation. Subsequent
mounts of binfmt_misc in the same user namespace will mount the same
binfmt_misc instance. We explicitly do not create a new binfmt_misc
superblock on every binfmt_misc mount as the semantics for
load_misc_binary() line up with the keying model. This also allows us to
retrieve the relevant binfmt_misc instance based on the caller's user
namespace which can be done in a simple (bounded to 32 levels) loop.
Similar to the current binfmt_misc semantics allowing access to the
binary types in the initial binfmt_misc instance we do allow sandboxes
access to their parent's binfmt_misc mounts if they do not have created
a separate binfmt_misc instance.
Overall, this will unblock the use-cases mentioned below and in general
will also allow to support and harden execution of another
architecture's binaries in tight sandboxes. For instance, using the
unshare binary it possible to start a chroot of another architecture and
configure the binfmt_misc interpreter without being root to run the
binaries in this chroot and without requiring the host to modify its
binary type handlers.
Henning had already posted a few experiments in the cover letter at [1].
But here's an additional example where an unprivileged container
registers qemu-user-static binary handlers for various binary types in
its separate binfmt_misc mount and is then seamlessly able to start
containers with a different architecture without affecting the host:
root [lxc monitor] /var/snap/lxd/common/lxd/containers f1
1000000 \_ /sbin/init
1000000 \_ /lib/systemd/systemd-journald
1000000 \_ /lib/systemd/systemd-udevd
1000100 \_ /lib/systemd/systemd-networkd
1000101 \_ /lib/systemd/systemd-resolved
1000000 \_ /usr/sbin/cron -f
1000103 \_ /usr/bin/dbus-daemon --system --address=systemd: --nofork --nopidfile --systemd-activation --syslog-only
1000000 \_ /usr/bin/python3 /usr/bin/networkd-dispatcher --run-startup-triggers
1000104 \_ /usr/sbin/rsyslogd -n -iNONE
1000000 \_ /lib/systemd/systemd-logind
1000000 \_ /sbin/agetty -o -p -- \u --noclear --keep-baud console 115200,38400,9600 vt220
1000107 \_ dnsmasq --conf-file=/dev/null -u lxc-dnsmasq --strict-order --bind-interfaces --pid-file=/run/lxc/dnsmasq.pid --liste
1000000 \_ [lxc monitor] /var/lib/lxc f1-s390x
1100000 \_ /usr/bin/qemu-s390x-static /sbin/init
1100000 \_ /usr/bin/qemu-s390x-static /lib/systemd/systemd-journald
1100000 \_ /usr/bin/qemu-s390x-static /usr/sbin/cron -f
1100103 \_ /usr/bin/qemu-s390x-static /usr/bin/dbus-daemon --system --address=systemd: --nofork --nopidfile --systemd-ac
1100000 \_ /usr/bin/qemu-s390x-static /usr/bin/python3 /usr/bin/networkd-dispatcher --run-startup-triggers
1100104 \_ /usr/bin/qemu-s390x-static /usr/sbin/rsyslogd -n -iNONE
1100000 \_ /usr/bin/qemu-s390x-static /lib/systemd/systemd-logind
1100000 \_ /usr/bin/qemu-s390x-static /sbin/agetty -o -p -- \u --noclear --keep-baud console 115200,38400,9600 vt220
1100000 \_ /usr/bin/qemu-s390x-static /sbin/agetty -o -p -- \u --noclear --keep-baud pts/0 115200,38400,9600 vt220
1100000 \_ /usr/bin/qemu-s390x-static /sbin/agetty -o -p -- \u --noclear --keep-baud pts/1 115200,38400,9600 vt220
1100000 \_ /usr/bin/qemu-s390x-static /sbin/agetty -o -p -- \u --noclear --keep-baud pts/2 115200,38400,9600 vt220
1100000 \_ /usr/bin/qemu-s390x-static /sbin/agetty -o -p -- \u --noclear --keep-baud pts/3 115200,38400,9600 vt220
1100000 \_ /usr/bin/qemu-s390x-static /lib/systemd/systemd-udevd
[1]: https://lore.kernel.org/all/20191216091220.465626-1-laurent@vivier.eu
[2]: https://discuss.linuxcontainers.org/t/binfmt-misc-permission-denied
[3]: https://discuss.linuxcontainers.org/t/lxd-binfmt-support-for-qemu-static-interpreters
[4]: https://discuss.linuxcontainers.org/t/3-1-0-binfmt-support-service-in-unprivileged-guest-requires-write-access-on-hosts-proc-sys-fs-binfmt-misc
[5]: https://discuss.linuxcontainers.org/t/qemu-user-static-not-working-4-11
Link: https://lore.kernel.org/r/20191216091220.465626-2-laurent@vivier.eu (origin)
Link: https://lore.kernel.org/r/20211028103114.2849140-2-brauner@kernel.org (v1)
Cc: Sargun Dhillon <sargun@sargun.me>
Cc: Serge Hallyn <serge@hallyn.com>
Cc: Jann Horn <jannh@google.com>
Cc: Henning Schild <henning.schild@siemens.com>
Cc: Andrei Vagin <avagin@gmail.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Laurent Vivier <laurent@vivier.eu>
Cc: linux-fsdevel@vger.kernel.org
Signed-off-by: Laurent Vivier <laurent@vivier.eu>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
---
/* v2 */
- Serge Hallyn <serge@hallyn.com>:
- Use GFP_KERNEL_ACCOUNT for userspace triggered allocations when a
new binary type handler is registered.
- Christian Brauner <christian.brauner@ubuntu.com>:
- Switch authorship to me. I refused to do that earlier even though
Laurent said I should do so because I think it's genuinely bad form.
But by now I have changed so many things that it'd be unfair to
blame Laurent for any potential bugs in here.
- Add more comments that explain what's going on.
- Rename functions while changing them to better reflect what they are
doing to make the code easier to understand.
- In the first version when a specific binary type handler was removed
either through a write to the entry's file or all binary type
handlers were removed by a write to the binfmt_misc mount's status
file all cleanup work happened during inode eviction.
That includes removal of the relevant entries from entry list. While
that works fine I disliked that model after thinking about it for a
bit. Because it means that there was a window were someone has
already removed a or all binary handlers but they could still be
safely reached from load_misc_binary() when it has managed to take
the read_lock() on the entries list while inode eviction was already
happening. Again, that perfectly benign but it's cleaner to remove
the binary handler from the list immediately meaning that ones the
write to then entry's file or the binfmt_misc status file returns
the binary type cannot be executed anymore. That gives stronger
guarantees to the user.
2021-10-28 13:31:14 +03:00
# if IS_ENABLED(CONFIG_BINFMT_MISC)
struct binfmt_misc init_binfmt_misc = {
. entries = LIST_HEAD_INIT ( init_binfmt_misc . entries ) ,
. enabled = true ,
. entries_lock = __RW_LOCK_UNLOCKED ( init_binfmt_misc . entries_lock ) ,
} ;
EXPORT_SYMBOL_GPL ( init_binfmt_misc ) ;
# endif
userns: add a user_namespace as creator/owner of uts_namespace
The expected course of development for user namespaces targeted
capabilities is laid out at https://wiki.ubuntu.com/UserNamespace.
Goals:
- Make it safe for an unprivileged user to unshare namespaces. They
will be privileged with respect to the new namespace, but this should
only include resources which the unprivileged user already owns.
- Provide separate limits and accounting for userids in different
namespaces.
Status:
Currently (as of 2.6.38) you can clone with the CLONE_NEWUSER flag to
get a new user namespace if you have the CAP_SYS_ADMIN, CAP_SETUID, and
CAP_SETGID capabilities. What this gets you is a whole new set of
userids, meaning that user 500 will have a different 'struct user' in
your namespace than in other namespaces. So any accounting information
stored in struct user will be unique to your namespace.
However, throughout the kernel there are checks which
- simply check for a capability. Since root in a child namespace
has all capabilities, this means that a child namespace is not
constrained.
- simply compare uid1 == uid2. Since these are the integer uids,
uid 500 in namespace 1 will be said to be equal to uid 500 in
namespace 2.
As a result, the lxc implementation at lxc.sf.net does not use user
namespaces. This is actually helpful because it leaves us free to
develop user namespaces in such a way that, for some time, user
namespaces may be unuseful.
Bugs aside, this patchset is supposed to not at all affect systems which
are not actively using user namespaces, and only restrict what tasks in
child user namespace can do. They begin to limit privilege to a user
namespace, so that root in a container cannot kill or ptrace tasks in the
parent user namespace, and can only get world access rights to files.
Since all files currently belong to the initila user namespace, that means
that child user namespaces can only get world access rights to *all*
files. While this temporarily makes user namespaces bad for system
containers, it starts to get useful for some sandboxing.
I've run the 'runltplite.sh' with and without this patchset and found no
difference.
This patch:
copy_process() handles CLONE_NEWUSER before the rest of the namespaces.
So in the case of clone(CLONE_NEWUSER|CLONE_NEWUTS) the new uts namespace
will have the new user namespace as its owner. That is what we want,
since we want root in that new userns to be able to have privilege over
it.
Changelog:
Feb 15: don't set uts_ns->user_ns if we didn't create
a new uts_ns.
Feb 23: Move extern init_user_ns declaration from
init/version.c to utsname.h.
Signed-off-by: Serge E. Hallyn <serge.hallyn@canonical.com>
Acked-by: "Eric W. Biederman" <ebiederm@xmission.com>
Acked-by: Daniel Lezcano <daniel.lezcano@free.fr>
Acked-by: David Howells <dhowells@redhat.com>
Cc: James Morris <jmorris@namei.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2011-03-24 02:43:16 +03:00
/*
* userns count is 1 for root user , 1 for init_uts_ns ,
* and 1 for . . . ?
*/
2008-02-08 15:18:23 +03:00
struct user_namespace init_user_ns = {
2011-11-17 12:11:58 +04:00
. uid_map = {
. nr_extents = 1 ,
2017-10-25 01:04:40 +03:00
{
. extent [ 0 ] = {
. first = 0 ,
. lower_first = 0 ,
. count = 4294967295U ,
} ,
2011-11-17 12:11:58 +04:00
} ,
} ,
. gid_map = {
. nr_extents = 1 ,
2017-10-25 01:04:40 +03:00
{
. extent [ 0 ] = {
. first = 0 ,
. lower_first = 0 ,
. count = 4294967295U ,
} ,
2011-11-17 12:11:58 +04:00
} ,
} ,
2012-08-30 12:24:05 +04:00
. projid_map = {
. nr_extents = 1 ,
2017-10-25 01:04:40 +03:00
{
. extent [ 0 ] = {
. first = 0 ,
. lower_first = 0 ,
. count = 4294967295U ,
} ,
2012-08-30 12:24:05 +04:00
} ,
} ,
2020-08-03 13:16:37 +03:00
. ns . count = REFCOUNT_INIT ( 3 ) ,
2011-11-17 13:32:59 +04:00
. owner = GLOBAL_ROOT_UID ,
. group = GLOBAL_ROOT_GID ,
2014-11-01 05:56:04 +03:00
. ns . inum = PROC_USER_INIT_INO ,
2014-11-01 09:32:53 +03:00
# ifdef CONFIG_USER_NS
. ns . ops = & userns_operations ,
# endif
2014-12-02 21:27:26 +03:00
. flags = USERNS_INIT_FLAGS ,
2019-06-26 23:02:32 +03:00
# ifdef CONFIG_KEYS
. keyring_name_list = LIST_HEAD_INIT ( init_user_ns . keyring_name_list ) ,
2019-06-26 23:02:32 +03:00
. keyring_sem = __RWSEM_INITIALIZER ( init_user_ns . keyring_sem ) ,
2013-09-24 13:35:19 +04:00
# endif
binfmt_misc: enable sandboxed mounts
Enable unprivileged sandboxes to create their own binfmt_misc mounts.
This is based on Laurent's work in [1] but has been significantly
reworked to fix various issues we identified in earlier versions.
While binfmt_misc can currently only be mounted in the initial user
namespace, binary types registered in this binfmt_misc instance are
available to all sandboxes (Either by having them installed in the
sandbox or by registering the binary type with the F flag causing the
interpreter to be opened right away). So binfmt_misc binary types are
already delegated to sandboxes implicitly.
However, while a sandbox has access to all registered binary types in
binfmt_misc a sandbox cannot currently register its own binary types
in binfmt_misc. This has prevented various use-cases some of which were
already outlined in [1] but we have a range of issues associated with
this (cf. [3]-[5] below which are just a small sample).
Extend binfmt_misc to be mountable in non-initial user namespaces.
Similar to other filesystem such as nfsd, mqueue, and sunrpc we use
keyed superblock management. The key determines whether we need to
create a new superblock or can reuse an already existing one. We use the
user namespace of the mount as key. This means a new binfmt_misc
superblock is created once per user namespace creation. Subsequent
mounts of binfmt_misc in the same user namespace will mount the same
binfmt_misc instance. We explicitly do not create a new binfmt_misc
superblock on every binfmt_misc mount as the semantics for
load_misc_binary() line up with the keying model. This also allows us to
retrieve the relevant binfmt_misc instance based on the caller's user
namespace which can be done in a simple (bounded to 32 levels) loop.
Similar to the current binfmt_misc semantics allowing access to the
binary types in the initial binfmt_misc instance we do allow sandboxes
access to their parent's binfmt_misc mounts if they do not have created
a separate binfmt_misc instance.
Overall, this will unblock the use-cases mentioned below and in general
will also allow to support and harden execution of another
architecture's binaries in tight sandboxes. For instance, using the
unshare binary it possible to start a chroot of another architecture and
configure the binfmt_misc interpreter without being root to run the
binaries in this chroot and without requiring the host to modify its
binary type handlers.
Henning had already posted a few experiments in the cover letter at [1].
But here's an additional example where an unprivileged container
registers qemu-user-static binary handlers for various binary types in
its separate binfmt_misc mount and is then seamlessly able to start
containers with a different architecture without affecting the host:
root [lxc monitor] /var/snap/lxd/common/lxd/containers f1
1000000 \_ /sbin/init
1000000 \_ /lib/systemd/systemd-journald
1000000 \_ /lib/systemd/systemd-udevd
1000100 \_ /lib/systemd/systemd-networkd
1000101 \_ /lib/systemd/systemd-resolved
1000000 \_ /usr/sbin/cron -f
1000103 \_ /usr/bin/dbus-daemon --system --address=systemd: --nofork --nopidfile --systemd-activation --syslog-only
1000000 \_ /usr/bin/python3 /usr/bin/networkd-dispatcher --run-startup-triggers
1000104 \_ /usr/sbin/rsyslogd -n -iNONE
1000000 \_ /lib/systemd/systemd-logind
1000000 \_ /sbin/agetty -o -p -- \u --noclear --keep-baud console 115200,38400,9600 vt220
1000107 \_ dnsmasq --conf-file=/dev/null -u lxc-dnsmasq --strict-order --bind-interfaces --pid-file=/run/lxc/dnsmasq.pid --liste
1000000 \_ [lxc monitor] /var/lib/lxc f1-s390x
1100000 \_ /usr/bin/qemu-s390x-static /sbin/init
1100000 \_ /usr/bin/qemu-s390x-static /lib/systemd/systemd-journald
1100000 \_ /usr/bin/qemu-s390x-static /usr/sbin/cron -f
1100103 \_ /usr/bin/qemu-s390x-static /usr/bin/dbus-daemon --system --address=systemd: --nofork --nopidfile --systemd-ac
1100000 \_ /usr/bin/qemu-s390x-static /usr/bin/python3 /usr/bin/networkd-dispatcher --run-startup-triggers
1100104 \_ /usr/bin/qemu-s390x-static /usr/sbin/rsyslogd -n -iNONE
1100000 \_ /usr/bin/qemu-s390x-static /lib/systemd/systemd-logind
1100000 \_ /usr/bin/qemu-s390x-static /sbin/agetty -o -p -- \u --noclear --keep-baud console 115200,38400,9600 vt220
1100000 \_ /usr/bin/qemu-s390x-static /sbin/agetty -o -p -- \u --noclear --keep-baud pts/0 115200,38400,9600 vt220
1100000 \_ /usr/bin/qemu-s390x-static /sbin/agetty -o -p -- \u --noclear --keep-baud pts/1 115200,38400,9600 vt220
1100000 \_ /usr/bin/qemu-s390x-static /sbin/agetty -o -p -- \u --noclear --keep-baud pts/2 115200,38400,9600 vt220
1100000 \_ /usr/bin/qemu-s390x-static /sbin/agetty -o -p -- \u --noclear --keep-baud pts/3 115200,38400,9600 vt220
1100000 \_ /usr/bin/qemu-s390x-static /lib/systemd/systemd-udevd
[1]: https://lore.kernel.org/all/20191216091220.465626-1-laurent@vivier.eu
[2]: https://discuss.linuxcontainers.org/t/binfmt-misc-permission-denied
[3]: https://discuss.linuxcontainers.org/t/lxd-binfmt-support-for-qemu-static-interpreters
[4]: https://discuss.linuxcontainers.org/t/3-1-0-binfmt-support-service-in-unprivileged-guest-requires-write-access-on-hosts-proc-sys-fs-binfmt-misc
[5]: https://discuss.linuxcontainers.org/t/qemu-user-static-not-working-4-11
Link: https://lore.kernel.org/r/20191216091220.465626-2-laurent@vivier.eu (origin)
Link: https://lore.kernel.org/r/20211028103114.2849140-2-brauner@kernel.org (v1)
Cc: Sargun Dhillon <sargun@sargun.me>
Cc: Serge Hallyn <serge@hallyn.com>
Cc: Jann Horn <jannh@google.com>
Cc: Henning Schild <henning.schild@siemens.com>
Cc: Andrei Vagin <avagin@gmail.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Laurent Vivier <laurent@vivier.eu>
Cc: linux-fsdevel@vger.kernel.org
Signed-off-by: Laurent Vivier <laurent@vivier.eu>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
---
/* v2 */
- Serge Hallyn <serge@hallyn.com>:
- Use GFP_KERNEL_ACCOUNT for userspace triggered allocations when a
new binary type handler is registered.
- Christian Brauner <christian.brauner@ubuntu.com>:
- Switch authorship to me. I refused to do that earlier even though
Laurent said I should do so because I think it's genuinely bad form.
But by now I have changed so many things that it'd be unfair to
blame Laurent for any potential bugs in here.
- Add more comments that explain what's going on.
- Rename functions while changing them to better reflect what they are
doing to make the code easier to understand.
- In the first version when a specific binary type handler was removed
either through a write to the entry's file or all binary type
handlers were removed by a write to the binfmt_misc mount's status
file all cleanup work happened during inode eviction.
That includes removal of the relevant entries from entry list. While
that works fine I disliked that model after thinking about it for a
bit. Because it means that there was a window were someone has
already removed a or all binary handlers but they could still be
safely reached from load_misc_binary() when it has managed to take
the read_lock() on the entries list while inode eviction was already
happening. Again, that perfectly benign but it's cleaner to remove
the binary handler from the list immediately meaning that ones the
write to then entry's file or the binfmt_misc status file returns
the binary type cannot be executed anymore. That gives stronger
guarantees to the user.
2021-10-28 13:31:14 +03:00
# if IS_ENABLED(CONFIG_BINFMT_MISC)
. binfmt_misc = & init_binfmt_misc ,
# endif
2008-02-08 15:18:23 +03:00
} ;
EXPORT_SYMBOL_GPL ( init_user_ns ) ;
2005-04-17 02:20:36 +04:00
/*
* UID task count cache , to get fast user lookup in " alloc_uid "
* when changing user ID ' s ( ie setuid ( ) and friends ) .
*/
2024-05-05 11:03:42 +03:00
# define UIDHASH_BITS (IS_ENABLED(CONFIG_BASE_SMALL) ? 3 : 7)
2011-11-17 11:20:58 +04:00
# define UIDHASH_SZ (1 << UIDHASH_BITS)
2005-04-17 02:20:36 +04:00
# define UIDHASH_MASK (UIDHASH_SZ - 1)
# define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK)
2011-11-17 11:20:58 +04:00
# define uidhashentry(uid) (uidhash_table + __uidhashfn((__kuid_val(uid))))
2005-04-17 02:20:36 +04:00
2006-12-07 07:33:20 +03:00
static struct kmem_cache * uid_cachep ;
2020-06-05 02:49:58 +03:00
static struct hlist_head uidhash_table [ UIDHASH_SZ ] ;
2006-01-25 17:23:07 +03:00
/*
* The uidhash_lock is mostly taken from process context , but it is
* occasionally also taken from softirq / tasklet context , when
* task - structs get RCU - freed . Hence all locking must be softirq - safe .
2006-02-01 03:34:26 +03:00
* But free_uid ( ) is also called with local interrupts disabled , and running
* local_bh_enable ( ) with local interrupts disabled is an error - we ' ll run
* softirq callbacks , and they can unconditionally enable interrupts , and
* the caller of free_uid ( ) didn ' t expect that . .
2006-01-25 17:23:07 +03:00
*/
2005-04-17 02:20:36 +04:00
static DEFINE_SPINLOCK ( uidhash_lock ) ;
2011-11-17 13:32:59 +04:00
/* root_user.__count is 1, for init task cred */
2005-04-17 02:20:36 +04:00
struct user_struct root_user = {
2018-08-22 07:55:38 +03:00
. __count = REFCOUNT_INIT ( 1 ) ,
2011-11-17 11:20:58 +04:00
. uid = GLOBAL_ROOT_UID ,
2018-02-22 20:15:06 +03:00
. ratelimit = RATELIMIT_STATE_INIT ( root_user . ratelimit , 0 , 0 ) ,
2005-04-17 02:20:36 +04:00
} ;
2007-10-15 19:00:14 +04:00
/*
* These routines must be called with the uidhash spinlock held !
*/
2007-10-17 10:30:09 +04:00
static void uid_hash_insert ( struct user_struct * up , struct hlist_head * hashent )
2007-10-15 19:00:14 +04:00
{
hlist_add_head ( & up - > uidhash_node , hashent ) ;
}
2007-10-17 10:30:09 +04:00
static void uid_hash_remove ( struct user_struct * up )
2007-10-15 19:00:14 +04:00
{
hlist_del_init ( & up - > uidhash_node ) ;
}
2011-11-17 11:20:58 +04:00
static struct user_struct * uid_hash_find ( kuid_t uid , struct hlist_head * hashent )
sched: delayed cleanup of user_struct
During bootup performance tracing we see repeated occurrences of
/sys/kernel/uid/* events for the same uid, leading to a,
in this case, rather pointless userspace processing for the
same uid over and over.
This is usually caused by tools which change their uid to "nobody",
to run without privileges to read data supplied by untrusted users.
This change delays the execution of the (already existing) scheduled
work, to cleanup the uid after one second, so the allocated and announced
uid can possibly be re-used by another process.
This is the current behavior, where almost every invocation of a
binary, which changes the uid, creates two events:
$ read START < /sys/kernel/uevent_seqnum; \
for i in `seq 100`; do su --shell=/bin/true bin; done; \
read END < /sys/kernel/uevent_seqnum; \
echo $(($END - $START))
178
With the delayed cleanup, we get only two events, and userspace finishes
a bit faster too:
$ read START < /sys/kernel/uevent_seqnum; \
for i in `seq 100`; do su --shell=/bin/true bin; done; \
read END < /sys/kernel/uevent_seqnum; \
echo $(($END - $START))
1
Acked-by: Dhaval Giani <dhaval@linux.vnet.ibm.com>
Signed-off-by: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
2009-03-24 17:43:30 +03:00
{
struct user_struct * user ;
hlist: drop the node parameter from iterators
I'm not sure why, but the hlist for each entry iterators were conceived
list_for_each_entry(pos, head, member)
The hlist ones were greedy and wanted an extra parameter:
hlist_for_each_entry(tpos, pos, head, member)
Why did they need an extra pos parameter? I'm not quite sure. Not only
they don't really need it, it also prevents the iterator from looking
exactly like the list iterator, which is unfortunate.
Besides the semantic patch, there was some manual work required:
- Fix up the actual hlist iterators in linux/list.h
- Fix up the declaration of other iterators based on the hlist ones.
- A very small amount of places were using the 'node' parameter, this
was modified to use 'obj->member' instead.
- Coccinelle didn't handle the hlist_for_each_entry_safe iterator
properly, so those had to be fixed up manually.
The semantic patch which is mostly the work of Peter Senna Tschudin is here:
@@
iterator name hlist_for_each_entry, hlist_for_each_entry_continue, hlist_for_each_entry_from, hlist_for_each_entry_rcu, hlist_for_each_entry_rcu_bh, hlist_for_each_entry_continue_rcu_bh, for_each_busy_worker, ax25_uid_for_each, ax25_for_each, inet_bind_bucket_for_each, sctp_for_each_hentry, sk_for_each, sk_for_each_rcu, sk_for_each_from, sk_for_each_safe, sk_for_each_bound, hlist_for_each_entry_safe, hlist_for_each_entry_continue_rcu, nr_neigh_for_each, nr_neigh_for_each_safe, nr_node_for_each, nr_node_for_each_safe, for_each_gfn_indirect_valid_sp, for_each_gfn_sp, for_each_host;
type T;
expression a,c,d,e;
identifier b;
statement S;
@@
-T b;
<+... when != b
(
hlist_for_each_entry(a,
- b,
c, d) S
|
hlist_for_each_entry_continue(a,
- b,
c) S
|
hlist_for_each_entry_from(a,
- b,
c) S
|
hlist_for_each_entry_rcu(a,
- b,
c, d) S
|
hlist_for_each_entry_rcu_bh(a,
- b,
c, d) S
|
hlist_for_each_entry_continue_rcu_bh(a,
- b,
c) S
|
for_each_busy_worker(a, c,
- b,
d) S
|
ax25_uid_for_each(a,
- b,
c) S
|
ax25_for_each(a,
- b,
c) S
|
inet_bind_bucket_for_each(a,
- b,
c) S
|
sctp_for_each_hentry(a,
- b,
c) S
|
sk_for_each(a,
- b,
c) S
|
sk_for_each_rcu(a,
- b,
c) S
|
sk_for_each_from
-(a, b)
+(a)
S
+ sk_for_each_from(a) S
|
sk_for_each_safe(a,
- b,
c, d) S
|
sk_for_each_bound(a,
- b,
c) S
|
hlist_for_each_entry_safe(a,
- b,
c, d, e) S
|
hlist_for_each_entry_continue_rcu(a,
- b,
c) S
|
nr_neigh_for_each(a,
- b,
c) S
|
nr_neigh_for_each_safe(a,
- b,
c, d) S
|
nr_node_for_each(a,
- b,
c) S
|
nr_node_for_each_safe(a,
- b,
c, d) S
|
- for_each_gfn_sp(a, c, d, b) S
+ for_each_gfn_sp(a, c, d) S
|
- for_each_gfn_indirect_valid_sp(a, c, d, b) S
+ for_each_gfn_indirect_valid_sp(a, c, d) S
|
for_each_host(a,
- b,
c) S
|
for_each_host_safe(a,
- b,
c, d) S
|
for_each_mesh_entry(a,
- b,
c, d) S
)
...+>
[akpm@linux-foundation.org: drop bogus change from net/ipv4/raw.c]
[akpm@linux-foundation.org: drop bogus hunk from net/ipv6/raw.c]
[akpm@linux-foundation.org: checkpatch fixes]
[akpm@linux-foundation.org: fix warnings]
[akpm@linux-foudnation.org: redo intrusive kvm changes]
Tested-by: Peter Senna Tschudin <peter.senna@gmail.com>
Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Sasha Levin <sasha.levin@oracle.com>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-02-28 05:06:00 +04:00
hlist_for_each_entry ( user , hashent , uidhash_node ) {
2011-11-17 11:20:58 +04:00
if ( uid_eq ( user - > uid , uid ) ) {
2018-08-22 07:55:38 +03:00
refcount_inc ( & user - > __count ) ;
sched: delayed cleanup of user_struct
During bootup performance tracing we see repeated occurrences of
/sys/kernel/uid/* events for the same uid, leading to a,
in this case, rather pointless userspace processing for the
same uid over and over.
This is usually caused by tools which change their uid to "nobody",
to run without privileges to read data supplied by untrusted users.
This change delays the execution of the (already existing) scheduled
work, to cleanup the uid after one second, so the allocated and announced
uid can possibly be re-used by another process.
This is the current behavior, where almost every invocation of a
binary, which changes the uid, creates two events:
$ read START < /sys/kernel/uevent_seqnum; \
for i in `seq 100`; do su --shell=/bin/true bin; done; \
read END < /sys/kernel/uevent_seqnum; \
echo $(($END - $START))
178
With the delayed cleanup, we get only two events, and userspace finishes
a bit faster too:
$ read START < /sys/kernel/uevent_seqnum; \
for i in `seq 100`; do su --shell=/bin/true bin; done; \
read END < /sys/kernel/uevent_seqnum; \
echo $(($END - $START))
1
Acked-by: Dhaval Giani <dhaval@linux.vnet.ibm.com>
Signed-off-by: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
2009-03-24 17:43:30 +03:00
return user ;
}
}
return NULL ;
}
2021-09-08 06:00:00 +03:00
static int user_epoll_alloc ( struct user_struct * up )
{
# ifdef CONFIG_EPOLL
return percpu_counter_init ( & up - > epoll_watches , 0 , GFP_KERNEL ) ;
# else
return 0 ;
# endif
}
static void user_epoll_free ( struct user_struct * up )
{
# ifdef CONFIG_EPOLL
percpu_counter_destroy ( & up - > epoll_watches ) ;
# endif
}
2007-10-15 19:00:14 +04:00
/* IRQs are disabled and uidhash_lock is held upon function entry.
* IRQ state ( as stored in flags ) is restored and uidhash_lock released
* upon function exit .
*/
2008-10-16 01:38:45 +04:00
static void free_user ( struct user_struct * up , unsigned long flags )
2010-10-27 01:22:43 +04:00
__releases ( & uidhash_lock )
2007-10-15 19:00:14 +04:00
{
uid_hash_remove ( up ) ;
spin_unlock_irqrestore ( & uidhash_lock , flags ) ;
2021-09-08 06:00:00 +03:00
user_epoll_free ( up ) ;
2007-10-15 19:00:14 +04:00
kmem_cache_free ( uid_cachep , up ) ;
}
2005-04-17 02:20:36 +04:00
/*
* Locate the user_struct for the passed UID . If found , take a ref on it . The
* caller must undo that ref with free_uid ( ) .
*
* If the user_struct could not be found , return NULL .
*/
2011-11-17 11:20:58 +04:00
struct user_struct * find_user ( kuid_t uid )
2005-04-17 02:20:36 +04:00
{
struct user_struct * ret ;
2006-02-01 03:34:26 +03:00
unsigned long flags ;
2005-04-17 02:20:36 +04:00
2006-02-01 03:34:26 +03:00
spin_lock_irqsave ( & uidhash_lock , flags ) ;
2011-11-17 11:20:58 +04:00
ret = uid_hash_find ( uid , uidhashentry ( uid ) ) ;
2006-02-01 03:34:26 +03:00
spin_unlock_irqrestore ( & uidhash_lock , flags ) ;
2005-04-17 02:20:36 +04:00
return ret ;
}
void free_uid ( struct user_struct * up )
{
2006-02-01 03:34:26 +03:00
unsigned long flags ;
2006-03-24 14:15:47 +03:00
if ( ! up )
return ;
2018-08-22 07:55:42 +03:00
if ( refcount_dec_and_lock_irqsave ( & up - > __count , & uidhash_lock , & flags ) )
2007-10-15 19:00:14 +04:00
free_user ( up , flags ) ;
2005-04-17 02:20:36 +04:00
}
2022-11-29 23:29:30 +03:00
EXPORT_SYMBOL_GPL ( free_uid ) ;
2005-04-17 02:20:36 +04:00
2011-11-17 11:20:58 +04:00
struct user_struct * alloc_uid ( kuid_t uid )
2005-04-17 02:20:36 +04:00
{
2011-11-17 11:20:58 +04:00
struct hlist_head * hashent = uidhashentry ( uid ) ;
2008-01-25 23:08:26 +03:00
struct user_struct * up , * new ;
2005-04-17 02:20:36 +04:00
2006-02-01 03:34:26 +03:00
spin_lock_irq ( & uidhash_lock ) ;
2005-04-17 02:20:36 +04:00
up = uid_hash_find ( uid , hashent ) ;
2006-02-01 03:34:26 +03:00
spin_unlock_irq ( & uidhash_lock ) ;
2005-04-17 02:20:36 +04:00
if ( ! up ) {
2008-04-30 11:54:54 +04:00
new = kmem_cache_zalloc ( uid_cachep , GFP_KERNEL ) ;
2008-01-25 23:08:26 +03:00
if ( ! new )
2019-05-15 01:42:37 +03:00
return NULL ;
2007-11-26 23:21:49 +03:00
2005-04-17 02:20:36 +04:00
new - > uid = uid ;
2018-08-22 07:55:38 +03:00
refcount_set ( & new - > __count , 1 ) ;
2021-09-08 06:00:00 +03:00
if ( user_epoll_alloc ( new ) ) {
kmem_cache_free ( uid_cachep , new ) ;
return NULL ;
}
2018-02-22 20:15:06 +03:00
ratelimit_state_init ( & new - > ratelimit , HZ , 100 ) ;
ratelimit_set_flags ( & new - > ratelimit , RATELIMIT_MSG_ON_RELEASE ) ;
2005-04-17 02:20:36 +04:00
/*
* Before adding this , check whether we raced
* on adding the same user already . .
*/
2006-02-01 03:34:26 +03:00
spin_lock_irq ( & uidhash_lock ) ;
2005-04-17 02:20:36 +04:00
up = uid_hash_find ( uid , hashent ) ;
if ( up ) {
2021-09-08 06:00:00 +03:00
user_epoll_free ( new ) ;
2005-04-17 02:20:36 +04:00
kmem_cache_free ( uid_cachep , new ) ;
} else {
uid_hash_insert ( new , hashent ) ;
up = new ;
}
2006-02-01 03:34:26 +03:00
spin_unlock_irq ( & uidhash_lock ) ;
2005-04-17 02:20:36 +04:00
}
2007-10-15 19:00:14 +04:00
2005-04-17 02:20:36 +04:00
return up ;
}
static int __init uid_cache_init ( void )
{
int n ;
uid_cachep = kmem_cache_create ( " uid_cache " , sizeof ( struct user_struct ) ,
2007-07-20 05:11:58 +04:00
0 , SLAB_HWCACHE_ALIGN | SLAB_PANIC , NULL ) ;
2005-04-17 02:20:36 +04:00
for ( n = 0 ; n < UIDHASH_SZ ; + + n )
2011-11-17 11:20:58 +04:00
INIT_HLIST_HEAD ( uidhash_table + n ) ;
2005-04-17 02:20:36 +04:00
2021-09-08 06:00:00 +03:00
if ( user_epoll_alloc ( & root_user ) )
panic ( " root_user epoll percpu counter alloc failed " ) ;
2005-04-17 02:20:36 +04:00
/* Insert the root user immediately (init already runs as root) */
2006-02-01 03:34:26 +03:00
spin_lock_irq ( & uidhash_lock ) ;
2011-11-17 11:20:58 +04:00
uid_hash_insert ( & root_user , uidhashentry ( GLOBAL_ROOT_UID ) ) ;
2006-02-01 03:34:26 +03:00
spin_unlock_irq ( & uidhash_lock ) ;
2005-04-17 02:20:36 +04:00
return 0 ;
}
2014-04-04 01:48:35 +04:00
subsys_initcall ( uid_cache_init ) ;