2019-05-19 15:08:55 +03:00
// SPDX-License-Identifier: GPL-2.0-only
2017-09-09 02:17:00 +03:00
/*
* umh - the kernel usermode helper
*/
# include <linux/module.h>
# include <linux/sched.h>
# include <linux/sched/task.h>
# include <linux/binfmts.h>
# include <linux/syscalls.h>
# include <linux/unistd.h>
# include <linux/kmod.h>
# include <linux/slab.h>
# include <linux/completion.h>
# include <linux/cred.h>
# include <linux/file.h>
# include <linux/fdtable.h>
2020-10-05 20:56:22 +03:00
# include <linux/fs_struct.h>
2017-09-09 02:17:00 +03:00
# include <linux/workqueue.h>
# include <linux/security.h>
# include <linux/mount.h>
# include <linux/kernel.h>
# include <linux/init.h>
# include <linux/resource.h>
# include <linux/notifier.h>
# include <linux/suspend.h>
# include <linux/rwsem.h>
# include <linux/ptrace.h>
# include <linux/async.h>
# include <linux/uaccess.h>
init/initramfs.c: do unpacking asynchronously
Patch series "background initramfs unpacking, and CONFIG_MODPROBE_PATH", v3.
These two patches are independent, but better-together.
The second is a rather trivial patch that simply allows the developer to
change "/sbin/modprobe" to something else - e.g. the empty string, so
that all request_module() during early boot return -ENOENT early, without
even spawning a usermode helper, needlessly synchronizing with the
initramfs unpacking.
The first patch delegates decompressing the initramfs to a worker thread,
allowing do_initcalls() in main.c to proceed to the device_ and late_
initcalls without waiting for that decompression (and populating of
rootfs) to finish. Obviously, some of those later calls may rely on the
initramfs being available, so I've added synchronization points in the
firmware loader and usermodehelper paths - there might be other places
that would need this, but so far no one has been able to think of any
places I have missed.
There's not much to win if most of the functionality needed during boot is
only available as modules. But systems with a custom-made .config and
initramfs can boot faster, partly due to utilizing more than one cpu
earlier, partly by avoiding known-futile modprobe calls (which would still
trigger synchronization with the initramfs unpacking, thus eliminating
most of the first benefit).
This patch (of 2):
Most of the boot process doesn't actually need anything from the
initramfs, until of course PID1 is to be executed. So instead of doing
the decompressing and populating of the initramfs synchronously in
populate_rootfs() itself, push that off to a worker thread.
This is primarily motivated by an embedded ppc target, where unpacking
even the rather modest sized initramfs takes 0.6 seconds, which is long
enough that the external watchdog becomes unhappy that it doesn't get
attention soon enough. By doing the initramfs decompression in a worker
thread, we get to do the device_initcalls and hence start petting the
watchdog much sooner.
Normal desktops might benefit as well. On my mostly stock Ubuntu kernel,
my initramfs is a 26M xz-compressed blob, decompressing to around 126M.
That takes almost two seconds:
[ 0.201454] Trying to unpack rootfs image as initramfs...
[ 1.976633] Freeing initrd memory: 29416K
Before this patch, these lines occur consecutively in dmesg. With this
patch, the timestamps on these two lines is roughly the same as above, but
with 172 lines inbetween - so more than one cpu has been kept busy doing
work that would otherwise only happen after the populate_rootfs()
finished.
Should one of the initcalls done after rootfs_initcall time (i.e., device_
and late_ initcalls) need something from the initramfs (say, a kernel
module or a firmware blob), it will simply wait for the initramfs
unpacking to be done before proceeding, which should in theory make this
completely safe.
But if some driver pokes around in the filesystem directly and not via one
of the official kernel interfaces (i.e. request_firmware*(),
call_usermodehelper*) that theory may not hold - also, I certainly might
have missed a spot when sprinkling wait_for_initramfs(). So there is an
escape hatch in the form of an initramfs_async= command line parameter.
Link: https://lkml.kernel.org/r/20210313212528.2956377-1-linux@rasmusvillemoes.dk
Link: https://lkml.kernel.org/r/20210313212528.2956377-2-linux@rasmusvillemoes.dk
Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Reviewed-by: Luis Chamberlain <mcgrof@kernel.org>
Cc: Jessica Yu <jeyu@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Nick Desaulniers <ndesaulniers@google.com>
Cc: Takashi Iwai <tiwai@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-05-07 04:05:42 +03:00
# include <linux/initrd.h>
2022-08-22 14:18:18 +03:00
# include <linux/freezer.h>
2017-09-09 02:17:00 +03:00
# include <trace/events/module.h>
/*
 * Capability mask intersected into every helper's cap_bset before it
 * execs (see call_usermodehelper_exec_async()).  Narrowed through the
 * "bset" entry of usermodehelper_table.
 */
static kernel_cap_t usermodehelper_bset = CAP_FULL_SET ;
/*
 * Same as above, but applied to the helper's cap_inheritable and exposed
 * through the "inheritable" entry of usermodehelper_table.
 */
static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET ;
/* Serialises readers and writers of the two capability masks above. */
static DEFINE_SPINLOCK ( umh_sysctl_lock ) ;
/*
 * Guards usermodehelper_disabled: taken for write when the value changes,
 * for read by the usermodehelper_read_*lock*() helpers.
 */
static DECLARE_RWSEM ( umhelper_sem ) ;
static void call_usermodehelper_freeinfo ( struct subprocess_info * info )
{
if ( info - > cleanup )
( * info - > cleanup ) ( info ) ;
kfree ( info ) ;
}
static void umh_complete ( struct subprocess_info * sub_info )
{
struct completion * comp = xchg ( & sub_info - > complete , NULL ) ;
/*
* See call_usermodehelper_exec ( ) . If xchg ( ) returns NULL
* we own sub_info , the UMH_KILLABLE caller has gone away
* or the caller used UMH_NO_WAIT .
*/
if ( comp )
complete ( comp ) ;
else
call_usermodehelper_freeinfo ( sub_info ) ;
}
/*
* This is the task which runs the usermode application
*/
static int call_usermodehelper_exec_async ( void * data )
{
struct subprocess_info * sub_info = data ;
struct cred * new ;
int retval ;
spin_lock_irq ( & current - > sighand - > siglock ) ;
flush_signal_handlers ( current , 1 ) ;
spin_unlock_irq ( & current - > sighand - > siglock ) ;
2020-10-05 20:56:22 +03:00
/*
* Initial kernel threads share ther FS with init , in order to
* get the init root directory . But we ' ve now created a new
* thread that is going to execve a user process and has its own
* ' struct fs_struct ' . Reset umask to the default .
*/
current - > fs - > umask = 0022 ;
2017-09-09 02:17:00 +03:00
/*
* Our parent ( unbound workqueue ) runs with elevated scheduling
* priority . Avoid propagating that into the userspace child .
*/
set_user_nice ( current , 0 ) ;
retval = - ENOMEM ;
new = prepare_kernel_cred ( current ) ;
if ( ! new )
goto out ;
spin_lock ( & umh_sysctl_lock ) ;
new - > cap_bset = cap_intersect ( usermodehelper_bset , new - > cap_bset ) ;
new - > cap_inheritable = cap_intersect ( usermodehelper_inheritable ,
new - > cap_inheritable ) ;
spin_unlock ( & umh_sysctl_lock ) ;
if ( sub_info - > init ) {
retval = sub_info - > init ( sub_info , new ) ;
if ( retval ) {
abort_creds ( new ) ;
goto out ;
}
}
commit_creds ( new ) ;
init/initramfs.c: do unpacking asynchronously
Patch series "background initramfs unpacking, and CONFIG_MODPROBE_PATH", v3.
These two patches are independent, but better-together.
The second is a rather trivial patch that simply allows the developer to
change "/sbin/modprobe" to something else - e.g. the empty string, so
that all request_module() during early boot return -ENOENT early, without
even spawning a usermode helper, needlessly synchronizing with the
initramfs unpacking.
The first patch delegates decompressing the initramfs to a worker thread,
allowing do_initcalls() in main.c to proceed to the device_ and late_
initcalls without waiting for that decompression (and populating of
rootfs) to finish. Obviously, some of those later calls may rely on the
initramfs being available, so I've added synchronization points in the
firmware loader and usermodehelper paths - there might be other places
that would need this, but so far no one has been able to think of any
places I have missed.
There's not much to win if most of the functionality needed during boot is
only available as modules. But systems with a custom-made .config and
initramfs can boot faster, partly due to utilizing more than one cpu
earlier, partly by avoiding known-futile modprobe calls (which would still
trigger synchronization with the initramfs unpacking, thus eliminating
most of the first benefit).
This patch (of 2):
Most of the boot process doesn't actually need anything from the
initramfs, until of course PID1 is to be executed. So instead of doing
the decompressing and populating of the initramfs synchronously in
populate_rootfs() itself, push that off to a worker thread.
This is primarily motivated by an embedded ppc target, where unpacking
even the rather modest sized initramfs takes 0.6 seconds, which is long
enough that the external watchdog becomes unhappy that it doesn't get
attention soon enough. By doing the initramfs decompression in a worker
thread, we get to do the device_initcalls and hence start petting the
watchdog much sooner.
Normal desktops might benefit as well. On my mostly stock Ubuntu kernel,
my initramfs is a 26M xz-compressed blob, decompressing to around 126M.
That takes almost two seconds:
[ 0.201454] Trying to unpack rootfs image as initramfs...
[ 1.976633] Freeing initrd memory: 29416K
Before this patch, these lines occur consecutively in dmesg. With this
patch, the timestamps on these two lines is roughly the same as above, but
with 172 lines inbetween - so more than one cpu has been kept busy doing
work that would otherwise only happen after the populate_rootfs()
finished.
Should one of the initcalls done after rootfs_initcall time (i.e., device_
and late_ initcalls) need something from the initramfs (say, a kernel
module or a firmware blob), it will simply wait for the initramfs
unpacking to be done before proceeding, which should in theory make this
completely safe.
But if some driver pokes around in the filesystem directly and not via one
of the official kernel interfaces (i.e. request_firmware*(),
call_usermodehelper*) that theory may not hold - also, I certainly might
have missed a spot when sprinkling wait_for_initramfs(). So there is an
escape hatch in the form of an initramfs_async= command line parameter.
Link: https://lkml.kernel.org/r/20210313212528.2956377-1-linux@rasmusvillemoes.dk
Link: https://lkml.kernel.org/r/20210313212528.2956377-2-linux@rasmusvillemoes.dk
Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Reviewed-by: Luis Chamberlain <mcgrof@kernel.org>
Cc: Jessica Yu <jeyu@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Nick Desaulniers <ndesaulniers@google.com>
Cc: Takashi Iwai <tiwai@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-05-07 04:05:42 +03:00
wait_for_initramfs ( ) ;
2020-07-13 20:06:48 +03:00
retval = kernel_execve ( sub_info - > path ,
( const char * const * ) sub_info - > argv ,
( const char * const * ) sub_info - > envp ) ;
2017-09-09 02:17:00 +03:00
out :
sub_info - > retval = retval ;
/*
* call_usermodehelper_exec_sync ( ) will call umh_complete
* if UHM_WAIT_PROC .
*/
if ( ! ( sub_info - > wait & UMH_WAIT_PROC ) )
umh_complete ( sub_info ) ;
if ( ! retval )
return 0 ;
do_exit ( 0 ) ;
}
/* Handles UMH_WAIT_PROC. */
static void call_usermodehelper_exec_sync ( struct subprocess_info * sub_info )
{
pid_t pid ;
2020-08-12 04:34:10 +03:00
/* If SIGCLD is ignored do_wait won't populate the status. */
2017-09-09 02:17:00 +03:00
kernel_sigaction ( SIGCHLD , SIG_DFL ) ;
2022-04-11 19:40:14 +03:00
pid = user_mode_thread ( call_usermodehelper_exec_async , sub_info , SIGCHLD ) ;
2020-08-12 04:34:10 +03:00
if ( pid < 0 )
2017-09-09 02:17:00 +03:00
sub_info - > retval = pid ;
2020-08-12 04:34:10 +03:00
else
kernel_wait ( pid , & sub_info - > retval ) ;
2017-09-09 02:17:00 +03:00
/* Restore default kernel sig handler */
kernel_sigaction ( SIGCHLD , SIG_IGN ) ;
umh_complete ( sub_info ) ;
}
/*
* We need to create the usermodehelper kernel thread from a task that is affine
* to an optimized set of CPUs ( or nohz housekeeping ones ) such that they
* inherit a widest affinity irrespective of call_usermodehelper ( ) callers with
* possibly reduced affinity ( eg : per - cpu workqueues ) . We don ' t want
* usermodehelper targets to contend a busy CPU .
*
* Unbound workqueues provide such wide affinity and allow to block on
* UMH_WAIT_PROC requests without blocking pending request ( up to some limit ) .
*
* Besides , workqueues provide the privilege level that caller might not have
* to perform the usermodehelper request .
*
*/
static void call_usermodehelper_exec_work ( struct work_struct * work )
{
struct subprocess_info * sub_info =
container_of ( work , struct subprocess_info , work ) ;
if ( sub_info - > wait & UMH_WAIT_PROC ) {
call_usermodehelper_exec_sync ( sub_info ) ;
} else {
pid_t pid ;
/*
* Use CLONE_PARENT to reparent it to kthreadd ; we do not
* want to pollute current - > children , and we need a parent
* that always ignores SIGCHLD to ensure auto - reaping .
*/
2022-04-11 19:40:14 +03:00
pid = user_mode_thread ( call_usermodehelper_exec_async , sub_info ,
CLONE_PARENT | SIGCHLD ) ;
2017-09-09 02:17:00 +03:00
if ( pid < 0 ) {
sub_info - > retval = pid ;
umh_complete ( sub_info ) ;
}
}
}
/*
 * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY
 * (used for preventing user land processes from being created after the user
 * land has been frozen during a system-wide hibernation or suspend operation).
 * Should always be manipulated under umhelper_sem acquired for write.
 */
static enum umh_disable_depth usermodehelper_disabled = UMH_DISABLED ;
/*
 * Number of helpers running: incremented by helper_lock() and decremented
 * by helper_unlock().
 */
static atomic_t running_helpers = ATOMIC_INIT ( 0 ) ;
/*
 * Wait queue head used by __usermodehelper_disable() to wait for all running
 * helpers to finish; woken by helper_unlock() when running_helpers drops to
 * zero.
 */
static DECLARE_WAIT_QUEUE_HEAD ( running_helpers_waitq ) ;
/*
 * Used by usermodehelper_read_trylock() and usermodehelper_read_lock_wait()
 * to wait for usermodehelper_disabled to become 'false'; woken by
 * __usermodehelper_set_disable_depth().
 */
static DECLARE_WAIT_QUEUE_HEAD ( usermodehelper_disabled_waitq ) ;
/*
 * Time to wait for running_helpers to become zero before the setting of
 * usermodehelper_disabled in __usermodehelper_disable() fails (that function
 * then re-enables helpers and returns -EAGAIN).
 */
# define RUNNING_HELPERS_TIMEOUT (5 * HZ)
/*
 * Take umhelper_sem for reading once usermode helpers are enabled.
 *
 * Returns 0 with umhelper_sem held for reading when helpers are enabled.
 * Returns -EAGAIN, without the semaphore, when the state is UMH_DISABLED.
 * For any other non-zero state the task sleeps interruptibly, gives the
 * freezer a chance, and retries.
 */
int usermodehelper_read_trylock(void)
{
	DEFINE_WAIT(wait);
	int ret = 0;

	down_read(&umhelper_sem);
	while (1) {
		prepare_to_wait(&usermodehelper_disabled_waitq, &wait,
				TASK_INTERRUPTIBLE);

		/* Enabled: leave the loop still holding the read lock. */
		if (!usermodehelper_disabled)
			break;

		if (usermodehelper_disabled == UMH_DISABLED) {
			/* Fully disabled: give up instead of sleeping. */
			up_read(&umhelper_sem);
			ret = -EAGAIN;
			break;
		}

		up_read(&umhelper_sem);

		schedule();
		try_to_freeze();

		down_read(&umhelper_sem);
	}
	finish_wait(&usermodehelper_disabled_waitq, &wait);

	return ret;
}
EXPORT_SYMBOL_GPL(usermodehelper_read_trylock);
/*
 * Wait, uninterruptibly and for at most @timeout jiffies, until usermode
 * helpers are enabled, then keep umhelper_sem held for reading.
 *
 * Returns -EINVAL for a negative @timeout.  Otherwise returns the time left
 * when the loop ended.  The semaphore is held for reading only if the loop
 * ended because helpers were found enabled; on the timed-out path (return
 * value 0 from schedule_timeout()) it has already been released.
 */
long usermodehelper_read_lock_wait(long timeout)
{
	DEFINE_WAIT(wait);

	if (timeout < 0)
		return -EINVAL;

	down_read(&umhelper_sem);
	while (1) {
		prepare_to_wait(&usermodehelper_disabled_waitq, &wait,
				TASK_UNINTERRUPTIBLE);

		if (!usermodehelper_disabled)
			break;

		up_read(&umhelper_sem);

		timeout = schedule_timeout(timeout);
		if (timeout == 0)
			break;

		down_read(&umhelper_sem);
	}
	finish_wait(&usermodehelper_disabled_waitq, &wait);

	return timeout;
}
EXPORT_SYMBOL_GPL(usermodehelper_read_lock_wait);
/*
 * Drop the read hold on umhelper_sem obtained through
 * usermodehelper_read_trylock() or usermodehelper_read_lock_wait().
 */
void usermodehelper_read_unlock(void)
{
	up_read(&umhelper_sem);
}
EXPORT_SYMBOL_GPL(usermodehelper_read_unlock);
/**
 * __usermodehelper_set_disable_depth - Modify usermodehelper_disabled.
 * @depth: New value to assign to usermodehelper_disabled.
 *
 * Change the value of usermodehelper_disabled (under umhelper_sem locked for
 * writing) and wakeup tasks waiting for it to change.
 */
void __usermodehelper_set_disable_depth(enum umh_disable_depth depth)
{
	down_write(&umhelper_sem);

	usermodehelper_disabled = depth;
	/* Let sleepers on usermodehelper_disabled_waitq re-check the state. */
	wake_up(&usermodehelper_disabled_waitq);

	up_write(&umhelper_sem);
}
/**
 * __usermodehelper_disable - Prevent new helpers from being started.
 * @depth: New value to assign to usermodehelper_disabled.
 *
 * Set usermodehelper_disabled to @depth and wait for running helpers to exit.
 *
 * Returns 0 once no helper is running, -EINVAL if @depth is zero, or
 * -EAGAIN (after switching back to UMH_ENABLED) if helpers are still running
 * when RUNNING_HELPERS_TIMEOUT expires.
 */
int __usermodehelper_disable(enum umh_disable_depth depth)
{
	long remaining;

	if (!depth)
		return -EINVAL;

	down_write(&umhelper_sem);
	usermodehelper_disabled = depth;
	up_write(&umhelper_sem);

	/*
	 * From now on call_usermodehelper_exec() won't start any new
	 * helpers, so it is sufficient if running_helpers turns out to
	 * be zero at one point (it may be increased later, but that
	 * doesn't matter).
	 */
	remaining = wait_event_timeout(running_helpers_waitq,
				       atomic_read(&running_helpers) == 0,
				       RUNNING_HELPERS_TIMEOUT);
	if (remaining == 0) {
		/* Timed out with helpers still running: undo the disable. */
		__usermodehelper_set_disable_depth(UMH_ENABLED);
		return -EAGAIN;
	}

	return 0;
}
/*
 * Account for one more helper request in flight.  The barrier orders the
 * increment before whatever the caller does next.
 */
static void helper_lock(void)
{
	atomic_inc(&running_helpers);
	smp_mb__after_atomic();
}
/*
 * Account for one helper request having finished; when the count reaches
 * zero, wake anybody sleeping on running_helpers_waitq.
 */
static void helper_unlock(void)
{
	if (!atomic_dec_and_test(&running_helpers))
		return;

	wake_up(&running_helpers_waitq);
}
/**
 * call_usermodehelper_setup - prepare to call a usermode helper
 * @path: path to usermode executable
 * @argv: arg vector for process
 * @envp: environment for process
 * @gfp_mask: gfp mask for memory allocation
 * @init: an init function
 * @cleanup: a cleanup function
 * @data: arbitrary context sensitive data
 *
 * Returns either %NULL on allocation failure, or a subprocess_info
 * structure.  This should be passed to call_usermodehelper_exec to
 * exec the process and free the structure.
 *
 * The init function is used to customize the helper process prior to
 * exec.  A non-zero return code causes the process to error out, exit,
 * and return the failure to the calling process
 *
 * The cleanup function is called just before the subprocess_info is about
 * to be freed.  This can be used for freeing the argv and envp.  The
 * Function must be runnable in either a process context or the
 * context in which call_usermodehelper_exec is called.
 *
 * When CONFIG_STATIC_USERMODEHELPER is defined, @path is ignored and
 * CONFIG_STATIC_USERMODEHELPER_PATH is used instead.
 */
struct subprocess_info *call_usermodehelper_setup(const char *path, char **argv,
		char **envp, gfp_t gfp_mask,
		int (*init)(struct subprocess_info *info, struct cred *new),
		void (*cleanup)(struct subprocess_info *info),
		void *data)
{
	struct subprocess_info *sub_info;

	sub_info = kzalloc(sizeof(*sub_info), gfp_mask);
	if (sub_info == NULL)
		return NULL;

	INIT_WORK(&sub_info->work, call_usermodehelper_exec_work);

#ifdef CONFIG_STATIC_USERMODEHELPER
	sub_info->path = CONFIG_STATIC_USERMODEHELPER_PATH;
#else
	sub_info->path = path;
#endif
	sub_info->argv = argv;
	sub_info->envp = envp;

	sub_info->init = init;
	sub_info->cleanup = cleanup;
	sub_info->data = data;

	return sub_info;
}
EXPORT_SYMBOL(call_usermodehelper_setup);
/**
* call_usermodehelper_exec - start a usermode application
2021-05-07 04:06:27 +03:00
* @ sub_info : information about the subprocess
2017-09-09 02:17:00 +03:00
* @ wait : wait for the application to finish and return status .
* when UMH_NO_WAIT don ' t wait at all , but you get no useful error back
* when the program couldn ' t be exec ' ed . This makes it safe to call
* from interrupt context .
*
* Runs a user - space application . The application is started
* asynchronously if wait is not set , and runs as a child of system workqueues .
* ( ie . it runs with full root capabilities and optimized affinity ) .
2020-04-16 19:28:59 +03:00
*
* Note : successful return value does not guarantee the helper was called at
* all . You can ' t rely on sub_info - > { init , cleanup } being called even for
* UMH_WAIT_ * wait modes as STATIC_USERMODEHELPER_PATH = " " turns all helpers
* into a successful no - op .
2017-09-09 02:17:00 +03:00
*/
int call_usermodehelper_exec ( struct subprocess_info * sub_info , int wait )
{
freezer,sched: Rewrite core freezer logic
Rewrite the core freezer to behave better wrt thawing and be simpler
in general.
By replacing PF_FROZEN with TASK_FROZEN, a special block state, it is
ensured frozen tasks stay frozen until thawed and don't randomly wake
up early, as is currently possible.
As such, it does away with PF_FROZEN and PF_FREEZER_SKIP, freeing up
two PF_flags (yay!).
Specifically; the current scheme works a little like:
freezer_do_not_count();
schedule();
freezer_count();
And either the task is blocked, or it lands in try_to_freezer()
through freezer_count(). Now, when it is blocked, the freezer
considers it frozen and continues.
However, on thawing, once pm_freezing is cleared, freezer_count()
stops working, and any random/spurious wakeup will let a task run
before its time.
That is, thawing tries to thaw things in explicit order; kernel
threads and workqueues before doing bringing SMP back before userspace
etc.. However due to the above mentioned races it is entirely possible
for userspace tasks to thaw (by accident) before SMP is back.
This can be a fatal problem in asymmetric ISA architectures (eg ARMv9)
where the userspace task requires a special CPU to run.
As said; replace this with a special task state TASK_FROZEN and add
the following state transitions:
TASK_FREEZABLE -> TASK_FROZEN
__TASK_STOPPED -> TASK_FROZEN
__TASK_TRACED -> TASK_FROZEN
The new TASK_FREEZABLE can be set on any state part of TASK_NORMAL
(IOW. TASK_INTERRUPTIBLE and TASK_UNINTERRUPTIBLE) -- any such state
is already required to deal with spurious wakeups and the freezer
causes one such when thawing the task (since the original state is
lost).
The special __TASK_{STOPPED,TRACED} states *can* be restored since
their canonical state is in ->jobctl.
With this, frozen tasks need an explicit TASK_FROZEN wakeup and are
free of undue (early / spurious) wakeups.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Link: https://lore.kernel.org/r/20220822114649.055452969@infradead.org
2022-08-22 14:18:22 +03:00
unsigned int state = TASK_UNINTERRUPTIBLE ;
2017-09-09 02:17:00 +03:00
DECLARE_COMPLETION_ONSTACK ( done ) ;
int retval = 0 ;
if ( ! sub_info - > path ) {
call_usermodehelper_freeinfo ( sub_info ) ;
return - EINVAL ;
}
helper_lock ( ) ;
if ( usermodehelper_disabled ) {
retval = - EBUSY ;
goto out ;
}
/*
* If there is no binary for us to call , then just return and get out of
* here . This allows us to set STATIC_USERMODEHELPER_PATH to " " and
* disable all call_usermodehelper ( ) calls .
*/
if ( strlen ( sub_info - > path ) = = 0 )
goto out ;
/*
* Set the completion pointer only if there is a waiter .
* This makes it possible to use umh_complete to free
* the data structure in case of UMH_NO_WAIT .
*/
sub_info - > complete = ( wait = = UMH_NO_WAIT ) ? NULL : & done ;
sub_info - > wait = wait ;
queue_work ( system_unbound_wq , & sub_info - > work ) ;
if ( wait = = UMH_NO_WAIT ) /* task has freed sub_info */
goto unlock ;
2022-08-22 14:18:18 +03:00
if ( wait & UMH_FREEZABLE )
freezer,sched: Rewrite core freezer logic
Rewrite the core freezer to behave better wrt thawing and be simpler
in general.
By replacing PF_FROZEN with TASK_FROZEN, a special block state, it is
ensured frozen tasks stay frozen until thawed and don't randomly wake
up early, as is currently possible.
As such, it does away with PF_FROZEN and PF_FREEZER_SKIP, freeing up
two PF_flags (yay!).
Specifically; the current scheme works a little like:
freezer_do_not_count();
schedule();
freezer_count();
And either the task is blocked, or it lands in try_to_freezer()
through freezer_count(). Now, when it is blocked, the freezer
considers it frozen and continues.
However, on thawing, once pm_freezing is cleared, freezer_count()
stops working, and any random/spurious wakeup will let a task run
before its time.
That is, thawing tries to thaw things in explicit order; kernel
threads and workqueues before doing bringing SMP back before userspace
etc.. However due to the above mentioned races it is entirely possible
for userspace tasks to thaw (by accident) before SMP is back.
This can be a fatal problem in asymmetric ISA architectures (eg ARMv9)
where the userspace task requires a special CPU to run.
As said; replace this with a special task state TASK_FROZEN and add
the following state transitions:
TASK_FREEZABLE -> TASK_FROZEN
__TASK_STOPPED -> TASK_FROZEN
__TASK_TRACED -> TASK_FROZEN
The new TASK_FREEZABLE can be set on any state part of TASK_NORMAL
(IOW. TASK_INTERRUPTIBLE and TASK_UNINTERRUPTIBLE) -- any such state
is already required to deal with spurious wakeups and the freezer
causes one such when thawing the task (since the original state is
lost).
The special __TASK_{STOPPED,TRACED} states *can* be restored since
their canonical state is in ->jobctl.
With this, frozen tasks need an explicit TASK_FROZEN wakeup and are
free of undue (early / spurious) wakeups.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Link: https://lore.kernel.org/r/20220822114649.055452969@infradead.org
2022-08-22 14:18:22 +03:00
state | = TASK_FREEZABLE ;
2022-08-22 14:18:18 +03:00
freezer,sched: Rewrite core freezer logic
Rewrite the core freezer to behave better wrt thawing and be simpler
in general.
By replacing PF_FROZEN with TASK_FROZEN, a special block state, it is
ensured frozen tasks stay frozen until thawed and don't randomly wake
up early, as is currently possible.
As such, it does away with PF_FROZEN and PF_FREEZER_SKIP, freeing up
two PF_flags (yay!).
Specifically; the current scheme works a little like:
freezer_do_not_count();
schedule();
freezer_count();
And either the task is blocked, or it lands in try_to_freezer()
through freezer_count(). Now, when it is blocked, the freezer
considers it frozen and continues.
However, on thawing, once pm_freezing is cleared, freezer_count()
stops working, and any random/spurious wakeup will let a task run
before its time.
That is, thawing tries to thaw things in explicit order; kernel
threads and workqueues before doing bringing SMP back before userspace
etc.. However due to the above mentioned races it is entirely possible
for userspace tasks to thaw (by accident) before SMP is back.
This can be a fatal problem in asymmetric ISA architectures (eg ARMv9)
where the userspace task requires a special CPU to run.
As said; replace this with a special task state TASK_FROZEN and add
the following state transitions:
TASK_FREEZABLE -> TASK_FROZEN
__TASK_STOPPED -> TASK_FROZEN
__TASK_TRACED -> TASK_FROZEN
The new TASK_FREEZABLE can be set on any state part of TASK_NORMAL
(IOW. TASK_INTERRUPTIBLE and TASK_UNINTERRUPTIBLE) -- any such state
is already required to deal with spurious wakeups and the freezer
causes one such when thawing the task (since the original state is
lost).
The special __TASK_{STOPPED,TRACED} states *can* be restored since
their canonical state is in ->jobctl.
With this, frozen tasks need an explicit TASK_FROZEN wakeup and are
free of undue (early / spurious) wakeups.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Link: https://lore.kernel.org/r/20220822114649.055452969@infradead.org
2022-08-22 14:18:22 +03:00
if ( wait & UMH_KILLABLE ) {
2023-02-03 17:31:11 +03:00
retval = wait_for_completion_state ( & done , state | TASK_KILLABLE ) ;
if ( ! retval )
goto wait_done ;
2017-09-09 02:17:00 +03:00
/* umh_complete() will see NULL and free sub_info */
if ( xchg ( & sub_info - > complete , NULL ) )
goto unlock ;
2023-02-03 17:31:11 +03:00
/*
* fallthrough ; in case of - ERESTARTSYS now do uninterruptible
* wait_for_completion_state ( ) . Since umh_complete ( ) shall call
* complete ( ) in a moment if xchg ( ) above returned NULL , this
* uninterruptible wait_for_completion_state ( ) will not block
* SIGKILL ' ed processes for long .
*/
2017-09-09 02:17:00 +03:00
}
2023-02-03 17:31:11 +03:00
wait_for_completion_state ( & done , state ) ;
2017-09-09 02:17:00 +03:00
wait_done :
retval = sub_info - > retval ;
out :
call_usermodehelper_freeinfo ( sub_info ) ;
unlock :
helper_unlock ( ) ;
return retval ;
}
EXPORT_SYMBOL ( call_usermodehelper_exec ) ;
/**
 * call_usermodehelper() - prepare and start a usermode application
 * @path: path to usermode executable
 * @argv: arg vector for process
 * @envp: environment for process
 * @wait: wait for the application to finish and return status.
 *        when UMH_NO_WAIT don't wait at all, but you get no useful error back
 *        when the program couldn't be exec'ed. This makes it safe to call
 *        from interrupt context.
 *
 * This function is the equivalent to use call_usermodehelper_setup() and
 * call_usermodehelper_exec().
 *
 * Returns -ENOMEM if the subprocess_info cannot be allocated, otherwise the
 * result of call_usermodehelper_exec().
 */
int call_usermodehelper(const char *path, char **argv, char **envp, int wait)
{
	struct subprocess_info *info;
	gfp_t gfp_mask;

	/* UMH_NO_WAIT requests get an atomic allocation. */
	if (wait == UMH_NO_WAIT)
		gfp_mask = GFP_ATOMIC;
	else
		gfp_mask = GFP_KERNEL;

	info = call_usermodehelper_setup(path, argv, envp, gfp_mask,
					 NULL, NULL, NULL);
	if (!info)
		return -ENOMEM;

	return call_usermodehelper_exec(info, wait);
}
EXPORT_SYMBOL(call_usermodehelper);
/*
 * sysctl handler shared by the capability-mask entries of
 * usermodehelper_table.  table->data points at the kernel_cap_t to expose.
 *
 * Reads report the mask as two unsigned longs.  Writes require both
 * CAP_SETPCAP and CAP_SYS_MODULE and can only clear bits, never set them.
 */
static int proc_cap_handler(struct ctl_table *table, int write,
			    void *buffer, size_t *lenp, loff_t *ppos)
{
	struct ctl_table shadow;
	unsigned long cap_array[2];
	kernel_cap_t new_cap, *cap;
	int err;

	if (write && !(capable(CAP_SETPCAP) && capable(CAP_SYS_MODULE)))
		return -EPERM;

	/*
	 * convert from the global kernel_cap_t to the ulong array to print to
	 * userspace if this is a read.
	 *
	 * Legacy format: capabilities are exposed as two 32-bit values
	 */
	cap = table->data;
	spin_lock(&umh_sysctl_lock);
	cap_array[0] = (u32)cap->val;
	cap_array[1] = cap->val >> 32;
	spin_unlock(&umh_sysctl_lock);

	/* Work on a copy of the table entry that points at the local array. */
	shadow = *table;
	shadow.data = &cap_array;

	/*
	 * actually read or write an array of ulongs from userspace.  Remember
	 * these are least significant 32 bits first
	 */
	err = proc_doulongvec_minmax(&shadow, write, buffer, lenp, ppos);
	if (err < 0)
		return err;

	/* Reassemble the 64-bit mask from its two halves. */
	new_cap.val = (u64)(u32)cap_array[0] + ((u64)cap_array[1] << 32);

	if (!write)
		return 0;

	/*
	 * Drop everything not in the new_cap (but don't add things)
	 */
	spin_lock(&umh_sysctl_lock);
	*cap = cap_intersect(*cap, new_cap);
	spin_unlock(&umh_sysctl_lock);

	return 0;
}
/*
 * sysctl entries exposing the two usermode-helper capability masks.  Both
 * entries are root-only (mode 0600) and are served by proc_cap_handler().
 */
struct ctl_table usermodehelper_table [ ] = {
{
/* Mask intersected into each helper's cap_bset. */
. procname = " bset " ,
2023-03-03 02:49:44 +03:00
. data = & usermodehelper_bset ,
2023-02-28 22:39:09 +03:00
/* Two unsigned longs, matching cap_array[2] in proc_cap_handler(). */
. maxlen = 2 * sizeof ( unsigned long ) ,
2017-09-09 02:17:00 +03:00
. mode = 0600 ,
. proc_handler = proc_cap_handler ,
} ,
{
/* Mask intersected into each helper's cap_inheritable. */
. procname = " inheritable " ,
2023-03-03 02:49:44 +03:00
. data = & usermodehelper_inheritable ,
2023-02-28 22:39:09 +03:00
/* Two unsigned longs, matching cap_array[2] in proc_cap_handler(). */
. maxlen = 2 * sizeof ( unsigned long ) ,
2017-09-09 02:17:00 +03:00
. mode = 0600 ,
. proc_handler = proc_cap_handler ,
} ,
/* Empty entry; presumably the table terminator - confirm against the sysctl registration code. */
{ }
} ;