c32b3cbe0d
Commit 5695be142e20 ("OOM, PM: OOM killed task shouldn't escape PM suspend") has left a race window when OOM killer manages to note_oom_kill after freeze_processes checks the counter. The race window is quite small and really unlikely and partial solution deemed sufficient at the time of submission. Tejun wasn't happy about this partial solution though and insisted on a full solution. That requires the full OOM and freezer's task freezing exclusion, though. This is done by this patch which introduces oom_sem RW lock and turns oom_killer_disable() into a full OOM barrier. oom_killer_disabled check is moved from the allocation path to the OOM level and we take oom_sem for reading for both the check and the whole OOM invocation. oom_killer_disable() takes oom_sem for writing so it waits for all currently running OOM killer invocations. Then it disable all the further OOMs by setting oom_killer_disabled and checks for any oom victims. Victims are counted via mark_tsk_oom_victim resp. unmark_oom_victim. The last victim wakes up all waiters enqueued by oom_killer_disable(). Therefore this function acts as the full OOM barrier. The page fault path is covered now as well although it was assumed to be safe before. As per Tejun, "We used to have freezing points deep in file system code which may be reacheable from page fault." so it would be better and more robust to not rely on freezing points here. Same applies to the memcg OOM killer. out_of_memory tells the caller whether the OOM was allowed to trigger and the callers are supposed to handle the situation. The page allocation path simply fails the allocation same as before. The page fault path will retry the fault (more on that later) and Sysrq OOM trigger will simply complain to the log. Normally there wouldn't be any unfrozen user tasks after try_to_freeze_tasks so the function will not block. But if there was an OOM killer racing with try_to_freeze_tasks and the OOM victim didn't finish yet then we have to wait for it. This should complete in a finite time, though, because - the victim cannot loop in the page fault handler (it would die on the way out from the exception) - it cannot loop in the page allocator because all the further allocation would fail and __GFP_NOFAIL allocations are not acceptable at this stage - it shouldn't be blocked on any locks held by frozen tasks (try_to_freeze expects lockless context) and kernel threads and work queues are not frozen yet Signed-off-by: Michal Hocko <mhocko@suse.cz> Suggested-by: Tejun Heo <tj@kernel.org> Cc: David Rientjes <rientjes@google.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Cong Wang <xiyou.wangcong@gmail.com> Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
238 lines
5.3 KiB
C
238 lines
5.3 KiB
C
/*
|
|
* drivers/power/process.c - Functions for starting/stopping processes on
|
|
* suspend transitions.
|
|
*
|
|
* Originally from swsusp.
|
|
*/
|
|
|
|
|
|
#undef DEBUG
|
|
|
|
#include <linux/interrupt.h>
|
|
#include <linux/oom.h>
|
|
#include <linux/suspend.h>
|
|
#include <linux/module.h>
|
|
#include <linux/syscalls.h>
|
|
#include <linux/freezer.h>
|
|
#include <linux/delay.h>
|
|
#include <linux/workqueue.h>
|
|
#include <linux/kmod.h>
|
|
#include <trace/events/power.h>
|
|
|
|
/*
|
|
* Timeout for stopping processes
|
|
*/
|
|
unsigned int __read_mostly freeze_timeout_msecs = 20 * MSEC_PER_SEC;
|
|
|
|
static int try_to_freeze_tasks(bool user_only)
|
|
{
|
|
struct task_struct *g, *p;
|
|
unsigned long end_time;
|
|
unsigned int todo;
|
|
bool wq_busy = false;
|
|
struct timeval start, end;
|
|
u64 elapsed_msecs64;
|
|
unsigned int elapsed_msecs;
|
|
bool wakeup = false;
|
|
int sleep_usecs = USEC_PER_MSEC;
|
|
|
|
do_gettimeofday(&start);
|
|
|
|
end_time = jiffies + msecs_to_jiffies(freeze_timeout_msecs);
|
|
|
|
if (!user_only)
|
|
freeze_workqueues_begin();
|
|
|
|
while (true) {
|
|
todo = 0;
|
|
read_lock(&tasklist_lock);
|
|
for_each_process_thread(g, p) {
|
|
if (p == current || !freeze_task(p))
|
|
continue;
|
|
|
|
if (!freezer_should_skip(p))
|
|
todo++;
|
|
}
|
|
read_unlock(&tasklist_lock);
|
|
|
|
if (!user_only) {
|
|
wq_busy = freeze_workqueues_busy();
|
|
todo += wq_busy;
|
|
}
|
|
|
|
if (!todo || time_after(jiffies, end_time))
|
|
break;
|
|
|
|
if (pm_wakeup_pending()) {
|
|
wakeup = true;
|
|
break;
|
|
}
|
|
|
|
/*
|
|
* We need to retry, but first give the freezing tasks some
|
|
* time to enter the refrigerator. Start with an initial
|
|
* 1 ms sleep followed by exponential backoff until 8 ms.
|
|
*/
|
|
usleep_range(sleep_usecs / 2, sleep_usecs);
|
|
if (sleep_usecs < 8 * USEC_PER_MSEC)
|
|
sleep_usecs *= 2;
|
|
}
|
|
|
|
do_gettimeofday(&end);
|
|
elapsed_msecs64 = timeval_to_ns(&end) - timeval_to_ns(&start);
|
|
do_div(elapsed_msecs64, NSEC_PER_MSEC);
|
|
elapsed_msecs = elapsed_msecs64;
|
|
|
|
if (todo) {
|
|
pr_cont("\n");
|
|
pr_err("Freezing of tasks %s after %d.%03d seconds "
|
|
"(%d tasks refusing to freeze, wq_busy=%d):\n",
|
|
wakeup ? "aborted" : "failed",
|
|
elapsed_msecs / 1000, elapsed_msecs % 1000,
|
|
todo - wq_busy, wq_busy);
|
|
|
|
if (!wakeup) {
|
|
read_lock(&tasklist_lock);
|
|
for_each_process_thread(g, p) {
|
|
if (p != current && !freezer_should_skip(p)
|
|
&& freezing(p) && !frozen(p))
|
|
sched_show_task(p);
|
|
}
|
|
read_unlock(&tasklist_lock);
|
|
}
|
|
} else {
|
|
pr_cont("(elapsed %d.%03d seconds) ", elapsed_msecs / 1000,
|
|
elapsed_msecs % 1000);
|
|
}
|
|
|
|
return todo ? -EBUSY : 0;
|
|
}
|
|
|
|
/**
|
|
* freeze_processes - Signal user space processes to enter the refrigerator.
|
|
* The current thread will not be frozen. The same process that calls
|
|
* freeze_processes must later call thaw_processes.
|
|
*
|
|
* On success, returns 0. On failure, -errno and system is fully thawed.
|
|
*/
|
|
int freeze_processes(void)
|
|
{
|
|
int error;
|
|
|
|
error = __usermodehelper_disable(UMH_FREEZING);
|
|
if (error)
|
|
return error;
|
|
|
|
/* Make sure this task doesn't get frozen */
|
|
current->flags |= PF_SUSPEND_TASK;
|
|
|
|
if (!pm_freezing)
|
|
atomic_inc(&system_freezing_cnt);
|
|
|
|
pm_wakeup_clear();
|
|
pr_info("Freezing user space processes ... ");
|
|
pm_freezing = true;
|
|
error = try_to_freeze_tasks(true);
|
|
if (!error) {
|
|
__usermodehelper_set_disable_depth(UMH_DISABLED);
|
|
pr_cont("done.");
|
|
}
|
|
pr_cont("\n");
|
|
BUG_ON(in_atomic());
|
|
|
|
/*
|
|
* Now that the whole userspace is frozen we need to disbale
|
|
* the OOM killer to disallow any further interference with
|
|
* killable tasks.
|
|
*/
|
|
if (!error && !oom_killer_disable())
|
|
error = -EBUSY;
|
|
|
|
if (error)
|
|
thaw_processes();
|
|
return error;
|
|
}
|
|
|
|
/**
|
|
* freeze_kernel_threads - Make freezable kernel threads go to the refrigerator.
|
|
*
|
|
* On success, returns 0. On failure, -errno and only the kernel threads are
|
|
* thawed, so as to give a chance to the caller to do additional cleanups
|
|
* (if any) before thawing the userspace tasks. So, it is the responsibility
|
|
* of the caller to thaw the userspace tasks, when the time is right.
|
|
*/
|
|
int freeze_kernel_threads(void)
|
|
{
|
|
int error;
|
|
|
|
pr_info("Freezing remaining freezable tasks ... ");
|
|
|
|
pm_nosig_freezing = true;
|
|
error = try_to_freeze_tasks(false);
|
|
if (!error)
|
|
pr_cont("done.");
|
|
|
|
pr_cont("\n");
|
|
BUG_ON(in_atomic());
|
|
|
|
if (error)
|
|
thaw_kernel_threads();
|
|
return error;
|
|
}
|
|
|
|
void thaw_processes(void)
|
|
{
|
|
struct task_struct *g, *p;
|
|
struct task_struct *curr = current;
|
|
|
|
trace_suspend_resume(TPS("thaw_processes"), 0, true);
|
|
if (pm_freezing)
|
|
atomic_dec(&system_freezing_cnt);
|
|
pm_freezing = false;
|
|
pm_nosig_freezing = false;
|
|
|
|
oom_killer_enable();
|
|
|
|
pr_info("Restarting tasks ... ");
|
|
|
|
__usermodehelper_set_disable_depth(UMH_FREEZING);
|
|
thaw_workqueues();
|
|
|
|
read_lock(&tasklist_lock);
|
|
for_each_process_thread(g, p) {
|
|
/* No other threads should have PF_SUSPEND_TASK set */
|
|
WARN_ON((p != curr) && (p->flags & PF_SUSPEND_TASK));
|
|
__thaw_task(p);
|
|
}
|
|
read_unlock(&tasklist_lock);
|
|
|
|
WARN_ON(!(curr->flags & PF_SUSPEND_TASK));
|
|
curr->flags &= ~PF_SUSPEND_TASK;
|
|
|
|
usermodehelper_enable();
|
|
|
|
schedule();
|
|
pr_cont("done.\n");
|
|
trace_suspend_resume(TPS("thaw_processes"), 0, false);
|
|
}
|
|
|
|
void thaw_kernel_threads(void)
|
|
{
|
|
struct task_struct *g, *p;
|
|
|
|
pm_nosig_freezing = false;
|
|
pr_info("Restarting kernel threads ... ");
|
|
|
|
thaw_workqueues();
|
|
|
|
read_lock(&tasklist_lock);
|
|
for_each_process_thread(g, p) {
|
|
if (p->flags & (PF_KTHREAD | PF_WQ_WORKER))
|
|
__thaw_task(p);
|
|
}
|
|
read_unlock(&tasklist_lock);
|
|
|
|
schedule();
|
|
pr_cont("done.\n");
|
|
}
|