67197a4f28
Currently __set_oom_adj loops through all processes in the system to keep oom_score_adj and oom_score_adj_min in sync between processes sharing their mm. This is done for any task with more than one mm_users, which includes processes with multiple threads (sharing mm and signals). However, for such processes the loop is unnecessary because their signal structure is shared as well. Android updates oom_score_adj whenever a task changes its role (background/foreground/...) or binds to/unbinds from a service, making it more/less important. Such operations can happen frequently. We noticed that updates to oom_score_adj became more expensive and after further investigation found out that the patch mentioned in "Fixes" introduced a regression. Using Pixel 4 with a typical Android workload, write time to oom_score_adj increased from ~3.57us to ~362us. Moreover, this regression depends linearly on the number of multi-threaded processes running on the system. Mark the mm with a new MMF_MULTIPROCESS flag bit when a task is created with (CLONE_VM && !CLONE_THREAD && !CLONE_VFORK). Change __set_oom_adj to use MMF_MULTIPROCESS instead of mm_users to decide whether an oom_score_adj update should be synchronized between multiple processes. To prevent races between clone() and __set_oom_adj(), when oom_score_adj of the process being cloned might be modified from userspace, we use oom_adj_mutex. Its scope is changed to global. The combination of (CLONE_VM && !CLONE_THREAD) is rarely used except for the case of vfork(). To prevent performance regressions of vfork(), we skip taking oom_adj_mutex and setting MMF_MULTIPROCESS when CLONE_VFORK is specified. Clearing the MMF_MULTIPROCESS flag (when the last process sharing the mm exits) is left out of this patch to keep it simple and because it is believed that this threading model is rare.
Should there ever be a need for optimizing that case as well, it can be done by hooking into the exit path, likely following the mm_update_next_owner pattern. With the combination of (CLONE_VM && !CLONE_THREAD && !CLONE_VFORK) being quite rare, the regression is gone after the change is applied. [surenb@google.com: v3] Link: https://lkml.kernel.org/r/20200902012558.2335613-1-surenb@google.com Fixes: 44a70adec910 ("mm, oom_adj: make sure processes sharing mm have same view of oom_score_adj") Reported-by: Tim Murray <timmurray@google.com> Suggested-by: Michal Hocko <mhocko@kernel.org> Signed-off-by: Suren Baghdasaryan <surenb@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Acked-by: Christian Brauner <christian.brauner@ubuntu.com> Acked-by: Michal Hocko <mhocko@suse.com> Acked-by: Oleg Nesterov <oleg@redhat.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Eugene Syromiatnikov <esyr@redhat.com> Cc: Christian Kellner <christian@kellner.me> Cc: Adrian Reber <areber@redhat.com> Cc: Shakeel Butt <shakeelb@google.com> Cc: Aleksa Sarai <cyphar@cyphar.com> Cc: Alexey Dobriyan <adobriyan@gmail.com> Cc: "Eric W. Biederman" <ebiederm@xmission.com> Cc: Alexey Gladkov <gladkov.alexey@gmail.com> Cc: Michel Lespinasse <walken@google.com> Cc: Daniel Jordan <daniel.m.jordan@oracle.com> Cc: Andrei Vagin <avagin@gmail.com> Cc: Bernd Edlinger <bernd.edlinger@hotmail.de> Cc: John Johansen <john.johansen@canonical.com> Cc: Yafang Shao <laoar.shao@gmail.com> Link: https://lkml.kernel.org/r/20200824153036.3201505-1-surenb@google.com Debugged-by: Minchan Kim <minchan@kernel.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
131 lines
3.3 KiB
C
131 lines
3.3 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
#ifndef __INCLUDE_LINUX_OOM_H
|
|
#define __INCLUDE_LINUX_OOM_H
|
|
|
|
|
|
#include <linux/sched/signal.h>
|
|
#include <linux/types.h>
|
|
#include <linux/nodemask.h>
|
|
#include <uapi/linux/oom.h>
|
|
#include <linux/sched/coredump.h> /* MMF_* */
|
|
#include <linux/mm.h> /* VM_FAULT* */
|
|
|
|
/* Forward declarations of struct types referenced by the declarations below. */
struct zonelist;
struct notifier_block;
struct mem_cgroup;
struct task_struct;
|
|
|
|
/*
 * Constraint kinds stored in struct oom_control::constraint.
 * No explicit values are assigned, so the enumerators are 0..3 in
 * declaration order.
 */
enum oom_constraint {
	CONSTRAINT_NONE,
	CONSTRAINT_CPUSET,
	CONSTRAINT_MEMORY_POLICY,
	CONSTRAINT_MEMCG,
};
|
|
|
|
/*
|
|
* Details of the page allocation that triggered the oom killer that are used to
|
|
* determine what should be killed.
|
|
*/
|
|
struct oom_control {
|
|
/* Used to determine cpuset */
|
|
struct zonelist *zonelist;
|
|
|
|
/* Used to determine mempolicy */
|
|
nodemask_t *nodemask;
|
|
|
|
/* Memory cgroup in which oom is invoked, or NULL for global oom */
|
|
struct mem_cgroup *memcg;
|
|
|
|
/* Used to determine cpuset and node locality requirement */
|
|
const gfp_t gfp_mask;
|
|
|
|
/*
|
|
* order == -1 means the oom kill is required by sysrq, otherwise only
|
|
* for display purposes.
|
|
*/
|
|
const int order;
|
|
|
|
/* Used by oom implementation, do not set */
|
|
unsigned long totalpages;
|
|
struct task_struct *chosen;
|
|
long chosen_points;
|
|
|
|
/* Used to print the constraint info. */
|
|
enum oom_constraint constraint;
|
|
};
|
|
|
|
extern struct mutex oom_lock;
/*
 * Global in scope: used to prevent races between clone() and
 * __set_oom_adj() when oom_score_adj of the process being cloned might be
 * modified from userspace.
 */
extern struct mutex oom_adj_mutex;
|
|
|
|
static inline void set_current_oom_origin(void)
|
|
{
|
|
current->signal->oom_flag_origin = true;
|
|
}
|
|
|
|
static inline void clear_current_oom_origin(void)
|
|
{
|
|
current->signal->oom_flag_origin = false;
|
|
}
|
|
|
|
static inline bool oom_task_origin(const struct task_struct *p)
|
|
{
|
|
return p->signal->oom_flag_origin;
|
|
}
|
|
|
|
static inline bool tsk_is_oom_victim(struct task_struct * tsk)
|
|
{
|
|
return tsk->signal->oom_mm;
|
|
}
|
|
|
|
/*
|
|
* Use this helper if tsk->mm != mm and the victim mm needs a special
|
|
* handling. This is guaranteed to stay true after once set.
|
|
*/
|
|
static inline bool mm_is_oom_victim(struct mm_struct *mm)
|
|
{
|
|
return test_bit(MMF_OOM_VICTIM, &mm->flags);
|
|
}
|
|
|
|
/*
|
|
* Checks whether a page fault on the given mm is still reliable.
|
|
* This is no longer true if the oom reaper started to reap the
|
|
* address space which is reflected by MMF_UNSTABLE flag set in
|
|
* the mm. At that moment any !shared mapping would lose the content
|
|
* and could cause a memory corruption (zero pages instead of the
|
|
* original content).
|
|
*
|
|
* User should call this before establishing a page table entry for
|
|
* a !shared mapping and under the proper page table lock.
|
|
*
|
|
* Return 0 when the PF is safe VM_FAULT_SIGBUS otherwise.
|
|
*/
|
|
static inline vm_fault_t check_stable_address_space(struct mm_struct *mm)
|
|
{
|
|
if (unlikely(test_bit(MMF_UNSTABLE, &mm->flags)))
|
|
return VM_FAULT_SIGBUS;
|
|
return 0;
|
|
}
|
|
|
|
/* Prototypes only; none of these functions is defined in this header. */
bool __oom_reap_task_mm(struct mm_struct *mm);

long oom_badness(struct task_struct *p,
		unsigned long totalpages);

extern bool out_of_memory(struct oom_control *oc);

extern void exit_oom_victim(void);

extern int register_oom_notifier(struct notifier_block *nb);
extern int unregister_oom_notifier(struct notifier_block *nb);

extern bool oom_killer_disable(signed long timeout);
extern void oom_killer_enable(void);

extern struct task_struct *find_lock_task_mm(struct task_struct *p);
|
|
|
|
/* sysctls */
extern int sysctl_oom_dump_tasks;
extern int sysctl_oom_kill_allocating_task;
extern int sysctl_panic_on_oom;
|
|
#endif /* __INCLUDE_LINUX_OOM_H */
|