mm: memcg: move cgroup v1 oom handling code into memcontrol-v1.c
Cgroup v1 supports a complicated mechanism for handling OOM in userspace,
which is not supported by cgroup v2. Let's move the corresponding code into
memcontrol-v1.c.

Aside from mechanical code movement, this patch introduces two new
functions: memcg1_oom_prepare() and memcg1_oom_finish(). They implement the
cgroup v1-specific parts of the common memcg OOM handling path.

Link: https://lkml.kernel.org/r/20240625005906.106920-9-roman.gushchin@linux.dev
Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Muchun Song <muchun.song@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
commit 292fc2e020
parent cc7b8504f6
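For orientation, here is a condensed view of what the common charge-path OOM
handler in mm/memcontrol.c looks like after this change. It is assembled from
the hunks below rather than copied verbatim; early-exit checks and unrelated
details are omitted.

/*
 * Condensed sketch (not the literal source): the charge path stays in
 * mm/memcontrol.c and only calls into the two new cgroup v1 hooks,
 * which live in mm/memcontrol-v1.c.
 */
static bool mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
{
	bool locked, ret;

	memcg_memory_event(memcg, MEMCG_OOM);

	/*
	 * memcg1_oom_prepare() covers the v1-only userspace OOM handling:
	 * with oom_kill_disable set it records the OOM state in the task
	 * and returns false, so the charge path bails out and
	 * mem_cgroup_oom_synchronize() completes the handling at the end
	 * of the page fault. Otherwise it takes the hierarchical OOM lock
	 * and sends the v1 OOM notifications.
	 */
	if (!memcg1_oom_prepare(memcg, &locked))
		return false;

	ret = mem_cgroup_out_of_memory(memcg, mask, order);

	/* Drops the hierarchical memcg_oom_lock if prepare() took it. */
	memcg1_oom_finish(memcg, locked);

	return ret;
}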
mm/memcontrol-v1.c

@@ -110,7 +110,13 @@ struct mem_cgroup_event {
 	struct work_struct remove;
 };
 
-extern spinlock_t memcg_oom_lock;
+#ifdef CONFIG_LOCKDEP
+static struct lockdep_map memcg_oom_lock_dep_map = {
+	.name = "memcg_oom_lock",
+};
+#endif
+
+DEFINE_SPINLOCK(memcg_oom_lock);
 
 static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
 					 struct mem_cgroup_tree_per_node *mctz,

@@ -1469,7 +1475,7 @@ static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
 	return 0;
 }
 
-void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
+static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
 {
 	struct mem_cgroup *iter;
 

@@ -1959,6 +1965,225 @@ void memcg1_css_offline(struct mem_cgroup *memcg)
 	spin_unlock_irq(&memcg->event_list_lock);
 }
 
+/*
+ * Check OOM-Killer is already running under our hierarchy.
+ * If someone is running, return false.
+ */
+static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
+{
+	struct mem_cgroup *iter, *failed = NULL;
+
+	spin_lock(&memcg_oom_lock);
+
+	for_each_mem_cgroup_tree(iter, memcg) {
+		if (iter->oom_lock) {
+			/*
+			 * this subtree of our hierarchy is already locked
+			 * so we cannot give a lock.
+			 */
+			failed = iter;
+			mem_cgroup_iter_break(memcg, iter);
+			break;
+		} else
+			iter->oom_lock = true;
+	}
+
+	if (failed) {
+		/*
+		 * OK, we failed to lock the whole subtree so we have
+		 * to clean up what we set up to the failing subtree
+		 */
+		for_each_mem_cgroup_tree(iter, memcg) {
+			if (iter == failed) {
+				mem_cgroup_iter_break(memcg, iter);
+				break;
+			}
+			iter->oom_lock = false;
+		}
+	} else
+		mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
+
+	spin_unlock(&memcg_oom_lock);
+
+	return !failed;
+}
+
+static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
+{
+	struct mem_cgroup *iter;
+
+	spin_lock(&memcg_oom_lock);
+	mutex_release(&memcg_oom_lock_dep_map, _RET_IP_);
+	for_each_mem_cgroup_tree(iter, memcg)
+		iter->oom_lock = false;
+	spin_unlock(&memcg_oom_lock);
+}
+
+static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
+{
+	struct mem_cgroup *iter;
+
+	spin_lock(&memcg_oom_lock);
+	for_each_mem_cgroup_tree(iter, memcg)
+		iter->under_oom++;
+	spin_unlock(&memcg_oom_lock);
+}
+
+static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
+{
+	struct mem_cgroup *iter;
+
+	/*
+	 * Be careful about under_oom underflows because a child memcg
+	 * could have been added after mem_cgroup_mark_under_oom.
+	 */
+	spin_lock(&memcg_oom_lock);
+	for_each_mem_cgroup_tree(iter, memcg)
+		if (iter->under_oom > 0)
+			iter->under_oom--;
+	spin_unlock(&memcg_oom_lock);
+}
+
+static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
+
+struct oom_wait_info {
+	struct mem_cgroup *memcg;
+	wait_queue_entry_t wait;
+};
+
+static int memcg_oom_wake_function(wait_queue_entry_t *wait,
+	unsigned mode, int sync, void *arg)
+{
+	struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
+	struct mem_cgroup *oom_wait_memcg;
+	struct oom_wait_info *oom_wait_info;
+
+	oom_wait_info = container_of(wait, struct oom_wait_info, wait);
+	oom_wait_memcg = oom_wait_info->memcg;
+
+	if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
+	    !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
+		return 0;
+	return autoremove_wake_function(wait, mode, sync, arg);
+}
+
+void memcg_oom_recover(struct mem_cgroup *memcg)
+{
+	/*
+	 * For the following lockless ->under_oom test, the only required
+	 * guarantee is that it must see the state asserted by an OOM when
+	 * this function is called as a result of userland actions
+	 * triggered by the notification of the OOM. This is trivially
+	 * achieved by invoking mem_cgroup_mark_under_oom() before
+	 * triggering notification.
+	 */
+	if (memcg && memcg->under_oom)
+		__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
+}
+
+/**
+ * mem_cgroup_oom_synchronize - complete memcg OOM handling
+ * @handle: actually kill/wait or just clean up the OOM state
+ *
+ * This has to be called at the end of a page fault if the memcg OOM
+ * handler was enabled.
+ *
+ * Memcg supports userspace OOM handling where failed allocations must
+ * sleep on a waitqueue until the userspace task resolves the
+ * situation. Sleeping directly in the charge context with all kinds
+ * of locks held is not a good idea, instead we remember an OOM state
+ * in the task and mem_cgroup_oom_synchronize() has to be called at
+ * the end of the page fault to complete the OOM handling.
+ *
+ * Returns %true if an ongoing memcg OOM situation was detected and
+ * completed, %false otherwise.
+ */
+bool mem_cgroup_oom_synchronize(bool handle)
+{
+	struct mem_cgroup *memcg = current->memcg_in_oom;
+	struct oom_wait_info owait;
+	bool locked;
+
+	/* OOM is global, do not handle */
+	if (!memcg)
+		return false;
+
+	if (!handle)
+		goto cleanup;
+
+	owait.memcg = memcg;
+	owait.wait.flags = 0;
+	owait.wait.func = memcg_oom_wake_function;
+	owait.wait.private = current;
+	INIT_LIST_HEAD(&owait.wait.entry);
+
+	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
+	mem_cgroup_mark_under_oom(memcg);
+
+	locked = mem_cgroup_oom_trylock(memcg);
+
+	if (locked)
+		mem_cgroup_oom_notify(memcg);
+
+	schedule();
+	mem_cgroup_unmark_under_oom(memcg);
+	finish_wait(&memcg_oom_waitq, &owait.wait);
+
+	if (locked)
+		mem_cgroup_oom_unlock(memcg);
+cleanup:
+	current->memcg_in_oom = NULL;
+	css_put(&memcg->css);
+	return true;
+}
+
+
+bool memcg1_oom_prepare(struct mem_cgroup *memcg, bool *locked)
+{
+	/*
+	 * We are in the middle of the charge context here, so we
+	 * don't want to block when potentially sitting on a callstack
+	 * that holds all kinds of filesystem and mm locks.
+	 *
+	 * cgroup1 allows disabling the OOM killer and waiting for outside
+	 * handling until the charge can succeed; remember the context and put
+	 * the task to sleep at the end of the page fault when all locks are
+	 * released.
+	 *
+	 * On the other hand, in-kernel OOM killer allows for an async victim
+	 * memory reclaim (oom_reaper) and that means that we are not solely
+	 * relying on the oom victim to make a forward progress and we can
+	 * invoke the oom killer here.
+	 *
+	 * Please note that mem_cgroup_out_of_memory might fail to find a
+	 * victim and then we have to bail out from the charge path.
+	 */
+	if (READ_ONCE(memcg->oom_kill_disable)) {
+		if (current->in_user_fault) {
+			css_get(&memcg->css);
+			current->memcg_in_oom = memcg;
+		}
+		return false;
+	}
+
+	mem_cgroup_mark_under_oom(memcg);
+
+	*locked = mem_cgroup_oom_trylock(memcg);
+
+	if (*locked)
+		mem_cgroup_oom_notify(memcg);
+
+	mem_cgroup_unmark_under_oom(memcg);
+
+	return true;
+}
+
+void memcg1_oom_finish(struct mem_cgroup *memcg, bool locked)
+{
+	if (locked)
+		mem_cgroup_oom_unlock(memcg);
+}
+
 static int __init memcg1_init(void)
 {
 	int node;
mm/memcontrol-v1.h

@@ -87,9 +87,10 @@ enum res_type {
 bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
 				enum mem_cgroup_events_target target);
 unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap);
-void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
 ssize_t memcg_write_event_control(struct kernfs_open_file *of,
 				  char *buf, size_t nbytes, loff_t off);
 
+bool memcg1_oom_prepare(struct mem_cgroup *memcg, bool *locked);
+void memcg1_oom_finish(struct mem_cgroup *memcg, bool locked);
 
 #endif /* __MM_MEMCONTROL_V1_H */
mm/memcontrol.c
@@ -1615,130 +1615,6 @@ unlock:
 	return ret;
 }
 
-#ifdef CONFIG_LOCKDEP
-static struct lockdep_map memcg_oom_lock_dep_map = {
-	.name = "memcg_oom_lock",
-};
-#endif
-
-DEFINE_SPINLOCK(memcg_oom_lock);
-
-/*
- * Check OOM-Killer is already running under our hierarchy.
- * If someone is running, return false.
- */
-static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
-{
-	struct mem_cgroup *iter, *failed = NULL;
-
-	spin_lock(&memcg_oom_lock);
-
-	for_each_mem_cgroup_tree(iter, memcg) {
-		if (iter->oom_lock) {
-			/*
-			 * this subtree of our hierarchy is already locked
-			 * so we cannot give a lock.
-			 */
-			failed = iter;
-			mem_cgroup_iter_break(memcg, iter);
-			break;
-		} else
-			iter->oom_lock = true;
-	}
-
-	if (failed) {
-		/*
-		 * OK, we failed to lock the whole subtree so we have
-		 * to clean up what we set up to the failing subtree
-		 */
-		for_each_mem_cgroup_tree(iter, memcg) {
-			if (iter == failed) {
-				mem_cgroup_iter_break(memcg, iter);
-				break;
-			}
-			iter->oom_lock = false;
-		}
-	} else
-		mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
-
-	spin_unlock(&memcg_oom_lock);
-
-	return !failed;
-}
-
-static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
-{
-	struct mem_cgroup *iter;
-
-	spin_lock(&memcg_oom_lock);
-	mutex_release(&memcg_oom_lock_dep_map, _RET_IP_);
-	for_each_mem_cgroup_tree(iter, memcg)
-		iter->oom_lock = false;
-	spin_unlock(&memcg_oom_lock);
-}
-
-static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
-{
-	struct mem_cgroup *iter;
-
-	spin_lock(&memcg_oom_lock);
-	for_each_mem_cgroup_tree(iter, memcg)
-		iter->under_oom++;
-	spin_unlock(&memcg_oom_lock);
-}
-
-static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
-{
-	struct mem_cgroup *iter;
-
-	/*
-	 * Be careful about under_oom underflows because a child memcg
-	 * could have been added after mem_cgroup_mark_under_oom.
-	 */
-	spin_lock(&memcg_oom_lock);
-	for_each_mem_cgroup_tree(iter, memcg)
-		if (iter->under_oom > 0)
-			iter->under_oom--;
-	spin_unlock(&memcg_oom_lock);
-}
-
-static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
-
-struct oom_wait_info {
-	struct mem_cgroup *memcg;
-	wait_queue_entry_t wait;
-};
-
-static int memcg_oom_wake_function(wait_queue_entry_t *wait,
-	unsigned mode, int sync, void *arg)
-{
-	struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
-	struct mem_cgroup *oom_wait_memcg;
-	struct oom_wait_info *oom_wait_info;
-
-	oom_wait_info = container_of(wait, struct oom_wait_info, wait);
-	oom_wait_memcg = oom_wait_info->memcg;
-
-	if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
-	    !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
-		return 0;
-	return autoremove_wake_function(wait, mode, sync, arg);
-}
-
-void memcg_oom_recover(struct mem_cgroup *memcg)
-{
-	/*
-	 * For the following lockless ->under_oom test, the only required
-	 * guarantee is that it must see the state asserted by an OOM when
-	 * this function is called as a result of userland actions
-	 * triggered by the notification of the OOM. This is trivially
-	 * achieved by invoking mem_cgroup_mark_under_oom() before
-	 * triggering notification.
-	 */
-	if (memcg && memcg->under_oom)
-		__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
-}
-
 /*
  * Returns true if successfully killed one or more processes. Though in some
  * corner cases it can return true even without killing any process.
@@ -1752,104 +1628,16 @@ static bool mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
 
 	memcg_memory_event(memcg, MEMCG_OOM);
 
-	/*
-	 * We are in the middle of the charge context here, so we
-	 * don't want to block when potentially sitting on a callstack
-	 * that holds all kinds of filesystem and mm locks.
-	 *
-	 * cgroup1 allows disabling the OOM killer and waiting for outside
-	 * handling until the charge can succeed; remember the context and put
-	 * the task to sleep at the end of the page fault when all locks are
-	 * released.
-	 *
-	 * On the other hand, in-kernel OOM killer allows for an async victim
-	 * memory reclaim (oom_reaper) and that means that we are not solely
-	 * relying on the oom victim to make a forward progress and we can
-	 * invoke the oom killer here.
-	 *
-	 * Please note that mem_cgroup_out_of_memory might fail to find a
-	 * victim and then we have to bail out from the charge path.
-	 */
-	if (READ_ONCE(memcg->oom_kill_disable)) {
-		if (current->in_user_fault) {
-			css_get(&memcg->css);
-			current->memcg_in_oom = memcg;
-		}
+	if (!memcg1_oom_prepare(memcg, &locked))
 		return false;
-	}
-
-	mem_cgroup_mark_under_oom(memcg);
-
-	locked = mem_cgroup_oom_trylock(memcg);
-
-	if (locked)
-		mem_cgroup_oom_notify(memcg);
 
-	mem_cgroup_unmark_under_oom(memcg);
 	ret = mem_cgroup_out_of_memory(memcg, mask, order);
 
-	if (locked)
-		mem_cgroup_oom_unlock(memcg);
+	memcg1_oom_finish(memcg, locked);
 
 	return ret;
 }
 
-/**
- * mem_cgroup_oom_synchronize - complete memcg OOM handling
- * @handle: actually kill/wait or just clean up the OOM state
- *
- * This has to be called at the end of a page fault if the memcg OOM
- * handler was enabled.
- *
- * Memcg supports userspace OOM handling where failed allocations must
- * sleep on a waitqueue until the userspace task resolves the
- * situation. Sleeping directly in the charge context with all kinds
- * of locks held is not a good idea, instead we remember an OOM state
- * in the task and mem_cgroup_oom_synchronize() has to be called at
- * the end of the page fault to complete the OOM handling.
- *
- * Returns %true if an ongoing memcg OOM situation was detected and
- * completed, %false otherwise.
- */
-bool mem_cgroup_oom_synchronize(bool handle)
-{
-	struct mem_cgroup *memcg = current->memcg_in_oom;
-	struct oom_wait_info owait;
-	bool locked;
-
-	/* OOM is global, do not handle */
-	if (!memcg)
-		return false;
-
-	if (!handle)
-		goto cleanup;
-
-	owait.memcg = memcg;
-	owait.wait.flags = 0;
-	owait.wait.func = memcg_oom_wake_function;
-	owait.wait.private = current;
-	INIT_LIST_HEAD(&owait.wait.entry);
-
-	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
-	mem_cgroup_mark_under_oom(memcg);
-
-	locked = mem_cgroup_oom_trylock(memcg);
-
-	if (locked)
-		mem_cgroup_oom_notify(memcg);
-
-	schedule();
-	mem_cgroup_unmark_under_oom(memcg);
-	finish_wait(&memcg_oom_waitq, &owait.wait);
-
-	if (locked)
-		mem_cgroup_oom_unlock(memcg);
-cleanup:
-	current->memcg_in_oom = NULL;
-	css_put(&memcg->css);
-	return true;
-}
-
 /**
  * mem_cgroup_get_oom_group - get a memory cgroup to clean up after OOM
  * @victim: task to be killed by the OOM killer