mm, slab, slub: stop taking memory hotplug lock
Since commit 03afc0e25f
("slab: get_online_mems for kmem_cache_{create,destroy,shrink}") we are taking the memory hotplug lock for SLAB and SLUB when creating, destroying or shrinking a cache. It is quite a heavy lock and it's best to avoid it if possible, as we had several issues with lockdep complaining about ordering in the past, see e.g. e4f8e513c3
("mm/slub: fix a deadlock in show_slab_objects()"). The problem scenario in 03afc0e25f
(solved by the memory hotplug lock) can be summarized as follows: while there's slab_mutex synchronizing new kmem cache creation and SLUB's MEM_GOING_ONLINE callback slab_mem_going_online_callback(), we may miss creation of kmem_cache_node for the hotplugged node in the new kmem cache, because the hotplug callback doesn't yet see the new cache, and cache creation in init_kmem_cache_nodes() only inits kmem_cache_node for nodes in the N_NORMAL_MEMORY nodemask, which however may not yet include the new node, as that happens only later after the MEM_GOING_ONLINE callback. Instead of using get/put_online_mems(), the problem can be solved by SLUB maintaining its own nodemask of nodes for which it has allocated the per-node kmem_cache_node structures. This nodemask would generally mirror the N_NORMAL_MEMORY nodemask, but would be updated only under SLUB's control in its memory hotplug callbacks under the slab_mutex. This patch adds such a nodemask and its handling. Commit 03afc0e25f
mentions "issues like [the one above]", but there don't appear to be further issues. All the paths (shared for SLAB and SLUB) taking the memory hotplug locks are also taking the slab_mutex, except kmem_cache_shrink() where 03afc0e25f
replaced slab_mutex with get/put_online_mems(). We however cannot simply restore slab_mutex in kmem_cache_shrink(), as SLUB can enter the function from a write to the sysfs 'shrink' file, thus holding the kernfs lock, and in kmem_cache_create() the kernfs lock is nested within slab_mutex. But on closer inspection we don't actually need to protect kmem_cache_shrink() from hotplug callbacks: While SLUB's __kmem_cache_shrink() does for_each_kmem_cache_node(), missing a new node added in parallel hotplug is not fatal, and parallel hotremove does not free kmem_cache_node's anymore after the previous patch, so use-after-free cannot happen. The per-node shrinking itself is protected by n->list_lock. Same is true for SLAB, and SLOB is no-op. SLAB also doesn't need the memory hotplug locking, which it only gained by 03afc0e25f
through the shared paths in slab_common.c. Its memory hotplug callbacks are also protected by slab_mutex against races with these paths. The problem of SLUB relying on N_NORMAL_MEMORY doesn't apply to SLAB, as its setup_kmem_cache_nodes relies on N_ONLINE, and the new node is already set there during the MEM_GOING_ONLINE callback, so no special care is needed for SLAB. As such, this patch removes all get/put_online_mems() usage by the slab subsystem. Link: https://lkml.kernel.org/r/20210113131634.3671-3-vbabka@suse.cz Signed-off-by: Vlastimil Babka <vbabka@suse.cz> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: David Rientjes <rientjes@google.com> Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com> Cc: Michal Hocko <mhocko@kernel.org> Cc: Pekka Enberg <penberg@kernel.org> Cc: Qian Cai <cai@redhat.com> Cc: Vladimir Davydov <vdavydov.dev@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
parent
666716fd26
commit
7e1fa93def
@ -310,7 +310,6 @@ kmem_cache_create_usercopy(const char *name,
|
||||
int err;
|
||||
|
||||
get_online_cpus();
|
||||
get_online_mems();
|
||||
|
||||
mutex_lock(&slab_mutex);
|
||||
|
||||
@ -360,7 +359,6 @@ kmem_cache_create_usercopy(const char *name,
|
||||
out_unlock:
|
||||
mutex_unlock(&slab_mutex);
|
||||
|
||||
put_online_mems();
|
||||
put_online_cpus();
|
||||
|
||||
if (err) {
|
||||
@ -487,7 +485,6 @@ void kmem_cache_destroy(struct kmem_cache *s)
|
||||
return;
|
||||
|
||||
get_online_cpus();
|
||||
get_online_mems();
|
||||
|
||||
mutex_lock(&slab_mutex);
|
||||
|
||||
@ -504,7 +501,6 @@ void kmem_cache_destroy(struct kmem_cache *s)
|
||||
out_unlock:
|
||||
mutex_unlock(&slab_mutex);
|
||||
|
||||
put_online_mems();
|
||||
put_online_cpus();
|
||||
}
|
||||
EXPORT_SYMBOL(kmem_cache_destroy);
|
||||
@ -523,10 +519,10 @@ int kmem_cache_shrink(struct kmem_cache *cachep)
|
||||
int ret;
|
||||
|
||||
get_online_cpus();
|
||||
get_online_mems();
|
||||
|
||||
kasan_cache_shrink(cachep);
|
||||
ret = __kmem_cache_shrink(cachep);
|
||||
put_online_mems();
|
||||
|
||||
put_online_cpus();
|
||||
return ret;
|
||||
}
|
||||
|
28
mm/slub.c
28
mm/slub.c
@ -235,6 +235,14 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si)
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
* Tracks for which NUMA nodes we have kmem_cache_nodes allocated.
|
||||
* Corresponds to node_state[N_NORMAL_MEMORY], but can temporarily
|
||||
* differ during memory hotplug/hotremove operations.
|
||||
* Protected by slab_mutex.
|
||||
*/
|
||||
static nodemask_t slab_nodes;
|
||||
|
||||
/********************************************************************
|
||||
* Core slab cache functions
|
||||
*******************************************************************/
|
||||
@ -2678,7 +2686,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
|
||||
* ignore the node constraint
|
||||
*/
|
||||
if (unlikely(node != NUMA_NO_NODE &&
|
||||
!node_state(node, N_NORMAL_MEMORY)))
|
||||
!node_isset(node, slab_nodes)))
|
||||
node = NUMA_NO_NODE;
|
||||
goto new_slab;
|
||||
}
|
||||
@ -2689,7 +2697,7 @@ redo:
|
||||
* same as above but node_match() being false already
|
||||
* implies node != NUMA_NO_NODE
|
||||
*/
|
||||
if (!node_state(node, N_NORMAL_MEMORY)) {
|
||||
if (!node_isset(node, slab_nodes)) {
|
||||
node = NUMA_NO_NODE;
|
||||
goto redo;
|
||||
} else {
|
||||
@ -3592,7 +3600,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s)
|
||||
{
|
||||
int node;
|
||||
|
||||
for_each_node_state(node, N_NORMAL_MEMORY) {
|
||||
for_each_node_mask(node, slab_nodes) {
|
||||
struct kmem_cache_node *n;
|
||||
|
||||
if (slab_state == DOWN) {
|
||||
@ -4286,6 +4294,7 @@ static void slab_mem_offline_callback(void *arg)
|
||||
return;
|
||||
|
||||
mutex_lock(&slab_mutex);
|
||||
node_clear(offline_node, slab_nodes);
|
||||
/*
|
||||
* We no longer free kmem_cache_node structures here, as it would be
|
||||
* racy with all get_node() users, and infeasible to protect them with
|
||||
@ -4335,6 +4344,11 @@ static int slab_mem_going_online_callback(void *arg)
|
||||
init_kmem_cache_node(n);
|
||||
s->node[nid] = n;
|
||||
}
|
||||
/*
|
||||
* Any cache created after this point will also have kmem_cache_node
|
||||
* initialized for the new node.
|
||||
*/
|
||||
node_set(nid, slab_nodes);
|
||||
out:
|
||||
mutex_unlock(&slab_mutex);
|
||||
return ret;
|
||||
@ -4415,6 +4429,7 @@ void __init kmem_cache_init(void)
|
||||
{
|
||||
static __initdata struct kmem_cache boot_kmem_cache,
|
||||
boot_kmem_cache_node;
|
||||
int node;
|
||||
|
||||
if (debug_guardpage_minorder())
|
||||
slub_max_order = 0;
|
||||
@ -4422,6 +4437,13 @@ void __init kmem_cache_init(void)
|
||||
kmem_cache_node = &boot_kmem_cache_node;
|
||||
kmem_cache = &boot_kmem_cache;
|
||||
|
||||
/*
|
||||
* Initialize the nodemask for which we will allocate per node
|
||||
* structures. Here we don't need taking slab_mutex yet.
|
||||
*/
|
||||
for_each_node_state(node, N_NORMAL_MEMORY)
|
||||
node_set(node, slab_nodes);
|
||||
|
||||
create_boot_cache(kmem_cache_node, "kmem_cache_node",
|
||||
sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN, 0, 0);
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user