mm/demotion: build demotion targets based on explicit memory tiers
This patch switches the demotion target building logic to use memory tiers instead of NUMA distance. All N_MEMORY NUMA nodes will be placed in the default memory tier and additional memory tiers will be added by drivers like dax kmem. This patch builds the demotion targets for a NUMA node by looking at all memory tiers below the tier to which the NUMA node belongs. The closest node in the immediately following memory tier is used as a demotion target. Since we are now only building demotion targets for N_MEMORY NUMA nodes, the CPU hotplug calls are removed in this patch. Link: https://lkml.kernel.org/r/20220818131042.113280-6-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Acked-by: Wei Xu <weixugc@google.com> Cc: Alistair Popple <apopple@nvidia.com> Cc: Bharata B Rao <bharata@amd.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Dave Hansen <dave.hansen@intel.com> Cc: Davidlohr Bueso <dave@stgolabs.net> Cc: Hesham Almatary <hesham.almatary@huawei.com> Cc: Jagdish Gediya <jvgediya.oss@gmail.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com> Cc: Michal Hocko <mhocko@kernel.org> Cc: Tim Chen <tim.c.chen@intel.com> Cc: Yang Shi <shy828301@gmail.com> Cc: SeongJae Park <sj@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
This commit is contained in:
parent
7b88bda376
commit
6c542ab757
@ -37,6 +37,14 @@ struct memory_dev_type *alloc_memory_type(int adistance);
|
||||
void destroy_memory_type(struct memory_dev_type *memtype);
|
||||
void init_node_memory_type(int node, struct memory_dev_type *default_type);
|
||||
void clear_node_memory_type(int node, struct memory_dev_type *memtype);
|
||||
#ifdef CONFIG_MIGRATION
|
||||
int next_demotion_node(int node);
|
||||
#else
|
||||
static inline int next_demotion_node(int node)
|
||||
{
|
||||
return NUMA_NO_NODE;
|
||||
}
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
@ -63,5 +71,10 @@ static inline void clear_node_memory_type(int node, struct memory_dev_type *memt
|
||||
{
|
||||
|
||||
}
|
||||
|
||||
static inline int next_demotion_node(int node)
|
||||
{
|
||||
return NUMA_NO_NODE;
|
||||
}
|
||||
#endif /* CONFIG_NUMA */
|
||||
#endif /* _LINUX_MEMORY_TIERS_H */
|
||||
|
@ -100,19 +100,6 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
|
||||
|
||||
#endif /* CONFIG_MIGRATION */
|
||||
|
||||
#if defined(CONFIG_MIGRATION) && defined(CONFIG_NUMA)
|
||||
extern void set_migration_target_nodes(void);
|
||||
extern void migrate_on_reclaim_init(void);
|
||||
extern int next_demotion_node(int node);
|
||||
#else
|
||||
static inline void set_migration_target_nodes(void) {}
|
||||
static inline void migrate_on_reclaim_init(void) {}
|
||||
static inline int next_demotion_node(int node)
|
||||
{
|
||||
return NUMA_NO_NODE;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_COMPACTION
|
||||
bool PageMovable(struct page *page);
|
||||
void __SetPageMovable(struct page *page, const struct movable_operations *ops);
|
||||
|
@ -6,6 +6,8 @@
|
||||
#include <linux/memory.h>
|
||||
#include <linux/memory-tiers.h>
|
||||
|
||||
#include "internal.h"
|
||||
|
||||
struct memory_tier {
|
||||
/* hierarchy of memory tiers */
|
||||
struct list_head list;
|
||||
@ -19,6 +21,10 @@ struct memory_tier {
|
||||
int adistance_start;
|
||||
};
|
||||
|
||||
struct demotion_nodes {
|
||||
nodemask_t preferred;
|
||||
};
|
||||
|
||||
struct node_memory_type_map {
|
||||
struct memory_dev_type *memtype;
|
||||
int map_count;
|
||||
@ -28,6 +34,66 @@ static DEFINE_MUTEX(memory_tier_lock);
|
||||
static LIST_HEAD(memory_tiers);
|
||||
static struct node_memory_type_map node_memory_types[MAX_NUMNODES];
|
||||
static struct memory_dev_type *default_dram_type;
|
||||
#ifdef CONFIG_MIGRATION
|
||||
/*
|
||||
* node_demotion[] examples:
|
||||
*
|
||||
* Example 1:
|
||||
*
|
||||
* Node 0 & 1 are CPU + DRAM nodes, node 2 & 3 are PMEM nodes.
|
||||
*
|
||||
* node distances:
|
||||
* node 0 1 2 3
|
||||
* 0 10 20 30 40
|
||||
* 1 20 10 40 30
|
||||
* 2 30 40 10 40
|
||||
* 3 40 30 40 10
|
||||
*
|
||||
* memory_tiers0 = 0-1
|
||||
* memory_tiers1 = 2-3
|
||||
*
|
||||
* node_demotion[0].preferred = 2
|
||||
* node_demotion[1].preferred = 3
|
||||
* node_demotion[2].preferred = <empty>
|
||||
* node_demotion[3].preferred = <empty>
|
||||
*
|
||||
* Example 2:
|
||||
*
|
||||
* Node 0 & 1 are CPU + DRAM nodes, node 2 is memory-only DRAM node.
|
||||
*
|
||||
* node distances:
|
||||
* node 0 1 2
|
||||
* 0 10 20 30
|
||||
* 1 20 10 30
|
||||
* 2 30 30 10
|
||||
*
|
||||
* memory_tiers0 = 0-2
|
||||
*
|
||||
* node_demotion[0].preferred = <empty>
|
||||
* node_demotion[1].preferred = <empty>
|
||||
* node_demotion[2].preferred = <empty>
|
||||
*
|
||||
* Example 3:
|
||||
*
|
||||
* Node 0 is a CPU + DRAM node, node 1 is an HBM node, node 2 is a PMEM node.
|
||||
*
|
||||
* node distances:
|
||||
* node 0 1 2
|
||||
* 0 10 20 30
|
||||
* 1 20 10 40
|
||||
* 2 30 40 10
|
||||
*
|
||||
* memory_tiers0 = 1
|
||||
* memory_tiers1 = 0
|
||||
* memory_tiers2 = 2
|
||||
*
|
||||
* node_demotion[0].preferred = 2
|
||||
* node_demotion[1].preferred = 0
|
||||
* node_demotion[2].preferred = <empty>
|
||||
*
|
||||
*/
|
||||
static struct demotion_nodes *node_demotion __read_mostly;
|
||||
#endif /* CONFIG_MIGRATION */
|
||||
|
||||
static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memtype)
|
||||
{
|
||||
@ -73,6 +139,154 @@ static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memty
|
||||
return new_memtier;
|
||||
}
|
||||
|
||||
static struct memory_tier *__node_get_memory_tier(int node)
|
||||
{
|
||||
struct memory_dev_type *memtype;
|
||||
|
||||
memtype = node_memory_types[node];
|
||||
if (memtype && node_isset(node, memtype->nodes))
|
||||
return memtype->memtier;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MIGRATION
|
||||
/**
|
||||
* next_demotion_node() - Get the next node in the demotion path
|
||||
* @node: The starting node to lookup the next node
|
||||
*
|
||||
* Return: node id for next memory node in the demotion path hierarchy
|
||||
* from @node; NUMA_NO_NODE if @node is terminal. This does not keep
|
||||
* @node online or guarantee that it *continues* to be the next demotion
|
||||
* target.
|
||||
*/
|
||||
int next_demotion_node(int node)
|
||||
{
|
||||
struct demotion_nodes *nd;
|
||||
int target;
|
||||
|
||||
if (!node_demotion)
|
||||
return NUMA_NO_NODE;
|
||||
|
||||
nd = &node_demotion[node];
|
||||
|
||||
/*
|
||||
* node_demotion[] is updated without excluding this
|
||||
* function from running.
|
||||
*
|
||||
* Make sure to use RCU over entire code blocks if
|
||||
* node_demotion[] reads need to be consistent.
|
||||
*/
|
||||
rcu_read_lock();
|
||||
/*
|
||||
* If there are multiple target nodes, just select one
|
||||
* target node randomly.
|
||||
*
|
||||
* In addition, we can also use round-robin to select
|
||||
* target node, but we should introduce another variable
|
||||
* for node_demotion[] to record last selected target node,
|
||||
* that may cause cache ping-pong due to the changing of
|
||||
* last target node. Or introducing per-cpu data to avoid
|
||||
* caching issue, which seems more complicated. So selecting
|
||||
* target node randomly seems better until now.
|
||||
*/
|
||||
target = node_random(&nd->preferred);
|
||||
rcu_read_unlock();
|
||||
|
||||
return target;
|
||||
}
|
||||
|
||||
static void disable_all_demotion_targets(void)
|
||||
{
|
||||
int node;
|
||||
|
||||
for_each_node_state(node, N_MEMORY)
|
||||
node_demotion[node].preferred = NODE_MASK_NONE;
|
||||
/*
|
||||
* Ensure that the "disable" is visible across the system.
|
||||
* Readers will see either a combination of before+disable
|
||||
* state or disable+after. They will never see before and
|
||||
* after state together.
|
||||
*/
|
||||
synchronize_rcu();
|
||||
}
|
||||
|
||||
/*
 * Return the union of the ->nodes masks of every memory device type
 * linked on @memtier's memory_types list.
 */
static __always_inline nodemask_t get_memtier_nodemask(struct memory_tier *memtier)
{
	struct memory_dev_type *type;
	nodemask_t mask = NODE_MASK_NONE;

	list_for_each_entry(type, &memtier->memory_types, tier_sibiling) {
		nodes_or(mask, mask, type->nodes);
	}

	return mask;
}
|
||||
|
||||
/*
|
||||
* Find an automatic demotion target for all memory
|
||||
* nodes. Failing here is OK. It might just indicate
|
||||
* being at the end of a chain.
|
||||
*/
|
||||
static void establish_demotion_targets(void)
|
||||
{
|
||||
struct memory_tier *memtier;
|
||||
struct demotion_nodes *nd;
|
||||
int target = NUMA_NO_NODE, node;
|
||||
int distance, best_distance;
|
||||
nodemask_t tier_nodes;
|
||||
|
||||
lockdep_assert_held_once(&memory_tier_lock);
|
||||
|
||||
if (!node_demotion || !IS_ENABLED(CONFIG_MIGRATION))
|
||||
return;
|
||||
|
||||
disable_all_demotion_targets();
|
||||
|
||||
for_each_node_state(node, N_MEMORY) {
|
||||
best_distance = -1;
|
||||
nd = &node_demotion[node];
|
||||
|
||||
memtier = __node_get_memory_tier(node);
|
||||
if (!memtier || list_is_last(&memtier->list, &memory_tiers))
|
||||
continue;
|
||||
/*
|
||||
* Get the lower memtier to find the demotion node list.
|
||||
*/
|
||||
memtier = list_next_entry(memtier, list);
|
||||
tier_nodes = get_memtier_nodemask(memtier);
|
||||
/*
|
||||
* find_next_best_node, use 'used' nodemask as a skip list.
|
||||
* Add all memory nodes except the selected memory tier
|
||||
* nodelist to skip list so that we find the best node from the
|
||||
* memtier nodelist.
|
||||
*/
|
||||
nodes_andnot(tier_nodes, node_states[N_MEMORY], tier_nodes);
|
||||
|
||||
/*
|
||||
* Find all the nodes in the memory tier node list of same best distance.
|
||||
* add them to the preferred mask. We randomly select between nodes
|
||||
* in the preferred mask when allocating pages during demotion.
|
||||
*/
|
||||
do {
|
||||
target = find_next_best_node(node, &tier_nodes);
|
||||
if (target == NUMA_NO_NODE)
|
||||
break;
|
||||
|
||||
distance = node_distance(node, target);
|
||||
if (distance == best_distance || best_distance == -1) {
|
||||
best_distance = distance;
|
||||
node_set(target, nd->preferred);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
} while (1);
|
||||
}
|
||||
}
|
||||
|
||||
#else
|
||||
static inline void disable_all_demotion_targets(void) {}
|
||||
static inline void establish_demotion_targets(void) {}
|
||||
#endif /* CONFIG_MIGRATION */
|
||||
|
||||
static inline void __init_node_memory_type(int node, struct memory_dev_type *memtype)
|
||||
{
|
||||
if (!node_memory_types[node].memtype)
|
||||
@ -109,16 +323,6 @@ static struct memory_tier *set_node_memory_tier(int node)
|
||||
return memtier;
|
||||
}
|
||||
|
||||
static struct memory_tier *__node_get_memory_tier(int node)
|
||||
{
|
||||
struct memory_dev_type *memtype;
|
||||
|
||||
memtype = node_memory_types[node];
|
||||
if (memtype && node_isset(node, memtype->nodes))
|
||||
return memtype->memtier;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static void destroy_memory_tier(struct memory_tier *memtier)
|
||||
{
|
||||
list_del(&memtier->list);
|
||||
@ -207,6 +411,7 @@ EXPORT_SYMBOL_GPL(clear_node_memory_type);
|
||||
static int __meminit memtier_hotplug_callback(struct notifier_block *self,
|
||||
unsigned long action, void *_arg)
|
||||
{
|
||||
struct memory_tier *memtier;
|
||||
struct memory_notify *arg = _arg;
|
||||
|
||||
/*
|
||||
@ -219,12 +424,15 @@ static int __meminit memtier_hotplug_callback(struct notifier_block *self,
|
||||
switch (action) {
|
||||
case MEM_OFFLINE:
|
||||
mutex_lock(&memory_tier_lock);
|
||||
clear_node_memory_tier(arg->status_change_nid);
|
||||
if (clear_node_memory_tier(arg->status_change_nid))
|
||||
establish_demotion_targets();
|
||||
mutex_unlock(&memory_tier_lock);
|
||||
break;
|
||||
case MEM_ONLINE:
|
||||
mutex_lock(&memory_tier_lock);
|
||||
set_node_memory_tier(arg->status_change_nid);
|
||||
memtier = set_node_memory_tier(arg->status_change_nid);
|
||||
if (!IS_ERR(memtier))
|
||||
establish_demotion_targets();
|
||||
mutex_unlock(&memory_tier_lock);
|
||||
break;
|
||||
}
|
||||
@ -237,6 +445,11 @@ static int __init memory_tier_init(void)
|
||||
int node;
|
||||
struct memory_tier *memtier;
|
||||
|
||||
#ifdef CONFIG_MIGRATION
|
||||
node_demotion = kcalloc(nr_node_ids, sizeof(struct demotion_nodes),
|
||||
GFP_KERNEL);
|
||||
WARN_ON(!node_demotion);
|
||||
#endif
|
||||
mutex_lock(&memory_tier_lock);
|
||||
/*
|
||||
* For now we can have 4 faster memory tiers with smaller adistance
|
||||
@ -259,6 +472,7 @@ static int __init memory_tier_init(void)
|
||||
*/
|
||||
break;
|
||||
}
|
||||
establish_demotion_targets();
|
||||
mutex_unlock(&memory_tier_lock);
|
||||
|
||||
hotplug_memory_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRIO);
|
||||
|
394
mm/migrate.c
394
mm/migrate.c
@ -2198,398 +2198,4 @@ out:
|
||||
return 0;
|
||||
}
|
||||
#endif /* CONFIG_NUMA_BALANCING */
|
||||
|
||||
/*
|
||||
* node_demotion[] example:
|
||||
*
|
||||
* Consider a system with two sockets. Each socket has
|
||||
* three classes of memory attached: fast, medium and slow.
|
||||
* Each memory class is placed in its own NUMA node. The
|
||||
* CPUs are placed in the node with the "fast" memory. The
|
||||
* 6 NUMA nodes (0-5) might be split among the sockets like
|
||||
* this:
|
||||
*
|
||||
* Socket A: 0, 1, 2
|
||||
* Socket B: 3, 4, 5
|
||||
*
|
||||
* When Node 0 fills up, its memory should be migrated to
|
||||
* Node 1. When Node 1 fills up, it should be migrated to
|
||||
* Node 2. The migration path start on the nodes with the
|
||||
* processors (since allocations default to this node) and
|
||||
* fast memory, progress through medium and end with the
|
||||
* slow memory:
|
||||
*
|
||||
* 0 -> 1 -> 2 -> stop
|
||||
* 3 -> 4 -> 5 -> stop
|
||||
*
|
||||
* This is represented in the node_demotion[] like this:
|
||||
*
|
||||
* { nr=1, nodes[0]=1 }, // Node 0 migrates to 1
|
||||
* { nr=1, nodes[0]=2 }, // Node 1 migrates to 2
|
||||
* { nr=0, nodes[0]=-1 }, // Node 2 does not migrate
|
||||
* { nr=1, nodes[0]=4 }, // Node 3 migrates to 4
|
||||
* { nr=1, nodes[0]=5 }, // Node 4 migrates to 5
|
||||
* { nr=0, nodes[0]=-1 }, // Node 5 does not migrate
|
||||
*
|
||||
* Moreover some systems may have multiple slow memory nodes.
|
||||
* Suppose a system has one socket with 3 memory nodes, node 0
|
||||
* is fast memory type, and node 1/2 both are slow memory
|
||||
* type, and the distance between fast memory node and slow
|
||||
* memory node is same. So the migration path should be:
|
||||
*
|
||||
* 0 -> 1/2 -> stop
|
||||
*
|
||||
* This is represented in the node_demotion[] like this:
|
||||
* { nr=2, {nodes[0]=1, nodes[1]=2} }, // Node 0 migrates to node 1 and node 2
|
||||
* { nr=0, nodes[0]=-1, }, // Node 1 does not migrate
|
||||
* { nr=0, nodes[0]=-1, }, // Node 2 does not migrate
|
||||
*/
|
||||
|
||||
/*
|
||||
* Writes to this array occur without locking. Cycles are
|
||||
* not allowed: Node X demotes to Y which demotes to X...
|
||||
*
|
||||
* If multiple reads are performed, a single rcu_read_lock()
|
||||
* must be held over all reads to ensure that no cycles are
|
||||
* observed.
|
||||
*/
|
||||
#define DEFAULT_DEMOTION_TARGET_NODES 15
|
||||
|
||||
#if MAX_NUMNODES < DEFAULT_DEMOTION_TARGET_NODES
|
||||
#define DEMOTION_TARGET_NODES (MAX_NUMNODES - 1)
|
||||
#else
|
||||
#define DEMOTION_TARGET_NODES DEFAULT_DEMOTION_TARGET_NODES
|
||||
#endif
|
||||
|
||||
struct demotion_nodes {
|
||||
unsigned short nr;
|
||||
short nodes[DEMOTION_TARGET_NODES];
|
||||
};
|
||||
|
||||
static struct demotion_nodes *node_demotion __read_mostly;
|
||||
|
||||
/**
|
||||
* next_demotion_node() - Get the next node in the demotion path
|
||||
* @node: The starting node to lookup the next node
|
||||
*
|
||||
* Return: node id for next memory node in the demotion path hierarchy
|
||||
* from @node; NUMA_NO_NODE if @node is terminal. This does not keep
|
||||
* @node online or guarantee that it *continues* to be the next demotion
|
||||
* target.
|
||||
*/
|
||||
int next_demotion_node(int node)
|
||||
{
|
||||
struct demotion_nodes *nd;
|
||||
unsigned short target_nr, index;
|
||||
int target;
|
||||
|
||||
if (!node_demotion)
|
||||
return NUMA_NO_NODE;
|
||||
|
||||
nd = &node_demotion[node];
|
||||
|
||||
/*
|
||||
* node_demotion[] is updated without excluding this
|
||||
* function from running. RCU doesn't provide any
|
||||
* compiler barriers, so the READ_ONCE() is required
|
||||
* to avoid compiler reordering or read merging.
|
||||
*
|
||||
* Make sure to use RCU over entire code blocks if
|
||||
* node_demotion[] reads need to be consistent.
|
||||
*/
|
||||
rcu_read_lock();
|
||||
target_nr = READ_ONCE(nd->nr);
|
||||
|
||||
switch (target_nr) {
|
||||
case 0:
|
||||
target = NUMA_NO_NODE;
|
||||
goto out;
|
||||
case 1:
|
||||
index = 0;
|
||||
break;
|
||||
default:
|
||||
/*
|
||||
* If there are multiple target nodes, just select one
|
||||
* target node randomly.
|
||||
*
|
||||
* In addition, we can also use round-robin to select
|
||||
* target node, but we should introduce another variable
|
||||
* for node_demotion[] to record last selected target node,
|
||||
* that may cause cache ping-pong due to the changing of
|
||||
* last target node. Or introducing per-cpu data to avoid
|
||||
* caching issue, which seems more complicated. So selecting
|
||||
* target node randomly seems better until now.
|
||||
*/
|
||||
index = get_random_int() % target_nr;
|
||||
break;
|
||||
}
|
||||
|
||||
target = READ_ONCE(nd->nodes[index]);
|
||||
|
||||
out:
|
||||
rcu_read_unlock();
|
||||
return target;
|
||||
}
|
||||
|
||||
/* Disable reclaim-based migration. */
|
||||
static void __disable_all_migrate_targets(void)
|
||||
{
|
||||
int node, i;
|
||||
|
||||
if (!node_demotion)
|
||||
return;
|
||||
|
||||
for_each_online_node(node) {
|
||||
node_demotion[node].nr = 0;
|
||||
for (i = 0; i < DEMOTION_TARGET_NODES; i++)
|
||||
node_demotion[node].nodes[i] = NUMA_NO_NODE;
|
||||
}
|
||||
}
|
||||
|
||||
static void disable_all_migrate_targets(void)
|
||||
{
|
||||
__disable_all_migrate_targets();
|
||||
|
||||
/*
|
||||
* Ensure that the "disable" is visible across the system.
|
||||
* Readers will see either a combination of before+disable
|
||||
* state or disable+after. They will never see before and
|
||||
* after state together.
|
||||
*
|
||||
* The before+after state together might have cycles and
|
||||
* could cause readers to do things like loop until this
|
||||
* function finishes. This ensures they can only see a
|
||||
* single "bad" read and would, for instance, only loop
|
||||
* once.
|
||||
*/
|
||||
synchronize_rcu();
|
||||
}
|
||||
|
||||
/*
|
||||
* Find an automatic demotion target for 'node'.
|
||||
* Failing here is OK. It might just indicate
|
||||
* being at the end of a chain.
|
||||
*/
|
||||
static int establish_migrate_target(int node, nodemask_t *used,
|
||||
int best_distance)
|
||||
{
|
||||
int migration_target, index, val;
|
||||
struct demotion_nodes *nd;
|
||||
|
||||
if (!node_demotion)
|
||||
return NUMA_NO_NODE;
|
||||
|
||||
nd = &node_demotion[node];
|
||||
|
||||
migration_target = find_next_best_node(node, used);
|
||||
if (migration_target == NUMA_NO_NODE)
|
||||
return NUMA_NO_NODE;
|
||||
|
||||
/*
|
||||
* If the node has been set a migration target node before,
|
||||
* which means it's the best distance between them. Still
|
||||
* check if this node can be demoted to other target nodes
|
||||
* if they have a same best distance.
|
||||
*/
|
||||
if (best_distance != -1) {
|
||||
val = node_distance(node, migration_target);
|
||||
if (val > best_distance)
|
||||
goto out_clear;
|
||||
}
|
||||
|
||||
index = nd->nr;
|
||||
if (WARN_ONCE(index >= DEMOTION_TARGET_NODES,
|
||||
"Exceeds maximum demotion target nodes\n"))
|
||||
goto out_clear;
|
||||
|
||||
nd->nodes[index] = migration_target;
|
||||
nd->nr++;
|
||||
|
||||
return migration_target;
|
||||
out_clear:
|
||||
node_clear(migration_target, *used);
|
||||
return NUMA_NO_NODE;
|
||||
}
|
||||
|
||||
/*
|
||||
* When memory fills up on a node, memory contents can be
|
||||
* automatically migrated to another node instead of
|
||||
* discarded at reclaim.
|
||||
*
|
||||
* Establish a "migration path" which will start at nodes
|
||||
* with CPUs and will follow the priorities used to build the
|
||||
* page allocator zonelists.
|
||||
*
|
||||
* The difference here is that cycles must be avoided. If
|
||||
* node0 migrates to node1, then neither node1, nor anything
|
||||
* node1 migrates to can migrate to node0. Also one node can
|
||||
* be migrated to multiple nodes if the target nodes all have
|
||||
* a same best-distance against the source node.
|
||||
*
|
||||
* This function can run simultaneously with readers of
|
||||
* node_demotion[]. However, it can not run simultaneously
|
||||
* with itself. Exclusion is provided by memory hotplug events
|
||||
* being single-threaded.
|
||||
*/
|
||||
static void __set_migration_target_nodes(void)
|
||||
{
|
||||
nodemask_t next_pass;
|
||||
nodemask_t this_pass;
|
||||
nodemask_t used_targets = NODE_MASK_NONE;
|
||||
int node, best_distance;
|
||||
|
||||
/*
|
||||
* Avoid any oddities like cycles that could occur
|
||||
* from changes in the topology. This will leave
|
||||
* a momentary gap when migration is disabled.
|
||||
*/
|
||||
disable_all_migrate_targets();
|
||||
|
||||
/*
|
||||
* Allocations go close to CPUs, first. Assume that
|
||||
* the migration path starts at the nodes with CPUs.
|
||||
*/
|
||||
next_pass = node_states[N_CPU];
|
||||
again:
|
||||
this_pass = next_pass;
|
||||
next_pass = NODE_MASK_NONE;
|
||||
/*
|
||||
* To avoid cycles in the migration "graph", ensure
|
||||
* that migration sources are not future targets by
|
||||
* setting them in 'used_targets'. Do this only
|
||||
* once per pass so that multiple source nodes can
|
||||
* share a target node.
|
||||
*
|
||||
* 'used_targets' will become unavailable in future
|
||||
* passes. This limits some opportunities for
|
||||
* multiple source nodes to share a destination.
|
||||
*/
|
||||
nodes_or(used_targets, used_targets, this_pass);
|
||||
|
||||
for_each_node_mask(node, this_pass) {
|
||||
best_distance = -1;
|
||||
|
||||
/*
|
||||
* Try to set up the migration path for the node, and the target
|
||||
* migration nodes can be multiple, so doing a loop to find all
|
||||
* the target nodes if they all have a best node distance.
|
||||
*/
|
||||
do {
|
||||
int target_node =
|
||||
establish_migrate_target(node, &used_targets,
|
||||
best_distance);
|
||||
|
||||
if (target_node == NUMA_NO_NODE)
|
||||
break;
|
||||
|
||||
if (best_distance == -1)
|
||||
best_distance = node_distance(node, target_node);
|
||||
|
||||
/*
|
||||
* Visit targets from this pass in the next pass.
|
||||
* Eventually, every node will have been part of
|
||||
* a pass, and will become set in 'used_targets'.
|
||||
*/
|
||||
node_set(target_node, next_pass);
|
||||
} while (1);
|
||||
}
|
||||
/*
|
||||
* 'next_pass' contains nodes which became migration
|
||||
* targets in this pass. Make additional passes until
|
||||
* no more migrations targets are available.
|
||||
*/
|
||||
if (!nodes_empty(next_pass))
|
||||
goto again;
|
||||
}
|
||||
|
||||
/*
|
||||
* For callers that do not hold get_online_mems() already.
|
||||
*/
|
||||
void set_migration_target_nodes(void)
|
||||
{
|
||||
get_online_mems();
|
||||
__set_migration_target_nodes();
|
||||
put_online_mems();
|
||||
}
|
||||
|
||||
/*
|
||||
* This leaves migrate-on-reclaim transiently disabled between
|
||||
* the MEM_GOING_OFFLINE and MEM_OFFLINE events. This runs
|
||||
* whether reclaim-based migration is enabled or not, which
|
||||
* ensures that the user can turn reclaim-based migration at
|
||||
* any time without needing to recalculate migration targets.
|
||||
*
|
||||
* These callbacks already hold get_online_mems(). That is why
|
||||
* __set_migration_target_nodes() can be used as opposed to
|
||||
* set_migration_target_nodes().
|
||||
*/
|
||||
#ifdef CONFIG_MEMORY_HOTPLUG
|
||||
static int __meminit migrate_on_reclaim_callback(struct notifier_block *self,
|
||||
unsigned long action, void *_arg)
|
||||
{
|
||||
struct memory_notify *arg = _arg;
|
||||
|
||||
/*
|
||||
* Only update the node migration order when a node is
|
||||
* changing status, like online->offline. This avoids
|
||||
* the overhead of synchronize_rcu() in most cases.
|
||||
*/
|
||||
if (arg->status_change_nid < 0)
|
||||
return notifier_from_errno(0);
|
||||
|
||||
switch (action) {
|
||||
case MEM_GOING_OFFLINE:
|
||||
/*
|
||||
* Make sure there are not transient states where
|
||||
* an offline node is a migration target. This
|
||||
* will leave migration disabled until the offline
|
||||
* completes and the MEM_OFFLINE case below runs.
|
||||
*/
|
||||
disable_all_migrate_targets();
|
||||
break;
|
||||
case MEM_OFFLINE:
|
||||
case MEM_ONLINE:
|
||||
/*
|
||||
* Recalculate the target nodes once the node
|
||||
* reaches its final state (online or offline).
|
||||
*/
|
||||
__set_migration_target_nodes();
|
||||
break;
|
||||
case MEM_CANCEL_OFFLINE:
|
||||
/*
|
||||
* MEM_GOING_OFFLINE disabled all the migration
|
||||
* targets. Reenable them.
|
||||
*/
|
||||
__set_migration_target_nodes();
|
||||
break;
|
||||
case MEM_GOING_ONLINE:
|
||||
case MEM_CANCEL_ONLINE:
|
||||
break;
|
||||
}
|
||||
|
||||
return notifier_from_errno(0);
|
||||
}
|
||||
#endif
|
||||
|
||||
void __init migrate_on_reclaim_init(void)
|
||||
{
|
||||
node_demotion = kcalloc(nr_node_ids,
|
||||
sizeof(struct demotion_nodes),
|
||||
GFP_KERNEL);
|
||||
WARN_ON(!node_demotion);
|
||||
#ifdef CONFIG_MEMORY_HOTPLUG
|
||||
hotplug_memory_notifier(migrate_on_reclaim_callback, 100);
|
||||
#endif
|
||||
/*
|
||||
* At this point, all numa nodes with memory/CPus have their state
|
||||
* properly set, so we can build the demotion order now.
|
||||
* Let us hold the cpu_hotplug lock just, as we could possibily have
|
||||
* CPU hotplug events during boot.
|
||||
*/
|
||||
cpus_read_lock();
|
||||
set_migration_target_nodes();
|
||||
cpus_read_unlock();
|
||||
}
|
||||
#endif /* CONFIG_NUMA */
|
||||
|
||||
|
||||
|
@ -28,7 +28,6 @@
|
||||
#include <linux/mm_inline.h>
|
||||
#include <linux/page_ext.h>
|
||||
#include <linux/page_owner.h>
|
||||
#include <linux/migrate.h>
|
||||
|
||||
#include "internal.h"
|
||||
|
||||
@ -2068,7 +2067,6 @@ static int vmstat_cpu_online(unsigned int cpu)
|
||||
|
||||
if (!node_state(cpu_to_node(cpu), N_CPU)) {
|
||||
node_set_state(cpu_to_node(cpu), N_CPU);
|
||||
set_migration_target_nodes();
|
||||
}
|
||||
|
||||
return 0;
|
||||
@ -2093,7 +2091,6 @@ static int vmstat_cpu_dead(unsigned int cpu)
|
||||
return 0;
|
||||
|
||||
node_clear_state(node, N_CPU);
|
||||
set_migration_target_nodes();
|
||||
|
||||
return 0;
|
||||
}
|
||||
@ -2126,7 +2123,6 @@ void __init init_mm_internals(void)
|
||||
|
||||
start_shepherd_timer();
|
||||
#endif
|
||||
migrate_on_reclaim_init();
|
||||
#ifdef CONFIG_PROC_FS
|
||||
proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op);
|
||||
proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op);
|
||||
|
Loading…
Reference in New Issue
Block a user