162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0
262306a36Sopenharmony_ci#include <linux/slab.h>
362306a36Sopenharmony_ci#include <linux/lockdep.h>
462306a36Sopenharmony_ci#include <linux/sysfs.h>
562306a36Sopenharmony_ci#include <linux/kobject.h>
662306a36Sopenharmony_ci#include <linux/memory.h>
762306a36Sopenharmony_ci#include <linux/memory-tiers.h>
862306a36Sopenharmony_ci
962306a36Sopenharmony_ci#include "internal.h"
1062306a36Sopenharmony_ci
1162306a36Sopenharmony_cistruct memory_tier {
1262306a36Sopenharmony_ci	/* hierarchy of memory tiers */
1362306a36Sopenharmony_ci	struct list_head list;
1462306a36Sopenharmony_ci	/* list of all memory types part of this tier */
1562306a36Sopenharmony_ci	struct list_head memory_types;
1662306a36Sopenharmony_ci	/*
1762306a36Sopenharmony_ci	 * start value of abstract distance. memory tier maps
1862306a36Sopenharmony_ci	 * an abstract distance  range,
1962306a36Sopenharmony_ci	 * adistance_start .. adistance_start + MEMTIER_CHUNK_SIZE
2062306a36Sopenharmony_ci	 */
2162306a36Sopenharmony_ci	int adistance_start;
2262306a36Sopenharmony_ci	struct device dev;
2362306a36Sopenharmony_ci	/* All the nodes that are part of all the lower memory tiers. */
2462306a36Sopenharmony_ci	nodemask_t lower_tier_mask;
2562306a36Sopenharmony_ci};
2662306a36Sopenharmony_ci
2762306a36Sopenharmony_cistruct demotion_nodes {
2862306a36Sopenharmony_ci	nodemask_t preferred;
2962306a36Sopenharmony_ci};
3062306a36Sopenharmony_ci
/* Binding between one NUMA node and the memory type assigned to it. */
struct node_memory_type_map {
	/* Memory type currently assigned to the node, NULL when none is. */
	struct memory_dev_type *memtype;
	/*
	 * Mapping count: bumped by __init_node_memory_type() for a matching
	 * memtype and dropped again by clear_node_memory_type().
	 */
	int map_count;
};
3562306a36Sopenharmony_ci
3662306a36Sopenharmony_cistatic DEFINE_MUTEX(memory_tier_lock);
3762306a36Sopenharmony_cistatic LIST_HEAD(memory_tiers);
3862306a36Sopenharmony_cistatic struct node_memory_type_map node_memory_types[MAX_NUMNODES];
3962306a36Sopenharmony_cistatic struct memory_dev_type *default_dram_type;
4062306a36Sopenharmony_ci
4162306a36Sopenharmony_cistatic struct bus_type memory_tier_subsys = {
4262306a36Sopenharmony_ci	.name = "memory_tiering",
4362306a36Sopenharmony_ci	.dev_name = "memory_tier",
4462306a36Sopenharmony_ci};
4562306a36Sopenharmony_ci
#ifdef CONFIG_MIGRATION
/*
 * Upper abstract distance bound of the top tier; node_is_toptier()
 * compares a tier's adistance_start against it.
 */
static int top_tier_adistance;
/*
 * node_demotion[] examples:
 *
 * Example 1:
 *
 * Nodes 0 & 1 are CPU + DRAM nodes, nodes 2 & 3 are PMEM nodes.
 *
 * node distances:
 * node   0    1    2    3
 *    0  10   20   30   40
 *    1  20   10   40   30
 *    2  30   40   10   40
 *    3  40   30   40   10
 *
 * memory_tiers0 = 0-1
 * memory_tiers1 = 2-3
 *
 * node_demotion[0].preferred = 2
 * node_demotion[1].preferred = 3
 * node_demotion[2].preferred = <empty>
 * node_demotion[3].preferred = <empty>
 *
 * Example 2:
 *
 * Nodes 0 & 1 are CPU + DRAM nodes, node 2 is a memory-only DRAM node.
 *
 * node distances:
 * node   0    1    2
 *    0  10   20   30
 *    1  20   10   30
 *    2  30   30   10
 *
 * memory_tiers0 = 0-2
 *
 * node_demotion[0].preferred = <empty>
 * node_demotion[1].preferred = <empty>
 * node_demotion[2].preferred = <empty>
 *
 * Example 3:
 *
 * Node 0 is a CPU + DRAM node, node 1 is an HBM node, node 2 is a PMEM node.
 *
 * node distances:
 * node   0    1    2
 *    0  10   20   30
 *    1  20   10   40
 *    2  30   40   10
 *
 * memory_tiers0 = 1
 * memory_tiers1 = 0
 * memory_tiers2 = 2
 *
 * node_demotion[0].preferred = 2
 * node_demotion[1].preferred = 0
 * node_demotion[2].preferred = <empty>
 *
 */
static struct demotion_nodes *node_demotion __read_mostly;
#endif /* CONFIG_MIGRATION */
10762306a36Sopenharmony_ci
10862306a36Sopenharmony_cistatic inline struct memory_tier *to_memory_tier(struct device *device)
10962306a36Sopenharmony_ci{
11062306a36Sopenharmony_ci	return container_of(device, struct memory_tier, dev);
11162306a36Sopenharmony_ci}
11262306a36Sopenharmony_ci
11362306a36Sopenharmony_cistatic __always_inline nodemask_t get_memtier_nodemask(struct memory_tier *memtier)
11462306a36Sopenharmony_ci{
11562306a36Sopenharmony_ci	nodemask_t nodes = NODE_MASK_NONE;
11662306a36Sopenharmony_ci	struct memory_dev_type *memtype;
11762306a36Sopenharmony_ci
11862306a36Sopenharmony_ci	list_for_each_entry(memtype, &memtier->memory_types, tier_sibiling)
11962306a36Sopenharmony_ci		nodes_or(nodes, nodes, memtype->nodes);
12062306a36Sopenharmony_ci
12162306a36Sopenharmony_ci	return nodes;
12262306a36Sopenharmony_ci}
12362306a36Sopenharmony_ci
/* Device release callback: free the memory_tier that embeds @dev. */
static void memory_tier_device_release(struct device *dev)
{
	/*
	 * clear_node_memory_tier() runs synchronize_rcu() after unpublishing
	 * the tier, so no RCU reader can still reference it and a plain
	 * kfree() is sufficient.
	 */
	kfree(to_memory_tier(dev));
}
13362306a36Sopenharmony_ci
13462306a36Sopenharmony_cistatic ssize_t nodelist_show(struct device *dev,
13562306a36Sopenharmony_ci			     struct device_attribute *attr, char *buf)
13662306a36Sopenharmony_ci{
13762306a36Sopenharmony_ci	int ret;
13862306a36Sopenharmony_ci	nodemask_t nmask;
13962306a36Sopenharmony_ci
14062306a36Sopenharmony_ci	mutex_lock(&memory_tier_lock);
14162306a36Sopenharmony_ci	nmask = get_memtier_nodemask(to_memory_tier(dev));
14262306a36Sopenharmony_ci	ret = sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&nmask));
14362306a36Sopenharmony_ci	mutex_unlock(&memory_tier_lock);
14462306a36Sopenharmony_ci	return ret;
14562306a36Sopenharmony_ci}
14662306a36Sopenharmony_cistatic DEVICE_ATTR_RO(nodelist);
14762306a36Sopenharmony_ci
14862306a36Sopenharmony_cistatic struct attribute *memtier_dev_attrs[] = {
14962306a36Sopenharmony_ci	&dev_attr_nodelist.attr,
15062306a36Sopenharmony_ci	NULL
15162306a36Sopenharmony_ci};
15262306a36Sopenharmony_ci
15362306a36Sopenharmony_cistatic const struct attribute_group memtier_dev_group = {
15462306a36Sopenharmony_ci	.attrs = memtier_dev_attrs,
15562306a36Sopenharmony_ci};
15662306a36Sopenharmony_ci
15762306a36Sopenharmony_cistatic const struct attribute_group *memtier_dev_groups[] = {
15862306a36Sopenharmony_ci	&memtier_dev_group,
15962306a36Sopenharmony_ci	NULL
16062306a36Sopenharmony_ci};
16162306a36Sopenharmony_ci
16262306a36Sopenharmony_cistatic struct memory_tier *find_create_memory_tier(struct memory_dev_type *memtype)
16362306a36Sopenharmony_ci{
16462306a36Sopenharmony_ci	int ret;
16562306a36Sopenharmony_ci	bool found_slot = false;
16662306a36Sopenharmony_ci	struct memory_tier *memtier, *new_memtier;
16762306a36Sopenharmony_ci	int adistance = memtype->adistance;
16862306a36Sopenharmony_ci	unsigned int memtier_adistance_chunk_size = MEMTIER_CHUNK_SIZE;
16962306a36Sopenharmony_ci
17062306a36Sopenharmony_ci	lockdep_assert_held_once(&memory_tier_lock);
17162306a36Sopenharmony_ci
17262306a36Sopenharmony_ci	adistance = round_down(adistance, memtier_adistance_chunk_size);
17362306a36Sopenharmony_ci	/*
17462306a36Sopenharmony_ci	 * If the memtype is already part of a memory tier,
17562306a36Sopenharmony_ci	 * just return that.
17662306a36Sopenharmony_ci	 */
17762306a36Sopenharmony_ci	if (!list_empty(&memtype->tier_sibiling)) {
17862306a36Sopenharmony_ci		list_for_each_entry(memtier, &memory_tiers, list) {
17962306a36Sopenharmony_ci			if (adistance == memtier->adistance_start)
18062306a36Sopenharmony_ci				return memtier;
18162306a36Sopenharmony_ci		}
18262306a36Sopenharmony_ci		WARN_ON(1);
18362306a36Sopenharmony_ci		return ERR_PTR(-EINVAL);
18462306a36Sopenharmony_ci	}
18562306a36Sopenharmony_ci
18662306a36Sopenharmony_ci	list_for_each_entry(memtier, &memory_tiers, list) {
18762306a36Sopenharmony_ci		if (adistance == memtier->adistance_start) {
18862306a36Sopenharmony_ci			goto link_memtype;
18962306a36Sopenharmony_ci		} else if (adistance < memtier->adistance_start) {
19062306a36Sopenharmony_ci			found_slot = true;
19162306a36Sopenharmony_ci			break;
19262306a36Sopenharmony_ci		}
19362306a36Sopenharmony_ci	}
19462306a36Sopenharmony_ci
19562306a36Sopenharmony_ci	new_memtier = kzalloc(sizeof(struct memory_tier), GFP_KERNEL);
19662306a36Sopenharmony_ci	if (!new_memtier)
19762306a36Sopenharmony_ci		return ERR_PTR(-ENOMEM);
19862306a36Sopenharmony_ci
19962306a36Sopenharmony_ci	new_memtier->adistance_start = adistance;
20062306a36Sopenharmony_ci	INIT_LIST_HEAD(&new_memtier->list);
20162306a36Sopenharmony_ci	INIT_LIST_HEAD(&new_memtier->memory_types);
20262306a36Sopenharmony_ci	if (found_slot)
20362306a36Sopenharmony_ci		list_add_tail(&new_memtier->list, &memtier->list);
20462306a36Sopenharmony_ci	else
20562306a36Sopenharmony_ci		list_add_tail(&new_memtier->list, &memory_tiers);
20662306a36Sopenharmony_ci
20762306a36Sopenharmony_ci	new_memtier->dev.id = adistance >> MEMTIER_CHUNK_BITS;
20862306a36Sopenharmony_ci	new_memtier->dev.bus = &memory_tier_subsys;
20962306a36Sopenharmony_ci	new_memtier->dev.release = memory_tier_device_release;
21062306a36Sopenharmony_ci	new_memtier->dev.groups = memtier_dev_groups;
21162306a36Sopenharmony_ci
21262306a36Sopenharmony_ci	ret = device_register(&new_memtier->dev);
21362306a36Sopenharmony_ci	if (ret) {
21462306a36Sopenharmony_ci		list_del(&new_memtier->list);
21562306a36Sopenharmony_ci		put_device(&new_memtier->dev);
21662306a36Sopenharmony_ci		return ERR_PTR(ret);
21762306a36Sopenharmony_ci	}
21862306a36Sopenharmony_ci	memtier = new_memtier;
21962306a36Sopenharmony_ci
22062306a36Sopenharmony_cilink_memtype:
22162306a36Sopenharmony_ci	list_add(&memtype->tier_sibiling, &memtier->memory_types);
22262306a36Sopenharmony_ci	return memtier;
22362306a36Sopenharmony_ci}
22462306a36Sopenharmony_ci
22562306a36Sopenharmony_cistatic struct memory_tier *__node_get_memory_tier(int node)
22662306a36Sopenharmony_ci{
22762306a36Sopenharmony_ci	pg_data_t *pgdat;
22862306a36Sopenharmony_ci
22962306a36Sopenharmony_ci	pgdat = NODE_DATA(node);
23062306a36Sopenharmony_ci	if (!pgdat)
23162306a36Sopenharmony_ci		return NULL;
23262306a36Sopenharmony_ci	/*
23362306a36Sopenharmony_ci	 * Since we hold memory_tier_lock, we can avoid
23462306a36Sopenharmony_ci	 * RCU read locks when accessing the details. No
23562306a36Sopenharmony_ci	 * parallel updates are possible here.
23662306a36Sopenharmony_ci	 */
23762306a36Sopenharmony_ci	return rcu_dereference_check(pgdat->memtier,
23862306a36Sopenharmony_ci				     lockdep_is_held(&memory_tier_lock));
23962306a36Sopenharmony_ci}
24062306a36Sopenharmony_ci
24162306a36Sopenharmony_ci#ifdef CONFIG_MIGRATION
24262306a36Sopenharmony_cibool node_is_toptier(int node)
24362306a36Sopenharmony_ci{
24462306a36Sopenharmony_ci	bool toptier;
24562306a36Sopenharmony_ci	pg_data_t *pgdat;
24662306a36Sopenharmony_ci	struct memory_tier *memtier;
24762306a36Sopenharmony_ci
24862306a36Sopenharmony_ci	pgdat = NODE_DATA(node);
24962306a36Sopenharmony_ci	if (!pgdat)
25062306a36Sopenharmony_ci		return false;
25162306a36Sopenharmony_ci
25262306a36Sopenharmony_ci	rcu_read_lock();
25362306a36Sopenharmony_ci	memtier = rcu_dereference(pgdat->memtier);
25462306a36Sopenharmony_ci	if (!memtier) {
25562306a36Sopenharmony_ci		toptier = true;
25662306a36Sopenharmony_ci		goto out;
25762306a36Sopenharmony_ci	}
25862306a36Sopenharmony_ci	if (memtier->adistance_start <= top_tier_adistance)
25962306a36Sopenharmony_ci		toptier = true;
26062306a36Sopenharmony_ci	else
26162306a36Sopenharmony_ci		toptier = false;
26262306a36Sopenharmony_ciout:
26362306a36Sopenharmony_ci	rcu_read_unlock();
26462306a36Sopenharmony_ci	return toptier;
26562306a36Sopenharmony_ci}
26662306a36Sopenharmony_ci
26762306a36Sopenharmony_civoid node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
26862306a36Sopenharmony_ci{
26962306a36Sopenharmony_ci	struct memory_tier *memtier;
27062306a36Sopenharmony_ci
27162306a36Sopenharmony_ci	/*
27262306a36Sopenharmony_ci	 * pg_data_t.memtier updates includes a synchronize_rcu()
27362306a36Sopenharmony_ci	 * which ensures that we either find NULL or a valid memtier
27462306a36Sopenharmony_ci	 * in NODE_DATA. protect the access via rcu_read_lock();
27562306a36Sopenharmony_ci	 */
27662306a36Sopenharmony_ci	rcu_read_lock();
27762306a36Sopenharmony_ci	memtier = rcu_dereference(pgdat->memtier);
27862306a36Sopenharmony_ci	if (memtier)
27962306a36Sopenharmony_ci		*targets = memtier->lower_tier_mask;
28062306a36Sopenharmony_ci	else
28162306a36Sopenharmony_ci		*targets = NODE_MASK_NONE;
28262306a36Sopenharmony_ci	rcu_read_unlock();
28362306a36Sopenharmony_ci}
28462306a36Sopenharmony_ci
28562306a36Sopenharmony_ci/**
28662306a36Sopenharmony_ci * next_demotion_node() - Get the next node in the demotion path
28762306a36Sopenharmony_ci * @node: The starting node to lookup the next node
28862306a36Sopenharmony_ci *
28962306a36Sopenharmony_ci * Return: node id for next memory node in the demotion path hierarchy
29062306a36Sopenharmony_ci * from @node; NUMA_NO_NODE if @node is terminal.  This does not keep
29162306a36Sopenharmony_ci * @node online or guarantee that it *continues* to be the next demotion
29262306a36Sopenharmony_ci * target.
29362306a36Sopenharmony_ci */
29462306a36Sopenharmony_ciint next_demotion_node(int node)
29562306a36Sopenharmony_ci{
29662306a36Sopenharmony_ci	struct demotion_nodes *nd;
29762306a36Sopenharmony_ci	int target;
29862306a36Sopenharmony_ci
29962306a36Sopenharmony_ci	if (!node_demotion)
30062306a36Sopenharmony_ci		return NUMA_NO_NODE;
30162306a36Sopenharmony_ci
30262306a36Sopenharmony_ci	nd = &node_demotion[node];
30362306a36Sopenharmony_ci
30462306a36Sopenharmony_ci	/*
30562306a36Sopenharmony_ci	 * node_demotion[] is updated without excluding this
30662306a36Sopenharmony_ci	 * function from running.
30762306a36Sopenharmony_ci	 *
30862306a36Sopenharmony_ci	 * Make sure to use RCU over entire code blocks if
30962306a36Sopenharmony_ci	 * node_demotion[] reads need to be consistent.
31062306a36Sopenharmony_ci	 */
31162306a36Sopenharmony_ci	rcu_read_lock();
31262306a36Sopenharmony_ci	/*
31362306a36Sopenharmony_ci	 * If there are multiple target nodes, just select one
31462306a36Sopenharmony_ci	 * target node randomly.
31562306a36Sopenharmony_ci	 *
31662306a36Sopenharmony_ci	 * In addition, we can also use round-robin to select
31762306a36Sopenharmony_ci	 * target node, but we should introduce another variable
31862306a36Sopenharmony_ci	 * for node_demotion[] to record last selected target node,
31962306a36Sopenharmony_ci	 * that may cause cache ping-pong due to the changing of
32062306a36Sopenharmony_ci	 * last target node. Or introducing per-cpu data to avoid
32162306a36Sopenharmony_ci	 * caching issue, which seems more complicated. So selecting
32262306a36Sopenharmony_ci	 * target node randomly seems better until now.
32362306a36Sopenharmony_ci	 */
32462306a36Sopenharmony_ci	target = node_random(&nd->preferred);
32562306a36Sopenharmony_ci	rcu_read_unlock();
32662306a36Sopenharmony_ci
32762306a36Sopenharmony_ci	return target;
32862306a36Sopenharmony_ci}
32962306a36Sopenharmony_ci
33062306a36Sopenharmony_cistatic void disable_all_demotion_targets(void)
33162306a36Sopenharmony_ci{
33262306a36Sopenharmony_ci	struct memory_tier *memtier;
33362306a36Sopenharmony_ci	int node;
33462306a36Sopenharmony_ci
33562306a36Sopenharmony_ci	for_each_node_state(node, N_MEMORY) {
33662306a36Sopenharmony_ci		node_demotion[node].preferred = NODE_MASK_NONE;
33762306a36Sopenharmony_ci		/*
33862306a36Sopenharmony_ci		 * We are holding memory_tier_lock, it is safe
33962306a36Sopenharmony_ci		 * to access pgda->memtier.
34062306a36Sopenharmony_ci		 */
34162306a36Sopenharmony_ci		memtier = __node_get_memory_tier(node);
34262306a36Sopenharmony_ci		if (memtier)
34362306a36Sopenharmony_ci			memtier->lower_tier_mask = NODE_MASK_NONE;
34462306a36Sopenharmony_ci	}
34562306a36Sopenharmony_ci	/*
34662306a36Sopenharmony_ci	 * Ensure that the "disable" is visible across the system.
34762306a36Sopenharmony_ci	 * Readers will see either a combination of before+disable
34862306a36Sopenharmony_ci	 * state or disable+after.  They will never see before and
34962306a36Sopenharmony_ci	 * after state together.
35062306a36Sopenharmony_ci	 */
35162306a36Sopenharmony_ci	synchronize_rcu();
35262306a36Sopenharmony_ci}
35362306a36Sopenharmony_ci
35462306a36Sopenharmony_ci/*
35562306a36Sopenharmony_ci * Find an automatic demotion target for all memory
35662306a36Sopenharmony_ci * nodes. Failing here is OK.  It might just indicate
35762306a36Sopenharmony_ci * being at the end of a chain.
35862306a36Sopenharmony_ci */
35962306a36Sopenharmony_cistatic void establish_demotion_targets(void)
36062306a36Sopenharmony_ci{
36162306a36Sopenharmony_ci	struct memory_tier *memtier;
36262306a36Sopenharmony_ci	struct demotion_nodes *nd;
36362306a36Sopenharmony_ci	int target = NUMA_NO_NODE, node;
36462306a36Sopenharmony_ci	int distance, best_distance;
36562306a36Sopenharmony_ci	nodemask_t tier_nodes, lower_tier;
36662306a36Sopenharmony_ci
36762306a36Sopenharmony_ci	lockdep_assert_held_once(&memory_tier_lock);
36862306a36Sopenharmony_ci
36962306a36Sopenharmony_ci	if (!node_demotion)
37062306a36Sopenharmony_ci		return;
37162306a36Sopenharmony_ci
37262306a36Sopenharmony_ci	disable_all_demotion_targets();
37362306a36Sopenharmony_ci
37462306a36Sopenharmony_ci	for_each_node_state(node, N_MEMORY) {
37562306a36Sopenharmony_ci		best_distance = -1;
37662306a36Sopenharmony_ci		nd = &node_demotion[node];
37762306a36Sopenharmony_ci
37862306a36Sopenharmony_ci		memtier = __node_get_memory_tier(node);
37962306a36Sopenharmony_ci		if (!memtier || list_is_last(&memtier->list, &memory_tiers))
38062306a36Sopenharmony_ci			continue;
38162306a36Sopenharmony_ci		/*
38262306a36Sopenharmony_ci		 * Get the lower memtier to find the  demotion node list.
38362306a36Sopenharmony_ci		 */
38462306a36Sopenharmony_ci		memtier = list_next_entry(memtier, list);
38562306a36Sopenharmony_ci		tier_nodes = get_memtier_nodemask(memtier);
38662306a36Sopenharmony_ci		/*
38762306a36Sopenharmony_ci		 * find_next_best_node, use 'used' nodemask as a skip list.
38862306a36Sopenharmony_ci		 * Add all memory nodes except the selected memory tier
38962306a36Sopenharmony_ci		 * nodelist to skip list so that we find the best node from the
39062306a36Sopenharmony_ci		 * memtier nodelist.
39162306a36Sopenharmony_ci		 */
39262306a36Sopenharmony_ci		nodes_andnot(tier_nodes, node_states[N_MEMORY], tier_nodes);
39362306a36Sopenharmony_ci
39462306a36Sopenharmony_ci		/*
39562306a36Sopenharmony_ci		 * Find all the nodes in the memory tier node list of same best distance.
39662306a36Sopenharmony_ci		 * add them to the preferred mask. We randomly select between nodes
39762306a36Sopenharmony_ci		 * in the preferred mask when allocating pages during demotion.
39862306a36Sopenharmony_ci		 */
39962306a36Sopenharmony_ci		do {
40062306a36Sopenharmony_ci			target = find_next_best_node(node, &tier_nodes);
40162306a36Sopenharmony_ci			if (target == NUMA_NO_NODE)
40262306a36Sopenharmony_ci				break;
40362306a36Sopenharmony_ci
40462306a36Sopenharmony_ci			distance = node_distance(node, target);
40562306a36Sopenharmony_ci			if (distance == best_distance || best_distance == -1) {
40662306a36Sopenharmony_ci				best_distance = distance;
40762306a36Sopenharmony_ci				node_set(target, nd->preferred);
40862306a36Sopenharmony_ci			} else {
40962306a36Sopenharmony_ci				break;
41062306a36Sopenharmony_ci			}
41162306a36Sopenharmony_ci		} while (1);
41262306a36Sopenharmony_ci	}
41362306a36Sopenharmony_ci	/*
41462306a36Sopenharmony_ci	 * Promotion is allowed from a memory tier to higher
41562306a36Sopenharmony_ci	 * memory tier only if the memory tier doesn't include
41662306a36Sopenharmony_ci	 * compute. We want to skip promotion from a memory tier,
41762306a36Sopenharmony_ci	 * if any node that is part of the memory tier have CPUs.
41862306a36Sopenharmony_ci	 * Once we detect such a memory tier, we consider that tier
41962306a36Sopenharmony_ci	 * as top tiper from which promotion is not allowed.
42062306a36Sopenharmony_ci	 */
42162306a36Sopenharmony_ci	list_for_each_entry_reverse(memtier, &memory_tiers, list) {
42262306a36Sopenharmony_ci		tier_nodes = get_memtier_nodemask(memtier);
42362306a36Sopenharmony_ci		nodes_and(tier_nodes, node_states[N_CPU], tier_nodes);
42462306a36Sopenharmony_ci		if (!nodes_empty(tier_nodes)) {
42562306a36Sopenharmony_ci			/*
42662306a36Sopenharmony_ci			 * abstract distance below the max value of this memtier
42762306a36Sopenharmony_ci			 * is considered toptier.
42862306a36Sopenharmony_ci			 */
42962306a36Sopenharmony_ci			top_tier_adistance = memtier->adistance_start +
43062306a36Sopenharmony_ci						MEMTIER_CHUNK_SIZE - 1;
43162306a36Sopenharmony_ci			break;
43262306a36Sopenharmony_ci		}
43362306a36Sopenharmony_ci	}
43462306a36Sopenharmony_ci	/*
43562306a36Sopenharmony_ci	 * Now build the lower_tier mask for each node collecting node mask from
43662306a36Sopenharmony_ci	 * all memory tier below it. This allows us to fallback demotion page
43762306a36Sopenharmony_ci	 * allocation to a set of nodes that is closer the above selected
43862306a36Sopenharmony_ci	 * perferred node.
43962306a36Sopenharmony_ci	 */
44062306a36Sopenharmony_ci	lower_tier = node_states[N_MEMORY];
44162306a36Sopenharmony_ci	list_for_each_entry(memtier, &memory_tiers, list) {
44262306a36Sopenharmony_ci		/*
44362306a36Sopenharmony_ci		 * Keep removing current tier from lower_tier nodes,
44462306a36Sopenharmony_ci		 * This will remove all nodes in current and above
44562306a36Sopenharmony_ci		 * memory tier from the lower_tier mask.
44662306a36Sopenharmony_ci		 */
44762306a36Sopenharmony_ci		tier_nodes = get_memtier_nodemask(memtier);
44862306a36Sopenharmony_ci		nodes_andnot(lower_tier, lower_tier, tier_nodes);
44962306a36Sopenharmony_ci		memtier->lower_tier_mask = lower_tier;
45062306a36Sopenharmony_ci	}
45162306a36Sopenharmony_ci}
45262306a36Sopenharmony_ci
45362306a36Sopenharmony_ci#else
/* Without CONFIG_MIGRATION there are no demotion targets to compute. */
static inline void establish_demotion_targets(void)
{
}
45562306a36Sopenharmony_ci#endif /* CONFIG_MIGRATION */
45662306a36Sopenharmony_ci
45762306a36Sopenharmony_cistatic inline void __init_node_memory_type(int node, struct memory_dev_type *memtype)
45862306a36Sopenharmony_ci{
45962306a36Sopenharmony_ci	if (!node_memory_types[node].memtype)
46062306a36Sopenharmony_ci		node_memory_types[node].memtype = memtype;
46162306a36Sopenharmony_ci	/*
46262306a36Sopenharmony_ci	 * for each device getting added in the same NUMA node
46362306a36Sopenharmony_ci	 * with this specific memtype, bump the map count. We
46462306a36Sopenharmony_ci	 * Only take memtype device reference once, so that
46562306a36Sopenharmony_ci	 * changing a node memtype can be done by droping the
46662306a36Sopenharmony_ci	 * only reference count taken here.
46762306a36Sopenharmony_ci	 */
46862306a36Sopenharmony_ci
46962306a36Sopenharmony_ci	if (node_memory_types[node].memtype == memtype) {
47062306a36Sopenharmony_ci		if (!node_memory_types[node].map_count++)
47162306a36Sopenharmony_ci			kref_get(&memtype->kref);
47262306a36Sopenharmony_ci	}
47362306a36Sopenharmony_ci}
47462306a36Sopenharmony_ci
47562306a36Sopenharmony_cistatic struct memory_tier *set_node_memory_tier(int node)
47662306a36Sopenharmony_ci{
47762306a36Sopenharmony_ci	struct memory_tier *memtier;
47862306a36Sopenharmony_ci	struct memory_dev_type *memtype;
47962306a36Sopenharmony_ci	pg_data_t *pgdat = NODE_DATA(node);
48062306a36Sopenharmony_ci
48162306a36Sopenharmony_ci
48262306a36Sopenharmony_ci	lockdep_assert_held_once(&memory_tier_lock);
48362306a36Sopenharmony_ci
48462306a36Sopenharmony_ci	if (!node_state(node, N_MEMORY))
48562306a36Sopenharmony_ci		return ERR_PTR(-EINVAL);
48662306a36Sopenharmony_ci
48762306a36Sopenharmony_ci	__init_node_memory_type(node, default_dram_type);
48862306a36Sopenharmony_ci
48962306a36Sopenharmony_ci	memtype = node_memory_types[node].memtype;
49062306a36Sopenharmony_ci	node_set(node, memtype->nodes);
49162306a36Sopenharmony_ci	memtier = find_create_memory_tier(memtype);
49262306a36Sopenharmony_ci	if (!IS_ERR(memtier))
49362306a36Sopenharmony_ci		rcu_assign_pointer(pgdat->memtier, memtier);
49462306a36Sopenharmony_ci	return memtier;
49562306a36Sopenharmony_ci}
49662306a36Sopenharmony_ci
49762306a36Sopenharmony_cistatic void destroy_memory_tier(struct memory_tier *memtier)
49862306a36Sopenharmony_ci{
49962306a36Sopenharmony_ci	list_del(&memtier->list);
50062306a36Sopenharmony_ci	device_unregister(&memtier->dev);
50162306a36Sopenharmony_ci}
50262306a36Sopenharmony_ci
50362306a36Sopenharmony_cistatic bool clear_node_memory_tier(int node)
50462306a36Sopenharmony_ci{
50562306a36Sopenharmony_ci	bool cleared = false;
50662306a36Sopenharmony_ci	pg_data_t *pgdat;
50762306a36Sopenharmony_ci	struct memory_tier *memtier;
50862306a36Sopenharmony_ci
50962306a36Sopenharmony_ci	pgdat = NODE_DATA(node);
51062306a36Sopenharmony_ci	if (!pgdat)
51162306a36Sopenharmony_ci		return false;
51262306a36Sopenharmony_ci
51362306a36Sopenharmony_ci	/*
51462306a36Sopenharmony_ci	 * Make sure that anybody looking at NODE_DATA who finds
51562306a36Sopenharmony_ci	 * a valid memtier finds memory_dev_types with nodes still
51662306a36Sopenharmony_ci	 * linked to the memtier. We achieve this by waiting for
51762306a36Sopenharmony_ci	 * rcu read section to finish using synchronize_rcu.
51862306a36Sopenharmony_ci	 * This also enables us to free the destroyed memory tier
51962306a36Sopenharmony_ci	 * with kfree instead of kfree_rcu
52062306a36Sopenharmony_ci	 */
52162306a36Sopenharmony_ci	memtier = __node_get_memory_tier(node);
52262306a36Sopenharmony_ci	if (memtier) {
52362306a36Sopenharmony_ci		struct memory_dev_type *memtype;
52462306a36Sopenharmony_ci
52562306a36Sopenharmony_ci		rcu_assign_pointer(pgdat->memtier, NULL);
52662306a36Sopenharmony_ci		synchronize_rcu();
52762306a36Sopenharmony_ci		memtype = node_memory_types[node].memtype;
52862306a36Sopenharmony_ci		node_clear(node, memtype->nodes);
52962306a36Sopenharmony_ci		if (nodes_empty(memtype->nodes)) {
53062306a36Sopenharmony_ci			list_del_init(&memtype->tier_sibiling);
53162306a36Sopenharmony_ci			if (list_empty(&memtier->memory_types))
53262306a36Sopenharmony_ci				destroy_memory_tier(memtier);
53362306a36Sopenharmony_ci		}
53462306a36Sopenharmony_ci		cleared = true;
53562306a36Sopenharmony_ci	}
53662306a36Sopenharmony_ci	return cleared;
53762306a36Sopenharmony_ci}
53862306a36Sopenharmony_ci
53962306a36Sopenharmony_cistatic void release_memtype(struct kref *kref)
54062306a36Sopenharmony_ci{
54162306a36Sopenharmony_ci	struct memory_dev_type *memtype;
54262306a36Sopenharmony_ci
54362306a36Sopenharmony_ci	memtype = container_of(kref, struct memory_dev_type, kref);
54462306a36Sopenharmony_ci	kfree(memtype);
54562306a36Sopenharmony_ci}
54662306a36Sopenharmony_ci
54762306a36Sopenharmony_cistruct memory_dev_type *alloc_memory_type(int adistance)
54862306a36Sopenharmony_ci{
54962306a36Sopenharmony_ci	struct memory_dev_type *memtype;
55062306a36Sopenharmony_ci
55162306a36Sopenharmony_ci	memtype = kmalloc(sizeof(*memtype), GFP_KERNEL);
55262306a36Sopenharmony_ci	if (!memtype)
55362306a36Sopenharmony_ci		return ERR_PTR(-ENOMEM);
55462306a36Sopenharmony_ci
55562306a36Sopenharmony_ci	memtype->adistance = adistance;
55662306a36Sopenharmony_ci	INIT_LIST_HEAD(&memtype->tier_sibiling);
55762306a36Sopenharmony_ci	memtype->nodes  = NODE_MASK_NONE;
55862306a36Sopenharmony_ci	kref_init(&memtype->kref);
55962306a36Sopenharmony_ci	return memtype;
56062306a36Sopenharmony_ci}
56162306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(alloc_memory_type);
56262306a36Sopenharmony_ci
56362306a36Sopenharmony_civoid put_memory_type(struct memory_dev_type *memtype)
56462306a36Sopenharmony_ci{
56562306a36Sopenharmony_ci	kref_put(&memtype->kref, release_memtype);
56662306a36Sopenharmony_ci}
56762306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(put_memory_type);
56862306a36Sopenharmony_ci
56962306a36Sopenharmony_civoid init_node_memory_type(int node, struct memory_dev_type *memtype)
57062306a36Sopenharmony_ci{
57162306a36Sopenharmony_ci
57262306a36Sopenharmony_ci	mutex_lock(&memory_tier_lock);
57362306a36Sopenharmony_ci	__init_node_memory_type(node, memtype);
57462306a36Sopenharmony_ci	mutex_unlock(&memory_tier_lock);
57562306a36Sopenharmony_ci}
57662306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(init_node_memory_type);
57762306a36Sopenharmony_ci
57862306a36Sopenharmony_civoid clear_node_memory_type(int node, struct memory_dev_type *memtype)
57962306a36Sopenharmony_ci{
58062306a36Sopenharmony_ci	mutex_lock(&memory_tier_lock);
58162306a36Sopenharmony_ci	if (node_memory_types[node].memtype == memtype)
58262306a36Sopenharmony_ci		node_memory_types[node].map_count--;
58362306a36Sopenharmony_ci	/*
58462306a36Sopenharmony_ci	 * If we umapped all the attached devices to this node,
58562306a36Sopenharmony_ci	 * clear the node memory type.
58662306a36Sopenharmony_ci	 */
58762306a36Sopenharmony_ci	if (!node_memory_types[node].map_count) {
58862306a36Sopenharmony_ci		node_memory_types[node].memtype = NULL;
58962306a36Sopenharmony_ci		put_memory_type(memtype);
59062306a36Sopenharmony_ci	}
59162306a36Sopenharmony_ci	mutex_unlock(&memory_tier_lock);
59262306a36Sopenharmony_ci}
59362306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(clear_node_memory_type);
59462306a36Sopenharmony_ci
59562306a36Sopenharmony_cistatic int __meminit memtier_hotplug_callback(struct notifier_block *self,
59662306a36Sopenharmony_ci					      unsigned long action, void *_arg)
59762306a36Sopenharmony_ci{
59862306a36Sopenharmony_ci	struct memory_tier *memtier;
59962306a36Sopenharmony_ci	struct memory_notify *arg = _arg;
60062306a36Sopenharmony_ci
60162306a36Sopenharmony_ci	/*
60262306a36Sopenharmony_ci	 * Only update the node migration order when a node is
60362306a36Sopenharmony_ci	 * changing status, like online->offline.
60462306a36Sopenharmony_ci	 */
60562306a36Sopenharmony_ci	if (arg->status_change_nid < 0)
60662306a36Sopenharmony_ci		return notifier_from_errno(0);
60762306a36Sopenharmony_ci
60862306a36Sopenharmony_ci	switch (action) {
60962306a36Sopenharmony_ci	case MEM_OFFLINE:
61062306a36Sopenharmony_ci		mutex_lock(&memory_tier_lock);
61162306a36Sopenharmony_ci		if (clear_node_memory_tier(arg->status_change_nid))
61262306a36Sopenharmony_ci			establish_demotion_targets();
61362306a36Sopenharmony_ci		mutex_unlock(&memory_tier_lock);
61462306a36Sopenharmony_ci		break;
61562306a36Sopenharmony_ci	case MEM_ONLINE:
61662306a36Sopenharmony_ci		mutex_lock(&memory_tier_lock);
61762306a36Sopenharmony_ci		memtier = set_node_memory_tier(arg->status_change_nid);
61862306a36Sopenharmony_ci		if (!IS_ERR(memtier))
61962306a36Sopenharmony_ci			establish_demotion_targets();
62062306a36Sopenharmony_ci		mutex_unlock(&memory_tier_lock);
62162306a36Sopenharmony_ci		break;
62262306a36Sopenharmony_ci	}
62362306a36Sopenharmony_ci
62462306a36Sopenharmony_ci	return notifier_from_errno(0);
62562306a36Sopenharmony_ci}
62662306a36Sopenharmony_ci
62762306a36Sopenharmony_cistatic int __init memory_tier_init(void)
62862306a36Sopenharmony_ci{
62962306a36Sopenharmony_ci	int ret, node;
63062306a36Sopenharmony_ci	struct memory_tier *memtier;
63162306a36Sopenharmony_ci
63262306a36Sopenharmony_ci	ret = subsys_virtual_register(&memory_tier_subsys, NULL);
63362306a36Sopenharmony_ci	if (ret)
63462306a36Sopenharmony_ci		panic("%s() failed to register memory tier subsystem\n", __func__);
63562306a36Sopenharmony_ci
63662306a36Sopenharmony_ci#ifdef CONFIG_MIGRATION
63762306a36Sopenharmony_ci	node_demotion = kcalloc(nr_node_ids, sizeof(struct demotion_nodes),
63862306a36Sopenharmony_ci				GFP_KERNEL);
63962306a36Sopenharmony_ci	WARN_ON(!node_demotion);
64062306a36Sopenharmony_ci#endif
64162306a36Sopenharmony_ci	mutex_lock(&memory_tier_lock);
64262306a36Sopenharmony_ci	/*
64362306a36Sopenharmony_ci	 * For now we can have 4 faster memory tiers with smaller adistance
64462306a36Sopenharmony_ci	 * than default DRAM tier.
64562306a36Sopenharmony_ci	 */
64662306a36Sopenharmony_ci	default_dram_type = alloc_memory_type(MEMTIER_ADISTANCE_DRAM);
64762306a36Sopenharmony_ci	if (IS_ERR(default_dram_type))
64862306a36Sopenharmony_ci		panic("%s() failed to allocate default DRAM tier\n", __func__);
64962306a36Sopenharmony_ci
65062306a36Sopenharmony_ci	/*
65162306a36Sopenharmony_ci	 * Look at all the existing N_MEMORY nodes and add them to
65262306a36Sopenharmony_ci	 * default memory tier or to a tier if we already have memory
65362306a36Sopenharmony_ci	 * types assigned.
65462306a36Sopenharmony_ci	 */
65562306a36Sopenharmony_ci	for_each_node_state(node, N_MEMORY) {
65662306a36Sopenharmony_ci		memtier = set_node_memory_tier(node);
65762306a36Sopenharmony_ci		if (IS_ERR(memtier))
65862306a36Sopenharmony_ci			/*
65962306a36Sopenharmony_ci			 * Continue with memtiers we are able to setup
66062306a36Sopenharmony_ci			 */
66162306a36Sopenharmony_ci			break;
66262306a36Sopenharmony_ci	}
66362306a36Sopenharmony_ci	establish_demotion_targets();
66462306a36Sopenharmony_ci	mutex_unlock(&memory_tier_lock);
66562306a36Sopenharmony_ci
66662306a36Sopenharmony_ci	hotplug_memory_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRI);
66762306a36Sopenharmony_ci	return 0;
66862306a36Sopenharmony_ci}
66962306a36Sopenharmony_cisubsys_initcall(memory_tier_init);
67062306a36Sopenharmony_ci
/*
 * Demotion on/off switch. It has static storage duration, so it starts
 * out as false; with CONFIG_MIGRATION and CONFIG_SYSFS it is read and
 * written through the demotion_enabled sysfs attribute in this file.
 */
bool numa_demotion_enabled;
67262306a36Sopenharmony_ci
67362306a36Sopenharmony_ci#ifdef CONFIG_MIGRATION
67462306a36Sopenharmony_ci#ifdef CONFIG_SYSFS
67562306a36Sopenharmony_cistatic ssize_t demotion_enabled_show(struct kobject *kobj,
67662306a36Sopenharmony_ci				     struct kobj_attribute *attr, char *buf)
67762306a36Sopenharmony_ci{
67862306a36Sopenharmony_ci	return sysfs_emit(buf, "%s\n",
67962306a36Sopenharmony_ci			  numa_demotion_enabled ? "true" : "false");
68062306a36Sopenharmony_ci}
68162306a36Sopenharmony_ci
68262306a36Sopenharmony_cistatic ssize_t demotion_enabled_store(struct kobject *kobj,
68362306a36Sopenharmony_ci				      struct kobj_attribute *attr,
68462306a36Sopenharmony_ci				      const char *buf, size_t count)
68562306a36Sopenharmony_ci{
68662306a36Sopenharmony_ci	ssize_t ret;
68762306a36Sopenharmony_ci
68862306a36Sopenharmony_ci	ret = kstrtobool(buf, &numa_demotion_enabled);
68962306a36Sopenharmony_ci	if (ret)
69062306a36Sopenharmony_ci		return ret;
69162306a36Sopenharmony_ci
69262306a36Sopenharmony_ci	return count;
69362306a36Sopenharmony_ci}
69462306a36Sopenharmony_ci
69562306a36Sopenharmony_cistatic struct kobj_attribute numa_demotion_enabled_attr =
69662306a36Sopenharmony_ci	__ATTR_RW(demotion_enabled);
69762306a36Sopenharmony_ci
69862306a36Sopenharmony_cistatic struct attribute *numa_attrs[] = {
69962306a36Sopenharmony_ci	&numa_demotion_enabled_attr.attr,
70062306a36Sopenharmony_ci	NULL,
70162306a36Sopenharmony_ci};
70262306a36Sopenharmony_ci
70362306a36Sopenharmony_cistatic const struct attribute_group numa_attr_group = {
70462306a36Sopenharmony_ci	.attrs = numa_attrs,
70562306a36Sopenharmony_ci};
70662306a36Sopenharmony_ci
70762306a36Sopenharmony_cistatic int __init numa_init_sysfs(void)
70862306a36Sopenharmony_ci{
70962306a36Sopenharmony_ci	int err;
71062306a36Sopenharmony_ci	struct kobject *numa_kobj;
71162306a36Sopenharmony_ci
71262306a36Sopenharmony_ci	numa_kobj = kobject_create_and_add("numa", mm_kobj);
71362306a36Sopenharmony_ci	if (!numa_kobj) {
71462306a36Sopenharmony_ci		pr_err("failed to create numa kobject\n");
71562306a36Sopenharmony_ci		return -ENOMEM;
71662306a36Sopenharmony_ci	}
71762306a36Sopenharmony_ci	err = sysfs_create_group(numa_kobj, &numa_attr_group);
71862306a36Sopenharmony_ci	if (err) {
71962306a36Sopenharmony_ci		pr_err("failed to register numa group\n");
72062306a36Sopenharmony_ci		goto delete_obj;
72162306a36Sopenharmony_ci	}
72262306a36Sopenharmony_ci	return 0;
72362306a36Sopenharmony_ci
72462306a36Sopenharmony_cidelete_obj:
72562306a36Sopenharmony_ci	kobject_put(numa_kobj);
72662306a36Sopenharmony_ci	return err;
72762306a36Sopenharmony_ci}
72862306a36Sopenharmony_cisubsys_initcall(numa_init_sysfs);
72962306a36Sopenharmony_ci#endif /* CONFIG_SYSFS */
73062306a36Sopenharmony_ci#endif
731