162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0 262306a36Sopenharmony_ci#include <linux/slab.h> 362306a36Sopenharmony_ci#include <linux/lockdep.h> 462306a36Sopenharmony_ci#include <linux/sysfs.h> 562306a36Sopenharmony_ci#include <linux/kobject.h> 662306a36Sopenharmony_ci#include <linux/memory.h> 762306a36Sopenharmony_ci#include <linux/memory-tiers.h> 862306a36Sopenharmony_ci 962306a36Sopenharmony_ci#include "internal.h" 1062306a36Sopenharmony_ci 1162306a36Sopenharmony_cistruct memory_tier { 1262306a36Sopenharmony_ci /* hierarchy of memory tiers */ 1362306a36Sopenharmony_ci struct list_head list; 1462306a36Sopenharmony_ci /* list of all memory types part of this tier */ 1562306a36Sopenharmony_ci struct list_head memory_types; 1662306a36Sopenharmony_ci /* 1762306a36Sopenharmony_ci * start value of abstract distance. memory tier maps 1862306a36Sopenharmony_ci * an abstract distance range, 1962306a36Sopenharmony_ci * adistance_start .. adistance_start + MEMTIER_CHUNK_SIZE 2062306a36Sopenharmony_ci */ 2162306a36Sopenharmony_ci int adistance_start; 2262306a36Sopenharmony_ci struct device dev; 2362306a36Sopenharmony_ci /* All the nodes that are part of all the lower memory tiers. */ 2462306a36Sopenharmony_ci nodemask_t lower_tier_mask; 2562306a36Sopenharmony_ci}; 2662306a36Sopenharmony_ci 2762306a36Sopenharmony_cistruct demotion_nodes { 2862306a36Sopenharmony_ci nodemask_t preferred; 2962306a36Sopenharmony_ci}; 3062306a36Sopenharmony_ci 3162306a36Sopenharmony_cistruct node_memory_type_map { 3262306a36Sopenharmony_ci struct memory_dev_type *memtype; 3362306a36Sopenharmony_ci int map_count; 3462306a36Sopenharmony_ci}; 3562306a36Sopenharmony_ci 3662306a36Sopenharmony_cistatic DEFINE_MUTEX(memory_tier_lock); 3762306a36Sopenharmony_cistatic LIST_HEAD(memory_tiers); 3862306a36Sopenharmony_cistatic struct node_memory_type_map node_memory_types[MAX_NUMNODES]; 3962306a36Sopenharmony_cistatic struct memory_dev_type *default_dram_type; 4062306a36Sopenharmony_ci 4162306a36Sopenharmony_cistatic struct bus_type memory_tier_subsys = { 4262306a36Sopenharmony_ci .name = "memory_tiering", 4362306a36Sopenharmony_ci .dev_name = "memory_tier", 4462306a36Sopenharmony_ci}; 4562306a36Sopenharmony_ci 4662306a36Sopenharmony_ci#ifdef CONFIG_MIGRATION 4762306a36Sopenharmony_cistatic int top_tier_adistance; 4862306a36Sopenharmony_ci/* 4962306a36Sopenharmony_ci * node_demotion[] examples: 5062306a36Sopenharmony_ci * 5162306a36Sopenharmony_ci * Example 1: 5262306a36Sopenharmony_ci * 5362306a36Sopenharmony_ci * Node 0 & 1 are CPU + DRAM nodes, node 2 & 3 are PMEM nodes. 5462306a36Sopenharmony_ci * 5562306a36Sopenharmony_ci * node distances: 5662306a36Sopenharmony_ci * node 0 1 2 3 5762306a36Sopenharmony_ci * 0 10 20 30 40 5862306a36Sopenharmony_ci * 1 20 10 40 30 5962306a36Sopenharmony_ci * 2 30 40 10 40 6062306a36Sopenharmony_ci * 3 40 30 40 10 6162306a36Sopenharmony_ci * 6262306a36Sopenharmony_ci * memory_tiers0 = 0-1 6362306a36Sopenharmony_ci * memory_tiers1 = 2-3 6462306a36Sopenharmony_ci * 6562306a36Sopenharmony_ci * node_demotion[0].preferred = 2 6662306a36Sopenharmony_ci * node_demotion[1].preferred = 3 6762306a36Sopenharmony_ci * node_demotion[2].preferred = <empty> 6862306a36Sopenharmony_ci * node_demotion[3].preferred = <empty> 6962306a36Sopenharmony_ci * 7062306a36Sopenharmony_ci * Example 2: 7162306a36Sopenharmony_ci * 7262306a36Sopenharmony_ci * Node 0 & 1 are CPU + DRAM nodes, node 2 is memory-only DRAM node. 7362306a36Sopenharmony_ci * 7462306a36Sopenharmony_ci * node distances: 7562306a36Sopenharmony_ci * node 0 1 2 7662306a36Sopenharmony_ci * 0 10 20 30 7762306a36Sopenharmony_ci * 1 20 10 30 7862306a36Sopenharmony_ci * 2 30 30 10 7962306a36Sopenharmony_ci * 8062306a36Sopenharmony_ci * memory_tiers0 = 0-2 8162306a36Sopenharmony_ci * 8262306a36Sopenharmony_ci * node_demotion[0].preferred = <empty> 8362306a36Sopenharmony_ci * node_demotion[1].preferred = <empty> 8462306a36Sopenharmony_ci * node_demotion[2].preferred = <empty> 8562306a36Sopenharmony_ci * 8662306a36Sopenharmony_ci * Example 3: 8762306a36Sopenharmony_ci * 8862306a36Sopenharmony_ci * Node 0 is CPU + DRAM nodes, Node 1 is HBM node, node 2 is PMEM node. 8962306a36Sopenharmony_ci * 9062306a36Sopenharmony_ci * node distances: 9162306a36Sopenharmony_ci * node 0 1 2 9262306a36Sopenharmony_ci * 0 10 20 30 9362306a36Sopenharmony_ci * 1 20 10 40 9462306a36Sopenharmony_ci * 2 30 40 10 9562306a36Sopenharmony_ci * 9662306a36Sopenharmony_ci * memory_tiers0 = 1 9762306a36Sopenharmony_ci * memory_tiers1 = 0 9862306a36Sopenharmony_ci * memory_tiers2 = 2 9962306a36Sopenharmony_ci * 10062306a36Sopenharmony_ci * node_demotion[0].preferred = 2 10162306a36Sopenharmony_ci * node_demotion[1].preferred = 0 10262306a36Sopenharmony_ci * node_demotion[2].preferred = <empty> 10362306a36Sopenharmony_ci * 10462306a36Sopenharmony_ci */ 10562306a36Sopenharmony_cistatic struct demotion_nodes *node_demotion __read_mostly; 10662306a36Sopenharmony_ci#endif /* CONFIG_MIGRATION */ 10762306a36Sopenharmony_ci 10862306a36Sopenharmony_cistatic inline struct memory_tier *to_memory_tier(struct device *device) 10962306a36Sopenharmony_ci{ 11062306a36Sopenharmony_ci return container_of(device, struct memory_tier, dev); 11162306a36Sopenharmony_ci} 11262306a36Sopenharmony_ci 11362306a36Sopenharmony_cistatic __always_inline nodemask_t get_memtier_nodemask(struct memory_tier *memtier) 11462306a36Sopenharmony_ci{ 11562306a36Sopenharmony_ci nodemask_t nodes = NODE_MASK_NONE; 11662306a36Sopenharmony_ci struct memory_dev_type *memtype; 11762306a36Sopenharmony_ci 11862306a36Sopenharmony_ci list_for_each_entry(memtype, &memtier->memory_types, tier_sibiling) 11962306a36Sopenharmony_ci nodes_or(nodes, nodes, memtype->nodes); 12062306a36Sopenharmony_ci 12162306a36Sopenharmony_ci return nodes; 12262306a36Sopenharmony_ci} 12362306a36Sopenharmony_ci 12462306a36Sopenharmony_cistatic void memory_tier_device_release(struct device *dev) 12562306a36Sopenharmony_ci{ 12662306a36Sopenharmony_ci struct memory_tier *tier = to_memory_tier(dev); 12762306a36Sopenharmony_ci /* 12862306a36Sopenharmony_ci * synchronize_rcu in clear_node_memory_tier makes sure 12962306a36Sopenharmony_ci * we don't have rcu access to this memory tier. 13062306a36Sopenharmony_ci */ 13162306a36Sopenharmony_ci kfree(tier); 13262306a36Sopenharmony_ci} 13362306a36Sopenharmony_ci 13462306a36Sopenharmony_cistatic ssize_t nodelist_show(struct device *dev, 13562306a36Sopenharmony_ci struct device_attribute *attr, char *buf) 13662306a36Sopenharmony_ci{ 13762306a36Sopenharmony_ci int ret; 13862306a36Sopenharmony_ci nodemask_t nmask; 13962306a36Sopenharmony_ci 14062306a36Sopenharmony_ci mutex_lock(&memory_tier_lock); 14162306a36Sopenharmony_ci nmask = get_memtier_nodemask(to_memory_tier(dev)); 14262306a36Sopenharmony_ci ret = sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&nmask)); 14362306a36Sopenharmony_ci mutex_unlock(&memory_tier_lock); 14462306a36Sopenharmony_ci return ret; 14562306a36Sopenharmony_ci} 14662306a36Sopenharmony_cistatic DEVICE_ATTR_RO(nodelist); 14762306a36Sopenharmony_ci 14862306a36Sopenharmony_cistatic struct attribute *memtier_dev_attrs[] = { 14962306a36Sopenharmony_ci &dev_attr_nodelist.attr, 15062306a36Sopenharmony_ci NULL 15162306a36Sopenharmony_ci}; 15262306a36Sopenharmony_ci 15362306a36Sopenharmony_cistatic const struct attribute_group memtier_dev_group = { 15462306a36Sopenharmony_ci .attrs = memtier_dev_attrs, 15562306a36Sopenharmony_ci}; 15662306a36Sopenharmony_ci 15762306a36Sopenharmony_cistatic const struct attribute_group *memtier_dev_groups[] = { 15862306a36Sopenharmony_ci &memtier_dev_group, 15962306a36Sopenharmony_ci NULL 16062306a36Sopenharmony_ci}; 16162306a36Sopenharmony_ci 16262306a36Sopenharmony_cistatic struct memory_tier *find_create_memory_tier(struct memory_dev_type *memtype) 16362306a36Sopenharmony_ci{ 16462306a36Sopenharmony_ci int ret; 16562306a36Sopenharmony_ci bool found_slot = false; 16662306a36Sopenharmony_ci struct memory_tier *memtier, *new_memtier; 16762306a36Sopenharmony_ci int adistance = memtype->adistance; 16862306a36Sopenharmony_ci unsigned int memtier_adistance_chunk_size = MEMTIER_CHUNK_SIZE; 16962306a36Sopenharmony_ci 17062306a36Sopenharmony_ci lockdep_assert_held_once(&memory_tier_lock); 17162306a36Sopenharmony_ci 17262306a36Sopenharmony_ci adistance = round_down(adistance, memtier_adistance_chunk_size); 17362306a36Sopenharmony_ci /* 17462306a36Sopenharmony_ci * If the memtype is already part of a memory tier, 17562306a36Sopenharmony_ci * just return that. 17662306a36Sopenharmony_ci */ 17762306a36Sopenharmony_ci if (!list_empty(&memtype->tier_sibiling)) { 17862306a36Sopenharmony_ci list_for_each_entry(memtier, &memory_tiers, list) { 17962306a36Sopenharmony_ci if (adistance == memtier->adistance_start) 18062306a36Sopenharmony_ci return memtier; 18162306a36Sopenharmony_ci } 18262306a36Sopenharmony_ci WARN_ON(1); 18362306a36Sopenharmony_ci return ERR_PTR(-EINVAL); 18462306a36Sopenharmony_ci } 18562306a36Sopenharmony_ci 18662306a36Sopenharmony_ci list_for_each_entry(memtier, &memory_tiers, list) { 18762306a36Sopenharmony_ci if (adistance == memtier->adistance_start) { 18862306a36Sopenharmony_ci goto link_memtype; 18962306a36Sopenharmony_ci } else if (adistance < memtier->adistance_start) { 19062306a36Sopenharmony_ci found_slot = true; 19162306a36Sopenharmony_ci break; 19262306a36Sopenharmony_ci } 19362306a36Sopenharmony_ci } 19462306a36Sopenharmony_ci 19562306a36Sopenharmony_ci new_memtier = kzalloc(sizeof(struct memory_tier), GFP_KERNEL); 19662306a36Sopenharmony_ci if (!new_memtier) 19762306a36Sopenharmony_ci return ERR_PTR(-ENOMEM); 19862306a36Sopenharmony_ci 19962306a36Sopenharmony_ci new_memtier->adistance_start = adistance; 20062306a36Sopenharmony_ci INIT_LIST_HEAD(&new_memtier->list); 20162306a36Sopenharmony_ci INIT_LIST_HEAD(&new_memtier->memory_types); 20262306a36Sopenharmony_ci if (found_slot) 20362306a36Sopenharmony_ci list_add_tail(&new_memtier->list, &memtier->list); 20462306a36Sopenharmony_ci else 20562306a36Sopenharmony_ci list_add_tail(&new_memtier->list, &memory_tiers); 20662306a36Sopenharmony_ci 20762306a36Sopenharmony_ci new_memtier->dev.id = adistance >> MEMTIER_CHUNK_BITS; 20862306a36Sopenharmony_ci new_memtier->dev.bus = &memory_tier_subsys; 20962306a36Sopenharmony_ci new_memtier->dev.release = memory_tier_device_release; 21062306a36Sopenharmony_ci new_memtier->dev.groups = memtier_dev_groups; 21162306a36Sopenharmony_ci 21262306a36Sopenharmony_ci ret = device_register(&new_memtier->dev); 21362306a36Sopenharmony_ci if (ret) { 21462306a36Sopenharmony_ci list_del(&new_memtier->list); 21562306a36Sopenharmony_ci put_device(&new_memtier->dev); 21662306a36Sopenharmony_ci return ERR_PTR(ret); 21762306a36Sopenharmony_ci } 21862306a36Sopenharmony_ci memtier = new_memtier; 21962306a36Sopenharmony_ci 22062306a36Sopenharmony_cilink_memtype: 22162306a36Sopenharmony_ci list_add(&memtype->tier_sibiling, &memtier->memory_types); 22262306a36Sopenharmony_ci return memtier; 22362306a36Sopenharmony_ci} 22462306a36Sopenharmony_ci 22562306a36Sopenharmony_cistatic struct memory_tier *__node_get_memory_tier(int node) 22662306a36Sopenharmony_ci{ 22762306a36Sopenharmony_ci pg_data_t *pgdat; 22862306a36Sopenharmony_ci 22962306a36Sopenharmony_ci pgdat = NODE_DATA(node); 23062306a36Sopenharmony_ci if (!pgdat) 23162306a36Sopenharmony_ci return NULL; 23262306a36Sopenharmony_ci /* 23362306a36Sopenharmony_ci * Since we hold memory_tier_lock, we can avoid 23462306a36Sopenharmony_ci * RCU read locks when accessing the details. No 23562306a36Sopenharmony_ci * parallel updates are possible here. 23662306a36Sopenharmony_ci */ 23762306a36Sopenharmony_ci return rcu_dereference_check(pgdat->memtier, 23862306a36Sopenharmony_ci lockdep_is_held(&memory_tier_lock)); 23962306a36Sopenharmony_ci} 24062306a36Sopenharmony_ci 24162306a36Sopenharmony_ci#ifdef CONFIG_MIGRATION 24262306a36Sopenharmony_cibool node_is_toptier(int node) 24362306a36Sopenharmony_ci{ 24462306a36Sopenharmony_ci bool toptier; 24562306a36Sopenharmony_ci pg_data_t *pgdat; 24662306a36Sopenharmony_ci struct memory_tier *memtier; 24762306a36Sopenharmony_ci 24862306a36Sopenharmony_ci pgdat = NODE_DATA(node); 24962306a36Sopenharmony_ci if (!pgdat) 25062306a36Sopenharmony_ci return false; 25162306a36Sopenharmony_ci 25262306a36Sopenharmony_ci rcu_read_lock(); 25362306a36Sopenharmony_ci memtier = rcu_dereference(pgdat->memtier); 25462306a36Sopenharmony_ci if (!memtier) { 25562306a36Sopenharmony_ci toptier = true; 25662306a36Sopenharmony_ci goto out; 25762306a36Sopenharmony_ci } 25862306a36Sopenharmony_ci if (memtier->adistance_start <= top_tier_adistance) 25962306a36Sopenharmony_ci toptier = true; 26062306a36Sopenharmony_ci else 26162306a36Sopenharmony_ci toptier = false; 26262306a36Sopenharmony_ciout: 26362306a36Sopenharmony_ci rcu_read_unlock(); 26462306a36Sopenharmony_ci return toptier; 26562306a36Sopenharmony_ci} 26662306a36Sopenharmony_ci 26762306a36Sopenharmony_civoid node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets) 26862306a36Sopenharmony_ci{ 26962306a36Sopenharmony_ci struct memory_tier *memtier; 27062306a36Sopenharmony_ci 27162306a36Sopenharmony_ci /* 27262306a36Sopenharmony_ci * pg_data_t.memtier updates includes a synchronize_rcu() 27362306a36Sopenharmony_ci * which ensures that we either find NULL or a valid memtier 27462306a36Sopenharmony_ci * in NODE_DATA. protect the access via rcu_read_lock(); 27562306a36Sopenharmony_ci */ 27662306a36Sopenharmony_ci rcu_read_lock(); 27762306a36Sopenharmony_ci memtier = rcu_dereference(pgdat->memtier); 27862306a36Sopenharmony_ci if (memtier) 27962306a36Sopenharmony_ci *targets = memtier->lower_tier_mask; 28062306a36Sopenharmony_ci else 28162306a36Sopenharmony_ci *targets = NODE_MASK_NONE; 28262306a36Sopenharmony_ci rcu_read_unlock(); 28362306a36Sopenharmony_ci} 28462306a36Sopenharmony_ci 28562306a36Sopenharmony_ci/** 28662306a36Sopenharmony_ci * next_demotion_node() - Get the next node in the demotion path 28762306a36Sopenharmony_ci * @node: The starting node to lookup the next node 28862306a36Sopenharmony_ci * 28962306a36Sopenharmony_ci * Return: node id for next memory node in the demotion path hierarchy 29062306a36Sopenharmony_ci * from @node; NUMA_NO_NODE if @node is terminal. This does not keep 29162306a36Sopenharmony_ci * @node online or guarantee that it *continues* to be the next demotion 29262306a36Sopenharmony_ci * target. 29362306a36Sopenharmony_ci */ 29462306a36Sopenharmony_ciint next_demotion_node(int node) 29562306a36Sopenharmony_ci{ 29662306a36Sopenharmony_ci struct demotion_nodes *nd; 29762306a36Sopenharmony_ci int target; 29862306a36Sopenharmony_ci 29962306a36Sopenharmony_ci if (!node_demotion) 30062306a36Sopenharmony_ci return NUMA_NO_NODE; 30162306a36Sopenharmony_ci 30262306a36Sopenharmony_ci nd = &node_demotion[node]; 30362306a36Sopenharmony_ci 30462306a36Sopenharmony_ci /* 30562306a36Sopenharmony_ci * node_demotion[] is updated without excluding this 30662306a36Sopenharmony_ci * function from running. 30762306a36Sopenharmony_ci * 30862306a36Sopenharmony_ci * Make sure to use RCU over entire code blocks if 30962306a36Sopenharmony_ci * node_demotion[] reads need to be consistent. 31062306a36Sopenharmony_ci */ 31162306a36Sopenharmony_ci rcu_read_lock(); 31262306a36Sopenharmony_ci /* 31362306a36Sopenharmony_ci * If there are multiple target nodes, just select one 31462306a36Sopenharmony_ci * target node randomly. 31562306a36Sopenharmony_ci * 31662306a36Sopenharmony_ci * In addition, we can also use round-robin to select 31762306a36Sopenharmony_ci * target node, but we should introduce another variable 31862306a36Sopenharmony_ci * for node_demotion[] to record last selected target node, 31962306a36Sopenharmony_ci * that may cause cache ping-pong due to the changing of 32062306a36Sopenharmony_ci * last target node. Or introducing per-cpu data to avoid 32162306a36Sopenharmony_ci * caching issue, which seems more complicated. So selecting 32262306a36Sopenharmony_ci * target node randomly seems better until now. 32362306a36Sopenharmony_ci */ 32462306a36Sopenharmony_ci target = node_random(&nd->preferred); 32562306a36Sopenharmony_ci rcu_read_unlock(); 32662306a36Sopenharmony_ci 32762306a36Sopenharmony_ci return target; 32862306a36Sopenharmony_ci} 32962306a36Sopenharmony_ci 33062306a36Sopenharmony_cistatic void disable_all_demotion_targets(void) 33162306a36Sopenharmony_ci{ 33262306a36Sopenharmony_ci struct memory_tier *memtier; 33362306a36Sopenharmony_ci int node; 33462306a36Sopenharmony_ci 33562306a36Sopenharmony_ci for_each_node_state(node, N_MEMORY) { 33662306a36Sopenharmony_ci node_demotion[node].preferred = NODE_MASK_NONE; 33762306a36Sopenharmony_ci /* 33862306a36Sopenharmony_ci * We are holding memory_tier_lock, it is safe 33962306a36Sopenharmony_ci * to access pgda->memtier. 34062306a36Sopenharmony_ci */ 34162306a36Sopenharmony_ci memtier = __node_get_memory_tier(node); 34262306a36Sopenharmony_ci if (memtier) 34362306a36Sopenharmony_ci memtier->lower_tier_mask = NODE_MASK_NONE; 34462306a36Sopenharmony_ci } 34562306a36Sopenharmony_ci /* 34662306a36Sopenharmony_ci * Ensure that the "disable" is visible across the system. 34762306a36Sopenharmony_ci * Readers will see either a combination of before+disable 34862306a36Sopenharmony_ci * state or disable+after. They will never see before and 34962306a36Sopenharmony_ci * after state together. 35062306a36Sopenharmony_ci */ 35162306a36Sopenharmony_ci synchronize_rcu(); 35262306a36Sopenharmony_ci} 35362306a36Sopenharmony_ci 35462306a36Sopenharmony_ci/* 35562306a36Sopenharmony_ci * Find an automatic demotion target for all memory 35662306a36Sopenharmony_ci * nodes. Failing here is OK. It might just indicate 35762306a36Sopenharmony_ci * being at the end of a chain. 35862306a36Sopenharmony_ci */ 35962306a36Sopenharmony_cistatic void establish_demotion_targets(void) 36062306a36Sopenharmony_ci{ 36162306a36Sopenharmony_ci struct memory_tier *memtier; 36262306a36Sopenharmony_ci struct demotion_nodes *nd; 36362306a36Sopenharmony_ci int target = NUMA_NO_NODE, node; 36462306a36Sopenharmony_ci int distance, best_distance; 36562306a36Sopenharmony_ci nodemask_t tier_nodes, lower_tier; 36662306a36Sopenharmony_ci 36762306a36Sopenharmony_ci lockdep_assert_held_once(&memory_tier_lock); 36862306a36Sopenharmony_ci 36962306a36Sopenharmony_ci if (!node_demotion) 37062306a36Sopenharmony_ci return; 37162306a36Sopenharmony_ci 37262306a36Sopenharmony_ci disable_all_demotion_targets(); 37362306a36Sopenharmony_ci 37462306a36Sopenharmony_ci for_each_node_state(node, N_MEMORY) { 37562306a36Sopenharmony_ci best_distance = -1; 37662306a36Sopenharmony_ci nd = &node_demotion[node]; 37762306a36Sopenharmony_ci 37862306a36Sopenharmony_ci memtier = __node_get_memory_tier(node); 37962306a36Sopenharmony_ci if (!memtier || list_is_last(&memtier->list, &memory_tiers)) 38062306a36Sopenharmony_ci continue; 38162306a36Sopenharmony_ci /* 38262306a36Sopenharmony_ci * Get the lower memtier to find the demotion node list. 38362306a36Sopenharmony_ci */ 38462306a36Sopenharmony_ci memtier = list_next_entry(memtier, list); 38562306a36Sopenharmony_ci tier_nodes = get_memtier_nodemask(memtier); 38662306a36Sopenharmony_ci /* 38762306a36Sopenharmony_ci * find_next_best_node, use 'used' nodemask as a skip list. 38862306a36Sopenharmony_ci * Add all memory nodes except the selected memory tier 38962306a36Sopenharmony_ci * nodelist to skip list so that we find the best node from the 39062306a36Sopenharmony_ci * memtier nodelist. 39162306a36Sopenharmony_ci */ 39262306a36Sopenharmony_ci nodes_andnot(tier_nodes, node_states[N_MEMORY], tier_nodes); 39362306a36Sopenharmony_ci 39462306a36Sopenharmony_ci /* 39562306a36Sopenharmony_ci * Find all the nodes in the memory tier node list of same best distance. 39662306a36Sopenharmony_ci * add them to the preferred mask. We randomly select between nodes 39762306a36Sopenharmony_ci * in the preferred mask when allocating pages during demotion. 39862306a36Sopenharmony_ci */ 39962306a36Sopenharmony_ci do { 40062306a36Sopenharmony_ci target = find_next_best_node(node, &tier_nodes); 40162306a36Sopenharmony_ci if (target == NUMA_NO_NODE) 40262306a36Sopenharmony_ci break; 40362306a36Sopenharmony_ci 40462306a36Sopenharmony_ci distance = node_distance(node, target); 40562306a36Sopenharmony_ci if (distance == best_distance || best_distance == -1) { 40662306a36Sopenharmony_ci best_distance = distance; 40762306a36Sopenharmony_ci node_set(target, nd->preferred); 40862306a36Sopenharmony_ci } else { 40962306a36Sopenharmony_ci break; 41062306a36Sopenharmony_ci } 41162306a36Sopenharmony_ci } while (1); 41262306a36Sopenharmony_ci } 41362306a36Sopenharmony_ci /* 41462306a36Sopenharmony_ci * Promotion is allowed from a memory tier to higher 41562306a36Sopenharmony_ci * memory tier only if the memory tier doesn't include 41662306a36Sopenharmony_ci * compute. We want to skip promotion from a memory tier, 41762306a36Sopenharmony_ci * if any node that is part of the memory tier have CPUs. 41862306a36Sopenharmony_ci * Once we detect such a memory tier, we consider that tier 41962306a36Sopenharmony_ci * as top tiper from which promotion is not allowed. 42062306a36Sopenharmony_ci */ 42162306a36Sopenharmony_ci list_for_each_entry_reverse(memtier, &memory_tiers, list) { 42262306a36Sopenharmony_ci tier_nodes = get_memtier_nodemask(memtier); 42362306a36Sopenharmony_ci nodes_and(tier_nodes, node_states[N_CPU], tier_nodes); 42462306a36Sopenharmony_ci if (!nodes_empty(tier_nodes)) { 42562306a36Sopenharmony_ci /* 42662306a36Sopenharmony_ci * abstract distance below the max value of this memtier 42762306a36Sopenharmony_ci * is considered toptier. 42862306a36Sopenharmony_ci */ 42962306a36Sopenharmony_ci top_tier_adistance = memtier->adistance_start + 43062306a36Sopenharmony_ci MEMTIER_CHUNK_SIZE - 1; 43162306a36Sopenharmony_ci break; 43262306a36Sopenharmony_ci } 43362306a36Sopenharmony_ci } 43462306a36Sopenharmony_ci /* 43562306a36Sopenharmony_ci * Now build the lower_tier mask for each node collecting node mask from 43662306a36Sopenharmony_ci * all memory tier below it. This allows us to fallback demotion page 43762306a36Sopenharmony_ci * allocation to a set of nodes that is closer the above selected 43862306a36Sopenharmony_ci * perferred node. 43962306a36Sopenharmony_ci */ 44062306a36Sopenharmony_ci lower_tier = node_states[N_MEMORY]; 44162306a36Sopenharmony_ci list_for_each_entry(memtier, &memory_tiers, list) { 44262306a36Sopenharmony_ci /* 44362306a36Sopenharmony_ci * Keep removing current tier from lower_tier nodes, 44462306a36Sopenharmony_ci * This will remove all nodes in current and above 44562306a36Sopenharmony_ci * memory tier from the lower_tier mask. 44662306a36Sopenharmony_ci */ 44762306a36Sopenharmony_ci tier_nodes = get_memtier_nodemask(memtier); 44862306a36Sopenharmony_ci nodes_andnot(lower_tier, lower_tier, tier_nodes); 44962306a36Sopenharmony_ci memtier->lower_tier_mask = lower_tier; 45062306a36Sopenharmony_ci } 45162306a36Sopenharmony_ci} 45262306a36Sopenharmony_ci 45362306a36Sopenharmony_ci#else 45462306a36Sopenharmony_cistatic inline void establish_demotion_targets(void) {} 45562306a36Sopenharmony_ci#endif /* CONFIG_MIGRATION */ 45662306a36Sopenharmony_ci 45762306a36Sopenharmony_cistatic inline void __init_node_memory_type(int node, struct memory_dev_type *memtype) 45862306a36Sopenharmony_ci{ 45962306a36Sopenharmony_ci if (!node_memory_types[node].memtype) 46062306a36Sopenharmony_ci node_memory_types[node].memtype = memtype; 46162306a36Sopenharmony_ci /* 46262306a36Sopenharmony_ci * for each device getting added in the same NUMA node 46362306a36Sopenharmony_ci * with this specific memtype, bump the map count. We 46462306a36Sopenharmony_ci * Only take memtype device reference once, so that 46562306a36Sopenharmony_ci * changing a node memtype can be done by droping the 46662306a36Sopenharmony_ci * only reference count taken here. 46762306a36Sopenharmony_ci */ 46862306a36Sopenharmony_ci 46962306a36Sopenharmony_ci if (node_memory_types[node].memtype == memtype) { 47062306a36Sopenharmony_ci if (!node_memory_types[node].map_count++) 47162306a36Sopenharmony_ci kref_get(&memtype->kref); 47262306a36Sopenharmony_ci } 47362306a36Sopenharmony_ci} 47462306a36Sopenharmony_ci 47562306a36Sopenharmony_cistatic struct memory_tier *set_node_memory_tier(int node) 47662306a36Sopenharmony_ci{ 47762306a36Sopenharmony_ci struct memory_tier *memtier; 47862306a36Sopenharmony_ci struct memory_dev_type *memtype; 47962306a36Sopenharmony_ci pg_data_t *pgdat = NODE_DATA(node); 48062306a36Sopenharmony_ci 48162306a36Sopenharmony_ci 48262306a36Sopenharmony_ci lockdep_assert_held_once(&memory_tier_lock); 48362306a36Sopenharmony_ci 48462306a36Sopenharmony_ci if (!node_state(node, N_MEMORY)) 48562306a36Sopenharmony_ci return ERR_PTR(-EINVAL); 48662306a36Sopenharmony_ci 48762306a36Sopenharmony_ci __init_node_memory_type(node, default_dram_type); 48862306a36Sopenharmony_ci 48962306a36Sopenharmony_ci memtype = node_memory_types[node].memtype; 49062306a36Sopenharmony_ci node_set(node, memtype->nodes); 49162306a36Sopenharmony_ci memtier = find_create_memory_tier(memtype); 49262306a36Sopenharmony_ci if (!IS_ERR(memtier)) 49362306a36Sopenharmony_ci rcu_assign_pointer(pgdat->memtier, memtier); 49462306a36Sopenharmony_ci return memtier; 49562306a36Sopenharmony_ci} 49662306a36Sopenharmony_ci 49762306a36Sopenharmony_cistatic void destroy_memory_tier(struct memory_tier *memtier) 49862306a36Sopenharmony_ci{ 49962306a36Sopenharmony_ci list_del(&memtier->list); 50062306a36Sopenharmony_ci device_unregister(&memtier->dev); 50162306a36Sopenharmony_ci} 50262306a36Sopenharmony_ci 50362306a36Sopenharmony_cistatic bool clear_node_memory_tier(int node) 50462306a36Sopenharmony_ci{ 50562306a36Sopenharmony_ci bool cleared = false; 50662306a36Sopenharmony_ci pg_data_t *pgdat; 50762306a36Sopenharmony_ci struct memory_tier *memtier; 50862306a36Sopenharmony_ci 50962306a36Sopenharmony_ci pgdat = NODE_DATA(node); 51062306a36Sopenharmony_ci if (!pgdat) 51162306a36Sopenharmony_ci return false; 51262306a36Sopenharmony_ci 51362306a36Sopenharmony_ci /* 51462306a36Sopenharmony_ci * Make sure that anybody looking at NODE_DATA who finds 51562306a36Sopenharmony_ci * a valid memtier finds memory_dev_types with nodes still 51662306a36Sopenharmony_ci * linked to the memtier. We achieve this by waiting for 51762306a36Sopenharmony_ci * rcu read section to finish using synchronize_rcu. 51862306a36Sopenharmony_ci * This also enables us to free the destroyed memory tier 51962306a36Sopenharmony_ci * with kfree instead of kfree_rcu 52062306a36Sopenharmony_ci */ 52162306a36Sopenharmony_ci memtier = __node_get_memory_tier(node); 52262306a36Sopenharmony_ci if (memtier) { 52362306a36Sopenharmony_ci struct memory_dev_type *memtype; 52462306a36Sopenharmony_ci 52562306a36Sopenharmony_ci rcu_assign_pointer(pgdat->memtier, NULL); 52662306a36Sopenharmony_ci synchronize_rcu(); 52762306a36Sopenharmony_ci memtype = node_memory_types[node].memtype; 52862306a36Sopenharmony_ci node_clear(node, memtype->nodes); 52962306a36Sopenharmony_ci if (nodes_empty(memtype->nodes)) { 53062306a36Sopenharmony_ci list_del_init(&memtype->tier_sibiling); 53162306a36Sopenharmony_ci if (list_empty(&memtier->memory_types)) 53262306a36Sopenharmony_ci destroy_memory_tier(memtier); 53362306a36Sopenharmony_ci } 53462306a36Sopenharmony_ci cleared = true; 53562306a36Sopenharmony_ci } 53662306a36Sopenharmony_ci return cleared; 53762306a36Sopenharmony_ci} 53862306a36Sopenharmony_ci 53962306a36Sopenharmony_cistatic void release_memtype(struct kref *kref) 54062306a36Sopenharmony_ci{ 54162306a36Sopenharmony_ci struct memory_dev_type *memtype; 54262306a36Sopenharmony_ci 54362306a36Sopenharmony_ci memtype = container_of(kref, struct memory_dev_type, kref); 54462306a36Sopenharmony_ci kfree(memtype); 54562306a36Sopenharmony_ci} 54662306a36Sopenharmony_ci 54762306a36Sopenharmony_cistruct memory_dev_type *alloc_memory_type(int adistance) 54862306a36Sopenharmony_ci{ 54962306a36Sopenharmony_ci struct memory_dev_type *memtype; 55062306a36Sopenharmony_ci 55162306a36Sopenharmony_ci memtype = kmalloc(sizeof(*memtype), GFP_KERNEL); 55262306a36Sopenharmony_ci if (!memtype) 55362306a36Sopenharmony_ci return ERR_PTR(-ENOMEM); 55462306a36Sopenharmony_ci 55562306a36Sopenharmony_ci memtype->adistance = adistance; 55662306a36Sopenharmony_ci INIT_LIST_HEAD(&memtype->tier_sibiling); 55762306a36Sopenharmony_ci memtype->nodes = NODE_MASK_NONE; 55862306a36Sopenharmony_ci kref_init(&memtype->kref); 55962306a36Sopenharmony_ci return memtype; 56062306a36Sopenharmony_ci} 56162306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(alloc_memory_type); 56262306a36Sopenharmony_ci 56362306a36Sopenharmony_civoid put_memory_type(struct memory_dev_type *memtype) 56462306a36Sopenharmony_ci{ 56562306a36Sopenharmony_ci kref_put(&memtype->kref, release_memtype); 56662306a36Sopenharmony_ci} 56762306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(put_memory_type); 56862306a36Sopenharmony_ci 56962306a36Sopenharmony_civoid init_node_memory_type(int node, struct memory_dev_type *memtype) 57062306a36Sopenharmony_ci{ 57162306a36Sopenharmony_ci 57262306a36Sopenharmony_ci mutex_lock(&memory_tier_lock); 57362306a36Sopenharmony_ci __init_node_memory_type(node, memtype); 57462306a36Sopenharmony_ci mutex_unlock(&memory_tier_lock); 57562306a36Sopenharmony_ci} 57662306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(init_node_memory_type); 57762306a36Sopenharmony_ci 57862306a36Sopenharmony_civoid clear_node_memory_type(int node, struct memory_dev_type *memtype) 57962306a36Sopenharmony_ci{ 58062306a36Sopenharmony_ci mutex_lock(&memory_tier_lock); 58162306a36Sopenharmony_ci if (node_memory_types[node].memtype == memtype) 58262306a36Sopenharmony_ci node_memory_types[node].map_count--; 58362306a36Sopenharmony_ci /* 58462306a36Sopenharmony_ci * If we umapped all the attached devices to this node, 58562306a36Sopenharmony_ci * clear the node memory type. 58662306a36Sopenharmony_ci */ 58762306a36Sopenharmony_ci if (!node_memory_types[node].map_count) { 58862306a36Sopenharmony_ci node_memory_types[node].memtype = NULL; 58962306a36Sopenharmony_ci put_memory_type(memtype); 59062306a36Sopenharmony_ci } 59162306a36Sopenharmony_ci mutex_unlock(&memory_tier_lock); 59262306a36Sopenharmony_ci} 59362306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(clear_node_memory_type); 59462306a36Sopenharmony_ci 59562306a36Sopenharmony_cistatic int __meminit memtier_hotplug_callback(struct notifier_block *self, 59662306a36Sopenharmony_ci unsigned long action, void *_arg) 59762306a36Sopenharmony_ci{ 59862306a36Sopenharmony_ci struct memory_tier *memtier; 59962306a36Sopenharmony_ci struct memory_notify *arg = _arg; 60062306a36Sopenharmony_ci 60162306a36Sopenharmony_ci /* 60262306a36Sopenharmony_ci * Only update the node migration order when a node is 60362306a36Sopenharmony_ci * changing status, like online->offline. 60462306a36Sopenharmony_ci */ 60562306a36Sopenharmony_ci if (arg->status_change_nid < 0) 60662306a36Sopenharmony_ci return notifier_from_errno(0); 60762306a36Sopenharmony_ci 60862306a36Sopenharmony_ci switch (action) { 60962306a36Sopenharmony_ci case MEM_OFFLINE: 61062306a36Sopenharmony_ci mutex_lock(&memory_tier_lock); 61162306a36Sopenharmony_ci if (clear_node_memory_tier(arg->status_change_nid)) 61262306a36Sopenharmony_ci establish_demotion_targets(); 61362306a36Sopenharmony_ci mutex_unlock(&memory_tier_lock); 61462306a36Sopenharmony_ci break; 61562306a36Sopenharmony_ci case MEM_ONLINE: 61662306a36Sopenharmony_ci mutex_lock(&memory_tier_lock); 61762306a36Sopenharmony_ci memtier = set_node_memory_tier(arg->status_change_nid); 61862306a36Sopenharmony_ci if (!IS_ERR(memtier)) 61962306a36Sopenharmony_ci establish_demotion_targets(); 62062306a36Sopenharmony_ci mutex_unlock(&memory_tier_lock); 62162306a36Sopenharmony_ci break; 62262306a36Sopenharmony_ci } 62362306a36Sopenharmony_ci 62462306a36Sopenharmony_ci return notifier_from_errno(0); 62562306a36Sopenharmony_ci} 62662306a36Sopenharmony_ci 62762306a36Sopenharmony_cistatic int __init memory_tier_init(void) 62862306a36Sopenharmony_ci{ 62962306a36Sopenharmony_ci int ret, node; 63062306a36Sopenharmony_ci struct memory_tier *memtier; 63162306a36Sopenharmony_ci 63262306a36Sopenharmony_ci ret = subsys_virtual_register(&memory_tier_subsys, NULL); 63362306a36Sopenharmony_ci if (ret) 63462306a36Sopenharmony_ci panic("%s() failed to register memory tier subsystem\n", __func__); 63562306a36Sopenharmony_ci 63662306a36Sopenharmony_ci#ifdef CONFIG_MIGRATION 63762306a36Sopenharmony_ci node_demotion = kcalloc(nr_node_ids, sizeof(struct demotion_nodes), 63862306a36Sopenharmony_ci GFP_KERNEL); 63962306a36Sopenharmony_ci WARN_ON(!node_demotion); 64062306a36Sopenharmony_ci#endif 64162306a36Sopenharmony_ci mutex_lock(&memory_tier_lock); 64262306a36Sopenharmony_ci /* 64362306a36Sopenharmony_ci * For now we can have 4 faster memory tiers with smaller adistance 64462306a36Sopenharmony_ci * than default DRAM tier. 64562306a36Sopenharmony_ci */ 64662306a36Sopenharmony_ci default_dram_type = alloc_memory_type(MEMTIER_ADISTANCE_DRAM); 64762306a36Sopenharmony_ci if (IS_ERR(default_dram_type)) 64862306a36Sopenharmony_ci panic("%s() failed to allocate default DRAM tier\n", __func__); 64962306a36Sopenharmony_ci 65062306a36Sopenharmony_ci /* 65162306a36Sopenharmony_ci * Look at all the existing N_MEMORY nodes and add them to 65262306a36Sopenharmony_ci * default memory tier or to a tier if we already have memory 65362306a36Sopenharmony_ci * types assigned. 65462306a36Sopenharmony_ci */ 65562306a36Sopenharmony_ci for_each_node_state(node, N_MEMORY) { 65662306a36Sopenharmony_ci memtier = set_node_memory_tier(node); 65762306a36Sopenharmony_ci if (IS_ERR(memtier)) 65862306a36Sopenharmony_ci /* 65962306a36Sopenharmony_ci * Continue with memtiers we are able to setup 66062306a36Sopenharmony_ci */ 66162306a36Sopenharmony_ci break; 66262306a36Sopenharmony_ci } 66362306a36Sopenharmony_ci establish_demotion_targets(); 66462306a36Sopenharmony_ci mutex_unlock(&memory_tier_lock); 66562306a36Sopenharmony_ci 66662306a36Sopenharmony_ci hotplug_memory_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRI); 66762306a36Sopenharmony_ci return 0; 66862306a36Sopenharmony_ci} 66962306a36Sopenharmony_cisubsys_initcall(memory_tier_init); 67062306a36Sopenharmony_ci 67162306a36Sopenharmony_cibool numa_demotion_enabled = false; 67262306a36Sopenharmony_ci 67362306a36Sopenharmony_ci#ifdef CONFIG_MIGRATION 67462306a36Sopenharmony_ci#ifdef CONFIG_SYSFS 67562306a36Sopenharmony_cistatic ssize_t demotion_enabled_show(struct kobject *kobj, 67662306a36Sopenharmony_ci struct kobj_attribute *attr, char *buf) 67762306a36Sopenharmony_ci{ 67862306a36Sopenharmony_ci return sysfs_emit(buf, "%s\n", 67962306a36Sopenharmony_ci numa_demotion_enabled ? "true" : "false"); 68062306a36Sopenharmony_ci} 68162306a36Sopenharmony_ci 68262306a36Sopenharmony_cistatic ssize_t demotion_enabled_store(struct kobject *kobj, 68362306a36Sopenharmony_ci struct kobj_attribute *attr, 68462306a36Sopenharmony_ci const char *buf, size_t count) 68562306a36Sopenharmony_ci{ 68662306a36Sopenharmony_ci ssize_t ret; 68762306a36Sopenharmony_ci 68862306a36Sopenharmony_ci ret = kstrtobool(buf, &numa_demotion_enabled); 68962306a36Sopenharmony_ci if (ret) 69062306a36Sopenharmony_ci return ret; 69162306a36Sopenharmony_ci 69262306a36Sopenharmony_ci return count; 69362306a36Sopenharmony_ci} 69462306a36Sopenharmony_ci 69562306a36Sopenharmony_cistatic struct kobj_attribute numa_demotion_enabled_attr = 69662306a36Sopenharmony_ci __ATTR_RW(demotion_enabled); 69762306a36Sopenharmony_ci 69862306a36Sopenharmony_cistatic struct attribute *numa_attrs[] = { 69962306a36Sopenharmony_ci &numa_demotion_enabled_attr.attr, 70062306a36Sopenharmony_ci NULL, 70162306a36Sopenharmony_ci}; 70262306a36Sopenharmony_ci 70362306a36Sopenharmony_cistatic const struct attribute_group numa_attr_group = { 70462306a36Sopenharmony_ci .attrs = numa_attrs, 70562306a36Sopenharmony_ci}; 70662306a36Sopenharmony_ci 70762306a36Sopenharmony_cistatic int __init numa_init_sysfs(void) 70862306a36Sopenharmony_ci{ 70962306a36Sopenharmony_ci int err; 71062306a36Sopenharmony_ci struct kobject *numa_kobj; 71162306a36Sopenharmony_ci 71262306a36Sopenharmony_ci numa_kobj = kobject_create_and_add("numa", mm_kobj); 71362306a36Sopenharmony_ci if (!numa_kobj) { 71462306a36Sopenharmony_ci pr_err("failed to create numa kobject\n"); 71562306a36Sopenharmony_ci return -ENOMEM; 71662306a36Sopenharmony_ci } 71762306a36Sopenharmony_ci err = sysfs_create_group(numa_kobj, &numa_attr_group); 71862306a36Sopenharmony_ci if (err) { 71962306a36Sopenharmony_ci pr_err("failed to register numa group\n"); 72062306a36Sopenharmony_ci goto delete_obj; 72162306a36Sopenharmony_ci } 72262306a36Sopenharmony_ci return 0; 72362306a36Sopenharmony_ci 72462306a36Sopenharmony_cidelete_obj: 72562306a36Sopenharmony_ci kobject_put(numa_kobj); 72662306a36Sopenharmony_ci return err; 72762306a36Sopenharmony_ci} 72862306a36Sopenharmony_cisubsys_initcall(numa_init_sysfs); 72962306a36Sopenharmony_ci#endif /* CONFIG_SYSFS */ 73062306a36Sopenharmony_ci#endif 731