// SPDX-License-Identifier: GPL-2.0-only
/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Support the following policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave     Allocate memory interleaved over a set of nodes,
 *                with normal fallback if it fails.
 *                For VMA based allocations this interleaves based on the
 *                offset into the backing object or offset into the mapping
 *                for anonymous memory. For process policy a process counter
 *                is used.
 *
 * bind           Only allocate memory on a specific set of nodes,
 *                no fallback.
 *                FIXME: memory is allocated starting with the first node
 *                to the last. It would be better if bind would truly restrict
 *                the allocation to memory nodes instead.
 *
 * preferred      Try a specific node first before normal fallback.
 *                As a special case NUMA_NO_NODE here means do the allocation
 *                on the local CPU. This is normally identical to default,
 *                but useful to set in a VMA when you have a non default
 *                process policy.
 *
 * preferred many Try a set of nodes first before normal fallback. This is
 *                similar to preferred without the special case.
 *
 * default        Allocate on the local node first, or when on a VMA
 *                use the process policy. This is what Linux always did
 *                in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */
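
/*
 * Illustrative userspace sketch (not part of this file; assumes a Linux
 * system with the <numaif.h> syscall wrappers, e.g. from libnuma): how a
 * task would request the interleave policy described above, process-wide
 * via set_mempolicy() and per-mapping via mbind(). Node numbers are
 * hypothetical.
 *
 *	#include <numaif.h>
 *	#include <sys/mman.h>
 *
 *	unsigned long nodes = (1UL << 0) | (1UL << 1);	// nodes 0 and 1
 *
 *	// Interleave all future allocations of this task.
 *	set_mempolicy(MPOL_INTERLEAVE, &nodes, 8 * sizeof(nodes));
 *
 *	// Or set a per-VMA policy, which takes priority on page faults.
 *	void *buf = mmap(NULL, 1 << 20, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	mbind(buf, 1 << 20, MPOL_INTERLEAVE, &nodes, 8 * sizeof(nodes), 0);
 */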

/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always grateful with that.
*/

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mempolicy.h>
#include <linux/pagewalk.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/export.h>
#include <linux/nsproxy.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/ptrace.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/migrate.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/ctype.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/printk.h>
#include <linux/swapops.h>

#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include <linux/uaccess.h>

#include "internal.h"

/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
#define MPOL_MF_INVERT       (MPOL_MF_INTERNAL << 1)	/* Invert check for nodemask */

static struct kmem_cache *policy_cache;
static struct kmem_cache *sn_cache;
/* Highest zone. A specific allocation for a zone below that is not
   policied. */
enum zone_type policy_zone = 0;

/*
 * run-time system-wide default policy => local allocation
 */
static struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.mode = MPOL_LOCAL,
};

static struct mempolicy preferred_node_policy[MAX_NUMNODES];

/**
 * numa_nearest_node - Find nearest node by state
 * @node: Node id to start the search
 * @state: State to filter the search
 *
 * Lookup the closest node by distance if @node is not in state.
 *
 * Return: this @node if it is in state, otherwise the closest node by distance
 */
int numa_nearest_node(int node, unsigned int state)
{
	int min_dist = INT_MAX, dist, n, min_node;

	if (state >= NR_NODE_STATES)
		return -EINVAL;

	if (node == NUMA_NO_NODE || node_state(node, state))
		return node;

	min_node = node;
	for_each_node_state(n, state) {
		dist = node_distance(node, n);
		if (dist < min_dist) {
			min_dist = dist;
			min_node = n;
		}
	}

	return min_node;
}
EXPORT_SYMBOL_GPL(numa_nearest_node);

struct mempolicy *get_task_policy(struct task_struct *p)
{
	struct mempolicy *pol = p->mempolicy;
	int node;

	if (pol)
		return pol;

	node = numa_node_id();
	if (node != NUMA_NO_NODE) {
		pol = &preferred_node_policy[node];
		/* preferred_node_policy is not initialised early in boot */
		if (pol->mode)
			return pol;
	}

	return &default_policy;
}

static const struct mempolicy_operations {
	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
} mpol_ops[MPOL_MAX];

static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
{
	return pol->flags & MPOL_MODE_FLAGS;
}

static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
				   const nodemask_t *rel)
{
	nodemask_t tmp;
	nodes_fold(tmp, *orig, nodes_weight(*rel));
	nodes_onto(*ret, tmp, *rel);
}
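
/*
 * Worked example for the helper above (a sketch; node numbers are
 * hypothetical): with MPOL_F_RELATIVE_NODES, a user nodemask of {0,2}
 * relative to an allowed set {4,5,6,7} is first folded modulo
 * nodes_weight(*rel) == 4 (still {0,2}), then mapped onto the allowed
 * set bit-for-bit: relative bit 0 -> node 4, relative bit 2 -> node 6,
 * giving an effective nodemask of {4,6}.
 */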

static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (nodes_empty(*nodes))
		return -EINVAL;
	pol->nodes = *nodes;
	return 0;
}

static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (nodes_empty(*nodes))
		return -EINVAL;

	nodes_clear(pol->nodes);
	node_set(first_node(*nodes), pol->nodes);
	return 0;
}

/*
 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 * any, for the new policy. mpol_new() has already validated the nodes
 * parameter with respect to the policy mode and flags.
 *
 * Must be called holding task's alloc_lock to protect task's mems_allowed
 * and mempolicy. May also be called holding the mmap_lock for write.
 */
static int mpol_set_nodemask(struct mempolicy *pol,
		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
{
	int ret;

	/*
	 * Default (pol==NULL) resp. local memory policies are not a
	 * subject of any remapping. They also do not need any special
	 * constructor.
	 */
	if (!pol || pol->mode == MPOL_LOCAL)
		return 0;

	/* Check N_MEMORY */
	nodes_and(nsc->mask1,
		  cpuset_current_mems_allowed, node_states[N_MEMORY]);

	VM_BUG_ON(!nodes);

	if (pol->flags & MPOL_F_RELATIVE_NODES)
		mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
	else
		nodes_and(nsc->mask2, *nodes, nsc->mask1);

	if (mpol_store_user_nodemask(pol))
		pol->w.user_nodemask = *nodes;
	else
		pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;

	ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
	return ret;
}

/*
 * This function just creates a new policy, does some checks and simple
 * initialization. You must invoke mpol_set_nodemask() to set nodes.
 */
static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
				  nodemask_t *nodes)
{
	struct mempolicy *policy;

	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
		 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);

	if (mode == MPOL_DEFAULT) {
		if (nodes && !nodes_empty(*nodes))
			return ERR_PTR(-EINVAL);
		return NULL;
	}
	VM_BUG_ON(!nodes);

	/*
	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
	 * All other modes require a valid pointer to a non-empty nodemask.
	 */
	if (mode == MPOL_PREFERRED) {
		if (nodes_empty(*nodes)) {
			if (((flags & MPOL_F_STATIC_NODES) ||
			     (flags & MPOL_F_RELATIVE_NODES)))
				return ERR_PTR(-EINVAL);

			mode = MPOL_LOCAL;
		}
	} else if (mode == MPOL_LOCAL) {
		if (!nodes_empty(*nodes) ||
		    (flags & MPOL_F_STATIC_NODES) ||
		    (flags & MPOL_F_RELATIVE_NODES))
			return ERR_PTR(-EINVAL);
	} else if (nodes_empty(*nodes))
		return ERR_PTR(-EINVAL);
	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!policy)
		return ERR_PTR(-ENOMEM);
	atomic_set(&policy->refcnt, 1);
	policy->mode = mode;
	policy->flags = flags;
	policy->home_node = NUMA_NO_NODE;

	return policy;
}

/* Slow path of a mpol destructor. */
void __mpol_put(struct mempolicy *p)
{
	if (!atomic_dec_and_test(&p->refcnt))
		return;
	kmem_cache_free(policy_cache, p);
}

static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
{
}

static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
{
	nodemask_t tmp;

	if (pol->flags & MPOL_F_STATIC_NODES)
		nodes_and(tmp, pol->w.user_nodemask, *nodes);
	else if (pol->flags & MPOL_F_RELATIVE_NODES)
		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
	else {
		nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed,
			    *nodes);
		pol->w.cpuset_mems_allowed = *nodes;
	}

	if (nodes_empty(tmp))
		tmp = *nodes;

	pol->nodes = tmp;
}
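
/*
 * Worked example for the rebind above (hypothetical node numbers): take a
 * MPOL_BIND policy over {1,2} whose cpuset moves from allowed nodes {0-3}
 * to {4-7}. With MPOL_F_STATIC_NODES the new mask is user_nodemask & new
 * == {1,2} & {4-7} == empty, so it falls back to the whole new set. With
 * MPOL_F_RELATIVE_NODES it becomes {5,6} (positions 1 and 2 within the
 * new set). With neither flag, nodes_remap() likewise translates {1,2}
 * position-for-position to {5,6}.
 */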

static void mpol_rebind_preferred(struct mempolicy *pol,
				  const nodemask_t *nodes)
{
	pol->w.cpuset_mems_allowed = *nodes;
}

/*
 * mpol_rebind_policy - Migrate a policy to a different set of nodes
 *
 * Per-vma policies are protected by mmap_lock. Allocations using per-task
 * policies are protected by task->mems_allowed_seq to prevent a premature
 * OOM/allocation failure due to parallel nodemask modification.
 */
static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
{
	if (!pol || pol->mode == MPOL_LOCAL)
		return;
	if (!mpol_store_user_nodemask(pol) &&
	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
		return;

	mpol_ops[pol->mode].rebind(pol, newmask);
}

/*
 * Wrapper for mpol_rebind_policy() that just requires task
 * pointer, and updates task mempolicy.
 *
 * Called with task's alloc_lock held.
 */

void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
{
	mpol_rebind_policy(tsk->mempolicy, new);
}

/*
 * Rebind each vma in mm to new nodemask.
 *
 * Call holding a reference to mm.  Takes mm->mmap_lock during call.
 */

void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
	struct vm_area_struct *vma;
	VMA_ITERATOR(vmi, mm, 0);

	mmap_write_lock(mm);
	for_each_vma(vmi, vma) {
		vma_start_write(vma);
		mpol_rebind_policy(vma->vm_policy, new);
	}
	mmap_write_unlock(mm);
}

static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
	[MPOL_DEFAULT] = {
		.rebind = mpol_rebind_default,
	},
	[MPOL_INTERLEAVE] = {
		.create = mpol_new_nodemask,
		.rebind = mpol_rebind_nodemask,
	},
	[MPOL_PREFERRED] = {
		.create = mpol_new_preferred,
		.rebind = mpol_rebind_preferred,
	},
	[MPOL_BIND] = {
		.create = mpol_new_nodemask,
		.rebind = mpol_rebind_nodemask,
	},
	[MPOL_LOCAL] = {
		.rebind = mpol_rebind_default,
	},
	[MPOL_PREFERRED_MANY] = {
		.create = mpol_new_nodemask,
		.rebind = mpol_rebind_preferred,
	},
};

static int migrate_folio_add(struct folio *folio, struct list_head *foliolist,
			     unsigned long flags);

struct queue_pages {
	struct list_head *pagelist;
	unsigned long flags;
	nodemask_t *nmask;
	unsigned long start;
	unsigned long end;
	struct vm_area_struct *first;
	bool has_unmovable;
};

/*
 * Check if the folio's nid is in qp->nmask.
 *
 * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
 * in the invert of qp->nmask.
 */
static inline bool queue_folio_required(struct folio *folio,
					struct queue_pages *qp)
{
	int nid = folio_nid(folio);
	unsigned long flags = qp->flags;

	return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
}

/*
 * queue_folios_pmd() has two possible return values:
 * 0 - folios are placed on the right node or queued successfully, or
 *     special page is met, i.e. zero page, or unmovable page is found
 *     but continue walking (indicated by queue_pages.has_unmovable).
 * -EIO - is migration entry or only MPOL_MF_STRICT was specified and an
 *        existing folio was already on a node that does not follow the
 *        policy.
 */
static int queue_folios_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
			    unsigned long end, struct mm_walk *walk)
	__releases(ptl)
{
	int ret = 0;
	struct folio *folio;
	struct queue_pages *qp = walk->private;
	unsigned long flags;

	if (unlikely(is_pmd_migration_entry(*pmd))) {
		ret = -EIO;
		goto unlock;
	}
	folio = pfn_folio(pmd_pfn(*pmd));
	if (is_huge_zero_page(&folio->page)) {
		walk->action = ACTION_CONTINUE;
		goto unlock;
	}
	if (!queue_folio_required(folio, qp))
		goto unlock;

	flags = qp->flags;
	/* go to folio migration */
	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
		if (!vma_migratable(walk->vma) ||
		    migrate_folio_add(folio, qp->pagelist, flags)) {
			qp->has_unmovable = true;
			goto unlock;
		}
	} else
		ret = -EIO;
unlock:
	spin_unlock(ptl);
	return ret;
}

/*
 * Scan through pages checking if pages follow certain conditions,
 * and move them to the pagelist if they do.
 *
 * queue_folios_pte_range() has two possible return values:
 * 0 - folios are placed on the right node or queued successfully, or
 *     special page is met, i.e. zero page, or unmovable page is found
 *     but continue walking (indicated by queue_pages.has_unmovable).
 * -EIO - only MPOL_MF_STRICT was specified and an existing folio was already
 *        on a node that does not follow the policy.
 */
static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
				  unsigned long end, struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	struct folio *folio;
	struct queue_pages *qp = walk->private;
	unsigned long flags = qp->flags;
	pte_t *pte, *mapped_pte;
	pte_t ptent;
	spinlock_t *ptl;

	ptl = pmd_trans_huge_lock(pmd, vma);
	if (ptl)
		return queue_folios_pmd(pmd, ptl, addr, end, walk);

	mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
	if (!pte) {
		walk->action = ACTION_AGAIN;
		return 0;
	}
	for (; addr != end; pte++, addr += PAGE_SIZE) {
		ptent = ptep_get(pte);
		if (!pte_present(ptent))
			continue;
		folio = vm_normal_folio(vma, addr, ptent);
		if (!folio || folio_is_zone_device(folio))
			continue;
		/*
		 * vm_normal_folio() filters out zero pages, but there might
		 * still be reserved folios to skip, perhaps in a VDSO.
		 */
		if (folio_test_reserved(folio))
			continue;
		if (!queue_folio_required(folio, qp))
			continue;
		if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
			/*
			 * MPOL_MF_STRICT must be specified if we get here.
			 * Continue walking vmas due to MPOL_MF_MOVE* flags.
			 */
			if (!vma_migratable(vma))
				qp->has_unmovable = true;

			/*
			 * Do not abort immediately since there may be
			 * temporary off LRU pages in the range.  Still
			 * need migrate other LRU pages.
			 */
			if (migrate_folio_add(folio, qp->pagelist, flags))
				qp->has_unmovable = true;
		} else
			break;
	}
	pte_unmap_unlock(mapped_pte, ptl);
	cond_resched();

	return addr != end ? -EIO : 0;
}

static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask,
				unsigned long addr, unsigned long end,
				struct mm_walk *walk)
{
	int ret = 0;
#ifdef CONFIG_HUGETLB_PAGE
	struct queue_pages *qp = walk->private;
	unsigned long flags = (qp->flags & MPOL_MF_VALID);
	struct folio *folio;
	spinlock_t *ptl;
	pte_t entry;

	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
	entry = huge_ptep_get(pte);
	if (!pte_present(entry))
		goto unlock;
	folio = pfn_folio(pte_pfn(entry));
	if (!queue_folio_required(folio, qp))
		goto unlock;

	if (flags == MPOL_MF_STRICT) {
		/*
		 * STRICT alone means only detecting misplaced folio and no
		 * need to further check other vma.
		 */
		ret = -EIO;
		goto unlock;
	}

	if (!vma_migratable(walk->vma)) {
		/*
		 * Must be STRICT with MOVE*, otherwise .test_walk() would
		 * have stopped walking the current vma.
		 * Detecting misplaced folio but allow migrating folios which
		 * have been queued.
		 */
		qp->has_unmovable = true;
		goto unlock;
	}

	/*
	 * With MPOL_MF_MOVE, we try to migrate only unshared folios. If it
	 * is shared it is likely not worth migrating.
	 *
	 * To check if the folio is shared, ideally we want to make sure
	 * every page is mapped to the same process. Doing that is very
	 * expensive, so check the estimated mapcount of the folio instead.
	 */
	if (flags & (MPOL_MF_MOVE_ALL) ||
	    (flags & MPOL_MF_MOVE && folio_estimated_sharers(folio) == 1 &&
	     !hugetlb_pmd_shared(pte))) {
		if (!isolate_hugetlb(folio, qp->pagelist) &&
		    (flags & MPOL_MF_STRICT))
			/*
			 * Failed to isolate folio but allow migrating pages
			 * which have been queued.
			 */
			qp->has_unmovable = true;
	}
unlock:
	spin_unlock(ptl);
#else
	BUG();
#endif
	return ret;
}
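
/*
 * Orientation sketch, derived from the queue_folios_* callbacks above:
 * MPOL_MF_STRICT alone reports a misplaced folio with -EIO;
 * MPOL_MF_MOVE/MPOL_MF_MOVE_ALL try to isolate the folio onto
 * qp->pagelist; STRICT combined with MOVE* keeps walking and records
 * isolation failures in qp->has_unmovable so the caller can report them
 * once the walk finishes.
 */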

#ifdef CONFIG_NUMA_BALANCING
/*
 * This is used to mark a range of virtual addresses to be inaccessible.
 * These are later cleared by a NUMA hinting fault. Depending on these
 * faults, pages may be migrated for better NUMA placement.
 *
 * This is assuming that NUMA faults are handled using PROT_NONE. If
 * an architecture makes a different choice, it will need further
 * changes to the core.
 */
unsigned long change_prot_numa(struct vm_area_struct *vma,
			       unsigned long addr, unsigned long end)
{
	struct mmu_gather tlb;
	long nr_updated;

	tlb_gather_mmu(&tlb, vma->vm_mm);

	nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA);
	if (nr_updated > 0)
		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);

	tlb_finish_mmu(&tlb);

	return nr_updated;
}
#else
static unsigned long change_prot_numa(struct vm_area_struct *vma,
				      unsigned long addr, unsigned long end)
{
	return 0;
}
#endif /* CONFIG_NUMA_BALANCING */

static int queue_pages_test_walk(unsigned long start, unsigned long end,
				 struct mm_walk *walk)
{
	struct vm_area_struct *next, *vma = walk->vma;
	struct queue_pages *qp = walk->private;
	unsigned long endvma = vma->vm_end;
	unsigned long flags = qp->flags;

	/* range check first */
	VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma);

	if (!qp->first) {
		qp->first = vma;
		if (!(flags & MPOL_MF_DISCONTIG_OK) &&
		    (qp->start < vma->vm_start))
			/* hole at head side of range */
			return -EFAULT;
	}
	next = find_vma(vma->vm_mm, vma->vm_end);
	if (!(flags & MPOL_MF_DISCONTIG_OK) &&
	    ((vma->vm_end < qp->end) &&
	     (!next || vma->vm_end < next->vm_start)))
		/* hole at middle or tail of range */
		return -EFAULT;

	/*
	 * Need to check MPOL_MF_STRICT to return -EIO if possible
	 * regardless of vma_migratable
	 */
	if (!vma_migratable(vma) &&
	    !(flags & MPOL_MF_STRICT))
		return 1;

	if (endvma > end)
		endvma = end;

	if (flags & MPOL_MF_LAZY) {
		/* Similar to task_numa_work, skip inaccessible VMAs */
		if (!is_vm_hugetlb_page(vma) && vma_is_accessible(vma) &&
		    !(vma->vm_flags & VM_MIXEDMAP))
			change_prot_numa(vma, start, endvma);
		return 1;
	}

	/* queue pages from current vma */
	if (flags & MPOL_MF_VALID)
		return 0;
	return 1;
}

static const struct mm_walk_ops queue_pages_walk_ops = {
	.hugetlb_entry		= queue_folios_hugetlb,
	.pmd_entry		= queue_folios_pte_range,
	.test_walk		= queue_pages_test_walk,
	.walk_lock		= PGWALK_RDLOCK,
};

static const struct mm_walk_ops queue_pages_lock_vma_walk_ops = {
	.hugetlb_entry		= queue_folios_hugetlb,
	.pmd_entry		= queue_folios_pte_range,
	.test_walk		= queue_pages_test_walk,
	.walk_lock		= PGWALK_WRLOCK,
};

/*
 * Walk through page tables and collect pages to be migrated.
 *
 * If pages found in a given range are on a set of nodes (determined by
 * @nodes and @flags), they are isolated and queued to the pagelist which
 * is passed via @private.
 *
 * queue_pages_range() has three possible return values:
 * 1 - there is an unmovable page, but MPOL_MF_MOVE* & MPOL_MF_STRICT were
 *     specified.
 * 0 - pages were queued successfully or there is no misplaced page.
 * errno - i.e. misplaced pages with MPOL_MF_STRICT specified (-EIO) or
 *         memory range specified by nodemask and maxnode points outside
 *         your accessible address space (-EFAULT)
 */
static int
queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
		  nodemask_t *nodes, unsigned long flags,
		  struct list_head *pagelist, bool lock_vma)
{
	int err;
	struct queue_pages qp = {
		.pagelist = pagelist,
		.flags = flags,
		.nmask = nodes,
		.start = start,
		.end = end,
		.first = NULL,
		.has_unmovable = false,
	};
	const struct mm_walk_ops *ops = lock_vma ?
			&queue_pages_lock_vma_walk_ops : &queue_pages_walk_ops;

	err = walk_page_range(mm, start, end, ops, &qp);

	if (qp.has_unmovable)
		err = 1;
	if (!qp.first)
		/* whole range in hole */
		err = -EFAULT;

	return err;
}

/*
 * Apply policy to a single VMA
 * This must be called with the mmap_lock held for writing.
 */
static int vma_replace_policy(struct vm_area_struct *vma,
			      struct mempolicy *pol)
{
	int err;
	struct mempolicy *old;
	struct mempolicy *new;

	vma_assert_write_locked(vma);

	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
		 vma->vm_ops, vma->vm_file,
		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);

	new = mpol_dup(pol);
	if (IS_ERR(new))
		return PTR_ERR(new);

	if (vma->vm_ops && vma->vm_ops->set_policy) {
		err = vma->vm_ops->set_policy(vma, new);
		if (err)
			goto err_out;
	}

	old = vma->vm_policy;
	vma->vm_policy = new; /* protected by mmap_lock */
	mpol_put(old);

	return 0;
 err_out:
	mpol_put(new);
	return err;
}

/* Split or merge the VMA (if required) and apply the new policy */
static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma,
		       struct vm_area_struct **prev, unsigned long start,
		       unsigned long end, struct mempolicy *new_pol)
{
	struct vm_area_struct *merged;
	unsigned long vmstart, vmend;
	pgoff_t pgoff;
	int err;

	vmend = min(end, vma->vm_end);
	if (start > vma->vm_start) {
		*prev = vma;
		vmstart = start;
	} else {
		vmstart = vma->vm_start;
	}

	if (mpol_equal(vma_policy(vma), new_pol)) {
		*prev = vma;
		return 0;
	}

	pgoff = vma->vm_pgoff + ((vmstart - vma->vm_start) >> PAGE_SHIFT);
	merged = vma_merge(vmi, vma->vm_mm, *prev, vmstart, vmend, vma->vm_flags,
			   vma->anon_vma, vma->vm_file, pgoff, new_pol,
			   vma->vm_userfaultfd_ctx, anon_vma_name(vma));
	if (merged) {
		*prev = merged;
		return vma_replace_policy(merged, new_pol);
	}

	if (vma->vm_start != vmstart) {
		err = split_vma(vmi, vma, vmstart, 1);
		if (err)
			return err;
	}

	if (vma->vm_end != vmend) {
		err = split_vma(vmi, vma, vmend, 0);
		if (err)
			return err;
	}

	*prev = vma;
	return vma_replace_policy(vma, new_pol);
}
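
/*
 * Sketch of mbind_range() above (hypothetical addresses): applying a new
 * policy to [0x3000, 0x5000) inside a VMA spanning [0x2000, 0x6000) first
 * tries vma_merge() with the neighbours; if nothing merges, the VMA is
 * split at 0x3000 and again at 0x5000, and only the middle piece gets the
 * new policy via vma_replace_policy().
 */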

/* Set the process memory policy */
static long do_set_mempolicy(unsigned short mode, unsigned short flags,
			     nodemask_t *nodes)
{
	struct mempolicy *new, *old;
	NODEMASK_SCRATCH(scratch);
	int ret;

	if (!scratch)
		return -ENOMEM;

	new = mpol_new(mode, flags, nodes);
	if (IS_ERR(new)) {
		ret = PTR_ERR(new);
		goto out;
	}

	task_lock(current);
	ret = mpol_set_nodemask(new, nodes, scratch);
	if (ret) {
		task_unlock(current);
		mpol_put(new);
		goto out;
	}

	old = current->mempolicy;
	current->mempolicy = new;
	if (new && new->mode == MPOL_INTERLEAVE)
		current->il_prev = MAX_NUMNODES-1;
	task_unlock(current);
	mpol_put(old);
	ret = 0;
out:
	NODEMASK_SCRATCH_FREE(scratch);
	return ret;
}

/*
 * Return nodemask for policy for get_mempolicy() query
 *
 * Called with task's alloc_lock held
 */
static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
{
	nodes_clear(*nodes);
	if (p == &default_policy)
		return;

	switch (p->mode) {
	case MPOL_BIND:
	case MPOL_INTERLEAVE:
	case MPOL_PREFERRED:
	case MPOL_PREFERRED_MANY:
		*nodes = p->nodes;
		break;
	case MPOL_LOCAL:
		/* return empty node mask for local allocation */
		break;
	default:
		BUG();
	}
}

static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
	struct page *p = NULL;
	int ret;

	ret = get_user_pages_fast(addr & PAGE_MASK, 1, 0, &p);
	if (ret > 0) {
		ret = page_to_nid(p);
		put_page(p);
	}
	return ret;
}

/* Retrieve NUMA policy */
static long do_get_mempolicy(int *policy, nodemask_t *nmask,
			     unsigned long addr, unsigned long flags)
{
	int err;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = NULL;
	struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;

	if (flags &
		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
		return -EINVAL;

	if (flags & MPOL_F_MEMS_ALLOWED) {
		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
			return -EINVAL;
		*policy = 0;	/* just so it's initialized */
		task_lock(current);
		*nmask = cpuset_current_mems_allowed;
		task_unlock(current);
		return 0;
	}

	if (flags & MPOL_F_ADDR) {
		/*
		 * Do NOT fall back to task policy if the
		 * vma/shared policy at addr is NULL.  We
		 * want to return MPOL_DEFAULT in this case.
		 */
		mmap_read_lock(mm);
		vma = vma_lookup(mm, addr);
		if (!vma) {
			mmap_read_unlock(mm);
			return -EFAULT;
		}
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else
			pol = vma->vm_policy;
	} else if (addr)
		return -EINVAL;

	if (!pol)
		pol = &default_policy;	/* indicates default behavior */

	if (flags & MPOL_F_NODE) {
		if (flags & MPOL_F_ADDR) {
			/*
			 * Take a refcount on the mpol, because we are about to
			 * drop the mmap_lock, after which only "pol" remains
			 * valid, "vma" is stale.
			 */
			pol_refcount = pol;
			vma = NULL;
			mpol_get(pol);
			mmap_read_unlock(mm);
			err = lookup_node(mm, addr);
			if (err < 0)
				goto out;
			*policy = err;
		} else if (pol == current->mempolicy &&
				pol->mode == MPOL_INTERLEAVE) {
			*policy = next_node_in(current->il_prev, pol->nodes);
		} else {
			err = -EINVAL;
			goto out;
		}
	} else {
		*policy = pol == &default_policy ? MPOL_DEFAULT :
						pol->mode;
		/*
		 * Internal mempolicy flags must be masked off before exposing
		 * the policy to userspace.
		 */
		*policy |= (pol->flags & MPOL_MODE_FLAGS);
	}

	err = 0;
	if (nmask) {
		if (mpol_store_user_nodemask(pol)) {
			*nmask = pol->w.user_nodemask;
		} else {
			task_lock(current);
			get_policy_nodemask(pol, nmask);
			task_unlock(current);
		}
	}

 out:
	mpol_cond_put(pol);
	if (vma)
		mmap_read_unlock(mm);
	if (pol_refcount)
		mpol_put(pol_refcount);
	return err;
}
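
/*
 * Illustrative userspace counterpart (a sketch, not part of this file;
 * assumes the <numaif.h> wrappers): querying the state implemented by
 * do_get_mempolicy() via get_mempolicy(2). "addr" is hypothetical.
 *
 *	#include <numaif.h>
 *
 *	int mode;
 *	unsigned long nodes = 0;
 *
 *	// Task policy and its nodemask.
 *	get_mempolicy(&mode, &nodes, 8 * sizeof(nodes), NULL, 0);
 *
 *	// Node currently backing the page at addr (the page is faulted
 *	// in if necessary).
 *	get_mempolicy(&mode, NULL, 0, addr, MPOL_F_NODE | MPOL_F_ADDR);
 */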

#ifdef CONFIG_MIGRATION
static int migrate_folio_add(struct folio *folio, struct list_head *foliolist,
			     unsigned long flags)
{
	/*
	 * We try to migrate only unshared folios. If it is shared it
	 * is likely not worth migrating.
	 *
	 * To check if the folio is shared, ideally we want to make sure
	 * every page is mapped to the same process. Doing that is very
	 * expensive, so check the estimated mapcount of the folio instead.
	 */
	if ((flags & MPOL_MF_MOVE_ALL) || folio_estimated_sharers(folio) == 1) {
		if (folio_isolate_lru(folio)) {
			list_add_tail(&folio->lru, foliolist);
			node_stat_mod_folio(folio,
				NR_ISOLATED_ANON + folio_is_file_lru(folio),
				folio_nr_pages(folio));
		} else if (flags & MPOL_MF_STRICT) {
			/*
			 * Non-movable folio may reach here.  And, there may be
			 * temporary off LRU folios or non-LRU movable folios.
			 * Treat them as unmovable folios since they can't be
			 * isolated, so they can't be moved at the moment.  It
			 * should return -EIO for this case too.
			 */
			return -EIO;
		}
	}

	return 0;
}

/*
 * Migrate pages from one node to a target node.
 * Returns error or the number of pages not migrated.
 */
static int migrate_to_node(struct mm_struct *mm, int source, int dest,
			   int flags)
{
	nodemask_t nmask;
	struct vm_area_struct *vma;
	LIST_HEAD(pagelist);
	int err = 0;
	struct migration_target_control mtc = {
		.nid = dest,
		.gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
	};

	nodes_clear(nmask);
	node_set(source, nmask);

	/*
	 * This does not "check" the range but isolates all pages that
	 * need migration.  Between passing in the full user address
	 * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
	 */
	vma = find_vma(mm, 0);
	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
	queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask,
			  flags | MPOL_MF_DISCONTIG_OK, &pagelist, false);

	if (!list_empty(&pagelist)) {
		err = migrate_pages(&pagelist, alloc_migration_target, NULL,
			(unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
		if (err)
			putback_movable_pages(&pagelist);
	}

	return err;
}

/*
 * Move pages between the two nodesets so as to preserve the physical
 * layout as much as possible.
 *
 * Returns the number of pages that could not be moved.
 */
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
		     const nodemask_t *to, int flags)
{
	int busy = 0;
	int err = 0;
	nodemask_t tmp;

	lru_cache_disable();

	mmap_read_lock(mm);

	/*
	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
	 * bit in 'tmp', and return that <source, dest> pair for migration.
	 * The pair of nodemasks 'to' and 'from' define the map.
	 *
	 * If no pair of bits is found that way, fallback to picking some
	 * pair of 'source' and 'dest' bits that are not the same.  If the
	 * 'source' and 'dest' bits are the same, this represents a node
	 * that will be migrating to itself, so no pages need move.
	 *
	 * If no bits are left in 'tmp', or if all remaining bits left
	 * in 'tmp' correspond to the same bit in 'to', return false
	 * (nothing left to migrate).
	 *
	 * This lets us pick a pair of nodes to migrate between, such that
	 * if possible the dest node is not already occupied by some other
	 * source node, minimizing the risk of overloading the memory on a
	 * node that would happen if we migrated incoming memory to a node
	 * before migrating outgoing memory off that same node.
	 *
	 * A single scan of tmp is sufficient.  As we go, we remember the
	 * most recent <s, d> pair that moved (s != d).  If we find a pair
	 * that not only moved, but what's better, moved to an empty slot
	 * (d is not set in tmp), then we break out then, with that pair.
	 * Otherwise when we finish scanning tmp, we at least have the
	 * most recent <s, d> pair that moved.  If we get all the way through
	 * the scan of tmp without finding any node that moved, much less
	 * moved to an empty node, then there is nothing left worth migrating.
	 */
113362306a36Sopenharmony_ci * 113462306a36Sopenharmony_ci * If no bits are left in 'tmp', or if all remaining bits left 113562306a36Sopenharmony_ci * in 'tmp' correspond to the same bit in 'to', return false 113662306a36Sopenharmony_ci * (nothing left to migrate). 113762306a36Sopenharmony_ci * 113862306a36Sopenharmony_ci * This lets us pick a pair of nodes to migrate between, such that 113962306a36Sopenharmony_ci * if possible the dest node is not already occupied by some other 114062306a36Sopenharmony_ci * source node, minimizing the risk of overloading the memory on a 114162306a36Sopenharmony_ci * node that would happen if we migrated incoming memory to a node 114262306a36Sopenharmony_ci * before migrating outgoing memory source that same node. 114362306a36Sopenharmony_ci * 114462306a36Sopenharmony_ci * A single scan of tmp is sufficient. As we go, we remember the 114562306a36Sopenharmony_ci * most recent <s, d> pair that moved (s != d). If we find a pair 114662306a36Sopenharmony_ci * that not only moved, but what's better, moved to an empty slot 114762306a36Sopenharmony_ci * (d is not set in tmp), then we break out then, with that pair. 114862306a36Sopenharmony_ci * Otherwise when we finish scanning from_tmp, we at least have the 114962306a36Sopenharmony_ci * most recent <s, d> pair that moved. If we get all the way through 115062306a36Sopenharmony_ci * the scan of tmp without finding any node that moved, much less 115162306a36Sopenharmony_ci * moved to an empty node, then there is nothing left worth migrating. 115262306a36Sopenharmony_ci */ 115362306a36Sopenharmony_ci 115462306a36Sopenharmony_ci tmp = *from; 115562306a36Sopenharmony_ci while (!nodes_empty(tmp)) { 115662306a36Sopenharmony_ci int s, d; 115762306a36Sopenharmony_ci int source = NUMA_NO_NODE; 115862306a36Sopenharmony_ci int dest = 0; 115962306a36Sopenharmony_ci 116062306a36Sopenharmony_ci for_each_node_mask(s, tmp) { 116162306a36Sopenharmony_ci 116262306a36Sopenharmony_ci /* 116362306a36Sopenharmony_ci * do_migrate_pages() tries to maintain the relative 116462306a36Sopenharmony_ci * node relationship of the pages established between 116562306a36Sopenharmony_ci * threads and memory areas. 116662306a36Sopenharmony_ci * 116762306a36Sopenharmony_ci * However if the number of source nodes is not equal to 116862306a36Sopenharmony_ci * the number of destination nodes we can not preserve 116962306a36Sopenharmony_ci * this node relative relationship. In that case, skip 117062306a36Sopenharmony_ci * copying memory from a node that is in the destination 117162306a36Sopenharmony_ci * mask. 117262306a36Sopenharmony_ci * 117362306a36Sopenharmony_ci * Example: [2,3,4] -> [3,4,5] moves everything. 117462306a36Sopenharmony_ci * [0-7] - > [3,4,5] moves only 0,1,2,6,7. 117562306a36Sopenharmony_ci */ 117662306a36Sopenharmony_ci 117762306a36Sopenharmony_ci if ((nodes_weight(*from) != nodes_weight(*to)) && 117862306a36Sopenharmony_ci (node_isset(s, *to))) 117962306a36Sopenharmony_ci continue; 118062306a36Sopenharmony_ci 118162306a36Sopenharmony_ci d = node_remap(s, *from, *to); 118262306a36Sopenharmony_ci if (s == d) 118362306a36Sopenharmony_ci continue; 118462306a36Sopenharmony_ci 118562306a36Sopenharmony_ci source = s; /* Node moved. Memorize */ 118662306a36Sopenharmony_ci dest = d; 118762306a36Sopenharmony_ci 118862306a36Sopenharmony_ci /* dest not in remaining from nodes? 
*/ 118962306a36Sopenharmony_ci if (!node_isset(dest, tmp)) 119062306a36Sopenharmony_ci break; 119162306a36Sopenharmony_ci } 119262306a36Sopenharmony_ci if (source == NUMA_NO_NODE) 119362306a36Sopenharmony_ci break; 119462306a36Sopenharmony_ci 119562306a36Sopenharmony_ci node_clear(source, tmp); 119662306a36Sopenharmony_ci err = migrate_to_node(mm, source, dest, flags); 119762306a36Sopenharmony_ci if (err > 0) 119862306a36Sopenharmony_ci busy += err; 119962306a36Sopenharmony_ci if (err < 0) 120062306a36Sopenharmony_ci break; 120162306a36Sopenharmony_ci } 120262306a36Sopenharmony_ci mmap_read_unlock(mm); 120362306a36Sopenharmony_ci 120462306a36Sopenharmony_ci lru_cache_enable(); 120562306a36Sopenharmony_ci if (err < 0) 120662306a36Sopenharmony_ci return err; 120762306a36Sopenharmony_ci return busy; 120862306a36Sopenharmony_ci 120962306a36Sopenharmony_ci} 121062306a36Sopenharmony_ci 121162306a36Sopenharmony_ci/* 121262306a36Sopenharmony_ci * Allocate a new page for page migration based on vma policy. 121362306a36Sopenharmony_ci * Start by assuming the page is mapped by the same vma as contains @start. 121462306a36Sopenharmony_ci * Search forward from there, if not. N.B., this assumes that the 121562306a36Sopenharmony_ci * list of pages handed to migrate_pages()--which is how we get here-- 121662306a36Sopenharmony_ci * is in virtual address order. 121762306a36Sopenharmony_ci */ 121862306a36Sopenharmony_cistatic struct folio *new_folio(struct folio *src, unsigned long start) 121962306a36Sopenharmony_ci{ 122062306a36Sopenharmony_ci struct vm_area_struct *vma; 122162306a36Sopenharmony_ci unsigned long address; 122262306a36Sopenharmony_ci VMA_ITERATOR(vmi, current->mm, start); 122362306a36Sopenharmony_ci gfp_t gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL; 122462306a36Sopenharmony_ci 122562306a36Sopenharmony_ci for_each_vma(vmi, vma) { 122662306a36Sopenharmony_ci address = page_address_in_vma(&src->page, vma); 122762306a36Sopenharmony_ci if (address != -EFAULT) 122862306a36Sopenharmony_ci break; 122962306a36Sopenharmony_ci } 123062306a36Sopenharmony_ci 123162306a36Sopenharmony_ci if (folio_test_hugetlb(src)) { 123262306a36Sopenharmony_ci return alloc_hugetlb_folio_vma(folio_hstate(src), 123362306a36Sopenharmony_ci vma, address); 123462306a36Sopenharmony_ci } 123562306a36Sopenharmony_ci 123662306a36Sopenharmony_ci if (folio_test_large(src)) 123762306a36Sopenharmony_ci gfp = GFP_TRANSHUGE; 123862306a36Sopenharmony_ci 123962306a36Sopenharmony_ci /* 124062306a36Sopenharmony_ci * if !vma, vma_alloc_folio() will use task or system default policy 124162306a36Sopenharmony_ci */ 124262306a36Sopenharmony_ci return vma_alloc_folio(gfp, folio_order(src), vma, address, 124362306a36Sopenharmony_ci folio_test_large(src)); 124462306a36Sopenharmony_ci} 124562306a36Sopenharmony_ci#else 124662306a36Sopenharmony_ci 124762306a36Sopenharmony_cistatic int migrate_folio_add(struct folio *folio, struct list_head *foliolist, 124862306a36Sopenharmony_ci unsigned long flags) 124962306a36Sopenharmony_ci{ 125062306a36Sopenharmony_ci return -EIO; 125162306a36Sopenharmony_ci} 125262306a36Sopenharmony_ci 125362306a36Sopenharmony_ciint do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, 125462306a36Sopenharmony_ci const nodemask_t *to, int flags) 125562306a36Sopenharmony_ci{ 125662306a36Sopenharmony_ci return -ENOSYS; 125762306a36Sopenharmony_ci} 125862306a36Sopenharmony_ci 125962306a36Sopenharmony_cistatic struct folio *new_folio(struct folio *src, unsigned long start) 126062306a36Sopenharmony_ci{ 
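/*
 * Illustrative sketch (not part of this file): do_migrate_pages() backs
 * the migrate_pages(2) syscall. From userspace, via the <numaif.h>
 * wrapper in libnuma, moving the calling process's pages from node 0 to
 * node 1 might look like this (assuming both nodes exist):
 *
 *	#include <numaif.h>
 *
 *	unsigned long from = 1UL << 0;	// source nodemask: node 0
 *	unsigned long to   = 1UL << 1;	// destination nodemask: node 1
 *	// pid 0 means the calling process; the kernel reads maxnode-1 bits
 *	long left = migrate_pages(0, 8 * sizeof(unsigned long), &from, &to);
 *	// left < 0 is an error; left > 0 counts pages that could not move
 */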
/*
 * Allocate a new page for page migration based on vma policy.
 * Start by assuming the page is mapped by the same vma as contains @start.
 * Search forward from there, if not.  N.B., this assumes that the
 * list of pages handed to migrate_pages()--which is how we get here--
 * is in virtual address order.
 */
static struct folio *new_folio(struct folio *src, unsigned long start)
{
	struct vm_area_struct *vma;
	unsigned long address;
	VMA_ITERATOR(vmi, current->mm, start);
	gfp_t gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL;

	for_each_vma(vmi, vma) {
		address = page_address_in_vma(&src->page, vma);
		if (address != -EFAULT)
			break;
	}

	if (folio_test_hugetlb(src)) {
		return alloc_hugetlb_folio_vma(folio_hstate(src),
				vma, address);
	}

	if (folio_test_large(src))
		gfp = GFP_TRANSHUGE;

	/*
	 * if !vma, vma_alloc_folio() will use task or system default policy
	 */
	return vma_alloc_folio(gfp, folio_order(src), vma, address,
			folio_test_large(src));
}
#else

static int migrate_folio_add(struct folio *folio, struct list_head *foliolist,
				unsigned long flags)
{
	return -EIO;
}

int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
		     const nodemask_t *to, int flags)
{
	return -ENOSYS;
}

static struct folio *new_folio(struct folio *src, unsigned long start)
{
	return NULL;
}
#endif

static long do_mbind(unsigned long start, unsigned long len,
		     unsigned short mode, unsigned short mode_flags,
		     nodemask_t *nmask, unsigned long flags)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma, *prev;
	struct vma_iterator vmi;
	struct mempolicy *new;
	unsigned long end;
	int err;
	int ret;
	LIST_HEAD(pagelist);

	if (flags & ~(unsigned long)MPOL_MF_VALID)
		return -EINVAL;
	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
		return -EPERM;

	if (start & ~PAGE_MASK)
		return -EINVAL;

	if (mode == MPOL_DEFAULT)
		flags &= ~MPOL_MF_STRICT;

	len = PAGE_ALIGN(len);
	end = start + len;

	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;

	new = mpol_new(mode, mode_flags, nmask);
	if (IS_ERR(new))
		return PTR_ERR(new);

	if (flags & MPOL_MF_LAZY)
		new->flags |= MPOL_F_MOF;

	/*
	 * If we are using the default policy then operation
	 * on discontinuous address spaces is okay after all
	 */
	if (!new)
		flags |= MPOL_MF_DISCONTIG_OK;

	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
		 start, start + len, mode, mode_flags,
		 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);

	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
		lru_cache_disable();

	{
		NODEMASK_SCRATCH(scratch);
		if (scratch) {
			mmap_write_lock(mm);
			err = mpol_set_nodemask(new, nmask, scratch);
			if (err)
				mmap_write_unlock(mm);
		} else
			err = -ENOMEM;
		NODEMASK_SCRATCH_FREE(scratch);
	}
	if (err)
		goto mpol_out;

	/*
	 * Lock the VMAs before scanning for pages to migrate, to ensure we don't
	 * miss a concurrently inserted page.
	 */
	ret = queue_pages_range(mm, start, end, nmask,
			  flags | MPOL_MF_INVERT, &pagelist, true);

	if (ret < 0) {
		err = ret;
		goto up_out;
	}

	vma_iter_init(&vmi, mm, start);
	prev = vma_prev(&vmi);
	for_each_vma_range(vmi, vma, end) {
		err = mbind_range(&vmi, vma, &prev, start, end, new);
		if (err)
			break;
	}

	if (!err) {
		int nr_failed = 0;

		if (!list_empty(&pagelist)) {
			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
			nr_failed = migrate_pages(&pagelist, new_folio, NULL,
				start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL);
			if (nr_failed)
				putback_movable_pages(&pagelist);
		}

		if (((ret > 0) || nr_failed) && (flags & MPOL_MF_STRICT))
			err = -EIO;
	} else {
up_out:
		if (!list_empty(&pagelist))
			putback_movable_pages(&pagelist);
	}

	mmap_write_unlock(mm);
mpol_out:
	mpol_put(new);
	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
		lru_cache_enable();
	return err;
}
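/*
 * Illustrative sketch (not part of this file): do_mbind() implements
 * the mbind(2) syscall. A typical userspace use, via the <numaif.h>
 * wrapper, binds an anonymous mapping to node 0 and asks for existing
 * pages to be migrated there:
 *
 *	#include <sys/mman.h>
 *	#include <numaif.h>
 *
 *	size_t len = 16 * 4096;
 *	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	unsigned long nodemask = 1UL << 0;	// node 0 only
 *	if (mbind(buf, len, MPOL_BIND, &nodemask, 8 * sizeof(nodemask),
 *		  MPOL_MF_MOVE | MPOL_MF_STRICT))
 *		perror("mbind");  // EIO here is the MPOL_MF_STRICT path above
 */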
/*
 * User space interface with variable sized bitmaps for nodelists.
 */
static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask,
		      unsigned long maxnode)
{
	unsigned long nlongs = BITS_TO_LONGS(maxnode);
	int ret;

	if (in_compat_syscall())
		ret = compat_get_bitmap(mask,
					(const compat_ulong_t __user *)nmask,
					maxnode);
	else
		ret = copy_from_user(mask, nmask,
				     nlongs * sizeof(unsigned long));

	if (ret)
		return -EFAULT;

	if (maxnode % BITS_PER_LONG)
		mask[nlongs - 1] &= (1UL << (maxnode % BITS_PER_LONG)) - 1;

	return 0;
}

/* Copy a node mask from user space. */
static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
		     unsigned long maxnode)
{
	--maxnode;
	nodes_clear(*nodes);
	if (maxnode == 0 || !nmask)
		return 0;
	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
		return -EINVAL;

	/*
	 * When the user specified more nodes than supported just check
	 * if the non supported part is all zero, one word at a time,
	 * starting at the end.
	 */
	while (maxnode > MAX_NUMNODES) {
		unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG);
		unsigned long t;

		if (get_bitmap(&t, &nmask[(maxnode - 1) / BITS_PER_LONG], bits))
			return -EFAULT;

		if (maxnode - bits >= MAX_NUMNODES) {
			maxnode -= bits;
		} else {
			maxnode = MAX_NUMNODES;
			t &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
		}
		if (t)
			return -EINVAL;
	}

	return get_bitmap(nodes_addr(*nodes), nmask, maxnode);
}
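/*
 * Worked example (illustrative): suppose MAX_NUMNODES is 64 and a
 * caller passes maxnode == 128. After the --maxnode above, 127 bits of
 * the user mask are significant, so the loop reads the high word, masks
 * off any supported low bits, and returns -EINVAL if anything beyond
 * node 63 is set; only then is the supported part copied in with
 * get_bitmap(). Oversized but all-zero masks thus still succeed.
 */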
	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
}

/* Basic parameter sanity check used by both mbind() and set_mempolicy() */
static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
{
	*flags = *mode & MPOL_MODE_FLAGS;
	*mode &= ~MPOL_MODE_FLAGS;

	if ((unsigned int)(*mode) >= MPOL_MAX)
		return -EINVAL;
	if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
		return -EINVAL;
	if (*flags & MPOL_F_NUMA_BALANCING) {
		if (*mode != MPOL_BIND)
			return -EINVAL;
		*flags |= (MPOL_F_MOF | MPOL_F_MORON);
	}
	return 0;
}

static long kernel_mbind(unsigned long start, unsigned long len,
			 unsigned long mode, const unsigned long __user *nmask,
			 unsigned long maxnode, unsigned int flags)
{
	unsigned short mode_flags;
	nodemask_t nodes;
	int lmode = mode;
	int err;

	start = untagged_addr(start);
	err = sanitize_mpol_flags(&lmode, &mode_flags);
	if (err)
		return err;

	err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;

	return do_mbind(start, len, lmode, mode_flags, &nodes, flags);
}

SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len,
		unsigned long, home_node, unsigned long, flags)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma, *prev;
	struct mempolicy *new, *old;
	unsigned long end;
	int err = -ENOENT;
	VMA_ITERATOR(vmi, mm, start);

	start = untagged_addr(start);
	if (start & ~PAGE_MASK)
		return -EINVAL;
	/*
	 * flags is used for future extension if any.
	 */
	if (flags != 0)
		return -EINVAL;

	/*
	 * Check home_node is online to avoid accessing uninitialized
	 * NODE_DATA.
	 */
	if (home_node >= MAX_NUMNODES || !node_online(home_node))
		return -EINVAL;

	len = PAGE_ALIGN(len);
	end = start + len;

	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;
	mmap_write_lock(mm);
	prev = vma_prev(&vmi);
	for_each_vma_range(vmi, vma, end) {
		/*
		 * If any vma in the range got policy other than MPOL_BIND
		 * or MPOL_PREFERRED_MANY we return error. We don't reset
		 * the home node for vmas we already updated before.
		 */
		old = vma_policy(vma);
		if (!old) {
			prev = vma;
			continue;
		}
		if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) {
			err = -EOPNOTSUPP;
			break;
		}
		new = mpol_dup(old);
		if (IS_ERR(new)) {
			err = PTR_ERR(new);
			break;
		}

		vma_start_write(vma);
		new->home_node = home_node;
		err = mbind_range(&vmi, vma, &prev, start, end, new);
		mpol_put(new);
		if (err)
			break;
	}
	mmap_write_unlock(mm);
	return err;
}
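/*
 * Illustrative sketch (not part of this file): glibc has no wrapper for
 * set_mempolicy_home_node(2) at the time of writing, so userspace calls
 * it via syscall(2), assuming the headers define the syscall number and
 * MPOL_PREFERRED_MANY (both need recent kernels/headers). It only
 * applies to ranges already under MPOL_BIND or MPOL_PREFERRED_MANY,
 * per the check above:
 *
 *	#include <unistd.h>
 *	#include <sys/syscall.h>
 *	#include <numaif.h>
 *
 *	unsigned long mask = (1UL << 0) | (1UL << 1);	// nodes 0 and 1
 *	mbind(buf, len, MPOL_PREFERRED_MANY, &mask, 8 * sizeof(mask), 0);
 *	// prefer node 1 as the "home" for allocations in [buf, buf + len)
 *	if (syscall(__NR_set_mempolicy_home_node,
 *		    (unsigned long)buf, len, 1UL, 0UL) < 0)
 *		perror("set_mempolicy_home_node");	// e.g. EOPNOTSUPP
 */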
SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
		unsigned long, mode, const unsigned long __user *, nmask,
		unsigned long, maxnode, unsigned int, flags)
{
	return kernel_mbind(start, len, mode, nmask, maxnode, flags);
}

/* Set the process memory policy */
static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
				 unsigned long maxnode)
{
	unsigned short mode_flags;
	nodemask_t nodes;
	int lmode = mode;
	int err;

	err = sanitize_mpol_flags(&lmode, &mode_flags);
	if (err)
		return err;

	err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;

	return do_set_mempolicy(lmode, mode_flags, &nodes);
}

SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
		unsigned long, maxnode)
{
	return kernel_set_mempolicy(mode, nmask, maxnode);
}
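/*
 * Illustrative sketch (not part of this file): a process-wide
 * interleave policy over nodes 0-1, set through the <numaif.h>
 * wrapper. Note that sanitize_mpol_flags() above is what rejects, for
 * example, combining MPOL_F_STATIC_NODES with MPOL_F_RELATIVE_NODES:
 *
 *	#include <numaif.h>
 *
 *	unsigned long nodemask = (1UL << 0) | (1UL << 1);
 *	if (set_mempolicy(MPOL_INTERLEAVE, &nodemask, 8 * sizeof(nodemask)))
 *		perror("set_mempolicy");
 *	// ... allocate and touch memory here ...
 *	set_mempolicy(MPOL_DEFAULT, NULL, 0);	// back to the default policy
 */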
static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
				const unsigned long __user *old_nodes,
				const unsigned long __user *new_nodes)
{
	struct mm_struct *mm = NULL;
	struct task_struct *task;
	nodemask_t task_nodes;
	int err;
	nodemask_t *old;
	nodemask_t *new;
	NODEMASK_SCRATCH(scratch);

	if (!scratch)
		return -ENOMEM;

	old = &scratch->mask1;
	new = &scratch->mask2;

	err = get_nodes(old, old_nodes, maxnode);
	if (err)
		goto out;

	err = get_nodes(new, new_nodes, maxnode);
	if (err)
		goto out;

	/* Find the mm_struct */
	rcu_read_lock();
	task = pid ? find_task_by_vpid(pid) : current;
	if (!task) {
		rcu_read_unlock();
		err = -ESRCH;
		goto out;
	}
	get_task_struct(task);

	err = -EINVAL;

	/*
	 * Check if this process has the right to modify the specified process.
	 * Use the regular "ptrace_may_access()" checks.
	 */
	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
		rcu_read_unlock();
		err = -EPERM;
		goto out_put;
	}
	rcu_read_unlock();

	task_nodes = cpuset_mems_allowed(task);
	/* Is the user allowed to access the target nodes? */
	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
		err = -EPERM;
		goto out_put;
	}

	task_nodes = cpuset_mems_allowed(current);
	nodes_and(*new, *new, task_nodes);
	if (nodes_empty(*new))
		goto out_put;

	err = security_task_movememory(task);
	if (err)
		goto out_put;

	mm = get_task_mm(task);
	put_task_struct(task);

	if (!mm) {
		err = -EINVAL;
		goto out;
	}

	err = do_migrate_pages(mm, old, new,
		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);

	mmput(mm);
out:
	NODEMASK_SCRATCH_FREE(scratch);

	return err;

out_put:
	put_task_struct(task);
	goto out;
}

SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
		const unsigned long __user *, old_nodes,
		const unsigned long __user *, new_nodes)
{
	return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
}

/* Retrieve NUMA policy */
static int kernel_get_mempolicy(int __user *policy,
				unsigned long __user *nmask,
				unsigned long maxnode,
				unsigned long addr,
				unsigned long flags)
{
	int err;
	int pval;
	nodemask_t nodes;

	if (nmask != NULL && maxnode < nr_node_ids)
		return -EINVAL;

	addr = untagged_addr(addr);

	err = do_get_mempolicy(&pval, &nodes, addr, flags);

	if (err)
		return err;

	if (policy && put_user(pval, policy))
		return -EFAULT;

	if (nmask)
		err = copy_nodes_to_user(nmask, maxnode, &nodes);

	return err;
}

SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
		unsigned long __user *, nmask, unsigned long, maxnode,
		unsigned long, addr, unsigned long, flags)
{
	return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
}
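/*
 * Illustrative sketch (not part of this file): querying the policy in
 * force at one address with MPOL_F_ADDR, via the <numaif.h> wrapper:
 *
 *	#include <numaif.h>
 *
 *	int mode;
 *	unsigned long nodemask = 0;
 *	if (get_mempolicy(&mode, &nodemask, 8 * sizeof(nodemask),
 *			  buf, MPOL_F_ADDR) == 0)
 *		printf("mode=%d nodes=%#lx\n", mode, nodemask);
 *
 * Note the maxnode < nr_node_ids check above: on a machine configured
 * with many possible nodes a single unsigned long may be rejected with
 * -EINVAL, so real callers size the mask from the actual topology.
 */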
bool vma_migratable(struct vm_area_struct *vma)
{
	if (vma->vm_flags & (VM_IO | VM_PFNMAP))
		return false;

	/*
	 * DAX device mappings require predictable access latency, so avoid
	 * incurring periodic faults.
	 */
	if (vma_is_dax(vma))
		return false;

	if (is_vm_hugetlb_page(vma) &&
		!hugepage_migration_supported(hstate_vma(vma)))
		return false;

	/*
	 * Migration allocates pages in the highest zone. If we cannot
	 * do so then migration (at least from node to node) is not
	 * possible.
	 */
	if (vma->vm_file &&
		gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
			< policy_zone)
		return false;
	return true;
}

struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
						unsigned long addr)
{
	struct mempolicy *pol = NULL;

	if (vma) {
		if (vma->vm_ops && vma->vm_ops->get_policy) {
			pol = vma->vm_ops->get_policy(vma, addr);
		} else if (vma->vm_policy) {
			pol = vma->vm_policy;

			/*
			 * shmem_alloc_page() passes MPOL_F_SHARED policy with
			 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
			 * count on these policies which will be dropped by
			 * mpol_cond_put() later
			 */
			if (mpol_needs_cond_ref(pol))
				mpol_get(pol);
		}
	}

	return pol;
}

/*
 * get_vma_policy(@vma, @addr)
 * @vma: virtual memory area whose policy is sought
 * @addr: address in @vma for shared policy lookup
 *
 * Returns effective policy for a VMA at specified address.
 * Falls back to current->mempolicy or system default policy, as necessary.
 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
 * count--added by the get_policy() vm_op, as appropriate--to protect against
 * freeing by another task.  It is the caller's responsibility to free the
 * extra reference for shared policies.
 */
static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
						unsigned long addr)
{
	struct mempolicy *pol = __get_vma_policy(vma, addr);

	if (!pol)
		pol = get_task_policy(current);

	return pol;
}

bool vma_policy_mof(struct vm_area_struct *vma)
{
	struct mempolicy *pol;

	if (vma->vm_ops && vma->vm_ops->get_policy) {
		bool ret = false;

		pol = vma->vm_ops->get_policy(vma, vma->vm_start);
		if (pol && (pol->flags & MPOL_F_MOF))
			ret = true;
		mpol_cond_put(pol);

		return ret;
	}

	pol = vma->vm_policy;
	if (!pol)
		pol = get_task_policy(current);

	return pol->flags & MPOL_F_MOF;
}

bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
{
	enum zone_type dynamic_policy_zone = policy_zone;

	BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);

	/*
	 * If policy->nodes has movable memory only, we apply the policy
	 * only when gfp_zone(gfp) is ZONE_MOVABLE.
	 *
	 * policy->nodes is intersected with node_states[N_MEMORY], so
	 * if the following test fails, it implies policy->nodes has
	 * movable memory only.
	 */
	if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY]))
		dynamic_policy_zone = ZONE_MOVABLE;

	return zone >= dynamic_policy_zone;
}

/*
 * Return a nodemask representing a mempolicy for filtering nodes for
 * page allocation
 */
nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
{
	int mode = policy->mode;

	/* Lower zones don't get a nodemask applied for MPOL_BIND */
	if (unlikely(mode == MPOL_BIND) &&
		apply_policy_zone(policy, gfp_zone(gfp)) &&
		cpuset_nodemask_valid_mems_allowed(&policy->nodes))
		return &policy->nodes;

	if (mode == MPOL_PREFERRED_MANY)
		return &policy->nodes;

	return NULL;
}

/*
 * Return the preferred node id for 'prefer' mempolicy, and return
 * the given id for all other policies.
 *
 * policy_node() is always coupled with policy_nodemask(), which
 * secures the nodemask limit for 'bind' and 'prefer-many' policy.
 */
static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
{
	if (policy->mode == MPOL_PREFERRED) {
		nd = first_node(policy->nodes);
	} else {
		/*
		 * __GFP_THISNODE shouldn't even be used with the bind policy
		 * because we might easily break the expectation to stay on the
		 * requested node and not break the policy.
		 */
		WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
	}

	if ((policy->mode == MPOL_BIND ||
	     policy->mode == MPOL_PREFERRED_MANY) &&
	    policy->home_node != NUMA_NO_NODE)
		return policy->home_node;

	return nd;
}

/* Do dynamic interleaving for a process */
static unsigned interleave_nodes(struct mempolicy *policy)
{
	unsigned next;
	struct task_struct *me = current;

	next = next_node_in(me->il_prev, policy->nodes);
	if (next < MAX_NUMNODES)
		me->il_prev = next;
	return next;
}

/*
 * Depending on the memory policy provide a node from which to allocate the
 * next slab entry.
 */
unsigned int mempolicy_slab_node(void)
{
	struct mempolicy *policy;
	int node = numa_mem_id();

	if (!in_task())
		return node;

	policy = current->mempolicy;
	if (!policy)
		return node;

	switch (policy->mode) {
	case MPOL_PREFERRED:
		return first_node(policy->nodes);

	case MPOL_INTERLEAVE:
		return interleave_nodes(policy);

	case MPOL_BIND:
	case MPOL_PREFERRED_MANY:
	{
		struct zoneref *z;

		/*
		 * Follow bind policy behavior and start allocation at the
		 * first node.
		 */
		struct zonelist *zonelist;
		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
		zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
		z = first_zones_zonelist(zonelist, highest_zoneidx,
							&policy->nodes);
		return z->zone ? zone_to_nid(z->zone) : node;
	}
	case MPOL_LOCAL:
		return node;

	default:
		BUG();
	}
}

/*
 * Do static interleaving for a VMA with known offset @n.  Returns the n'th
 * node in pol->nodes (starting from n=0), wrapping around if n exceeds the
 * number of present nodes.
 */
static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
{
	nodemask_t nodemask = pol->nodes;
	unsigned int target, nnodes;
	int i;
	int nid;
	/*
	 * The barrier will stabilize the nodemask in a register or on
	 * the stack so that it will stop changing under the code.
	 *
	 * Between first_node() and next_node(), pol->nodes could be changed
	 * by other threads. So we put pol->nodes in a local stack.
	 */
	barrier();

	nnodes = nodes_weight(nodemask);
	if (!nnodes)
		return numa_node_id();
	target = (unsigned int)n % nnodes;
	nid = first_node(nodemask);
	for (i = 0; i < target; i++)
		nid = next_node(nid, nodemask);
	return nid;
}
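/*
 * Worked example (illustrative): with pol->nodes = {0, 2, 4} and an
 * interleave offset n = 7, nnodes = 3 and target = 7 % 3 = 1, so the
 * walk takes one step from first_node() = 0 and returns node 2. A
 * given offset always maps to the same node, which is what makes this
 * interleave "static" per offset rather than per allocation.
 */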
200962306a36Sopenharmony_ci */ 201062306a36Sopenharmony_ci BUG_ON(shift < PAGE_SHIFT); 201162306a36Sopenharmony_ci off = vma->vm_pgoff >> (shift - PAGE_SHIFT); 201262306a36Sopenharmony_ci off += (addr - vma->vm_start) >> shift; 201362306a36Sopenharmony_ci return offset_il_node(pol, off); 201462306a36Sopenharmony_ci } else 201562306a36Sopenharmony_ci return interleave_nodes(pol); 201662306a36Sopenharmony_ci} 201762306a36Sopenharmony_ci 201862306a36Sopenharmony_ci#ifdef CONFIG_HUGETLBFS 201962306a36Sopenharmony_ci/* 202062306a36Sopenharmony_ci * huge_node(@vma, @addr, @gfp_flags, @mpol) 202162306a36Sopenharmony_ci * @vma: virtual memory area whose policy is sought 202262306a36Sopenharmony_ci * @addr: address in @vma for shared policy lookup and interleave policy 202362306a36Sopenharmony_ci * @gfp_flags: for requested zone 202462306a36Sopenharmony_ci * @mpol: pointer to mempolicy pointer for reference counted mempolicy 202562306a36Sopenharmony_ci * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy 202662306a36Sopenharmony_ci * 202762306a36Sopenharmony_ci * Returns a nid suitable for a huge page allocation and a pointer 202862306a36Sopenharmony_ci * to the struct mempolicy for conditional unref after allocation. 202962306a36Sopenharmony_ci * If the effective policy is 'bind' or 'prefer-many', returns a pointer 203062306a36Sopenharmony_ci * to the mempolicy's @nodemask for filtering the zonelist. 203162306a36Sopenharmony_ci * 203262306a36Sopenharmony_ci * Must be protected by read_mems_allowed_begin() 203362306a36Sopenharmony_ci */ 203462306a36Sopenharmony_ciint huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, 203562306a36Sopenharmony_ci struct mempolicy **mpol, nodemask_t **nodemask) 203662306a36Sopenharmony_ci{ 203762306a36Sopenharmony_ci int nid; 203862306a36Sopenharmony_ci int mode; 203962306a36Sopenharmony_ci 204062306a36Sopenharmony_ci *mpol = get_vma_policy(vma, addr); 204162306a36Sopenharmony_ci *nodemask = NULL; 204262306a36Sopenharmony_ci mode = (*mpol)->mode; 204362306a36Sopenharmony_ci 204462306a36Sopenharmony_ci if (unlikely(mode == MPOL_INTERLEAVE)) { 204562306a36Sopenharmony_ci nid = interleave_nid(*mpol, vma, addr, 204662306a36Sopenharmony_ci huge_page_shift(hstate_vma(vma))); 204762306a36Sopenharmony_ci } else { 204862306a36Sopenharmony_ci nid = policy_node(gfp_flags, *mpol, numa_node_id()); 204962306a36Sopenharmony_ci if (mode == MPOL_BIND || mode == MPOL_PREFERRED_MANY) 205062306a36Sopenharmony_ci *nodemask = &(*mpol)->nodes; 205162306a36Sopenharmony_ci } 205262306a36Sopenharmony_ci return nid; 205362306a36Sopenharmony_ci} 205462306a36Sopenharmony_ci 205562306a36Sopenharmony_ci/* 205662306a36Sopenharmony_ci * init_nodemask_of_mempolicy 205762306a36Sopenharmony_ci * 205862306a36Sopenharmony_ci * If the current task's mempolicy is "default" [NULL], return 'false' 205962306a36Sopenharmony_ci * to indicate default policy. Otherwise, extract the policy nodemask 206062306a36Sopenharmony_ci * for 'bind' or 'interleave' policy into the argument nodemask, or 206162306a36Sopenharmony_ci * initialize the argument nodemask to contain the single node for 206262306a36Sopenharmony_ci * 'preferred' or 'local' policy and return 'true' to indicate presence 206362306a36Sopenharmony_ci * of non-default mempolicy. 
206462306a36Sopenharmony_ci * 206562306a36Sopenharmony_ci * We don't bother with reference counting the mempolicy [mpol_get/put] 206662306a36Sopenharmony_ci * because the current task is examining it's own mempolicy and a task's 206762306a36Sopenharmony_ci * mempolicy is only ever changed by the task itself. 206862306a36Sopenharmony_ci * 206962306a36Sopenharmony_ci * N.B., it is the caller's responsibility to free a returned nodemask. 207062306a36Sopenharmony_ci */ 207162306a36Sopenharmony_cibool init_nodemask_of_mempolicy(nodemask_t *mask) 207262306a36Sopenharmony_ci{ 207362306a36Sopenharmony_ci struct mempolicy *mempolicy; 207462306a36Sopenharmony_ci 207562306a36Sopenharmony_ci if (!(mask && current->mempolicy)) 207662306a36Sopenharmony_ci return false; 207762306a36Sopenharmony_ci 207862306a36Sopenharmony_ci task_lock(current); 207962306a36Sopenharmony_ci mempolicy = current->mempolicy; 208062306a36Sopenharmony_ci switch (mempolicy->mode) { 208162306a36Sopenharmony_ci case MPOL_PREFERRED: 208262306a36Sopenharmony_ci case MPOL_PREFERRED_MANY: 208362306a36Sopenharmony_ci case MPOL_BIND: 208462306a36Sopenharmony_ci case MPOL_INTERLEAVE: 208562306a36Sopenharmony_ci *mask = mempolicy->nodes; 208662306a36Sopenharmony_ci break; 208762306a36Sopenharmony_ci 208862306a36Sopenharmony_ci case MPOL_LOCAL: 208962306a36Sopenharmony_ci init_nodemask_of_node(mask, numa_node_id()); 209062306a36Sopenharmony_ci break; 209162306a36Sopenharmony_ci 209262306a36Sopenharmony_ci default: 209362306a36Sopenharmony_ci BUG(); 209462306a36Sopenharmony_ci } 209562306a36Sopenharmony_ci task_unlock(current); 209662306a36Sopenharmony_ci 209762306a36Sopenharmony_ci return true; 209862306a36Sopenharmony_ci} 209962306a36Sopenharmony_ci#endif 210062306a36Sopenharmony_ci 210162306a36Sopenharmony_ci/* 210262306a36Sopenharmony_ci * mempolicy_in_oom_domain 210362306a36Sopenharmony_ci * 210462306a36Sopenharmony_ci * If tsk's mempolicy is "bind", check for intersection between mask and 210562306a36Sopenharmony_ci * the policy nodemask. Otherwise, return true for all other policies 210662306a36Sopenharmony_ci * including "interleave", as a tsk with "interleave" policy may have 210762306a36Sopenharmony_ci * memory allocated from all nodes in system. 210862306a36Sopenharmony_ci * 210962306a36Sopenharmony_ci * Takes task_lock(tsk) to prevent freeing of its mempolicy. 211062306a36Sopenharmony_ci */ 211162306a36Sopenharmony_cibool mempolicy_in_oom_domain(struct task_struct *tsk, 211262306a36Sopenharmony_ci const nodemask_t *mask) 211362306a36Sopenharmony_ci{ 211462306a36Sopenharmony_ci struct mempolicy *mempolicy; 211562306a36Sopenharmony_ci bool ret = true; 211662306a36Sopenharmony_ci 211762306a36Sopenharmony_ci if (!mask) 211862306a36Sopenharmony_ci return ret; 211962306a36Sopenharmony_ci 212062306a36Sopenharmony_ci task_lock(tsk); 212162306a36Sopenharmony_ci mempolicy = tsk->mempolicy; 212262306a36Sopenharmony_ci if (mempolicy && mempolicy->mode == MPOL_BIND) 212362306a36Sopenharmony_ci ret = nodes_intersects(mempolicy->nodes, *mask); 212462306a36Sopenharmony_ci task_unlock(tsk); 212562306a36Sopenharmony_ci 212662306a36Sopenharmony_ci return ret; 212762306a36Sopenharmony_ci} 212862306a36Sopenharmony_ci 212962306a36Sopenharmony_ci/* Allocate a page in interleaved policy. 213062306a36Sopenharmony_ci Own path because it needs to do special accounting. 
*/ 213162306a36Sopenharmony_cistatic struct page *alloc_page_interleave(gfp_t gfp, unsigned order, 213262306a36Sopenharmony_ci unsigned nid) 213362306a36Sopenharmony_ci{ 213462306a36Sopenharmony_ci struct page *page; 213562306a36Sopenharmony_ci 213662306a36Sopenharmony_ci page = __alloc_pages(gfp, order, nid, NULL); 213762306a36Sopenharmony_ci /* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */ 213862306a36Sopenharmony_ci if (!static_branch_likely(&vm_numa_stat_key)) 213962306a36Sopenharmony_ci return page; 214062306a36Sopenharmony_ci if (page && page_to_nid(page) == nid) { 214162306a36Sopenharmony_ci preempt_disable(); 214262306a36Sopenharmony_ci __count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT); 214362306a36Sopenharmony_ci preempt_enable(); 214462306a36Sopenharmony_ci } 214562306a36Sopenharmony_ci return page; 214662306a36Sopenharmony_ci} 214762306a36Sopenharmony_ci 214862306a36Sopenharmony_cistatic struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, 214962306a36Sopenharmony_ci int nid, struct mempolicy *pol) 215062306a36Sopenharmony_ci{ 215162306a36Sopenharmony_ci struct page *page; 215262306a36Sopenharmony_ci gfp_t preferred_gfp; 215362306a36Sopenharmony_ci 215462306a36Sopenharmony_ci /* 215562306a36Sopenharmony_ci * This is a two pass approach. The first pass will only try the 215662306a36Sopenharmony_ci * preferred nodes but skip the direct reclaim and allow the 215762306a36Sopenharmony_ci * allocation to fail, while the second pass will try all the 215862306a36Sopenharmony_ci * nodes in system. 215962306a36Sopenharmony_ci */ 216062306a36Sopenharmony_ci preferred_gfp = gfp | __GFP_NOWARN; 216162306a36Sopenharmony_ci preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); 216262306a36Sopenharmony_ci page = __alloc_pages(preferred_gfp, order, nid, &pol->nodes); 216362306a36Sopenharmony_ci if (!page) 216462306a36Sopenharmony_ci page = __alloc_pages(gfp, order, nid, NULL); 216562306a36Sopenharmony_ci 216662306a36Sopenharmony_ci return page; 216762306a36Sopenharmony_ci} 216862306a36Sopenharmony_ci 216962306a36Sopenharmony_ci/** 217062306a36Sopenharmony_ci * vma_alloc_folio - Allocate a folio for a VMA. 217162306a36Sopenharmony_ci * @gfp: GFP flags. 217262306a36Sopenharmony_ci * @order: Order of the folio. 217362306a36Sopenharmony_ci * @vma: Pointer to VMA or NULL if not available. 217462306a36Sopenharmony_ci * @addr: Virtual address of the allocation. Must be inside @vma. 217562306a36Sopenharmony_ci * @hugepage: For hugepages try only the preferred node if possible. 217662306a36Sopenharmony_ci * 217762306a36Sopenharmony_ci * Allocate a folio for a specific address in @vma, using the appropriate 217862306a36Sopenharmony_ci * NUMA policy. When @vma is not NULL the caller must hold the mmap_lock 217962306a36Sopenharmony_ci * of the mm_struct of the VMA to prevent it from going away. Should be 218062306a36Sopenharmony_ci * used for all allocations for folios that will be mapped into user space. 218162306a36Sopenharmony_ci * 218262306a36Sopenharmony_ci * Return: The folio on success or NULL if allocation fails. 
/**
 * vma_alloc_folio - Allocate a folio for a VMA.
 * @gfp: GFP flags.
 * @order: Order of the folio.
 * @vma: Pointer to VMA or NULL if not available.
 * @addr: Virtual address of the allocation.  Must be inside @vma.
 * @hugepage: For hugepages try only the preferred node if possible.
 *
 * Allocate a folio for a specific address in @vma, using the appropriate
 * NUMA policy.  When @vma is not NULL the caller must hold the mmap_lock
 * of the mm_struct of the VMA to prevent it from going away.  Should be
 * used for all allocations for folios that will be mapped into user space.
 *
 * Return: The folio on success or NULL if allocation fails.
 */
struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
		unsigned long addr, bool hugepage)
{
	struct mempolicy *pol;
	int node = numa_node_id();
	struct folio *folio;
	int preferred_nid;
	nodemask_t *nmask;

	pol = get_vma_policy(vma, addr);

	if (pol->mode == MPOL_INTERLEAVE) {
		struct page *page;
		unsigned nid;

		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
		mpol_cond_put(pol);
		gfp |= __GFP_COMP;
		page = alloc_page_interleave(gfp, order, nid);
		folio = (struct folio *)page;
		if (folio && order > 1)
			folio_prep_large_rmappable(folio);
		goto out;
	}

	if (pol->mode == MPOL_PREFERRED_MANY) {
		struct page *page;

		node = policy_node(gfp, pol, node);
		gfp |= __GFP_COMP;
		page = alloc_pages_preferred_many(gfp, order, node, pol);
		mpol_cond_put(pol);
		folio = (struct folio *)page;
		if (folio && order > 1)
			folio_prep_large_rmappable(folio);
		goto out;
	}

	if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
		int hpage_node = node;

		/*
		 * For hugepage allocation and non-interleave policy which
		 * allows the current node (or other explicitly preferred
		 * node) we only try to allocate from the current/preferred
		 * node and don't fall back to other nodes, as the cost of
		 * remote accesses would likely offset THP benefits.
		 *
		 * If the policy is interleave or does not allow the current
		 * node in its nodemask, we allocate the standard way.
		 */
		if (pol->mode == MPOL_PREFERRED)
			hpage_node = first_node(pol->nodes);

		nmask = policy_nodemask(gfp, pol);
		if (!nmask || node_isset(hpage_node, *nmask)) {
			mpol_cond_put(pol);
			/*
			 * First, try to allocate THP only on the local node,
			 * but don't reclaim unnecessarily, just compact.
			 */
			folio = __folio_alloc_node(gfp | __GFP_THISNODE |
					__GFP_NORETRY, order, hpage_node);

			/*
			 * If hugepage allocations are configured to always
			 * use synchronous compaction, or the vma has been
			 * madvised to prefer hugepage backing, retry allowing
			 * remote memory with both reclaim and compaction.
			 */
			if (!folio && (gfp & __GFP_DIRECT_RECLAIM))
				folio = __folio_alloc(gfp, order, hpage_node,
						      nmask);

			goto out;
		}
	}

	nmask = policy_nodemask(gfp, pol);
	preferred_nid = policy_node(gfp, pol, node);
	folio = __folio_alloc(gfp, order, preferred_nid, nmask);
	mpol_cond_put(pol);
out:
	return folio;
}
EXPORT_SYMBOL(vma_alloc_folio);
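/*
 * Hedged usage sketch (illustrative, not part of this file's logic): a
 * fault-path caller allocating an order-0 folio for a user address,
 * roughly what the anonymous-fault path does via
 * vma_alloc_zeroed_movable_folio().  The caller already holds the
 * mmap_lock, as the kernel-doc above requires.
 *
 *	struct folio *folio;
 *
 *	folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr, false);
 *	if (!folio)
 *		return VM_FAULT_OOM;
 *	// map the folio, folio_put() on the error paths
 */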
/**
 * alloc_pages - Allocate pages.
 * @gfp: GFP flags.
 * @order: Power of two of number of pages to allocate.
 *
 * Allocate 1 << @order contiguous pages.  The physical address of the
 * first page is naturally aligned (eg an order-3 allocation will be aligned
 * to a multiple of 8 * PAGE_SIZE bytes).  The NUMA policy of the current
 * process is honoured when in process context.
 *
 * Context: Can be called from any context, providing the appropriate GFP
 * flags are used.
 * Return: The page on success or NULL if allocation fails.
 */
struct page *alloc_pages(gfp_t gfp, unsigned order)
{
	struct mempolicy *pol = &default_policy;
	struct page *page;

	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
		pol = get_task_policy(current);

	/*
	 * No reference counting needed for current->mempolicy
	 * nor system default_policy
	 */
	if (pol->mode == MPOL_INTERLEAVE)
		page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
	else if (pol->mode == MPOL_PREFERRED_MANY)
		page = alloc_pages_preferred_many(gfp, order,
				policy_node(gfp, pol, numa_node_id()), pol);
	else
		page = __alloc_pages(gfp, order,
				policy_node(gfp, pol, numa_node_id()),
				policy_nodemask(gfp, pol));

	return page;
}
EXPORT_SYMBOL(alloc_pages);

struct folio *folio_alloc(gfp_t gfp, unsigned order)
{
	struct page *page = alloc_pages(gfp | __GFP_COMP, order);
	struct folio *folio = (struct folio *)page;

	if (folio && order > 1)
		folio_prep_large_rmappable(folio);
	return folio;
}
EXPORT_SYMBOL(folio_alloc);
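/*
 * Hedged usage sketch (illustrative only): a kernel caller grabbing four
 * contiguous pages; the current task's mempolicy decides which node they
 * come from, per the kernel-doc above.
 *
 *	struct page *page = alloc_pages(GFP_KERNEL, 2);	// 1 << 2 = 4 pages
 *
 *	if (page) {
 *		void *va = page_address(page);
 *		// ... use the naturally aligned region (16KB with 4KB pages)
 *		__free_pages(page, 2);
 *	}
 */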
static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp,
		struct mempolicy *pol, unsigned long nr_pages,
		struct page **page_array)
{
	int nodes;
	unsigned long nr_pages_per_node;
	int delta;
	int i;
	unsigned long nr_allocated;
	unsigned long total_allocated = 0;

	nodes = nodes_weight(pol->nodes);
	nr_pages_per_node = nr_pages / nodes;
	delta = nr_pages - nodes * nr_pages_per_node;

	for (i = 0; i < nodes; i++) {
		if (delta) {
			nr_allocated = __alloc_pages_bulk(gfp,
					interleave_nodes(pol), NULL,
					nr_pages_per_node + 1, NULL,
					page_array);
			delta--;
		} else {
			nr_allocated = __alloc_pages_bulk(gfp,
					interleave_nodes(pol), NULL,
					nr_pages_per_node, NULL, page_array);
		}

		page_array += nr_allocated;
		total_allocated += nr_allocated;
	}

	return total_allocated;
}

static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid,
		struct mempolicy *pol, unsigned long nr_pages,
		struct page **page_array)
{
	gfp_t preferred_gfp;
	unsigned long nr_allocated = 0;

	preferred_gfp = gfp | __GFP_NOWARN;
	preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);

	nr_allocated = __alloc_pages_bulk(preferred_gfp, nid, &pol->nodes,
					  nr_pages, NULL, page_array);

	if (nr_allocated < nr_pages)
		nr_allocated += __alloc_pages_bulk(gfp, numa_node_id(), NULL,
				nr_pages - nr_allocated, NULL,
				page_array + nr_allocated);
	return nr_allocated;
}

/*
 * Bulk page allocation and the mempolicy need to be considered together
 * in some situations, such as vmalloc.  Doing so can speed up allocation
 * considerably, especially for interleaved allocations.
 */
unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp,
		unsigned long nr_pages, struct page **page_array)
{
	struct mempolicy *pol = &default_policy;

	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
		pol = get_task_policy(current);

	if (pol->mode == MPOL_INTERLEAVE)
		return alloc_pages_bulk_array_interleave(gfp, pol,
							 nr_pages, page_array);

	if (pol->mode == MPOL_PREFERRED_MANY)
		return alloc_pages_bulk_array_preferred_many(gfp,
				numa_node_id(), pol, nr_pages, page_array);

	return __alloc_pages_bulk(gfp, policy_node(gfp, pol, numa_node_id()),
				  policy_nodemask(gfp, pol), nr_pages, NULL,
				  page_array);
}
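/*
 * Hedged usage sketch (illustrative only): bulk-allocating the backing
 * pages for a vmalloc-style buffer in one call, so an interleave policy
 * is spread across nodes without taking the allocator path once per page.
 * "nr" and "pages" are hypothetical caller state.
 *
 *	struct page **pages = kvcalloc(nr, sizeof(*pages), GFP_KERNEL);
 *	unsigned long got;
 *
 *	got = alloc_pages_bulk_array_mempolicy(GFP_KERNEL, nr, pages);
 *	// got may be < nr: retry the remainder or free the got pages
 */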
int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
{
	struct mempolicy *pol = mpol_dup(vma_policy(src));

	if (IS_ERR(pol))
		return PTR_ERR(pol);
	dst->vm_policy = pol;
	return 0;
}

/*
 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
 * with the mems_allowed returned by cpuset_mems_allowed().  This
 * keeps mempolicies cpuset relative after its cpuset moves.  See
 * further kernel/cpuset.c update_nodemask().
 *
 * current's mempolicy may be rebound by another task (the task that changes
 * the cpuset's mems), so we need not do the rebind work for the current task.
 */

/* Slow path of a mempolicy duplicate */
struct mempolicy *__mpol_dup(struct mempolicy *old)
{
	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);

	if (!new)
		return ERR_PTR(-ENOMEM);

	/* task's mempolicy is protected by alloc_lock */
	if (old == current->mempolicy) {
		task_lock(current);
		*new = *old;
		task_unlock(current);
	} else
		*new = *old;

	if (current_cpuset_is_being_rebound()) {
		nodemask_t mems = cpuset_mems_allowed(current);
		mpol_rebind_policy(new, &mems);
	}
	atomic_set(&new->refcnt, 1);
	return new;
}

/* Slow path of a mempolicy comparison */
bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
	if (!a || !b)
		return false;
	if (a->mode != b->mode)
		return false;
	if (a->flags != b->flags)
		return false;
	if (a->home_node != b->home_node)
		return false;
	if (mpol_store_user_nodemask(a))
		if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
			return false;

	switch (a->mode) {
	case MPOL_BIND:
	case MPOL_INTERLEAVE:
	case MPOL_PREFERRED:
	case MPOL_PREFERRED_MANY:
		return !!nodes_equal(a->nodes, b->nodes);
	case MPOL_LOCAL:
		return true;
	default:
		BUG();
		return false;
	}
}
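/*
 * Hedged usage sketch (illustrative only): the slow-path comparison above
 * backs mpol_equal(), which callers such as VMA merging use to decide
 * whether two adjacent mappings carry the same policy, e.g.:
 *
 *	if (mpol_equal(vma_policy(a), vma_policy(b)))
 *		// merge candidates, subject to the other vma checks
 */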
/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in Red-Black tree linked from the inode.
 * They are protected by the sp->lock rwlock, which should be held
 * for any accesses to the tree.
 */

/*
 * lookup first element intersecting start-end.  Caller holds sp->lock for
 * reading or for writing
 */
static struct sp_node *
sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
{
	struct rb_node *n = sp->root.rb_node;

	while (n) {
		struct sp_node *p = rb_entry(n, struct sp_node, nd);

		if (start >= p->end)
			n = n->rb_right;
		else if (end <= p->start)
			n = n->rb_left;
		else
			break;
	}
	if (!n)
		return NULL;
	for (;;) {
		struct sp_node *w = NULL;
		struct rb_node *prev = rb_prev(n);
		if (!prev)
			break;
		w = rb_entry(prev, struct sp_node, nd);
		if (w->end <= start)
			break;
		n = prev;
	}
	return rb_entry(n, struct sp_node, nd);
}

/*
 * Insert a new shared policy into the list.  Caller holds sp->lock for
 * writing.
 */
static void sp_insert(struct shared_policy *sp, struct sp_node *new)
{
	struct rb_node **p = &sp->root.rb_node;
	struct rb_node *parent = NULL;
	struct sp_node *nd;

	while (*p) {
		parent = *p;
		nd = rb_entry(parent, struct sp_node, nd);
		if (new->start < nd->start)
			p = &(*p)->rb_left;
		else if (new->end > nd->end)
			p = &(*p)->rb_right;
		else
			BUG();
	}
	rb_link_node(&new->nd, parent, p);
	rb_insert_color(&new->nd, &sp->root);
	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
		 new->policy ? new->policy->mode : 0);
}
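/*
 * Worked example of sp_lookup() above (hypothetical ranges): with nodes
 * covering [0,4), [6,8) and [8,12) in the tree, sp_lookup(sp, 7, 10)
 * first descends to some intersecting node, say [8,12); the backward walk
 * then steps to [6,8) because its end (8) is greater than start (7), and
 * stops at [0,4) whose end (4) is not.  The first intersecting node,
 * [6,8), is returned.
 */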
/* Find shared policy intersecting idx */
struct mempolicy *
mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
{
	struct mempolicy *pol = NULL;
	struct sp_node *sn;

	if (!sp->root.rb_node)
		return NULL;
	read_lock(&sp->lock);
	sn = sp_lookup(sp, idx, idx+1);
	if (sn) {
		mpol_get(sn->policy);
		pol = sn->policy;
	}
	read_unlock(&sp->lock);
	return pol;
}

static void sp_free(struct sp_node *n)
{
	mpol_put(n->policy);
	kmem_cache_free(sn_cache, n);
}
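/*
 * Hedged usage sketch (illustrative only; "info" stands for a shmem-style
 * inode's shared_policy): mpol_shared_policy_lookup() above returns the
 * policy with a reference held, so the caller must drop it when done.
 *
 *	struct mempolicy *pol;
 *
 *	pol = mpol_shared_policy_lookup(&info->policy, index);
 *	// NULL means no range-specific policy; fall back to the default
 *	if (pol)
 *		mpol_put(pol);
 */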
/**
 * mpol_misplaced - check whether current page node is valid in policy
 *
 * @page: page to be checked
 * @vma: vm area where the page is mapped
 * @addr: virtual address where the page is mapped
 *
 * Lookup current policy node id for vma,addr and "compare to" page's
 * node id.  Policy determination "mimics" alloc_page_vma().
 * Called from fault path where we know the vma and faulting address.
 *
 * Return: NUMA_NO_NODE if the page is in a node that is valid for this
 * policy, or a suitable node ID to allocate a replacement page from.
 */
int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol;
	struct zoneref *z;
	int curnid = page_to_nid(page);
	unsigned long pgoff;
	int thiscpu = raw_smp_processor_id();
	int thisnid = cpu_to_node(thiscpu);
	int polnid = NUMA_NO_NODE;
	int ret = NUMA_NO_NODE;

	pol = get_vma_policy(vma, addr);
	if (!(pol->flags & MPOL_F_MOF))
		goto out;

	switch (pol->mode) {
	case MPOL_INTERLEAVE:
		pgoff = vma->vm_pgoff;
		pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
		polnid = offset_il_node(pol, pgoff);
		break;

	case MPOL_PREFERRED:
		if (node_isset(curnid, pol->nodes))
			goto out;
		polnid = first_node(pol->nodes);
		break;

	case MPOL_LOCAL:
		polnid = numa_node_id();
		break;

	case MPOL_BIND:
		/* Optimize placement among multiple nodes via NUMA balancing */
		if (pol->flags & MPOL_F_MORON) {
			if (node_isset(thisnid, pol->nodes))
				break;
			goto out;
		}
		fallthrough;

	case MPOL_PREFERRED_MANY:
		/*
		 * use current page if in policy nodemask,
		 * else select nearest allowed node, if any.
		 * If no allowed nodes, use current [!misplaced].
		 */
		if (node_isset(curnid, pol->nodes))
			goto out;
		z = first_zones_zonelist(
				node_zonelist(numa_node_id(), GFP_HIGHUSER),
				gfp_zone(GFP_HIGHUSER),
				&pol->nodes);
		polnid = zone_to_nid(z->zone);
		break;

	default:
		BUG();
	}

	/* Migrate the page towards the node whose CPU is referencing it */
	if (pol->flags & MPOL_F_MORON) {
		polnid = thisnid;

		if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
			goto out;
	}

	if (curnid != polnid)
		ret = polnid;
out:
	mpol_cond_put(pol);

	return ret;
}

/*
 * Drop the (possibly final) reference to task->mempolicy.  It needs to be
 * dropped after task->mempolicy is set to NULL so that any allocation done as
 * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
 * policy.
 */
void mpol_put_task_policy(struct task_struct *task)
{
	struct mempolicy *pol;

	task_lock(task);
	pol = task->mempolicy;
	task->mempolicy = NULL;
	task_unlock(task);
	mpol_put(pol);
}

static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
	pr_debug("deleting %lx-%lx\n", n->start, n->end);
	rb_erase(&n->nd, &sp->root);
	sp_free(n);
}

static void sp_node_init(struct sp_node *node, unsigned long start,
			unsigned long end, struct mempolicy *pol)
{
	node->start = start;
	node->end = end;
	node->policy = pol;
}

static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
				struct mempolicy *pol)
{
	struct sp_node *n;
	struct mempolicy *newpol;

	n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
	if (!n)
		return NULL;

	newpol = mpol_dup(pol);
	if (IS_ERR(newpol)) {
		kmem_cache_free(sn_cache, n);
		return NULL;
	}
	newpol->flags |= MPOL_F_SHARED;
	sp_node_init(n, start, end, newpol);

	return n;
}

/* Replace a policy range. */
static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
				 unsigned long end, struct sp_node *new)
{
	struct sp_node *n;
	struct sp_node *n_new = NULL;
	struct mempolicy *mpol_new = NULL;
	int ret = 0;

restart:
	write_lock(&sp->lock);
	n = sp_lookup(sp, start, end);
	/* Take care of old policies in the same range. */
	while (n && n->start < end) {
		struct rb_node *next = rb_next(&n->nd);
		if (n->start >= start) {
			if (n->end <= end)
				sp_delete(sp, n);
			else
				n->start = end;
		} else {
			/* Old policy spanning whole new range. */
			if (n->end > end) {
				if (!n_new)
					goto alloc_new;

				*mpol_new = *n->policy;
				atomic_set(&mpol_new->refcnt, 1);
				sp_node_init(n_new, end, n->end, mpol_new);
				n->end = start;
				sp_insert(sp, n_new);
				n_new = NULL;
				mpol_new = NULL;
				break;
			} else
				n->end = start;
		}
		if (!next)
			break;
		n = rb_entry(next, struct sp_node, nd);
	}
	if (new)
		sp_insert(sp, new);
	write_unlock(&sp->lock);
	ret = 0;

err_out:
	if (mpol_new)
		mpol_put(mpol_new);
	if (n_new)
		kmem_cache_free(sn_cache, n_new);

	return ret;

alloc_new:
	write_unlock(&sp->lock);
	ret = -ENOMEM;
	n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
	if (!n_new)
		goto err_out;
	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!mpol_new)
		goto err_out;
	atomic_set(&mpol_new->refcnt, 1);
	goto restart;
}
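/*
 * Worked example of the replacement above (hypothetical ranges): if the
 * tree holds one node spanning [0,10) and a new policy is installed for
 * [4,6), the old node hits the "old policy spanning whole new range" case:
 * it is trimmed to [0,4), a second node carrying a copy of the old policy
 * is inserted for [6,10), and the new node covers [4,6).  The n_new /
 * mpol_new pair is preallocated via the alloc_new path because GFP_KERNEL
 * allocations may sleep and so cannot happen under the write lock; the
 * loop restarts after the allocation.
 */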
/**
 * mpol_shared_policy_init - initialize shared policy for inode
 * @sp: pointer to inode shared policy
 * @mpol: struct mempolicy to install
 *
 * Install non-NULL @mpol in inode's shared policy rb-tree.
 * On entry, the current task has a reference on a non-NULL @mpol.
 * This must be released on exit.
 * This is called from get_inode(), so we can use GFP_KERNEL.
 */
void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
{
	int ret;

	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
	rwlock_init(&sp->lock);

	if (mpol) {
		struct vm_area_struct pvma;
		struct mempolicy *new;
		NODEMASK_SCRATCH(scratch);

		if (!scratch)
			goto put_mpol;
		/* contextualize the tmpfs mount point mempolicy */
		new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
		if (IS_ERR(new))
			goto free_scratch; /* no valid nodemask intersection */

		task_lock(current);
		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
		task_unlock(current);
		if (ret)
			goto put_new;

		/* Create pseudo-vma that contains just the policy */
		vma_init(&pvma, NULL);
		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */

put_new:
		mpol_put(new);			/* drop initial ref */
free_scratch:
		NODEMASK_SCRATCH_FREE(scratch);
put_mpol:
		mpol_put(mpol);	/* drop our incoming ref on sb mpol */
	}
}
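/*
 * Hedged usage sketch (illustrative only; the shmem-style names here are
 * not defined in this file): a tmpfs-like filesystem initializes the
 * per-inode tree at inode creation, handing over its reference on the
 * superblock's mount-option policy:
 *
 *	mpol_shared_policy_init(&info->policy, shmem_get_sbmpol(sbinfo));
 *
 * Note the pseudo-vma trick above: pvma spans [0, TASK_SIZE), so the
 * installed policy covers the entire file.
 */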
int mpol_set_shared_policy(struct shared_policy *info,
			struct vm_area_struct *vma, struct mempolicy *npol)
{
	int err;
	struct sp_node *new = NULL;
	unsigned long sz = vma_pages(vma);

	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
		 vma->vm_pgoff,
		 sz, npol ? npol->mode : -1,
		 npol ? npol->flags : -1,
		 npol ? nodes_addr(npol->nodes)[0] : NUMA_NO_NODE);

	if (npol) {
		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
		if (!new)
			return -ENOMEM;
	}
	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
	if (err && new)
		sp_free(new);
	return err;
}

/* Free a backing policy store on inode delete. */
void mpol_free_shared_policy(struct shared_policy *p)
{
	struct sp_node *n;
	struct rb_node *next;

	if (!p->root.rb_node)
		return;
	write_lock(&p->lock);
	next = rb_first(&p->root);
	while (next) {
		n = rb_entry(next, struct sp_node, nd);
		next = rb_next(&n->nd);
		sp_delete(p, n);
	}
	write_unlock(&p->lock);
}

#ifdef CONFIG_NUMA_BALANCING
static int __initdata numabalancing_override;

static void __init check_numabalancing_enable(void)
{
	bool numabalancing_default = false;

	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
		numabalancing_default = true;

	/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
	if (numabalancing_override)
		set_numabalancing_state(numabalancing_override == 1);

	if (num_online_nodes() > 1 && !numabalancing_override) {
		pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
			numabalancing_default ? "Enabling" : "Disabling");
		set_numabalancing_state(numabalancing_default);
	}
}
"Enabling" : "Disabling"); 289062306a36Sopenharmony_ci set_numabalancing_state(numabalancing_default); 289162306a36Sopenharmony_ci } 289262306a36Sopenharmony_ci} 289362306a36Sopenharmony_ci 289462306a36Sopenharmony_cistatic int __init setup_numabalancing(char *str) 289562306a36Sopenharmony_ci{ 289662306a36Sopenharmony_ci int ret = 0; 289762306a36Sopenharmony_ci if (!str) 289862306a36Sopenharmony_ci goto out; 289962306a36Sopenharmony_ci 290062306a36Sopenharmony_ci if (!strcmp(str, "enable")) { 290162306a36Sopenharmony_ci numabalancing_override = 1; 290262306a36Sopenharmony_ci ret = 1; 290362306a36Sopenharmony_ci } else if (!strcmp(str, "disable")) { 290462306a36Sopenharmony_ci numabalancing_override = -1; 290562306a36Sopenharmony_ci ret = 1; 290662306a36Sopenharmony_ci } 290762306a36Sopenharmony_ciout: 290862306a36Sopenharmony_ci if (!ret) 290962306a36Sopenharmony_ci pr_warn("Unable to parse numa_balancing=\n"); 291062306a36Sopenharmony_ci 291162306a36Sopenharmony_ci return ret; 291262306a36Sopenharmony_ci} 291362306a36Sopenharmony_ci__setup("numa_balancing=", setup_numabalancing); 291462306a36Sopenharmony_ci#else 291562306a36Sopenharmony_cistatic inline void __init check_numabalancing_enable(void) 291662306a36Sopenharmony_ci{ 291762306a36Sopenharmony_ci} 291862306a36Sopenharmony_ci#endif /* CONFIG_NUMA_BALANCING */ 291962306a36Sopenharmony_ci 292062306a36Sopenharmony_ci/* assumes fs == KERNEL_DS */ 292162306a36Sopenharmony_civoid __init numa_policy_init(void) 292262306a36Sopenharmony_ci{ 292362306a36Sopenharmony_ci nodemask_t interleave_nodes; 292462306a36Sopenharmony_ci unsigned long largest = 0; 292562306a36Sopenharmony_ci int nid, prefer = 0; 292662306a36Sopenharmony_ci 292762306a36Sopenharmony_ci policy_cache = kmem_cache_create("numa_policy", 292862306a36Sopenharmony_ci sizeof(struct mempolicy), 292962306a36Sopenharmony_ci 0, SLAB_PANIC, NULL); 293062306a36Sopenharmony_ci 293162306a36Sopenharmony_ci sn_cache = kmem_cache_create("shared_policy_node", 293262306a36Sopenharmony_ci sizeof(struct sp_node), 293362306a36Sopenharmony_ci 0, SLAB_PANIC, NULL); 293462306a36Sopenharmony_ci 293562306a36Sopenharmony_ci for_each_node(nid) { 293662306a36Sopenharmony_ci preferred_node_policy[nid] = (struct mempolicy) { 293762306a36Sopenharmony_ci .refcnt = ATOMIC_INIT(1), 293862306a36Sopenharmony_ci .mode = MPOL_PREFERRED, 293962306a36Sopenharmony_ci .flags = MPOL_F_MOF | MPOL_F_MORON, 294062306a36Sopenharmony_ci .nodes = nodemask_of_node(nid), 294162306a36Sopenharmony_ci }; 294262306a36Sopenharmony_ci } 294362306a36Sopenharmony_ci 294462306a36Sopenharmony_ci /* 294562306a36Sopenharmony_ci * Set interleaving policy for system init. Interleaving is only 294662306a36Sopenharmony_ci * enabled across suitably sized nodes (default is >= 16MB), or 294762306a36Sopenharmony_ci * fall back to the largest node if they're all smaller. 294862306a36Sopenharmony_ci */ 294962306a36Sopenharmony_ci nodes_clear(interleave_nodes); 295062306a36Sopenharmony_ci for_each_node_state(nid, N_MEMORY) { 295162306a36Sopenharmony_ci unsigned long total_pages = node_present_pages(nid); 295262306a36Sopenharmony_ci 295362306a36Sopenharmony_ci /* Preserve the largest node */ 295462306a36Sopenharmony_ci if (largest < total_pages) { 295562306a36Sopenharmony_ci largest = total_pages; 295662306a36Sopenharmony_ci prefer = nid; 295762306a36Sopenharmony_ci } 295862306a36Sopenharmony_ci 295962306a36Sopenharmony_ci /* Interleave this node? 
/* assumes fs == KERNEL_DS */
void __init numa_policy_init(void)
{
	nodemask_t interleave_nodes;
	unsigned long largest = 0;
	int nid, prefer = 0;

	policy_cache = kmem_cache_create("numa_policy",
					 sizeof(struct mempolicy),
					 0, SLAB_PANIC, NULL);

	sn_cache = kmem_cache_create("shared_policy_node",
				     sizeof(struct sp_node),
				     0, SLAB_PANIC, NULL);

	for_each_node(nid) {
		preferred_node_policy[nid] = (struct mempolicy) {
			.refcnt = ATOMIC_INIT(1),
			.mode = MPOL_PREFERRED,
			.flags = MPOL_F_MOF | MPOL_F_MORON,
			.nodes = nodemask_of_node(nid),
		};
	}

	/*
	 * Set interleaving policy for system init. Interleaving is only
	 * enabled across suitably sized nodes (default is >= 16MB), or
	 * fall back to the largest node if they're all smaller.
	 */
	nodes_clear(interleave_nodes);
	for_each_node_state(nid, N_MEMORY) {
		unsigned long total_pages = node_present_pages(nid);

		/* Preserve the largest node */
		if (largest < total_pages) {
			largest = total_pages;
			prefer = nid;
		}

		/* Interleave this node? */
		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
			node_set(nid, interleave_nodes);
	}

	/* All too small, use the largest */
	if (unlikely(nodes_empty(interleave_nodes)))
		node_set(prefer, interleave_nodes);

	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
		pr_err("%s: interleaving failed\n", __func__);

	check_numabalancing_enable();
}
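/*
 * Worked example of the interleave threshold in numa_policy_init() above:
 * with 4KB pages (PAGE_SHIFT == 12), a node qualifies when
 * total_pages << 12 >= 16 << 20, i.e. when it has at least 4096 present
 * pages (16MB).  A machine whose nodes are all smaller than that ends up
 * "interleaving" over just the single largest node, which degenerates to
 * preferring it.
 */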
/* Reset policy of current process to default */
void numa_default_policy(void)
{
	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
}

/*
 * Parse and format mempolicy from/to strings
 */

static const char * const policy_modes[] =
{
	[MPOL_DEFAULT]    = "default",
	[MPOL_PREFERRED]  = "prefer",
	[MPOL_BIND]       = "bind",
	[MPOL_INTERLEAVE] = "interleave",
	[MPOL_LOCAL]      = "local",
	[MPOL_PREFERRED_MANY]  = "prefer (many)",
};

#ifdef CONFIG_TMPFS
/**
 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
 * @str: string containing mempolicy to parse
 * @mpol: pointer to struct mempolicy pointer, returned on success.
 *
 * Format of input:
 *	<mode>[=<flags>][:<nodelist>]
 *
 * Return: %0 on success, else %1
 */
int mpol_parse_str(char *str, struct mempolicy **mpol)
{
	struct mempolicy *new = NULL;
	unsigned short mode_flags;
	nodemask_t nodes;
	char *nodelist = strchr(str, ':');
	char *flags = strchr(str, '=');
	int err = 1, mode;

	if (flags)
		*flags++ = '\0';	/* terminate mode string */

	if (nodelist) {
		/* NUL-terminate mode or flags string */
		*nodelist++ = '\0';
		if (nodelist_parse(nodelist, nodes))
			goto out;
		if (!nodes_subset(nodes, node_states[N_MEMORY]))
			goto out;
	} else
		nodes_clear(nodes);

	mode = match_string(policy_modes, MPOL_MAX, str);
	if (mode < 0)
		goto out;

	switch (mode) {
	case MPOL_PREFERRED:
		/*
		 * Insist on a nodelist of one node only, although later
		 * we use first_node(nodes) to grab a single node, so here
		 * nodelist (or nodes) cannot be empty.
		 */
		if (nodelist) {
			char *rest = nodelist;
			while (isdigit(*rest))
				rest++;
			if (*rest)
				goto out;
			if (nodes_empty(nodes))
				goto out;
		}
		break;
	case MPOL_INTERLEAVE:
		/*
		 * Default to online nodes with memory if no nodelist
		 */
		if (!nodelist)
			nodes = node_states[N_MEMORY];
		break;
	case MPOL_LOCAL:
		/*
		 * Don't allow a nodelist;  mpol_new() checks flags
		 */
		if (nodelist)
			goto out;
		break;
	case MPOL_DEFAULT:
		/*
		 * Insist on an empty nodelist
		 */
		if (!nodelist)
			err = 0;
		goto out;
	case MPOL_PREFERRED_MANY:
	case MPOL_BIND:
		/*
		 * Insist on a nodelist
		 */
		if (!nodelist)
			goto out;
	}

	mode_flags = 0;
	if (flags) {
		/*
		 * Currently, we only support two mutually exclusive
		 * mode flags.
		 */
		if (!strcmp(flags, "static"))
			mode_flags |= MPOL_F_STATIC_NODES;
		else if (!strcmp(flags, "relative"))
			mode_flags |= MPOL_F_RELATIVE_NODES;
		else
			goto out;
	}

	new = mpol_new(mode, mode_flags, &nodes);
	if (IS_ERR(new))
		goto out;

	/*
	 * Save nodes for mpol_to_str() to show the tmpfs mount options
	 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
	 */
	if (mode != MPOL_PREFERRED) {
		new->nodes = nodes;
	} else if (nodelist) {
		nodes_clear(new->nodes);
		node_set(first_node(nodes), new->nodes);
	} else {
		new->mode = MPOL_LOCAL;
	}

	/*
	 * Save nodes for contextualization: this will be used to "clone"
	 * the mempolicy in a specific context [cpuset] at a later time.
	 */
	new->w.user_nodemask = nodes;

	err = 0;

out:
	/* Restore string for error message */
	if (nodelist)
		*--nodelist = ':';
	if (flags)
		*--flags = '=';
	if (!err)
		*mpol = new;
	return err;
}
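/*
 * Illustrative inputs for the parser above (the tmpfs "mpol=" mount
 * option value):
 *
 *	"interleave:0-3"	-> MPOL_INTERLEAVE over nodes 0-3
 *	"bind=static:1,3"	-> MPOL_BIND | MPOL_F_STATIC_NODES on 1 and 3
 *	"prefer:2"		-> MPOL_PREFERRED, single node 2
 *	"local"			-> MPOL_LOCAL, no nodelist allowed
 *
 * A nodelist naming memoryless or offline nodes fails the
 * nodes_subset(..., N_MEMORY) check and the parse returns 1.
 */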
#endif /* CONFIG_TMPFS */

/**
 * mpol_to_str - format a mempolicy structure for printing
 * @buffer: to contain formatted mempolicy string
 * @maxlen: length of @buffer
 * @pol: pointer to mempolicy to be formatted
 *
 * Convert @pol into a string.  If @buffer is too short, truncate the string.
 * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
 * longest flag, "relative", and to display at least a few node ids.
 */
void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
{
	char *p = buffer;
	nodemask_t nodes = NODE_MASK_NONE;
	unsigned short mode = MPOL_DEFAULT;
	unsigned short flags = 0;

	if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
		mode = pol->mode;
		flags = pol->flags;
	}

	switch (mode) {
	case MPOL_DEFAULT:
	case MPOL_LOCAL:
		break;
	case MPOL_PREFERRED:
	case MPOL_PREFERRED_MANY:
	case MPOL_BIND:
	case MPOL_INTERLEAVE:
		nodes = pol->nodes;
		break;
	default:
		WARN_ON_ONCE(1);
		snprintf(p, maxlen, "unknown");
		return;
	}

	p += snprintf(p, maxlen, "%s", policy_modes[mode]);

	if (flags & MPOL_MODE_FLAGS) {
		p += snprintf(p, buffer + maxlen - p, "=");

		/*
		 * Currently, the only defined flags are mutually exclusive
		 */
		if (flags & MPOL_F_STATIC_NODES)
			p += snprintf(p, buffer + maxlen - p, "static");
		else if (flags & MPOL_F_RELATIVE_NODES)
			p += snprintf(p, buffer + maxlen - p, "relative");
	}

	if (!nodes_empty(nodes))
		p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
			       nodemask_pr_args(&nodes));
}
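/*
 * Illustrative outputs of mpol_to_str() for the inputs shown after
 * mpol_parse_str(): an interleave policy over nodes 0-3 formats as
 * "interleave:0-3", a static bind of nodes 1 and 3 as "bind=static:1,3",
 * and the default policy as just "default".  This is the inverse of
 * mpol_parse_str(), which is what lets tmpfs round-trip the mpol= option
 * through /proc/mounts.
 */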