162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-only
262306a36Sopenharmony_ci/*
362306a36Sopenharmony_ci * Simple NUMA memory policy for the Linux kernel.
462306a36Sopenharmony_ci *
562306a36Sopenharmony_ci * Copyright 2003,2004 Andi Kleen, SuSE Labs.
662306a36Sopenharmony_ci * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
762306a36Sopenharmony_ci *
862306a36Sopenharmony_ci * NUMA policy allows the user to give hints in which node(s) memory should
962306a36Sopenharmony_ci * be allocated.
1062306a36Sopenharmony_ci *
1162306a36Sopenharmony_ci * Support four policies per VMA and per process:
1262306a36Sopenharmony_ci *
1362306a36Sopenharmony_ci * The VMA policy has priority over the process policy for a page fault.
1462306a36Sopenharmony_ci *
1562306a36Sopenharmony_ci * interleave     Allocate memory interleaved over a set of nodes,
1662306a36Sopenharmony_ci *                with normal fallback if it fails.
1762306a36Sopenharmony_ci *                For VMA based allocations this interleaves based on the
1862306a36Sopenharmony_ci *                offset into the backing object or offset into the mapping
 *                for anonymous memory. For process policy a process counter
2062306a36Sopenharmony_ci *                is used.
2162306a36Sopenharmony_ci *
2262306a36Sopenharmony_ci * bind           Only allocate memory on a specific set of nodes,
2362306a36Sopenharmony_ci *                no fallback.
2462306a36Sopenharmony_ci *                FIXME: memory is allocated starting with the first node
2562306a36Sopenharmony_ci *                to the last. It would be better if bind would truly restrict
2662306a36Sopenharmony_ci *                the allocation to memory nodes instead
2762306a36Sopenharmony_ci *
2862306a36Sopenharmony_ci * preferred       Try a specific node first before normal fallback.
2962306a36Sopenharmony_ci *                As a special case NUMA_NO_NODE here means do the allocation
3062306a36Sopenharmony_ci *                on the local CPU. This is normally identical to default,
3162306a36Sopenharmony_ci *                but useful to set in a VMA when you have a non default
3262306a36Sopenharmony_ci *                process policy.
3362306a36Sopenharmony_ci *
3462306a36Sopenharmony_ci * preferred many Try a set of nodes first before normal fallback. This is
3562306a36Sopenharmony_ci *                similar to preferred without the special case.
3662306a36Sopenharmony_ci *
3762306a36Sopenharmony_ci * default        Allocate on the local node first, or when on a VMA
3862306a36Sopenharmony_ci *                use the process policy. This is what Linux always did
3962306a36Sopenharmony_ci *		  in a NUMA aware kernel and still does by, ahem, default.
4062306a36Sopenharmony_ci *
4162306a36Sopenharmony_ci * The process policy is applied for most non interrupt memory allocations
4262306a36Sopenharmony_ci * in that process' context. Interrupts ignore the policies and always
4362306a36Sopenharmony_ci * try to allocate on the local CPU. The VMA policy is only applied for memory
4462306a36Sopenharmony_ci * allocations for a VMA in the VM.
4562306a36Sopenharmony_ci *
4662306a36Sopenharmony_ci * Currently there are a few corner cases in swapping where the policy
4762306a36Sopenharmony_ci * is not applied, but the majority should be handled. When process policy
4862306a36Sopenharmony_ci * is used it is not remembered over swap outs/swap ins.
4962306a36Sopenharmony_ci *
5062306a36Sopenharmony_ci * Only the highest zone in the zone hierarchy gets policied. Allocations
5162306a36Sopenharmony_ci * requesting a lower zone just use default policy. This implies that
5262306a36Sopenharmony_ci * on systems with highmem kernel lowmem allocation don't get policied.
5362306a36Sopenharmony_ci * Same with GFP_DMA allocations.
5462306a36Sopenharmony_ci *
5562306a36Sopenharmony_ci * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
5662306a36Sopenharmony_ci * all users and remembered even when nobody has memory mapped.
5762306a36Sopenharmony_ci */
5862306a36Sopenharmony_ci
5962306a36Sopenharmony_ci/* Notebook:
6062306a36Sopenharmony_ci   fix mmap readahead to honour policy and enable policy for any page cache
6162306a36Sopenharmony_ci   object
6262306a36Sopenharmony_ci   statistics for bigpages
6362306a36Sopenharmony_ci   global policy for page cache? currently it uses process policy. Requires
6462306a36Sopenharmony_ci   first item above.
6562306a36Sopenharmony_ci   handle mremap for shared memory (currently ignored for the policy)
6662306a36Sopenharmony_ci   grows down?
6762306a36Sopenharmony_ci   make bind policy root only? It can trigger oom much faster and the
6862306a36Sopenharmony_ci   kernel is not always grateful with that.
6962306a36Sopenharmony_ci*/
7062306a36Sopenharmony_ci
7162306a36Sopenharmony_ci#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
7262306a36Sopenharmony_ci
7362306a36Sopenharmony_ci#include <linux/mempolicy.h>
7462306a36Sopenharmony_ci#include <linux/pagewalk.h>
7562306a36Sopenharmony_ci#include <linux/highmem.h>
7662306a36Sopenharmony_ci#include <linux/hugetlb.h>
7762306a36Sopenharmony_ci#include <linux/kernel.h>
7862306a36Sopenharmony_ci#include <linux/sched.h>
7962306a36Sopenharmony_ci#include <linux/sched/mm.h>
8062306a36Sopenharmony_ci#include <linux/sched/numa_balancing.h>
8162306a36Sopenharmony_ci#include <linux/sched/task.h>
8262306a36Sopenharmony_ci#include <linux/nodemask.h>
8362306a36Sopenharmony_ci#include <linux/cpuset.h>
8462306a36Sopenharmony_ci#include <linux/slab.h>
8562306a36Sopenharmony_ci#include <linux/string.h>
8662306a36Sopenharmony_ci#include <linux/export.h>
8762306a36Sopenharmony_ci#include <linux/nsproxy.h>
8862306a36Sopenharmony_ci#include <linux/interrupt.h>
8962306a36Sopenharmony_ci#include <linux/init.h>
9062306a36Sopenharmony_ci#include <linux/compat.h>
9162306a36Sopenharmony_ci#include <linux/ptrace.h>
9262306a36Sopenharmony_ci#include <linux/swap.h>
9362306a36Sopenharmony_ci#include <linux/seq_file.h>
9462306a36Sopenharmony_ci#include <linux/proc_fs.h>
9562306a36Sopenharmony_ci#include <linux/migrate.h>
9662306a36Sopenharmony_ci#include <linux/ksm.h>
9762306a36Sopenharmony_ci#include <linux/rmap.h>
9862306a36Sopenharmony_ci#include <linux/security.h>
9962306a36Sopenharmony_ci#include <linux/syscalls.h>
10062306a36Sopenharmony_ci#include <linux/ctype.h>
10162306a36Sopenharmony_ci#include <linux/mm_inline.h>
10262306a36Sopenharmony_ci#include <linux/mmu_notifier.h>
10362306a36Sopenharmony_ci#include <linux/printk.h>
10462306a36Sopenharmony_ci#include <linux/swapops.h>
10562306a36Sopenharmony_ci
10662306a36Sopenharmony_ci#include <asm/tlbflush.h>
10762306a36Sopenharmony_ci#include <asm/tlb.h>
10862306a36Sopenharmony_ci#include <linux/uaccess.h>
10962306a36Sopenharmony_ci
11062306a36Sopenharmony_ci#include "internal.h"
11162306a36Sopenharmony_ci
/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */

/* Slab cache for struct mempolicy (see mpol_new()/__mpol_put()). */
static struct kmem_cache *policy_cache;
/* sn_cache: presumably for shared-policy nodes — used later in this file. */
static struct kmem_cache *sn_cache;

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
enum zone_type policy_zone = 0;

/*
 * run-time system-wide default policy => local allocation
 */
static struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.mode = MPOL_LOCAL,
};

/* Per-node fallback policies consulted by get_task_policy() when the
   task has no policy of its own. */
static struct mempolicy preferred_node_policy[MAX_NUMNODES];
13262306a36Sopenharmony_ci
13362306a36Sopenharmony_ci/**
13462306a36Sopenharmony_ci * numa_nearest_node - Find nearest node by state
13562306a36Sopenharmony_ci * @node: Node id to start the search
13662306a36Sopenharmony_ci * @state: State to filter the search
13762306a36Sopenharmony_ci *
13862306a36Sopenharmony_ci * Lookup the closest node by distance if @nid is not in state.
13962306a36Sopenharmony_ci *
14062306a36Sopenharmony_ci * Return: this @node if it is in state, otherwise the closest node by distance
14162306a36Sopenharmony_ci */
14262306a36Sopenharmony_ciint numa_nearest_node(int node, unsigned int state)
14362306a36Sopenharmony_ci{
14462306a36Sopenharmony_ci	int min_dist = INT_MAX, dist, n, min_node;
14562306a36Sopenharmony_ci
14662306a36Sopenharmony_ci	if (state >= NR_NODE_STATES)
14762306a36Sopenharmony_ci		return -EINVAL;
14862306a36Sopenharmony_ci
14962306a36Sopenharmony_ci	if (node == NUMA_NO_NODE || node_state(node, state))
15062306a36Sopenharmony_ci		return node;
15162306a36Sopenharmony_ci
15262306a36Sopenharmony_ci	min_node = node;
15362306a36Sopenharmony_ci	for_each_node_state(n, state) {
15462306a36Sopenharmony_ci		dist = node_distance(node, n);
15562306a36Sopenharmony_ci		if (dist < min_dist) {
15662306a36Sopenharmony_ci			min_dist = dist;
15762306a36Sopenharmony_ci			min_node = n;
15862306a36Sopenharmony_ci		}
15962306a36Sopenharmony_ci	}
16062306a36Sopenharmony_ci
16162306a36Sopenharmony_ci	return min_node;
16262306a36Sopenharmony_ci}
16362306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(numa_nearest_node);
16462306a36Sopenharmony_ci
16562306a36Sopenharmony_cistruct mempolicy *get_task_policy(struct task_struct *p)
16662306a36Sopenharmony_ci{
16762306a36Sopenharmony_ci	struct mempolicy *pol = p->mempolicy;
16862306a36Sopenharmony_ci	int node;
16962306a36Sopenharmony_ci
17062306a36Sopenharmony_ci	if (pol)
17162306a36Sopenharmony_ci		return pol;
17262306a36Sopenharmony_ci
17362306a36Sopenharmony_ci	node = numa_node_id();
17462306a36Sopenharmony_ci	if (node != NUMA_NO_NODE) {
17562306a36Sopenharmony_ci		pol = &preferred_node_policy[node];
17662306a36Sopenharmony_ci		/* preferred_node_policy is not initialised early in boot */
17762306a36Sopenharmony_ci		if (pol->mode)
17862306a36Sopenharmony_ci			return pol;
17962306a36Sopenharmony_ci	}
18062306a36Sopenharmony_ci
18162306a36Sopenharmony_ci	return &default_policy;
18262306a36Sopenharmony_ci}
18362306a36Sopenharmony_ci
/*
 * Per-mode hooks: .create validates and installs a nodemask for a new
 * policy, .rebind folds in a changed cpuset nodemask.  The table itself
 * is defined below as mpol_ops[].
 */
static const struct mempolicy_operations {
	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
} mpol_ops[MPOL_MAX];
18862306a36Sopenharmony_ci
/*
 * Nonzero when the policy carries user mode flags (MPOL_MODE_FLAGS), in
 * which case w.user_nodemask preserves the user's original nodemask and
 * rebinds must recompute from it rather than remap pol->nodes.
 */
static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
{
	return pol->flags & MPOL_MODE_FLAGS;
}
19362306a36Sopenharmony_ci
19462306a36Sopenharmony_cistatic void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
19562306a36Sopenharmony_ci				   const nodemask_t *rel)
19662306a36Sopenharmony_ci{
19762306a36Sopenharmony_ci	nodemask_t tmp;
19862306a36Sopenharmony_ci	nodes_fold(tmp, *orig, nodes_weight(*rel));
19962306a36Sopenharmony_ci	nodes_onto(*ret, tmp, *rel);
20062306a36Sopenharmony_ci}
20162306a36Sopenharmony_ci
20262306a36Sopenharmony_cistatic int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
20362306a36Sopenharmony_ci{
20462306a36Sopenharmony_ci	if (nodes_empty(*nodes))
20562306a36Sopenharmony_ci		return -EINVAL;
20662306a36Sopenharmony_ci	pol->nodes = *nodes;
20762306a36Sopenharmony_ci	return 0;
20862306a36Sopenharmony_ci}
20962306a36Sopenharmony_ci
21062306a36Sopenharmony_cistatic int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
21162306a36Sopenharmony_ci{
21262306a36Sopenharmony_ci	if (nodes_empty(*nodes))
21362306a36Sopenharmony_ci		return -EINVAL;
21462306a36Sopenharmony_ci
21562306a36Sopenharmony_ci	nodes_clear(pol->nodes);
21662306a36Sopenharmony_ci	node_set(first_node(*nodes), pol->nodes);
21762306a36Sopenharmony_ci	return 0;
21862306a36Sopenharmony_ci}
21962306a36Sopenharmony_ci
22062306a36Sopenharmony_ci/*
22162306a36Sopenharmony_ci * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
22262306a36Sopenharmony_ci * any, for the new policy.  mpol_new() has already validated the nodes
22362306a36Sopenharmony_ci * parameter with respect to the policy mode and flags.
22462306a36Sopenharmony_ci *
22562306a36Sopenharmony_ci * Must be called holding task's alloc_lock to protect task's mems_allowed
22662306a36Sopenharmony_ci * and mempolicy.  May also be called holding the mmap_lock for write.
22762306a36Sopenharmony_ci */
22862306a36Sopenharmony_cistatic int mpol_set_nodemask(struct mempolicy *pol,
22962306a36Sopenharmony_ci		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
23062306a36Sopenharmony_ci{
23162306a36Sopenharmony_ci	int ret;
23262306a36Sopenharmony_ci
23362306a36Sopenharmony_ci	/*
23462306a36Sopenharmony_ci	 * Default (pol==NULL) resp. local memory policies are not a
23562306a36Sopenharmony_ci	 * subject of any remapping. They also do not need any special
23662306a36Sopenharmony_ci	 * constructor.
23762306a36Sopenharmony_ci	 */
23862306a36Sopenharmony_ci	if (!pol || pol->mode == MPOL_LOCAL)
23962306a36Sopenharmony_ci		return 0;
24062306a36Sopenharmony_ci
24162306a36Sopenharmony_ci	/* Check N_MEMORY */
24262306a36Sopenharmony_ci	nodes_and(nsc->mask1,
24362306a36Sopenharmony_ci		  cpuset_current_mems_allowed, node_states[N_MEMORY]);
24462306a36Sopenharmony_ci
24562306a36Sopenharmony_ci	VM_BUG_ON(!nodes);
24662306a36Sopenharmony_ci
24762306a36Sopenharmony_ci	if (pol->flags & MPOL_F_RELATIVE_NODES)
24862306a36Sopenharmony_ci		mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
24962306a36Sopenharmony_ci	else
25062306a36Sopenharmony_ci		nodes_and(nsc->mask2, *nodes, nsc->mask1);
25162306a36Sopenharmony_ci
25262306a36Sopenharmony_ci	if (mpol_store_user_nodemask(pol))
25362306a36Sopenharmony_ci		pol->w.user_nodemask = *nodes;
25462306a36Sopenharmony_ci	else
25562306a36Sopenharmony_ci		pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;
25662306a36Sopenharmony_ci
25762306a36Sopenharmony_ci	ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
25862306a36Sopenharmony_ci	return ret;
25962306a36Sopenharmony_ci}
26062306a36Sopenharmony_ci
26162306a36Sopenharmony_ci/*
26262306a36Sopenharmony_ci * This function just creates a new policy, does some check and simple
26362306a36Sopenharmony_ci * initialization. You must invoke mpol_set_nodemask() to set nodes.
26462306a36Sopenharmony_ci */
26562306a36Sopenharmony_cistatic struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
26662306a36Sopenharmony_ci				  nodemask_t *nodes)
26762306a36Sopenharmony_ci{
26862306a36Sopenharmony_ci	struct mempolicy *policy;
26962306a36Sopenharmony_ci
27062306a36Sopenharmony_ci	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
27162306a36Sopenharmony_ci		 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
27262306a36Sopenharmony_ci
27362306a36Sopenharmony_ci	if (mode == MPOL_DEFAULT) {
27462306a36Sopenharmony_ci		if (nodes && !nodes_empty(*nodes))
27562306a36Sopenharmony_ci			return ERR_PTR(-EINVAL);
27662306a36Sopenharmony_ci		return NULL;
27762306a36Sopenharmony_ci	}
27862306a36Sopenharmony_ci	VM_BUG_ON(!nodes);
27962306a36Sopenharmony_ci
28062306a36Sopenharmony_ci	/*
28162306a36Sopenharmony_ci	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
28262306a36Sopenharmony_ci	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
28362306a36Sopenharmony_ci	 * All other modes require a valid pointer to a non-empty nodemask.
28462306a36Sopenharmony_ci	 */
28562306a36Sopenharmony_ci	if (mode == MPOL_PREFERRED) {
28662306a36Sopenharmony_ci		if (nodes_empty(*nodes)) {
28762306a36Sopenharmony_ci			if (((flags & MPOL_F_STATIC_NODES) ||
28862306a36Sopenharmony_ci			     (flags & MPOL_F_RELATIVE_NODES)))
28962306a36Sopenharmony_ci				return ERR_PTR(-EINVAL);
29062306a36Sopenharmony_ci
29162306a36Sopenharmony_ci			mode = MPOL_LOCAL;
29262306a36Sopenharmony_ci		}
29362306a36Sopenharmony_ci	} else if (mode == MPOL_LOCAL) {
29462306a36Sopenharmony_ci		if (!nodes_empty(*nodes) ||
29562306a36Sopenharmony_ci		    (flags & MPOL_F_STATIC_NODES) ||
29662306a36Sopenharmony_ci		    (flags & MPOL_F_RELATIVE_NODES))
29762306a36Sopenharmony_ci			return ERR_PTR(-EINVAL);
29862306a36Sopenharmony_ci	} else if (nodes_empty(*nodes))
29962306a36Sopenharmony_ci		return ERR_PTR(-EINVAL);
30062306a36Sopenharmony_ci	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
30162306a36Sopenharmony_ci	if (!policy)
30262306a36Sopenharmony_ci		return ERR_PTR(-ENOMEM);
30362306a36Sopenharmony_ci	atomic_set(&policy->refcnt, 1);
30462306a36Sopenharmony_ci	policy->mode = mode;
30562306a36Sopenharmony_ci	policy->flags = flags;
30662306a36Sopenharmony_ci	policy->home_node = NUMA_NO_NODE;
30762306a36Sopenharmony_ci
30862306a36Sopenharmony_ci	return policy;
30962306a36Sopenharmony_ci}
31062306a36Sopenharmony_ci
31162306a36Sopenharmony_ci/* Slow path of a mpol destructor. */
31262306a36Sopenharmony_civoid __mpol_put(struct mempolicy *p)
31362306a36Sopenharmony_ci{
31462306a36Sopenharmony_ci	if (!atomic_dec_and_test(&p->refcnt))
31562306a36Sopenharmony_ci		return;
31662306a36Sopenharmony_ci	kmem_cache_free(policy_cache, p);
31762306a36Sopenharmony_ci}
31862306a36Sopenharmony_ci
/* MPOL_DEFAULT and MPOL_LOCAL carry no nodemask, so rebinding is a no-op. */
static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
{
}
32262306a36Sopenharmony_ci
32362306a36Sopenharmony_cistatic void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
32462306a36Sopenharmony_ci{
32562306a36Sopenharmony_ci	nodemask_t tmp;
32662306a36Sopenharmony_ci
32762306a36Sopenharmony_ci	if (pol->flags & MPOL_F_STATIC_NODES)
32862306a36Sopenharmony_ci		nodes_and(tmp, pol->w.user_nodemask, *nodes);
32962306a36Sopenharmony_ci	else if (pol->flags & MPOL_F_RELATIVE_NODES)
33062306a36Sopenharmony_ci		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
33162306a36Sopenharmony_ci	else {
33262306a36Sopenharmony_ci		nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed,
33362306a36Sopenharmony_ci								*nodes);
33462306a36Sopenharmony_ci		pol->w.cpuset_mems_allowed = *nodes;
33562306a36Sopenharmony_ci	}
33662306a36Sopenharmony_ci
33762306a36Sopenharmony_ci	if (nodes_empty(tmp))
33862306a36Sopenharmony_ci		tmp = *nodes;
33962306a36Sopenharmony_ci
34062306a36Sopenharmony_ci	pol->nodes = tmp;
34162306a36Sopenharmony_ci}
34262306a36Sopenharmony_ci
/*
 * For MPOL_PREFERRED (and, per mpol_ops[], MPOL_PREFERRED_MANY) only the
 * recorded cpuset mems_allowed is refreshed; pol->nodes is left untouched.
 */
static void mpol_rebind_preferred(struct mempolicy *pol,
						const nodemask_t *nodes)
{
	pol->w.cpuset_mems_allowed = *nodes;
}
34862306a36Sopenharmony_ci
34962306a36Sopenharmony_ci/*
35062306a36Sopenharmony_ci * mpol_rebind_policy - Migrate a policy to a different set of nodes
35162306a36Sopenharmony_ci *
35262306a36Sopenharmony_ci * Per-vma policies are protected by mmap_lock. Allocations using per-task
35362306a36Sopenharmony_ci * policies are protected by task->mems_allowed_seq to prevent a premature
35462306a36Sopenharmony_ci * OOM/allocation failure due to parallel nodemask modification.
35562306a36Sopenharmony_ci */
35662306a36Sopenharmony_cistatic void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
35762306a36Sopenharmony_ci{
35862306a36Sopenharmony_ci	if (!pol || pol->mode == MPOL_LOCAL)
35962306a36Sopenharmony_ci		return;
36062306a36Sopenharmony_ci	if (!mpol_store_user_nodemask(pol) &&
36162306a36Sopenharmony_ci	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
36262306a36Sopenharmony_ci		return;
36362306a36Sopenharmony_ci
36462306a36Sopenharmony_ci	mpol_ops[pol->mode].rebind(pol, newmask);
36562306a36Sopenharmony_ci}
36662306a36Sopenharmony_ci
/*
 * Wrapper for mpol_rebind_policy() that just requires task
 * pointer, and updates task mempolicy.
 *
 * Called with task's alloc_lock held.
 */

void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
{
	mpol_rebind_policy(tsk->mempolicy, new);
}
37862306a36Sopenharmony_ci
37962306a36Sopenharmony_ci/*
38062306a36Sopenharmony_ci * Rebind each vma in mm to new nodemask.
38162306a36Sopenharmony_ci *
38262306a36Sopenharmony_ci * Call holding a reference to mm.  Takes mm->mmap_lock during call.
38362306a36Sopenharmony_ci */
38462306a36Sopenharmony_ci
38562306a36Sopenharmony_civoid mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
38662306a36Sopenharmony_ci{
38762306a36Sopenharmony_ci	struct vm_area_struct *vma;
38862306a36Sopenharmony_ci	VMA_ITERATOR(vmi, mm, 0);
38962306a36Sopenharmony_ci
39062306a36Sopenharmony_ci	mmap_write_lock(mm);
39162306a36Sopenharmony_ci	for_each_vma(vmi, vma) {
39262306a36Sopenharmony_ci		vma_start_write(vma);
39362306a36Sopenharmony_ci		mpol_rebind_policy(vma->vm_policy, new);
39462306a36Sopenharmony_ci	}
39562306a36Sopenharmony_ci	mmap_write_unlock(mm);
39662306a36Sopenharmony_ci}
39762306a36Sopenharmony_ci
/*
 * Mode dispatch table: .create validates/installs a nodemask at policy
 * creation (via mpol_set_nodemask()), .rebind folds in a changed cpuset
 * nodemask (via mpol_rebind_policy()).  Modes with no .create entry
 * take no nodemask.
 */
static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
	[MPOL_DEFAULT] = {
		.rebind = mpol_rebind_default,
	},
	[MPOL_INTERLEAVE] = {
		.create = mpol_new_nodemask,
		.rebind = mpol_rebind_nodemask,
	},
	[MPOL_PREFERRED] = {
		/* keeps only the first node of the requested mask */
		.create = mpol_new_preferred,
		.rebind = mpol_rebind_preferred,
	},
	[MPOL_BIND] = {
		.create = mpol_new_nodemask,
		.rebind = mpol_rebind_nodemask,
	},
	[MPOL_LOCAL] = {
		.rebind = mpol_rebind_default,
	},
	[MPOL_PREFERRED_MANY] = {
		.create = mpol_new_nodemask,
		.rebind = mpol_rebind_preferred,
	},
};
42262306a36Sopenharmony_ci
static int migrate_folio_add(struct folio *folio, struct list_head *foliolist,
				unsigned long flags);

/* Shared state for the page-table walk done by the queue_folios_* helpers. */
struct queue_pages {
	struct list_head *pagelist;	/* folios queued for migration */
	unsigned long flags;		/* MPOL_MF_* control flags */
	nodemask_t *nmask;		/* nodes checked by queue_folio_required() */
	unsigned long start;		/* range being walked */
	unsigned long end;
	struct vm_area_struct *first;	/* first vma of the walked range */
	bool has_unmovable;		/* saw a folio that could not be queued */
};
43562306a36Sopenharmony_ci
43662306a36Sopenharmony_ci/*
43762306a36Sopenharmony_ci * Check if the folio's nid is in qp->nmask.
43862306a36Sopenharmony_ci *
43962306a36Sopenharmony_ci * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
44062306a36Sopenharmony_ci * in the invert of qp->nmask.
44162306a36Sopenharmony_ci */
44262306a36Sopenharmony_cistatic inline bool queue_folio_required(struct folio *folio,
44362306a36Sopenharmony_ci					struct queue_pages *qp)
44462306a36Sopenharmony_ci{
44562306a36Sopenharmony_ci	int nid = folio_nid(folio);
44662306a36Sopenharmony_ci	unsigned long flags = qp->flags;
44762306a36Sopenharmony_ci
44862306a36Sopenharmony_ci	return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
44962306a36Sopenharmony_ci}
45062306a36Sopenharmony_ci
/*
 * queue_folios_pmd() has three possible return values:
 * 0 - folios are placed on the right node or queued successfully, or
 *     special page is met, i.e. zero page, or unmovable page is found
 *     but continue walking (indicated by queue_pages.has_unmovable).
 * -EIO - is migration entry or only MPOL_MF_STRICT was specified and an
 *        existing folio was already on a node that does not follow the
 *        policy.
 */
static int queue_folios_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
				unsigned long end, struct mm_walk *walk)
	__releases(ptl)
{
	int ret = 0;
	struct folio *folio;
	struct queue_pages *qp = walk->private;
	unsigned long flags;

	/* A migration entry means the folio is already on the move. */
	if (unlikely(is_pmd_migration_entry(*pmd))) {
		ret = -EIO;
		goto unlock;
	}
	folio = pfn_folio(pmd_pfn(*pmd));
	/* The huge zero page is special; skip the rest of this PMD. */
	if (is_huge_zero_page(&folio->page)) {
		walk->action = ACTION_CONTINUE;
		goto unlock;
	}
	/* Folio already on an acceptable node: nothing to do. */
	if (!queue_folio_required(folio, qp))
		goto unlock;

	flags = qp->flags;
	/* go to folio migration */
	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
		if (!vma_migratable(walk->vma) ||
		    migrate_folio_add(folio, qp->pagelist, flags)) {
			/* Record the failure but keep walking under MOVE*. */
			qp->has_unmovable = true;
			goto unlock;
		}
	} else
		ret = -EIO;	/* STRICT alone: report the misplacement */
unlock:
	spin_unlock(ptl);	/* always released, per __releases(ptl) */
	return ret;
}
49562306a36Sopenharmony_ci
/*
 * Scan through pages checking if pages follow certain conditions,
 * and move them to the pagelist if they do.
 *
 * queue_folios_pte_range() has three possible return values:
 * 0 - folios are placed on the right node or queued successfully, or
 *     special page is met, i.e. zero page, or unmovable page is found
 *     but continue walking (indicated by queue_pages.has_unmovable).
 * -EIO - only MPOL_MF_STRICT was specified and an existing folio was already
 *        on a node that does not follow the policy.
 */
static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
			unsigned long end, struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	struct folio *folio;
	struct queue_pages *qp = walk->private;
	unsigned long flags = qp->flags;
	pte_t *pte, *mapped_pte;
	pte_t ptent;
	spinlock_t *ptl;

	/* A huge PMD is handled (and its ptl dropped) by queue_folios_pmd(). */
	ptl = pmd_trans_huge_lock(pmd, vma);
	if (ptl)
		return queue_folios_pmd(pmd, ptl, addr, end, walk);

	mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
	if (!pte) {
		/* PTE table could not be mapped; ask the walker to retry. */
		walk->action = ACTION_AGAIN;
		return 0;
	}
	for (; addr != end; pte++, addr += PAGE_SIZE) {
		ptent = ptep_get(pte);
		if (!pte_present(ptent))
			continue;
		folio = vm_normal_folio(vma, addr, ptent);
		if (!folio || folio_is_zone_device(folio))
			continue;
		/*
		 * vm_normal_folio() filters out zero pages, but there might
		 * still be reserved folios to skip, perhaps in a VDSO.
		 */
		if (folio_test_reserved(folio))
			continue;
		if (!queue_folio_required(folio, qp))
			continue;
		if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
			/*
			 * MPOL_MF_STRICT must be specified if we get here.
			 * Continue walking vmas due to MPOL_MF_MOVE* flags.
			 */
			if (!vma_migratable(vma))
				qp->has_unmovable = true;

			/*
			 * Do not abort immediately since there may be
			 * temporary off LRU pages in the range.  Still
			 * need migrate other LRU pages.
			 */
			if (migrate_folio_add(folio, qp->pagelist, flags))
				qp->has_unmovable = true;
		} else
			break;	/* STRICT only: stop at first misplaced folio */
	}
	pte_unmap_unlock(mapped_pte, ptl);
	cond_resched();

	/* An early loop exit means a misplaced folio under MPOL_MF_STRICT. */
	return addr != end ? -EIO : 0;
}
56562306a36Sopenharmony_ci
/*
 * Hugetlb counterpart of queue_folios_pte_range(): examine one huge PTE
 * and queue the backing folio for migration when required.  Returns 0 or
 * -EIO with the same meaning as queue_folios_pte_range().
 */
static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask,
			       unsigned long addr, unsigned long end,
			       struct mm_walk *walk)
{
	int ret = 0;
#ifdef CONFIG_HUGETLB_PAGE
	struct queue_pages *qp = walk->private;
	unsigned long flags = (qp->flags & MPOL_MF_VALID);
	struct folio *folio;
	spinlock_t *ptl;
	pte_t entry;

	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
	entry = huge_ptep_get(pte);
	/* Skip non-present entries (holes, swapped/migrating entries). */
	if (!pte_present(entry))
		goto unlock;
	folio = pfn_folio(pte_pfn(entry));
	if (!queue_folio_required(folio, qp))
		goto unlock;

	if (flags == MPOL_MF_STRICT) {
		/*
		 * STRICT alone means only detecting misplaced folio and no
		 * need to further check other vma.
		 */
		ret = -EIO;
		goto unlock;
	}

	if (!vma_migratable(walk->vma)) {
		/*
		 * Must be STRICT with MOVE*, otherwise .test_walk() have
		 * stopped walking current vma.
		 * Detecting misplaced folio but allow migrating folios which
		 * have been queued.
		 */
		qp->has_unmovable = true;
		goto unlock;
	}

	/*
	 * With MPOL_MF_MOVE, we try to migrate only unshared folios. If it
	 * is shared it is likely not worth migrating.
	 *
	 * To check if the folio is shared, ideally we want to make sure
	 * every page is mapped to the same process. Doing that is very
	 * expensive, so check the estimated mapcount of the folio instead.
	 */
	if (flags & (MPOL_MF_MOVE_ALL) ||
	    (flags & MPOL_MF_MOVE && folio_estimated_sharers(folio) == 1 &&
	     !hugetlb_pmd_shared(pte))) {
		if (!isolate_hugetlb(folio, qp->pagelist) &&
			(flags & MPOL_MF_STRICT))
			/*
			 * Failed to isolate folio but allow migrating pages
			 * which have been queued.
			 */
			qp->has_unmovable = true;
	}
unlock:
	spin_unlock(ptl);
#else
	BUG();
#endif
	return ret;
}
63262306a36Sopenharmony_ci
63362306a36Sopenharmony_ci#ifdef CONFIG_NUMA_BALANCING
63462306a36Sopenharmony_ci/*
63562306a36Sopenharmony_ci * This is used to mark a range of virtual addresses to be inaccessible.
63662306a36Sopenharmony_ci * These are later cleared by a NUMA hinting fault. Depending on these
63762306a36Sopenharmony_ci * faults, pages may be migrated for better NUMA placement.
63862306a36Sopenharmony_ci *
63962306a36Sopenharmony_ci * This is assuming that NUMA faults are handled using PROT_NONE. If
64062306a36Sopenharmony_ci * an architecture makes a different choice, it will need further
64162306a36Sopenharmony_ci * changes to the core.
64262306a36Sopenharmony_ci */
64362306a36Sopenharmony_ciunsigned long change_prot_numa(struct vm_area_struct *vma,
64462306a36Sopenharmony_ci			unsigned long addr, unsigned long end)
64562306a36Sopenharmony_ci{
64662306a36Sopenharmony_ci	struct mmu_gather tlb;
64762306a36Sopenharmony_ci	long nr_updated;
64862306a36Sopenharmony_ci
64962306a36Sopenharmony_ci	tlb_gather_mmu(&tlb, vma->vm_mm);
65062306a36Sopenharmony_ci
65162306a36Sopenharmony_ci	nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA);
65262306a36Sopenharmony_ci	if (nr_updated > 0)
65362306a36Sopenharmony_ci		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
65462306a36Sopenharmony_ci
65562306a36Sopenharmony_ci	tlb_finish_mmu(&tlb);
65662306a36Sopenharmony_ci
65762306a36Sopenharmony_ci	return nr_updated;
65862306a36Sopenharmony_ci}
65962306a36Sopenharmony_ci#else
66062306a36Sopenharmony_cistatic unsigned long change_prot_numa(struct vm_area_struct *vma,
66162306a36Sopenharmony_ci			unsigned long addr, unsigned long end)
66262306a36Sopenharmony_ci{
66362306a36Sopenharmony_ci	return 0;
66462306a36Sopenharmony_ci}
66562306a36Sopenharmony_ci#endif /* CONFIG_NUMA_BALANCING */
66662306a36Sopenharmony_ci
66762306a36Sopenharmony_cistatic int queue_pages_test_walk(unsigned long start, unsigned long end,
66862306a36Sopenharmony_ci				struct mm_walk *walk)
66962306a36Sopenharmony_ci{
67062306a36Sopenharmony_ci	struct vm_area_struct *next, *vma = walk->vma;
67162306a36Sopenharmony_ci	struct queue_pages *qp = walk->private;
67262306a36Sopenharmony_ci	unsigned long endvma = vma->vm_end;
67362306a36Sopenharmony_ci	unsigned long flags = qp->flags;
67462306a36Sopenharmony_ci
67562306a36Sopenharmony_ci	/* range check first */
67662306a36Sopenharmony_ci	VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma);
67762306a36Sopenharmony_ci
67862306a36Sopenharmony_ci	if (!qp->first) {
67962306a36Sopenharmony_ci		qp->first = vma;
68062306a36Sopenharmony_ci		if (!(flags & MPOL_MF_DISCONTIG_OK) &&
68162306a36Sopenharmony_ci			(qp->start < vma->vm_start))
68262306a36Sopenharmony_ci			/* hole at head side of range */
68362306a36Sopenharmony_ci			return -EFAULT;
68462306a36Sopenharmony_ci	}
68562306a36Sopenharmony_ci	next = find_vma(vma->vm_mm, vma->vm_end);
68662306a36Sopenharmony_ci	if (!(flags & MPOL_MF_DISCONTIG_OK) &&
68762306a36Sopenharmony_ci		((vma->vm_end < qp->end) &&
68862306a36Sopenharmony_ci		(!next || vma->vm_end < next->vm_start)))
68962306a36Sopenharmony_ci		/* hole at middle or tail of range */
69062306a36Sopenharmony_ci		return -EFAULT;
69162306a36Sopenharmony_ci
69262306a36Sopenharmony_ci	/*
69362306a36Sopenharmony_ci	 * Need check MPOL_MF_STRICT to return -EIO if possible
69462306a36Sopenharmony_ci	 * regardless of vma_migratable
69562306a36Sopenharmony_ci	 */
69662306a36Sopenharmony_ci	if (!vma_migratable(vma) &&
69762306a36Sopenharmony_ci	    !(flags & MPOL_MF_STRICT))
69862306a36Sopenharmony_ci		return 1;
69962306a36Sopenharmony_ci
70062306a36Sopenharmony_ci	if (endvma > end)
70162306a36Sopenharmony_ci		endvma = end;
70262306a36Sopenharmony_ci
70362306a36Sopenharmony_ci	if (flags & MPOL_MF_LAZY) {
70462306a36Sopenharmony_ci		/* Similar to task_numa_work, skip inaccessible VMAs */
70562306a36Sopenharmony_ci		if (!is_vm_hugetlb_page(vma) && vma_is_accessible(vma) &&
70662306a36Sopenharmony_ci			!(vma->vm_flags & VM_MIXEDMAP))
70762306a36Sopenharmony_ci			change_prot_numa(vma, start, endvma);
70862306a36Sopenharmony_ci		return 1;
70962306a36Sopenharmony_ci	}
71062306a36Sopenharmony_ci
71162306a36Sopenharmony_ci	/* queue pages from current vma */
71262306a36Sopenharmony_ci	if (flags & MPOL_MF_VALID)
71362306a36Sopenharmony_ci		return 0;
71462306a36Sopenharmony_ci	return 1;
71562306a36Sopenharmony_ci}
71662306a36Sopenharmony_ci
71762306a36Sopenharmony_cistatic const struct mm_walk_ops queue_pages_walk_ops = {
71862306a36Sopenharmony_ci	.hugetlb_entry		= queue_folios_hugetlb,
71962306a36Sopenharmony_ci	.pmd_entry		= queue_folios_pte_range,
72062306a36Sopenharmony_ci	.test_walk		= queue_pages_test_walk,
72162306a36Sopenharmony_ci	.walk_lock		= PGWALK_RDLOCK,
72262306a36Sopenharmony_ci};
72362306a36Sopenharmony_ci
72462306a36Sopenharmony_cistatic const struct mm_walk_ops queue_pages_lock_vma_walk_ops = {
72562306a36Sopenharmony_ci	.hugetlb_entry		= queue_folios_hugetlb,
72662306a36Sopenharmony_ci	.pmd_entry		= queue_folios_pte_range,
72762306a36Sopenharmony_ci	.test_walk		= queue_pages_test_walk,
72862306a36Sopenharmony_ci	.walk_lock		= PGWALK_WRLOCK,
72962306a36Sopenharmony_ci};
73062306a36Sopenharmony_ci
73162306a36Sopenharmony_ci/*
73262306a36Sopenharmony_ci * Walk through page tables and collect pages to be migrated.
73362306a36Sopenharmony_ci *
73462306a36Sopenharmony_ci * If pages found in a given range are on a set of nodes (determined by
73562306a36Sopenharmony_ci * @nodes and @flags,) it's isolated and queued to the pagelist which is
73662306a36Sopenharmony_ci * passed via @private.
73762306a36Sopenharmony_ci *
73862306a36Sopenharmony_ci * queue_pages_range() has three possible return values:
73962306a36Sopenharmony_ci * 1 - there is unmovable page, but MPOL_MF_MOVE* & MPOL_MF_STRICT were
74062306a36Sopenharmony_ci *     specified.
74162306a36Sopenharmony_ci * 0 - queue pages successfully or no misplaced page.
74262306a36Sopenharmony_ci * errno - i.e. misplaced pages with MPOL_MF_STRICT specified (-EIO) or
74362306a36Sopenharmony_ci *         memory range specified by nodemask and maxnode points outside
74462306a36Sopenharmony_ci *         your accessible address space (-EFAULT)
74562306a36Sopenharmony_ci */
74662306a36Sopenharmony_cistatic int
74762306a36Sopenharmony_ciqueue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
74862306a36Sopenharmony_ci		nodemask_t *nodes, unsigned long flags,
74962306a36Sopenharmony_ci		struct list_head *pagelist, bool lock_vma)
75062306a36Sopenharmony_ci{
75162306a36Sopenharmony_ci	int err;
75262306a36Sopenharmony_ci	struct queue_pages qp = {
75362306a36Sopenharmony_ci		.pagelist = pagelist,
75462306a36Sopenharmony_ci		.flags = flags,
75562306a36Sopenharmony_ci		.nmask = nodes,
75662306a36Sopenharmony_ci		.start = start,
75762306a36Sopenharmony_ci		.end = end,
75862306a36Sopenharmony_ci		.first = NULL,
75962306a36Sopenharmony_ci		.has_unmovable = false,
76062306a36Sopenharmony_ci	};
76162306a36Sopenharmony_ci	const struct mm_walk_ops *ops = lock_vma ?
76262306a36Sopenharmony_ci			&queue_pages_lock_vma_walk_ops : &queue_pages_walk_ops;
76362306a36Sopenharmony_ci
76462306a36Sopenharmony_ci	err = walk_page_range(mm, start, end, ops, &qp);
76562306a36Sopenharmony_ci
76662306a36Sopenharmony_ci	if (qp.has_unmovable)
76762306a36Sopenharmony_ci		err = 1;
76862306a36Sopenharmony_ci	if (!qp.first)
76962306a36Sopenharmony_ci		/* whole range in hole */
77062306a36Sopenharmony_ci		err = -EFAULT;
77162306a36Sopenharmony_ci
77262306a36Sopenharmony_ci	return err;
77362306a36Sopenharmony_ci}
77462306a36Sopenharmony_ci
77562306a36Sopenharmony_ci/*
77662306a36Sopenharmony_ci * Apply policy to a single VMA
77762306a36Sopenharmony_ci * This must be called with the mmap_lock held for writing.
77862306a36Sopenharmony_ci */
77962306a36Sopenharmony_cistatic int vma_replace_policy(struct vm_area_struct *vma,
78062306a36Sopenharmony_ci						struct mempolicy *pol)
78162306a36Sopenharmony_ci{
78262306a36Sopenharmony_ci	int err;
78362306a36Sopenharmony_ci	struct mempolicy *old;
78462306a36Sopenharmony_ci	struct mempolicy *new;
78562306a36Sopenharmony_ci
78662306a36Sopenharmony_ci	vma_assert_write_locked(vma);
78762306a36Sopenharmony_ci
78862306a36Sopenharmony_ci	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
78962306a36Sopenharmony_ci		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
79062306a36Sopenharmony_ci		 vma->vm_ops, vma->vm_file,
79162306a36Sopenharmony_ci		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
79262306a36Sopenharmony_ci
79362306a36Sopenharmony_ci	new = mpol_dup(pol);
79462306a36Sopenharmony_ci	if (IS_ERR(new))
79562306a36Sopenharmony_ci		return PTR_ERR(new);
79662306a36Sopenharmony_ci
79762306a36Sopenharmony_ci	if (vma->vm_ops && vma->vm_ops->set_policy) {
79862306a36Sopenharmony_ci		err = vma->vm_ops->set_policy(vma, new);
79962306a36Sopenharmony_ci		if (err)
80062306a36Sopenharmony_ci			goto err_out;
80162306a36Sopenharmony_ci	}
80262306a36Sopenharmony_ci
80362306a36Sopenharmony_ci	old = vma->vm_policy;
80462306a36Sopenharmony_ci	vma->vm_policy = new; /* protected by mmap_lock */
80562306a36Sopenharmony_ci	mpol_put(old);
80662306a36Sopenharmony_ci
80762306a36Sopenharmony_ci	return 0;
80862306a36Sopenharmony_ci err_out:
80962306a36Sopenharmony_ci	mpol_put(new);
81062306a36Sopenharmony_ci	return err;
81162306a36Sopenharmony_ci}
81262306a36Sopenharmony_ci
81362306a36Sopenharmony_ci/* Split or merge the VMA (if required) and apply the new policy */
81462306a36Sopenharmony_cistatic int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma,
81562306a36Sopenharmony_ci		struct vm_area_struct **prev, unsigned long start,
81662306a36Sopenharmony_ci		unsigned long end, struct mempolicy *new_pol)
81762306a36Sopenharmony_ci{
81862306a36Sopenharmony_ci	struct vm_area_struct *merged;
81962306a36Sopenharmony_ci	unsigned long vmstart, vmend;
82062306a36Sopenharmony_ci	pgoff_t pgoff;
82162306a36Sopenharmony_ci	int err;
82262306a36Sopenharmony_ci
82362306a36Sopenharmony_ci	vmend = min(end, vma->vm_end);
82462306a36Sopenharmony_ci	if (start > vma->vm_start) {
82562306a36Sopenharmony_ci		*prev = vma;
82662306a36Sopenharmony_ci		vmstart = start;
82762306a36Sopenharmony_ci	} else {
82862306a36Sopenharmony_ci		vmstart = vma->vm_start;
82962306a36Sopenharmony_ci	}
83062306a36Sopenharmony_ci
83162306a36Sopenharmony_ci	if (mpol_equal(vma_policy(vma), new_pol)) {
83262306a36Sopenharmony_ci		*prev = vma;
83362306a36Sopenharmony_ci		return 0;
83462306a36Sopenharmony_ci	}
83562306a36Sopenharmony_ci
83662306a36Sopenharmony_ci	pgoff = vma->vm_pgoff + ((vmstart - vma->vm_start) >> PAGE_SHIFT);
83762306a36Sopenharmony_ci	merged = vma_merge(vmi, vma->vm_mm, *prev, vmstart, vmend, vma->vm_flags,
83862306a36Sopenharmony_ci			 vma->anon_vma, vma->vm_file, pgoff, new_pol,
83962306a36Sopenharmony_ci			 vma->vm_userfaultfd_ctx, anon_vma_name(vma));
84062306a36Sopenharmony_ci	if (merged) {
84162306a36Sopenharmony_ci		*prev = merged;
84262306a36Sopenharmony_ci		return vma_replace_policy(merged, new_pol);
84362306a36Sopenharmony_ci	}
84462306a36Sopenharmony_ci
84562306a36Sopenharmony_ci	if (vma->vm_start != vmstart) {
84662306a36Sopenharmony_ci		err = split_vma(vmi, vma, vmstart, 1);
84762306a36Sopenharmony_ci		if (err)
84862306a36Sopenharmony_ci			return err;
84962306a36Sopenharmony_ci	}
85062306a36Sopenharmony_ci
85162306a36Sopenharmony_ci	if (vma->vm_end != vmend) {
85262306a36Sopenharmony_ci		err = split_vma(vmi, vma, vmend, 0);
85362306a36Sopenharmony_ci		if (err)
85462306a36Sopenharmony_ci			return err;
85562306a36Sopenharmony_ci	}
85662306a36Sopenharmony_ci
85762306a36Sopenharmony_ci	*prev = vma;
85862306a36Sopenharmony_ci	return vma_replace_policy(vma, new_pol);
85962306a36Sopenharmony_ci}
86062306a36Sopenharmony_ci
86162306a36Sopenharmony_ci/* Set the process memory policy */
86262306a36Sopenharmony_cistatic long do_set_mempolicy(unsigned short mode, unsigned short flags,
86362306a36Sopenharmony_ci			     nodemask_t *nodes)
86462306a36Sopenharmony_ci{
86562306a36Sopenharmony_ci	struct mempolicy *new, *old;
86662306a36Sopenharmony_ci	NODEMASK_SCRATCH(scratch);
86762306a36Sopenharmony_ci	int ret;
86862306a36Sopenharmony_ci
86962306a36Sopenharmony_ci	if (!scratch)
87062306a36Sopenharmony_ci		return -ENOMEM;
87162306a36Sopenharmony_ci
87262306a36Sopenharmony_ci	new = mpol_new(mode, flags, nodes);
87362306a36Sopenharmony_ci	if (IS_ERR(new)) {
87462306a36Sopenharmony_ci		ret = PTR_ERR(new);
87562306a36Sopenharmony_ci		goto out;
87662306a36Sopenharmony_ci	}
87762306a36Sopenharmony_ci
87862306a36Sopenharmony_ci	task_lock(current);
87962306a36Sopenharmony_ci	ret = mpol_set_nodemask(new, nodes, scratch);
88062306a36Sopenharmony_ci	if (ret) {
88162306a36Sopenharmony_ci		task_unlock(current);
88262306a36Sopenharmony_ci		mpol_put(new);
88362306a36Sopenharmony_ci		goto out;
88462306a36Sopenharmony_ci	}
88562306a36Sopenharmony_ci
88662306a36Sopenharmony_ci	old = current->mempolicy;
88762306a36Sopenharmony_ci	current->mempolicy = new;
88862306a36Sopenharmony_ci	if (new && new->mode == MPOL_INTERLEAVE)
88962306a36Sopenharmony_ci		current->il_prev = MAX_NUMNODES-1;
89062306a36Sopenharmony_ci	task_unlock(current);
89162306a36Sopenharmony_ci	mpol_put(old);
89262306a36Sopenharmony_ci	ret = 0;
89362306a36Sopenharmony_ciout:
89462306a36Sopenharmony_ci	NODEMASK_SCRATCH_FREE(scratch);
89562306a36Sopenharmony_ci	return ret;
89662306a36Sopenharmony_ci}
89762306a36Sopenharmony_ci
89862306a36Sopenharmony_ci/*
89962306a36Sopenharmony_ci * Return nodemask for policy for get_mempolicy() query
90062306a36Sopenharmony_ci *
90162306a36Sopenharmony_ci * Called with task's alloc_lock held
90262306a36Sopenharmony_ci */
90362306a36Sopenharmony_cistatic void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
90462306a36Sopenharmony_ci{
90562306a36Sopenharmony_ci	nodes_clear(*nodes);
90662306a36Sopenharmony_ci	if (p == &default_policy)
90762306a36Sopenharmony_ci		return;
90862306a36Sopenharmony_ci
90962306a36Sopenharmony_ci	switch (p->mode) {
91062306a36Sopenharmony_ci	case MPOL_BIND:
91162306a36Sopenharmony_ci	case MPOL_INTERLEAVE:
91262306a36Sopenharmony_ci	case MPOL_PREFERRED:
91362306a36Sopenharmony_ci	case MPOL_PREFERRED_MANY:
91462306a36Sopenharmony_ci		*nodes = p->nodes;
91562306a36Sopenharmony_ci		break;
91662306a36Sopenharmony_ci	case MPOL_LOCAL:
91762306a36Sopenharmony_ci		/* return empty node mask for local allocation */
91862306a36Sopenharmony_ci		break;
91962306a36Sopenharmony_ci	default:
92062306a36Sopenharmony_ci		BUG();
92162306a36Sopenharmony_ci	}
92262306a36Sopenharmony_ci}
92362306a36Sopenharmony_ci
92462306a36Sopenharmony_cistatic int lookup_node(struct mm_struct *mm, unsigned long addr)
92562306a36Sopenharmony_ci{
92662306a36Sopenharmony_ci	struct page *p = NULL;
92762306a36Sopenharmony_ci	int ret;
92862306a36Sopenharmony_ci
92962306a36Sopenharmony_ci	ret = get_user_pages_fast(addr & PAGE_MASK, 1, 0, &p);
93062306a36Sopenharmony_ci	if (ret > 0) {
93162306a36Sopenharmony_ci		ret = page_to_nid(p);
93262306a36Sopenharmony_ci		put_page(p);
93362306a36Sopenharmony_ci	}
93462306a36Sopenharmony_ci	return ret;
93562306a36Sopenharmony_ci}
93662306a36Sopenharmony_ci
93762306a36Sopenharmony_ci/* Retrieve NUMA policy */
93862306a36Sopenharmony_cistatic long do_get_mempolicy(int *policy, nodemask_t *nmask,
93962306a36Sopenharmony_ci			     unsigned long addr, unsigned long flags)
94062306a36Sopenharmony_ci{
94162306a36Sopenharmony_ci	int err;
94262306a36Sopenharmony_ci	struct mm_struct *mm = current->mm;
94362306a36Sopenharmony_ci	struct vm_area_struct *vma = NULL;
94462306a36Sopenharmony_ci	struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
94562306a36Sopenharmony_ci
94662306a36Sopenharmony_ci	if (flags &
94762306a36Sopenharmony_ci		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
94862306a36Sopenharmony_ci		return -EINVAL;
94962306a36Sopenharmony_ci
95062306a36Sopenharmony_ci	if (flags & MPOL_F_MEMS_ALLOWED) {
95162306a36Sopenharmony_ci		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
95262306a36Sopenharmony_ci			return -EINVAL;
95362306a36Sopenharmony_ci		*policy = 0;	/* just so it's initialized */
95462306a36Sopenharmony_ci		task_lock(current);
95562306a36Sopenharmony_ci		*nmask  = cpuset_current_mems_allowed;
95662306a36Sopenharmony_ci		task_unlock(current);
95762306a36Sopenharmony_ci		return 0;
95862306a36Sopenharmony_ci	}
95962306a36Sopenharmony_ci
96062306a36Sopenharmony_ci	if (flags & MPOL_F_ADDR) {
96162306a36Sopenharmony_ci		/*
96262306a36Sopenharmony_ci		 * Do NOT fall back to task policy if the
96362306a36Sopenharmony_ci		 * vma/shared policy at addr is NULL.  We
96462306a36Sopenharmony_ci		 * want to return MPOL_DEFAULT in this case.
96562306a36Sopenharmony_ci		 */
96662306a36Sopenharmony_ci		mmap_read_lock(mm);
96762306a36Sopenharmony_ci		vma = vma_lookup(mm, addr);
96862306a36Sopenharmony_ci		if (!vma) {
96962306a36Sopenharmony_ci			mmap_read_unlock(mm);
97062306a36Sopenharmony_ci			return -EFAULT;
97162306a36Sopenharmony_ci		}
97262306a36Sopenharmony_ci		if (vma->vm_ops && vma->vm_ops->get_policy)
97362306a36Sopenharmony_ci			pol = vma->vm_ops->get_policy(vma, addr);
97462306a36Sopenharmony_ci		else
97562306a36Sopenharmony_ci			pol = vma->vm_policy;
97662306a36Sopenharmony_ci	} else if (addr)
97762306a36Sopenharmony_ci		return -EINVAL;
97862306a36Sopenharmony_ci
97962306a36Sopenharmony_ci	if (!pol)
98062306a36Sopenharmony_ci		pol = &default_policy;	/* indicates default behavior */
98162306a36Sopenharmony_ci
98262306a36Sopenharmony_ci	if (flags & MPOL_F_NODE) {
98362306a36Sopenharmony_ci		if (flags & MPOL_F_ADDR) {
98462306a36Sopenharmony_ci			/*
98562306a36Sopenharmony_ci			 * Take a refcount on the mpol, because we are about to
98662306a36Sopenharmony_ci			 * drop the mmap_lock, after which only "pol" remains
98762306a36Sopenharmony_ci			 * valid, "vma" is stale.
98862306a36Sopenharmony_ci			 */
98962306a36Sopenharmony_ci			pol_refcount = pol;
99062306a36Sopenharmony_ci			vma = NULL;
99162306a36Sopenharmony_ci			mpol_get(pol);
99262306a36Sopenharmony_ci			mmap_read_unlock(mm);
99362306a36Sopenharmony_ci			err = lookup_node(mm, addr);
99462306a36Sopenharmony_ci			if (err < 0)
99562306a36Sopenharmony_ci				goto out;
99662306a36Sopenharmony_ci			*policy = err;
99762306a36Sopenharmony_ci		} else if (pol == current->mempolicy &&
99862306a36Sopenharmony_ci				pol->mode == MPOL_INTERLEAVE) {
99962306a36Sopenharmony_ci			*policy = next_node_in(current->il_prev, pol->nodes);
100062306a36Sopenharmony_ci		} else {
100162306a36Sopenharmony_ci			err = -EINVAL;
100262306a36Sopenharmony_ci			goto out;
100362306a36Sopenharmony_ci		}
100462306a36Sopenharmony_ci	} else {
100562306a36Sopenharmony_ci		*policy = pol == &default_policy ? MPOL_DEFAULT :
100662306a36Sopenharmony_ci						pol->mode;
100762306a36Sopenharmony_ci		/*
100862306a36Sopenharmony_ci		 * Internal mempolicy flags must be masked off before exposing
100962306a36Sopenharmony_ci		 * the policy to userspace.
101062306a36Sopenharmony_ci		 */
101162306a36Sopenharmony_ci		*policy |= (pol->flags & MPOL_MODE_FLAGS);
101262306a36Sopenharmony_ci	}
101362306a36Sopenharmony_ci
101462306a36Sopenharmony_ci	err = 0;
101562306a36Sopenharmony_ci	if (nmask) {
101662306a36Sopenharmony_ci		if (mpol_store_user_nodemask(pol)) {
101762306a36Sopenharmony_ci			*nmask = pol->w.user_nodemask;
101862306a36Sopenharmony_ci		} else {
101962306a36Sopenharmony_ci			task_lock(current);
102062306a36Sopenharmony_ci			get_policy_nodemask(pol, nmask);
102162306a36Sopenharmony_ci			task_unlock(current);
102262306a36Sopenharmony_ci		}
102362306a36Sopenharmony_ci	}
102462306a36Sopenharmony_ci
102562306a36Sopenharmony_ci out:
102662306a36Sopenharmony_ci	mpol_cond_put(pol);
102762306a36Sopenharmony_ci	if (vma)
102862306a36Sopenharmony_ci		mmap_read_unlock(mm);
102962306a36Sopenharmony_ci	if (pol_refcount)
103062306a36Sopenharmony_ci		mpol_put(pol_refcount);
103162306a36Sopenharmony_ci	return err;
103262306a36Sopenharmony_ci}
103362306a36Sopenharmony_ci
103462306a36Sopenharmony_ci#ifdef CONFIG_MIGRATION
103562306a36Sopenharmony_cistatic int migrate_folio_add(struct folio *folio, struct list_head *foliolist,
103662306a36Sopenharmony_ci				unsigned long flags)
103762306a36Sopenharmony_ci{
103862306a36Sopenharmony_ci	/*
103962306a36Sopenharmony_ci	 * We try to migrate only unshared folios. If it is shared it
104062306a36Sopenharmony_ci	 * is likely not worth migrating.
104162306a36Sopenharmony_ci	 *
104262306a36Sopenharmony_ci	 * To check if the folio is shared, ideally we want to make sure
104362306a36Sopenharmony_ci	 * every page is mapped to the same process. Doing that is very
104462306a36Sopenharmony_ci	 * expensive, so check the estimated mapcount of the folio instead.
104562306a36Sopenharmony_ci	 */
104662306a36Sopenharmony_ci	if ((flags & MPOL_MF_MOVE_ALL) || folio_estimated_sharers(folio) == 1) {
104762306a36Sopenharmony_ci		if (folio_isolate_lru(folio)) {
104862306a36Sopenharmony_ci			list_add_tail(&folio->lru, foliolist);
104962306a36Sopenharmony_ci			node_stat_mod_folio(folio,
105062306a36Sopenharmony_ci				NR_ISOLATED_ANON + folio_is_file_lru(folio),
105162306a36Sopenharmony_ci				folio_nr_pages(folio));
105262306a36Sopenharmony_ci		} else if (flags & MPOL_MF_STRICT) {
105362306a36Sopenharmony_ci			/*
105462306a36Sopenharmony_ci			 * Non-movable folio may reach here.  And, there may be
105562306a36Sopenharmony_ci			 * temporary off LRU folios or non-LRU movable folios.
105662306a36Sopenharmony_ci			 * Treat them as unmovable folios since they can't be
105762306a36Sopenharmony_ci			 * isolated, so they can't be moved at the moment.  It
105862306a36Sopenharmony_ci			 * should return -EIO for this case too.
105962306a36Sopenharmony_ci			 */
106062306a36Sopenharmony_ci			return -EIO;
106162306a36Sopenharmony_ci		}
106262306a36Sopenharmony_ci	}
106362306a36Sopenharmony_ci
106462306a36Sopenharmony_ci	return 0;
106562306a36Sopenharmony_ci}
106662306a36Sopenharmony_ci
106762306a36Sopenharmony_ci/*
106862306a36Sopenharmony_ci * Migrate pages from one node to a target node.
106962306a36Sopenharmony_ci * Returns error or the number of pages not migrated.
107062306a36Sopenharmony_ci */
107162306a36Sopenharmony_cistatic int migrate_to_node(struct mm_struct *mm, int source, int dest,
107262306a36Sopenharmony_ci			   int flags)
107362306a36Sopenharmony_ci{
107462306a36Sopenharmony_ci	nodemask_t nmask;
107562306a36Sopenharmony_ci	struct vm_area_struct *vma;
107662306a36Sopenharmony_ci	LIST_HEAD(pagelist);
107762306a36Sopenharmony_ci	int err = 0;
107862306a36Sopenharmony_ci	struct migration_target_control mtc = {
107962306a36Sopenharmony_ci		.nid = dest,
108062306a36Sopenharmony_ci		.gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
108162306a36Sopenharmony_ci	};
108262306a36Sopenharmony_ci
108362306a36Sopenharmony_ci	nodes_clear(nmask);
108462306a36Sopenharmony_ci	node_set(source, nmask);
108562306a36Sopenharmony_ci
108662306a36Sopenharmony_ci	/*
108762306a36Sopenharmony_ci	 * This does not "check" the range but isolates all pages that
108862306a36Sopenharmony_ci	 * need migration.  Between passing in the full user address
108962306a36Sopenharmony_ci	 * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
109062306a36Sopenharmony_ci	 */
109162306a36Sopenharmony_ci	vma = find_vma(mm, 0);
109262306a36Sopenharmony_ci	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
109362306a36Sopenharmony_ci	queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask,
109462306a36Sopenharmony_ci			flags | MPOL_MF_DISCONTIG_OK, &pagelist, false);
109562306a36Sopenharmony_ci
109662306a36Sopenharmony_ci	if (!list_empty(&pagelist)) {
109762306a36Sopenharmony_ci		err = migrate_pages(&pagelist, alloc_migration_target, NULL,
109862306a36Sopenharmony_ci				(unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
109962306a36Sopenharmony_ci		if (err)
110062306a36Sopenharmony_ci			putback_movable_pages(&pagelist);
110162306a36Sopenharmony_ci	}
110262306a36Sopenharmony_ci
110362306a36Sopenharmony_ci	return err;
110462306a36Sopenharmony_ci}
110562306a36Sopenharmony_ci
110662306a36Sopenharmony_ci/*
110762306a36Sopenharmony_ci * Move pages between the two nodesets so as to preserve the physical
110862306a36Sopenharmony_ci * layout as much as possible.
110962306a36Sopenharmony_ci *
111062306a36Sopenharmony_ci * Returns the number of page that could not be moved.
111162306a36Sopenharmony_ci */
111262306a36Sopenharmony_ciint do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
111362306a36Sopenharmony_ci		     const nodemask_t *to, int flags)
111462306a36Sopenharmony_ci{
111562306a36Sopenharmony_ci	int busy = 0;
111662306a36Sopenharmony_ci	int err = 0;
111762306a36Sopenharmony_ci	nodemask_t tmp;
111862306a36Sopenharmony_ci
111962306a36Sopenharmony_ci	lru_cache_disable();
112062306a36Sopenharmony_ci
112162306a36Sopenharmony_ci	mmap_read_lock(mm);
112262306a36Sopenharmony_ci
112362306a36Sopenharmony_ci	/*
112462306a36Sopenharmony_ci	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
112562306a36Sopenharmony_ci	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
112662306a36Sopenharmony_ci	 * bit in 'tmp', and return that <source, dest> pair for migration.
112762306a36Sopenharmony_ci	 * The pair of nodemasks 'to' and 'from' define the map.
112862306a36Sopenharmony_ci	 *
112962306a36Sopenharmony_ci	 * If no pair of bits is found that way, fallback to picking some
113062306a36Sopenharmony_ci	 * pair of 'source' and 'dest' bits that are not the same.  If the
113162306a36Sopenharmony_ci	 * 'source' and 'dest' bits are the same, this represents a node
113262306a36Sopenharmony_ci	 * that will be migrating to itself, so no pages need move.
113362306a36Sopenharmony_ci	 *
113462306a36Sopenharmony_ci	 * If no bits are left in 'tmp', or if all remaining bits left
113562306a36Sopenharmony_ci	 * in 'tmp' correspond to the same bit in 'to', return false
113662306a36Sopenharmony_ci	 * (nothing left to migrate).
113762306a36Sopenharmony_ci	 *
113862306a36Sopenharmony_ci	 * This lets us pick a pair of nodes to migrate between, such that
113962306a36Sopenharmony_ci	 * if possible the dest node is not already occupied by some other
114062306a36Sopenharmony_ci	 * source node, minimizing the risk of overloading the memory on a
114162306a36Sopenharmony_ci	 * node that would happen if we migrated incoming memory to a node
114262306a36Sopenharmony_ci	 * before migrating outgoing memory source that same node.
114362306a36Sopenharmony_ci	 *
114462306a36Sopenharmony_ci	 * A single scan of tmp is sufficient.  As we go, we remember the
114562306a36Sopenharmony_ci	 * most recent <s, d> pair that moved (s != d).  If we find a pair
114662306a36Sopenharmony_ci	 * that not only moved, but what's better, moved to an empty slot
114762306a36Sopenharmony_ci	 * (d is not set in tmp), then we break out then, with that pair.
114862306a36Sopenharmony_ci	 * Otherwise when we finish scanning from_tmp, we at least have the
114962306a36Sopenharmony_ci	 * most recent <s, d> pair that moved.  If we get all the way through
115062306a36Sopenharmony_ci	 * the scan of tmp without finding any node that moved, much less
115162306a36Sopenharmony_ci	 * moved to an empty node, then there is nothing left worth migrating.
115262306a36Sopenharmony_ci	 */
115362306a36Sopenharmony_ci
115462306a36Sopenharmony_ci	tmp = *from;
115562306a36Sopenharmony_ci	while (!nodes_empty(tmp)) {
115662306a36Sopenharmony_ci		int s, d;
115762306a36Sopenharmony_ci		int source = NUMA_NO_NODE;
115862306a36Sopenharmony_ci		int dest = 0;
115962306a36Sopenharmony_ci
116062306a36Sopenharmony_ci		for_each_node_mask(s, tmp) {
116162306a36Sopenharmony_ci
116262306a36Sopenharmony_ci			/*
116362306a36Sopenharmony_ci			 * do_migrate_pages() tries to maintain the relative
116462306a36Sopenharmony_ci			 * node relationship of the pages established between
116562306a36Sopenharmony_ci			 * threads and memory areas.
116662306a36Sopenharmony_ci                         *
116762306a36Sopenharmony_ci			 * However if the number of source nodes is not equal to
116862306a36Sopenharmony_ci			 * the number of destination nodes we can not preserve
116962306a36Sopenharmony_ci			 * this node relative relationship.  In that case, skip
117062306a36Sopenharmony_ci			 * copying memory from a node that is in the destination
117162306a36Sopenharmony_ci			 * mask.
117262306a36Sopenharmony_ci			 *
117362306a36Sopenharmony_ci			 * Example: [2,3,4] -> [3,4,5] moves everything.
117462306a36Sopenharmony_ci			 *          [0-7] - > [3,4,5] moves only 0,1,2,6,7.
117562306a36Sopenharmony_ci			 */
117662306a36Sopenharmony_ci
117762306a36Sopenharmony_ci			if ((nodes_weight(*from) != nodes_weight(*to)) &&
117862306a36Sopenharmony_ci						(node_isset(s, *to)))
117962306a36Sopenharmony_ci				continue;
118062306a36Sopenharmony_ci
118162306a36Sopenharmony_ci			d = node_remap(s, *from, *to);
118262306a36Sopenharmony_ci			if (s == d)
118362306a36Sopenharmony_ci				continue;
118462306a36Sopenharmony_ci
118562306a36Sopenharmony_ci			source = s;	/* Node moved. Memorize */
118662306a36Sopenharmony_ci			dest = d;
118762306a36Sopenharmony_ci
118862306a36Sopenharmony_ci			/* dest not in remaining from nodes? */
118962306a36Sopenharmony_ci			if (!node_isset(dest, tmp))
119062306a36Sopenharmony_ci				break;
119162306a36Sopenharmony_ci		}
119262306a36Sopenharmony_ci		if (source == NUMA_NO_NODE)
119362306a36Sopenharmony_ci			break;
119462306a36Sopenharmony_ci
119562306a36Sopenharmony_ci		node_clear(source, tmp);
119662306a36Sopenharmony_ci		err = migrate_to_node(mm, source, dest, flags);
119762306a36Sopenharmony_ci		if (err > 0)
119862306a36Sopenharmony_ci			busy += err;
119962306a36Sopenharmony_ci		if (err < 0)
120062306a36Sopenharmony_ci			break;
120162306a36Sopenharmony_ci	}
120262306a36Sopenharmony_ci	mmap_read_unlock(mm);
120362306a36Sopenharmony_ci
120462306a36Sopenharmony_ci	lru_cache_enable();
120562306a36Sopenharmony_ci	if (err < 0)
120662306a36Sopenharmony_ci		return err;
120762306a36Sopenharmony_ci	return busy;
120862306a36Sopenharmony_ci
120962306a36Sopenharmony_ci}
121062306a36Sopenharmony_ci
121162306a36Sopenharmony_ci/*
121262306a36Sopenharmony_ci * Allocate a new page for page migration based on vma policy.
121362306a36Sopenharmony_ci * Start by assuming the page is mapped by the same vma as contains @start.
121462306a36Sopenharmony_ci * Search forward from there, if not.  N.B., this assumes that the
121562306a36Sopenharmony_ci * list of pages handed to migrate_pages()--which is how we get here--
121662306a36Sopenharmony_ci * is in virtual address order.
121762306a36Sopenharmony_ci */
121862306a36Sopenharmony_cistatic struct folio *new_folio(struct folio *src, unsigned long start)
121962306a36Sopenharmony_ci{
122062306a36Sopenharmony_ci	struct vm_area_struct *vma;
122162306a36Sopenharmony_ci	unsigned long address;
122262306a36Sopenharmony_ci	VMA_ITERATOR(vmi, current->mm, start);
122362306a36Sopenharmony_ci	gfp_t gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL;
122462306a36Sopenharmony_ci
122562306a36Sopenharmony_ci	for_each_vma(vmi, vma) {
122662306a36Sopenharmony_ci		address = page_address_in_vma(&src->page, vma);
122762306a36Sopenharmony_ci		if (address != -EFAULT)
122862306a36Sopenharmony_ci			break;
122962306a36Sopenharmony_ci	}
123062306a36Sopenharmony_ci
123162306a36Sopenharmony_ci	if (folio_test_hugetlb(src)) {
123262306a36Sopenharmony_ci		return alloc_hugetlb_folio_vma(folio_hstate(src),
123362306a36Sopenharmony_ci				vma, address);
123462306a36Sopenharmony_ci	}
123562306a36Sopenharmony_ci
123662306a36Sopenharmony_ci	if (folio_test_large(src))
123762306a36Sopenharmony_ci		gfp = GFP_TRANSHUGE;
123862306a36Sopenharmony_ci
123962306a36Sopenharmony_ci	/*
124062306a36Sopenharmony_ci	 * if !vma, vma_alloc_folio() will use task or system default policy
124162306a36Sopenharmony_ci	 */
124262306a36Sopenharmony_ci	return vma_alloc_folio(gfp, folio_order(src), vma, address,
124362306a36Sopenharmony_ci			folio_test_large(src));
124462306a36Sopenharmony_ci}
124562306a36Sopenharmony_ci#else
124662306a36Sopenharmony_ci
124762306a36Sopenharmony_cistatic int migrate_folio_add(struct folio *folio, struct list_head *foliolist,
124862306a36Sopenharmony_ci				unsigned long flags)
124962306a36Sopenharmony_ci{
125062306a36Sopenharmony_ci	return -EIO;
125162306a36Sopenharmony_ci}
125262306a36Sopenharmony_ci
125362306a36Sopenharmony_ciint do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
125462306a36Sopenharmony_ci		     const nodemask_t *to, int flags)
125562306a36Sopenharmony_ci{
125662306a36Sopenharmony_ci	return -ENOSYS;
125762306a36Sopenharmony_ci}
125862306a36Sopenharmony_ci
125962306a36Sopenharmony_cistatic struct folio *new_folio(struct folio *src, unsigned long start)
126062306a36Sopenharmony_ci{
126162306a36Sopenharmony_ci	return NULL;
126262306a36Sopenharmony_ci}
126362306a36Sopenharmony_ci#endif
126462306a36Sopenharmony_ci
126562306a36Sopenharmony_cistatic long do_mbind(unsigned long start, unsigned long len,
126662306a36Sopenharmony_ci		     unsigned short mode, unsigned short mode_flags,
126762306a36Sopenharmony_ci		     nodemask_t *nmask, unsigned long flags)
126862306a36Sopenharmony_ci{
126962306a36Sopenharmony_ci	struct mm_struct *mm = current->mm;
127062306a36Sopenharmony_ci	struct vm_area_struct *vma, *prev;
127162306a36Sopenharmony_ci	struct vma_iterator vmi;
127262306a36Sopenharmony_ci	struct mempolicy *new;
127362306a36Sopenharmony_ci	unsigned long end;
127462306a36Sopenharmony_ci	int err;
127562306a36Sopenharmony_ci	int ret;
127662306a36Sopenharmony_ci	LIST_HEAD(pagelist);
127762306a36Sopenharmony_ci
127862306a36Sopenharmony_ci	if (flags & ~(unsigned long)MPOL_MF_VALID)
127962306a36Sopenharmony_ci		return -EINVAL;
128062306a36Sopenharmony_ci	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
128162306a36Sopenharmony_ci		return -EPERM;
128262306a36Sopenharmony_ci
128362306a36Sopenharmony_ci	if (start & ~PAGE_MASK)
128462306a36Sopenharmony_ci		return -EINVAL;
128562306a36Sopenharmony_ci
128662306a36Sopenharmony_ci	if (mode == MPOL_DEFAULT)
128762306a36Sopenharmony_ci		flags &= ~MPOL_MF_STRICT;
128862306a36Sopenharmony_ci
128962306a36Sopenharmony_ci	len = PAGE_ALIGN(len);
129062306a36Sopenharmony_ci	end = start + len;
129162306a36Sopenharmony_ci
129262306a36Sopenharmony_ci	if (end < start)
129362306a36Sopenharmony_ci		return -EINVAL;
129462306a36Sopenharmony_ci	if (end == start)
129562306a36Sopenharmony_ci		return 0;
129662306a36Sopenharmony_ci
129762306a36Sopenharmony_ci	new = mpol_new(mode, mode_flags, nmask);
129862306a36Sopenharmony_ci	if (IS_ERR(new))
129962306a36Sopenharmony_ci		return PTR_ERR(new);
130062306a36Sopenharmony_ci
130162306a36Sopenharmony_ci	if (flags & MPOL_MF_LAZY)
130262306a36Sopenharmony_ci		new->flags |= MPOL_F_MOF;
130362306a36Sopenharmony_ci
130462306a36Sopenharmony_ci	/*
130562306a36Sopenharmony_ci	 * If we are using the default policy then operation
130662306a36Sopenharmony_ci	 * on discontinuous address spaces is okay after all
130762306a36Sopenharmony_ci	 */
130862306a36Sopenharmony_ci	if (!new)
130962306a36Sopenharmony_ci		flags |= MPOL_MF_DISCONTIG_OK;
131062306a36Sopenharmony_ci
131162306a36Sopenharmony_ci	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
131262306a36Sopenharmony_ci		 start, start + len, mode, mode_flags,
131362306a36Sopenharmony_ci		 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
131462306a36Sopenharmony_ci
131562306a36Sopenharmony_ci	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
131662306a36Sopenharmony_ci
131762306a36Sopenharmony_ci		lru_cache_disable();
131862306a36Sopenharmony_ci	}
131962306a36Sopenharmony_ci	{
132062306a36Sopenharmony_ci		NODEMASK_SCRATCH(scratch);
132162306a36Sopenharmony_ci		if (scratch) {
132262306a36Sopenharmony_ci			mmap_write_lock(mm);
132362306a36Sopenharmony_ci			err = mpol_set_nodemask(new, nmask, scratch);
132462306a36Sopenharmony_ci			if (err)
132562306a36Sopenharmony_ci				mmap_write_unlock(mm);
132662306a36Sopenharmony_ci		} else
132762306a36Sopenharmony_ci			err = -ENOMEM;
132862306a36Sopenharmony_ci		NODEMASK_SCRATCH_FREE(scratch);
132962306a36Sopenharmony_ci	}
133062306a36Sopenharmony_ci	if (err)
133162306a36Sopenharmony_ci		goto mpol_out;
133262306a36Sopenharmony_ci
133362306a36Sopenharmony_ci	/*
133462306a36Sopenharmony_ci	 * Lock the VMAs before scanning for pages to migrate, to ensure we don't
133562306a36Sopenharmony_ci	 * miss a concurrently inserted page.
133662306a36Sopenharmony_ci	 */
133762306a36Sopenharmony_ci	ret = queue_pages_range(mm, start, end, nmask,
133862306a36Sopenharmony_ci			  flags | MPOL_MF_INVERT, &pagelist, true);
133962306a36Sopenharmony_ci
134062306a36Sopenharmony_ci	if (ret < 0) {
134162306a36Sopenharmony_ci		err = ret;
134262306a36Sopenharmony_ci		goto up_out;
134362306a36Sopenharmony_ci	}
134462306a36Sopenharmony_ci
134562306a36Sopenharmony_ci	vma_iter_init(&vmi, mm, start);
134662306a36Sopenharmony_ci	prev = vma_prev(&vmi);
134762306a36Sopenharmony_ci	for_each_vma_range(vmi, vma, end) {
134862306a36Sopenharmony_ci		err = mbind_range(&vmi, vma, &prev, start, end, new);
134962306a36Sopenharmony_ci		if (err)
135062306a36Sopenharmony_ci			break;
135162306a36Sopenharmony_ci	}
135262306a36Sopenharmony_ci
135362306a36Sopenharmony_ci	if (!err) {
135462306a36Sopenharmony_ci		int nr_failed = 0;
135562306a36Sopenharmony_ci
135662306a36Sopenharmony_ci		if (!list_empty(&pagelist)) {
135762306a36Sopenharmony_ci			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
135862306a36Sopenharmony_ci			nr_failed = migrate_pages(&pagelist, new_folio, NULL,
135962306a36Sopenharmony_ci				start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL);
136062306a36Sopenharmony_ci			if (nr_failed)
136162306a36Sopenharmony_ci				putback_movable_pages(&pagelist);
136262306a36Sopenharmony_ci		}
136362306a36Sopenharmony_ci
136462306a36Sopenharmony_ci		if (((ret > 0) || nr_failed) && (flags & MPOL_MF_STRICT))
136562306a36Sopenharmony_ci			err = -EIO;
136662306a36Sopenharmony_ci	} else {
136762306a36Sopenharmony_ciup_out:
136862306a36Sopenharmony_ci		if (!list_empty(&pagelist))
136962306a36Sopenharmony_ci			putback_movable_pages(&pagelist);
137062306a36Sopenharmony_ci	}
137162306a36Sopenharmony_ci
137262306a36Sopenharmony_ci	mmap_write_unlock(mm);
137362306a36Sopenharmony_cimpol_out:
137462306a36Sopenharmony_ci	mpol_put(new);
137562306a36Sopenharmony_ci	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
137662306a36Sopenharmony_ci		lru_cache_enable();
137762306a36Sopenharmony_ci	return err;
137862306a36Sopenharmony_ci}
137962306a36Sopenharmony_ci
138062306a36Sopenharmony_ci/*
138162306a36Sopenharmony_ci * User space interface with variable sized bitmaps for nodelists.
138262306a36Sopenharmony_ci */
138362306a36Sopenharmony_cistatic int get_bitmap(unsigned long *mask, const unsigned long __user *nmask,
138462306a36Sopenharmony_ci		      unsigned long maxnode)
138562306a36Sopenharmony_ci{
138662306a36Sopenharmony_ci	unsigned long nlongs = BITS_TO_LONGS(maxnode);
138762306a36Sopenharmony_ci	int ret;
138862306a36Sopenharmony_ci
138962306a36Sopenharmony_ci	if (in_compat_syscall())
139062306a36Sopenharmony_ci		ret = compat_get_bitmap(mask,
139162306a36Sopenharmony_ci					(const compat_ulong_t __user *)nmask,
139262306a36Sopenharmony_ci					maxnode);
139362306a36Sopenharmony_ci	else
139462306a36Sopenharmony_ci		ret = copy_from_user(mask, nmask,
139562306a36Sopenharmony_ci				     nlongs * sizeof(unsigned long));
139662306a36Sopenharmony_ci
139762306a36Sopenharmony_ci	if (ret)
139862306a36Sopenharmony_ci		return -EFAULT;
139962306a36Sopenharmony_ci
140062306a36Sopenharmony_ci	if (maxnode % BITS_PER_LONG)
140162306a36Sopenharmony_ci		mask[nlongs - 1] &= (1UL << (maxnode % BITS_PER_LONG)) - 1;
140262306a36Sopenharmony_ci
140362306a36Sopenharmony_ci	return 0;
140462306a36Sopenharmony_ci}
140562306a36Sopenharmony_ci
140662306a36Sopenharmony_ci/* Copy a node mask from user space. */
140762306a36Sopenharmony_cistatic int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
140862306a36Sopenharmony_ci		     unsigned long maxnode)
140962306a36Sopenharmony_ci{
141062306a36Sopenharmony_ci	--maxnode;
141162306a36Sopenharmony_ci	nodes_clear(*nodes);
141262306a36Sopenharmony_ci	if (maxnode == 0 || !nmask)
141362306a36Sopenharmony_ci		return 0;
141462306a36Sopenharmony_ci	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
141562306a36Sopenharmony_ci		return -EINVAL;
141662306a36Sopenharmony_ci
141762306a36Sopenharmony_ci	/*
141862306a36Sopenharmony_ci	 * When the user specified more nodes than supported just check
141962306a36Sopenharmony_ci	 * if the non supported part is all zero, one word at a time,
142062306a36Sopenharmony_ci	 * starting at the end.
142162306a36Sopenharmony_ci	 */
142262306a36Sopenharmony_ci	while (maxnode > MAX_NUMNODES) {
142362306a36Sopenharmony_ci		unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG);
142462306a36Sopenharmony_ci		unsigned long t;
142562306a36Sopenharmony_ci
142662306a36Sopenharmony_ci		if (get_bitmap(&t, &nmask[(maxnode - 1) / BITS_PER_LONG], bits))
142762306a36Sopenharmony_ci			return -EFAULT;
142862306a36Sopenharmony_ci
142962306a36Sopenharmony_ci		if (maxnode - bits >= MAX_NUMNODES) {
143062306a36Sopenharmony_ci			maxnode -= bits;
143162306a36Sopenharmony_ci		} else {
143262306a36Sopenharmony_ci			maxnode = MAX_NUMNODES;
143362306a36Sopenharmony_ci			t &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
143462306a36Sopenharmony_ci		}
143562306a36Sopenharmony_ci		if (t)
143662306a36Sopenharmony_ci			return -EINVAL;
143762306a36Sopenharmony_ci	}
143862306a36Sopenharmony_ci
143962306a36Sopenharmony_ci	return get_bitmap(nodes_addr(*nodes), nmask, maxnode);
144062306a36Sopenharmony_ci}
144162306a36Sopenharmony_ci
144262306a36Sopenharmony_ci/* Copy a kernel node mask to user space */
144362306a36Sopenharmony_cistatic int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
144462306a36Sopenharmony_ci			      nodemask_t *nodes)
144562306a36Sopenharmony_ci{
144662306a36Sopenharmony_ci	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
144762306a36Sopenharmony_ci	unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
144862306a36Sopenharmony_ci	bool compat = in_compat_syscall();
144962306a36Sopenharmony_ci
145062306a36Sopenharmony_ci	if (compat)
145162306a36Sopenharmony_ci		nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t);
145262306a36Sopenharmony_ci
145362306a36Sopenharmony_ci	if (copy > nbytes) {
145462306a36Sopenharmony_ci		if (copy > PAGE_SIZE)
145562306a36Sopenharmony_ci			return -EINVAL;
145662306a36Sopenharmony_ci		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
145762306a36Sopenharmony_ci			return -EFAULT;
145862306a36Sopenharmony_ci		copy = nbytes;
145962306a36Sopenharmony_ci		maxnode = nr_node_ids;
146062306a36Sopenharmony_ci	}
146162306a36Sopenharmony_ci
146262306a36Sopenharmony_ci	if (compat)
146362306a36Sopenharmony_ci		return compat_put_bitmap((compat_ulong_t __user *)mask,
146462306a36Sopenharmony_ci					 nodes_addr(*nodes), maxnode);
146562306a36Sopenharmony_ci
146662306a36Sopenharmony_ci	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
146762306a36Sopenharmony_ci}
146862306a36Sopenharmony_ci
146962306a36Sopenharmony_ci/* Basic parameter sanity check used by both mbind() and set_mempolicy() */
147062306a36Sopenharmony_cistatic inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
147162306a36Sopenharmony_ci{
147262306a36Sopenharmony_ci	*flags = *mode & MPOL_MODE_FLAGS;
147362306a36Sopenharmony_ci	*mode &= ~MPOL_MODE_FLAGS;
147462306a36Sopenharmony_ci
147562306a36Sopenharmony_ci	if ((unsigned int)(*mode) >=  MPOL_MAX)
147662306a36Sopenharmony_ci		return -EINVAL;
147762306a36Sopenharmony_ci	if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
147862306a36Sopenharmony_ci		return -EINVAL;
147962306a36Sopenharmony_ci	if (*flags & MPOL_F_NUMA_BALANCING) {
148062306a36Sopenharmony_ci		if (*mode != MPOL_BIND)
148162306a36Sopenharmony_ci			return -EINVAL;
148262306a36Sopenharmony_ci		*flags |= (MPOL_F_MOF | MPOL_F_MORON);
148362306a36Sopenharmony_ci	}
148462306a36Sopenharmony_ci	return 0;
148562306a36Sopenharmony_ci}
148662306a36Sopenharmony_ci
148762306a36Sopenharmony_cistatic long kernel_mbind(unsigned long start, unsigned long len,
148862306a36Sopenharmony_ci			 unsigned long mode, const unsigned long __user *nmask,
148962306a36Sopenharmony_ci			 unsigned long maxnode, unsigned int flags)
149062306a36Sopenharmony_ci{
149162306a36Sopenharmony_ci	unsigned short mode_flags;
149262306a36Sopenharmony_ci	nodemask_t nodes;
149362306a36Sopenharmony_ci	int lmode = mode;
149462306a36Sopenharmony_ci	int err;
149562306a36Sopenharmony_ci
149662306a36Sopenharmony_ci	start = untagged_addr(start);
149762306a36Sopenharmony_ci	err = sanitize_mpol_flags(&lmode, &mode_flags);
149862306a36Sopenharmony_ci	if (err)
149962306a36Sopenharmony_ci		return err;
150062306a36Sopenharmony_ci
150162306a36Sopenharmony_ci	err = get_nodes(&nodes, nmask, maxnode);
150262306a36Sopenharmony_ci	if (err)
150362306a36Sopenharmony_ci		return err;
150462306a36Sopenharmony_ci
150562306a36Sopenharmony_ci	return do_mbind(start, len, lmode, mode_flags, &nodes, flags);
150662306a36Sopenharmony_ci}
150762306a36Sopenharmony_ci
150862306a36Sopenharmony_ciSYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len,
150962306a36Sopenharmony_ci		unsigned long, home_node, unsigned long, flags)
151062306a36Sopenharmony_ci{
151162306a36Sopenharmony_ci	struct mm_struct *mm = current->mm;
151262306a36Sopenharmony_ci	struct vm_area_struct *vma, *prev;
151362306a36Sopenharmony_ci	struct mempolicy *new, *old;
151462306a36Sopenharmony_ci	unsigned long end;
151562306a36Sopenharmony_ci	int err = -ENOENT;
151662306a36Sopenharmony_ci	VMA_ITERATOR(vmi, mm, start);
151762306a36Sopenharmony_ci
151862306a36Sopenharmony_ci	start = untagged_addr(start);
151962306a36Sopenharmony_ci	if (start & ~PAGE_MASK)
152062306a36Sopenharmony_ci		return -EINVAL;
152162306a36Sopenharmony_ci	/*
152262306a36Sopenharmony_ci	 * flags is used for future extension if any.
152362306a36Sopenharmony_ci	 */
152462306a36Sopenharmony_ci	if (flags != 0)
152562306a36Sopenharmony_ci		return -EINVAL;
152662306a36Sopenharmony_ci
152762306a36Sopenharmony_ci	/*
152862306a36Sopenharmony_ci	 * Check home_node is online to avoid accessing uninitialized
152962306a36Sopenharmony_ci	 * NODE_DATA.
153062306a36Sopenharmony_ci	 */
153162306a36Sopenharmony_ci	if (home_node >= MAX_NUMNODES || !node_online(home_node))
153262306a36Sopenharmony_ci		return -EINVAL;
153362306a36Sopenharmony_ci
153462306a36Sopenharmony_ci	len = PAGE_ALIGN(len);
153562306a36Sopenharmony_ci	end = start + len;
153662306a36Sopenharmony_ci
153762306a36Sopenharmony_ci	if (end < start)
153862306a36Sopenharmony_ci		return -EINVAL;
153962306a36Sopenharmony_ci	if (end == start)
154062306a36Sopenharmony_ci		return 0;
154162306a36Sopenharmony_ci	mmap_write_lock(mm);
154262306a36Sopenharmony_ci	prev = vma_prev(&vmi);
154362306a36Sopenharmony_ci	for_each_vma_range(vmi, vma, end) {
154462306a36Sopenharmony_ci		/*
154562306a36Sopenharmony_ci		 * If any vma in the range got policy other than MPOL_BIND
154662306a36Sopenharmony_ci		 * or MPOL_PREFERRED_MANY we return error. We don't reset
154762306a36Sopenharmony_ci		 * the home node for vmas we already updated before.
154862306a36Sopenharmony_ci		 */
154962306a36Sopenharmony_ci		old = vma_policy(vma);
155062306a36Sopenharmony_ci		if (!old) {
155162306a36Sopenharmony_ci			prev = vma;
155262306a36Sopenharmony_ci			continue;
155362306a36Sopenharmony_ci		}
155462306a36Sopenharmony_ci		if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) {
155562306a36Sopenharmony_ci			err = -EOPNOTSUPP;
155662306a36Sopenharmony_ci			break;
155762306a36Sopenharmony_ci		}
155862306a36Sopenharmony_ci		new = mpol_dup(old);
155962306a36Sopenharmony_ci		if (IS_ERR(new)) {
156062306a36Sopenharmony_ci			err = PTR_ERR(new);
156162306a36Sopenharmony_ci			break;
156262306a36Sopenharmony_ci		}
156362306a36Sopenharmony_ci
156462306a36Sopenharmony_ci		vma_start_write(vma);
156562306a36Sopenharmony_ci		new->home_node = home_node;
156662306a36Sopenharmony_ci		err = mbind_range(&vmi, vma, &prev, start, end, new);
156762306a36Sopenharmony_ci		mpol_put(new);
156862306a36Sopenharmony_ci		if (err)
156962306a36Sopenharmony_ci			break;
157062306a36Sopenharmony_ci	}
157162306a36Sopenharmony_ci	mmap_write_unlock(mm);
157262306a36Sopenharmony_ci	return err;
157362306a36Sopenharmony_ci}
157462306a36Sopenharmony_ci
157562306a36Sopenharmony_ciSYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
157662306a36Sopenharmony_ci		unsigned long, mode, const unsigned long __user *, nmask,
157762306a36Sopenharmony_ci		unsigned long, maxnode, unsigned int, flags)
157862306a36Sopenharmony_ci{
157962306a36Sopenharmony_ci	return kernel_mbind(start, len, mode, nmask, maxnode, flags);
158062306a36Sopenharmony_ci}
158162306a36Sopenharmony_ci
158262306a36Sopenharmony_ci/* Set the process memory policy */
158362306a36Sopenharmony_cistatic long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
158462306a36Sopenharmony_ci				 unsigned long maxnode)
158562306a36Sopenharmony_ci{
158662306a36Sopenharmony_ci	unsigned short mode_flags;
158762306a36Sopenharmony_ci	nodemask_t nodes;
158862306a36Sopenharmony_ci	int lmode = mode;
158962306a36Sopenharmony_ci	int err;
159062306a36Sopenharmony_ci
159162306a36Sopenharmony_ci	err = sanitize_mpol_flags(&lmode, &mode_flags);
159262306a36Sopenharmony_ci	if (err)
159362306a36Sopenharmony_ci		return err;
159462306a36Sopenharmony_ci
159562306a36Sopenharmony_ci	err = get_nodes(&nodes, nmask, maxnode);
159662306a36Sopenharmony_ci	if (err)
159762306a36Sopenharmony_ci		return err;
159862306a36Sopenharmony_ci
159962306a36Sopenharmony_ci	return do_set_mempolicy(lmode, mode_flags, &nodes);
160062306a36Sopenharmony_ci}
160162306a36Sopenharmony_ci
160262306a36Sopenharmony_ciSYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
160362306a36Sopenharmony_ci		unsigned long, maxnode)
160462306a36Sopenharmony_ci{
160562306a36Sopenharmony_ci	return kernel_set_mempolicy(mode, nmask, maxnode);
160662306a36Sopenharmony_ci}
160762306a36Sopenharmony_ci
160862306a36Sopenharmony_cistatic int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
160962306a36Sopenharmony_ci				const unsigned long __user *old_nodes,
161062306a36Sopenharmony_ci				const unsigned long __user *new_nodes)
161162306a36Sopenharmony_ci{
161262306a36Sopenharmony_ci	struct mm_struct *mm = NULL;
161362306a36Sopenharmony_ci	struct task_struct *task;
161462306a36Sopenharmony_ci	nodemask_t task_nodes;
161562306a36Sopenharmony_ci	int err;
161662306a36Sopenharmony_ci	nodemask_t *old;
161762306a36Sopenharmony_ci	nodemask_t *new;
161862306a36Sopenharmony_ci	NODEMASK_SCRATCH(scratch);
161962306a36Sopenharmony_ci
162062306a36Sopenharmony_ci	if (!scratch)
162162306a36Sopenharmony_ci		return -ENOMEM;
162262306a36Sopenharmony_ci
162362306a36Sopenharmony_ci	old = &scratch->mask1;
162462306a36Sopenharmony_ci	new = &scratch->mask2;
162562306a36Sopenharmony_ci
162662306a36Sopenharmony_ci	err = get_nodes(old, old_nodes, maxnode);
162762306a36Sopenharmony_ci	if (err)
162862306a36Sopenharmony_ci		goto out;
162962306a36Sopenharmony_ci
163062306a36Sopenharmony_ci	err = get_nodes(new, new_nodes, maxnode);
163162306a36Sopenharmony_ci	if (err)
163262306a36Sopenharmony_ci		goto out;
163362306a36Sopenharmony_ci
163462306a36Sopenharmony_ci	/* Find the mm_struct */
163562306a36Sopenharmony_ci	rcu_read_lock();
163662306a36Sopenharmony_ci	task = pid ? find_task_by_vpid(pid) : current;
163762306a36Sopenharmony_ci	if (!task) {
163862306a36Sopenharmony_ci		rcu_read_unlock();
163962306a36Sopenharmony_ci		err = -ESRCH;
164062306a36Sopenharmony_ci		goto out;
164162306a36Sopenharmony_ci	}
164262306a36Sopenharmony_ci	get_task_struct(task);
164362306a36Sopenharmony_ci
164462306a36Sopenharmony_ci	err = -EINVAL;
164562306a36Sopenharmony_ci
164662306a36Sopenharmony_ci	/*
164762306a36Sopenharmony_ci	 * Check if this process has the right to modify the specified process.
164862306a36Sopenharmony_ci	 * Use the regular "ptrace_may_access()" checks.
164962306a36Sopenharmony_ci	 */
165062306a36Sopenharmony_ci	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
165162306a36Sopenharmony_ci		rcu_read_unlock();
165262306a36Sopenharmony_ci		err = -EPERM;
165362306a36Sopenharmony_ci		goto out_put;
165462306a36Sopenharmony_ci	}
165562306a36Sopenharmony_ci	rcu_read_unlock();
165662306a36Sopenharmony_ci
165762306a36Sopenharmony_ci	task_nodes = cpuset_mems_allowed(task);
165862306a36Sopenharmony_ci	/* Is the user allowed to access the target nodes? */
165962306a36Sopenharmony_ci	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
166062306a36Sopenharmony_ci		err = -EPERM;
166162306a36Sopenharmony_ci		goto out_put;
166262306a36Sopenharmony_ci	}
166362306a36Sopenharmony_ci
166462306a36Sopenharmony_ci	task_nodes = cpuset_mems_allowed(current);
166562306a36Sopenharmony_ci	nodes_and(*new, *new, task_nodes);
166662306a36Sopenharmony_ci	if (nodes_empty(*new))
166762306a36Sopenharmony_ci		goto out_put;
166862306a36Sopenharmony_ci
166962306a36Sopenharmony_ci	err = security_task_movememory(task);
167062306a36Sopenharmony_ci	if (err)
167162306a36Sopenharmony_ci		goto out_put;
167262306a36Sopenharmony_ci
167362306a36Sopenharmony_ci	mm = get_task_mm(task);
167462306a36Sopenharmony_ci	put_task_struct(task);
167562306a36Sopenharmony_ci
167662306a36Sopenharmony_ci	if (!mm) {
167762306a36Sopenharmony_ci		err = -EINVAL;
167862306a36Sopenharmony_ci		goto out;
167962306a36Sopenharmony_ci	}
168062306a36Sopenharmony_ci
168162306a36Sopenharmony_ci	err = do_migrate_pages(mm, old, new,
168262306a36Sopenharmony_ci		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
168362306a36Sopenharmony_ci
168462306a36Sopenharmony_ci	mmput(mm);
168562306a36Sopenharmony_ciout:
168662306a36Sopenharmony_ci	NODEMASK_SCRATCH_FREE(scratch);
168762306a36Sopenharmony_ci
168862306a36Sopenharmony_ci	return err;
168962306a36Sopenharmony_ci
169062306a36Sopenharmony_ciout_put:
169162306a36Sopenharmony_ci	put_task_struct(task);
169262306a36Sopenharmony_ci	goto out;
169362306a36Sopenharmony_ci
169462306a36Sopenharmony_ci}
169562306a36Sopenharmony_ci
169662306a36Sopenharmony_ciSYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
169762306a36Sopenharmony_ci		const unsigned long __user *, old_nodes,
169862306a36Sopenharmony_ci		const unsigned long __user *, new_nodes)
169962306a36Sopenharmony_ci{
170062306a36Sopenharmony_ci	return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
170162306a36Sopenharmony_ci}
170262306a36Sopenharmony_ci
170362306a36Sopenharmony_ci
170462306a36Sopenharmony_ci/* Retrieve NUMA policy */
170562306a36Sopenharmony_cistatic int kernel_get_mempolicy(int __user *policy,
170662306a36Sopenharmony_ci				unsigned long __user *nmask,
170762306a36Sopenharmony_ci				unsigned long maxnode,
170862306a36Sopenharmony_ci				unsigned long addr,
170962306a36Sopenharmony_ci				unsigned long flags)
171062306a36Sopenharmony_ci{
171162306a36Sopenharmony_ci	int err;
171262306a36Sopenharmony_ci	int pval;
171362306a36Sopenharmony_ci	nodemask_t nodes;
171462306a36Sopenharmony_ci
171562306a36Sopenharmony_ci	if (nmask != NULL && maxnode < nr_node_ids)
171662306a36Sopenharmony_ci		return -EINVAL;
171762306a36Sopenharmony_ci
171862306a36Sopenharmony_ci	addr = untagged_addr(addr);
171962306a36Sopenharmony_ci
172062306a36Sopenharmony_ci	err = do_get_mempolicy(&pval, &nodes, addr, flags);
172162306a36Sopenharmony_ci
172262306a36Sopenharmony_ci	if (err)
172362306a36Sopenharmony_ci		return err;
172462306a36Sopenharmony_ci
172562306a36Sopenharmony_ci	if (policy && put_user(pval, policy))
172662306a36Sopenharmony_ci		return -EFAULT;
172762306a36Sopenharmony_ci
172862306a36Sopenharmony_ci	if (nmask)
172962306a36Sopenharmony_ci		err = copy_nodes_to_user(nmask, maxnode, &nodes);
173062306a36Sopenharmony_ci
173162306a36Sopenharmony_ci	return err;
173262306a36Sopenharmony_ci}
173362306a36Sopenharmony_ci
173462306a36Sopenharmony_ciSYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
173562306a36Sopenharmony_ci		unsigned long __user *, nmask, unsigned long, maxnode,
173662306a36Sopenharmony_ci		unsigned long, addr, unsigned long, flags)
173762306a36Sopenharmony_ci{
173862306a36Sopenharmony_ci	return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
173962306a36Sopenharmony_ci}
174062306a36Sopenharmony_ci
174162306a36Sopenharmony_cibool vma_migratable(struct vm_area_struct *vma)
174262306a36Sopenharmony_ci{
174362306a36Sopenharmony_ci	if (vma->vm_flags & (VM_IO | VM_PFNMAP))
174462306a36Sopenharmony_ci		return false;
174562306a36Sopenharmony_ci
174662306a36Sopenharmony_ci	/*
174762306a36Sopenharmony_ci	 * DAX device mappings require predictable access latency, so avoid
174862306a36Sopenharmony_ci	 * incurring periodic faults.
174962306a36Sopenharmony_ci	 */
175062306a36Sopenharmony_ci	if (vma_is_dax(vma))
175162306a36Sopenharmony_ci		return false;
175262306a36Sopenharmony_ci
175362306a36Sopenharmony_ci	if (is_vm_hugetlb_page(vma) &&
175462306a36Sopenharmony_ci		!hugepage_migration_supported(hstate_vma(vma)))
175562306a36Sopenharmony_ci		return false;
175662306a36Sopenharmony_ci
175762306a36Sopenharmony_ci	/*
175862306a36Sopenharmony_ci	 * Migration allocates pages in the highest zone. If we cannot
175962306a36Sopenharmony_ci	 * do so then migration (at least from node to node) is not
176062306a36Sopenharmony_ci	 * possible.
176162306a36Sopenharmony_ci	 */
176262306a36Sopenharmony_ci	if (vma->vm_file &&
176362306a36Sopenharmony_ci		gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
176462306a36Sopenharmony_ci			< policy_zone)
176562306a36Sopenharmony_ci		return false;
176662306a36Sopenharmony_ci	return true;
176762306a36Sopenharmony_ci}
176862306a36Sopenharmony_ci
176962306a36Sopenharmony_cistruct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
177062306a36Sopenharmony_ci						unsigned long addr)
177162306a36Sopenharmony_ci{
177262306a36Sopenharmony_ci	struct mempolicy *pol = NULL;
177362306a36Sopenharmony_ci
177462306a36Sopenharmony_ci	if (vma) {
177562306a36Sopenharmony_ci		if (vma->vm_ops && vma->vm_ops->get_policy) {
177662306a36Sopenharmony_ci			pol = vma->vm_ops->get_policy(vma, addr);
177762306a36Sopenharmony_ci		} else if (vma->vm_policy) {
177862306a36Sopenharmony_ci			pol = vma->vm_policy;
177962306a36Sopenharmony_ci
178062306a36Sopenharmony_ci			/*
178162306a36Sopenharmony_ci			 * shmem_alloc_page() passes MPOL_F_SHARED policy with
178262306a36Sopenharmony_ci			 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
178362306a36Sopenharmony_ci			 * count on these policies which will be dropped by
178462306a36Sopenharmony_ci			 * mpol_cond_put() later
178562306a36Sopenharmony_ci			 */
178662306a36Sopenharmony_ci			if (mpol_needs_cond_ref(pol))
178762306a36Sopenharmony_ci				mpol_get(pol);
178862306a36Sopenharmony_ci		}
178962306a36Sopenharmony_ci	}
179062306a36Sopenharmony_ci
179162306a36Sopenharmony_ci	return pol;
179262306a36Sopenharmony_ci}
179362306a36Sopenharmony_ci
179462306a36Sopenharmony_ci/*
179562306a36Sopenharmony_ci * get_vma_policy(@vma, @addr)
179662306a36Sopenharmony_ci * @vma: virtual memory area whose policy is sought
179762306a36Sopenharmony_ci * @addr: address in @vma for shared policy lookup
179862306a36Sopenharmony_ci *
179962306a36Sopenharmony_ci * Returns effective policy for a VMA at specified address.
180062306a36Sopenharmony_ci * Falls back to current->mempolicy or system default policy, as necessary.
180162306a36Sopenharmony_ci * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
180262306a36Sopenharmony_ci * count--added by the get_policy() vm_op, as appropriate--to protect against
180362306a36Sopenharmony_ci * freeing by another task.  It is the caller's responsibility to free the
180462306a36Sopenharmony_ci * extra reference for shared policies.
180562306a36Sopenharmony_ci */
180662306a36Sopenharmony_cistatic struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
180762306a36Sopenharmony_ci						unsigned long addr)
180862306a36Sopenharmony_ci{
180962306a36Sopenharmony_ci	struct mempolicy *pol = __get_vma_policy(vma, addr);
181062306a36Sopenharmony_ci
181162306a36Sopenharmony_ci	if (!pol)
181262306a36Sopenharmony_ci		pol = get_task_policy(current);
181362306a36Sopenharmony_ci
181462306a36Sopenharmony_ci	return pol;
181562306a36Sopenharmony_ci}
181662306a36Sopenharmony_ci
181762306a36Sopenharmony_cibool vma_policy_mof(struct vm_area_struct *vma)
181862306a36Sopenharmony_ci{
181962306a36Sopenharmony_ci	struct mempolicy *pol;
182062306a36Sopenharmony_ci
182162306a36Sopenharmony_ci	if (vma->vm_ops && vma->vm_ops->get_policy) {
182262306a36Sopenharmony_ci		bool ret = false;
182362306a36Sopenharmony_ci
182462306a36Sopenharmony_ci		pol = vma->vm_ops->get_policy(vma, vma->vm_start);
182562306a36Sopenharmony_ci		if (pol && (pol->flags & MPOL_F_MOF))
182662306a36Sopenharmony_ci			ret = true;
182762306a36Sopenharmony_ci		mpol_cond_put(pol);
182862306a36Sopenharmony_ci
182962306a36Sopenharmony_ci		return ret;
183062306a36Sopenharmony_ci	}
183162306a36Sopenharmony_ci
183262306a36Sopenharmony_ci	pol = vma->vm_policy;
183362306a36Sopenharmony_ci	if (!pol)
183462306a36Sopenharmony_ci		pol = get_task_policy(current);
183562306a36Sopenharmony_ci
183662306a36Sopenharmony_ci	return pol->flags & MPOL_F_MOF;
183762306a36Sopenharmony_ci}
183862306a36Sopenharmony_ci
/*
 * Return true if allocations for @zone should have @policy's nodemask
 * applied (used by policy_nodemask() for MPOL_BIND).
 */
183962306a36Sopenharmony_cibool apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
184062306a36Sopenharmony_ci{
184162306a36Sopenharmony_ci	enum zone_type dynamic_policy_zone = policy_zone;
184262306a36Sopenharmony_ci
184362306a36Sopenharmony_ci	BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
184462306a36Sopenharmony_ci
184562306a36Sopenharmony_ci	/*
184662306a36Sopenharmony_ci	 * if policy->nodes has movable memory only,
184762306a36Sopenharmony_ci	 * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
184862306a36Sopenharmony_ci	 *
184962306a36Sopenharmony_ci	 * policy->nodes is intersect with node_states[N_MEMORY].
185062306a36Sopenharmony_ci	 * so if the following test fails, it implies
185162306a36Sopenharmony_ci	 * policy->nodes has movable memory only.
185262306a36Sopenharmony_ci	 */
185362306a36Sopenharmony_ci	if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY]))
185462306a36Sopenharmony_ci		dynamic_policy_zone = ZONE_MOVABLE;
185562306a36Sopenharmony_ci
185662306a36Sopenharmony_ci	return zone >= dynamic_policy_zone;
185762306a36Sopenharmony_ci}
185862306a36Sopenharmony_ci
185962306a36Sopenharmony_ci/*
186062306a36Sopenharmony_ci * Return a nodemask representing a mempolicy for filtering nodes for
186162306a36Sopenharmony_ci * page allocation
 *
 * Returns NULL when no nodemask filtering should be applied (all policies
 * other than 'bind' and 'prefer-many', and 'bind' allocations targeting
 * zones below the policy zone or conflicting with the cpuset).
186262306a36Sopenharmony_ci */
186362306a36Sopenharmony_cinodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
186462306a36Sopenharmony_ci{
186562306a36Sopenharmony_ci	int mode = policy->mode;
186662306a36Sopenharmony_ci
186762306a36Sopenharmony_ci	/* Lower zones don't get a nodemask applied for MPOL_BIND */
186862306a36Sopenharmony_ci	if (unlikely(mode == MPOL_BIND) &&
186962306a36Sopenharmony_ci		apply_policy_zone(policy, gfp_zone(gfp)) &&
187062306a36Sopenharmony_ci		cpuset_nodemask_valid_mems_allowed(&policy->nodes))
187162306a36Sopenharmony_ci		return &policy->nodes;
187262306a36Sopenharmony_ci
187362306a36Sopenharmony_ci	if (mode == MPOL_PREFERRED_MANY)
187462306a36Sopenharmony_ci		return &policy->nodes;
187562306a36Sopenharmony_ci
187662306a36Sopenharmony_ci	return NULL;
187762306a36Sopenharmony_ci}
187862306a36Sopenharmony_ci
187962306a36Sopenharmony_ci/*
188062306a36Sopenharmony_ci * Return the preferred node id for 'prefer' mempolicy, and return
188162306a36Sopenharmony_ci * the given id for all other policies.
188262306a36Sopenharmony_ci *
188362306a36Sopenharmony_ci * policy_node() is always coupled with policy_nodemask(), which
188462306a36Sopenharmony_ci * secures the nodemask limit for 'bind' and 'prefer-many' policy.
188562306a36Sopenharmony_ci */
188662306a36Sopenharmony_cistatic int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
188762306a36Sopenharmony_ci{
188862306a36Sopenharmony_ci	if (policy->mode == MPOL_PREFERRED) {
188962306a36Sopenharmony_ci		nd = first_node(policy->nodes);
189062306a36Sopenharmony_ci	} else {
189162306a36Sopenharmony_ci		/*
189262306a36Sopenharmony_ci		 * __GFP_THISNODE shouldn't even be used with the bind policy
189362306a36Sopenharmony_ci		 * because we might easily break the expectation to stay on the
189462306a36Sopenharmony_ci		 * requested node and not break the policy.
189562306a36Sopenharmony_ci		 */
189662306a36Sopenharmony_ci		WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
189762306a36Sopenharmony_ci	}
189862306a36Sopenharmony_ci
	/* A set_mempolicy_home_node() hint overrides the default start node */
189962306a36Sopenharmony_ci	if ((policy->mode == MPOL_BIND ||
190062306a36Sopenharmony_ci	     policy->mode == MPOL_PREFERRED_MANY) &&
190162306a36Sopenharmony_ci	    policy->home_node != NUMA_NO_NODE)
190262306a36Sopenharmony_ci		return policy->home_node;
190362306a36Sopenharmony_ci
190462306a36Sopenharmony_ci	return nd;
190562306a36Sopenharmony_ci}
190662306a36Sopenharmony_ci
190762306a36Sopenharmony_ci/* Do dynamic interleaving for a process */
190862306a36Sopenharmony_cistatic unsigned interleave_nodes(struct mempolicy *policy)
190962306a36Sopenharmony_ci{
191062306a36Sopenharmony_ci	unsigned next;
191162306a36Sopenharmony_ci	struct task_struct *me = current;
191262306a36Sopenharmony_ci
191362306a36Sopenharmony_ci	next = next_node_in(me->il_prev, policy->nodes);
	/* Only advance il_prev for a valid node (mask could be empty) */
191462306a36Sopenharmony_ci	if (next < MAX_NUMNODES)
191562306a36Sopenharmony_ci		me->il_prev = next;
191662306a36Sopenharmony_ci	return next;
191762306a36Sopenharmony_ci}
191862306a36Sopenharmony_ci
191962306a36Sopenharmony_ci/*
192062306a36Sopenharmony_ci * Depending on the memory policy provide a node from which to allocate the
192162306a36Sopenharmony_ci * next slab entry.
 *
 * Falls back to the local memory node when there is no usable task policy
 * (interrupt context or no policy set).
192262306a36Sopenharmony_ci */
192362306a36Sopenharmony_ciunsigned int mempolicy_slab_node(void)
192462306a36Sopenharmony_ci{
192562306a36Sopenharmony_ci	struct mempolicy *policy;
192662306a36Sopenharmony_ci	int node = numa_mem_id();
192762306a36Sopenharmony_ci
192862306a36Sopenharmony_ci	if (!in_task())
192962306a36Sopenharmony_ci		return node;
193062306a36Sopenharmony_ci
193162306a36Sopenharmony_ci	policy = current->mempolicy;
193262306a36Sopenharmony_ci	if (!policy)
193362306a36Sopenharmony_ci		return node;
193462306a36Sopenharmony_ci
193562306a36Sopenharmony_ci	switch (policy->mode) {
193662306a36Sopenharmony_ci	case MPOL_PREFERRED:
193762306a36Sopenharmony_ci		return first_node(policy->nodes);
193862306a36Sopenharmony_ci
193962306a36Sopenharmony_ci	case MPOL_INTERLEAVE:
194062306a36Sopenharmony_ci		return interleave_nodes(policy);
194162306a36Sopenharmony_ci
194262306a36Sopenharmony_ci	case MPOL_BIND:
194362306a36Sopenharmony_ci	case MPOL_PREFERRED_MANY:
194462306a36Sopenharmony_ci	{
194562306a36Sopenharmony_ci		struct zoneref *z;
194662306a36Sopenharmony_ci
194762306a36Sopenharmony_ci		/*
194862306a36Sopenharmony_ci		 * Follow bind policy behavior and start allocation at the
194962306a36Sopenharmony_ci		 * first node.
195062306a36Sopenharmony_ci		 */
195162306a36Sopenharmony_ci		struct zonelist *zonelist;
195262306a36Sopenharmony_ci		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
195362306a36Sopenharmony_ci		zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
195462306a36Sopenharmony_ci		z = first_zones_zonelist(zonelist, highest_zoneidx,
195562306a36Sopenharmony_ci							&policy->nodes);
		/* No zone in the policy's nodes: fall back to local node */
195662306a36Sopenharmony_ci		return z->zone ? zone_to_nid(z->zone) : node;
195762306a36Sopenharmony_ci	}
195862306a36Sopenharmony_ci	case MPOL_LOCAL:
195962306a36Sopenharmony_ci		return node;
196062306a36Sopenharmony_ci
196162306a36Sopenharmony_ci	default:
196262306a36Sopenharmony_ci		BUG();
196362306a36Sopenharmony_ci	}
196462306a36Sopenharmony_ci}
196562306a36Sopenharmony_ci
196662306a36Sopenharmony_ci/*
196762306a36Sopenharmony_ci * Do static interleaving for a VMA with known offset @n.  Returns the n'th
196862306a36Sopenharmony_ci * node in pol->nodes (starting from n=0), wrapping around if n exceeds the
196962306a36Sopenharmony_ci * number of present nodes.
197062306a36Sopenharmony_ci */
197162306a36Sopenharmony_cistatic unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
197262306a36Sopenharmony_ci{
197362306a36Sopenharmony_ci	nodemask_t nodemask = pol->nodes;
197462306a36Sopenharmony_ci	unsigned int target, nnodes;
197562306a36Sopenharmony_ci	int i;
197662306a36Sopenharmony_ci	int nid;
197762306a36Sopenharmony_ci	/*
197862306a36Sopenharmony_ci	 * The barrier will stabilize the nodemask in a register or on
197962306a36Sopenharmony_ci	 * the stack so that it will stop changing under the code.
198062306a36Sopenharmony_ci	 *
198162306a36Sopenharmony_ci	 * Between first_node() and next_node(), pol->nodes could be changed
198262306a36Sopenharmony_ci	 * by other threads. So we put pol->nodes in a local stack.
198362306a36Sopenharmony_ci	 */
198462306a36Sopenharmony_ci	barrier();
198562306a36Sopenharmony_ci
198662306a36Sopenharmony_ci	nnodes = nodes_weight(nodemask);
	/* Empty mask: interleaving is meaningless, use the local node */
198762306a36Sopenharmony_ci	if (!nnodes)
198862306a36Sopenharmony_ci		return numa_node_id();
198962306a36Sopenharmony_ci	target = (unsigned int)n % nnodes;
199062306a36Sopenharmony_ci	nid = first_node(nodemask);
199162306a36Sopenharmony_ci	for (i = 0; i < target; i++)
199262306a36Sopenharmony_ci		nid = next_node(nid, nodemask);
199362306a36Sopenharmony_ci	return nid;
199462306a36Sopenharmony_ci}
199562306a36Sopenharmony_ci
199662306a36Sopenharmony_ci/* Determine a node number for interleave */
199762306a36Sopenharmony_cistatic inline unsigned interleave_nid(struct mempolicy *pol,
199862306a36Sopenharmony_ci		 struct vm_area_struct *vma, unsigned long addr, int shift)
199962306a36Sopenharmony_ci{
200062306a36Sopenharmony_ci	if (vma) {
200162306a36Sopenharmony_ci		unsigned long off;
200262306a36Sopenharmony_ci
200362306a36Sopenharmony_ci		/*
200462306a36Sopenharmony_ci		 * for small pages, there is no difference between
200562306a36Sopenharmony_ci		 * shift and PAGE_SHIFT, so the bit-shift is safe.
200662306a36Sopenharmony_ci		 * for huge pages, since vm_pgoff is in units of small
200762306a36Sopenharmony_ci		 * pages, we need to shift off the always 0 bits to get
200862306a36Sopenharmony_ci		 * a useful offset.
200962306a36Sopenharmony_ci		 */
201062306a36Sopenharmony_ci		BUG_ON(shift < PAGE_SHIFT);
201162306a36Sopenharmony_ci		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
201262306a36Sopenharmony_ci		off += (addr - vma->vm_start) >> shift;
201362306a36Sopenharmony_ci		return offset_il_node(pol, off);
201462306a36Sopenharmony_ci	} else
		/* No VMA: fall back to per-task round-robin interleaving */
201562306a36Sopenharmony_ci		return interleave_nodes(pol);
201662306a36Sopenharmony_ci}
201762306a36Sopenharmony_ci
201862306a36Sopenharmony_ci#ifdef CONFIG_HUGETLBFS
201962306a36Sopenharmony_ci/*
202062306a36Sopenharmony_ci * huge_node(@vma, @addr, @gfp_flags, @mpol, @nodemask)
202162306a36Sopenharmony_ci * @vma: virtual memory area whose policy is sought
202262306a36Sopenharmony_ci * @addr: address in @vma for shared policy lookup and interleave policy
202362306a36Sopenharmony_ci * @gfp_flags: for requested zone
202462306a36Sopenharmony_ci * @mpol: pointer to mempolicy pointer for reference counted mempolicy
202562306a36Sopenharmony_ci * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy
202662306a36Sopenharmony_ci *
202762306a36Sopenharmony_ci * Returns a nid suitable for a huge page allocation and a pointer
202862306a36Sopenharmony_ci * to the struct mempolicy for conditional unref after allocation.
202962306a36Sopenharmony_ci * If the effective policy is 'bind' or 'prefer-many', returns a pointer
203062306a36Sopenharmony_ci * to the mempolicy's @nodemask for filtering the zonelist.
203162306a36Sopenharmony_ci *
203262306a36Sopenharmony_ci * Must be protected by read_mems_allowed_begin()
203362306a36Sopenharmony_ci */
203462306a36Sopenharmony_ciint huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
203562306a36Sopenharmony_ci				struct mempolicy **mpol, nodemask_t **nodemask)
203662306a36Sopenharmony_ci{
203762306a36Sopenharmony_ci	int nid;
203862306a36Sopenharmony_ci	int mode;
203962306a36Sopenharmony_ci
204062306a36Sopenharmony_ci	*mpol = get_vma_policy(vma, addr);
204162306a36Sopenharmony_ci	*nodemask = NULL;
204262306a36Sopenharmony_ci	mode = (*mpol)->mode;
204362306a36Sopenharmony_ci
204462306a36Sopenharmony_ci	if (unlikely(mode == MPOL_INTERLEAVE)) {
		/* Interleave in huge-page-sized strides across the mapping */
204562306a36Sopenharmony_ci		nid = interleave_nid(*mpol, vma, addr,
204662306a36Sopenharmony_ci					huge_page_shift(hstate_vma(vma)));
204762306a36Sopenharmony_ci	} else {
204862306a36Sopenharmony_ci		nid = policy_node(gfp_flags, *mpol, numa_node_id());
204962306a36Sopenharmony_ci		if (mode == MPOL_BIND || mode == MPOL_PREFERRED_MANY)
205062306a36Sopenharmony_ci			*nodemask = &(*mpol)->nodes;
205162306a36Sopenharmony_ci	}
205262306a36Sopenharmony_ci	return nid;
205362306a36Sopenharmony_ci}
205462306a36Sopenharmony_ci
205562306a36Sopenharmony_ci/*
205662306a36Sopenharmony_ci * init_nodemask_of_mempolicy
205762306a36Sopenharmony_ci *
205862306a36Sopenharmony_ci * If the current task's mempolicy is "default" [NULL], return 'false'
205962306a36Sopenharmony_ci * to indicate default policy.  Otherwise, extract the policy nodemask
206062306a36Sopenharmony_ci * for 'bind' or 'interleave' policy into the argument nodemask, or
206162306a36Sopenharmony_ci * initialize the argument nodemask to contain the single node for
206262306a36Sopenharmony_ci * 'preferred' or 'local' policy and return 'true' to indicate presence
206362306a36Sopenharmony_ci * of non-default mempolicy.
206462306a36Sopenharmony_ci *
206562306a36Sopenharmony_ci * We don't bother with reference counting the mempolicy [mpol_get/put]
206662306a36Sopenharmony_ci * because the current task is examining its own mempolicy and a task's
206762306a36Sopenharmony_ci * mempolicy is only ever changed by the task itself.
206862306a36Sopenharmony_ci *
206962306a36Sopenharmony_ci * N.B., it is the caller's responsibility to free a returned nodemask.
207062306a36Sopenharmony_ci */
207162306a36Sopenharmony_cibool init_nodemask_of_mempolicy(nodemask_t *mask)
207262306a36Sopenharmony_ci{
207362306a36Sopenharmony_ci	struct mempolicy *mempolicy;
207462306a36Sopenharmony_ci
207562306a36Sopenharmony_ci	if (!(mask && current->mempolicy))
207662306a36Sopenharmony_ci		return false;
207762306a36Sopenharmony_ci
	/* task_lock keeps mode/nodes coherent while we copy them */
207862306a36Sopenharmony_ci	task_lock(current);
207962306a36Sopenharmony_ci	mempolicy = current->mempolicy;
208062306a36Sopenharmony_ci	switch (mempolicy->mode) {
208162306a36Sopenharmony_ci	case MPOL_PREFERRED:
208262306a36Sopenharmony_ci	case MPOL_PREFERRED_MANY:
208362306a36Sopenharmony_ci	case MPOL_BIND:
208462306a36Sopenharmony_ci	case MPOL_INTERLEAVE:
208562306a36Sopenharmony_ci		*mask = mempolicy->nodes;
208662306a36Sopenharmony_ci		break;
208762306a36Sopenharmony_ci
208862306a36Sopenharmony_ci	case MPOL_LOCAL:
208962306a36Sopenharmony_ci		init_nodemask_of_node(mask, numa_node_id());
209062306a36Sopenharmony_ci		break;
209162306a36Sopenharmony_ci
209262306a36Sopenharmony_ci	default:
209362306a36Sopenharmony_ci		BUG();
209462306a36Sopenharmony_ci	}
209562306a36Sopenharmony_ci	task_unlock(current);
209662306a36Sopenharmony_ci
209762306a36Sopenharmony_ci	return true;
209862306a36Sopenharmony_ci}
209962306a36Sopenharmony_ci#endif
210062306a36Sopenharmony_ci
210162306a36Sopenharmony_ci/*
210262306a36Sopenharmony_ci * mempolicy_in_oom_domain
210362306a36Sopenharmony_ci *
210462306a36Sopenharmony_ci * If tsk's mempolicy is "bind", check for intersection between mask and
210562306a36Sopenharmony_ci * the policy nodemask. Otherwise, return true for all other policies
210662306a36Sopenharmony_ci * including "interleave", as a tsk with "interleave" policy may have
210762306a36Sopenharmony_ci * memory allocated from all nodes in system.
210862306a36Sopenharmony_ci *
210962306a36Sopenharmony_ci * Takes task_lock(tsk) to prevent freeing of its mempolicy.
211062306a36Sopenharmony_ci */
211162306a36Sopenharmony_cibool mempolicy_in_oom_domain(struct task_struct *tsk,
211262306a36Sopenharmony_ci					const nodemask_t *mask)
211362306a36Sopenharmony_ci{
211462306a36Sopenharmony_ci	struct mempolicy *mempolicy;
211562306a36Sopenharmony_ci	bool ret = true;
211662306a36Sopenharmony_ci
211762306a36Sopenharmony_ci	if (!mask)
211862306a36Sopenharmony_ci		return ret;
211962306a36Sopenharmony_ci
212062306a36Sopenharmony_ci	task_lock(tsk);
212162306a36Sopenharmony_ci	mempolicy = tsk->mempolicy;
212262306a36Sopenharmony_ci	if (mempolicy && mempolicy->mode == MPOL_BIND)
212362306a36Sopenharmony_ci		ret = nodes_intersects(mempolicy->nodes, *mask);
212462306a36Sopenharmony_ci	task_unlock(tsk);
212562306a36Sopenharmony_ci
212662306a36Sopenharmony_ci	return ret;
212762306a36Sopenharmony_ci}
212862306a36Sopenharmony_ci
212962306a36Sopenharmony_ci/* Allocate a page in interleaved policy.
213062306a36Sopenharmony_ci   Own path because it needs to do special accounting. */
213162306a36Sopenharmony_cistatic struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
213262306a36Sopenharmony_ci					unsigned nid)
213362306a36Sopenharmony_ci{
213462306a36Sopenharmony_ci	struct page *page;
213562306a36Sopenharmony_ci
213662306a36Sopenharmony_ci	page = __alloc_pages(gfp, order, nid, NULL);
213762306a36Sopenharmony_ci	/* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */
213862306a36Sopenharmony_ci	if (!static_branch_likely(&vm_numa_stat_key))
213962306a36Sopenharmony_ci		return page;
	/* Only count a hit when the page actually landed on the asked-for node */
214062306a36Sopenharmony_ci	if (page && page_to_nid(page) == nid) {
214162306a36Sopenharmony_ci		preempt_disable();
214262306a36Sopenharmony_ci		__count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT);
214362306a36Sopenharmony_ci		preempt_enable();
214462306a36Sopenharmony_ci	}
214562306a36Sopenharmony_ci	return page;
214662306a36Sopenharmony_ci}
214762306a36Sopenharmony_ci
/*
 * Allocate a page for the MPOL_PREFERRED_MANY policy: first restricted to
 * the preferred nodes, then unrestricted if that fails.
 */
214862306a36Sopenharmony_cistatic struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
214962306a36Sopenharmony_ci						int nid, struct mempolicy *pol)
215062306a36Sopenharmony_ci{
215162306a36Sopenharmony_ci	struct page *page;
215262306a36Sopenharmony_ci	gfp_t preferred_gfp;
215362306a36Sopenharmony_ci
215462306a36Sopenharmony_ci	/*
215562306a36Sopenharmony_ci	 * This is a two pass approach. The first pass will only try the
215662306a36Sopenharmony_ci	 * preferred nodes but skip the direct reclaim and allow the
215762306a36Sopenharmony_ci	 * allocation to fail, while the second pass will try all the
215862306a36Sopenharmony_ci	 * nodes in system.
215962306a36Sopenharmony_ci	 */
216062306a36Sopenharmony_ci	preferred_gfp = gfp | __GFP_NOWARN;
216162306a36Sopenharmony_ci	preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
216262306a36Sopenharmony_ci	page = __alloc_pages(preferred_gfp, order, nid, &pol->nodes);
216362306a36Sopenharmony_ci	if (!page)
216462306a36Sopenharmony_ci		page = __alloc_pages(gfp, order, nid, NULL);
216562306a36Sopenharmony_ci
216662306a36Sopenharmony_ci	return page;
216762306a36Sopenharmony_ci}
216862306a36Sopenharmony_ci
216962306a36Sopenharmony_ci/**
217062306a36Sopenharmony_ci * vma_alloc_folio - Allocate a folio for a VMA.
217162306a36Sopenharmony_ci * @gfp: GFP flags.
217262306a36Sopenharmony_ci * @order: Order of the folio.
217362306a36Sopenharmony_ci * @vma: Pointer to VMA or NULL if not available.
217462306a36Sopenharmony_ci * @addr: Virtual address of the allocation.  Must be inside @vma.
217562306a36Sopenharmony_ci * @hugepage: For hugepages try only the preferred node if possible.
217662306a36Sopenharmony_ci *
217762306a36Sopenharmony_ci * Allocate a folio for a specific address in @vma, using the appropriate
217862306a36Sopenharmony_ci * NUMA policy.  When @vma is not NULL the caller must hold the mmap_lock
217962306a36Sopenharmony_ci * of the mm_struct of the VMA to prevent it from going away.  Should be
218062306a36Sopenharmony_ci * used for all allocations for folios that will be mapped into user space.
218162306a36Sopenharmony_ci *
218262306a36Sopenharmony_ci * Return: The folio on success or NULL if allocation fails.
218362306a36Sopenharmony_ci */
218462306a36Sopenharmony_cistruct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
218562306a36Sopenharmony_ci		unsigned long addr, bool hugepage)
218662306a36Sopenharmony_ci{
218762306a36Sopenharmony_ci	struct mempolicy *pol;
218862306a36Sopenharmony_ci	int node = numa_node_id();
218962306a36Sopenharmony_ci	struct folio *folio;
219062306a36Sopenharmony_ci	int preferred_nid;
219162306a36Sopenharmony_ci	nodemask_t *nmask;
219262306a36Sopenharmony_ci
219362306a36Sopenharmony_ci	pol = get_vma_policy(vma, addr);
219462306a36Sopenharmony_ci
	/* Interleave: pick the nid from the VMA offset, allocate directly */
219562306a36Sopenharmony_ci	if (pol->mode == MPOL_INTERLEAVE) {
219662306a36Sopenharmony_ci		struct page *page;
219762306a36Sopenharmony_ci		unsigned nid;
219862306a36Sopenharmony_ci
219962306a36Sopenharmony_ci		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
220062306a36Sopenharmony_ci		mpol_cond_put(pol);
220162306a36Sopenharmony_ci		gfp |= __GFP_COMP;
220262306a36Sopenharmony_ci		page = alloc_page_interleave(gfp, order, nid);
220362306a36Sopenharmony_ci		folio = (struct page *)page == NULL ? NULL : (struct folio *)page;
		/* NOTE(review): original line kept below; cast is unconditional */
220462306a36Sopenharmony_ci		if (folio && order > 1)
220562306a36Sopenharmony_ci			folio_prep_large_rmappable(folio);
220662306a36Sopenharmony_ci		goto out;
220762306a36Sopenharmony_ci	}
220862306a36Sopenharmony_ci
220962306a36Sopenharmony_ci	if (pol->mode == MPOL_PREFERRED_MANY) {
221062306a36Sopenharmony_ci		struct page *page;
221162306a36Sopenharmony_ci
221262306a36Sopenharmony_ci		node = policy_node(gfp, pol, node);
221362306a36Sopenharmony_ci		gfp |= __GFP_COMP;
221462306a36Sopenharmony_ci		page = alloc_pages_preferred_many(gfp, order, node, pol);
221562306a36Sopenharmony_ci		mpol_cond_put(pol);
221662306a36Sopenharmony_ci		folio = (struct folio *)page;
221762306a36Sopenharmony_ci		if (folio && order > 1)
221862306a36Sopenharmony_ci			folio_prep_large_rmappable(folio);
221962306a36Sopenharmony_ci		goto out;
222062306a36Sopenharmony_ci	}
222162306a36Sopenharmony_ci
222262306a36Sopenharmony_ci	if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
222362306a36Sopenharmony_ci		int hpage_node = node;
222462306a36Sopenharmony_ci
222562306a36Sopenharmony_ci		/*
222662306a36Sopenharmony_ci		 * For hugepage allocation and non-interleave policy which
222762306a36Sopenharmony_ci		 * allows the current node (or other explicitly preferred
222862306a36Sopenharmony_ci		 * node) we only try to allocate from the current/preferred
222962306a36Sopenharmony_ci		 * node and don't fall back to other nodes, as the cost of
223062306a36Sopenharmony_ci		 * remote accesses would likely offset THP benefits.
223162306a36Sopenharmony_ci		 *
223262306a36Sopenharmony_ci		 * If the policy is interleave or does not allow the current
223362306a36Sopenharmony_ci		 * node in its nodemask, we allocate the standard way.
223462306a36Sopenharmony_ci		 */
223562306a36Sopenharmony_ci		if (pol->mode == MPOL_PREFERRED)
223662306a36Sopenharmony_ci			hpage_node = first_node(pol->nodes);
223762306a36Sopenharmony_ci
223862306a36Sopenharmony_ci		nmask = policy_nodemask(gfp, pol);
223962306a36Sopenharmony_ci		if (!nmask || node_isset(hpage_node, *nmask)) {
224062306a36Sopenharmony_ci			mpol_cond_put(pol);
224162306a36Sopenharmony_ci			/*
224262306a36Sopenharmony_ci			 * First, try to allocate THP only on local node, but
224362306a36Sopenharmony_ci			 * don't reclaim unnecessarily, just compact.
224462306a36Sopenharmony_ci			 */
224562306a36Sopenharmony_ci			folio = __folio_alloc_node(gfp | __GFP_THISNODE |
224662306a36Sopenharmony_ci					__GFP_NORETRY, order, hpage_node);
224762306a36Sopenharmony_ci
224862306a36Sopenharmony_ci			/*
224962306a36Sopenharmony_ci			 * If hugepage allocations are configured to always
225062306a36Sopenharmony_ci			 * synchronous compact or the vma has been madvised
225162306a36Sopenharmony_ci			 * to prefer hugepage backing, retry allowing remote
225262306a36Sopenharmony_ci			 * memory with both reclaim and compact as well.
225362306a36Sopenharmony_ci			 */
225462306a36Sopenharmony_ci			if (!folio && (gfp & __GFP_DIRECT_RECLAIM))
225562306a36Sopenharmony_ci				folio = __folio_alloc(gfp, order, hpage_node,
225662306a36Sopenharmony_ci						      nmask);
225762306a36Sopenharmony_ci
225862306a36Sopenharmony_ci			goto out;
225962306a36Sopenharmony_ci		}
226062306a36Sopenharmony_ci	}
226162306a36Sopenharmony_ci
	/* Default path: honour the policy's nodemask and preferred node */
226262306a36Sopenharmony_ci	nmask = policy_nodemask(gfp, pol);
226362306a36Sopenharmony_ci	preferred_nid = policy_node(gfp, pol, node);
226462306a36Sopenharmony_ci	folio = __folio_alloc(gfp, order, preferred_nid, nmask);
226562306a36Sopenharmony_ci	mpol_cond_put(pol);
226662306a36Sopenharmony_ciout:
226762306a36Sopenharmony_ci	return folio;
226862306a36Sopenharmony_ci}
226962306a36Sopenharmony_ciEXPORT_SYMBOL(vma_alloc_folio);
227062306a36Sopenharmony_ci
227162306a36Sopenharmony_ci/**
227262306a36Sopenharmony_ci * alloc_pages - Allocate pages.
227362306a36Sopenharmony_ci * @gfp: GFP flags.
227462306a36Sopenharmony_ci * @order: Power of two of number of pages to allocate.
227562306a36Sopenharmony_ci *
227662306a36Sopenharmony_ci * Allocate 1 << @order contiguous pages.  The physical address of the
227762306a36Sopenharmony_ci * first page is naturally aligned (eg an order-3 allocation will be aligned
227862306a36Sopenharmony_ci * to a multiple of 8 * PAGE_SIZE bytes).  The NUMA policy of the current
227962306a36Sopenharmony_ci * process is honoured when in process context.
228062306a36Sopenharmony_ci *
228162306a36Sopenharmony_ci * Context: Can be called from any context, providing the appropriate GFP
228262306a36Sopenharmony_ci * flags are used.
228362306a36Sopenharmony_ci * Return: The page on success or NULL if allocation fails.
228462306a36Sopenharmony_ci */
228562306a36Sopenharmony_cistruct page *alloc_pages(gfp_t gfp, unsigned order)
228662306a36Sopenharmony_ci{
228762306a36Sopenharmony_ci	struct mempolicy *pol = &default_policy;
228862306a36Sopenharmony_ci	struct page *page;
228962306a36Sopenharmony_ci
	/* In interrupt context or with __GFP_THISNODE, ignore the task policy */
229062306a36Sopenharmony_ci	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
229162306a36Sopenharmony_ci		pol = get_task_policy(current);
229262306a36Sopenharmony_ci
229362306a36Sopenharmony_ci	/*
229462306a36Sopenharmony_ci	 * No reference counting needed for current->mempolicy
229562306a36Sopenharmony_ci	 * nor system default_policy
229662306a36Sopenharmony_ci	 */
229762306a36Sopenharmony_ci	if (pol->mode == MPOL_INTERLEAVE)
229862306a36Sopenharmony_ci		page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
229962306a36Sopenharmony_ci	else if (pol->mode == MPOL_PREFERRED_MANY)
230062306a36Sopenharmony_ci		page = alloc_pages_preferred_many(gfp, order,
230162306a36Sopenharmony_ci				  policy_node(gfp, pol, numa_node_id()), pol);
230262306a36Sopenharmony_ci	else
230362306a36Sopenharmony_ci		page = __alloc_pages(gfp, order,
230462306a36Sopenharmony_ci				policy_node(gfp, pol, numa_node_id()),
230562306a36Sopenharmony_ci				policy_nodemask(gfp, pol));
230662306a36Sopenharmony_ci
230762306a36Sopenharmony_ci	return page;
230862306a36Sopenharmony_ci}
230962306a36Sopenharmony_ciEXPORT_SYMBOL(alloc_pages);
231062306a36Sopenharmony_ci
/*
 * folio_alloc - Allocate a folio, honouring the current task's NUMA policy.
 * Thin wrapper around alloc_pages() that forces __GFP_COMP and prepares
 * large rmappable folios for order > 1.
 */
231162306a36Sopenharmony_cistruct folio *folio_alloc(gfp_t gfp, unsigned order)
231262306a36Sopenharmony_ci{
231362306a36Sopenharmony_ci	struct page *page = alloc_pages(gfp | __GFP_COMP, order);
231462306a36Sopenharmony_ci	struct folio *folio = (struct folio *)page;
231562306a36Sopenharmony_ci
231662306a36Sopenharmony_ci	if (folio && order > 1)
231762306a36Sopenharmony_ci		folio_prep_large_rmappable(folio);
231862306a36Sopenharmony_ci	return folio;
231962306a36Sopenharmony_ci}
232062306a36Sopenharmony_ciEXPORT_SYMBOL(folio_alloc);
232162306a36Sopenharmony_ci
/*
 * Bulk-allocate @nr_pages pages interleaved over the policy's nodes,
 * filling @page_array.  The first 'delta' nodes get one extra page so the
 * remainder of nr_pages / nodes is distributed evenly.  Returns the number
 * of pages actually allocated.
 */
232262306a36Sopenharmony_cistatic unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp,
232362306a36Sopenharmony_ci		struct mempolicy *pol, unsigned long nr_pages,
232462306a36Sopenharmony_ci		struct page **page_array)
232562306a36Sopenharmony_ci{
232662306a36Sopenharmony_ci	int nodes;
232762306a36Sopenharmony_ci	unsigned long nr_pages_per_node;
232862306a36Sopenharmony_ci	int delta;
232962306a36Sopenharmony_ci	int i;
233062306a36Sopenharmony_ci	unsigned long nr_allocated;
233162306a36Sopenharmony_ci	unsigned long total_allocated = 0;
233262306a36Sopenharmony_ci
233362306a36Sopenharmony_ci	nodes = nodes_weight(pol->nodes);
233462306a36Sopenharmony_ci	nr_pages_per_node = nr_pages / nodes;
233562306a36Sopenharmony_ci	delta = nr_pages - nodes * nr_pages_per_node;
233662306a36Sopenharmony_ci
233762306a36Sopenharmony_ci	for (i = 0; i < nodes; i++) {
233862306a36Sopenharmony_ci		if (delta) {
233962306a36Sopenharmony_ci			nr_allocated = __alloc_pages_bulk(gfp,
234062306a36Sopenharmony_ci					interleave_nodes(pol), NULL,
234162306a36Sopenharmony_ci					nr_pages_per_node + 1, NULL,
234262306a36Sopenharmony_ci					page_array);
234362306a36Sopenharmony_ci			delta--;
234462306a36Sopenharmony_ci		} else {
234562306a36Sopenharmony_ci			nr_allocated = __alloc_pages_bulk(gfp,
234662306a36Sopenharmony_ci					interleave_nodes(pol), NULL,
234762306a36Sopenharmony_ci					nr_pages_per_node, NULL, page_array);
234862306a36Sopenharmony_ci		}
234962306a36Sopenharmony_ci
235062306a36Sopenharmony_ci		page_array += nr_allocated;
235162306a36Sopenharmony_ci		total_allocated += nr_allocated;
235262306a36Sopenharmony_ci	}
235362306a36Sopenharmony_ci
235462306a36Sopenharmony_ci	return total_allocated;
235562306a36Sopenharmony_ci}
235662306a36Sopenharmony_ci
/*
 * Bulk-allocate for MPOL_PREFERRED_MANY: first pass restricted to the
 * preferred nodes without direct reclaim, second pass unrestricted for
 * whatever is still missing.  Returns the total number of pages allocated.
 */
235762306a36Sopenharmony_cistatic unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid,
235862306a36Sopenharmony_ci		struct mempolicy *pol, unsigned long nr_pages,
235962306a36Sopenharmony_ci		struct page **page_array)
236062306a36Sopenharmony_ci{
236162306a36Sopenharmony_ci	gfp_t preferred_gfp;
236262306a36Sopenharmony_ci	unsigned long nr_allocated = 0;
236362306a36Sopenharmony_ci
236462306a36Sopenharmony_ci	preferred_gfp = gfp | __GFP_NOWARN;
236562306a36Sopenharmony_ci	preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
236662306a36Sopenharmony_ci
236762306a36Sopenharmony_ci	nr_allocated  = __alloc_pages_bulk(preferred_gfp, nid, &pol->nodes,
236862306a36Sopenharmony_ci					   nr_pages, NULL, page_array);
236962306a36Sopenharmony_ci
237062306a36Sopenharmony_ci	if (nr_allocated < nr_pages)
237162306a36Sopenharmony_ci		nr_allocated += __alloc_pages_bulk(gfp, numa_node_id(), NULL,
237262306a36Sopenharmony_ci				nr_pages - nr_allocated, NULL,
237362306a36Sopenharmony_ci				page_array + nr_allocated);
237462306a36Sopenharmony_ci	return nr_allocated;
237562306a36Sopenharmony_ci}
237662306a36Sopenharmony_ci
237762306a36Sopenharmony_ci/* alloc pages bulk and mempolicy should be considered at the
237862306a36Sopenharmony_ci * same time in some situation such as vmalloc.
237962306a36Sopenharmony_ci *
238062306a36Sopenharmony_ci * It can accelerate memory allocation especially interleaving
238162306a36Sopenharmony_ci * allocate memory.
238262306a36Sopenharmony_ci */
238362306a36Sopenharmony_ciunsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp,
238462306a36Sopenharmony_ci		unsigned long nr_pages, struct page **page_array)
238562306a36Sopenharmony_ci{
238662306a36Sopenharmony_ci	struct mempolicy *pol = &default_policy;
238762306a36Sopenharmony_ci
	/* Same policy-selection rule as alloc_pages() */
238862306a36Sopenharmony_ci	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
238962306a36Sopenharmony_ci		pol = get_task_policy(current);
239062306a36Sopenharmony_ci
239162306a36Sopenharmony_ci	if (pol->mode == MPOL_INTERLEAVE)
239262306a36Sopenharmony_ci		return alloc_pages_bulk_array_interleave(gfp, pol,
239362306a36Sopenharmony_ci							 nr_pages, page_array);
239462306a36Sopenharmony_ci
239562306a36Sopenharmony_ci	if (pol->mode == MPOL_PREFERRED_MANY)
239662306a36Sopenharmony_ci		return alloc_pages_bulk_array_preferred_many(gfp,
239762306a36Sopenharmony_ci				numa_node_id(), pol, nr_pages, page_array);
239862306a36Sopenharmony_ci
239962306a36Sopenharmony_ci	return __alloc_pages_bulk(gfp, policy_node(gfp, pol, numa_node_id()),
240062306a36Sopenharmony_ci				  policy_nodemask(gfp, pol), nr_pages, NULL,
240162306a36Sopenharmony_ci				  page_array);
240262306a36Sopenharmony_ci}
240362306a36Sopenharmony_ci
/*
 * Copy @src's VMA policy to @dst (used on VMA split/fork).  Returns 0 on
 * success or a negative errno if duplicating the policy failed.
 */
240462306a36Sopenharmony_ciint vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
240562306a36Sopenharmony_ci{
240662306a36Sopenharmony_ci	struct mempolicy *pol = mpol_dup(vma_policy(src));
240762306a36Sopenharmony_ci
240862306a36Sopenharmony_ci	if (IS_ERR(pol))
240962306a36Sopenharmony_ci		return PTR_ERR(pol);
241062306a36Sopenharmony_ci	dst->vm_policy = pol;
241162306a36Sopenharmony_ci	return 0;
241262306a36Sopenharmony_ci}
241362306a36Sopenharmony_ci
241462306a36Sopenharmony_ci/*
241562306a36Sopenharmony_ci * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
241662306a36Sopenharmony_ci * rebinds the mempolicy its copying by calling mpol_rebind_policy()
241762306a36Sopenharmony_ci * with the mems_allowed returned by cpuset_mems_allowed().  This
241862306a36Sopenharmony_ci * keeps mempolicies cpuset relative after its cpuset moves.  See
241962306a36Sopenharmony_ci * further kernel/cpuset.c update_nodemask().
242062306a36Sopenharmony_ci *
242162306a36Sopenharmony_ci * current's mempolicy may be rebinded by the other task(the task that changes
242262306a36Sopenharmony_ci * cpuset's mems), so we needn't do rebind work for current task.
242362306a36Sopenharmony_ci */
242462306a36Sopenharmony_ci
242562306a36Sopenharmony_ci/* Slow path of a mempolicy duplicate */
242662306a36Sopenharmony_cistruct mempolicy *__mpol_dup(struct mempolicy *old)
242762306a36Sopenharmony_ci{
242862306a36Sopenharmony_ci	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
242962306a36Sopenharmony_ci
243062306a36Sopenharmony_ci	if (!new)
243162306a36Sopenharmony_ci		return ERR_PTR(-ENOMEM);
243262306a36Sopenharmony_ci
243362306a36Sopenharmony_ci	/* task's mempolicy is protected by alloc_lock */
243462306a36Sopenharmony_ci	if (old == current->mempolicy) {
243562306a36Sopenharmony_ci		task_lock(current);
243662306a36Sopenharmony_ci		*new = *old;
243762306a36Sopenharmony_ci		task_unlock(current);
243862306a36Sopenharmony_ci	} else
243962306a36Sopenharmony_ci		*new = *old;
244062306a36Sopenharmony_ci
	/* See comment above: rebind the copy if a cpuset move is in progress */
244162306a36Sopenharmony_ci	if (current_cpuset_is_being_rebound()) {
244262306a36Sopenharmony_ci		nodemask_t mems = cpuset_mems_allowed(current);
244362306a36Sopenharmony_ci		mpol_rebind_policy(new, &mems);
244462306a36Sopenharmony_ci	}
244562306a36Sopenharmony_ci	atomic_set(&new->refcnt, 1);
244662306a36Sopenharmony_ci	return new;
244762306a36Sopenharmony_ci}
244862306a36Sopenharmony_ci
244962306a36Sopenharmony_ci/* Slow path of a mempolicy comparison */
245062306a36Sopenharmony_cibool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
245162306a36Sopenharmony_ci{
245262306a36Sopenharmony_ci	if (!a || !b)
245362306a36Sopenharmony_ci		return false;
245462306a36Sopenharmony_ci	if (a->mode != b->mode)
245562306a36Sopenharmony_ci		return false;
245662306a36Sopenharmony_ci	if (a->flags != b->flags)
245762306a36Sopenharmony_ci		return false;
245862306a36Sopenharmony_ci	if (a->home_node != b->home_node)
245962306a36Sopenharmony_ci		return false;
246062306a36Sopenharmony_ci	if (mpol_store_user_nodemask(a))
246162306a36Sopenharmony_ci		if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
246262306a36Sopenharmony_ci			return false;
246362306a36Sopenharmony_ci
246462306a36Sopenharmony_ci	switch (a->mode) {
246562306a36Sopenharmony_ci	case MPOL_BIND:
246662306a36Sopenharmony_ci	case MPOL_INTERLEAVE:
246762306a36Sopenharmony_ci	case MPOL_PREFERRED:
246862306a36Sopenharmony_ci	case MPOL_PREFERRED_MANY:
246962306a36Sopenharmony_ci		return !!nodes_equal(a->nodes, b->nodes);
247062306a36Sopenharmony_ci	case MPOL_LOCAL:
247162306a36Sopenharmony_ci		return true;
247262306a36Sopenharmony_ci	default:
247362306a36Sopenharmony_ci		BUG();
247462306a36Sopenharmony_ci		return false;
247562306a36Sopenharmony_ci	}
247662306a36Sopenharmony_ci}
247762306a36Sopenharmony_ci
247862306a36Sopenharmony_ci/*
247962306a36Sopenharmony_ci * Shared memory backing store policy support.
248062306a36Sopenharmony_ci *
248162306a36Sopenharmony_ci * Remember policies even when nobody has shared memory mapped.
248262306a36Sopenharmony_ci * The policies are kept in Red-Black tree linked from the inode.
248362306a36Sopenharmony_ci * They are protected by the sp->lock rwlock, which should be held
248462306a36Sopenharmony_ci * for any accesses to the tree.
248562306a36Sopenharmony_ci */
248662306a36Sopenharmony_ci
/*
 * lookup first element intersecting start-end.  Caller holds sp->lock for
 * reading or for writing.  Returns NULL if nothing intersects [start, end).
 */
static struct sp_node *
sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
{
	struct rb_node *n = sp->root.rb_node;

	/* Standard rb-tree descent: stop at ANY node overlapping [start, end) */
	while (n) {
		struct sp_node *p = rb_entry(n, struct sp_node, nd);

		if (start >= p->end)
			n = n->rb_right;
		else if (end <= p->start)
			n = n->rb_left;
		else
			break;
	}
	if (!n)
		return NULL;
	/*
	 * The node found above need not be the leftmost overlap; walk
	 * backwards while the predecessor still intersects [start, end)
	 * so the FIRST intersecting element is returned.
	 */
	for (;;) {
		struct sp_node *w = NULL;
		struct rb_node *prev = rb_prev(n);
		if (!prev)
			break;
		w = rb_entry(prev, struct sp_node, nd);
		if (w->end <= start)
			break;
		n = prev;
	}
	return rb_entry(n, struct sp_node, nd);
}
252062306a36Sopenharmony_ci
252162306a36Sopenharmony_ci/*
252262306a36Sopenharmony_ci * Insert a new shared policy into the list.  Caller holds sp->lock for
252362306a36Sopenharmony_ci * writing.
252462306a36Sopenharmony_ci */
252562306a36Sopenharmony_cistatic void sp_insert(struct shared_policy *sp, struct sp_node *new)
252662306a36Sopenharmony_ci{
252762306a36Sopenharmony_ci	struct rb_node **p = &sp->root.rb_node;
252862306a36Sopenharmony_ci	struct rb_node *parent = NULL;
252962306a36Sopenharmony_ci	struct sp_node *nd;
253062306a36Sopenharmony_ci
253162306a36Sopenharmony_ci	while (*p) {
253262306a36Sopenharmony_ci		parent = *p;
253362306a36Sopenharmony_ci		nd = rb_entry(parent, struct sp_node, nd);
253462306a36Sopenharmony_ci		if (new->start < nd->start)
253562306a36Sopenharmony_ci			p = &(*p)->rb_left;
253662306a36Sopenharmony_ci		else if (new->end > nd->end)
253762306a36Sopenharmony_ci			p = &(*p)->rb_right;
253862306a36Sopenharmony_ci		else
253962306a36Sopenharmony_ci			BUG();
254062306a36Sopenharmony_ci	}
254162306a36Sopenharmony_ci	rb_link_node(&new->nd, parent, p);
254262306a36Sopenharmony_ci	rb_insert_color(&new->nd, &sp->root);
254362306a36Sopenharmony_ci	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
254462306a36Sopenharmony_ci		 new->policy ? new->policy->mode : 0);
254562306a36Sopenharmony_ci}
254662306a36Sopenharmony_ci
254762306a36Sopenharmony_ci/* Find shared policy intersecting idx */
254862306a36Sopenharmony_cistruct mempolicy *
254962306a36Sopenharmony_cimpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
255062306a36Sopenharmony_ci{
255162306a36Sopenharmony_ci	struct mempolicy *pol = NULL;
255262306a36Sopenharmony_ci	struct sp_node *sn;
255362306a36Sopenharmony_ci
255462306a36Sopenharmony_ci	if (!sp->root.rb_node)
255562306a36Sopenharmony_ci		return NULL;
255662306a36Sopenharmony_ci	read_lock(&sp->lock);
255762306a36Sopenharmony_ci	sn = sp_lookup(sp, idx, idx+1);
255862306a36Sopenharmony_ci	if (sn) {
255962306a36Sopenharmony_ci		mpol_get(sn->policy);
256062306a36Sopenharmony_ci		pol = sn->policy;
256162306a36Sopenharmony_ci	}
256262306a36Sopenharmony_ci	read_unlock(&sp->lock);
256362306a36Sopenharmony_ci	return pol;
256462306a36Sopenharmony_ci}
256562306a36Sopenharmony_ci
/* Drop the node's reference on its mempolicy, then free the node itself. */
static void sp_free(struct sp_node *n)
{
	mpol_put(n->policy);
	kmem_cache_free(sn_cache, n);
}
257162306a36Sopenharmony_ci
257262306a36Sopenharmony_ci/**
257362306a36Sopenharmony_ci * mpol_misplaced - check whether current page node is valid in policy
257462306a36Sopenharmony_ci *
257562306a36Sopenharmony_ci * @page: page to be checked
257662306a36Sopenharmony_ci * @vma: vm area where page mapped
257762306a36Sopenharmony_ci * @addr: virtual address where page mapped
257862306a36Sopenharmony_ci *
257962306a36Sopenharmony_ci * Lookup current policy node id for vma,addr and "compare to" page's
258062306a36Sopenharmony_ci * node id.  Policy determination "mimics" alloc_page_vma().
258162306a36Sopenharmony_ci * Called from fault path where we know the vma and faulting address.
258262306a36Sopenharmony_ci *
258362306a36Sopenharmony_ci * Return: NUMA_NO_NODE if the page is in a node that is valid for this
258462306a36Sopenharmony_ci * policy, or a suitable node ID to allocate a replacement page from.
258562306a36Sopenharmony_ci */
int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol;
	struct zoneref *z;
	int curnid = page_to_nid(page);
	unsigned long pgoff;
	int thiscpu = raw_smp_processor_id();
	int thisnid = cpu_to_node(thiscpu);
	int polnid = NUMA_NO_NODE;
	int ret = NUMA_NO_NODE;

	pol = get_vma_policy(vma, addr);
	/* Only policies flagged "migrate on fault" are considered at all */
	if (!(pol->flags & MPOL_F_MOF))
		goto out;

	switch (pol->mode) {
	case MPOL_INTERLEAVE:
		/* Target node is derived from the page's offset in the mapping */
		pgoff = vma->vm_pgoff;
		pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
		polnid = offset_il_node(pol, pgoff);
		break;

	case MPOL_PREFERRED:
		/* Already on the preferred node: not misplaced */
		if (node_isset(curnid, pol->nodes))
			goto out;
		polnid = first_node(pol->nodes);
		break;

	case MPOL_LOCAL:
		polnid = numa_node_id();
		break;

	case MPOL_BIND:
		/* Optimize placement among multiple nodes via NUMA balancing */
		if (pol->flags & MPOL_F_MORON) {
			/* fall into the MORON migration check below */
			if (node_isset(thisnid, pol->nodes))
				break;
			goto out;
		}
		fallthrough;

	case MPOL_PREFERRED_MANY:
		/*
		 * use current page if in policy nodemask,
		 * else select nearest allowed node, if any.
		 * If no allowed nodes, use current [!misplaced].
		 */
		if (node_isset(curnid, pol->nodes))
			goto out;
		z = first_zones_zonelist(
				node_zonelist(numa_node_id(), GFP_HIGHUSER),
				gfp_zone(GFP_HIGHUSER),
				&pol->nodes);
		polnid = zone_to_nid(z->zone);
		break;

	default:
		BUG();
	}

	/* Migrate the page towards the node whose CPU is referencing it */
	if (pol->flags & MPOL_F_MORON) {
		polnid = thisnid;

		if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
			goto out;
	}

	/* Misplaced only when the page's node differs from the policy node */
	if (curnid != polnid)
		ret = polnid;
out:
	mpol_cond_put(pol);

	return ret;
}
266162306a36Sopenharmony_ci
266262306a36Sopenharmony_ci/*
266362306a36Sopenharmony_ci * Drop the (possibly final) reference to task->mempolicy.  It needs to be
266462306a36Sopenharmony_ci * dropped after task->mempolicy is set to NULL so that any allocation done as
266562306a36Sopenharmony_ci * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
266662306a36Sopenharmony_ci * policy.
266762306a36Sopenharmony_ci */
void mpol_put_task_policy(struct task_struct *task)
{
	struct mempolicy *pol;

	/* Detach the policy from the task under its alloc lock */
	task_lock(task);
	pol = task->mempolicy;
	task->mempolicy = NULL;
	task_unlock(task);
	/* Drop the reference only after task->mempolicy is NULL (see above) */
	mpol_put(pol);
}
267862306a36Sopenharmony_ci
267962306a36Sopenharmony_cistatic void sp_delete(struct shared_policy *sp, struct sp_node *n)
268062306a36Sopenharmony_ci{
268162306a36Sopenharmony_ci	pr_debug("deleting %lx-l%lx\n", n->start, n->end);
268262306a36Sopenharmony_ci	rb_erase(&n->nd, &sp->root);
268362306a36Sopenharmony_ci	sp_free(n);
268462306a36Sopenharmony_ci}
268562306a36Sopenharmony_ci
/*
 * Initialize an already-allocated sp_node for range [start, end) with @pol.
 * The node's reference on @pol is later dropped by sp_free().
 */
static void sp_node_init(struct sp_node *node, unsigned long start,
			unsigned long end, struct mempolicy *pol)
{
	node->start = start;
	node->end = end;
	node->policy = pol;
}
269362306a36Sopenharmony_ci
269462306a36Sopenharmony_cistatic struct sp_node *sp_alloc(unsigned long start, unsigned long end,
269562306a36Sopenharmony_ci				struct mempolicy *pol)
269662306a36Sopenharmony_ci{
269762306a36Sopenharmony_ci	struct sp_node *n;
269862306a36Sopenharmony_ci	struct mempolicy *newpol;
269962306a36Sopenharmony_ci
270062306a36Sopenharmony_ci	n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
270162306a36Sopenharmony_ci	if (!n)
270262306a36Sopenharmony_ci		return NULL;
270362306a36Sopenharmony_ci
270462306a36Sopenharmony_ci	newpol = mpol_dup(pol);
270562306a36Sopenharmony_ci	if (IS_ERR(newpol)) {
270662306a36Sopenharmony_ci		kmem_cache_free(sn_cache, n);
270762306a36Sopenharmony_ci		return NULL;
270862306a36Sopenharmony_ci	}
270962306a36Sopenharmony_ci	newpol->flags |= MPOL_F_SHARED;
271062306a36Sopenharmony_ci	sp_node_init(n, start, end, newpol);
271162306a36Sopenharmony_ci
271262306a36Sopenharmony_ci	return n;
271362306a36Sopenharmony_ci}
271462306a36Sopenharmony_ci
/*
 * Replace a policy range.  Installs @new (may be NULL, meaning "remove")
 * over [start, end), trimming/splitting any overlapping old nodes.
 * Returns 0 on success or -ENOMEM if a required split allocation failed.
 */
static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
				 unsigned long end, struct sp_node *new)
{
	struct sp_node *n;
	struct sp_node *n_new = NULL;
	struct mempolicy *mpol_new = NULL;
	int ret = 0;

restart:
	write_lock(&sp->lock);
	n = sp_lookup(sp, start, end);
	/* Take care of old policies in the same range. */
	while (n && n->start < end) {
		struct rb_node *next = rb_next(&n->nd);
		if (n->start >= start) {
			if (n->end <= end)
				sp_delete(sp, n);	/* fully covered: remove */
			else
				n->start = end;		/* trim the front */
		} else {
			/* Old policy spanning whole new range. */
			if (n->end > end) {
				/*
				 * Must split the old node in two.  The second
				 * half needs a pre-allocated node + policy,
				 * which cannot be allocated under sp->lock;
				 * drop the lock, allocate, and restart.
				 */
				if (!n_new)
					goto alloc_new;

				*mpol_new = *n->policy;
				atomic_set(&mpol_new->refcnt, 1);
				sp_node_init(n_new, end, n->end, mpol_new);
				n->end = start;
				sp_insert(sp, n_new);
				n_new = NULL;
				mpol_new = NULL;
				break;
			} else
				n->end = start;		/* trim the tail */
		}
		if (!next)
			break;
		n = rb_entry(next, struct sp_node, nd);
	}
	if (new)
		sp_insert(sp, new);
	write_unlock(&sp->lock);
	ret = 0;

err_out:
	/* Free any pre-allocated split objects that went unused */
	if (mpol_new)
		mpol_put(mpol_new);
	if (n_new)
		kmem_cache_free(sn_cache, n_new);

	return ret;

alloc_new:
	/* Allocate the split node/policy outside the lock, then retry */
	write_unlock(&sp->lock);
	ret = -ENOMEM;
	n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
	if (!n_new)
		goto err_out;
	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!mpol_new)
		goto err_out;
	atomic_set(&mpol_new->refcnt, 1);
	goto restart;
}
278162306a36Sopenharmony_ci
278262306a36Sopenharmony_ci/**
278362306a36Sopenharmony_ci * mpol_shared_policy_init - initialize shared policy for inode
278462306a36Sopenharmony_ci * @sp: pointer to inode shared policy
278562306a36Sopenharmony_ci * @mpol:  struct mempolicy to install
278662306a36Sopenharmony_ci *
278762306a36Sopenharmony_ci * Install non-NULL @mpol in inode's shared policy rb-tree.
278862306a36Sopenharmony_ci * On entry, the current task has a reference on a non-NULL @mpol.
278962306a36Sopenharmony_ci * This must be released on exit.
279062306a36Sopenharmony_ci * This is called at get_inode() calls and we can use GFP_KERNEL.
279162306a36Sopenharmony_ci */
void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
{
	int ret;

	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
	rwlock_init(&sp->lock);

	if (mpol) {
		struct vm_area_struct pvma;
		struct mempolicy *new;
		NODEMASK_SCRATCH(scratch);

		/* Scratch nodemask allocation failed; just drop the mpol ref */
		if (!scratch)
			goto put_mpol;
		/* contextualize the tmpfs mount point mempolicy */
		new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
		if (IS_ERR(new))
			goto free_scratch; /* no valid nodemask intersection */

		/* alloc_lock protects current's mems while they are read */
		task_lock(current);
		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
		task_unlock(current);
		if (ret)
			goto put_new;

		/* Create pseudo-vma that contains just the policy */
		vma_init(&pvma, NULL);
		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */

put_new:
		mpol_put(new);			/* drop initial ref */
free_scratch:
		NODEMASK_SCRATCH_FREE(scratch);
put_mpol:
		mpol_put(mpol);	/* drop our incoming ref on sb mpol */
	}
}
283062306a36Sopenharmony_ci
283162306a36Sopenharmony_ciint mpol_set_shared_policy(struct shared_policy *info,
283262306a36Sopenharmony_ci			struct vm_area_struct *vma, struct mempolicy *npol)
283362306a36Sopenharmony_ci{
283462306a36Sopenharmony_ci	int err;
283562306a36Sopenharmony_ci	struct sp_node *new = NULL;
283662306a36Sopenharmony_ci	unsigned long sz = vma_pages(vma);
283762306a36Sopenharmony_ci
283862306a36Sopenharmony_ci	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
283962306a36Sopenharmony_ci		 vma->vm_pgoff,
284062306a36Sopenharmony_ci		 sz, npol ? npol->mode : -1,
284162306a36Sopenharmony_ci		 npol ? npol->flags : -1,
284262306a36Sopenharmony_ci		 npol ? nodes_addr(npol->nodes)[0] : NUMA_NO_NODE);
284362306a36Sopenharmony_ci
284462306a36Sopenharmony_ci	if (npol) {
284562306a36Sopenharmony_ci		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
284662306a36Sopenharmony_ci		if (!new)
284762306a36Sopenharmony_ci			return -ENOMEM;
284862306a36Sopenharmony_ci	}
284962306a36Sopenharmony_ci	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
285062306a36Sopenharmony_ci	if (err && new)
285162306a36Sopenharmony_ci		sp_free(new);
285262306a36Sopenharmony_ci	return err;
285362306a36Sopenharmony_ci}
285462306a36Sopenharmony_ci
285562306a36Sopenharmony_ci/* Free a backing policy store on inode delete. */
285662306a36Sopenharmony_civoid mpol_free_shared_policy(struct shared_policy *p)
285762306a36Sopenharmony_ci{
285862306a36Sopenharmony_ci	struct sp_node *n;
285962306a36Sopenharmony_ci	struct rb_node *next;
286062306a36Sopenharmony_ci
286162306a36Sopenharmony_ci	if (!p->root.rb_node)
286262306a36Sopenharmony_ci		return;
286362306a36Sopenharmony_ci	write_lock(&p->lock);
286462306a36Sopenharmony_ci	next = rb_first(&p->root);
286562306a36Sopenharmony_ci	while (next) {
286662306a36Sopenharmony_ci		n = rb_entry(next, struct sp_node, nd);
286762306a36Sopenharmony_ci		next = rb_next(&n->nd);
286862306a36Sopenharmony_ci		sp_delete(p, n);
286962306a36Sopenharmony_ci	}
287062306a36Sopenharmony_ci	write_unlock(&p->lock);
287162306a36Sopenharmony_ci}
287262306a36Sopenharmony_ci
287362306a36Sopenharmony_ci#ifdef CONFIG_NUMA_BALANCING
287462306a36Sopenharmony_cistatic int __initdata numabalancing_override;
287562306a36Sopenharmony_ci
287662306a36Sopenharmony_cistatic void __init check_numabalancing_enable(void)
287762306a36Sopenharmony_ci{
287862306a36Sopenharmony_ci	bool numabalancing_default = false;
287962306a36Sopenharmony_ci
288062306a36Sopenharmony_ci	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
288162306a36Sopenharmony_ci		numabalancing_default = true;
288262306a36Sopenharmony_ci
288362306a36Sopenharmony_ci	/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
288462306a36Sopenharmony_ci	if (numabalancing_override)
288562306a36Sopenharmony_ci		set_numabalancing_state(numabalancing_override == 1);
288662306a36Sopenharmony_ci
288762306a36Sopenharmony_ci	if (num_online_nodes() > 1 && !numabalancing_override) {
288862306a36Sopenharmony_ci		pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
288962306a36Sopenharmony_ci			numabalancing_default ? "Enabling" : "Disabling");
289062306a36Sopenharmony_ci		set_numabalancing_state(numabalancing_default);
289162306a36Sopenharmony_ci	}
289262306a36Sopenharmony_ci}
289362306a36Sopenharmony_ci
289462306a36Sopenharmony_cistatic int __init setup_numabalancing(char *str)
289562306a36Sopenharmony_ci{
289662306a36Sopenharmony_ci	int ret = 0;
289762306a36Sopenharmony_ci	if (!str)
289862306a36Sopenharmony_ci		goto out;
289962306a36Sopenharmony_ci
290062306a36Sopenharmony_ci	if (!strcmp(str, "enable")) {
290162306a36Sopenharmony_ci		numabalancing_override = 1;
290262306a36Sopenharmony_ci		ret = 1;
290362306a36Sopenharmony_ci	} else if (!strcmp(str, "disable")) {
290462306a36Sopenharmony_ci		numabalancing_override = -1;
290562306a36Sopenharmony_ci		ret = 1;
290662306a36Sopenharmony_ci	}
290762306a36Sopenharmony_ciout:
290862306a36Sopenharmony_ci	if (!ret)
290962306a36Sopenharmony_ci		pr_warn("Unable to parse numa_balancing=\n");
291062306a36Sopenharmony_ci
291162306a36Sopenharmony_ci	return ret;
291262306a36Sopenharmony_ci}
291362306a36Sopenharmony_ci__setup("numa_balancing=", setup_numabalancing);
291462306a36Sopenharmony_ci#else
/* !CONFIG_NUMA_BALANCING: nothing to enable at boot */
static inline void __init check_numabalancing_enable(void)
{
}
291862306a36Sopenharmony_ci#endif /* CONFIG_NUMA_BALANCING */
291962306a36Sopenharmony_ci
292062306a36Sopenharmony_ci/* assumes fs == KERNEL_DS */
292162306a36Sopenharmony_civoid __init numa_policy_init(void)
292262306a36Sopenharmony_ci{
292362306a36Sopenharmony_ci	nodemask_t interleave_nodes;
292462306a36Sopenharmony_ci	unsigned long largest = 0;
292562306a36Sopenharmony_ci	int nid, prefer = 0;
292662306a36Sopenharmony_ci
292762306a36Sopenharmony_ci	policy_cache = kmem_cache_create("numa_policy",
292862306a36Sopenharmony_ci					 sizeof(struct mempolicy),
292962306a36Sopenharmony_ci					 0, SLAB_PANIC, NULL);
293062306a36Sopenharmony_ci
293162306a36Sopenharmony_ci	sn_cache = kmem_cache_create("shared_policy_node",
293262306a36Sopenharmony_ci				     sizeof(struct sp_node),
293362306a36Sopenharmony_ci				     0, SLAB_PANIC, NULL);
293462306a36Sopenharmony_ci
293562306a36Sopenharmony_ci	for_each_node(nid) {
293662306a36Sopenharmony_ci		preferred_node_policy[nid] = (struct mempolicy) {
293762306a36Sopenharmony_ci			.refcnt = ATOMIC_INIT(1),
293862306a36Sopenharmony_ci			.mode = MPOL_PREFERRED,
293962306a36Sopenharmony_ci			.flags = MPOL_F_MOF | MPOL_F_MORON,
294062306a36Sopenharmony_ci			.nodes = nodemask_of_node(nid),
294162306a36Sopenharmony_ci		};
294262306a36Sopenharmony_ci	}
294362306a36Sopenharmony_ci
294462306a36Sopenharmony_ci	/*
294562306a36Sopenharmony_ci	 * Set interleaving policy for system init. Interleaving is only
294662306a36Sopenharmony_ci	 * enabled across suitably sized nodes (default is >= 16MB), or
294762306a36Sopenharmony_ci	 * fall back to the largest node if they're all smaller.
294862306a36Sopenharmony_ci	 */
294962306a36Sopenharmony_ci	nodes_clear(interleave_nodes);
295062306a36Sopenharmony_ci	for_each_node_state(nid, N_MEMORY) {
295162306a36Sopenharmony_ci		unsigned long total_pages = node_present_pages(nid);
295262306a36Sopenharmony_ci
295362306a36Sopenharmony_ci		/* Preserve the largest node */
295462306a36Sopenharmony_ci		if (largest < total_pages) {
295562306a36Sopenharmony_ci			largest = total_pages;
295662306a36Sopenharmony_ci			prefer = nid;
295762306a36Sopenharmony_ci		}
295862306a36Sopenharmony_ci
295962306a36Sopenharmony_ci		/* Interleave this node? */
296062306a36Sopenharmony_ci		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
296162306a36Sopenharmony_ci			node_set(nid, interleave_nodes);
296262306a36Sopenharmony_ci	}
296362306a36Sopenharmony_ci
296462306a36Sopenharmony_ci	/* All too small, use the largest */
296562306a36Sopenharmony_ci	if (unlikely(nodes_empty(interleave_nodes)))
296662306a36Sopenharmony_ci		node_set(prefer, interleave_nodes);
296762306a36Sopenharmony_ci
296862306a36Sopenharmony_ci	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
296962306a36Sopenharmony_ci		pr_err("%s: interleaving failed\n", __func__);
297062306a36Sopenharmony_ci
297162306a36Sopenharmony_ci	check_numabalancing_enable();
297262306a36Sopenharmony_ci}
297362306a36Sopenharmony_ci
/* Reset policy of current process to default (MPOL_DEFAULT, no nodemask) */
void numa_default_policy(void)
{
	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
}
297962306a36Sopenharmony_ci
298062306a36Sopenharmony_ci/*
298162306a36Sopenharmony_ci * Parse and format mempolicy from/to strings
298262306a36Sopenharmony_ci */
298362306a36Sopenharmony_ci
/*
 * Human-readable names of the MPOL_* modes, indexed by mode value.
 * Consumed by mpol_parse_str() via match_string() when parsing the
 * tmpfs "mpol=" mount option.
 */
static const char * const policy_modes[] =
{
	[MPOL_DEFAULT]    = "default",
	[MPOL_PREFERRED]  = "prefer",
	[MPOL_BIND]       = "bind",
	[MPOL_INTERLEAVE] = "interleave",
	[MPOL_LOCAL]      = "local",
	[MPOL_PREFERRED_MANY]  = "prefer (many)",
};
299362306a36Sopenharmony_ci
299462306a36Sopenharmony_ci
299562306a36Sopenharmony_ci#ifdef CONFIG_TMPFS
299662306a36Sopenharmony_ci/**
299762306a36Sopenharmony_ci * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
299862306a36Sopenharmony_ci * @str:  string containing mempolicy to parse
299962306a36Sopenharmony_ci * @mpol:  pointer to struct mempolicy pointer, returned on success.
300062306a36Sopenharmony_ci *
300162306a36Sopenharmony_ci * Format of input:
300262306a36Sopenharmony_ci *	<mode>[=<flags>][:<nodelist>]
300362306a36Sopenharmony_ci *
300462306a36Sopenharmony_ci * Return: %0 on success, else %1
300562306a36Sopenharmony_ci */
300662306a36Sopenharmony_ciint mpol_parse_str(char *str, struct mempolicy **mpol)
300762306a36Sopenharmony_ci{
300862306a36Sopenharmony_ci	struct mempolicy *new = NULL;
300962306a36Sopenharmony_ci	unsigned short mode_flags;
301062306a36Sopenharmony_ci	nodemask_t nodes;
301162306a36Sopenharmony_ci	char *nodelist = strchr(str, ':');
301262306a36Sopenharmony_ci	char *flags = strchr(str, '=');
301362306a36Sopenharmony_ci	int err = 1, mode;
301462306a36Sopenharmony_ci
301562306a36Sopenharmony_ci	if (flags)
301662306a36Sopenharmony_ci		*flags++ = '\0';	/* terminate mode string */
301762306a36Sopenharmony_ci
301862306a36Sopenharmony_ci	if (nodelist) {
301962306a36Sopenharmony_ci		/* NUL-terminate mode or flags string */
302062306a36Sopenharmony_ci		*nodelist++ = '\0';
302162306a36Sopenharmony_ci		if (nodelist_parse(nodelist, nodes))
302262306a36Sopenharmony_ci			goto out;
302362306a36Sopenharmony_ci		if (!nodes_subset(nodes, node_states[N_MEMORY]))
302462306a36Sopenharmony_ci			goto out;
302562306a36Sopenharmony_ci	} else
302662306a36Sopenharmony_ci		nodes_clear(nodes);
302762306a36Sopenharmony_ci
302862306a36Sopenharmony_ci	mode = match_string(policy_modes, MPOL_MAX, str);
302962306a36Sopenharmony_ci	if (mode < 0)
303062306a36Sopenharmony_ci		goto out;
303162306a36Sopenharmony_ci
303262306a36Sopenharmony_ci	switch (mode) {
303362306a36Sopenharmony_ci	case MPOL_PREFERRED:
303462306a36Sopenharmony_ci		/*
303562306a36Sopenharmony_ci		 * Insist on a nodelist of one node only, although later
303662306a36Sopenharmony_ci		 * we use first_node(nodes) to grab a single node, so here
303762306a36Sopenharmony_ci		 * nodelist (or nodes) cannot be empty.
303862306a36Sopenharmony_ci		 */
303962306a36Sopenharmony_ci		if (nodelist) {
304062306a36Sopenharmony_ci			char *rest = nodelist;
304162306a36Sopenharmony_ci			while (isdigit(*rest))
304262306a36Sopenharmony_ci				rest++;
304362306a36Sopenharmony_ci			if (*rest)
304462306a36Sopenharmony_ci				goto out;
304562306a36Sopenharmony_ci			if (nodes_empty(nodes))
304662306a36Sopenharmony_ci				goto out;
304762306a36Sopenharmony_ci		}
304862306a36Sopenharmony_ci		break;
304962306a36Sopenharmony_ci	case MPOL_INTERLEAVE:
305062306a36Sopenharmony_ci		/*
305162306a36Sopenharmony_ci		 * Default to online nodes with memory if no nodelist
305262306a36Sopenharmony_ci		 */
305362306a36Sopenharmony_ci		if (!nodelist)
305462306a36Sopenharmony_ci			nodes = node_states[N_MEMORY];
305562306a36Sopenharmony_ci		break;
305662306a36Sopenharmony_ci	case MPOL_LOCAL:
305762306a36Sopenharmony_ci		/*
305862306a36Sopenharmony_ci		 * Don't allow a nodelist;  mpol_new() checks flags
305962306a36Sopenharmony_ci		 */
306062306a36Sopenharmony_ci		if (nodelist)
306162306a36Sopenharmony_ci			goto out;
306262306a36Sopenharmony_ci		break;
306362306a36Sopenharmony_ci	case MPOL_DEFAULT:
306462306a36Sopenharmony_ci		/*
306562306a36Sopenharmony_ci		 * Insist on a empty nodelist
306662306a36Sopenharmony_ci		 */
306762306a36Sopenharmony_ci		if (!nodelist)
306862306a36Sopenharmony_ci			err = 0;
306962306a36Sopenharmony_ci		goto out;
307062306a36Sopenharmony_ci	case MPOL_PREFERRED_MANY:
307162306a36Sopenharmony_ci	case MPOL_BIND:
307262306a36Sopenharmony_ci		/*
307362306a36Sopenharmony_ci		 * Insist on a nodelist
307462306a36Sopenharmony_ci		 */
307562306a36Sopenharmony_ci		if (!nodelist)
307662306a36Sopenharmony_ci			goto out;
307762306a36Sopenharmony_ci	}
307862306a36Sopenharmony_ci
307962306a36Sopenharmony_ci	mode_flags = 0;
308062306a36Sopenharmony_ci	if (flags) {
308162306a36Sopenharmony_ci		/*
308262306a36Sopenharmony_ci		 * Currently, we only support two mutually exclusive
308362306a36Sopenharmony_ci		 * mode flags.
308462306a36Sopenharmony_ci		 */
308562306a36Sopenharmony_ci		if (!strcmp(flags, "static"))
308662306a36Sopenharmony_ci			mode_flags |= MPOL_F_STATIC_NODES;
308762306a36Sopenharmony_ci		else if (!strcmp(flags, "relative"))
308862306a36Sopenharmony_ci			mode_flags |= MPOL_F_RELATIVE_NODES;
308962306a36Sopenharmony_ci		else
309062306a36Sopenharmony_ci			goto out;
309162306a36Sopenharmony_ci	}
309262306a36Sopenharmony_ci
309362306a36Sopenharmony_ci	new = mpol_new(mode, mode_flags, &nodes);
309462306a36Sopenharmony_ci	if (IS_ERR(new))
309562306a36Sopenharmony_ci		goto out;
309662306a36Sopenharmony_ci
309762306a36Sopenharmony_ci	/*
309862306a36Sopenharmony_ci	 * Save nodes for mpol_to_str() to show the tmpfs mount options
309962306a36Sopenharmony_ci	 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
310062306a36Sopenharmony_ci	 */
310162306a36Sopenharmony_ci	if (mode != MPOL_PREFERRED) {
310262306a36Sopenharmony_ci		new->nodes = nodes;
310362306a36Sopenharmony_ci	} else if (nodelist) {
310462306a36Sopenharmony_ci		nodes_clear(new->nodes);
310562306a36Sopenharmony_ci		node_set(first_node(nodes), new->nodes);
310662306a36Sopenharmony_ci	} else {
310762306a36Sopenharmony_ci		new->mode = MPOL_LOCAL;
310862306a36Sopenharmony_ci	}
310962306a36Sopenharmony_ci
311062306a36Sopenharmony_ci	/*
311162306a36Sopenharmony_ci	 * Save nodes for contextualization: this will be used to "clone"
311262306a36Sopenharmony_ci	 * the mempolicy in a specific context [cpuset] at a later time.
311362306a36Sopenharmony_ci	 */
311462306a36Sopenharmony_ci	new->w.user_nodemask = nodes;
311562306a36Sopenharmony_ci
311662306a36Sopenharmony_ci	err = 0;
311762306a36Sopenharmony_ci
311862306a36Sopenharmony_ciout:
311962306a36Sopenharmony_ci	/* Restore string for error message */
312062306a36Sopenharmony_ci	if (nodelist)
312162306a36Sopenharmony_ci		*--nodelist = ':';
312262306a36Sopenharmony_ci	if (flags)
312362306a36Sopenharmony_ci		*--flags = '=';
312462306a36Sopenharmony_ci	if (!err)
312562306a36Sopenharmony_ci		*mpol = new;
312662306a36Sopenharmony_ci	return err;
312762306a36Sopenharmony_ci}
312862306a36Sopenharmony_ci#endif /* CONFIG_TMPFS */
312962306a36Sopenharmony_ci
313062306a36Sopenharmony_ci/**
313162306a36Sopenharmony_ci * mpol_to_str - format a mempolicy structure for printing
313262306a36Sopenharmony_ci * @buffer:  to contain formatted mempolicy string
313362306a36Sopenharmony_ci * @maxlen:  length of @buffer
313462306a36Sopenharmony_ci * @pol:  pointer to mempolicy to be formatted
313562306a36Sopenharmony_ci *
313662306a36Sopenharmony_ci * Convert @pol into a string.  If @buffer is too short, truncate the string.
313762306a36Sopenharmony_ci * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
313862306a36Sopenharmony_ci * longest flag, "relative", and to display at least a few node ids.
313962306a36Sopenharmony_ci */
314062306a36Sopenharmony_civoid mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
314162306a36Sopenharmony_ci{
314262306a36Sopenharmony_ci	char *p = buffer;
314362306a36Sopenharmony_ci	nodemask_t nodes = NODE_MASK_NONE;
314462306a36Sopenharmony_ci	unsigned short mode = MPOL_DEFAULT;
314562306a36Sopenharmony_ci	unsigned short flags = 0;
314662306a36Sopenharmony_ci
314762306a36Sopenharmony_ci	if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
314862306a36Sopenharmony_ci		mode = pol->mode;
314962306a36Sopenharmony_ci		flags = pol->flags;
315062306a36Sopenharmony_ci	}
315162306a36Sopenharmony_ci
315262306a36Sopenharmony_ci	switch (mode) {
315362306a36Sopenharmony_ci	case MPOL_DEFAULT:
315462306a36Sopenharmony_ci	case MPOL_LOCAL:
315562306a36Sopenharmony_ci		break;
315662306a36Sopenharmony_ci	case MPOL_PREFERRED:
315762306a36Sopenharmony_ci	case MPOL_PREFERRED_MANY:
315862306a36Sopenharmony_ci	case MPOL_BIND:
315962306a36Sopenharmony_ci	case MPOL_INTERLEAVE:
316062306a36Sopenharmony_ci		nodes = pol->nodes;
316162306a36Sopenharmony_ci		break;
316262306a36Sopenharmony_ci	default:
316362306a36Sopenharmony_ci		WARN_ON_ONCE(1);
316462306a36Sopenharmony_ci		snprintf(p, maxlen, "unknown");
316562306a36Sopenharmony_ci		return;
316662306a36Sopenharmony_ci	}
316762306a36Sopenharmony_ci
316862306a36Sopenharmony_ci	p += snprintf(p, maxlen, "%s", policy_modes[mode]);
316962306a36Sopenharmony_ci
317062306a36Sopenharmony_ci	if (flags & MPOL_MODE_FLAGS) {
317162306a36Sopenharmony_ci		p += snprintf(p, buffer + maxlen - p, "=");
317262306a36Sopenharmony_ci
317362306a36Sopenharmony_ci		/*
317462306a36Sopenharmony_ci		 * Currently, the only defined flags are mutually exclusive
317562306a36Sopenharmony_ci		 */
317662306a36Sopenharmony_ci		if (flags & MPOL_F_STATIC_NODES)
317762306a36Sopenharmony_ci			p += snprintf(p, buffer + maxlen - p, "static");
317862306a36Sopenharmony_ci		else if (flags & MPOL_F_RELATIVE_NODES)
317962306a36Sopenharmony_ci			p += snprintf(p, buffer + maxlen - p, "relative");
318062306a36Sopenharmony_ci	}
318162306a36Sopenharmony_ci
318262306a36Sopenharmony_ci	if (!nodes_empty(nodes))
318362306a36Sopenharmony_ci		p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
318462306a36Sopenharmony_ci			       nodemask_pr_args(&nodes));
318562306a36Sopenharmony_ci}
3186