18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-only
28c2ecf20Sopenharmony_ci/*
38c2ecf20Sopenharmony_ci * Simple NUMA memory policy for the Linux kernel.
48c2ecf20Sopenharmony_ci *
58c2ecf20Sopenharmony_ci * Copyright 2003,2004 Andi Kleen, SuSE Labs.
68c2ecf20Sopenharmony_ci * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
78c2ecf20Sopenharmony_ci *
88c2ecf20Sopenharmony_ci * NUMA policy allows the user to give hints in which node(s) memory should
98c2ecf20Sopenharmony_ci * be allocated.
108c2ecf20Sopenharmony_ci *
118c2ecf20Sopenharmony_ci * Support four policies per VMA and per process:
128c2ecf20Sopenharmony_ci *
138c2ecf20Sopenharmony_ci * The VMA policy has priority over the process policy for a page fault.
148c2ecf20Sopenharmony_ci *
 * interleave     Allocate memory interleaved over a set of nodes,
 *                with normal fallback if it fails.
 *                For VMA based allocations this interleaves based on the
 *                offset into the backing object or offset into the mapping
 *                for anonymous memory. For process policy a process counter
 *                is used.
218c2ecf20Sopenharmony_ci *
228c2ecf20Sopenharmony_ci * bind           Only allocate memory on a specific set of nodes,
238c2ecf20Sopenharmony_ci *                no fallback.
248c2ecf20Sopenharmony_ci *                FIXME: memory is allocated starting with the first node
258c2ecf20Sopenharmony_ci *                to the last. It would be better if bind would truly restrict
268c2ecf20Sopenharmony_ci *                the allocation to memory nodes instead
278c2ecf20Sopenharmony_ci *
288c2ecf20Sopenharmony_ci * preferred       Try a specific node first before normal fallback.
298c2ecf20Sopenharmony_ci *                As a special case NUMA_NO_NODE here means do the allocation
308c2ecf20Sopenharmony_ci *                on the local CPU. This is normally identical to default,
318c2ecf20Sopenharmony_ci *                but useful to set in a VMA when you have a non default
328c2ecf20Sopenharmony_ci *                process policy.
338c2ecf20Sopenharmony_ci *
348c2ecf20Sopenharmony_ci * default        Allocate on the local node first, or when on a VMA
358c2ecf20Sopenharmony_ci *                use the process policy. This is what Linux always did
368c2ecf20Sopenharmony_ci *		  in a NUMA aware kernel and still does by, ahem, default.
378c2ecf20Sopenharmony_ci *
388c2ecf20Sopenharmony_ci * The process policy is applied for most non interrupt memory allocations
398c2ecf20Sopenharmony_ci * in that process' context. Interrupts ignore the policies and always
408c2ecf20Sopenharmony_ci * try to allocate on the local CPU. The VMA policy is only applied for memory
418c2ecf20Sopenharmony_ci * allocations for a VMA in the VM.
428c2ecf20Sopenharmony_ci *
438c2ecf20Sopenharmony_ci * Currently there are a few corner cases in swapping where the policy
448c2ecf20Sopenharmony_ci * is not applied, but the majority should be handled. When process policy
458c2ecf20Sopenharmony_ci * is used it is not remembered over swap outs/swap ins.
468c2ecf20Sopenharmony_ci *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use the default policy. This implies that
 * on systems with highmem, kernel lowmem allocations don't get policied.
 * The same applies to GFP_DMA allocations.
518c2ecf20Sopenharmony_ci *
528c2ecf20Sopenharmony_ci * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
538c2ecf20Sopenharmony_ci * all users and remembered even when nobody has memory mapped.
548c2ecf20Sopenharmony_ci */
558c2ecf20Sopenharmony_ci
568c2ecf20Sopenharmony_ci/* Notebook:
578c2ecf20Sopenharmony_ci   fix mmap readahead to honour policy and enable policy for any page cache
588c2ecf20Sopenharmony_ci   object
598c2ecf20Sopenharmony_ci   statistics for bigpages
608c2ecf20Sopenharmony_ci   global policy for page cache? currently it uses process policy. Requires
618c2ecf20Sopenharmony_ci   first item above.
628c2ecf20Sopenharmony_ci   handle mremap for shared memory (currently ignored for the policy)
638c2ecf20Sopenharmony_ci   grows down?
648c2ecf20Sopenharmony_ci   make bind policy root only? It can trigger oom much faster and the
658c2ecf20Sopenharmony_ci   kernel is not always grateful with that.
668c2ecf20Sopenharmony_ci*/
678c2ecf20Sopenharmony_ci
688c2ecf20Sopenharmony_ci#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
698c2ecf20Sopenharmony_ci
708c2ecf20Sopenharmony_ci#include <linux/mempolicy.h>
718c2ecf20Sopenharmony_ci#include <linux/pagewalk.h>
728c2ecf20Sopenharmony_ci#include <linux/highmem.h>
738c2ecf20Sopenharmony_ci#include <linux/hugetlb.h>
748c2ecf20Sopenharmony_ci#include <linux/kernel.h>
758c2ecf20Sopenharmony_ci#include <linux/sched.h>
768c2ecf20Sopenharmony_ci#include <linux/sched/mm.h>
778c2ecf20Sopenharmony_ci#include <linux/sched/numa_balancing.h>
788c2ecf20Sopenharmony_ci#include <linux/sched/task.h>
798c2ecf20Sopenharmony_ci#include <linux/nodemask.h>
808c2ecf20Sopenharmony_ci#include <linux/cpuset.h>
818c2ecf20Sopenharmony_ci#include <linux/slab.h>
828c2ecf20Sopenharmony_ci#include <linux/string.h>
838c2ecf20Sopenharmony_ci#include <linux/export.h>
848c2ecf20Sopenharmony_ci#include <linux/nsproxy.h>
858c2ecf20Sopenharmony_ci#include <linux/interrupt.h>
868c2ecf20Sopenharmony_ci#include <linux/init.h>
878c2ecf20Sopenharmony_ci#include <linux/compat.h>
888c2ecf20Sopenharmony_ci#include <linux/ptrace.h>
898c2ecf20Sopenharmony_ci#include <linux/swap.h>
908c2ecf20Sopenharmony_ci#include <linux/seq_file.h>
918c2ecf20Sopenharmony_ci#include <linux/proc_fs.h>
928c2ecf20Sopenharmony_ci#include <linux/migrate.h>
938c2ecf20Sopenharmony_ci#include <linux/ksm.h>
948c2ecf20Sopenharmony_ci#include <linux/rmap.h>
958c2ecf20Sopenharmony_ci#include <linux/security.h>
968c2ecf20Sopenharmony_ci#include <linux/syscalls.h>
978c2ecf20Sopenharmony_ci#include <linux/ctype.h>
988c2ecf20Sopenharmony_ci#include <linux/mm_inline.h>
998c2ecf20Sopenharmony_ci#include <linux/mmu_notifier.h>
1008c2ecf20Sopenharmony_ci#include <linux/printk.h>
1018c2ecf20Sopenharmony_ci#include <linux/swapops.h>
1028c2ecf20Sopenharmony_ci
1038c2ecf20Sopenharmony_ci#include <asm/tlbflush.h>
1048c2ecf20Sopenharmony_ci#include <linux/uaccess.h>
1058c2ecf20Sopenharmony_ci
1068c2ecf20Sopenharmony_ci#include "internal.h"
1078c2ecf20Sopenharmony_ci
1088c2ecf20Sopenharmony_ci/* Internal flags */
1098c2ecf20Sopenharmony_ci#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
1108c2ecf20Sopenharmony_ci#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
1118c2ecf20Sopenharmony_ci
1128c2ecf20Sopenharmony_cistatic struct kmem_cache *policy_cache;
1138c2ecf20Sopenharmony_cistatic struct kmem_cache *sn_cache;
1148c2ecf20Sopenharmony_ci
1158c2ecf20Sopenharmony_ci/* Highest zone. An specific allocation for a zone below that is not
1168c2ecf20Sopenharmony_ci   policied. */
1178c2ecf20Sopenharmony_cienum zone_type policy_zone = 0;
1188c2ecf20Sopenharmony_ci
1198c2ecf20Sopenharmony_ci/*
1208c2ecf20Sopenharmony_ci * run-time system-wide default policy => local allocation
1218c2ecf20Sopenharmony_ci */
1228c2ecf20Sopenharmony_cistatic struct mempolicy default_policy = {
1238c2ecf20Sopenharmony_ci	.refcnt = ATOMIC_INIT(1), /* never free it */
1248c2ecf20Sopenharmony_ci	.mode = MPOL_PREFERRED,
1258c2ecf20Sopenharmony_ci	.flags = MPOL_F_LOCAL,
1268c2ecf20Sopenharmony_ci};
1278c2ecf20Sopenharmony_ci
1288c2ecf20Sopenharmony_cistatic struct mempolicy preferred_node_policy[MAX_NUMNODES];
1298c2ecf20Sopenharmony_ci
1308c2ecf20Sopenharmony_ci/**
1318c2ecf20Sopenharmony_ci * numa_map_to_online_node - Find closest online node
1328c2ecf20Sopenharmony_ci * @node: Node id to start the search
1338c2ecf20Sopenharmony_ci *
1348c2ecf20Sopenharmony_ci * Lookup the next closest node by distance if @nid is not online.
1358c2ecf20Sopenharmony_ci */
1368c2ecf20Sopenharmony_ciint numa_map_to_online_node(int node)
1378c2ecf20Sopenharmony_ci{
1388c2ecf20Sopenharmony_ci	int min_dist = INT_MAX, dist, n, min_node;
1398c2ecf20Sopenharmony_ci
1408c2ecf20Sopenharmony_ci	if (node == NUMA_NO_NODE || node_online(node))
1418c2ecf20Sopenharmony_ci		return node;
1428c2ecf20Sopenharmony_ci
1438c2ecf20Sopenharmony_ci	min_node = node;
1448c2ecf20Sopenharmony_ci	for_each_online_node(n) {
1458c2ecf20Sopenharmony_ci		dist = node_distance(node, n);
1468c2ecf20Sopenharmony_ci		if (dist < min_dist) {
1478c2ecf20Sopenharmony_ci			min_dist = dist;
1488c2ecf20Sopenharmony_ci			min_node = n;
1498c2ecf20Sopenharmony_ci		}
1508c2ecf20Sopenharmony_ci	}
1518c2ecf20Sopenharmony_ci
1528c2ecf20Sopenharmony_ci	return min_node;
1538c2ecf20Sopenharmony_ci}
1548c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(numa_map_to_online_node);
1558c2ecf20Sopenharmony_ci
1568c2ecf20Sopenharmony_cistruct mempolicy *get_task_policy(struct task_struct *p)
1578c2ecf20Sopenharmony_ci{
1588c2ecf20Sopenharmony_ci	struct mempolicy *pol = p->mempolicy;
1598c2ecf20Sopenharmony_ci	int node;
1608c2ecf20Sopenharmony_ci
1618c2ecf20Sopenharmony_ci	if (pol)
1628c2ecf20Sopenharmony_ci		return pol;
1638c2ecf20Sopenharmony_ci
1648c2ecf20Sopenharmony_ci	node = numa_node_id();
1658c2ecf20Sopenharmony_ci	if (node != NUMA_NO_NODE) {
1668c2ecf20Sopenharmony_ci		pol = &preferred_node_policy[node];
1678c2ecf20Sopenharmony_ci		/* preferred_node_policy is not initialised early in boot */
1688c2ecf20Sopenharmony_ci		if (pol->mode)
1698c2ecf20Sopenharmony_ci			return pol;
1708c2ecf20Sopenharmony_ci	}
1718c2ecf20Sopenharmony_ci
1728c2ecf20Sopenharmony_ci	return &default_policy;
1738c2ecf20Sopenharmony_ci}
1748c2ecf20Sopenharmony_ci
1758c2ecf20Sopenharmony_cistatic const struct mempolicy_operations {
1768c2ecf20Sopenharmony_ci	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
1778c2ecf20Sopenharmony_ci	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
1788c2ecf20Sopenharmony_ci} mpol_ops[MPOL_MAX];
1798c2ecf20Sopenharmony_ci
1808c2ecf20Sopenharmony_cistatic inline int mpol_store_user_nodemask(const struct mempolicy *pol)
1818c2ecf20Sopenharmony_ci{
1828c2ecf20Sopenharmony_ci	return pol->flags & MPOL_MODE_FLAGS;
1838c2ecf20Sopenharmony_ci}
1848c2ecf20Sopenharmony_ci
1858c2ecf20Sopenharmony_cistatic void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
1868c2ecf20Sopenharmony_ci				   const nodemask_t *rel)
1878c2ecf20Sopenharmony_ci{
1888c2ecf20Sopenharmony_ci	nodemask_t tmp;
1898c2ecf20Sopenharmony_ci	nodes_fold(tmp, *orig, nodes_weight(*rel));
1908c2ecf20Sopenharmony_ci	nodes_onto(*ret, tmp, *rel);
1918c2ecf20Sopenharmony_ci}
1928c2ecf20Sopenharmony_ci
1938c2ecf20Sopenharmony_cistatic int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
1948c2ecf20Sopenharmony_ci{
1958c2ecf20Sopenharmony_ci	if (nodes_empty(*nodes))
1968c2ecf20Sopenharmony_ci		return -EINVAL;
1978c2ecf20Sopenharmony_ci	pol->v.nodes = *nodes;
1988c2ecf20Sopenharmony_ci	return 0;
1998c2ecf20Sopenharmony_ci}
2008c2ecf20Sopenharmony_ci
2018c2ecf20Sopenharmony_cistatic int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
2028c2ecf20Sopenharmony_ci{
2038c2ecf20Sopenharmony_ci	if (!nodes)
2048c2ecf20Sopenharmony_ci		pol->flags |= MPOL_F_LOCAL;	/* local allocation */
2058c2ecf20Sopenharmony_ci	else if (nodes_empty(*nodes))
2068c2ecf20Sopenharmony_ci		return -EINVAL;			/*  no allowed nodes */
2078c2ecf20Sopenharmony_ci	else
2088c2ecf20Sopenharmony_ci		pol->v.preferred_node = first_node(*nodes);
2098c2ecf20Sopenharmony_ci	return 0;
2108c2ecf20Sopenharmony_ci}
2118c2ecf20Sopenharmony_ci
2128c2ecf20Sopenharmony_cistatic int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
2138c2ecf20Sopenharmony_ci{
2148c2ecf20Sopenharmony_ci	if (nodes_empty(*nodes))
2158c2ecf20Sopenharmony_ci		return -EINVAL;
2168c2ecf20Sopenharmony_ci	pol->v.nodes = *nodes;
2178c2ecf20Sopenharmony_ci	return 0;
2188c2ecf20Sopenharmony_ci}
2198c2ecf20Sopenharmony_ci
2208c2ecf20Sopenharmony_ci/*
2218c2ecf20Sopenharmony_ci * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
2228c2ecf20Sopenharmony_ci * any, for the new policy.  mpol_new() has already validated the nodes
2238c2ecf20Sopenharmony_ci * parameter with respect to the policy mode and flags.  But, we need to
2248c2ecf20Sopenharmony_ci * handle an empty nodemask with MPOL_PREFERRED here.
2258c2ecf20Sopenharmony_ci *
2268c2ecf20Sopenharmony_ci * Must be called holding task's alloc_lock to protect task's mems_allowed
2278c2ecf20Sopenharmony_ci * and mempolicy.  May also be called holding the mmap_lock for write.
2288c2ecf20Sopenharmony_ci */
2298c2ecf20Sopenharmony_cistatic int mpol_set_nodemask(struct mempolicy *pol,
2308c2ecf20Sopenharmony_ci		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
2318c2ecf20Sopenharmony_ci{
2328c2ecf20Sopenharmony_ci	int ret;
2338c2ecf20Sopenharmony_ci
2348c2ecf20Sopenharmony_ci	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
2358c2ecf20Sopenharmony_ci	if (pol == NULL)
2368c2ecf20Sopenharmony_ci		return 0;
2378c2ecf20Sopenharmony_ci	/* Check N_MEMORY */
2388c2ecf20Sopenharmony_ci	nodes_and(nsc->mask1,
2398c2ecf20Sopenharmony_ci		  cpuset_current_mems_allowed, node_states[N_MEMORY]);
2408c2ecf20Sopenharmony_ci
2418c2ecf20Sopenharmony_ci	VM_BUG_ON(!nodes);
2428c2ecf20Sopenharmony_ci	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
2438c2ecf20Sopenharmony_ci		nodes = NULL;	/* explicit local allocation */
2448c2ecf20Sopenharmony_ci	else {
2458c2ecf20Sopenharmony_ci		if (pol->flags & MPOL_F_RELATIVE_NODES)
2468c2ecf20Sopenharmony_ci			mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
2478c2ecf20Sopenharmony_ci		else
2488c2ecf20Sopenharmony_ci			nodes_and(nsc->mask2, *nodes, nsc->mask1);
2498c2ecf20Sopenharmony_ci
2508c2ecf20Sopenharmony_ci		if (mpol_store_user_nodemask(pol))
2518c2ecf20Sopenharmony_ci			pol->w.user_nodemask = *nodes;
2528c2ecf20Sopenharmony_ci		else
2538c2ecf20Sopenharmony_ci			pol->w.cpuset_mems_allowed =
2548c2ecf20Sopenharmony_ci						cpuset_current_mems_allowed;
2558c2ecf20Sopenharmony_ci	}
2568c2ecf20Sopenharmony_ci
2578c2ecf20Sopenharmony_ci	if (nodes)
2588c2ecf20Sopenharmony_ci		ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
2598c2ecf20Sopenharmony_ci	else
2608c2ecf20Sopenharmony_ci		ret = mpol_ops[pol->mode].create(pol, NULL);
2618c2ecf20Sopenharmony_ci	return ret;
2628c2ecf20Sopenharmony_ci}
2638c2ecf20Sopenharmony_ci
2648c2ecf20Sopenharmony_ci/*
2658c2ecf20Sopenharmony_ci * This function just creates a new policy, does some check and simple
2668c2ecf20Sopenharmony_ci * initialization. You must invoke mpol_set_nodemask() to set nodes.
2678c2ecf20Sopenharmony_ci */
2688c2ecf20Sopenharmony_cistatic struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
2698c2ecf20Sopenharmony_ci				  nodemask_t *nodes)
2708c2ecf20Sopenharmony_ci{
2718c2ecf20Sopenharmony_ci	struct mempolicy *policy;
2728c2ecf20Sopenharmony_ci
2738c2ecf20Sopenharmony_ci	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
2748c2ecf20Sopenharmony_ci		 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
2758c2ecf20Sopenharmony_ci
2768c2ecf20Sopenharmony_ci	if (mode == MPOL_DEFAULT) {
2778c2ecf20Sopenharmony_ci		if (nodes && !nodes_empty(*nodes))
2788c2ecf20Sopenharmony_ci			return ERR_PTR(-EINVAL);
2798c2ecf20Sopenharmony_ci		return NULL;
2808c2ecf20Sopenharmony_ci	}
2818c2ecf20Sopenharmony_ci	VM_BUG_ON(!nodes);
2828c2ecf20Sopenharmony_ci
2838c2ecf20Sopenharmony_ci	/*
2848c2ecf20Sopenharmony_ci	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
2858c2ecf20Sopenharmony_ci	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
2868c2ecf20Sopenharmony_ci	 * All other modes require a valid pointer to a non-empty nodemask.
2878c2ecf20Sopenharmony_ci	 */
2888c2ecf20Sopenharmony_ci	if (mode == MPOL_PREFERRED) {
2898c2ecf20Sopenharmony_ci		if (nodes_empty(*nodes)) {
2908c2ecf20Sopenharmony_ci			if (((flags & MPOL_F_STATIC_NODES) ||
2918c2ecf20Sopenharmony_ci			     (flags & MPOL_F_RELATIVE_NODES)))
2928c2ecf20Sopenharmony_ci				return ERR_PTR(-EINVAL);
2938c2ecf20Sopenharmony_ci		}
2948c2ecf20Sopenharmony_ci	} else if (mode == MPOL_LOCAL) {
2958c2ecf20Sopenharmony_ci		if (!nodes_empty(*nodes) ||
2968c2ecf20Sopenharmony_ci		    (flags & MPOL_F_STATIC_NODES) ||
2978c2ecf20Sopenharmony_ci		    (flags & MPOL_F_RELATIVE_NODES))
2988c2ecf20Sopenharmony_ci			return ERR_PTR(-EINVAL);
2998c2ecf20Sopenharmony_ci		mode = MPOL_PREFERRED;
3008c2ecf20Sopenharmony_ci	} else if (nodes_empty(*nodes))
3018c2ecf20Sopenharmony_ci		return ERR_PTR(-EINVAL);
3028c2ecf20Sopenharmony_ci	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
3038c2ecf20Sopenharmony_ci	if (!policy)
3048c2ecf20Sopenharmony_ci		return ERR_PTR(-ENOMEM);
3058c2ecf20Sopenharmony_ci	atomic_set(&policy->refcnt, 1);
3068c2ecf20Sopenharmony_ci	policy->mode = mode;
3078c2ecf20Sopenharmony_ci	policy->flags = flags;
3088c2ecf20Sopenharmony_ci
3098c2ecf20Sopenharmony_ci	return policy;
3108c2ecf20Sopenharmony_ci}
3118c2ecf20Sopenharmony_ci
3128c2ecf20Sopenharmony_ci/* Slow path of a mpol destructor. */
3138c2ecf20Sopenharmony_civoid __mpol_put(struct mempolicy *p)
3148c2ecf20Sopenharmony_ci{
3158c2ecf20Sopenharmony_ci	if (!atomic_dec_and_test(&p->refcnt))
3168c2ecf20Sopenharmony_ci		return;
3178c2ecf20Sopenharmony_ci	kmem_cache_free(policy_cache, p);
3188c2ecf20Sopenharmony_ci}
3198c2ecf20Sopenharmony_ci
3208c2ecf20Sopenharmony_cistatic void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
3218c2ecf20Sopenharmony_ci{
3228c2ecf20Sopenharmony_ci}
3238c2ecf20Sopenharmony_ci
3248c2ecf20Sopenharmony_cistatic void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
3258c2ecf20Sopenharmony_ci{
3268c2ecf20Sopenharmony_ci	nodemask_t tmp;
3278c2ecf20Sopenharmony_ci
3288c2ecf20Sopenharmony_ci	if (pol->flags & MPOL_F_STATIC_NODES)
3298c2ecf20Sopenharmony_ci		nodes_and(tmp, pol->w.user_nodemask, *nodes);
3308c2ecf20Sopenharmony_ci	else if (pol->flags & MPOL_F_RELATIVE_NODES)
3318c2ecf20Sopenharmony_ci		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
3328c2ecf20Sopenharmony_ci	else {
3338c2ecf20Sopenharmony_ci		nodes_remap(tmp, pol->v.nodes,pol->w.cpuset_mems_allowed,
3348c2ecf20Sopenharmony_ci								*nodes);
3358c2ecf20Sopenharmony_ci		pol->w.cpuset_mems_allowed = *nodes;
3368c2ecf20Sopenharmony_ci	}
3378c2ecf20Sopenharmony_ci
3388c2ecf20Sopenharmony_ci	if (nodes_empty(tmp))
3398c2ecf20Sopenharmony_ci		tmp = *nodes;
3408c2ecf20Sopenharmony_ci
3418c2ecf20Sopenharmony_ci	pol->v.nodes = tmp;
3428c2ecf20Sopenharmony_ci}
3438c2ecf20Sopenharmony_ci
3448c2ecf20Sopenharmony_cistatic void mpol_rebind_preferred(struct mempolicy *pol,
3458c2ecf20Sopenharmony_ci						const nodemask_t *nodes)
3468c2ecf20Sopenharmony_ci{
3478c2ecf20Sopenharmony_ci	nodemask_t tmp;
3488c2ecf20Sopenharmony_ci
3498c2ecf20Sopenharmony_ci	if (pol->flags & MPOL_F_STATIC_NODES) {
3508c2ecf20Sopenharmony_ci		int node = first_node(pol->w.user_nodemask);
3518c2ecf20Sopenharmony_ci
3528c2ecf20Sopenharmony_ci		if (node_isset(node, *nodes)) {
3538c2ecf20Sopenharmony_ci			pol->v.preferred_node = node;
3548c2ecf20Sopenharmony_ci			pol->flags &= ~MPOL_F_LOCAL;
3558c2ecf20Sopenharmony_ci		} else
3568c2ecf20Sopenharmony_ci			pol->flags |= MPOL_F_LOCAL;
3578c2ecf20Sopenharmony_ci	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
3588c2ecf20Sopenharmony_ci		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
3598c2ecf20Sopenharmony_ci		pol->v.preferred_node = first_node(tmp);
3608c2ecf20Sopenharmony_ci	} else if (!(pol->flags & MPOL_F_LOCAL)) {
3618c2ecf20Sopenharmony_ci		pol->v.preferred_node = node_remap(pol->v.preferred_node,
3628c2ecf20Sopenharmony_ci						   pol->w.cpuset_mems_allowed,
3638c2ecf20Sopenharmony_ci						   *nodes);
3648c2ecf20Sopenharmony_ci		pol->w.cpuset_mems_allowed = *nodes;
3658c2ecf20Sopenharmony_ci	}
3668c2ecf20Sopenharmony_ci}
3678c2ecf20Sopenharmony_ci
3688c2ecf20Sopenharmony_ci/*
3698c2ecf20Sopenharmony_ci * mpol_rebind_policy - Migrate a policy to a different set of nodes
3708c2ecf20Sopenharmony_ci *
3718c2ecf20Sopenharmony_ci * Per-vma policies are protected by mmap_lock. Allocations using per-task
3728c2ecf20Sopenharmony_ci * policies are protected by task->mems_allowed_seq to prevent a premature
3738c2ecf20Sopenharmony_ci * OOM/allocation failure due to parallel nodemask modification.
3748c2ecf20Sopenharmony_ci */
3758c2ecf20Sopenharmony_cistatic void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
3768c2ecf20Sopenharmony_ci{
3778c2ecf20Sopenharmony_ci	if (!pol || pol->mode == MPOL_LOCAL)
3788c2ecf20Sopenharmony_ci		return;
3798c2ecf20Sopenharmony_ci	if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) &&
3808c2ecf20Sopenharmony_ci	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
3818c2ecf20Sopenharmony_ci		return;
3828c2ecf20Sopenharmony_ci
3838c2ecf20Sopenharmony_ci	mpol_ops[pol->mode].rebind(pol, newmask);
3848c2ecf20Sopenharmony_ci}
3858c2ecf20Sopenharmony_ci
3868c2ecf20Sopenharmony_ci/*
3878c2ecf20Sopenharmony_ci * Wrapper for mpol_rebind_policy() that just requires task
3888c2ecf20Sopenharmony_ci * pointer, and updates task mempolicy.
3898c2ecf20Sopenharmony_ci *
3908c2ecf20Sopenharmony_ci * Called with task's alloc_lock held.
3918c2ecf20Sopenharmony_ci */
3928c2ecf20Sopenharmony_ci
3938c2ecf20Sopenharmony_civoid mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
3948c2ecf20Sopenharmony_ci{
3958c2ecf20Sopenharmony_ci	mpol_rebind_policy(tsk->mempolicy, new);
3968c2ecf20Sopenharmony_ci}
3978c2ecf20Sopenharmony_ci
3988c2ecf20Sopenharmony_ci/*
3998c2ecf20Sopenharmony_ci * Rebind each vma in mm to new nodemask.
4008c2ecf20Sopenharmony_ci *
4018c2ecf20Sopenharmony_ci * Call holding a reference to mm.  Takes mm->mmap_lock during call.
4028c2ecf20Sopenharmony_ci */
4038c2ecf20Sopenharmony_ci
4048c2ecf20Sopenharmony_civoid mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
4058c2ecf20Sopenharmony_ci{
4068c2ecf20Sopenharmony_ci	struct vm_area_struct *vma;
4078c2ecf20Sopenharmony_ci
4088c2ecf20Sopenharmony_ci	mmap_write_lock(mm);
4098c2ecf20Sopenharmony_ci	for (vma = mm->mmap; vma; vma = vma->vm_next)
4108c2ecf20Sopenharmony_ci		mpol_rebind_policy(vma->vm_policy, new);
4118c2ecf20Sopenharmony_ci	mmap_write_unlock(mm);
4128c2ecf20Sopenharmony_ci}
4138c2ecf20Sopenharmony_ci
4148c2ecf20Sopenharmony_cistatic const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
4158c2ecf20Sopenharmony_ci	[MPOL_DEFAULT] = {
4168c2ecf20Sopenharmony_ci		.rebind = mpol_rebind_default,
4178c2ecf20Sopenharmony_ci	},
4188c2ecf20Sopenharmony_ci	[MPOL_INTERLEAVE] = {
4198c2ecf20Sopenharmony_ci		.create = mpol_new_interleave,
4208c2ecf20Sopenharmony_ci		.rebind = mpol_rebind_nodemask,
4218c2ecf20Sopenharmony_ci	},
4228c2ecf20Sopenharmony_ci	[MPOL_PREFERRED] = {
4238c2ecf20Sopenharmony_ci		.create = mpol_new_preferred,
4248c2ecf20Sopenharmony_ci		.rebind = mpol_rebind_preferred,
4258c2ecf20Sopenharmony_ci	},
4268c2ecf20Sopenharmony_ci	[MPOL_BIND] = {
4278c2ecf20Sopenharmony_ci		.create = mpol_new_bind,
4288c2ecf20Sopenharmony_ci		.rebind = mpol_rebind_nodemask,
4298c2ecf20Sopenharmony_ci	},
4308c2ecf20Sopenharmony_ci};
4318c2ecf20Sopenharmony_ci
/* Forward declaration so the queue_pages_*() helpers below can call it. */
static int migrate_page_add(struct page *page,
			    struct list_head *pagelist,
			    unsigned long flags);
4348c2ecf20Sopenharmony_ci
4358c2ecf20Sopenharmony_cistruct queue_pages {
4368c2ecf20Sopenharmony_ci	struct list_head *pagelist;
4378c2ecf20Sopenharmony_ci	unsigned long flags;
4388c2ecf20Sopenharmony_ci	nodemask_t *nmask;
4398c2ecf20Sopenharmony_ci	unsigned long start;
4408c2ecf20Sopenharmony_ci	unsigned long end;
4418c2ecf20Sopenharmony_ci	struct vm_area_struct *first;
4428c2ecf20Sopenharmony_ci};
4438c2ecf20Sopenharmony_ci
4448c2ecf20Sopenharmony_ci/*
4458c2ecf20Sopenharmony_ci * Check if the page's nid is in qp->nmask.
4468c2ecf20Sopenharmony_ci *
4478c2ecf20Sopenharmony_ci * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
4488c2ecf20Sopenharmony_ci * in the invert of qp->nmask.
4498c2ecf20Sopenharmony_ci */
4508c2ecf20Sopenharmony_cistatic inline bool queue_pages_required(struct page *page,
4518c2ecf20Sopenharmony_ci					struct queue_pages *qp)
4528c2ecf20Sopenharmony_ci{
4538c2ecf20Sopenharmony_ci	int nid = page_to_nid(page);
4548c2ecf20Sopenharmony_ci	unsigned long flags = qp->flags;
4558c2ecf20Sopenharmony_ci
4568c2ecf20Sopenharmony_ci	return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
4578c2ecf20Sopenharmony_ci}
4588c2ecf20Sopenharmony_ci
4598c2ecf20Sopenharmony_ci/*
4608c2ecf20Sopenharmony_ci * queue_pages_pmd() has four possible return values:
4618c2ecf20Sopenharmony_ci * 0 - pages are placed on the right node or queued successfully.
4628c2ecf20Sopenharmony_ci * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
4638c2ecf20Sopenharmony_ci *     specified.
4648c2ecf20Sopenharmony_ci * 2 - THP was split.
4658c2ecf20Sopenharmony_ci * -EIO - is migration entry or only MPOL_MF_STRICT was specified and an
4668c2ecf20Sopenharmony_ci *        existing page was already on a node that does not follow the
4678c2ecf20Sopenharmony_ci *        policy.
4688c2ecf20Sopenharmony_ci */
4698c2ecf20Sopenharmony_cistatic int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
4708c2ecf20Sopenharmony_ci				unsigned long end, struct mm_walk *walk)
4718c2ecf20Sopenharmony_ci	__releases(ptl)
4728c2ecf20Sopenharmony_ci{
4738c2ecf20Sopenharmony_ci	int ret = 0;
4748c2ecf20Sopenharmony_ci	struct page *page;
4758c2ecf20Sopenharmony_ci	struct queue_pages *qp = walk->private;
4768c2ecf20Sopenharmony_ci	unsigned long flags;
4778c2ecf20Sopenharmony_ci
4788c2ecf20Sopenharmony_ci	if (unlikely(is_pmd_migration_entry(*pmd))) {
4798c2ecf20Sopenharmony_ci		ret = -EIO;
4808c2ecf20Sopenharmony_ci		goto unlock;
4818c2ecf20Sopenharmony_ci	}
4828c2ecf20Sopenharmony_ci	page = pmd_page(*pmd);
4838c2ecf20Sopenharmony_ci	if (is_huge_zero_page(page)) {
4848c2ecf20Sopenharmony_ci		spin_unlock(ptl);
4858c2ecf20Sopenharmony_ci		__split_huge_pmd(walk->vma, pmd, addr, false, NULL);
4868c2ecf20Sopenharmony_ci		ret = 2;
4878c2ecf20Sopenharmony_ci		goto out;
4888c2ecf20Sopenharmony_ci	}
4898c2ecf20Sopenharmony_ci	if (!queue_pages_required(page, qp))
4908c2ecf20Sopenharmony_ci		goto unlock;
4918c2ecf20Sopenharmony_ci
4928c2ecf20Sopenharmony_ci	flags = qp->flags;
4938c2ecf20Sopenharmony_ci	/* go to thp migration */
4948c2ecf20Sopenharmony_ci	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
4958c2ecf20Sopenharmony_ci		if (!vma_migratable(walk->vma) ||
4968c2ecf20Sopenharmony_ci		    migrate_page_add(page, qp->pagelist, flags)) {
4978c2ecf20Sopenharmony_ci			ret = 1;
4988c2ecf20Sopenharmony_ci			goto unlock;
4998c2ecf20Sopenharmony_ci		}
5008c2ecf20Sopenharmony_ci	} else
5018c2ecf20Sopenharmony_ci		ret = -EIO;
5028c2ecf20Sopenharmony_ciunlock:
5038c2ecf20Sopenharmony_ci	spin_unlock(ptl);
5048c2ecf20Sopenharmony_ciout:
5058c2ecf20Sopenharmony_ci	return ret;
5068c2ecf20Sopenharmony_ci}
5078c2ecf20Sopenharmony_ci
5088c2ecf20Sopenharmony_ci/*
5098c2ecf20Sopenharmony_ci * Scan through pages checking if pages follow certain conditions,
5108c2ecf20Sopenharmony_ci * and move them to the pagelist if they do.
5118c2ecf20Sopenharmony_ci *
5128c2ecf20Sopenharmony_ci * queue_pages_pte_range() has three possible return values:
5138c2ecf20Sopenharmony_ci * 0 - pages are placed on the right node or queued successfully.
5148c2ecf20Sopenharmony_ci * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
5158c2ecf20Sopenharmony_ci *     specified.
5168c2ecf20Sopenharmony_ci * -EIO - only MPOL_MF_STRICT was specified and an existing page was already
5178c2ecf20Sopenharmony_ci *        on a node that does not follow the policy.
5188c2ecf20Sopenharmony_ci */
5198c2ecf20Sopenharmony_cistatic int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
5208c2ecf20Sopenharmony_ci			unsigned long end, struct mm_walk *walk)
5218c2ecf20Sopenharmony_ci{
5228c2ecf20Sopenharmony_ci	struct vm_area_struct *vma = walk->vma;
5238c2ecf20Sopenharmony_ci	struct page *page;
5248c2ecf20Sopenharmony_ci	struct queue_pages *qp = walk->private;
5258c2ecf20Sopenharmony_ci	unsigned long flags = qp->flags;
5268c2ecf20Sopenharmony_ci	int ret;
5278c2ecf20Sopenharmony_ci	bool has_unmovable = false;
5288c2ecf20Sopenharmony_ci	pte_t *pte, *mapped_pte;
5298c2ecf20Sopenharmony_ci	spinlock_t *ptl;
5308c2ecf20Sopenharmony_ci
5318c2ecf20Sopenharmony_ci	ptl = pmd_trans_huge_lock(pmd, vma);
5328c2ecf20Sopenharmony_ci	if (ptl) {
5338c2ecf20Sopenharmony_ci		ret = queue_pages_pmd(pmd, ptl, addr, end, walk);
5348c2ecf20Sopenharmony_ci		if (ret != 2)
5358c2ecf20Sopenharmony_ci			return ret;
5368c2ecf20Sopenharmony_ci	}
5378c2ecf20Sopenharmony_ci	/* THP was split, fall through to pte walk */
5388c2ecf20Sopenharmony_ci
5398c2ecf20Sopenharmony_ci	if (pmd_trans_unstable(pmd))
5408c2ecf20Sopenharmony_ci		return 0;
5418c2ecf20Sopenharmony_ci
5428c2ecf20Sopenharmony_ci	mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
5438c2ecf20Sopenharmony_ci	for (; addr != end; pte++, addr += PAGE_SIZE) {
5448c2ecf20Sopenharmony_ci		if (!pte_present(*pte))
5458c2ecf20Sopenharmony_ci			continue;
5468c2ecf20Sopenharmony_ci		page = vm_normal_page(vma, addr, *pte);
5478c2ecf20Sopenharmony_ci		if (!page)
5488c2ecf20Sopenharmony_ci			continue;
5498c2ecf20Sopenharmony_ci		/*
5508c2ecf20Sopenharmony_ci		 * vm_normal_page() filters out zero pages, but there might
5518c2ecf20Sopenharmony_ci		 * still be PageReserved pages to skip, perhaps in a VDSO.
5528c2ecf20Sopenharmony_ci		 */
5538c2ecf20Sopenharmony_ci		if (PageReserved(page))
5548c2ecf20Sopenharmony_ci			continue;
5558c2ecf20Sopenharmony_ci		if (!queue_pages_required(page, qp))
5568c2ecf20Sopenharmony_ci			continue;
5578c2ecf20Sopenharmony_ci		if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
5588c2ecf20Sopenharmony_ci			/* MPOL_MF_STRICT must be specified if we get here */
5598c2ecf20Sopenharmony_ci			if (!vma_migratable(vma)) {
5608c2ecf20Sopenharmony_ci				has_unmovable = true;
5618c2ecf20Sopenharmony_ci				break;
5628c2ecf20Sopenharmony_ci			}
5638c2ecf20Sopenharmony_ci
5648c2ecf20Sopenharmony_ci			/*
5658c2ecf20Sopenharmony_ci			 * Do not abort immediately since there may be
5668c2ecf20Sopenharmony_ci			 * temporary off LRU pages in the range.  Still
5678c2ecf20Sopenharmony_ci			 * need migrate other LRU pages.
5688c2ecf20Sopenharmony_ci			 */
5698c2ecf20Sopenharmony_ci			if (migrate_page_add(page, qp->pagelist, flags))
5708c2ecf20Sopenharmony_ci				has_unmovable = true;
5718c2ecf20Sopenharmony_ci		} else
5728c2ecf20Sopenharmony_ci			break;
5738c2ecf20Sopenharmony_ci	}
5748c2ecf20Sopenharmony_ci	pte_unmap_unlock(mapped_pte, ptl);
5758c2ecf20Sopenharmony_ci	cond_resched();
5768c2ecf20Sopenharmony_ci
5778c2ecf20Sopenharmony_ci	if (has_unmovable)
5788c2ecf20Sopenharmony_ci		return 1;
5798c2ecf20Sopenharmony_ci
5808c2ecf20Sopenharmony_ci	return addr != end ? -EIO : 0;
5818c2ecf20Sopenharmony_ci}
5828c2ecf20Sopenharmony_ci
5838c2ecf20Sopenharmony_cistatic int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
5848c2ecf20Sopenharmony_ci			       unsigned long addr, unsigned long end,
5858c2ecf20Sopenharmony_ci			       struct mm_walk *walk)
5868c2ecf20Sopenharmony_ci{
5878c2ecf20Sopenharmony_ci	int ret = 0;
5888c2ecf20Sopenharmony_ci#ifdef CONFIG_HUGETLB_PAGE
5898c2ecf20Sopenharmony_ci	struct queue_pages *qp = walk->private;
5908c2ecf20Sopenharmony_ci	unsigned long flags = (qp->flags & MPOL_MF_VALID);
5918c2ecf20Sopenharmony_ci	struct page *page;
5928c2ecf20Sopenharmony_ci	spinlock_t *ptl;
5938c2ecf20Sopenharmony_ci	pte_t entry;
5948c2ecf20Sopenharmony_ci
5958c2ecf20Sopenharmony_ci	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
5968c2ecf20Sopenharmony_ci	entry = huge_ptep_get(pte);
5978c2ecf20Sopenharmony_ci	if (!pte_present(entry))
5988c2ecf20Sopenharmony_ci		goto unlock;
5998c2ecf20Sopenharmony_ci	page = pte_page(entry);
6008c2ecf20Sopenharmony_ci	if (!queue_pages_required(page, qp))
6018c2ecf20Sopenharmony_ci		goto unlock;
6028c2ecf20Sopenharmony_ci
6038c2ecf20Sopenharmony_ci	if (flags == MPOL_MF_STRICT) {
6048c2ecf20Sopenharmony_ci		/*
6058c2ecf20Sopenharmony_ci		 * STRICT alone means only detecting misplaced page and no
6068c2ecf20Sopenharmony_ci		 * need to further check other vma.
6078c2ecf20Sopenharmony_ci		 */
6088c2ecf20Sopenharmony_ci		ret = -EIO;
6098c2ecf20Sopenharmony_ci		goto unlock;
6108c2ecf20Sopenharmony_ci	}
6118c2ecf20Sopenharmony_ci
6128c2ecf20Sopenharmony_ci	if (!vma_migratable(walk->vma)) {
6138c2ecf20Sopenharmony_ci		/*
6148c2ecf20Sopenharmony_ci		 * Must be STRICT with MOVE*, otherwise .test_walk() have
6158c2ecf20Sopenharmony_ci		 * stopped walking current vma.
6168c2ecf20Sopenharmony_ci		 * Detecting misplaced page but allow migrating pages which
6178c2ecf20Sopenharmony_ci		 * have been queued.
6188c2ecf20Sopenharmony_ci		 */
6198c2ecf20Sopenharmony_ci		ret = 1;
6208c2ecf20Sopenharmony_ci		goto unlock;
6218c2ecf20Sopenharmony_ci	}
6228c2ecf20Sopenharmony_ci
6238c2ecf20Sopenharmony_ci	/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
6248c2ecf20Sopenharmony_ci	if (flags & (MPOL_MF_MOVE_ALL) ||
6258c2ecf20Sopenharmony_ci	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1 &&
6268c2ecf20Sopenharmony_ci	     !hugetlb_pmd_shared(pte))) {
6278c2ecf20Sopenharmony_ci		if (isolate_hugetlb(page, qp->pagelist) &&
6288c2ecf20Sopenharmony_ci			(flags & MPOL_MF_STRICT))
6298c2ecf20Sopenharmony_ci			/*
6308c2ecf20Sopenharmony_ci			 * Failed to isolate page but allow migrating pages
6318c2ecf20Sopenharmony_ci			 * which have been queued.
6328c2ecf20Sopenharmony_ci			 */
6338c2ecf20Sopenharmony_ci			ret = 1;
6348c2ecf20Sopenharmony_ci	}
6358c2ecf20Sopenharmony_ciunlock:
6368c2ecf20Sopenharmony_ci	spin_unlock(ptl);
6378c2ecf20Sopenharmony_ci#else
6388c2ecf20Sopenharmony_ci	BUG();
6398c2ecf20Sopenharmony_ci#endif
6408c2ecf20Sopenharmony_ci	return ret;
6418c2ecf20Sopenharmony_ci}
6428c2ecf20Sopenharmony_ci
#ifdef CONFIG_NUMA_BALANCING
/*
 * change_prot_numa - mark a range of virtual addresses inaccessible
 * @vma:  VMA the range belongs to
 * @addr: start of the range
 * @end:  end of the range
 *
 * The protection applied here is later cleared by a NUMA hinting fault.
 * Depending on these faults, pages may be migrated for better NUMA
 * placement.
 *
 * This is assuming that NUMA faults are handled using PROT_NONE. If
 * an architecture makes a different choice, it will need further
 * changes to the core.
 *
 * Returns whatever change_protection() reports for the range; a non-zero
 * result is also added to the NUMA_PTE_UPDATES event counter.
 */
unsigned long change_prot_numa(struct vm_area_struct *vma,
			unsigned long addr, unsigned long end)
{
	/*
	 * Same type as this function's return value, so a large count is
	 * not narrowed (and possibly sign-converted) through an int.
	 */
	unsigned long nr_updated;

	nr_updated = change_protection(vma, addr, end, PAGE_NONE, MM_CP_PROT_NUMA);
	if (nr_updated)
		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);

	return nr_updated;
}
#else
/* Without CONFIG_NUMA_BALANCING nothing is changed and 0 is returned. */
static unsigned long change_prot_numa(struct vm_area_struct *vma,
			unsigned long addr, unsigned long end)
{
	return 0;
}
#endif /* CONFIG_NUMA_BALANCING */
6718c2ecf20Sopenharmony_ci
6728c2ecf20Sopenharmony_cistatic int queue_pages_test_walk(unsigned long start, unsigned long end,
6738c2ecf20Sopenharmony_ci				struct mm_walk *walk)
6748c2ecf20Sopenharmony_ci{
6758c2ecf20Sopenharmony_ci	struct vm_area_struct *vma = walk->vma;
6768c2ecf20Sopenharmony_ci	struct queue_pages *qp = walk->private;
6778c2ecf20Sopenharmony_ci	unsigned long endvma = vma->vm_end;
6788c2ecf20Sopenharmony_ci	unsigned long flags = qp->flags;
6798c2ecf20Sopenharmony_ci
6808c2ecf20Sopenharmony_ci	/* range check first */
6818c2ecf20Sopenharmony_ci	VM_BUG_ON_VMA((vma->vm_start > start) || (vma->vm_end < end), vma);
6828c2ecf20Sopenharmony_ci
6838c2ecf20Sopenharmony_ci	if (!qp->first) {
6848c2ecf20Sopenharmony_ci		qp->first = vma;
6858c2ecf20Sopenharmony_ci		if (!(flags & MPOL_MF_DISCONTIG_OK) &&
6868c2ecf20Sopenharmony_ci			(qp->start < vma->vm_start))
6878c2ecf20Sopenharmony_ci			/* hole at head side of range */
6888c2ecf20Sopenharmony_ci			return -EFAULT;
6898c2ecf20Sopenharmony_ci	}
6908c2ecf20Sopenharmony_ci	if (!(flags & MPOL_MF_DISCONTIG_OK) &&
6918c2ecf20Sopenharmony_ci		((vma->vm_end < qp->end) &&
6928c2ecf20Sopenharmony_ci		(!vma->vm_next || vma->vm_end < vma->vm_next->vm_start)))
6938c2ecf20Sopenharmony_ci		/* hole at middle or tail of range */
6948c2ecf20Sopenharmony_ci		return -EFAULT;
6958c2ecf20Sopenharmony_ci
6968c2ecf20Sopenharmony_ci	/*
6978c2ecf20Sopenharmony_ci	 * Need check MPOL_MF_STRICT to return -EIO if possible
6988c2ecf20Sopenharmony_ci	 * regardless of vma_migratable
6998c2ecf20Sopenharmony_ci	 */
7008c2ecf20Sopenharmony_ci	if (!vma_migratable(vma) &&
7018c2ecf20Sopenharmony_ci	    !(flags & MPOL_MF_STRICT))
7028c2ecf20Sopenharmony_ci		return 1;
7038c2ecf20Sopenharmony_ci
7048c2ecf20Sopenharmony_ci	if (endvma > end)
7058c2ecf20Sopenharmony_ci		endvma = end;
7068c2ecf20Sopenharmony_ci
7078c2ecf20Sopenharmony_ci	if (flags & MPOL_MF_LAZY) {
7088c2ecf20Sopenharmony_ci		/* Similar to task_numa_work, skip inaccessible VMAs */
7098c2ecf20Sopenharmony_ci		if (!is_vm_hugetlb_page(vma) && vma_is_accessible(vma) &&
7108c2ecf20Sopenharmony_ci			!(vma->vm_flags & VM_MIXEDMAP))
7118c2ecf20Sopenharmony_ci			change_prot_numa(vma, start, endvma);
7128c2ecf20Sopenharmony_ci		return 1;
7138c2ecf20Sopenharmony_ci	}
7148c2ecf20Sopenharmony_ci
7158c2ecf20Sopenharmony_ci	/* queue pages from current vma */
7168c2ecf20Sopenharmony_ci	if (flags & MPOL_MF_VALID)
7178c2ecf20Sopenharmony_ci		return 0;
7188c2ecf20Sopenharmony_ci	return 1;
7198c2ecf20Sopenharmony_ci}
7208c2ecf20Sopenharmony_ci
7218c2ecf20Sopenharmony_cistatic const struct mm_walk_ops queue_pages_walk_ops = {
7228c2ecf20Sopenharmony_ci	.hugetlb_entry		= queue_pages_hugetlb,
7238c2ecf20Sopenharmony_ci	.pmd_entry		= queue_pages_pte_range,
7248c2ecf20Sopenharmony_ci	.test_walk		= queue_pages_test_walk,
7258c2ecf20Sopenharmony_ci};
7268c2ecf20Sopenharmony_ci
7278c2ecf20Sopenharmony_ci/*
7288c2ecf20Sopenharmony_ci * Walk through page tables and collect pages to be migrated.
7298c2ecf20Sopenharmony_ci *
7308c2ecf20Sopenharmony_ci * If pages found in a given range are on a set of nodes (determined by
7318c2ecf20Sopenharmony_ci * @nodes and @flags,) it's isolated and queued to the pagelist which is
7328c2ecf20Sopenharmony_ci * passed via @private.
7338c2ecf20Sopenharmony_ci *
7348c2ecf20Sopenharmony_ci * queue_pages_range() has three possible return values:
7358c2ecf20Sopenharmony_ci * 1 - there is unmovable page, but MPOL_MF_MOVE* & MPOL_MF_STRICT were
7368c2ecf20Sopenharmony_ci *     specified.
7378c2ecf20Sopenharmony_ci * 0 - queue pages successfully or no misplaced page.
7388c2ecf20Sopenharmony_ci * errno - i.e. misplaced pages with MPOL_MF_STRICT specified (-EIO) or
7398c2ecf20Sopenharmony_ci *         memory range specified by nodemask and maxnode points outside
7408c2ecf20Sopenharmony_ci *         your accessible address space (-EFAULT)
7418c2ecf20Sopenharmony_ci */
7428c2ecf20Sopenharmony_cistatic int
7438c2ecf20Sopenharmony_ciqueue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
7448c2ecf20Sopenharmony_ci		nodemask_t *nodes, unsigned long flags,
7458c2ecf20Sopenharmony_ci		struct list_head *pagelist)
7468c2ecf20Sopenharmony_ci{
7478c2ecf20Sopenharmony_ci	int err;
7488c2ecf20Sopenharmony_ci	struct queue_pages qp = {
7498c2ecf20Sopenharmony_ci		.pagelist = pagelist,
7508c2ecf20Sopenharmony_ci		.flags = flags,
7518c2ecf20Sopenharmony_ci		.nmask = nodes,
7528c2ecf20Sopenharmony_ci		.start = start,
7538c2ecf20Sopenharmony_ci		.end = end,
7548c2ecf20Sopenharmony_ci		.first = NULL,
7558c2ecf20Sopenharmony_ci	};
7568c2ecf20Sopenharmony_ci
7578c2ecf20Sopenharmony_ci	err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
7588c2ecf20Sopenharmony_ci
7598c2ecf20Sopenharmony_ci	if (!qp.first)
7608c2ecf20Sopenharmony_ci		/* whole range in hole */
7618c2ecf20Sopenharmony_ci		err = -EFAULT;
7628c2ecf20Sopenharmony_ci
7638c2ecf20Sopenharmony_ci	return err;
7648c2ecf20Sopenharmony_ci}
7658c2ecf20Sopenharmony_ci
7668c2ecf20Sopenharmony_ci/*
7678c2ecf20Sopenharmony_ci * Apply policy to a single VMA
7688c2ecf20Sopenharmony_ci * This must be called with the mmap_lock held for writing.
7698c2ecf20Sopenharmony_ci */
7708c2ecf20Sopenharmony_cistatic int vma_replace_policy(struct vm_area_struct *vma,
7718c2ecf20Sopenharmony_ci						struct mempolicy *pol)
7728c2ecf20Sopenharmony_ci{
7738c2ecf20Sopenharmony_ci	int err;
7748c2ecf20Sopenharmony_ci	struct mempolicy *old;
7758c2ecf20Sopenharmony_ci	struct mempolicy *new;
7768c2ecf20Sopenharmony_ci
7778c2ecf20Sopenharmony_ci	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
7788c2ecf20Sopenharmony_ci		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
7798c2ecf20Sopenharmony_ci		 vma->vm_ops, vma->vm_file,
7808c2ecf20Sopenharmony_ci		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
7818c2ecf20Sopenharmony_ci
7828c2ecf20Sopenharmony_ci	new = mpol_dup(pol);
7838c2ecf20Sopenharmony_ci	if (IS_ERR(new))
7848c2ecf20Sopenharmony_ci		return PTR_ERR(new);
7858c2ecf20Sopenharmony_ci
7868c2ecf20Sopenharmony_ci	if (vma->vm_ops && vma->vm_ops->set_policy) {
7878c2ecf20Sopenharmony_ci		err = vma->vm_ops->set_policy(vma, new);
7888c2ecf20Sopenharmony_ci		if (err)
7898c2ecf20Sopenharmony_ci			goto err_out;
7908c2ecf20Sopenharmony_ci	}
7918c2ecf20Sopenharmony_ci
7928c2ecf20Sopenharmony_ci	old = vma->vm_policy;
7938c2ecf20Sopenharmony_ci	vma->vm_policy = new; /* protected by mmap_lock */
7948c2ecf20Sopenharmony_ci	mpol_put(old);
7958c2ecf20Sopenharmony_ci
7968c2ecf20Sopenharmony_ci	return 0;
7978c2ecf20Sopenharmony_ci err_out:
7988c2ecf20Sopenharmony_ci	mpol_put(new);
7998c2ecf20Sopenharmony_ci	return err;
8008c2ecf20Sopenharmony_ci}
8018c2ecf20Sopenharmony_ci
8028c2ecf20Sopenharmony_ci/* Step 2: apply policy to a range and do splits. */
8038c2ecf20Sopenharmony_cistatic int mbind_range(struct mm_struct *mm, unsigned long start,
8048c2ecf20Sopenharmony_ci		       unsigned long end, struct mempolicy *new_pol)
8058c2ecf20Sopenharmony_ci{
8068c2ecf20Sopenharmony_ci	struct vm_area_struct *prev;
8078c2ecf20Sopenharmony_ci	struct vm_area_struct *vma;
8088c2ecf20Sopenharmony_ci	int err = 0;
8098c2ecf20Sopenharmony_ci	pgoff_t pgoff;
8108c2ecf20Sopenharmony_ci	unsigned long vmstart;
8118c2ecf20Sopenharmony_ci	unsigned long vmend;
8128c2ecf20Sopenharmony_ci
8138c2ecf20Sopenharmony_ci	vma = find_vma(mm, start);
8148c2ecf20Sopenharmony_ci	VM_BUG_ON(!vma);
8158c2ecf20Sopenharmony_ci
8168c2ecf20Sopenharmony_ci	prev = vma->vm_prev;
8178c2ecf20Sopenharmony_ci	if (start > vma->vm_start)
8188c2ecf20Sopenharmony_ci		prev = vma;
8198c2ecf20Sopenharmony_ci
8208c2ecf20Sopenharmony_ci	for (; vma && vma->vm_start < end; prev = vma, vma = vma->vm_next) {
8218c2ecf20Sopenharmony_ci		vmstart = max(start, vma->vm_start);
8228c2ecf20Sopenharmony_ci		vmend   = min(end, vma->vm_end);
8238c2ecf20Sopenharmony_ci
8248c2ecf20Sopenharmony_ci		if (mpol_equal(vma_policy(vma), new_pol))
8258c2ecf20Sopenharmony_ci			continue;
8268c2ecf20Sopenharmony_ci
8278c2ecf20Sopenharmony_ci		pgoff = vma->vm_pgoff +
8288c2ecf20Sopenharmony_ci			((vmstart - vma->vm_start) >> PAGE_SHIFT);
8298c2ecf20Sopenharmony_ci		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
8308c2ecf20Sopenharmony_ci				 vma->anon_vma, vma->vm_file, pgoff,
8318c2ecf20Sopenharmony_ci				 new_pol, vma->vm_userfaultfd_ctx,
8328c2ecf20Sopenharmony_ci				 anon_vma_name(vma));
8338c2ecf20Sopenharmony_ci		if (prev) {
8348c2ecf20Sopenharmony_ci			vma = prev;
8358c2ecf20Sopenharmony_ci			goto replace;
8368c2ecf20Sopenharmony_ci		}
8378c2ecf20Sopenharmony_ci		if (vma->vm_start != vmstart) {
8388c2ecf20Sopenharmony_ci			err = split_vma(vma->vm_mm, vma, vmstart, 1);
8398c2ecf20Sopenharmony_ci			if (err)
8408c2ecf20Sopenharmony_ci				goto out;
8418c2ecf20Sopenharmony_ci		}
8428c2ecf20Sopenharmony_ci		if (vma->vm_end != vmend) {
8438c2ecf20Sopenharmony_ci			err = split_vma(vma->vm_mm, vma, vmend, 0);
8448c2ecf20Sopenharmony_ci			if (err)
8458c2ecf20Sopenharmony_ci				goto out;
8468c2ecf20Sopenharmony_ci		}
8478c2ecf20Sopenharmony_ci replace:
8488c2ecf20Sopenharmony_ci		err = vma_replace_policy(vma, new_pol);
8498c2ecf20Sopenharmony_ci		if (err)
8508c2ecf20Sopenharmony_ci			goto out;
8518c2ecf20Sopenharmony_ci	}
8528c2ecf20Sopenharmony_ci
8538c2ecf20Sopenharmony_ci out:
8548c2ecf20Sopenharmony_ci	return err;
8558c2ecf20Sopenharmony_ci}
8568c2ecf20Sopenharmony_ci
8578c2ecf20Sopenharmony_ci/* Set the process memory policy */
8588c2ecf20Sopenharmony_cistatic long do_set_mempolicy(unsigned short mode, unsigned short flags,
8598c2ecf20Sopenharmony_ci			     nodemask_t *nodes)
8608c2ecf20Sopenharmony_ci{
8618c2ecf20Sopenharmony_ci	struct mempolicy *new, *old;
8628c2ecf20Sopenharmony_ci	NODEMASK_SCRATCH(scratch);
8638c2ecf20Sopenharmony_ci	int ret;
8648c2ecf20Sopenharmony_ci
8658c2ecf20Sopenharmony_ci	if (!scratch)
8668c2ecf20Sopenharmony_ci		return -ENOMEM;
8678c2ecf20Sopenharmony_ci
8688c2ecf20Sopenharmony_ci	new = mpol_new(mode, flags, nodes);
8698c2ecf20Sopenharmony_ci	if (IS_ERR(new)) {
8708c2ecf20Sopenharmony_ci		ret = PTR_ERR(new);
8718c2ecf20Sopenharmony_ci		goto out;
8728c2ecf20Sopenharmony_ci	}
8738c2ecf20Sopenharmony_ci
8748c2ecf20Sopenharmony_ci	ret = mpol_set_nodemask(new, nodes, scratch);
8758c2ecf20Sopenharmony_ci	if (ret) {
8768c2ecf20Sopenharmony_ci		mpol_put(new);
8778c2ecf20Sopenharmony_ci		goto out;
8788c2ecf20Sopenharmony_ci	}
8798c2ecf20Sopenharmony_ci	task_lock(current);
8808c2ecf20Sopenharmony_ci	old = current->mempolicy;
8818c2ecf20Sopenharmony_ci	current->mempolicy = new;
8828c2ecf20Sopenharmony_ci	if (new && new->mode == MPOL_INTERLEAVE)
8838c2ecf20Sopenharmony_ci		current->il_prev = MAX_NUMNODES-1;
8848c2ecf20Sopenharmony_ci	task_unlock(current);
8858c2ecf20Sopenharmony_ci	mpol_put(old);
8868c2ecf20Sopenharmony_ci	ret = 0;
8878c2ecf20Sopenharmony_ciout:
8888c2ecf20Sopenharmony_ci	NODEMASK_SCRATCH_FREE(scratch);
8898c2ecf20Sopenharmony_ci	return ret;
8908c2ecf20Sopenharmony_ci}
8918c2ecf20Sopenharmony_ci
8928c2ecf20Sopenharmony_ci/*
8938c2ecf20Sopenharmony_ci * Return nodemask for policy for get_mempolicy() query
8948c2ecf20Sopenharmony_ci *
8958c2ecf20Sopenharmony_ci * Called with task's alloc_lock held
8968c2ecf20Sopenharmony_ci */
8978c2ecf20Sopenharmony_cistatic void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
8988c2ecf20Sopenharmony_ci{
8998c2ecf20Sopenharmony_ci	nodes_clear(*nodes);
9008c2ecf20Sopenharmony_ci	if (p == &default_policy)
9018c2ecf20Sopenharmony_ci		return;
9028c2ecf20Sopenharmony_ci
9038c2ecf20Sopenharmony_ci	switch (p->mode) {
9048c2ecf20Sopenharmony_ci	case MPOL_BIND:
9058c2ecf20Sopenharmony_ci	case MPOL_INTERLEAVE:
9068c2ecf20Sopenharmony_ci		*nodes = p->v.nodes;
9078c2ecf20Sopenharmony_ci		break;
9088c2ecf20Sopenharmony_ci	case MPOL_PREFERRED:
9098c2ecf20Sopenharmony_ci		if (!(p->flags & MPOL_F_LOCAL))
9108c2ecf20Sopenharmony_ci			node_set(p->v.preferred_node, *nodes);
9118c2ecf20Sopenharmony_ci		/* else return empty node mask for local allocation */
9128c2ecf20Sopenharmony_ci		break;
9138c2ecf20Sopenharmony_ci	default:
9148c2ecf20Sopenharmony_ci		BUG();
9158c2ecf20Sopenharmony_ci	}
9168c2ecf20Sopenharmony_ci}
9178c2ecf20Sopenharmony_ci
9188c2ecf20Sopenharmony_cistatic int lookup_node(struct mm_struct *mm, unsigned long addr)
9198c2ecf20Sopenharmony_ci{
9208c2ecf20Sopenharmony_ci	struct page *p = NULL;
9218c2ecf20Sopenharmony_ci	int err;
9228c2ecf20Sopenharmony_ci
9238c2ecf20Sopenharmony_ci	int locked = 1;
9248c2ecf20Sopenharmony_ci	err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked);
9258c2ecf20Sopenharmony_ci	if (err > 0) {
9268c2ecf20Sopenharmony_ci		err = page_to_nid(p);
9278c2ecf20Sopenharmony_ci		put_page(p);
9288c2ecf20Sopenharmony_ci	}
9298c2ecf20Sopenharmony_ci	if (locked)
9308c2ecf20Sopenharmony_ci		mmap_read_unlock(mm);
9318c2ecf20Sopenharmony_ci	return err;
9328c2ecf20Sopenharmony_ci}
9338c2ecf20Sopenharmony_ci
9348c2ecf20Sopenharmony_ci/* Retrieve NUMA policy */
9358c2ecf20Sopenharmony_cistatic long do_get_mempolicy(int *policy, nodemask_t *nmask,
9368c2ecf20Sopenharmony_ci			     unsigned long addr, unsigned long flags)
9378c2ecf20Sopenharmony_ci{
9388c2ecf20Sopenharmony_ci	int err;
9398c2ecf20Sopenharmony_ci	struct mm_struct *mm = current->mm;
9408c2ecf20Sopenharmony_ci	struct vm_area_struct *vma = NULL;
9418c2ecf20Sopenharmony_ci	struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
9428c2ecf20Sopenharmony_ci
9438c2ecf20Sopenharmony_ci	if (flags &
9448c2ecf20Sopenharmony_ci		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
9458c2ecf20Sopenharmony_ci		return -EINVAL;
9468c2ecf20Sopenharmony_ci
9478c2ecf20Sopenharmony_ci	if (flags & MPOL_F_MEMS_ALLOWED) {
9488c2ecf20Sopenharmony_ci		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
9498c2ecf20Sopenharmony_ci			return -EINVAL;
9508c2ecf20Sopenharmony_ci		*policy = 0;	/* just so it's initialized */
9518c2ecf20Sopenharmony_ci		task_lock(current);
9528c2ecf20Sopenharmony_ci		*nmask  = cpuset_current_mems_allowed;
9538c2ecf20Sopenharmony_ci		task_unlock(current);
9548c2ecf20Sopenharmony_ci		return 0;
9558c2ecf20Sopenharmony_ci	}
9568c2ecf20Sopenharmony_ci
9578c2ecf20Sopenharmony_ci	if (flags & MPOL_F_ADDR) {
9588c2ecf20Sopenharmony_ci		/*
9598c2ecf20Sopenharmony_ci		 * Do NOT fall back to task policy if the
9608c2ecf20Sopenharmony_ci		 * vma/shared policy at addr is NULL.  We
9618c2ecf20Sopenharmony_ci		 * want to return MPOL_DEFAULT in this case.
9628c2ecf20Sopenharmony_ci		 */
9638c2ecf20Sopenharmony_ci		mmap_read_lock(mm);
9648c2ecf20Sopenharmony_ci		vma = find_vma_intersection(mm, addr, addr+1);
9658c2ecf20Sopenharmony_ci		if (!vma) {
9668c2ecf20Sopenharmony_ci			mmap_read_unlock(mm);
9678c2ecf20Sopenharmony_ci			return -EFAULT;
9688c2ecf20Sopenharmony_ci		}
9698c2ecf20Sopenharmony_ci		if (vma->vm_ops && vma->vm_ops->get_policy)
9708c2ecf20Sopenharmony_ci			pol = vma->vm_ops->get_policy(vma, addr);
9718c2ecf20Sopenharmony_ci		else
9728c2ecf20Sopenharmony_ci			pol = vma->vm_policy;
9738c2ecf20Sopenharmony_ci	} else if (addr)
9748c2ecf20Sopenharmony_ci		return -EINVAL;
9758c2ecf20Sopenharmony_ci
9768c2ecf20Sopenharmony_ci	if (!pol)
9778c2ecf20Sopenharmony_ci		pol = &default_policy;	/* indicates default behavior */
9788c2ecf20Sopenharmony_ci
9798c2ecf20Sopenharmony_ci	if (flags & MPOL_F_NODE) {
9808c2ecf20Sopenharmony_ci		if (flags & MPOL_F_ADDR) {
9818c2ecf20Sopenharmony_ci			/*
9828c2ecf20Sopenharmony_ci			 * Take a refcount on the mpol, lookup_node()
9838c2ecf20Sopenharmony_ci			 * wil drop the mmap_lock, so after calling
9848c2ecf20Sopenharmony_ci			 * lookup_node() only "pol" remains valid, "vma"
9858c2ecf20Sopenharmony_ci			 * is stale.
9868c2ecf20Sopenharmony_ci			 */
9878c2ecf20Sopenharmony_ci			pol_refcount = pol;
9888c2ecf20Sopenharmony_ci			vma = NULL;
9898c2ecf20Sopenharmony_ci			mpol_get(pol);
9908c2ecf20Sopenharmony_ci			err = lookup_node(mm, addr);
9918c2ecf20Sopenharmony_ci			if (err < 0)
9928c2ecf20Sopenharmony_ci				goto out;
9938c2ecf20Sopenharmony_ci			*policy = err;
9948c2ecf20Sopenharmony_ci		} else if (pol == current->mempolicy &&
9958c2ecf20Sopenharmony_ci				pol->mode == MPOL_INTERLEAVE) {
9968c2ecf20Sopenharmony_ci			*policy = next_node_in(current->il_prev, pol->v.nodes);
9978c2ecf20Sopenharmony_ci		} else {
9988c2ecf20Sopenharmony_ci			err = -EINVAL;
9998c2ecf20Sopenharmony_ci			goto out;
10008c2ecf20Sopenharmony_ci		}
10018c2ecf20Sopenharmony_ci	} else {
10028c2ecf20Sopenharmony_ci		*policy = pol == &default_policy ? MPOL_DEFAULT :
10038c2ecf20Sopenharmony_ci						pol->mode;
10048c2ecf20Sopenharmony_ci		/*
10058c2ecf20Sopenharmony_ci		 * Internal mempolicy flags must be masked off before exposing
10068c2ecf20Sopenharmony_ci		 * the policy to userspace.
10078c2ecf20Sopenharmony_ci		 */
10088c2ecf20Sopenharmony_ci		*policy |= (pol->flags & MPOL_MODE_FLAGS);
10098c2ecf20Sopenharmony_ci	}
10108c2ecf20Sopenharmony_ci
10118c2ecf20Sopenharmony_ci	err = 0;
10128c2ecf20Sopenharmony_ci	if (nmask) {
10138c2ecf20Sopenharmony_ci		if (mpol_store_user_nodemask(pol)) {
10148c2ecf20Sopenharmony_ci			*nmask = pol->w.user_nodemask;
10158c2ecf20Sopenharmony_ci		} else {
10168c2ecf20Sopenharmony_ci			task_lock(current);
10178c2ecf20Sopenharmony_ci			get_policy_nodemask(pol, nmask);
10188c2ecf20Sopenharmony_ci			task_unlock(current);
10198c2ecf20Sopenharmony_ci		}
10208c2ecf20Sopenharmony_ci	}
10218c2ecf20Sopenharmony_ci
10228c2ecf20Sopenharmony_ci out:
10238c2ecf20Sopenharmony_ci	mpol_cond_put(pol);
10248c2ecf20Sopenharmony_ci	if (vma)
10258c2ecf20Sopenharmony_ci		mmap_read_unlock(mm);
10268c2ecf20Sopenharmony_ci	if (pol_refcount)
10278c2ecf20Sopenharmony_ci		mpol_put(pol_refcount);
10288c2ecf20Sopenharmony_ci	return err;
10298c2ecf20Sopenharmony_ci}
10308c2ecf20Sopenharmony_ci
10318c2ecf20Sopenharmony_ci#ifdef CONFIG_MIGRATION
10328c2ecf20Sopenharmony_ci/*
10338c2ecf20Sopenharmony_ci * page migration, thp tail pages can be passed.
10348c2ecf20Sopenharmony_ci */
10358c2ecf20Sopenharmony_cistatic int migrate_page_add(struct page *page, struct list_head *pagelist,
10368c2ecf20Sopenharmony_ci				unsigned long flags)
10378c2ecf20Sopenharmony_ci{
10388c2ecf20Sopenharmony_ci	struct page *head = compound_head(page);
10398c2ecf20Sopenharmony_ci	/*
10408c2ecf20Sopenharmony_ci	 * Avoid migrating a page that is shared with others.
10418c2ecf20Sopenharmony_ci	 */
10428c2ecf20Sopenharmony_ci	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) {
10438c2ecf20Sopenharmony_ci		if (!isolate_lru_page(head)) {
10448c2ecf20Sopenharmony_ci			list_add_tail(&head->lru, pagelist);
10458c2ecf20Sopenharmony_ci			mod_node_page_state(page_pgdat(head),
10468c2ecf20Sopenharmony_ci				NR_ISOLATED_ANON + page_is_file_lru(head),
10478c2ecf20Sopenharmony_ci				thp_nr_pages(head));
10488c2ecf20Sopenharmony_ci		} else if (flags & MPOL_MF_STRICT) {
10498c2ecf20Sopenharmony_ci			/*
10508c2ecf20Sopenharmony_ci			 * Non-movable page may reach here.  And, there may be
10518c2ecf20Sopenharmony_ci			 * temporary off LRU pages or non-LRU movable pages.
10528c2ecf20Sopenharmony_ci			 * Treat them as unmovable pages since they can't be
10538c2ecf20Sopenharmony_ci			 * isolated, so they can't be moved at the moment.  It
10548c2ecf20Sopenharmony_ci			 * should return -EIO for this case too.
10558c2ecf20Sopenharmony_ci			 */
10568c2ecf20Sopenharmony_ci			return -EIO;
10578c2ecf20Sopenharmony_ci		}
10588c2ecf20Sopenharmony_ci	}
10598c2ecf20Sopenharmony_ci
10608c2ecf20Sopenharmony_ci	return 0;
10618c2ecf20Sopenharmony_ci}
10628c2ecf20Sopenharmony_ci
10638c2ecf20Sopenharmony_ci/*
10648c2ecf20Sopenharmony_ci * Migrate pages from one node to a target node.
10658c2ecf20Sopenharmony_ci * Returns error or the number of pages not migrated.
10668c2ecf20Sopenharmony_ci */
10678c2ecf20Sopenharmony_cistatic int migrate_to_node(struct mm_struct *mm, int source, int dest,
10688c2ecf20Sopenharmony_ci			   int flags)
10698c2ecf20Sopenharmony_ci{
10708c2ecf20Sopenharmony_ci	nodemask_t nmask;
10718c2ecf20Sopenharmony_ci	LIST_HEAD(pagelist);
10728c2ecf20Sopenharmony_ci	int err = 0;
10738c2ecf20Sopenharmony_ci	struct migration_target_control mtc = {
10748c2ecf20Sopenharmony_ci		.nid = dest,
10758c2ecf20Sopenharmony_ci		.gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
10768c2ecf20Sopenharmony_ci	};
10778c2ecf20Sopenharmony_ci
10788c2ecf20Sopenharmony_ci	nodes_clear(nmask);
10798c2ecf20Sopenharmony_ci	node_set(source, nmask);
10808c2ecf20Sopenharmony_ci
10818c2ecf20Sopenharmony_ci	/*
10828c2ecf20Sopenharmony_ci	 * This does not "check" the range but isolates all pages that
10838c2ecf20Sopenharmony_ci	 * need migration.  Between passing in the full user address
10848c2ecf20Sopenharmony_ci	 * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
10858c2ecf20Sopenharmony_ci	 */
10868c2ecf20Sopenharmony_ci	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
10878c2ecf20Sopenharmony_ci	queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
10888c2ecf20Sopenharmony_ci			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
10898c2ecf20Sopenharmony_ci
10908c2ecf20Sopenharmony_ci	if (!list_empty(&pagelist)) {
10918c2ecf20Sopenharmony_ci		err = migrate_pages(&pagelist, alloc_migration_target, NULL,
10928c2ecf20Sopenharmony_ci				(unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL);
10938c2ecf20Sopenharmony_ci		if (err)
10948c2ecf20Sopenharmony_ci			putback_movable_pages(&pagelist);
10958c2ecf20Sopenharmony_ci	}
10968c2ecf20Sopenharmony_ci
10978c2ecf20Sopenharmony_ci	return err;
10988c2ecf20Sopenharmony_ci}
10998c2ecf20Sopenharmony_ci
11008c2ecf20Sopenharmony_ci/*
11018c2ecf20Sopenharmony_ci * Move pages between the two nodesets so as to preserve the physical
11028c2ecf20Sopenharmony_ci * layout as much as possible.
11038c2ecf20Sopenharmony_ci *
11048c2ecf20Sopenharmony_ci * Returns the number of page that could not be moved.
11058c2ecf20Sopenharmony_ci */
11068c2ecf20Sopenharmony_ciint do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
11078c2ecf20Sopenharmony_ci		     const nodemask_t *to, int flags)
11088c2ecf20Sopenharmony_ci{
11098c2ecf20Sopenharmony_ci	int busy = 0;
11108c2ecf20Sopenharmony_ci	int err;
11118c2ecf20Sopenharmony_ci	nodemask_t tmp;
11128c2ecf20Sopenharmony_ci
11138c2ecf20Sopenharmony_ci	err = migrate_prep();
11148c2ecf20Sopenharmony_ci	if (err)
11158c2ecf20Sopenharmony_ci		return err;
11168c2ecf20Sopenharmony_ci
11178c2ecf20Sopenharmony_ci	mmap_read_lock(mm);
11188c2ecf20Sopenharmony_ci
11198c2ecf20Sopenharmony_ci	/*
11208c2ecf20Sopenharmony_ci	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
11218c2ecf20Sopenharmony_ci	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
11228c2ecf20Sopenharmony_ci	 * bit in 'tmp', and return that <source, dest> pair for migration.
11238c2ecf20Sopenharmony_ci	 * The pair of nodemasks 'to' and 'from' define the map.
11248c2ecf20Sopenharmony_ci	 *
11258c2ecf20Sopenharmony_ci	 * If no pair of bits is found that way, fallback to picking some
11268c2ecf20Sopenharmony_ci	 * pair of 'source' and 'dest' bits that are not the same.  If the
11278c2ecf20Sopenharmony_ci	 * 'source' and 'dest' bits are the same, this represents a node
11288c2ecf20Sopenharmony_ci	 * that will be migrating to itself, so no pages need move.
11298c2ecf20Sopenharmony_ci	 *
11308c2ecf20Sopenharmony_ci	 * If no bits are left in 'tmp', or if all remaining bits left
11318c2ecf20Sopenharmony_ci	 * in 'tmp' correspond to the same bit in 'to', return false
11328c2ecf20Sopenharmony_ci	 * (nothing left to migrate).
11338c2ecf20Sopenharmony_ci	 *
11348c2ecf20Sopenharmony_ci	 * This lets us pick a pair of nodes to migrate between, such that
11358c2ecf20Sopenharmony_ci	 * if possible the dest node is not already occupied by some other
11368c2ecf20Sopenharmony_ci	 * source node, minimizing the risk of overloading the memory on a
11378c2ecf20Sopenharmony_ci	 * node that would happen if we migrated incoming memory to a node
11388c2ecf20Sopenharmony_ci	 * before migrating outgoing memory source that same node.
11398c2ecf20Sopenharmony_ci	 *
11408c2ecf20Sopenharmony_ci	 * A single scan of tmp is sufficient.  As we go, we remember the
11418c2ecf20Sopenharmony_ci	 * most recent <s, d> pair that moved (s != d).  If we find a pair
11428c2ecf20Sopenharmony_ci	 * that not only moved, but what's better, moved to an empty slot
11438c2ecf20Sopenharmony_ci	 * (d is not set in tmp), then we break out then, with that pair.
11448c2ecf20Sopenharmony_ci	 * Otherwise when we finish scanning from_tmp, we at least have the
11458c2ecf20Sopenharmony_ci	 * most recent <s, d> pair that moved.  If we get all the way through
11468c2ecf20Sopenharmony_ci	 * the scan of tmp without finding any node that moved, much less
11478c2ecf20Sopenharmony_ci	 * moved to an empty node, then there is nothing left worth migrating.
11488c2ecf20Sopenharmony_ci	 */
11498c2ecf20Sopenharmony_ci
11508c2ecf20Sopenharmony_ci	tmp = *from;
11518c2ecf20Sopenharmony_ci	while (!nodes_empty(tmp)) {
11528c2ecf20Sopenharmony_ci		int s,d;
11538c2ecf20Sopenharmony_ci		int source = NUMA_NO_NODE;
11548c2ecf20Sopenharmony_ci		int dest = 0;
11558c2ecf20Sopenharmony_ci
11568c2ecf20Sopenharmony_ci		for_each_node_mask(s, tmp) {
11578c2ecf20Sopenharmony_ci
11588c2ecf20Sopenharmony_ci			/*
11598c2ecf20Sopenharmony_ci			 * do_migrate_pages() tries to maintain the relative
11608c2ecf20Sopenharmony_ci			 * node relationship of the pages established between
11618c2ecf20Sopenharmony_ci			 * threads and memory areas.
11628c2ecf20Sopenharmony_ci                         *
11638c2ecf20Sopenharmony_ci			 * However if the number of source nodes is not equal to
11648c2ecf20Sopenharmony_ci			 * the number of destination nodes we can not preserve
11658c2ecf20Sopenharmony_ci			 * this node relative relationship.  In that case, skip
11668c2ecf20Sopenharmony_ci			 * copying memory from a node that is in the destination
11678c2ecf20Sopenharmony_ci			 * mask.
11688c2ecf20Sopenharmony_ci			 *
11698c2ecf20Sopenharmony_ci			 * Example: [2,3,4] -> [3,4,5] moves everything.
11708c2ecf20Sopenharmony_ci			 *          [0-7] - > [3,4,5] moves only 0,1,2,6,7.
11718c2ecf20Sopenharmony_ci			 */
11728c2ecf20Sopenharmony_ci
11738c2ecf20Sopenharmony_ci			if ((nodes_weight(*from) != nodes_weight(*to)) &&
11748c2ecf20Sopenharmony_ci						(node_isset(s, *to)))
11758c2ecf20Sopenharmony_ci				continue;
11768c2ecf20Sopenharmony_ci
11778c2ecf20Sopenharmony_ci			d = node_remap(s, *from, *to);
11788c2ecf20Sopenharmony_ci			if (s == d)
11798c2ecf20Sopenharmony_ci				continue;
11808c2ecf20Sopenharmony_ci
11818c2ecf20Sopenharmony_ci			source = s;	/* Node moved. Memorize */
11828c2ecf20Sopenharmony_ci			dest = d;
11838c2ecf20Sopenharmony_ci
11848c2ecf20Sopenharmony_ci			/* dest not in remaining from nodes? */
11858c2ecf20Sopenharmony_ci			if (!node_isset(dest, tmp))
11868c2ecf20Sopenharmony_ci				break;
11878c2ecf20Sopenharmony_ci		}
11888c2ecf20Sopenharmony_ci		if (source == NUMA_NO_NODE)
11898c2ecf20Sopenharmony_ci			break;
11908c2ecf20Sopenharmony_ci
11918c2ecf20Sopenharmony_ci		node_clear(source, tmp);
11928c2ecf20Sopenharmony_ci		err = migrate_to_node(mm, source, dest, flags);
11938c2ecf20Sopenharmony_ci		if (err > 0)
11948c2ecf20Sopenharmony_ci			busy += err;
11958c2ecf20Sopenharmony_ci		if (err < 0)
11968c2ecf20Sopenharmony_ci			break;
11978c2ecf20Sopenharmony_ci	}
11988c2ecf20Sopenharmony_ci	mmap_read_unlock(mm);
11998c2ecf20Sopenharmony_ci	if (err < 0)
12008c2ecf20Sopenharmony_ci		return err;
12018c2ecf20Sopenharmony_ci	return busy;
12028c2ecf20Sopenharmony_ci
12038c2ecf20Sopenharmony_ci}
12048c2ecf20Sopenharmony_ci
12058c2ecf20Sopenharmony_ci/*
12068c2ecf20Sopenharmony_ci * Allocate a new page for page migration based on vma policy.
12078c2ecf20Sopenharmony_ci * Start by assuming the page is mapped by the same vma as contains @start.
12088c2ecf20Sopenharmony_ci * Search forward from there, if not.  N.B., this assumes that the
12098c2ecf20Sopenharmony_ci * list of pages handed to migrate_pages()--which is how we get here--
12108c2ecf20Sopenharmony_ci * is in virtual address order.
12118c2ecf20Sopenharmony_ci */
12128c2ecf20Sopenharmony_cistatic struct page *new_page(struct page *page, unsigned long start)
12138c2ecf20Sopenharmony_ci{
12148c2ecf20Sopenharmony_ci	struct vm_area_struct *vma;
12158c2ecf20Sopenharmony_ci	unsigned long address;
12168c2ecf20Sopenharmony_ci
12178c2ecf20Sopenharmony_ci	vma = find_vma(current->mm, start);
12188c2ecf20Sopenharmony_ci	while (vma) {
12198c2ecf20Sopenharmony_ci		address = page_address_in_vma(page, vma);
12208c2ecf20Sopenharmony_ci		if (address != -EFAULT)
12218c2ecf20Sopenharmony_ci			break;
12228c2ecf20Sopenharmony_ci		vma = vma->vm_next;
12238c2ecf20Sopenharmony_ci	}
12248c2ecf20Sopenharmony_ci
12258c2ecf20Sopenharmony_ci	if (PageHuge(page)) {
12268c2ecf20Sopenharmony_ci		return alloc_huge_page_vma(page_hstate(compound_head(page)),
12278c2ecf20Sopenharmony_ci				vma, address);
12288c2ecf20Sopenharmony_ci	} else if (PageTransHuge(page)) {
12298c2ecf20Sopenharmony_ci		struct page *thp;
12308c2ecf20Sopenharmony_ci
12318c2ecf20Sopenharmony_ci		thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
12328c2ecf20Sopenharmony_ci					 HPAGE_PMD_ORDER);
12338c2ecf20Sopenharmony_ci		if (!thp)
12348c2ecf20Sopenharmony_ci			return NULL;
12358c2ecf20Sopenharmony_ci		prep_transhuge_page(thp);
12368c2ecf20Sopenharmony_ci		return thp;
12378c2ecf20Sopenharmony_ci	}
12388c2ecf20Sopenharmony_ci	/*
12398c2ecf20Sopenharmony_ci	 * if !vma, alloc_page_vma() will use task or system default policy
12408c2ecf20Sopenharmony_ci	 */
12418c2ecf20Sopenharmony_ci	return alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL,
12428c2ecf20Sopenharmony_ci			vma, address);
12438c2ecf20Sopenharmony_ci}
12448c2ecf20Sopenharmony_ci#else
12458c2ecf20Sopenharmony_ci
/*
 * Stub used on this side of the #else: no page can be queued for
 * migration, so every request is refused with -EIO.
 */
static int migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags)
{
	return -EIO;
}
12518c2ecf20Sopenharmony_ci
12528c2ecf20Sopenharmony_ciint do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
12538c2ecf20Sopenharmony_ci		     const nodemask_t *to, int flags)
12548c2ecf20Sopenharmony_ci{
12558c2ecf20Sopenharmony_ci	return -ENOSYS;
12568c2ecf20Sopenharmony_ci}
12578c2ecf20Sopenharmony_ci
/*
 * Stub used on this side of the #else: there is never a replacement page
 * to hand out, so the allocation callback always reports failure.
 */
static struct page *new_page(struct page *page, unsigned long start)
{
	return NULL;
}
12628c2ecf20Sopenharmony_ci#endif
12638c2ecf20Sopenharmony_ci
12648c2ecf20Sopenharmony_cistatic long do_mbind(unsigned long start, unsigned long len,
12658c2ecf20Sopenharmony_ci		     unsigned short mode, unsigned short mode_flags,
12668c2ecf20Sopenharmony_ci		     nodemask_t *nmask, unsigned long flags)
12678c2ecf20Sopenharmony_ci{
12688c2ecf20Sopenharmony_ci	struct mm_struct *mm = current->mm;
12698c2ecf20Sopenharmony_ci	struct mempolicy *new;
12708c2ecf20Sopenharmony_ci	unsigned long end;
12718c2ecf20Sopenharmony_ci	int err;
12728c2ecf20Sopenharmony_ci	int ret;
12738c2ecf20Sopenharmony_ci	LIST_HEAD(pagelist);
12748c2ecf20Sopenharmony_ci
12758c2ecf20Sopenharmony_ci	if (flags & ~(unsigned long)MPOL_MF_VALID)
12768c2ecf20Sopenharmony_ci		return -EINVAL;
12778c2ecf20Sopenharmony_ci	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
12788c2ecf20Sopenharmony_ci		return -EPERM;
12798c2ecf20Sopenharmony_ci
12808c2ecf20Sopenharmony_ci	if (start & ~PAGE_MASK)
12818c2ecf20Sopenharmony_ci		return -EINVAL;
12828c2ecf20Sopenharmony_ci
12838c2ecf20Sopenharmony_ci	if (mode == MPOL_DEFAULT)
12848c2ecf20Sopenharmony_ci		flags &= ~MPOL_MF_STRICT;
12858c2ecf20Sopenharmony_ci
12868c2ecf20Sopenharmony_ci	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
12878c2ecf20Sopenharmony_ci	end = start + len;
12888c2ecf20Sopenharmony_ci
12898c2ecf20Sopenharmony_ci	if (end < start)
12908c2ecf20Sopenharmony_ci		return -EINVAL;
12918c2ecf20Sopenharmony_ci	if (end == start)
12928c2ecf20Sopenharmony_ci		return 0;
12938c2ecf20Sopenharmony_ci
12948c2ecf20Sopenharmony_ci	new = mpol_new(mode, mode_flags, nmask);
12958c2ecf20Sopenharmony_ci	if (IS_ERR(new))
12968c2ecf20Sopenharmony_ci		return PTR_ERR(new);
12978c2ecf20Sopenharmony_ci
12988c2ecf20Sopenharmony_ci	if (flags & MPOL_MF_LAZY)
12998c2ecf20Sopenharmony_ci		new->flags |= MPOL_F_MOF;
13008c2ecf20Sopenharmony_ci
13018c2ecf20Sopenharmony_ci	/*
13028c2ecf20Sopenharmony_ci	 * If we are using the default policy then operation
13038c2ecf20Sopenharmony_ci	 * on discontinuous address spaces is okay after all
13048c2ecf20Sopenharmony_ci	 */
13058c2ecf20Sopenharmony_ci	if (!new)
13068c2ecf20Sopenharmony_ci		flags |= MPOL_MF_DISCONTIG_OK;
13078c2ecf20Sopenharmony_ci
13088c2ecf20Sopenharmony_ci	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
13098c2ecf20Sopenharmony_ci		 start, start + len, mode, mode_flags,
13108c2ecf20Sopenharmony_ci		 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
13118c2ecf20Sopenharmony_ci
13128c2ecf20Sopenharmony_ci	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
13138c2ecf20Sopenharmony_ci
13148c2ecf20Sopenharmony_ci		err = migrate_prep();
13158c2ecf20Sopenharmony_ci		if (err)
13168c2ecf20Sopenharmony_ci			goto mpol_out;
13178c2ecf20Sopenharmony_ci	}
13188c2ecf20Sopenharmony_ci	{
13198c2ecf20Sopenharmony_ci		NODEMASK_SCRATCH(scratch);
13208c2ecf20Sopenharmony_ci		if (scratch) {
13218c2ecf20Sopenharmony_ci			mmap_write_lock(mm);
13228c2ecf20Sopenharmony_ci			err = mpol_set_nodemask(new, nmask, scratch);
13238c2ecf20Sopenharmony_ci			if (err)
13248c2ecf20Sopenharmony_ci				mmap_write_unlock(mm);
13258c2ecf20Sopenharmony_ci		} else
13268c2ecf20Sopenharmony_ci			err = -ENOMEM;
13278c2ecf20Sopenharmony_ci		NODEMASK_SCRATCH_FREE(scratch);
13288c2ecf20Sopenharmony_ci	}
13298c2ecf20Sopenharmony_ci	if (err)
13308c2ecf20Sopenharmony_ci		goto mpol_out;
13318c2ecf20Sopenharmony_ci
13328c2ecf20Sopenharmony_ci	ret = queue_pages_range(mm, start, end, nmask,
13338c2ecf20Sopenharmony_ci			  flags | MPOL_MF_INVERT, &pagelist);
13348c2ecf20Sopenharmony_ci
13358c2ecf20Sopenharmony_ci	if (ret < 0) {
13368c2ecf20Sopenharmony_ci		err = ret;
13378c2ecf20Sopenharmony_ci		goto up_out;
13388c2ecf20Sopenharmony_ci	}
13398c2ecf20Sopenharmony_ci
13408c2ecf20Sopenharmony_ci	err = mbind_range(mm, start, end, new);
13418c2ecf20Sopenharmony_ci
13428c2ecf20Sopenharmony_ci	if (!err) {
13438c2ecf20Sopenharmony_ci		int nr_failed = 0;
13448c2ecf20Sopenharmony_ci
13458c2ecf20Sopenharmony_ci		if (!list_empty(&pagelist)) {
13468c2ecf20Sopenharmony_ci			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
13478c2ecf20Sopenharmony_ci			nr_failed = migrate_pages(&pagelist, new_page, NULL,
13488c2ecf20Sopenharmony_ci				start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
13498c2ecf20Sopenharmony_ci			if (nr_failed)
13508c2ecf20Sopenharmony_ci				putback_movable_pages(&pagelist);
13518c2ecf20Sopenharmony_ci		}
13528c2ecf20Sopenharmony_ci
13538c2ecf20Sopenharmony_ci		if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT)))
13548c2ecf20Sopenharmony_ci			err = -EIO;
13558c2ecf20Sopenharmony_ci	} else {
13568c2ecf20Sopenharmony_ciup_out:
13578c2ecf20Sopenharmony_ci		if (!list_empty(&pagelist))
13588c2ecf20Sopenharmony_ci			putback_movable_pages(&pagelist);
13598c2ecf20Sopenharmony_ci	}
13608c2ecf20Sopenharmony_ci
13618c2ecf20Sopenharmony_ci	mmap_write_unlock(mm);
13628c2ecf20Sopenharmony_cimpol_out:
13638c2ecf20Sopenharmony_ci	mpol_put(new);
13648c2ecf20Sopenharmony_ci	return err;
13658c2ecf20Sopenharmony_ci}
13668c2ecf20Sopenharmony_ci
13678c2ecf20Sopenharmony_ci/*
13688c2ecf20Sopenharmony_ci * User space interface with variable sized bitmaps for nodelists.
13698c2ecf20Sopenharmony_ci */
13708c2ecf20Sopenharmony_ci
13718c2ecf20Sopenharmony_ci/* Copy a node mask from user space. */
13728c2ecf20Sopenharmony_cistatic int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
13738c2ecf20Sopenharmony_ci		     unsigned long maxnode)
13748c2ecf20Sopenharmony_ci{
13758c2ecf20Sopenharmony_ci	unsigned long k;
13768c2ecf20Sopenharmony_ci	unsigned long t;
13778c2ecf20Sopenharmony_ci	unsigned long nlongs;
13788c2ecf20Sopenharmony_ci	unsigned long endmask;
13798c2ecf20Sopenharmony_ci
13808c2ecf20Sopenharmony_ci	--maxnode;
13818c2ecf20Sopenharmony_ci	nodes_clear(*nodes);
13828c2ecf20Sopenharmony_ci	if (maxnode == 0 || !nmask)
13838c2ecf20Sopenharmony_ci		return 0;
13848c2ecf20Sopenharmony_ci	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
13858c2ecf20Sopenharmony_ci		return -EINVAL;
13868c2ecf20Sopenharmony_ci
13878c2ecf20Sopenharmony_ci	nlongs = BITS_TO_LONGS(maxnode);
13888c2ecf20Sopenharmony_ci	if ((maxnode % BITS_PER_LONG) == 0)
13898c2ecf20Sopenharmony_ci		endmask = ~0UL;
13908c2ecf20Sopenharmony_ci	else
13918c2ecf20Sopenharmony_ci		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
13928c2ecf20Sopenharmony_ci
13938c2ecf20Sopenharmony_ci	/*
13948c2ecf20Sopenharmony_ci	 * When the user specified more nodes than supported just check
13958c2ecf20Sopenharmony_ci	 * if the non supported part is all zero.
13968c2ecf20Sopenharmony_ci	 *
13978c2ecf20Sopenharmony_ci	 * If maxnode have more longs than MAX_NUMNODES, check
13988c2ecf20Sopenharmony_ci	 * the bits in that area first. And then go through to
13998c2ecf20Sopenharmony_ci	 * check the rest bits which equal or bigger than MAX_NUMNODES.
14008c2ecf20Sopenharmony_ci	 * Otherwise, just check bits [MAX_NUMNODES, maxnode).
14018c2ecf20Sopenharmony_ci	 */
14028c2ecf20Sopenharmony_ci	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
14038c2ecf20Sopenharmony_ci		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
14048c2ecf20Sopenharmony_ci			if (get_user(t, nmask + k))
14058c2ecf20Sopenharmony_ci				return -EFAULT;
14068c2ecf20Sopenharmony_ci			if (k == nlongs - 1) {
14078c2ecf20Sopenharmony_ci				if (t & endmask)
14088c2ecf20Sopenharmony_ci					return -EINVAL;
14098c2ecf20Sopenharmony_ci			} else if (t)
14108c2ecf20Sopenharmony_ci				return -EINVAL;
14118c2ecf20Sopenharmony_ci		}
14128c2ecf20Sopenharmony_ci		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
14138c2ecf20Sopenharmony_ci		endmask = ~0UL;
14148c2ecf20Sopenharmony_ci	}
14158c2ecf20Sopenharmony_ci
14168c2ecf20Sopenharmony_ci	if (maxnode > MAX_NUMNODES && MAX_NUMNODES % BITS_PER_LONG != 0) {
14178c2ecf20Sopenharmony_ci		unsigned long valid_mask = endmask;
14188c2ecf20Sopenharmony_ci
14198c2ecf20Sopenharmony_ci		valid_mask &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
14208c2ecf20Sopenharmony_ci		if (get_user(t, nmask + nlongs - 1))
14218c2ecf20Sopenharmony_ci			return -EFAULT;
14228c2ecf20Sopenharmony_ci		if (t & valid_mask)
14238c2ecf20Sopenharmony_ci			return -EINVAL;
14248c2ecf20Sopenharmony_ci	}
14258c2ecf20Sopenharmony_ci
14268c2ecf20Sopenharmony_ci	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
14278c2ecf20Sopenharmony_ci		return -EFAULT;
14288c2ecf20Sopenharmony_ci	nodes_addr(*nodes)[nlongs-1] &= endmask;
14298c2ecf20Sopenharmony_ci	return 0;
14308c2ecf20Sopenharmony_ci}
14318c2ecf20Sopenharmony_ci
14328c2ecf20Sopenharmony_ci/* Copy a kernel node mask to user space */
14338c2ecf20Sopenharmony_cistatic int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
14348c2ecf20Sopenharmony_ci			      nodemask_t *nodes)
14358c2ecf20Sopenharmony_ci{
14368c2ecf20Sopenharmony_ci	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
14378c2ecf20Sopenharmony_ci	unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
14388c2ecf20Sopenharmony_ci
14398c2ecf20Sopenharmony_ci	if (copy > nbytes) {
14408c2ecf20Sopenharmony_ci		if (copy > PAGE_SIZE)
14418c2ecf20Sopenharmony_ci			return -EINVAL;
14428c2ecf20Sopenharmony_ci		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
14438c2ecf20Sopenharmony_ci			return -EFAULT;
14448c2ecf20Sopenharmony_ci		copy = nbytes;
14458c2ecf20Sopenharmony_ci	}
14468c2ecf20Sopenharmony_ci	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
14478c2ecf20Sopenharmony_ci}
14488c2ecf20Sopenharmony_ci
14498c2ecf20Sopenharmony_cistatic long kernel_mbind(unsigned long start, unsigned long len,
14508c2ecf20Sopenharmony_ci			 unsigned long mode, const unsigned long __user *nmask,
14518c2ecf20Sopenharmony_ci			 unsigned long maxnode, unsigned int flags)
14528c2ecf20Sopenharmony_ci{
14538c2ecf20Sopenharmony_ci	nodemask_t nodes;
14548c2ecf20Sopenharmony_ci	int err;
14558c2ecf20Sopenharmony_ci	unsigned short mode_flags;
14568c2ecf20Sopenharmony_ci
14578c2ecf20Sopenharmony_ci	start = untagged_addr(start);
14588c2ecf20Sopenharmony_ci	mode_flags = mode & MPOL_MODE_FLAGS;
14598c2ecf20Sopenharmony_ci	mode &= ~MPOL_MODE_FLAGS;
14608c2ecf20Sopenharmony_ci	if (mode >= MPOL_MAX)
14618c2ecf20Sopenharmony_ci		return -EINVAL;
14628c2ecf20Sopenharmony_ci	if ((mode_flags & MPOL_F_STATIC_NODES) &&
14638c2ecf20Sopenharmony_ci	    (mode_flags & MPOL_F_RELATIVE_NODES))
14648c2ecf20Sopenharmony_ci		return -EINVAL;
14658c2ecf20Sopenharmony_ci	err = get_nodes(&nodes, nmask, maxnode);
14668c2ecf20Sopenharmony_ci	if (err)
14678c2ecf20Sopenharmony_ci		return err;
14688c2ecf20Sopenharmony_ci	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
14698c2ecf20Sopenharmony_ci}
14708c2ecf20Sopenharmony_ci
14718c2ecf20Sopenharmony_ciSYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
14728c2ecf20Sopenharmony_ci		unsigned long, mode, const unsigned long __user *, nmask,
14738c2ecf20Sopenharmony_ci		unsigned long, maxnode, unsigned int, flags)
14748c2ecf20Sopenharmony_ci{
14758c2ecf20Sopenharmony_ci	return kernel_mbind(start, len, mode, nmask, maxnode, flags);
14768c2ecf20Sopenharmony_ci}
14778c2ecf20Sopenharmony_ci
14788c2ecf20Sopenharmony_ci/* Set the process memory policy */
14798c2ecf20Sopenharmony_cistatic long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
14808c2ecf20Sopenharmony_ci				 unsigned long maxnode)
14818c2ecf20Sopenharmony_ci{
14828c2ecf20Sopenharmony_ci	int err;
14838c2ecf20Sopenharmony_ci	nodemask_t nodes;
14848c2ecf20Sopenharmony_ci	unsigned short flags;
14858c2ecf20Sopenharmony_ci
14868c2ecf20Sopenharmony_ci	flags = mode & MPOL_MODE_FLAGS;
14878c2ecf20Sopenharmony_ci	mode &= ~MPOL_MODE_FLAGS;
14888c2ecf20Sopenharmony_ci	if ((unsigned int)mode >= MPOL_MAX)
14898c2ecf20Sopenharmony_ci		return -EINVAL;
14908c2ecf20Sopenharmony_ci	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
14918c2ecf20Sopenharmony_ci		return -EINVAL;
14928c2ecf20Sopenharmony_ci	err = get_nodes(&nodes, nmask, maxnode);
14938c2ecf20Sopenharmony_ci	if (err)
14948c2ecf20Sopenharmony_ci		return err;
14958c2ecf20Sopenharmony_ci	return do_set_mempolicy(mode, flags, &nodes);
14968c2ecf20Sopenharmony_ci}
14978c2ecf20Sopenharmony_ci
14988c2ecf20Sopenharmony_ciSYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
14998c2ecf20Sopenharmony_ci		unsigned long, maxnode)
15008c2ecf20Sopenharmony_ci{
15018c2ecf20Sopenharmony_ci	return kernel_set_mempolicy(mode, nmask, maxnode);
15028c2ecf20Sopenharmony_ci}
15038c2ecf20Sopenharmony_ci
15048c2ecf20Sopenharmony_cistatic int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
15058c2ecf20Sopenharmony_ci				const unsigned long __user *old_nodes,
15068c2ecf20Sopenharmony_ci				const unsigned long __user *new_nodes)
15078c2ecf20Sopenharmony_ci{
15088c2ecf20Sopenharmony_ci	struct mm_struct *mm = NULL;
15098c2ecf20Sopenharmony_ci	struct task_struct *task;
15108c2ecf20Sopenharmony_ci	nodemask_t task_nodes;
15118c2ecf20Sopenharmony_ci	int err;
15128c2ecf20Sopenharmony_ci	nodemask_t *old;
15138c2ecf20Sopenharmony_ci	nodemask_t *new;
15148c2ecf20Sopenharmony_ci	NODEMASK_SCRATCH(scratch);
15158c2ecf20Sopenharmony_ci
15168c2ecf20Sopenharmony_ci	if (!scratch)
15178c2ecf20Sopenharmony_ci		return -ENOMEM;
15188c2ecf20Sopenharmony_ci
15198c2ecf20Sopenharmony_ci	old = &scratch->mask1;
15208c2ecf20Sopenharmony_ci	new = &scratch->mask2;
15218c2ecf20Sopenharmony_ci
15228c2ecf20Sopenharmony_ci	err = get_nodes(old, old_nodes, maxnode);
15238c2ecf20Sopenharmony_ci	if (err)
15248c2ecf20Sopenharmony_ci		goto out;
15258c2ecf20Sopenharmony_ci
15268c2ecf20Sopenharmony_ci	err = get_nodes(new, new_nodes, maxnode);
15278c2ecf20Sopenharmony_ci	if (err)
15288c2ecf20Sopenharmony_ci		goto out;
15298c2ecf20Sopenharmony_ci
15308c2ecf20Sopenharmony_ci	/* Find the mm_struct */
15318c2ecf20Sopenharmony_ci	rcu_read_lock();
15328c2ecf20Sopenharmony_ci	task = pid ? find_task_by_vpid(pid) : current;
15338c2ecf20Sopenharmony_ci	if (!task) {
15348c2ecf20Sopenharmony_ci		rcu_read_unlock();
15358c2ecf20Sopenharmony_ci		err = -ESRCH;
15368c2ecf20Sopenharmony_ci		goto out;
15378c2ecf20Sopenharmony_ci	}
15388c2ecf20Sopenharmony_ci	get_task_struct(task);
15398c2ecf20Sopenharmony_ci
15408c2ecf20Sopenharmony_ci	err = -EINVAL;
15418c2ecf20Sopenharmony_ci
15428c2ecf20Sopenharmony_ci	/*
15438c2ecf20Sopenharmony_ci	 * Check if this process has the right to modify the specified process.
15448c2ecf20Sopenharmony_ci	 * Use the regular "ptrace_may_access()" checks.
15458c2ecf20Sopenharmony_ci	 */
15468c2ecf20Sopenharmony_ci	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
15478c2ecf20Sopenharmony_ci		rcu_read_unlock();
15488c2ecf20Sopenharmony_ci		err = -EPERM;
15498c2ecf20Sopenharmony_ci		goto out_put;
15508c2ecf20Sopenharmony_ci	}
15518c2ecf20Sopenharmony_ci	rcu_read_unlock();
15528c2ecf20Sopenharmony_ci
15538c2ecf20Sopenharmony_ci	task_nodes = cpuset_mems_allowed(task);
15548c2ecf20Sopenharmony_ci	/* Is the user allowed to access the target nodes? */
15558c2ecf20Sopenharmony_ci	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
15568c2ecf20Sopenharmony_ci		err = -EPERM;
15578c2ecf20Sopenharmony_ci		goto out_put;
15588c2ecf20Sopenharmony_ci	}
15598c2ecf20Sopenharmony_ci
15608c2ecf20Sopenharmony_ci	task_nodes = cpuset_mems_allowed(current);
15618c2ecf20Sopenharmony_ci	nodes_and(*new, *new, task_nodes);
15628c2ecf20Sopenharmony_ci	if (nodes_empty(*new))
15638c2ecf20Sopenharmony_ci		goto out_put;
15648c2ecf20Sopenharmony_ci
15658c2ecf20Sopenharmony_ci	err = security_task_movememory(task);
15668c2ecf20Sopenharmony_ci	if (err)
15678c2ecf20Sopenharmony_ci		goto out_put;
15688c2ecf20Sopenharmony_ci
15698c2ecf20Sopenharmony_ci	mm = get_task_mm(task);
15708c2ecf20Sopenharmony_ci	put_task_struct(task);
15718c2ecf20Sopenharmony_ci
15728c2ecf20Sopenharmony_ci	if (!mm) {
15738c2ecf20Sopenharmony_ci		err = -EINVAL;
15748c2ecf20Sopenharmony_ci		goto out;
15758c2ecf20Sopenharmony_ci	}
15768c2ecf20Sopenharmony_ci
15778c2ecf20Sopenharmony_ci	err = do_migrate_pages(mm, old, new,
15788c2ecf20Sopenharmony_ci		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
15798c2ecf20Sopenharmony_ci
15808c2ecf20Sopenharmony_ci	mmput(mm);
15818c2ecf20Sopenharmony_ciout:
15828c2ecf20Sopenharmony_ci	NODEMASK_SCRATCH_FREE(scratch);
15838c2ecf20Sopenharmony_ci
15848c2ecf20Sopenharmony_ci	return err;
15858c2ecf20Sopenharmony_ci
15868c2ecf20Sopenharmony_ciout_put:
15878c2ecf20Sopenharmony_ci	put_task_struct(task);
15888c2ecf20Sopenharmony_ci	goto out;
15898c2ecf20Sopenharmony_ci
15908c2ecf20Sopenharmony_ci}
15918c2ecf20Sopenharmony_ci
15928c2ecf20Sopenharmony_ciSYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
15938c2ecf20Sopenharmony_ci		const unsigned long __user *, old_nodes,
15948c2ecf20Sopenharmony_ci		const unsigned long __user *, new_nodes)
15958c2ecf20Sopenharmony_ci{
15968c2ecf20Sopenharmony_ci	return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
15978c2ecf20Sopenharmony_ci}
15988c2ecf20Sopenharmony_ci
15998c2ecf20Sopenharmony_ci
16008c2ecf20Sopenharmony_ci/* Retrieve NUMA policy */
16018c2ecf20Sopenharmony_cistatic int kernel_get_mempolicy(int __user *policy,
16028c2ecf20Sopenharmony_ci				unsigned long __user *nmask,
16038c2ecf20Sopenharmony_ci				unsigned long maxnode,
16048c2ecf20Sopenharmony_ci				unsigned long addr,
16058c2ecf20Sopenharmony_ci				unsigned long flags)
16068c2ecf20Sopenharmony_ci{
16078c2ecf20Sopenharmony_ci	int err;
16088c2ecf20Sopenharmony_ci	int pval;
16098c2ecf20Sopenharmony_ci	nodemask_t nodes;
16108c2ecf20Sopenharmony_ci
16118c2ecf20Sopenharmony_ci	if (nmask != NULL && maxnode < nr_node_ids)
16128c2ecf20Sopenharmony_ci		return -EINVAL;
16138c2ecf20Sopenharmony_ci
16148c2ecf20Sopenharmony_ci	addr = untagged_addr(addr);
16158c2ecf20Sopenharmony_ci
16168c2ecf20Sopenharmony_ci	err = do_get_mempolicy(&pval, &nodes, addr, flags);
16178c2ecf20Sopenharmony_ci
16188c2ecf20Sopenharmony_ci	if (err)
16198c2ecf20Sopenharmony_ci		return err;
16208c2ecf20Sopenharmony_ci
16218c2ecf20Sopenharmony_ci	if (policy && put_user(pval, policy))
16228c2ecf20Sopenharmony_ci		return -EFAULT;
16238c2ecf20Sopenharmony_ci
16248c2ecf20Sopenharmony_ci	if (nmask)
16258c2ecf20Sopenharmony_ci		err = copy_nodes_to_user(nmask, maxnode, &nodes);
16268c2ecf20Sopenharmony_ci
16278c2ecf20Sopenharmony_ci	return err;
16288c2ecf20Sopenharmony_ci}
16298c2ecf20Sopenharmony_ci
16308c2ecf20Sopenharmony_ciSYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
16318c2ecf20Sopenharmony_ci		unsigned long __user *, nmask, unsigned long, maxnode,
16328c2ecf20Sopenharmony_ci		unsigned long, addr, unsigned long, flags)
16338c2ecf20Sopenharmony_ci{
16348c2ecf20Sopenharmony_ci	return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
16358c2ecf20Sopenharmony_ci}
16368c2ecf20Sopenharmony_ci
16378c2ecf20Sopenharmony_ci#ifdef CONFIG_COMPAT
16388c2ecf20Sopenharmony_ci
16398c2ecf20Sopenharmony_ciCOMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
16408c2ecf20Sopenharmony_ci		       compat_ulong_t __user *, nmask,
16418c2ecf20Sopenharmony_ci		       compat_ulong_t, maxnode,
16428c2ecf20Sopenharmony_ci		       compat_ulong_t, addr, compat_ulong_t, flags)
16438c2ecf20Sopenharmony_ci{
16448c2ecf20Sopenharmony_ci	long err;
16458c2ecf20Sopenharmony_ci	unsigned long __user *nm = NULL;
16468c2ecf20Sopenharmony_ci	unsigned long nr_bits, alloc_size;
16478c2ecf20Sopenharmony_ci	DECLARE_BITMAP(bm, MAX_NUMNODES);
16488c2ecf20Sopenharmony_ci
16498c2ecf20Sopenharmony_ci	nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids);
16508c2ecf20Sopenharmony_ci	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
16518c2ecf20Sopenharmony_ci
16528c2ecf20Sopenharmony_ci	if (nmask)
16538c2ecf20Sopenharmony_ci		nm = compat_alloc_user_space(alloc_size);
16548c2ecf20Sopenharmony_ci
16558c2ecf20Sopenharmony_ci	err = kernel_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
16568c2ecf20Sopenharmony_ci
16578c2ecf20Sopenharmony_ci	if (!err && nmask) {
16588c2ecf20Sopenharmony_ci		unsigned long copy_size;
16598c2ecf20Sopenharmony_ci		copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
16608c2ecf20Sopenharmony_ci		err = copy_from_user(bm, nm, copy_size);
16618c2ecf20Sopenharmony_ci		/* ensure entire bitmap is zeroed */
16628c2ecf20Sopenharmony_ci		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
16638c2ecf20Sopenharmony_ci		err |= compat_put_bitmap(nmask, bm, nr_bits);
16648c2ecf20Sopenharmony_ci	}
16658c2ecf20Sopenharmony_ci
16668c2ecf20Sopenharmony_ci	return err;
16678c2ecf20Sopenharmony_ci}
16688c2ecf20Sopenharmony_ci
16698c2ecf20Sopenharmony_ciCOMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
16708c2ecf20Sopenharmony_ci		       compat_ulong_t, maxnode)
16718c2ecf20Sopenharmony_ci{
16728c2ecf20Sopenharmony_ci	unsigned long __user *nm = NULL;
16738c2ecf20Sopenharmony_ci	unsigned long nr_bits, alloc_size;
16748c2ecf20Sopenharmony_ci	DECLARE_BITMAP(bm, MAX_NUMNODES);
16758c2ecf20Sopenharmony_ci
16768c2ecf20Sopenharmony_ci	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
16778c2ecf20Sopenharmony_ci	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
16788c2ecf20Sopenharmony_ci
16798c2ecf20Sopenharmony_ci	if (nmask) {
16808c2ecf20Sopenharmony_ci		if (compat_get_bitmap(bm, nmask, nr_bits))
16818c2ecf20Sopenharmony_ci			return -EFAULT;
16828c2ecf20Sopenharmony_ci		nm = compat_alloc_user_space(alloc_size);
16838c2ecf20Sopenharmony_ci		if (copy_to_user(nm, bm, alloc_size))
16848c2ecf20Sopenharmony_ci			return -EFAULT;
16858c2ecf20Sopenharmony_ci	}
16868c2ecf20Sopenharmony_ci
16878c2ecf20Sopenharmony_ci	return kernel_set_mempolicy(mode, nm, nr_bits+1);
16888c2ecf20Sopenharmony_ci}
16898c2ecf20Sopenharmony_ci
16908c2ecf20Sopenharmony_ciCOMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
16918c2ecf20Sopenharmony_ci		       compat_ulong_t, mode, compat_ulong_t __user *, nmask,
16928c2ecf20Sopenharmony_ci		       compat_ulong_t, maxnode, compat_ulong_t, flags)
16938c2ecf20Sopenharmony_ci{
16948c2ecf20Sopenharmony_ci	unsigned long __user *nm = NULL;
16958c2ecf20Sopenharmony_ci	unsigned long nr_bits, alloc_size;
16968c2ecf20Sopenharmony_ci	nodemask_t bm;
16978c2ecf20Sopenharmony_ci
16988c2ecf20Sopenharmony_ci	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
16998c2ecf20Sopenharmony_ci	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
17008c2ecf20Sopenharmony_ci
17018c2ecf20Sopenharmony_ci	if (nmask) {
17028c2ecf20Sopenharmony_ci		if (compat_get_bitmap(nodes_addr(bm), nmask, nr_bits))
17038c2ecf20Sopenharmony_ci			return -EFAULT;
17048c2ecf20Sopenharmony_ci		nm = compat_alloc_user_space(alloc_size);
17058c2ecf20Sopenharmony_ci		if (copy_to_user(nm, nodes_addr(bm), alloc_size))
17068c2ecf20Sopenharmony_ci			return -EFAULT;
17078c2ecf20Sopenharmony_ci	}
17088c2ecf20Sopenharmony_ci
17098c2ecf20Sopenharmony_ci	return kernel_mbind(start, len, mode, nm, nr_bits+1, flags);
17108c2ecf20Sopenharmony_ci}
17118c2ecf20Sopenharmony_ci
17128c2ecf20Sopenharmony_ciCOMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
17138c2ecf20Sopenharmony_ci		       compat_ulong_t, maxnode,
17148c2ecf20Sopenharmony_ci		       const compat_ulong_t __user *, old_nodes,
17158c2ecf20Sopenharmony_ci		       const compat_ulong_t __user *, new_nodes)
17168c2ecf20Sopenharmony_ci{
17178c2ecf20Sopenharmony_ci	unsigned long __user *old = NULL;
17188c2ecf20Sopenharmony_ci	unsigned long __user *new = NULL;
17198c2ecf20Sopenharmony_ci	nodemask_t tmp_mask;
17208c2ecf20Sopenharmony_ci	unsigned long nr_bits;
17218c2ecf20Sopenharmony_ci	unsigned long size;
17228c2ecf20Sopenharmony_ci
17238c2ecf20Sopenharmony_ci	nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES);
17248c2ecf20Sopenharmony_ci	size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
17258c2ecf20Sopenharmony_ci	if (old_nodes) {
17268c2ecf20Sopenharmony_ci		if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits))
17278c2ecf20Sopenharmony_ci			return -EFAULT;
17288c2ecf20Sopenharmony_ci		old = compat_alloc_user_space(new_nodes ? size * 2 : size);
17298c2ecf20Sopenharmony_ci		if (new_nodes)
17308c2ecf20Sopenharmony_ci			new = old + size / sizeof(unsigned long);
17318c2ecf20Sopenharmony_ci		if (copy_to_user(old, nodes_addr(tmp_mask), size))
17328c2ecf20Sopenharmony_ci			return -EFAULT;
17338c2ecf20Sopenharmony_ci	}
17348c2ecf20Sopenharmony_ci	if (new_nodes) {
17358c2ecf20Sopenharmony_ci		if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits))
17368c2ecf20Sopenharmony_ci			return -EFAULT;
17378c2ecf20Sopenharmony_ci		if (new == NULL)
17388c2ecf20Sopenharmony_ci			new = compat_alloc_user_space(size);
17398c2ecf20Sopenharmony_ci		if (copy_to_user(new, nodes_addr(tmp_mask), size))
17408c2ecf20Sopenharmony_ci			return -EFAULT;
17418c2ecf20Sopenharmony_ci	}
17428c2ecf20Sopenharmony_ci	return kernel_migrate_pages(pid, nr_bits + 1, old, new);
17438c2ecf20Sopenharmony_ci}
17448c2ecf20Sopenharmony_ci
17458c2ecf20Sopenharmony_ci#endif /* CONFIG_COMPAT */
17468c2ecf20Sopenharmony_ci
17478c2ecf20Sopenharmony_cibool vma_migratable(struct vm_area_struct *vma)
17488c2ecf20Sopenharmony_ci{
17498c2ecf20Sopenharmony_ci	if (vma->vm_flags & (VM_IO | VM_PFNMAP))
17508c2ecf20Sopenharmony_ci		return false;
17518c2ecf20Sopenharmony_ci
17528c2ecf20Sopenharmony_ci	/*
17538c2ecf20Sopenharmony_ci	 * DAX device mappings require predictable access latency, so avoid
17548c2ecf20Sopenharmony_ci	 * incurring periodic faults.
17558c2ecf20Sopenharmony_ci	 */
17568c2ecf20Sopenharmony_ci	if (vma_is_dax(vma))
17578c2ecf20Sopenharmony_ci		return false;
17588c2ecf20Sopenharmony_ci
17598c2ecf20Sopenharmony_ci	if (is_vm_hugetlb_page(vma) &&
17608c2ecf20Sopenharmony_ci		!hugepage_migration_supported(hstate_vma(vma)))
17618c2ecf20Sopenharmony_ci		return false;
17628c2ecf20Sopenharmony_ci
17638c2ecf20Sopenharmony_ci	/*
17648c2ecf20Sopenharmony_ci	 * Migration allocates pages in the highest zone. If we cannot
17658c2ecf20Sopenharmony_ci	 * do so then migration (at least from node to node) is not
17668c2ecf20Sopenharmony_ci	 * possible.
17678c2ecf20Sopenharmony_ci	 */
17688c2ecf20Sopenharmony_ci	if (vma->vm_file &&
17698c2ecf20Sopenharmony_ci		gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
17708c2ecf20Sopenharmony_ci			< policy_zone)
17718c2ecf20Sopenharmony_ci		return false;
17728c2ecf20Sopenharmony_ci	return true;
17738c2ecf20Sopenharmony_ci}
17748c2ecf20Sopenharmony_ci
17758c2ecf20Sopenharmony_cistruct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
17768c2ecf20Sopenharmony_ci						unsigned long addr)
17778c2ecf20Sopenharmony_ci{
17788c2ecf20Sopenharmony_ci	struct mempolicy *pol = NULL;
17798c2ecf20Sopenharmony_ci
17808c2ecf20Sopenharmony_ci	if (vma) {
17818c2ecf20Sopenharmony_ci		if (vma->vm_ops && vma->vm_ops->get_policy) {
17828c2ecf20Sopenharmony_ci			pol = vma->vm_ops->get_policy(vma, addr);
17838c2ecf20Sopenharmony_ci		} else if (vma->vm_policy) {
17848c2ecf20Sopenharmony_ci			pol = vma->vm_policy;
17858c2ecf20Sopenharmony_ci
17868c2ecf20Sopenharmony_ci			/*
17878c2ecf20Sopenharmony_ci			 * shmem_alloc_page() passes MPOL_F_SHARED policy with
17888c2ecf20Sopenharmony_ci			 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
17898c2ecf20Sopenharmony_ci			 * count on these policies which will be dropped by
17908c2ecf20Sopenharmony_ci			 * mpol_cond_put() later
17918c2ecf20Sopenharmony_ci			 */
17928c2ecf20Sopenharmony_ci			if (mpol_needs_cond_ref(pol))
17938c2ecf20Sopenharmony_ci				mpol_get(pol);
17948c2ecf20Sopenharmony_ci		}
17958c2ecf20Sopenharmony_ci	}
17968c2ecf20Sopenharmony_ci
17978c2ecf20Sopenharmony_ci	return pol;
17988c2ecf20Sopenharmony_ci}
17998c2ecf20Sopenharmony_ci
18008c2ecf20Sopenharmony_ci/*
18018c2ecf20Sopenharmony_ci * get_vma_policy(@vma, @addr)
18028c2ecf20Sopenharmony_ci * @vma: virtual memory area whose policy is sought
18038c2ecf20Sopenharmony_ci * @addr: address in @vma for shared policy lookup
18048c2ecf20Sopenharmony_ci *
18058c2ecf20Sopenharmony_ci * Returns effective policy for a VMA at specified address.
18068c2ecf20Sopenharmony_ci * Falls back to current->mempolicy or system default policy, as necessary.
18078c2ecf20Sopenharmony_ci * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
18088c2ecf20Sopenharmony_ci * count--added by the get_policy() vm_op, as appropriate--to protect against
18098c2ecf20Sopenharmony_ci * freeing by another task.  It is the caller's responsibility to free the
18108c2ecf20Sopenharmony_ci * extra reference for shared policies.
18118c2ecf20Sopenharmony_ci */
18128c2ecf20Sopenharmony_cistatic struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
18138c2ecf20Sopenharmony_ci						unsigned long addr)
18148c2ecf20Sopenharmony_ci{
18158c2ecf20Sopenharmony_ci	struct mempolicy *pol = __get_vma_policy(vma, addr);
18168c2ecf20Sopenharmony_ci
18178c2ecf20Sopenharmony_ci	if (!pol)
18188c2ecf20Sopenharmony_ci		pol = get_task_policy(current);
18198c2ecf20Sopenharmony_ci
18208c2ecf20Sopenharmony_ci	return pol;
18218c2ecf20Sopenharmony_ci}
18228c2ecf20Sopenharmony_ci
18238c2ecf20Sopenharmony_cibool vma_policy_mof(struct vm_area_struct *vma)
18248c2ecf20Sopenharmony_ci{
18258c2ecf20Sopenharmony_ci	struct mempolicy *pol;
18268c2ecf20Sopenharmony_ci
18278c2ecf20Sopenharmony_ci	if (vma->vm_ops && vma->vm_ops->get_policy) {
18288c2ecf20Sopenharmony_ci		bool ret = false;
18298c2ecf20Sopenharmony_ci
18308c2ecf20Sopenharmony_ci		pol = vma->vm_ops->get_policy(vma, vma->vm_start);
18318c2ecf20Sopenharmony_ci		if (pol && (pol->flags & MPOL_F_MOF))
18328c2ecf20Sopenharmony_ci			ret = true;
18338c2ecf20Sopenharmony_ci		mpol_cond_put(pol);
18348c2ecf20Sopenharmony_ci
18358c2ecf20Sopenharmony_ci		return ret;
18368c2ecf20Sopenharmony_ci	}
18378c2ecf20Sopenharmony_ci
18388c2ecf20Sopenharmony_ci	pol = vma->vm_policy;
18398c2ecf20Sopenharmony_ci	if (!pol)
18408c2ecf20Sopenharmony_ci		pol = get_task_policy(current);
18418c2ecf20Sopenharmony_ci
18428c2ecf20Sopenharmony_ci	return pol->flags & MPOL_F_MOF;
18438c2ecf20Sopenharmony_ci}
18448c2ecf20Sopenharmony_ci
18458c2ecf20Sopenharmony_cistatic int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
18468c2ecf20Sopenharmony_ci{
18478c2ecf20Sopenharmony_ci	enum zone_type dynamic_policy_zone = policy_zone;
18488c2ecf20Sopenharmony_ci
18498c2ecf20Sopenharmony_ci	BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
18508c2ecf20Sopenharmony_ci
18518c2ecf20Sopenharmony_ci	/*
18528c2ecf20Sopenharmony_ci	 * if policy->v.nodes has movable memory only,
18538c2ecf20Sopenharmony_ci	 * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
18548c2ecf20Sopenharmony_ci	 *
18558c2ecf20Sopenharmony_ci	 * policy->v.nodes is intersect with node_states[N_MEMORY].
18568c2ecf20Sopenharmony_ci	 * so if the following test faile, it implies
18578c2ecf20Sopenharmony_ci	 * policy->v.nodes has movable memory only.
18588c2ecf20Sopenharmony_ci	 */
18598c2ecf20Sopenharmony_ci	if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
18608c2ecf20Sopenharmony_ci		dynamic_policy_zone = ZONE_MOVABLE;
18618c2ecf20Sopenharmony_ci
18628c2ecf20Sopenharmony_ci	return zone >= dynamic_policy_zone;
18638c2ecf20Sopenharmony_ci}
18648c2ecf20Sopenharmony_ci
18658c2ecf20Sopenharmony_ci/*
18668c2ecf20Sopenharmony_ci * Return a nodemask representing a mempolicy for filtering nodes for
18678c2ecf20Sopenharmony_ci * page allocation
18688c2ecf20Sopenharmony_ci */
18698c2ecf20Sopenharmony_cinodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
18708c2ecf20Sopenharmony_ci{
18718c2ecf20Sopenharmony_ci	/* Lower zones don't get a nodemask applied for MPOL_BIND */
18728c2ecf20Sopenharmony_ci	if (unlikely(policy->mode == MPOL_BIND) &&
18738c2ecf20Sopenharmony_ci			apply_policy_zone(policy, gfp_zone(gfp)) &&
18748c2ecf20Sopenharmony_ci			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
18758c2ecf20Sopenharmony_ci		return &policy->v.nodes;
18768c2ecf20Sopenharmony_ci
18778c2ecf20Sopenharmony_ci	return NULL;
18788c2ecf20Sopenharmony_ci}
18798c2ecf20Sopenharmony_ci
18808c2ecf20Sopenharmony_ci/* Return the node id preferred by the given mempolicy, or the given id */
18818c2ecf20Sopenharmony_cistatic int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
18828c2ecf20Sopenharmony_ci{
18838c2ecf20Sopenharmony_ci	if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
18848c2ecf20Sopenharmony_ci		nd = policy->v.preferred_node;
18858c2ecf20Sopenharmony_ci	else {
18868c2ecf20Sopenharmony_ci		/*
18878c2ecf20Sopenharmony_ci		 * __GFP_THISNODE shouldn't even be used with the bind policy
18888c2ecf20Sopenharmony_ci		 * because we might easily break the expectation to stay on the
18898c2ecf20Sopenharmony_ci		 * requested node and not break the policy.
18908c2ecf20Sopenharmony_ci		 */
18918c2ecf20Sopenharmony_ci		WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
18928c2ecf20Sopenharmony_ci	}
18938c2ecf20Sopenharmony_ci
18948c2ecf20Sopenharmony_ci	return nd;
18958c2ecf20Sopenharmony_ci}
18968c2ecf20Sopenharmony_ci
18978c2ecf20Sopenharmony_ci/* Do dynamic interleaving for a process */
18988c2ecf20Sopenharmony_cistatic unsigned interleave_nodes(struct mempolicy *policy)
18998c2ecf20Sopenharmony_ci{
19008c2ecf20Sopenharmony_ci	unsigned next;
19018c2ecf20Sopenharmony_ci	struct task_struct *me = current;
19028c2ecf20Sopenharmony_ci
19038c2ecf20Sopenharmony_ci	next = next_node_in(me->il_prev, policy->v.nodes);
19048c2ecf20Sopenharmony_ci	if (next < MAX_NUMNODES)
19058c2ecf20Sopenharmony_ci		me->il_prev = next;
19068c2ecf20Sopenharmony_ci	return next;
19078c2ecf20Sopenharmony_ci}
19088c2ecf20Sopenharmony_ci
19098c2ecf20Sopenharmony_ci/*
19108c2ecf20Sopenharmony_ci * Depending on the memory policy provide a node from which to allocate the
19118c2ecf20Sopenharmony_ci * next slab entry.
19128c2ecf20Sopenharmony_ci */
19138c2ecf20Sopenharmony_ciunsigned int mempolicy_slab_node(void)
19148c2ecf20Sopenharmony_ci{
19158c2ecf20Sopenharmony_ci	struct mempolicy *policy;
19168c2ecf20Sopenharmony_ci	int node = numa_mem_id();
19178c2ecf20Sopenharmony_ci
19188c2ecf20Sopenharmony_ci	if (in_interrupt())
19198c2ecf20Sopenharmony_ci		return node;
19208c2ecf20Sopenharmony_ci
19218c2ecf20Sopenharmony_ci	policy = current->mempolicy;
19228c2ecf20Sopenharmony_ci	if (!policy || policy->flags & MPOL_F_LOCAL)
19238c2ecf20Sopenharmony_ci		return node;
19248c2ecf20Sopenharmony_ci
19258c2ecf20Sopenharmony_ci	switch (policy->mode) {
19268c2ecf20Sopenharmony_ci	case MPOL_PREFERRED:
19278c2ecf20Sopenharmony_ci		/*
19288c2ecf20Sopenharmony_ci		 * handled MPOL_F_LOCAL above
19298c2ecf20Sopenharmony_ci		 */
19308c2ecf20Sopenharmony_ci		return policy->v.preferred_node;
19318c2ecf20Sopenharmony_ci
19328c2ecf20Sopenharmony_ci	case MPOL_INTERLEAVE:
19338c2ecf20Sopenharmony_ci		return interleave_nodes(policy);
19348c2ecf20Sopenharmony_ci
19358c2ecf20Sopenharmony_ci	case MPOL_BIND: {
19368c2ecf20Sopenharmony_ci		struct zoneref *z;
19378c2ecf20Sopenharmony_ci
19388c2ecf20Sopenharmony_ci		/*
19398c2ecf20Sopenharmony_ci		 * Follow bind policy behavior and start allocation at the
19408c2ecf20Sopenharmony_ci		 * first node.
19418c2ecf20Sopenharmony_ci		 */
19428c2ecf20Sopenharmony_ci		struct zonelist *zonelist;
19438c2ecf20Sopenharmony_ci		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
19448c2ecf20Sopenharmony_ci		zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
19458c2ecf20Sopenharmony_ci		z = first_zones_zonelist(zonelist, highest_zoneidx,
19468c2ecf20Sopenharmony_ci							&policy->v.nodes);
19478c2ecf20Sopenharmony_ci		return z->zone ? zone_to_nid(z->zone) : node;
19488c2ecf20Sopenharmony_ci	}
19498c2ecf20Sopenharmony_ci
19508c2ecf20Sopenharmony_ci	default:
19518c2ecf20Sopenharmony_ci		BUG();
19528c2ecf20Sopenharmony_ci	}
19538c2ecf20Sopenharmony_ci}
19548c2ecf20Sopenharmony_ci
19558c2ecf20Sopenharmony_ci/*
19568c2ecf20Sopenharmony_ci * Do static interleaving for a VMA with known offset @n.  Returns the n'th
19578c2ecf20Sopenharmony_ci * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the
19588c2ecf20Sopenharmony_ci * number of present nodes.
19598c2ecf20Sopenharmony_ci */
19608c2ecf20Sopenharmony_cistatic unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
19618c2ecf20Sopenharmony_ci{
19628c2ecf20Sopenharmony_ci	unsigned nnodes = nodes_weight(pol->v.nodes);
19638c2ecf20Sopenharmony_ci	unsigned target;
19648c2ecf20Sopenharmony_ci	int i;
19658c2ecf20Sopenharmony_ci	int nid;
19668c2ecf20Sopenharmony_ci
19678c2ecf20Sopenharmony_ci	if (!nnodes)
19688c2ecf20Sopenharmony_ci		return numa_node_id();
19698c2ecf20Sopenharmony_ci	target = (unsigned int)n % nnodes;
19708c2ecf20Sopenharmony_ci	nid = first_node(pol->v.nodes);
19718c2ecf20Sopenharmony_ci	for (i = 0; i < target; i++)
19728c2ecf20Sopenharmony_ci		nid = next_node(nid, pol->v.nodes);
19738c2ecf20Sopenharmony_ci	return nid;
19748c2ecf20Sopenharmony_ci}
19758c2ecf20Sopenharmony_ci
19768c2ecf20Sopenharmony_ci/* Determine a node number for interleave */
19778c2ecf20Sopenharmony_cistatic inline unsigned interleave_nid(struct mempolicy *pol,
19788c2ecf20Sopenharmony_ci		 struct vm_area_struct *vma, unsigned long addr, int shift)
19798c2ecf20Sopenharmony_ci{
19808c2ecf20Sopenharmony_ci	if (vma) {
19818c2ecf20Sopenharmony_ci		unsigned long off;
19828c2ecf20Sopenharmony_ci
19838c2ecf20Sopenharmony_ci		/*
19848c2ecf20Sopenharmony_ci		 * for small pages, there is no difference between
19858c2ecf20Sopenharmony_ci		 * shift and PAGE_SHIFT, so the bit-shift is safe.
19868c2ecf20Sopenharmony_ci		 * for huge pages, since vm_pgoff is in units of small
19878c2ecf20Sopenharmony_ci		 * pages, we need to shift off the always 0 bits to get
19888c2ecf20Sopenharmony_ci		 * a useful offset.
19898c2ecf20Sopenharmony_ci		 */
19908c2ecf20Sopenharmony_ci		BUG_ON(shift < PAGE_SHIFT);
19918c2ecf20Sopenharmony_ci		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
19928c2ecf20Sopenharmony_ci		off += (addr - vma->vm_start) >> shift;
19938c2ecf20Sopenharmony_ci		return offset_il_node(pol, off);
19948c2ecf20Sopenharmony_ci	} else
19958c2ecf20Sopenharmony_ci		return interleave_nodes(pol);
19968c2ecf20Sopenharmony_ci}
19978c2ecf20Sopenharmony_ci
19988c2ecf20Sopenharmony_ci#ifdef CONFIG_HUGETLBFS
19998c2ecf20Sopenharmony_ci/*
20008c2ecf20Sopenharmony_ci * huge_node(@vma, @addr, @gfp_flags, @mpol)
20018c2ecf20Sopenharmony_ci * @vma: virtual memory area whose policy is sought
20028c2ecf20Sopenharmony_ci * @addr: address in @vma for shared policy lookup and interleave policy
20038c2ecf20Sopenharmony_ci * @gfp_flags: for requested zone
20048c2ecf20Sopenharmony_ci * @mpol: pointer to mempolicy pointer for reference counted mempolicy
20058c2ecf20Sopenharmony_ci * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
20068c2ecf20Sopenharmony_ci *
20078c2ecf20Sopenharmony_ci * Returns a nid suitable for a huge page allocation and a pointer
20088c2ecf20Sopenharmony_ci * to the struct mempolicy for conditional unref after allocation.
20098c2ecf20Sopenharmony_ci * If the effective policy is 'BIND, returns a pointer to the mempolicy's
20108c2ecf20Sopenharmony_ci * @nodemask for filtering the zonelist.
20118c2ecf20Sopenharmony_ci *
20128c2ecf20Sopenharmony_ci * Must be protected by read_mems_allowed_begin()
20138c2ecf20Sopenharmony_ci */
20148c2ecf20Sopenharmony_ciint huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
20158c2ecf20Sopenharmony_ci				struct mempolicy **mpol, nodemask_t **nodemask)
20168c2ecf20Sopenharmony_ci{
20178c2ecf20Sopenharmony_ci	int nid;
20188c2ecf20Sopenharmony_ci
20198c2ecf20Sopenharmony_ci	*mpol = get_vma_policy(vma, addr);
20208c2ecf20Sopenharmony_ci	*nodemask = NULL;	/* assume !MPOL_BIND */
20218c2ecf20Sopenharmony_ci
20228c2ecf20Sopenharmony_ci	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
20238c2ecf20Sopenharmony_ci		nid = interleave_nid(*mpol, vma, addr,
20248c2ecf20Sopenharmony_ci					huge_page_shift(hstate_vma(vma)));
20258c2ecf20Sopenharmony_ci	} else {
20268c2ecf20Sopenharmony_ci		nid = policy_node(gfp_flags, *mpol, numa_node_id());
20278c2ecf20Sopenharmony_ci		if ((*mpol)->mode == MPOL_BIND)
20288c2ecf20Sopenharmony_ci			*nodemask = &(*mpol)->v.nodes;
20298c2ecf20Sopenharmony_ci	}
20308c2ecf20Sopenharmony_ci	return nid;
20318c2ecf20Sopenharmony_ci}
20328c2ecf20Sopenharmony_ci
20338c2ecf20Sopenharmony_ci/*
20348c2ecf20Sopenharmony_ci * init_nodemask_of_mempolicy
20358c2ecf20Sopenharmony_ci *
20368c2ecf20Sopenharmony_ci * If the current task's mempolicy is "default" [NULL], return 'false'
20378c2ecf20Sopenharmony_ci * to indicate default policy.  Otherwise, extract the policy nodemask
20388c2ecf20Sopenharmony_ci * for 'bind' or 'interleave' policy into the argument nodemask, or
20398c2ecf20Sopenharmony_ci * initialize the argument nodemask to contain the single node for
20408c2ecf20Sopenharmony_ci * 'preferred' or 'local' policy and return 'true' to indicate presence
20418c2ecf20Sopenharmony_ci * of non-default mempolicy.
20428c2ecf20Sopenharmony_ci *
20438c2ecf20Sopenharmony_ci * We don't bother with reference counting the mempolicy [mpol_get/put]
20448c2ecf20Sopenharmony_ci * because the current task is examining it's own mempolicy and a task's
20458c2ecf20Sopenharmony_ci * mempolicy is only ever changed by the task itself.
20468c2ecf20Sopenharmony_ci *
20478c2ecf20Sopenharmony_ci * N.B., it is the caller's responsibility to free a returned nodemask.
20488c2ecf20Sopenharmony_ci */
20498c2ecf20Sopenharmony_cibool init_nodemask_of_mempolicy(nodemask_t *mask)
20508c2ecf20Sopenharmony_ci{
20518c2ecf20Sopenharmony_ci	struct mempolicy *mempolicy;
20528c2ecf20Sopenharmony_ci	int nid;
20538c2ecf20Sopenharmony_ci
20548c2ecf20Sopenharmony_ci	if (!(mask && current->mempolicy))
20558c2ecf20Sopenharmony_ci		return false;
20568c2ecf20Sopenharmony_ci
20578c2ecf20Sopenharmony_ci	task_lock(current);
20588c2ecf20Sopenharmony_ci	mempolicy = current->mempolicy;
20598c2ecf20Sopenharmony_ci	switch (mempolicy->mode) {
20608c2ecf20Sopenharmony_ci	case MPOL_PREFERRED:
20618c2ecf20Sopenharmony_ci		if (mempolicy->flags & MPOL_F_LOCAL)
20628c2ecf20Sopenharmony_ci			nid = numa_node_id();
20638c2ecf20Sopenharmony_ci		else
20648c2ecf20Sopenharmony_ci			nid = mempolicy->v.preferred_node;
20658c2ecf20Sopenharmony_ci		init_nodemask_of_node(mask, nid);
20668c2ecf20Sopenharmony_ci		break;
20678c2ecf20Sopenharmony_ci
20688c2ecf20Sopenharmony_ci	case MPOL_BIND:
20698c2ecf20Sopenharmony_ci	case MPOL_INTERLEAVE:
20708c2ecf20Sopenharmony_ci		*mask =  mempolicy->v.nodes;
20718c2ecf20Sopenharmony_ci		break;
20728c2ecf20Sopenharmony_ci
20738c2ecf20Sopenharmony_ci	default:
20748c2ecf20Sopenharmony_ci		BUG();
20758c2ecf20Sopenharmony_ci	}
20768c2ecf20Sopenharmony_ci	task_unlock(current);
20778c2ecf20Sopenharmony_ci
20788c2ecf20Sopenharmony_ci	return true;
20798c2ecf20Sopenharmony_ci}
20808c2ecf20Sopenharmony_ci#endif
20818c2ecf20Sopenharmony_ci
20828c2ecf20Sopenharmony_ci/*
20838c2ecf20Sopenharmony_ci * mempolicy_nodemask_intersects
20848c2ecf20Sopenharmony_ci *
20858c2ecf20Sopenharmony_ci * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
20868c2ecf20Sopenharmony_ci * policy.  Otherwise, check for intersection between mask and the policy
20878c2ecf20Sopenharmony_ci * nodemask for 'bind' or 'interleave' policy.  For 'perferred' or 'local'
20888c2ecf20Sopenharmony_ci * policy, always return true since it may allocate elsewhere on fallback.
20898c2ecf20Sopenharmony_ci *
20908c2ecf20Sopenharmony_ci * Takes task_lock(tsk) to prevent freeing of its mempolicy.
20918c2ecf20Sopenharmony_ci */
20928c2ecf20Sopenharmony_cibool mempolicy_nodemask_intersects(struct task_struct *tsk,
20938c2ecf20Sopenharmony_ci					const nodemask_t *mask)
20948c2ecf20Sopenharmony_ci{
20958c2ecf20Sopenharmony_ci	struct mempolicy *mempolicy;
20968c2ecf20Sopenharmony_ci	bool ret = true;
20978c2ecf20Sopenharmony_ci
20988c2ecf20Sopenharmony_ci	if (!mask)
20998c2ecf20Sopenharmony_ci		return ret;
21008c2ecf20Sopenharmony_ci	task_lock(tsk);
21018c2ecf20Sopenharmony_ci	mempolicy = tsk->mempolicy;
21028c2ecf20Sopenharmony_ci	if (!mempolicy)
21038c2ecf20Sopenharmony_ci		goto out;
21048c2ecf20Sopenharmony_ci
21058c2ecf20Sopenharmony_ci	switch (mempolicy->mode) {
21068c2ecf20Sopenharmony_ci	case MPOL_PREFERRED:
21078c2ecf20Sopenharmony_ci		/*
21088c2ecf20Sopenharmony_ci		 * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
21098c2ecf20Sopenharmony_ci		 * allocate from, they may fallback to other nodes when oom.
21108c2ecf20Sopenharmony_ci		 * Thus, it's possible for tsk to have allocated memory from
21118c2ecf20Sopenharmony_ci		 * nodes in mask.
21128c2ecf20Sopenharmony_ci		 */
21138c2ecf20Sopenharmony_ci		break;
21148c2ecf20Sopenharmony_ci	case MPOL_BIND:
21158c2ecf20Sopenharmony_ci	case MPOL_INTERLEAVE:
21168c2ecf20Sopenharmony_ci		ret = nodes_intersects(mempolicy->v.nodes, *mask);
21178c2ecf20Sopenharmony_ci		break;
21188c2ecf20Sopenharmony_ci	default:
21198c2ecf20Sopenharmony_ci		BUG();
21208c2ecf20Sopenharmony_ci	}
21218c2ecf20Sopenharmony_ciout:
21228c2ecf20Sopenharmony_ci	task_unlock(tsk);
21238c2ecf20Sopenharmony_ci	return ret;
21248c2ecf20Sopenharmony_ci}
21258c2ecf20Sopenharmony_ci
21268c2ecf20Sopenharmony_ci/* Allocate a page in interleaved policy.
21278c2ecf20Sopenharmony_ci   Own path because it needs to do special accounting. */
21288c2ecf20Sopenharmony_cistatic struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
21298c2ecf20Sopenharmony_ci					unsigned nid)
21308c2ecf20Sopenharmony_ci{
21318c2ecf20Sopenharmony_ci	struct page *page;
21328c2ecf20Sopenharmony_ci
21338c2ecf20Sopenharmony_ci	page = __alloc_pages(gfp, order, nid);
21348c2ecf20Sopenharmony_ci	/* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */
21358c2ecf20Sopenharmony_ci	if (!static_branch_likely(&vm_numa_stat_key))
21368c2ecf20Sopenharmony_ci		return page;
21378c2ecf20Sopenharmony_ci	if (page && page_to_nid(page) == nid) {
21388c2ecf20Sopenharmony_ci		preempt_disable();
21398c2ecf20Sopenharmony_ci		__inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT);
21408c2ecf20Sopenharmony_ci		preempt_enable();
21418c2ecf20Sopenharmony_ci	}
21428c2ecf20Sopenharmony_ci	return page;
21438c2ecf20Sopenharmony_ci}
21448c2ecf20Sopenharmony_ci
21458c2ecf20Sopenharmony_ci/**
21468c2ecf20Sopenharmony_ci * 	alloc_pages_vma	- Allocate a page for a VMA.
21478c2ecf20Sopenharmony_ci *
21488c2ecf20Sopenharmony_ci * 	@gfp:
21498c2ecf20Sopenharmony_ci *      %GFP_USER    user allocation.
21508c2ecf20Sopenharmony_ci *      %GFP_KERNEL  kernel allocations,
21518c2ecf20Sopenharmony_ci *      %GFP_HIGHMEM highmem/user allocations,
21528c2ecf20Sopenharmony_ci *      %GFP_FS      allocation should not call back into a file system.
21538c2ecf20Sopenharmony_ci *      %GFP_ATOMIC  don't sleep.
21548c2ecf20Sopenharmony_ci *
21558c2ecf20Sopenharmony_ci *	@order:Order of the GFP allocation.
21568c2ecf20Sopenharmony_ci * 	@vma:  Pointer to VMA or NULL if not available.
21578c2ecf20Sopenharmony_ci *	@addr: Virtual Address of the allocation. Must be inside the VMA.
21588c2ecf20Sopenharmony_ci *	@node: Which node to prefer for allocation (modulo policy).
21598c2ecf20Sopenharmony_ci *	@hugepage: for hugepages try only the preferred node if possible
21608c2ecf20Sopenharmony_ci *
21618c2ecf20Sopenharmony_ci * 	This function allocates a page from the kernel page pool and applies
21628c2ecf20Sopenharmony_ci *	a NUMA policy associated with the VMA or the current process.
21638c2ecf20Sopenharmony_ci *	When VMA is not NULL caller must read-lock the mmap_lock of the
21648c2ecf20Sopenharmony_ci *	mm_struct of the VMA to prevent it from going away. Should be used for
21658c2ecf20Sopenharmony_ci *	all allocations for pages that will be mapped into user space. Returns
21668c2ecf20Sopenharmony_ci *	NULL when no page can be allocated.
21678c2ecf20Sopenharmony_ci */
21688c2ecf20Sopenharmony_cistruct page *
21698c2ecf20Sopenharmony_cialloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
21708c2ecf20Sopenharmony_ci		unsigned long addr, int node, bool hugepage)
21718c2ecf20Sopenharmony_ci{
21728c2ecf20Sopenharmony_ci	struct mempolicy *pol;
21738c2ecf20Sopenharmony_ci	struct page *page;
21748c2ecf20Sopenharmony_ci	int preferred_nid;
21758c2ecf20Sopenharmony_ci	nodemask_t *nmask;
21768c2ecf20Sopenharmony_ci
21778c2ecf20Sopenharmony_ci	pol = get_vma_policy(vma, addr);
21788c2ecf20Sopenharmony_ci
21798c2ecf20Sopenharmony_ci	if (pol->mode == MPOL_INTERLEAVE) {
21808c2ecf20Sopenharmony_ci		unsigned nid;
21818c2ecf20Sopenharmony_ci
21828c2ecf20Sopenharmony_ci		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
21838c2ecf20Sopenharmony_ci		mpol_cond_put(pol);
21848c2ecf20Sopenharmony_ci		page = alloc_page_interleave(gfp, order, nid);
21858c2ecf20Sopenharmony_ci		goto out;
21868c2ecf20Sopenharmony_ci	}
21878c2ecf20Sopenharmony_ci
21888c2ecf20Sopenharmony_ci	if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
21898c2ecf20Sopenharmony_ci		int hpage_node = node;
21908c2ecf20Sopenharmony_ci
21918c2ecf20Sopenharmony_ci		/*
21928c2ecf20Sopenharmony_ci		 * For hugepage allocation and non-interleave policy which
21938c2ecf20Sopenharmony_ci		 * allows the current node (or other explicitly preferred
21948c2ecf20Sopenharmony_ci		 * node) we only try to allocate from the current/preferred
21958c2ecf20Sopenharmony_ci		 * node and don't fall back to other nodes, as the cost of
21968c2ecf20Sopenharmony_ci		 * remote accesses would likely offset THP benefits.
21978c2ecf20Sopenharmony_ci		 *
21988c2ecf20Sopenharmony_ci		 * If the policy is interleave, or does not allow the current
21998c2ecf20Sopenharmony_ci		 * node in its nodemask, we allocate the standard way.
22008c2ecf20Sopenharmony_ci		 */
22018c2ecf20Sopenharmony_ci		if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL))
22028c2ecf20Sopenharmony_ci			hpage_node = pol->v.preferred_node;
22038c2ecf20Sopenharmony_ci
22048c2ecf20Sopenharmony_ci		nmask = policy_nodemask(gfp, pol);
22058c2ecf20Sopenharmony_ci		if (!nmask || node_isset(hpage_node, *nmask)) {
22068c2ecf20Sopenharmony_ci			mpol_cond_put(pol);
22078c2ecf20Sopenharmony_ci			/*
22088c2ecf20Sopenharmony_ci			 * First, try to allocate THP only on local node, but
22098c2ecf20Sopenharmony_ci			 * don't reclaim unnecessarily, just compact.
22108c2ecf20Sopenharmony_ci			 */
22118c2ecf20Sopenharmony_ci			page = __alloc_pages_node(hpage_node,
22128c2ecf20Sopenharmony_ci				gfp | __GFP_THISNODE | __GFP_NORETRY, order);
22138c2ecf20Sopenharmony_ci
22148c2ecf20Sopenharmony_ci			/*
22158c2ecf20Sopenharmony_ci			 * If hugepage allocations are configured to always
22168c2ecf20Sopenharmony_ci			 * synchronous compact or the vma has been madvised
22178c2ecf20Sopenharmony_ci			 * to prefer hugepage backing, retry allowing remote
22188c2ecf20Sopenharmony_ci			 * memory with both reclaim and compact as well.
22198c2ecf20Sopenharmony_ci			 */
22208c2ecf20Sopenharmony_ci			if (!page && (gfp & __GFP_DIRECT_RECLAIM))
22218c2ecf20Sopenharmony_ci				page = __alloc_pages_nodemask(gfp, order,
22228c2ecf20Sopenharmony_ci							hpage_node, nmask);
22238c2ecf20Sopenharmony_ci
22248c2ecf20Sopenharmony_ci			goto out;
22258c2ecf20Sopenharmony_ci		}
22268c2ecf20Sopenharmony_ci	}
22278c2ecf20Sopenharmony_ci
22288c2ecf20Sopenharmony_ci	nmask = policy_nodemask(gfp, pol);
22298c2ecf20Sopenharmony_ci	preferred_nid = policy_node(gfp, pol, node);
22308c2ecf20Sopenharmony_ci	page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
22318c2ecf20Sopenharmony_ci	mpol_cond_put(pol);
22328c2ecf20Sopenharmony_ciout:
22338c2ecf20Sopenharmony_ci	return page;
22348c2ecf20Sopenharmony_ci}
22358c2ecf20Sopenharmony_ciEXPORT_SYMBOL(alloc_pages_vma);
22368c2ecf20Sopenharmony_ci
22378c2ecf20Sopenharmony_ci/**
22388c2ecf20Sopenharmony_ci * 	alloc_pages_current - Allocate pages.
22398c2ecf20Sopenharmony_ci *
22408c2ecf20Sopenharmony_ci *	@gfp:
22418c2ecf20Sopenharmony_ci *		%GFP_USER   user allocation,
22428c2ecf20Sopenharmony_ci *      	%GFP_KERNEL kernel allocation,
22438c2ecf20Sopenharmony_ci *      	%GFP_HIGHMEM highmem allocation,
22448c2ecf20Sopenharmony_ci *      	%GFP_FS     don't call back into a file system.
22458c2ecf20Sopenharmony_ci *      	%GFP_ATOMIC don't sleep.
22468c2ecf20Sopenharmony_ci *	@order: Power of two of allocation size in pages. 0 is a single page.
22478c2ecf20Sopenharmony_ci *
22488c2ecf20Sopenharmony_ci *	Allocate a page from the kernel page pool.  When not in
22498c2ecf20Sopenharmony_ci *	interrupt context and apply the current process NUMA policy.
22508c2ecf20Sopenharmony_ci *	Returns NULL when no page can be allocated.
22518c2ecf20Sopenharmony_ci */
22528c2ecf20Sopenharmony_cistruct page *alloc_pages_current(gfp_t gfp, unsigned order)
22538c2ecf20Sopenharmony_ci{
22548c2ecf20Sopenharmony_ci	struct mempolicy *pol = &default_policy;
22558c2ecf20Sopenharmony_ci	struct page *page;
22568c2ecf20Sopenharmony_ci
22578c2ecf20Sopenharmony_ci	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
22588c2ecf20Sopenharmony_ci		pol = get_task_policy(current);
22598c2ecf20Sopenharmony_ci
22608c2ecf20Sopenharmony_ci	/*
22618c2ecf20Sopenharmony_ci	 * No reference counting needed for current->mempolicy
22628c2ecf20Sopenharmony_ci	 * nor system default_policy
22638c2ecf20Sopenharmony_ci	 */
22648c2ecf20Sopenharmony_ci	if (pol->mode == MPOL_INTERLEAVE)
22658c2ecf20Sopenharmony_ci		page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
22668c2ecf20Sopenharmony_ci	else
22678c2ecf20Sopenharmony_ci		page = __alloc_pages_nodemask(gfp, order,
22688c2ecf20Sopenharmony_ci				policy_node(gfp, pol, numa_node_id()),
22698c2ecf20Sopenharmony_ci				policy_nodemask(gfp, pol));
22708c2ecf20Sopenharmony_ci
22718c2ecf20Sopenharmony_ci	return page;
22728c2ecf20Sopenharmony_ci}
22738c2ecf20Sopenharmony_ciEXPORT_SYMBOL(alloc_pages_current);
22748c2ecf20Sopenharmony_ci
22758c2ecf20Sopenharmony_ciint vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
22768c2ecf20Sopenharmony_ci{
22778c2ecf20Sopenharmony_ci	struct mempolicy *pol = mpol_dup(vma_policy(src));
22788c2ecf20Sopenharmony_ci
22798c2ecf20Sopenharmony_ci	if (IS_ERR(pol))
22808c2ecf20Sopenharmony_ci		return PTR_ERR(pol);
22818c2ecf20Sopenharmony_ci	dst->vm_policy = pol;
22828c2ecf20Sopenharmony_ci	return 0;
22838c2ecf20Sopenharmony_ci}
22848c2ecf20Sopenharmony_ci
/*
 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
 * rebinds the mempolicy it's copying by calling mpol_rebind_policy()
 * with the mems_allowed returned by cpuset_mems_allowed().  This
 * keeps mempolicies cpuset relative after its cpuset moves.  See
 * further kernel/cpuset.c update_nodemask().
 *
 * current's mempolicy may be rebound by another task (the task that
 * changes the cpuset's mems), so we needn't do rebind work for the
 * current task.
 */
22958c2ecf20Sopenharmony_ci
22968c2ecf20Sopenharmony_ci/* Slow path of a mempolicy duplicate */
22978c2ecf20Sopenharmony_cistruct mempolicy *__mpol_dup(struct mempolicy *old)
22988c2ecf20Sopenharmony_ci{
22998c2ecf20Sopenharmony_ci	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
23008c2ecf20Sopenharmony_ci
23018c2ecf20Sopenharmony_ci	if (!new)
23028c2ecf20Sopenharmony_ci		return ERR_PTR(-ENOMEM);
23038c2ecf20Sopenharmony_ci
23048c2ecf20Sopenharmony_ci	/* task's mempolicy is protected by alloc_lock */
23058c2ecf20Sopenharmony_ci	if (old == current->mempolicy) {
23068c2ecf20Sopenharmony_ci		task_lock(current);
23078c2ecf20Sopenharmony_ci		*new = *old;
23088c2ecf20Sopenharmony_ci		task_unlock(current);
23098c2ecf20Sopenharmony_ci	} else
23108c2ecf20Sopenharmony_ci		*new = *old;
23118c2ecf20Sopenharmony_ci
23128c2ecf20Sopenharmony_ci	if (current_cpuset_is_being_rebound()) {
23138c2ecf20Sopenharmony_ci		nodemask_t mems = cpuset_mems_allowed(current);
23148c2ecf20Sopenharmony_ci		mpol_rebind_policy(new, &mems);
23158c2ecf20Sopenharmony_ci	}
23168c2ecf20Sopenharmony_ci	atomic_set(&new->refcnt, 1);
23178c2ecf20Sopenharmony_ci	return new;
23188c2ecf20Sopenharmony_ci}
23198c2ecf20Sopenharmony_ci
23208c2ecf20Sopenharmony_ci/* Slow path of a mempolicy comparison */
23218c2ecf20Sopenharmony_cibool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
23228c2ecf20Sopenharmony_ci{
23238c2ecf20Sopenharmony_ci	if (!a || !b)
23248c2ecf20Sopenharmony_ci		return false;
23258c2ecf20Sopenharmony_ci	if (a->mode != b->mode)
23268c2ecf20Sopenharmony_ci		return false;
23278c2ecf20Sopenharmony_ci	if (a->flags != b->flags)
23288c2ecf20Sopenharmony_ci		return false;
23298c2ecf20Sopenharmony_ci	if (mpol_store_user_nodemask(a))
23308c2ecf20Sopenharmony_ci		if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
23318c2ecf20Sopenharmony_ci			return false;
23328c2ecf20Sopenharmony_ci
23338c2ecf20Sopenharmony_ci	switch (a->mode) {
23348c2ecf20Sopenharmony_ci	case MPOL_BIND:
23358c2ecf20Sopenharmony_ci	case MPOL_INTERLEAVE:
23368c2ecf20Sopenharmony_ci		return !!nodes_equal(a->v.nodes, b->v.nodes);
23378c2ecf20Sopenharmony_ci	case MPOL_PREFERRED:
23388c2ecf20Sopenharmony_ci		/* a's ->flags is the same as b's */
23398c2ecf20Sopenharmony_ci		if (a->flags & MPOL_F_LOCAL)
23408c2ecf20Sopenharmony_ci			return true;
23418c2ecf20Sopenharmony_ci		return a->v.preferred_node == b->v.preferred_node;
23428c2ecf20Sopenharmony_ci	default:
23438c2ecf20Sopenharmony_ci		BUG();
23448c2ecf20Sopenharmony_ci		return false;
23458c2ecf20Sopenharmony_ci	}
23468c2ecf20Sopenharmony_ci}
23478c2ecf20Sopenharmony_ci
/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in a red-black tree linked from the inode.
 * They are protected by the sp->lock rwlock, which should be held
 * for any accesses to the tree.
 */
23568c2ecf20Sopenharmony_ci
23578c2ecf20Sopenharmony_ci/*
23588c2ecf20Sopenharmony_ci * lookup first element intersecting start-end.  Caller holds sp->lock for
23598c2ecf20Sopenharmony_ci * reading or for writing
23608c2ecf20Sopenharmony_ci */
23618c2ecf20Sopenharmony_cistatic struct sp_node *
23628c2ecf20Sopenharmony_cisp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
23638c2ecf20Sopenharmony_ci{
23648c2ecf20Sopenharmony_ci	struct rb_node *n = sp->root.rb_node;
23658c2ecf20Sopenharmony_ci
23668c2ecf20Sopenharmony_ci	while (n) {
23678c2ecf20Sopenharmony_ci		struct sp_node *p = rb_entry(n, struct sp_node, nd);
23688c2ecf20Sopenharmony_ci
23698c2ecf20Sopenharmony_ci		if (start >= p->end)
23708c2ecf20Sopenharmony_ci			n = n->rb_right;
23718c2ecf20Sopenharmony_ci		else if (end <= p->start)
23728c2ecf20Sopenharmony_ci			n = n->rb_left;
23738c2ecf20Sopenharmony_ci		else
23748c2ecf20Sopenharmony_ci			break;
23758c2ecf20Sopenharmony_ci	}
23768c2ecf20Sopenharmony_ci	if (!n)
23778c2ecf20Sopenharmony_ci		return NULL;
23788c2ecf20Sopenharmony_ci	for (;;) {
23798c2ecf20Sopenharmony_ci		struct sp_node *w = NULL;
23808c2ecf20Sopenharmony_ci		struct rb_node *prev = rb_prev(n);
23818c2ecf20Sopenharmony_ci		if (!prev)
23828c2ecf20Sopenharmony_ci			break;
23838c2ecf20Sopenharmony_ci		w = rb_entry(prev, struct sp_node, nd);
23848c2ecf20Sopenharmony_ci		if (w->end <= start)
23858c2ecf20Sopenharmony_ci			break;
23868c2ecf20Sopenharmony_ci		n = prev;
23878c2ecf20Sopenharmony_ci	}
23888c2ecf20Sopenharmony_ci	return rb_entry(n, struct sp_node, nd);
23898c2ecf20Sopenharmony_ci}
23908c2ecf20Sopenharmony_ci
23918c2ecf20Sopenharmony_ci/*
23928c2ecf20Sopenharmony_ci * Insert a new shared policy into the list.  Caller holds sp->lock for
23938c2ecf20Sopenharmony_ci * writing.
23948c2ecf20Sopenharmony_ci */
23958c2ecf20Sopenharmony_cistatic void sp_insert(struct shared_policy *sp, struct sp_node *new)
23968c2ecf20Sopenharmony_ci{
23978c2ecf20Sopenharmony_ci	struct rb_node **p = &sp->root.rb_node;
23988c2ecf20Sopenharmony_ci	struct rb_node *parent = NULL;
23998c2ecf20Sopenharmony_ci	struct sp_node *nd;
24008c2ecf20Sopenharmony_ci
24018c2ecf20Sopenharmony_ci	while (*p) {
24028c2ecf20Sopenharmony_ci		parent = *p;
24038c2ecf20Sopenharmony_ci		nd = rb_entry(parent, struct sp_node, nd);
24048c2ecf20Sopenharmony_ci		if (new->start < nd->start)
24058c2ecf20Sopenharmony_ci			p = &(*p)->rb_left;
24068c2ecf20Sopenharmony_ci		else if (new->end > nd->end)
24078c2ecf20Sopenharmony_ci			p = &(*p)->rb_right;
24088c2ecf20Sopenharmony_ci		else
24098c2ecf20Sopenharmony_ci			BUG();
24108c2ecf20Sopenharmony_ci	}
24118c2ecf20Sopenharmony_ci	rb_link_node(&new->nd, parent, p);
24128c2ecf20Sopenharmony_ci	rb_insert_color(&new->nd, &sp->root);
24138c2ecf20Sopenharmony_ci	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
24148c2ecf20Sopenharmony_ci		 new->policy ? new->policy->mode : 0);
24158c2ecf20Sopenharmony_ci}
24168c2ecf20Sopenharmony_ci
24178c2ecf20Sopenharmony_ci/* Find shared policy intersecting idx */
24188c2ecf20Sopenharmony_cistruct mempolicy *
24198c2ecf20Sopenharmony_cimpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
24208c2ecf20Sopenharmony_ci{
24218c2ecf20Sopenharmony_ci	struct mempolicy *pol = NULL;
24228c2ecf20Sopenharmony_ci	struct sp_node *sn;
24238c2ecf20Sopenharmony_ci
24248c2ecf20Sopenharmony_ci	if (!sp->root.rb_node)
24258c2ecf20Sopenharmony_ci		return NULL;
24268c2ecf20Sopenharmony_ci	read_lock(&sp->lock);
24278c2ecf20Sopenharmony_ci	sn = sp_lookup(sp, idx, idx+1);
24288c2ecf20Sopenharmony_ci	if (sn) {
24298c2ecf20Sopenharmony_ci		mpol_get(sn->policy);
24308c2ecf20Sopenharmony_ci		pol = sn->policy;
24318c2ecf20Sopenharmony_ci	}
24328c2ecf20Sopenharmony_ci	read_unlock(&sp->lock);
24338c2ecf20Sopenharmony_ci	return pol;
24348c2ecf20Sopenharmony_ci}
24358c2ecf20Sopenharmony_ci
24368c2ecf20Sopenharmony_cistatic void sp_free(struct sp_node *n)
24378c2ecf20Sopenharmony_ci{
24388c2ecf20Sopenharmony_ci	mpol_put(n->policy);
24398c2ecf20Sopenharmony_ci	kmem_cache_free(sn_cache, n);
24408c2ecf20Sopenharmony_ci}
24418c2ecf20Sopenharmony_ci
24428c2ecf20Sopenharmony_ci/**
24438c2ecf20Sopenharmony_ci * mpol_misplaced - check whether current page node is valid in policy
24448c2ecf20Sopenharmony_ci *
24458c2ecf20Sopenharmony_ci * @page: page to be checked
24468c2ecf20Sopenharmony_ci * @vma: vm area where page mapped
24478c2ecf20Sopenharmony_ci * @addr: virtual address where page mapped
24488c2ecf20Sopenharmony_ci *
24498c2ecf20Sopenharmony_ci * Lookup current policy node id for vma,addr and "compare to" page's
24508c2ecf20Sopenharmony_ci * node id.
24518c2ecf20Sopenharmony_ci *
24528c2ecf20Sopenharmony_ci * Returns:
24538c2ecf20Sopenharmony_ci *	-1	- not misplaced, page is in the right node
24548c2ecf20Sopenharmony_ci *	node	- node id where the page should be
24558c2ecf20Sopenharmony_ci *
24568c2ecf20Sopenharmony_ci * Policy determination "mimics" alloc_page_vma().
24578c2ecf20Sopenharmony_ci * Called from fault path where we know the vma and faulting address.
24588c2ecf20Sopenharmony_ci */
24598c2ecf20Sopenharmony_ciint mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
24608c2ecf20Sopenharmony_ci{
24618c2ecf20Sopenharmony_ci	struct mempolicy *pol;
24628c2ecf20Sopenharmony_ci	struct zoneref *z;
24638c2ecf20Sopenharmony_ci	int curnid = page_to_nid(page);
24648c2ecf20Sopenharmony_ci	unsigned long pgoff;
24658c2ecf20Sopenharmony_ci	int thiscpu = raw_smp_processor_id();
24668c2ecf20Sopenharmony_ci	int thisnid = cpu_to_node(thiscpu);
24678c2ecf20Sopenharmony_ci	int polnid = NUMA_NO_NODE;
24688c2ecf20Sopenharmony_ci	int ret = -1;
24698c2ecf20Sopenharmony_ci
24708c2ecf20Sopenharmony_ci	pol = get_vma_policy(vma, addr);
24718c2ecf20Sopenharmony_ci	if (!(pol->flags & MPOL_F_MOF))
24728c2ecf20Sopenharmony_ci		goto out;
24738c2ecf20Sopenharmony_ci
24748c2ecf20Sopenharmony_ci	switch (pol->mode) {
24758c2ecf20Sopenharmony_ci	case MPOL_INTERLEAVE:
24768c2ecf20Sopenharmony_ci		pgoff = vma->vm_pgoff;
24778c2ecf20Sopenharmony_ci		pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
24788c2ecf20Sopenharmony_ci		polnid = offset_il_node(pol, pgoff);
24798c2ecf20Sopenharmony_ci		break;
24808c2ecf20Sopenharmony_ci
24818c2ecf20Sopenharmony_ci	case MPOL_PREFERRED:
24828c2ecf20Sopenharmony_ci		if (pol->flags & MPOL_F_LOCAL)
24838c2ecf20Sopenharmony_ci			polnid = numa_node_id();
24848c2ecf20Sopenharmony_ci		else
24858c2ecf20Sopenharmony_ci			polnid = pol->v.preferred_node;
24868c2ecf20Sopenharmony_ci		break;
24878c2ecf20Sopenharmony_ci
24888c2ecf20Sopenharmony_ci	case MPOL_BIND:
24898c2ecf20Sopenharmony_ci
24908c2ecf20Sopenharmony_ci		/*
24918c2ecf20Sopenharmony_ci		 * allows binding to multiple nodes.
24928c2ecf20Sopenharmony_ci		 * use current page if in policy nodemask,
24938c2ecf20Sopenharmony_ci		 * else select nearest allowed node, if any.
24948c2ecf20Sopenharmony_ci		 * If no allowed nodes, use current [!misplaced].
24958c2ecf20Sopenharmony_ci		 */
24968c2ecf20Sopenharmony_ci		if (node_isset(curnid, pol->v.nodes))
24978c2ecf20Sopenharmony_ci			goto out;
24988c2ecf20Sopenharmony_ci		z = first_zones_zonelist(
24998c2ecf20Sopenharmony_ci				node_zonelist(numa_node_id(), GFP_HIGHUSER),
25008c2ecf20Sopenharmony_ci				gfp_zone(GFP_HIGHUSER),
25018c2ecf20Sopenharmony_ci				&pol->v.nodes);
25028c2ecf20Sopenharmony_ci		polnid = zone_to_nid(z->zone);
25038c2ecf20Sopenharmony_ci		break;
25048c2ecf20Sopenharmony_ci
25058c2ecf20Sopenharmony_ci	default:
25068c2ecf20Sopenharmony_ci		BUG();
25078c2ecf20Sopenharmony_ci	}
25088c2ecf20Sopenharmony_ci
25098c2ecf20Sopenharmony_ci	/* Migrate the page towards the node whose CPU is referencing it */
25108c2ecf20Sopenharmony_ci	if (pol->flags & MPOL_F_MORON) {
25118c2ecf20Sopenharmony_ci		polnid = thisnid;
25128c2ecf20Sopenharmony_ci
25138c2ecf20Sopenharmony_ci		if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
25148c2ecf20Sopenharmony_ci			goto out;
25158c2ecf20Sopenharmony_ci	}
25168c2ecf20Sopenharmony_ci
25178c2ecf20Sopenharmony_ci	if (curnid != polnid)
25188c2ecf20Sopenharmony_ci		ret = polnid;
25198c2ecf20Sopenharmony_ciout:
25208c2ecf20Sopenharmony_ci	mpol_cond_put(pol);
25218c2ecf20Sopenharmony_ci
25228c2ecf20Sopenharmony_ci	return ret;
25238c2ecf20Sopenharmony_ci}
25248c2ecf20Sopenharmony_ci
25258c2ecf20Sopenharmony_ci/*
25268c2ecf20Sopenharmony_ci * Drop the (possibly final) reference to task->mempolicy.  It needs to be
25278c2ecf20Sopenharmony_ci * dropped after task->mempolicy is set to NULL so that any allocation done as
25288c2ecf20Sopenharmony_ci * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
25298c2ecf20Sopenharmony_ci * policy.
25308c2ecf20Sopenharmony_ci */
25318c2ecf20Sopenharmony_civoid mpol_put_task_policy(struct task_struct *task)
25328c2ecf20Sopenharmony_ci{
25338c2ecf20Sopenharmony_ci	struct mempolicy *pol;
25348c2ecf20Sopenharmony_ci
25358c2ecf20Sopenharmony_ci	task_lock(task);
25368c2ecf20Sopenharmony_ci	pol = task->mempolicy;
25378c2ecf20Sopenharmony_ci	task->mempolicy = NULL;
25388c2ecf20Sopenharmony_ci	task_unlock(task);
25398c2ecf20Sopenharmony_ci	mpol_put(pol);
25408c2ecf20Sopenharmony_ci}
25418c2ecf20Sopenharmony_ci
25428c2ecf20Sopenharmony_cistatic void sp_delete(struct shared_policy *sp, struct sp_node *n)
25438c2ecf20Sopenharmony_ci{
25448c2ecf20Sopenharmony_ci	pr_debug("deleting %lx-l%lx\n", n->start, n->end);
25458c2ecf20Sopenharmony_ci	rb_erase(&n->nd, &sp->root);
25468c2ecf20Sopenharmony_ci	sp_free(n);
25478c2ecf20Sopenharmony_ci}
25488c2ecf20Sopenharmony_ci
25498c2ecf20Sopenharmony_cistatic void sp_node_init(struct sp_node *node, unsigned long start,
25508c2ecf20Sopenharmony_ci			unsigned long end, struct mempolicy *pol)
25518c2ecf20Sopenharmony_ci{
25528c2ecf20Sopenharmony_ci	node->start = start;
25538c2ecf20Sopenharmony_ci	node->end = end;
25548c2ecf20Sopenharmony_ci	node->policy = pol;
25558c2ecf20Sopenharmony_ci}
25568c2ecf20Sopenharmony_ci
25578c2ecf20Sopenharmony_cistatic struct sp_node *sp_alloc(unsigned long start, unsigned long end,
25588c2ecf20Sopenharmony_ci				struct mempolicy *pol)
25598c2ecf20Sopenharmony_ci{
25608c2ecf20Sopenharmony_ci	struct sp_node *n;
25618c2ecf20Sopenharmony_ci	struct mempolicy *newpol;
25628c2ecf20Sopenharmony_ci
25638c2ecf20Sopenharmony_ci	n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
25648c2ecf20Sopenharmony_ci	if (!n)
25658c2ecf20Sopenharmony_ci		return NULL;
25668c2ecf20Sopenharmony_ci
25678c2ecf20Sopenharmony_ci	newpol = mpol_dup(pol);
25688c2ecf20Sopenharmony_ci	if (IS_ERR(newpol)) {
25698c2ecf20Sopenharmony_ci		kmem_cache_free(sn_cache, n);
25708c2ecf20Sopenharmony_ci		return NULL;
25718c2ecf20Sopenharmony_ci	}
25728c2ecf20Sopenharmony_ci	newpol->flags |= MPOL_F_SHARED;
25738c2ecf20Sopenharmony_ci	sp_node_init(n, start, end, newpol);
25748c2ecf20Sopenharmony_ci
25758c2ecf20Sopenharmony_ci	return n;
25768c2ecf20Sopenharmony_ci}
25778c2ecf20Sopenharmony_ci
25788c2ecf20Sopenharmony_ci/* Replace a policy range. */
25798c2ecf20Sopenharmony_cistatic int shared_policy_replace(struct shared_policy *sp, unsigned long start,
25808c2ecf20Sopenharmony_ci				 unsigned long end, struct sp_node *new)
25818c2ecf20Sopenharmony_ci{
25828c2ecf20Sopenharmony_ci	struct sp_node *n;
25838c2ecf20Sopenharmony_ci	struct sp_node *n_new = NULL;
25848c2ecf20Sopenharmony_ci	struct mempolicy *mpol_new = NULL;
25858c2ecf20Sopenharmony_ci	int ret = 0;
25868c2ecf20Sopenharmony_ci
25878c2ecf20Sopenharmony_cirestart:
25888c2ecf20Sopenharmony_ci	write_lock(&sp->lock);
25898c2ecf20Sopenharmony_ci	n = sp_lookup(sp, start, end);
25908c2ecf20Sopenharmony_ci	/* Take care of old policies in the same range. */
25918c2ecf20Sopenharmony_ci	while (n && n->start < end) {
25928c2ecf20Sopenharmony_ci		struct rb_node *next = rb_next(&n->nd);
25938c2ecf20Sopenharmony_ci		if (n->start >= start) {
25948c2ecf20Sopenharmony_ci			if (n->end <= end)
25958c2ecf20Sopenharmony_ci				sp_delete(sp, n);
25968c2ecf20Sopenharmony_ci			else
25978c2ecf20Sopenharmony_ci				n->start = end;
25988c2ecf20Sopenharmony_ci		} else {
25998c2ecf20Sopenharmony_ci			/* Old policy spanning whole new range. */
26008c2ecf20Sopenharmony_ci			if (n->end > end) {
26018c2ecf20Sopenharmony_ci				if (!n_new)
26028c2ecf20Sopenharmony_ci					goto alloc_new;
26038c2ecf20Sopenharmony_ci
26048c2ecf20Sopenharmony_ci				*mpol_new = *n->policy;
26058c2ecf20Sopenharmony_ci				atomic_set(&mpol_new->refcnt, 1);
26068c2ecf20Sopenharmony_ci				sp_node_init(n_new, end, n->end, mpol_new);
26078c2ecf20Sopenharmony_ci				n->end = start;
26088c2ecf20Sopenharmony_ci				sp_insert(sp, n_new);
26098c2ecf20Sopenharmony_ci				n_new = NULL;
26108c2ecf20Sopenharmony_ci				mpol_new = NULL;
26118c2ecf20Sopenharmony_ci				break;
26128c2ecf20Sopenharmony_ci			} else
26138c2ecf20Sopenharmony_ci				n->end = start;
26148c2ecf20Sopenharmony_ci		}
26158c2ecf20Sopenharmony_ci		if (!next)
26168c2ecf20Sopenharmony_ci			break;
26178c2ecf20Sopenharmony_ci		n = rb_entry(next, struct sp_node, nd);
26188c2ecf20Sopenharmony_ci	}
26198c2ecf20Sopenharmony_ci	if (new)
26208c2ecf20Sopenharmony_ci		sp_insert(sp, new);
26218c2ecf20Sopenharmony_ci	write_unlock(&sp->lock);
26228c2ecf20Sopenharmony_ci	ret = 0;
26238c2ecf20Sopenharmony_ci
26248c2ecf20Sopenharmony_cierr_out:
26258c2ecf20Sopenharmony_ci	if (mpol_new)
26268c2ecf20Sopenharmony_ci		mpol_put(mpol_new);
26278c2ecf20Sopenharmony_ci	if (n_new)
26288c2ecf20Sopenharmony_ci		kmem_cache_free(sn_cache, n_new);
26298c2ecf20Sopenharmony_ci
26308c2ecf20Sopenharmony_ci	return ret;
26318c2ecf20Sopenharmony_ci
26328c2ecf20Sopenharmony_cialloc_new:
26338c2ecf20Sopenharmony_ci	write_unlock(&sp->lock);
26348c2ecf20Sopenharmony_ci	ret = -ENOMEM;
26358c2ecf20Sopenharmony_ci	n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
26368c2ecf20Sopenharmony_ci	if (!n_new)
26378c2ecf20Sopenharmony_ci		goto err_out;
26388c2ecf20Sopenharmony_ci	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
26398c2ecf20Sopenharmony_ci	if (!mpol_new)
26408c2ecf20Sopenharmony_ci		goto err_out;
26418c2ecf20Sopenharmony_ci	atomic_set(&mpol_new->refcnt, 1);
26428c2ecf20Sopenharmony_ci	goto restart;
26438c2ecf20Sopenharmony_ci}
26448c2ecf20Sopenharmony_ci
26458c2ecf20Sopenharmony_ci/**
26468c2ecf20Sopenharmony_ci * mpol_shared_policy_init - initialize shared policy for inode
26478c2ecf20Sopenharmony_ci * @sp: pointer to inode shared policy
26488c2ecf20Sopenharmony_ci * @mpol:  struct mempolicy to install
26498c2ecf20Sopenharmony_ci *
26508c2ecf20Sopenharmony_ci * Install non-NULL @mpol in inode's shared policy rb-tree.
26518c2ecf20Sopenharmony_ci * On entry, the current task has a reference on a non-NULL @mpol.
26528c2ecf20Sopenharmony_ci * This must be released on exit.
26538c2ecf20Sopenharmony_ci * This is called at get_inode() calls and we can use GFP_KERNEL.
26548c2ecf20Sopenharmony_ci */
26558c2ecf20Sopenharmony_civoid mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
26568c2ecf20Sopenharmony_ci{
26578c2ecf20Sopenharmony_ci	int ret;
26588c2ecf20Sopenharmony_ci
26598c2ecf20Sopenharmony_ci	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
26608c2ecf20Sopenharmony_ci	rwlock_init(&sp->lock);
26618c2ecf20Sopenharmony_ci
26628c2ecf20Sopenharmony_ci	if (mpol) {
26638c2ecf20Sopenharmony_ci		struct vm_area_struct pvma;
26648c2ecf20Sopenharmony_ci		struct mempolicy *new;
26658c2ecf20Sopenharmony_ci		NODEMASK_SCRATCH(scratch);
26668c2ecf20Sopenharmony_ci
26678c2ecf20Sopenharmony_ci		if (!scratch)
26688c2ecf20Sopenharmony_ci			goto put_mpol;
26698c2ecf20Sopenharmony_ci		/* contextualize the tmpfs mount point mempolicy */
26708c2ecf20Sopenharmony_ci		new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
26718c2ecf20Sopenharmony_ci		if (IS_ERR(new))
26728c2ecf20Sopenharmony_ci			goto free_scratch; /* no valid nodemask intersection */
26738c2ecf20Sopenharmony_ci
26748c2ecf20Sopenharmony_ci		task_lock(current);
26758c2ecf20Sopenharmony_ci		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
26768c2ecf20Sopenharmony_ci		task_unlock(current);
26778c2ecf20Sopenharmony_ci		if (ret)
26788c2ecf20Sopenharmony_ci			goto put_new;
26798c2ecf20Sopenharmony_ci
26808c2ecf20Sopenharmony_ci		/* Create pseudo-vma that contains just the policy */
26818c2ecf20Sopenharmony_ci		vma_init(&pvma, NULL);
26828c2ecf20Sopenharmony_ci		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
26838c2ecf20Sopenharmony_ci		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
26848c2ecf20Sopenharmony_ci
26858c2ecf20Sopenharmony_ciput_new:
26868c2ecf20Sopenharmony_ci		mpol_put(new);			/* drop initial ref */
26878c2ecf20Sopenharmony_cifree_scratch:
26888c2ecf20Sopenharmony_ci		NODEMASK_SCRATCH_FREE(scratch);
26898c2ecf20Sopenharmony_ciput_mpol:
26908c2ecf20Sopenharmony_ci		mpol_put(mpol);	/* drop our incoming ref on sb mpol */
26918c2ecf20Sopenharmony_ci	}
26928c2ecf20Sopenharmony_ci}
26938c2ecf20Sopenharmony_ci
26948c2ecf20Sopenharmony_ciint mpol_set_shared_policy(struct shared_policy *info,
26958c2ecf20Sopenharmony_ci			struct vm_area_struct *vma, struct mempolicy *npol)
26968c2ecf20Sopenharmony_ci{
26978c2ecf20Sopenharmony_ci	int err;
26988c2ecf20Sopenharmony_ci	struct sp_node *new = NULL;
26998c2ecf20Sopenharmony_ci	unsigned long sz = vma_pages(vma);
27008c2ecf20Sopenharmony_ci
27018c2ecf20Sopenharmony_ci	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
27028c2ecf20Sopenharmony_ci		 vma->vm_pgoff,
27038c2ecf20Sopenharmony_ci		 sz, npol ? npol->mode : -1,
27048c2ecf20Sopenharmony_ci		 npol ? npol->flags : -1,
27058c2ecf20Sopenharmony_ci		 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
27068c2ecf20Sopenharmony_ci
27078c2ecf20Sopenharmony_ci	if (npol) {
27088c2ecf20Sopenharmony_ci		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
27098c2ecf20Sopenharmony_ci		if (!new)
27108c2ecf20Sopenharmony_ci			return -ENOMEM;
27118c2ecf20Sopenharmony_ci	}
27128c2ecf20Sopenharmony_ci	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
27138c2ecf20Sopenharmony_ci	if (err && new)
27148c2ecf20Sopenharmony_ci		sp_free(new);
27158c2ecf20Sopenharmony_ci	return err;
27168c2ecf20Sopenharmony_ci}
27178c2ecf20Sopenharmony_ci
27188c2ecf20Sopenharmony_ci/* Free a backing policy store on inode delete. */
27198c2ecf20Sopenharmony_civoid mpol_free_shared_policy(struct shared_policy *p)
27208c2ecf20Sopenharmony_ci{
27218c2ecf20Sopenharmony_ci	struct sp_node *n;
27228c2ecf20Sopenharmony_ci	struct rb_node *next;
27238c2ecf20Sopenharmony_ci
27248c2ecf20Sopenharmony_ci	if (!p->root.rb_node)
27258c2ecf20Sopenharmony_ci		return;
27268c2ecf20Sopenharmony_ci	write_lock(&p->lock);
27278c2ecf20Sopenharmony_ci	next = rb_first(&p->root);
27288c2ecf20Sopenharmony_ci	while (next) {
27298c2ecf20Sopenharmony_ci		n = rb_entry(next, struct sp_node, nd);
27308c2ecf20Sopenharmony_ci		next = rb_next(&n->nd);
27318c2ecf20Sopenharmony_ci		sp_delete(p, n);
27328c2ecf20Sopenharmony_ci	}
27338c2ecf20Sopenharmony_ci	write_unlock(&p->lock);
27348c2ecf20Sopenharmony_ci}
27358c2ecf20Sopenharmony_ci
27368c2ecf20Sopenharmony_ci#ifdef CONFIG_NUMA_BALANCING
27378c2ecf20Sopenharmony_cistatic int __initdata numabalancing_override;
27388c2ecf20Sopenharmony_ci
27398c2ecf20Sopenharmony_cistatic void __init check_numabalancing_enable(void)
27408c2ecf20Sopenharmony_ci{
27418c2ecf20Sopenharmony_ci	bool numabalancing_default = false;
27428c2ecf20Sopenharmony_ci
27438c2ecf20Sopenharmony_ci	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
27448c2ecf20Sopenharmony_ci		numabalancing_default = true;
27458c2ecf20Sopenharmony_ci
27468c2ecf20Sopenharmony_ci	/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
27478c2ecf20Sopenharmony_ci	if (numabalancing_override)
27488c2ecf20Sopenharmony_ci		set_numabalancing_state(numabalancing_override == 1);
27498c2ecf20Sopenharmony_ci
27508c2ecf20Sopenharmony_ci	if (num_online_nodes() > 1 && !numabalancing_override) {
27518c2ecf20Sopenharmony_ci		pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
27528c2ecf20Sopenharmony_ci			numabalancing_default ? "Enabling" : "Disabling");
27538c2ecf20Sopenharmony_ci		set_numabalancing_state(numabalancing_default);
27548c2ecf20Sopenharmony_ci	}
27558c2ecf20Sopenharmony_ci}
27568c2ecf20Sopenharmony_ci
27578c2ecf20Sopenharmony_cistatic int __init setup_numabalancing(char *str)
27588c2ecf20Sopenharmony_ci{
27598c2ecf20Sopenharmony_ci	int ret = 0;
27608c2ecf20Sopenharmony_ci	if (!str)
27618c2ecf20Sopenharmony_ci		goto out;
27628c2ecf20Sopenharmony_ci
27638c2ecf20Sopenharmony_ci	if (!strcmp(str, "enable")) {
27648c2ecf20Sopenharmony_ci		numabalancing_override = 1;
27658c2ecf20Sopenharmony_ci		ret = 1;
27668c2ecf20Sopenharmony_ci	} else if (!strcmp(str, "disable")) {
27678c2ecf20Sopenharmony_ci		numabalancing_override = -1;
27688c2ecf20Sopenharmony_ci		ret = 1;
27698c2ecf20Sopenharmony_ci	}
27708c2ecf20Sopenharmony_ciout:
27718c2ecf20Sopenharmony_ci	if (!ret)
27728c2ecf20Sopenharmony_ci		pr_warn("Unable to parse numa_balancing=\n");
27738c2ecf20Sopenharmony_ci
27748c2ecf20Sopenharmony_ci	return ret;
27758c2ecf20Sopenharmony_ci}
27768c2ecf20Sopenharmony_ci__setup("numa_balancing=", setup_numabalancing);
27778c2ecf20Sopenharmony_ci#else
27788c2ecf20Sopenharmony_cistatic inline void __init check_numabalancing_enable(void)
27798c2ecf20Sopenharmony_ci{
27808c2ecf20Sopenharmony_ci}
27818c2ecf20Sopenharmony_ci#endif /* CONFIG_NUMA_BALANCING */
27828c2ecf20Sopenharmony_ci
27838c2ecf20Sopenharmony_ci/* assumes fs == KERNEL_DS */
27848c2ecf20Sopenharmony_civoid __init numa_policy_init(void)
27858c2ecf20Sopenharmony_ci{
27868c2ecf20Sopenharmony_ci	nodemask_t interleave_nodes;
27878c2ecf20Sopenharmony_ci	unsigned long largest = 0;
27888c2ecf20Sopenharmony_ci	int nid, prefer = 0;
27898c2ecf20Sopenharmony_ci
27908c2ecf20Sopenharmony_ci	policy_cache = kmem_cache_create("numa_policy",
27918c2ecf20Sopenharmony_ci					 sizeof(struct mempolicy),
27928c2ecf20Sopenharmony_ci					 0, SLAB_PANIC, NULL);
27938c2ecf20Sopenharmony_ci
27948c2ecf20Sopenharmony_ci	sn_cache = kmem_cache_create("shared_policy_node",
27958c2ecf20Sopenharmony_ci				     sizeof(struct sp_node),
27968c2ecf20Sopenharmony_ci				     0, SLAB_PANIC, NULL);
27978c2ecf20Sopenharmony_ci
27988c2ecf20Sopenharmony_ci	for_each_node(nid) {
27998c2ecf20Sopenharmony_ci		preferred_node_policy[nid] = (struct mempolicy) {
28008c2ecf20Sopenharmony_ci			.refcnt = ATOMIC_INIT(1),
28018c2ecf20Sopenharmony_ci			.mode = MPOL_PREFERRED,
28028c2ecf20Sopenharmony_ci			.flags = MPOL_F_MOF | MPOL_F_MORON,
28038c2ecf20Sopenharmony_ci			.v = { .preferred_node = nid, },
28048c2ecf20Sopenharmony_ci		};
28058c2ecf20Sopenharmony_ci	}
28068c2ecf20Sopenharmony_ci
28078c2ecf20Sopenharmony_ci	/*
28088c2ecf20Sopenharmony_ci	 * Set interleaving policy for system init. Interleaving is only
28098c2ecf20Sopenharmony_ci	 * enabled across suitably sized nodes (default is >= 16MB), or
28108c2ecf20Sopenharmony_ci	 * fall back to the largest node if they're all smaller.
28118c2ecf20Sopenharmony_ci	 */
28128c2ecf20Sopenharmony_ci	nodes_clear(interleave_nodes);
28138c2ecf20Sopenharmony_ci	for_each_node_state(nid, N_MEMORY) {
28148c2ecf20Sopenharmony_ci		unsigned long total_pages = node_present_pages(nid);
28158c2ecf20Sopenharmony_ci
28168c2ecf20Sopenharmony_ci		/* Preserve the largest node */
28178c2ecf20Sopenharmony_ci		if (largest < total_pages) {
28188c2ecf20Sopenharmony_ci			largest = total_pages;
28198c2ecf20Sopenharmony_ci			prefer = nid;
28208c2ecf20Sopenharmony_ci		}
28218c2ecf20Sopenharmony_ci
28228c2ecf20Sopenharmony_ci		/* Interleave this node? */
28238c2ecf20Sopenharmony_ci		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
28248c2ecf20Sopenharmony_ci			node_set(nid, interleave_nodes);
28258c2ecf20Sopenharmony_ci	}
28268c2ecf20Sopenharmony_ci
28278c2ecf20Sopenharmony_ci	/* All too small, use the largest */
28288c2ecf20Sopenharmony_ci	if (unlikely(nodes_empty(interleave_nodes)))
28298c2ecf20Sopenharmony_ci		node_set(prefer, interleave_nodes);
28308c2ecf20Sopenharmony_ci
28318c2ecf20Sopenharmony_ci	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
28328c2ecf20Sopenharmony_ci		pr_err("%s: interleaving failed\n", __func__);
28338c2ecf20Sopenharmony_ci
28348c2ecf20Sopenharmony_ci	check_numabalancing_enable();
28358c2ecf20Sopenharmony_ci}
28368c2ecf20Sopenharmony_ci
/*
 * Reset policy of current process to default.
 *
 * Calls do_set_mempolicy() with MPOL_DEFAULT, no mode flags and no
 * nodemask; the return value of that call is ignored.
 */
void numa_default_policy(void)
{
	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
}
28428c2ecf20Sopenharmony_ci
28438c2ecf20Sopenharmony_ci/*
28448c2ecf20Sopenharmony_ci * Parse and format mempolicy from/to strings
28458c2ecf20Sopenharmony_ci */
28468c2ecf20Sopenharmony_ci
28478c2ecf20Sopenharmony_ci/*
28488c2ecf20Sopenharmony_ci * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
28498c2ecf20Sopenharmony_ci */
28508c2ecf20Sopenharmony_cistatic const char * const policy_modes[] =
28518c2ecf20Sopenharmony_ci{
28528c2ecf20Sopenharmony_ci	[MPOL_DEFAULT]    = "default",
28538c2ecf20Sopenharmony_ci	[MPOL_PREFERRED]  = "prefer",
28548c2ecf20Sopenharmony_ci	[MPOL_BIND]       = "bind",
28558c2ecf20Sopenharmony_ci	[MPOL_INTERLEAVE] = "interleave",
28568c2ecf20Sopenharmony_ci	[MPOL_LOCAL]      = "local",
28578c2ecf20Sopenharmony_ci};
28588c2ecf20Sopenharmony_ci
28598c2ecf20Sopenharmony_ci
28608c2ecf20Sopenharmony_ci#ifdef CONFIG_TMPFS
28618c2ecf20Sopenharmony_ci/**
28628c2ecf20Sopenharmony_ci * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
28638c2ecf20Sopenharmony_ci * @str:  string containing mempolicy to parse
28648c2ecf20Sopenharmony_ci * @mpol:  pointer to struct mempolicy pointer, returned on success.
28658c2ecf20Sopenharmony_ci *
28668c2ecf20Sopenharmony_ci * Format of input:
28678c2ecf20Sopenharmony_ci *	<mode>[=<flags>][:<nodelist>]
28688c2ecf20Sopenharmony_ci *
28698c2ecf20Sopenharmony_ci * On success, returns 0, else 1
28708c2ecf20Sopenharmony_ci */
28718c2ecf20Sopenharmony_ciint mpol_parse_str(char *str, struct mempolicy **mpol)
28728c2ecf20Sopenharmony_ci{
28738c2ecf20Sopenharmony_ci	struct mempolicy *new = NULL;
28748c2ecf20Sopenharmony_ci	unsigned short mode_flags;
28758c2ecf20Sopenharmony_ci	nodemask_t nodes;
28768c2ecf20Sopenharmony_ci	char *nodelist = strchr(str, ':');
28778c2ecf20Sopenharmony_ci	char *flags = strchr(str, '=');
28788c2ecf20Sopenharmony_ci	int err = 1, mode;
28798c2ecf20Sopenharmony_ci
28808c2ecf20Sopenharmony_ci	if (flags)
28818c2ecf20Sopenharmony_ci		*flags++ = '\0';	/* terminate mode string */
28828c2ecf20Sopenharmony_ci
28838c2ecf20Sopenharmony_ci	if (nodelist) {
28848c2ecf20Sopenharmony_ci		/* NUL-terminate mode or flags string */
28858c2ecf20Sopenharmony_ci		*nodelist++ = '\0';
28868c2ecf20Sopenharmony_ci		if (nodelist_parse(nodelist, nodes))
28878c2ecf20Sopenharmony_ci			goto out;
28888c2ecf20Sopenharmony_ci		if (!nodes_subset(nodes, node_states[N_MEMORY]))
28898c2ecf20Sopenharmony_ci			goto out;
28908c2ecf20Sopenharmony_ci	} else
28918c2ecf20Sopenharmony_ci		nodes_clear(nodes);
28928c2ecf20Sopenharmony_ci
28938c2ecf20Sopenharmony_ci	mode = match_string(policy_modes, MPOL_MAX, str);
28948c2ecf20Sopenharmony_ci	if (mode < 0)
28958c2ecf20Sopenharmony_ci		goto out;
28968c2ecf20Sopenharmony_ci
28978c2ecf20Sopenharmony_ci	switch (mode) {
28988c2ecf20Sopenharmony_ci	case MPOL_PREFERRED:
28998c2ecf20Sopenharmony_ci		/*
29008c2ecf20Sopenharmony_ci		 * Insist on a nodelist of one node only, although later
29018c2ecf20Sopenharmony_ci		 * we use first_node(nodes) to grab a single node, so here
29028c2ecf20Sopenharmony_ci		 * nodelist (or nodes) cannot be empty.
29038c2ecf20Sopenharmony_ci		 */
29048c2ecf20Sopenharmony_ci		if (nodelist) {
29058c2ecf20Sopenharmony_ci			char *rest = nodelist;
29068c2ecf20Sopenharmony_ci			while (isdigit(*rest))
29078c2ecf20Sopenharmony_ci				rest++;
29088c2ecf20Sopenharmony_ci			if (*rest)
29098c2ecf20Sopenharmony_ci				goto out;
29108c2ecf20Sopenharmony_ci			if (nodes_empty(nodes))
29118c2ecf20Sopenharmony_ci				goto out;
29128c2ecf20Sopenharmony_ci		}
29138c2ecf20Sopenharmony_ci		break;
29148c2ecf20Sopenharmony_ci	case MPOL_INTERLEAVE:
29158c2ecf20Sopenharmony_ci		/*
29168c2ecf20Sopenharmony_ci		 * Default to online nodes with memory if no nodelist
29178c2ecf20Sopenharmony_ci		 */
29188c2ecf20Sopenharmony_ci		if (!nodelist)
29198c2ecf20Sopenharmony_ci			nodes = node_states[N_MEMORY];
29208c2ecf20Sopenharmony_ci		break;
29218c2ecf20Sopenharmony_ci	case MPOL_LOCAL:
29228c2ecf20Sopenharmony_ci		/*
29238c2ecf20Sopenharmony_ci		 * Don't allow a nodelist;  mpol_new() checks flags
29248c2ecf20Sopenharmony_ci		 */
29258c2ecf20Sopenharmony_ci		if (nodelist)
29268c2ecf20Sopenharmony_ci			goto out;
29278c2ecf20Sopenharmony_ci		mode = MPOL_PREFERRED;
29288c2ecf20Sopenharmony_ci		break;
29298c2ecf20Sopenharmony_ci	case MPOL_DEFAULT:
29308c2ecf20Sopenharmony_ci		/*
29318c2ecf20Sopenharmony_ci		 * Insist on a empty nodelist
29328c2ecf20Sopenharmony_ci		 */
29338c2ecf20Sopenharmony_ci		if (!nodelist)
29348c2ecf20Sopenharmony_ci			err = 0;
29358c2ecf20Sopenharmony_ci		goto out;
29368c2ecf20Sopenharmony_ci	case MPOL_BIND:
29378c2ecf20Sopenharmony_ci		/*
29388c2ecf20Sopenharmony_ci		 * Insist on a nodelist
29398c2ecf20Sopenharmony_ci		 */
29408c2ecf20Sopenharmony_ci		if (!nodelist)
29418c2ecf20Sopenharmony_ci			goto out;
29428c2ecf20Sopenharmony_ci	}
29438c2ecf20Sopenharmony_ci
29448c2ecf20Sopenharmony_ci	mode_flags = 0;
29458c2ecf20Sopenharmony_ci	if (flags) {
29468c2ecf20Sopenharmony_ci		/*
29478c2ecf20Sopenharmony_ci		 * Currently, we only support two mutually exclusive
29488c2ecf20Sopenharmony_ci		 * mode flags.
29498c2ecf20Sopenharmony_ci		 */
29508c2ecf20Sopenharmony_ci		if (!strcmp(flags, "static"))
29518c2ecf20Sopenharmony_ci			mode_flags |= MPOL_F_STATIC_NODES;
29528c2ecf20Sopenharmony_ci		else if (!strcmp(flags, "relative"))
29538c2ecf20Sopenharmony_ci			mode_flags |= MPOL_F_RELATIVE_NODES;
29548c2ecf20Sopenharmony_ci		else
29558c2ecf20Sopenharmony_ci			goto out;
29568c2ecf20Sopenharmony_ci	}
29578c2ecf20Sopenharmony_ci
29588c2ecf20Sopenharmony_ci	new = mpol_new(mode, mode_flags, &nodes);
29598c2ecf20Sopenharmony_ci	if (IS_ERR(new))
29608c2ecf20Sopenharmony_ci		goto out;
29618c2ecf20Sopenharmony_ci
29628c2ecf20Sopenharmony_ci	/*
29638c2ecf20Sopenharmony_ci	 * Save nodes for mpol_to_str() to show the tmpfs mount options
29648c2ecf20Sopenharmony_ci	 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
29658c2ecf20Sopenharmony_ci	 */
29668c2ecf20Sopenharmony_ci	if (mode != MPOL_PREFERRED)
29678c2ecf20Sopenharmony_ci		new->v.nodes = nodes;
29688c2ecf20Sopenharmony_ci	else if (nodelist)
29698c2ecf20Sopenharmony_ci		new->v.preferred_node = first_node(nodes);
29708c2ecf20Sopenharmony_ci	else
29718c2ecf20Sopenharmony_ci		new->flags |= MPOL_F_LOCAL;
29728c2ecf20Sopenharmony_ci
29738c2ecf20Sopenharmony_ci	/*
29748c2ecf20Sopenharmony_ci	 * Save nodes for contextualization: this will be used to "clone"
29758c2ecf20Sopenharmony_ci	 * the mempolicy in a specific context [cpuset] at a later time.
29768c2ecf20Sopenharmony_ci	 */
29778c2ecf20Sopenharmony_ci	new->w.user_nodemask = nodes;
29788c2ecf20Sopenharmony_ci
29798c2ecf20Sopenharmony_ci	err = 0;
29808c2ecf20Sopenharmony_ci
29818c2ecf20Sopenharmony_ciout:
29828c2ecf20Sopenharmony_ci	/* Restore string for error message */
29838c2ecf20Sopenharmony_ci	if (nodelist)
29848c2ecf20Sopenharmony_ci		*--nodelist = ':';
29858c2ecf20Sopenharmony_ci	if (flags)
29868c2ecf20Sopenharmony_ci		*--flags = '=';
29878c2ecf20Sopenharmony_ci	if (!err)
29888c2ecf20Sopenharmony_ci		*mpol = new;
29898c2ecf20Sopenharmony_ci	return err;
29908c2ecf20Sopenharmony_ci}
29918c2ecf20Sopenharmony_ci#endif /* CONFIG_TMPFS */
29928c2ecf20Sopenharmony_ci
29938c2ecf20Sopenharmony_ci/**
29948c2ecf20Sopenharmony_ci * mpol_to_str - format a mempolicy structure for printing
29958c2ecf20Sopenharmony_ci * @buffer:  to contain formatted mempolicy string
29968c2ecf20Sopenharmony_ci * @maxlen:  length of @buffer
29978c2ecf20Sopenharmony_ci * @pol:  pointer to mempolicy to be formatted
29988c2ecf20Sopenharmony_ci *
29998c2ecf20Sopenharmony_ci * Convert @pol into a string.  If @buffer is too short, truncate the string.
30008c2ecf20Sopenharmony_ci * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
30018c2ecf20Sopenharmony_ci * longest flag, "relative", and to display at least a few node ids.
30028c2ecf20Sopenharmony_ci */
30038c2ecf20Sopenharmony_civoid mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
30048c2ecf20Sopenharmony_ci{
30058c2ecf20Sopenharmony_ci	char *p = buffer;
30068c2ecf20Sopenharmony_ci	nodemask_t nodes = NODE_MASK_NONE;
30078c2ecf20Sopenharmony_ci	unsigned short mode = MPOL_DEFAULT;
30088c2ecf20Sopenharmony_ci	unsigned short flags = 0;
30098c2ecf20Sopenharmony_ci
30108c2ecf20Sopenharmony_ci	if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
30118c2ecf20Sopenharmony_ci		mode = pol->mode;
30128c2ecf20Sopenharmony_ci		flags = pol->flags;
30138c2ecf20Sopenharmony_ci	}
30148c2ecf20Sopenharmony_ci
30158c2ecf20Sopenharmony_ci	switch (mode) {
30168c2ecf20Sopenharmony_ci	case MPOL_DEFAULT:
30178c2ecf20Sopenharmony_ci		break;
30188c2ecf20Sopenharmony_ci	case MPOL_PREFERRED:
30198c2ecf20Sopenharmony_ci		if (flags & MPOL_F_LOCAL)
30208c2ecf20Sopenharmony_ci			mode = MPOL_LOCAL;
30218c2ecf20Sopenharmony_ci		else
30228c2ecf20Sopenharmony_ci			node_set(pol->v.preferred_node, nodes);
30238c2ecf20Sopenharmony_ci		break;
30248c2ecf20Sopenharmony_ci	case MPOL_BIND:
30258c2ecf20Sopenharmony_ci	case MPOL_INTERLEAVE:
30268c2ecf20Sopenharmony_ci		nodes = pol->v.nodes;
30278c2ecf20Sopenharmony_ci		break;
30288c2ecf20Sopenharmony_ci	default:
30298c2ecf20Sopenharmony_ci		WARN_ON_ONCE(1);
30308c2ecf20Sopenharmony_ci		snprintf(p, maxlen, "unknown");
30318c2ecf20Sopenharmony_ci		return;
30328c2ecf20Sopenharmony_ci	}
30338c2ecf20Sopenharmony_ci
30348c2ecf20Sopenharmony_ci	p += snprintf(p, maxlen, "%s", policy_modes[mode]);
30358c2ecf20Sopenharmony_ci
30368c2ecf20Sopenharmony_ci	if (flags & MPOL_MODE_FLAGS) {
30378c2ecf20Sopenharmony_ci		p += snprintf(p, buffer + maxlen - p, "=");
30388c2ecf20Sopenharmony_ci
30398c2ecf20Sopenharmony_ci		/*
30408c2ecf20Sopenharmony_ci		 * Currently, the only defined flags are mutually exclusive
30418c2ecf20Sopenharmony_ci		 */
30428c2ecf20Sopenharmony_ci		if (flags & MPOL_F_STATIC_NODES)
30438c2ecf20Sopenharmony_ci			p += snprintf(p, buffer + maxlen - p, "static");
30448c2ecf20Sopenharmony_ci		else if (flags & MPOL_F_RELATIVE_NODES)
30458c2ecf20Sopenharmony_ci			p += snprintf(p, buffer + maxlen - p, "relative");
30468c2ecf20Sopenharmony_ci	}
30478c2ecf20Sopenharmony_ci
30488c2ecf20Sopenharmony_ci	if (!nodes_empty(nodes))
30498c2ecf20Sopenharmony_ci		p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
30508c2ecf20Sopenharmony_ci			       nodemask_pr_args(&nodes));
30518c2ecf20Sopenharmony_ci}
3052