// SPDX-License-Identifier: GPL-2.0-only
/*
 * Linux VM pressure
 *
 * Copyright 2012 Linaro Ltd.
 *		  Anton Vorontsov <anton.vorontsov@linaro.org>
 *
 * Based on ideas from Andrew Morton, David Rientjes, KOSAKI Motohiro,
 * Leonid Moiseichuk, Mel Gorman, Minchan Kim and Pekka Enberg.
 */

#include <linux/cgroup.h>
#include <linux/fs.h>
#include <linux/log2.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/vmstat.h>
#include <linux/eventfd.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/printk.h>
#include <linux/vmpressure.h>

/*
 * The window size (vmpressure_win) is the number of scanned pages before
 * we try to analyze the scanned/reclaimed ratio. So the window is used as
 * a rate-limit tunable for the "low" level notification, and also for
 * averaging the ratio for the medium/critical levels. Using small window
 * sizes can cause a lot of false positives, but too big a window size
 * will delay the notifications.
 *
 * As the vmscan reclaimer logic works with chunks which are multiples of
 * SWAP_CLUSTER_MAX, it makes sense to use it for the window size as well.
 *
 * TODO: Make the window size depend on machine size, as we do for vmstat
 * thresholds. Currently we set it to 512 pages (2MB for 4KB pages).
 */
static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16;
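
/*
 * A quick sanity check on the value (assuming the usual SWAP_CLUSTER_MAX
 * of 32): 32 * 16 = 512 pages, i.e. the 2MB (with 4KB pages) mentioned
 * in the TODO above.
 */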

/*
 * These thresholds are used when we account memory pressure through
 * scanned/reclaimed ratio. The current values were chosen empirically. In
 * essence, they are percents: the higher the value, the more unsuccessful
 * reclaims there were.
 */
static const unsigned int vmpressure_level_med = 60;
static const unsigned int vmpressure_level_critical = 95;

/*
 * When there are too few pages left to scan, vmpressure() may miss the
 * critical pressure as the number of pages will be less than "window size".
 * However, in that case the vmscan priority will rise quickly as the
 * reclaimer will try to scan LRUs more deeply.
 *
 * The vmscan logic considers these special priorities:
 *
 * prio == DEF_PRIORITY (12): reclaimer starts with that value
 * prio <= DEF_PRIORITY - 2 : kswapd becomes somewhat overwhelmed
 * prio == 0                : close to OOM, kernel scans every page in an lru
 *
 * Any value in this range is acceptable for this tunable (i.e. from 12 to
 * 0). The current value of vmpressure_level_critical_prio is chosen
 * empirically, but the number, in essence, means that we consider the
 * critical level to be reached when scanning depth is ~10% of the lru size
 * (ilog2(100 / 10) evaluates to 3, and vmscan scans 'lru_size >> prio'
 * pages, so the depth is actually 12.5%, or one eighth).
 */
static const unsigned int vmpressure_level_critical_prio = ilog2(100 / 10);

static struct vmpressure *work_to_vmpressure(struct work_struct *work)
{
	return container_of(work, struct vmpressure, work);
}

static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr)
{
	struct cgroup_subsys_state *css = vmpressure_to_css(vmpr);
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	memcg = parent_mem_cgroup(memcg);
	if (!memcg)
		return NULL;
	return memcg_to_vmpressure(memcg);
}

enum vmpressure_levels {
	VMPRESSURE_LOW = 0,
	VMPRESSURE_MEDIUM,
	VMPRESSURE_CRITICAL,
	VMPRESSURE_NUM_LEVELS,
};
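
/*
 * Notification modes, as used by vmpressure_event() below and selected by
 * the optional mode token passed to vmpressure_register_event():
 *
 * VMPRESSURE_NO_PASSTHROUGH ("default"): the listener is notified of
 *	pressure in its own group or anywhere in its subtree, unless a
 *	listener lower in the hierarchy was already signalled for this
 *	event.
 * VMPRESSURE_HIERARCHY ("hierarchy"): like "default", but the listener
 *	is notified even if a listener lower in the hierarchy was already
 *	signalled.
 * VMPRESSURE_LOCAL ("local"): the listener is only notified of pressure
 *	originating in its own group, not in its descendants.
 */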

enum vmpressure_modes {
	VMPRESSURE_NO_PASSTHROUGH = 0,
	VMPRESSURE_HIERARCHY,
	VMPRESSURE_LOCAL,
	VMPRESSURE_NUM_MODES,
};

static const char * const vmpressure_str_levels[] = {
	[VMPRESSURE_LOW] = "low",
	[VMPRESSURE_MEDIUM] = "medium",
	[VMPRESSURE_CRITICAL] = "critical",
};

static const char * const vmpressure_str_modes[] = {
	[VMPRESSURE_NO_PASSTHROUGH] = "default",
	[VMPRESSURE_HIERARCHY] = "hierarchy",
	[VMPRESSURE_LOCAL] = "local",
};

static enum vmpressure_levels vmpressure_level(unsigned long pressure)
{
	if (pressure >= vmpressure_level_critical)
		return VMPRESSURE_CRITICAL;
	else if (pressure >= vmpressure_level_med)
		return VMPRESSURE_MEDIUM;
	return VMPRESSURE_LOW;
}

static enum vmpressure_levels vmpressure_calc_level(unsigned long scanned,
						    unsigned long reclaimed)
{
	unsigned long scale = scanned + reclaimed;
	unsigned long pressure = 0;

	/*
	 * reclaimed can be greater than scanned for things such as reclaimed
	 * slab pages. shrink_node() just adds reclaimed pages without a
	 * related increment to scanned pages.
	 */
	if (reclaimed >= scanned)
		goto out;
	/*
	 * We calculate the ratio (in percent) of how many pages were
	 * scanned vs. reclaimed in a given time frame (window). Note that
	 * time is in VM reclaimer's "ticks", i.e. number of pages
	 * scanned. This makes it possible to set the desired reaction time
	 * and serves as a ratelimit.
	 */
	pressure = scale - (reclaimed * scale / scanned);
	pressure = pressure * 100 / scale;

out:
	pr_debug("%s: %3lu  (s: %lu  r: %lu)\n", __func__, pressure,
		 scanned, reclaimed);

	return vmpressure_level(pressure);
}
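
/*
 * Illustrative numbers (not taken from a real reclaim trace): scanned == 512
 * and reclaimed == 128 give scale == 640 and
 * pressure == (640 - 128 * 640 / 512) * 100 / 640 == 75, the same as
 * 100 * (512 - 128) / 512, which maps to VMPRESSURE_MEDIUM (60 <= 75 < 95).
 */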

struct vmpressure_event {
	struct eventfd_ctx *efd;
	enum vmpressure_levels level;
	enum vmpressure_modes mode;
	struct list_head node;
};

static bool vmpressure_event(struct vmpressure *vmpr,
			     const enum vmpressure_levels level,
			     bool ancestor, bool signalled)
{
	struct vmpressure_event *ev;
	bool ret = false;

	mutex_lock(&vmpr->events_lock);
	list_for_each_entry(ev, &vmpr->events, node) {
		if (ancestor && ev->mode == VMPRESSURE_LOCAL)
			continue;
		if (signalled && ev->mode == VMPRESSURE_NO_PASSTHROUGH)
			continue;
		if (level < ev->level)
			continue;
		eventfd_signal(ev->efd, 1);
		ret = true;
	}
	mutex_unlock(&vmpr->events_lock);

	return ret;
}

static void vmpressure_work_fn(struct work_struct *work)
{
	struct vmpressure *vmpr = work_to_vmpressure(work);
	unsigned long scanned;
	unsigned long reclaimed;
	enum vmpressure_levels level;
	bool ancestor = false;
	bool signalled = false;

	spin_lock(&vmpr->sr_lock);
	/*
	 * Several contexts might be calling vmpressure(), so it is
	 * possible that the work was rescheduled again before the old
	 * work context cleared the counters. In that case we will run
	 * just after the old work returns, but then scanned might be zero
	 * here. No need for any locks here since we don't care if
	 * vmpr->reclaimed is in sync.
	 */
	scanned = vmpr->tree_scanned;
	if (!scanned) {
		spin_unlock(&vmpr->sr_lock);
		return;
	}

	reclaimed = vmpr->tree_reclaimed;
	vmpr->tree_scanned = 0;
	vmpr->tree_reclaimed = 0;
	spin_unlock(&vmpr->sr_lock);

	level = vmpressure_calc_level(scanned, reclaimed);

	do {
		if (vmpressure_event(vmpr, level, ancestor, signalled))
			signalled = true;
		ancestor = true;
	} while ((vmpr = vmpressure_parent(vmpr)));
}

/**
 * vmpressure() - Account memory pressure through scanned/reclaimed ratio
 * @gfp:	reclaimer's gfp mask
 * @memcg:	cgroup memory controller handle
 * @tree:	legacy subtree mode
 * @scanned:	number of pages scanned
 * @reclaimed:	number of pages reclaimed
 *
 * This function should be called from the vmscan reclaim path to account
 * "instantaneous" memory pressure (scanned/reclaimed ratio). The raw
 * pressure index is then further refined and averaged over time.
 *
 * If @tree is set, vmpressure is in traditional userspace reporting
 * mode: @memcg is considered the pressure root and userspace is
 * notified of the entire subtree's reclaim efficiency.
 *
 * If @tree is not set, reclaim efficiency is recorded for @memcg, and
 * only in-kernel users are notified.
 *
 * This function does not return any value.
 */
void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
		unsigned long scanned, unsigned long reclaimed)
{
	struct vmpressure *vmpr = memcg_to_vmpressure(memcg);

	/*
	 * Here we only want to account pressure that userland is able to
	 * help us with. For example, suppose that DMA zone is under
	 * pressure; if we notify userland about that kind of pressure,
	 * then it will be mostly a waste as it will trigger unnecessary
	 * freeing of memory by userland (since userland is more likely to
	 * have HIGHMEM/MOVABLE pages instead of the DMA fallback). That
	 * is why we include only movable, highmem and FS/IO pages.
	 * Indirect reclaim (kswapd) sets sc->gfp_mask to GFP_KERNEL, so
	 * we account it too.
	 */
	if (!(gfp & (__GFP_HIGHMEM | __GFP_MOVABLE | __GFP_IO | __GFP_FS)))
		return;

	/*
	 * If we got here with no pages scanned, then that is an indicator
	 * that reclaimer was unable to find any shrinkable LRUs at the
	 * current scanning depth. But it does not mean that we should
	 * report the critical pressure, yet. If the scanning priority
	 * (scanning depth) goes too high (deep), we will be notified
	 * through vmpressure_prio(). But so far, keep calm.
	 */
	if (!scanned)
		return;

	if (tree) {
		spin_lock(&vmpr->sr_lock);
		scanned = vmpr->tree_scanned += scanned;
		vmpr->tree_reclaimed += reclaimed;
		spin_unlock(&vmpr->sr_lock);

		if (scanned < vmpressure_win)
			return;
		schedule_work(&vmpr->work);
	} else {
		enum vmpressure_levels level;

		/* For now, no users for root-level efficiency */
		if (!memcg || mem_cgroup_is_root(memcg))
			return;

		spin_lock(&vmpr->sr_lock);
		scanned = vmpr->scanned += scanned;
		reclaimed = vmpr->reclaimed += reclaimed;
		if (scanned < vmpressure_win) {
			spin_unlock(&vmpr->sr_lock);
			return;
		}
		vmpr->scanned = vmpr->reclaimed = 0;
		spin_unlock(&vmpr->sr_lock);

		level = vmpressure_calc_level(scanned, reclaimed);

		if (level > VMPRESSURE_LOW) {
			/*
			 * Let the socket buffer allocator know that
			 * we are having trouble reclaiming LRU pages.
			 *
			 * For hysteresis keep the pressure state
			 * asserted for a second in which subsequent
			 * pressure events can occur.
			 */
			memcg->socket_pressure = jiffies + HZ;
		}
	}
}
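
/*
 * Illustrative call pattern (a sketch, not a verbatim copy of the call
 * sites in mm/vmscan.c): the reclaimer typically records per-memcg reclaim
 * efficiency with @tree == false and subtree-wide efficiency for the
 * reclaim target with @tree == true:
 *
 *	vmpressure(sc->gfp_mask, memcg, false,
 *		   pages_scanned, pages_reclaimed);
 *	vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
 *		   pages_scanned, pages_reclaimed);
 *
 * where pages_scanned/pages_reclaimed stand in for the deltas accumulated
 * during the current reclaim round.
 */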

/**
 * vmpressure_prio() - Account memory pressure through reclaimer priority level
 * @gfp:	reclaimer's gfp mask
 * @memcg:	cgroup memory controller handle
 * @prio:	reclaimer's priority
 *
 * This function should be called from the reclaim path every time the
 * vmscan reclaim priority (scanning depth) changes.
 *
 * This function does not return any value.
 */
void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
{
	/*
	 * We only use prio for accounting the critical level. For more info
	 * see the comment for the vmpressure_level_critical_prio variable
	 * above.
	 */
	if (prio > vmpressure_level_critical_prio)
		return;

	/*
	 * OK, the prio is at or below the threshold, so update the
	 * vmpressure information before the shrinker dives into a long
	 * reclaim pass. Passing scanned = vmpressure_win and reclaimed = 0
	 * to vmpressure() yields a 100% pressure index, i.e. we signal the
	 * 'critical' level.
	 */
	vmpressure(gfp, memcg, true, vmpressure_win, 0);
}

#define MAX_VMPRESSURE_ARGS_LEN	(strlen("critical") + strlen("hierarchy") + 2)

/**
 * vmpressure_register_event() - Bind vmpressure notifications to an eventfd
 * @memcg:	memcg that is interested in vmpressure notifications
 * @eventfd:	eventfd context to link notifications with
 * @args:	event arguments (pressure level threshold, optional mode)
 *
 * This function associates eventfd context with the vmpressure
 * infrastructure, so that the notifications will be delivered to the
 * @eventfd. The @args parameter is a comma-delimited string that denotes a
 * pressure level threshold (one of vmpressure_str_levels, i.e. "low", "medium",
 * or "critical") and an optional mode (one of vmpressure_str_modes, i.e.
 * "hierarchy" or "local").
 *
 * To be used as memcg event method.
 *
 * Return: 0 on success, -ENOMEM on memory failure or -EINVAL if @args could
 * not be parsed.
 */
int vmpressure_register_event(struct mem_cgroup *memcg,
			      struct eventfd_ctx *eventfd, const char *args)
{
	struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
	struct vmpressure_event *ev;
	enum vmpressure_modes mode = VMPRESSURE_NO_PASSTHROUGH;
	enum vmpressure_levels level;
	char *spec, *spec_orig;
	char *token;
	int ret = 0;

	spec_orig = spec = kstrndup(args, MAX_VMPRESSURE_ARGS_LEN, GFP_KERNEL);
	if (!spec)
		return -ENOMEM;

	/* Find required level */
	token = strsep(&spec, ",");
	ret = match_string(vmpressure_str_levels, VMPRESSURE_NUM_LEVELS, token);
	if (ret < 0)
		goto out;
	level = ret;

	/* Find optional mode */
	token = strsep(&spec, ",");
	if (token) {
		ret = match_string(vmpressure_str_modes, VMPRESSURE_NUM_MODES, token);
		if (ret < 0)
			goto out;
		mode = ret;
	}

	ev = kzalloc(sizeof(*ev), GFP_KERNEL);
	if (!ev) {
		ret = -ENOMEM;
		goto out;
	}

	ev->efd = eventfd;
	ev->level = level;
	ev->mode = mode;

	mutex_lock(&vmpr->events_lock);
	list_add(&ev->node, &vmpr->events);
	mutex_unlock(&vmpr->events_lock);
	ret = 0;
out:
	kfree(spec_orig);
	return ret;
}
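
/*
 * Illustrative userspace registration (a sketch of the cgroup v1 event
 * interface; the cgroup path below is hypothetical and the memory
 * controller must be mounted as a v1 hierarchy):
 *
 *	1. create an eventfd with eventfd(2);
 *	2. open /sys/fs/cgroup/memory/foo/memory.pressure_level read-only;
 *	3. write "<eventfd> <pressure_level fd> low,hierarchy" to
 *	   /sys/fs/cgroup/memory/foo/cgroup.event_control;
 *	4. read(2) on the eventfd then blocks until a "low" (or higher)
 *	   notification is delivered for the group's subtree.
 */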

/**
 * vmpressure_unregister_event() - Unbind eventfd from vmpressure
 * @memcg:	memcg handle
 * @eventfd:	eventfd context that was used to link vmpressure with @memcg
 *
 * This function does internal manipulations to detach the @eventfd from
 * the vmpressure notifications, and then frees internal resources
 * associated with the @eventfd (but the @eventfd itself is not freed).
 *
 * To be used as memcg event method.
 */
void vmpressure_unregister_event(struct mem_cgroup *memcg,
				 struct eventfd_ctx *eventfd)
{
	struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
	struct vmpressure_event *ev;

	mutex_lock(&vmpr->events_lock);
	list_for_each_entry(ev, &vmpr->events, node) {
		if (ev->efd != eventfd)
			continue;
		list_del(&ev->node);
		kfree(ev);
		break;
	}
	mutex_unlock(&vmpr->events_lock);
}

/**
 * vmpressure_init() - Initialize vmpressure control structure
 * @vmpr:	Structure to be initialized
 *
 * This function should be called on every allocated vmpressure structure
 * before any usage.
 */
void vmpressure_init(struct vmpressure *vmpr)
{
	spin_lock_init(&vmpr->sr_lock);
	mutex_init(&vmpr->events_lock);
	INIT_LIST_HEAD(&vmpr->events);
	INIT_WORK(&vmpr->work, vmpressure_work_fn);
}

/**
 * vmpressure_cleanup() - Shuts down vmpressure control structure
 * @vmpr:	Structure to be cleaned up
 *
 * This function should be called before the structure in which it is
 * embedded is cleaned up.
 */
void vmpressure_cleanup(struct vmpressure *vmpr)
{
	/*
	 * Make sure there is no pending work before eventfd infrastructure
	 * goes away.
	 */
	flush_work(&vmpr->work);
}