162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-only
262306a36Sopenharmony_ci/*
362306a36Sopenharmony_ci * Linux VM pressure
462306a36Sopenharmony_ci *
562306a36Sopenharmony_ci * Copyright 2012 Linaro Ltd.
662306a36Sopenharmony_ci *		  Anton Vorontsov <anton.vorontsov@linaro.org>
762306a36Sopenharmony_ci *
862306a36Sopenharmony_ci * Based on ideas from Andrew Morton, David Rientjes, KOSAKI Motohiro,
962306a36Sopenharmony_ci * Leonid Moiseichuk, Mel Gorman, Minchan Kim and Pekka Enberg.
1062306a36Sopenharmony_ci */
1162306a36Sopenharmony_ci
1262306a36Sopenharmony_ci#include <linux/cgroup.h>
1362306a36Sopenharmony_ci#include <linux/fs.h>
1462306a36Sopenharmony_ci#include <linux/log2.h>
1562306a36Sopenharmony_ci#include <linux/sched.h>
1662306a36Sopenharmony_ci#include <linux/mm.h>
1762306a36Sopenharmony_ci#include <linux/vmstat.h>
1862306a36Sopenharmony_ci#include <linux/eventfd.h>
1962306a36Sopenharmony_ci#include <linux/slab.h>
2062306a36Sopenharmony_ci#include <linux/swap.h>
2162306a36Sopenharmony_ci#include <linux/printk.h>
2262306a36Sopenharmony_ci#include <linux/vmpressure.h>
2362306a36Sopenharmony_ci
/*
 * The window size (vmpressure_win) is the number of scanned pages that
 * must accumulate before we try to analyze the scanned/reclaimed ratio.
 * The window is therefore used as a rate-limit tunable for the "low" level
 * notification, and also for averaging the ratio for the medium/critical
 * levels. Using small window sizes can cause a lot of false positives, but
 * a window size that is too big will delay the notifications.
 *
 * As the vmscan reclaimer logic works with chunks which are multiples of
 * SWAP_CLUSTER_MAX, it makes sense to use it for the window size as well.
 *
 * TODO: Make the window size depend on machine size, as we do for vmstat
 * thresholds. Currently we set it to 512 pages (2MB for 4KB pages).
 */
static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16;
3962306a36Sopenharmony_ci
/*
 * These thresholds are used when we account memory pressure through the
 * scanned/reclaimed ratio. The current values were chosen empirically. In
 * essence, they are percentages: the higher the value, the greater the
 * share of unsuccessful reclaims there was.
 *
 * vmpressure_level() compares the computed pressure against them with
 * ">=": at or above vmpressure_level_critical is "critical", otherwise at
 * or above vmpressure_level_med is "medium", anything else is "low".
 */
static const unsigned int vmpressure_level_med = 60;
static const unsigned int vmpressure_level_critical = 95;
4862306a36Sopenharmony_ci
/*
 * When there are too few pages left to scan, vmpressure() may miss the
 * critical pressure as the number of pages will be less than the "window
 * size". However, in that case the vmscan priority will rise quickly as
 * the reclaimer will try to scan LRUs more deeply.
 *
 * The vmscan logic considers these special priorities:
 *
 * prio == DEF_PRIORITY (12): reclaimer starts with that value
 * prio <= DEF_PRIORITY - 2 : kswapd becomes somewhat overwhelmed
 * prio == 0                : close to OOM, kernel scans every page in an lru
 *
 * Any value in this range is acceptable for this tunable (i.e. from 12 to
 * 0). The current value of vmpressure_level_critical_prio is chosen
 * empirically, but the number, in essence, means that we consider the
 * level critical when the scanning depth is ~10% of the lru size (vmscan
 * scans 'lru_size >> prio' pages, so it is actually 12.5%, or one
 * eighth).
 */
static const unsigned int vmpressure_level_critical_prio = ilog2(100 / 10);
6962306a36Sopenharmony_ci
7062306a36Sopenharmony_cistatic struct vmpressure *work_to_vmpressure(struct work_struct *work)
7162306a36Sopenharmony_ci{
7262306a36Sopenharmony_ci	return container_of(work, struct vmpressure, work);
7362306a36Sopenharmony_ci}
7462306a36Sopenharmony_ci
7562306a36Sopenharmony_cistatic struct vmpressure *vmpressure_parent(struct vmpressure *vmpr)
7662306a36Sopenharmony_ci{
7762306a36Sopenharmony_ci	struct mem_cgroup *memcg = vmpressure_to_memcg(vmpr);
7862306a36Sopenharmony_ci
7962306a36Sopenharmony_ci	memcg = parent_mem_cgroup(memcg);
8062306a36Sopenharmony_ci	if (!memcg)
8162306a36Sopenharmony_ci		return NULL;
8262306a36Sopenharmony_ci	return memcg_to_vmpressure(memcg);
8362306a36Sopenharmony_ci}
8462306a36Sopenharmony_ci
/*
 * Pressure levels, ordered from least to most severe. The numeric values
 * double as indices into vmpressure_str_levels[] and are compared with
 * "<" / ">" by the notification code, so the ordering matters.
 */
enum vmpressure_levels {
	VMPRESSURE_LOW		= 0,
	VMPRESSURE_MEDIUM	= 1,
	VMPRESSURE_CRITICAL	= 2,
	VMPRESSURE_NUM_LEVELS	= 3,	/* number of levels, not a level */
};
9162306a36Sopenharmony_ci
/*
 * Listener delivery modes. The numeric values double as indices into
 * vmpressure_str_modes[]. How each mode filters events is decided in
 * vmpressure_event().
 */
enum vmpressure_modes {
	/* Skipped once a listener lower in the hierarchy was signalled. */
	VMPRESSURE_NO_PASSTHROUGH	= 0,
	/* Never skipped because of its mode. */
	VMPRESSURE_HIERARCHY		= 1,
	/* Skipped when the event is being propagated to an ancestor. */
	VMPRESSURE_LOCAL		= 2,
	/* Number of modes, not a mode. */
	VMPRESSURE_NUM_MODES		= 3,
};
9862306a36Sopenharmony_ci
9962306a36Sopenharmony_cistatic const char * const vmpressure_str_levels[] = {
10062306a36Sopenharmony_ci	[VMPRESSURE_LOW] = "low",
10162306a36Sopenharmony_ci	[VMPRESSURE_MEDIUM] = "medium",
10262306a36Sopenharmony_ci	[VMPRESSURE_CRITICAL] = "critical",
10362306a36Sopenharmony_ci};
10462306a36Sopenharmony_ci
10562306a36Sopenharmony_cistatic const char * const vmpressure_str_modes[] = {
10662306a36Sopenharmony_ci	[VMPRESSURE_NO_PASSTHROUGH] = "default",
10762306a36Sopenharmony_ci	[VMPRESSURE_HIERARCHY] = "hierarchy",
10862306a36Sopenharmony_ci	[VMPRESSURE_LOCAL] = "local",
10962306a36Sopenharmony_ci};
11062306a36Sopenharmony_ci
11162306a36Sopenharmony_cistatic enum vmpressure_levels vmpressure_level(unsigned long pressure)
11262306a36Sopenharmony_ci{
11362306a36Sopenharmony_ci	if (pressure >= vmpressure_level_critical)
11462306a36Sopenharmony_ci		return VMPRESSURE_CRITICAL;
11562306a36Sopenharmony_ci	else if (pressure >= vmpressure_level_med)
11662306a36Sopenharmony_ci		return VMPRESSURE_MEDIUM;
11762306a36Sopenharmony_ci	return VMPRESSURE_LOW;
11862306a36Sopenharmony_ci}
11962306a36Sopenharmony_ci
12062306a36Sopenharmony_cistatic enum vmpressure_levels vmpressure_calc_level(unsigned long scanned,
12162306a36Sopenharmony_ci						    unsigned long reclaimed)
12262306a36Sopenharmony_ci{
12362306a36Sopenharmony_ci	unsigned long scale = scanned + reclaimed;
12462306a36Sopenharmony_ci	unsigned long pressure = 0;
12562306a36Sopenharmony_ci
12662306a36Sopenharmony_ci	/*
12762306a36Sopenharmony_ci	 * reclaimed can be greater than scanned for things such as reclaimed
12862306a36Sopenharmony_ci	 * slab pages. shrink_node() just adds reclaimed pages without a
12962306a36Sopenharmony_ci	 * related increment to scanned pages.
13062306a36Sopenharmony_ci	 */
13162306a36Sopenharmony_ci	if (reclaimed >= scanned)
13262306a36Sopenharmony_ci		goto out;
13362306a36Sopenharmony_ci	/*
13462306a36Sopenharmony_ci	 * We calculate the ratio (in percents) of how many pages were
13562306a36Sopenharmony_ci	 * scanned vs. reclaimed in a given time frame (window). Note that
13662306a36Sopenharmony_ci	 * time is in VM reclaimer's "ticks", i.e. number of pages
13762306a36Sopenharmony_ci	 * scanned. This makes it possible to set desired reaction time
13862306a36Sopenharmony_ci	 * and serves as a ratelimit.
13962306a36Sopenharmony_ci	 */
14062306a36Sopenharmony_ci	pressure = scale - (reclaimed * scale / scanned);
14162306a36Sopenharmony_ci	pressure = pressure * 100 / scale;
14262306a36Sopenharmony_ci
14362306a36Sopenharmony_ciout:
14462306a36Sopenharmony_ci	pr_debug("%s: %3lu  (s: %lu  r: %lu)\n", __func__, pressure,
14562306a36Sopenharmony_ci		 scanned, reclaimed);
14662306a36Sopenharmony_ci
14762306a36Sopenharmony_ci	return vmpressure_level(pressure);
14862306a36Sopenharmony_ci}
14962306a36Sopenharmony_ci
/* One registered listener: an eventfd plus the conditions for signalling it. */
struct vmpressure_event {
	struct eventfd_ctx *efd;		/* eventfd signalled on a matching event */
	enum vmpressure_levels level;		/* lowest level that triggers a signal */
	enum vmpressure_modes mode;		/* propagation filter, see vmpressure_event() */
	struct list_head node;			/* link in vmpressure->events */
};
15662306a36Sopenharmony_ci
15762306a36Sopenharmony_cistatic bool vmpressure_event(struct vmpressure *vmpr,
15862306a36Sopenharmony_ci			     const enum vmpressure_levels level,
15962306a36Sopenharmony_ci			     bool ancestor, bool signalled)
16062306a36Sopenharmony_ci{
16162306a36Sopenharmony_ci	struct vmpressure_event *ev;
16262306a36Sopenharmony_ci	bool ret = false;
16362306a36Sopenharmony_ci
16462306a36Sopenharmony_ci	mutex_lock(&vmpr->events_lock);
16562306a36Sopenharmony_ci	list_for_each_entry(ev, &vmpr->events, node) {
16662306a36Sopenharmony_ci		if (ancestor && ev->mode == VMPRESSURE_LOCAL)
16762306a36Sopenharmony_ci			continue;
16862306a36Sopenharmony_ci		if (signalled && ev->mode == VMPRESSURE_NO_PASSTHROUGH)
16962306a36Sopenharmony_ci			continue;
17062306a36Sopenharmony_ci		if (level < ev->level)
17162306a36Sopenharmony_ci			continue;
17262306a36Sopenharmony_ci		eventfd_signal(ev->efd, 1);
17362306a36Sopenharmony_ci		ret = true;
17462306a36Sopenharmony_ci	}
17562306a36Sopenharmony_ci	mutex_unlock(&vmpr->events_lock);
17662306a36Sopenharmony_ci
17762306a36Sopenharmony_ci	return ret;
17862306a36Sopenharmony_ci}
17962306a36Sopenharmony_ci
18062306a36Sopenharmony_cistatic void vmpressure_work_fn(struct work_struct *work)
18162306a36Sopenharmony_ci{
18262306a36Sopenharmony_ci	struct vmpressure *vmpr = work_to_vmpressure(work);
18362306a36Sopenharmony_ci	unsigned long scanned;
18462306a36Sopenharmony_ci	unsigned long reclaimed;
18562306a36Sopenharmony_ci	enum vmpressure_levels level;
18662306a36Sopenharmony_ci	bool ancestor = false;
18762306a36Sopenharmony_ci	bool signalled = false;
18862306a36Sopenharmony_ci
18962306a36Sopenharmony_ci	spin_lock(&vmpr->sr_lock);
19062306a36Sopenharmony_ci	/*
19162306a36Sopenharmony_ci	 * Several contexts might be calling vmpressure(), so it is
19262306a36Sopenharmony_ci	 * possible that the work was rescheduled again before the old
19362306a36Sopenharmony_ci	 * work context cleared the counters. In that case we will run
19462306a36Sopenharmony_ci	 * just after the old work returns, but then scanned might be zero
19562306a36Sopenharmony_ci	 * here. No need for any locks here since we don't care if
19662306a36Sopenharmony_ci	 * vmpr->reclaimed is in sync.
19762306a36Sopenharmony_ci	 */
19862306a36Sopenharmony_ci	scanned = vmpr->tree_scanned;
19962306a36Sopenharmony_ci	if (!scanned) {
20062306a36Sopenharmony_ci		spin_unlock(&vmpr->sr_lock);
20162306a36Sopenharmony_ci		return;
20262306a36Sopenharmony_ci	}
20362306a36Sopenharmony_ci
20462306a36Sopenharmony_ci	reclaimed = vmpr->tree_reclaimed;
20562306a36Sopenharmony_ci	vmpr->tree_scanned = 0;
20662306a36Sopenharmony_ci	vmpr->tree_reclaimed = 0;
20762306a36Sopenharmony_ci	spin_unlock(&vmpr->sr_lock);
20862306a36Sopenharmony_ci
20962306a36Sopenharmony_ci	level = vmpressure_calc_level(scanned, reclaimed);
21062306a36Sopenharmony_ci
21162306a36Sopenharmony_ci	do {
21262306a36Sopenharmony_ci		if (vmpressure_event(vmpr, level, ancestor, signalled))
21362306a36Sopenharmony_ci			signalled = true;
21462306a36Sopenharmony_ci		ancestor = true;
21562306a36Sopenharmony_ci	} while ((vmpr = vmpressure_parent(vmpr)));
21662306a36Sopenharmony_ci}
21762306a36Sopenharmony_ci
21862306a36Sopenharmony_ci/**
21962306a36Sopenharmony_ci * vmpressure() - Account memory pressure through scanned/reclaimed ratio
22062306a36Sopenharmony_ci * @gfp:	reclaimer's gfp mask
22162306a36Sopenharmony_ci * @memcg:	cgroup memory controller handle
22262306a36Sopenharmony_ci * @tree:	legacy subtree mode
22362306a36Sopenharmony_ci * @scanned:	number of pages scanned
22462306a36Sopenharmony_ci * @reclaimed:	number of pages reclaimed
22562306a36Sopenharmony_ci *
22662306a36Sopenharmony_ci * This function should be called from the vmscan reclaim path to account
22762306a36Sopenharmony_ci * "instantaneous" memory pressure (scanned/reclaimed ratio). The raw
22862306a36Sopenharmony_ci * pressure index is then further refined and averaged over time.
22962306a36Sopenharmony_ci *
23062306a36Sopenharmony_ci * If @tree is set, vmpressure is in traditional userspace reporting
23162306a36Sopenharmony_ci * mode: @memcg is considered the pressure root and userspace is
23262306a36Sopenharmony_ci * notified of the entire subtree's reclaim efficiency.
23362306a36Sopenharmony_ci *
23462306a36Sopenharmony_ci * If @tree is not set, reclaim efficiency is recorded for @memcg, and
23562306a36Sopenharmony_ci * only in-kernel users are notified.
23662306a36Sopenharmony_ci *
23762306a36Sopenharmony_ci * This function does not return any value.
23862306a36Sopenharmony_ci */
23962306a36Sopenharmony_civoid vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
24062306a36Sopenharmony_ci		unsigned long scanned, unsigned long reclaimed)
24162306a36Sopenharmony_ci{
24262306a36Sopenharmony_ci	struct vmpressure *vmpr;
24362306a36Sopenharmony_ci
24462306a36Sopenharmony_ci	if (mem_cgroup_disabled())
24562306a36Sopenharmony_ci		return;
24662306a36Sopenharmony_ci
24762306a36Sopenharmony_ci	/*
24862306a36Sopenharmony_ci	 * The in-kernel users only care about the reclaim efficiency
24962306a36Sopenharmony_ci	 * for this @memcg rather than the whole subtree, and there
25062306a36Sopenharmony_ci	 * isn't and won't be any in-kernel user in a legacy cgroup.
25162306a36Sopenharmony_ci	 */
25262306a36Sopenharmony_ci	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !tree)
25362306a36Sopenharmony_ci		return;
25462306a36Sopenharmony_ci
25562306a36Sopenharmony_ci	vmpr = memcg_to_vmpressure(memcg);
25662306a36Sopenharmony_ci
25762306a36Sopenharmony_ci	/*
25862306a36Sopenharmony_ci	 * Here we only want to account pressure that userland is able to
25962306a36Sopenharmony_ci	 * help us with. For example, suppose that DMA zone is under
26062306a36Sopenharmony_ci	 * pressure; if we notify userland about that kind of pressure,
26162306a36Sopenharmony_ci	 * then it will be mostly a waste as it will trigger unnecessary
26262306a36Sopenharmony_ci	 * freeing of memory by userland (since userland is more likely to
26362306a36Sopenharmony_ci	 * have HIGHMEM/MOVABLE pages instead of the DMA fallback). That
26462306a36Sopenharmony_ci	 * is why we include only movable, highmem and FS/IO pages.
26562306a36Sopenharmony_ci	 * Indirect reclaim (kswapd) sets sc->gfp_mask to GFP_KERNEL, so
26662306a36Sopenharmony_ci	 * we account it too.
26762306a36Sopenharmony_ci	 */
26862306a36Sopenharmony_ci	if (!(gfp & (__GFP_HIGHMEM | __GFP_MOVABLE | __GFP_IO | __GFP_FS)))
26962306a36Sopenharmony_ci		return;
27062306a36Sopenharmony_ci
27162306a36Sopenharmony_ci	/*
27262306a36Sopenharmony_ci	 * If we got here with no pages scanned, then that is an indicator
27362306a36Sopenharmony_ci	 * that reclaimer was unable to find any shrinkable LRUs at the
27462306a36Sopenharmony_ci	 * current scanning depth. But it does not mean that we should
27562306a36Sopenharmony_ci	 * report the critical pressure, yet. If the scanning priority
27662306a36Sopenharmony_ci	 * (scanning depth) goes too high (deep), we will be notified
27762306a36Sopenharmony_ci	 * through vmpressure_prio(). But so far, keep calm.
27862306a36Sopenharmony_ci	 */
27962306a36Sopenharmony_ci	if (!scanned)
28062306a36Sopenharmony_ci		return;
28162306a36Sopenharmony_ci
28262306a36Sopenharmony_ci	if (tree) {
28362306a36Sopenharmony_ci		spin_lock(&vmpr->sr_lock);
28462306a36Sopenharmony_ci		scanned = vmpr->tree_scanned += scanned;
28562306a36Sopenharmony_ci		vmpr->tree_reclaimed += reclaimed;
28662306a36Sopenharmony_ci		spin_unlock(&vmpr->sr_lock);
28762306a36Sopenharmony_ci
28862306a36Sopenharmony_ci		if (scanned < vmpressure_win)
28962306a36Sopenharmony_ci			return;
29062306a36Sopenharmony_ci		schedule_work(&vmpr->work);
29162306a36Sopenharmony_ci	} else {
29262306a36Sopenharmony_ci		enum vmpressure_levels level;
29362306a36Sopenharmony_ci
29462306a36Sopenharmony_ci		/* For now, no users for root-level efficiency */
29562306a36Sopenharmony_ci		if (!memcg || mem_cgroup_is_root(memcg))
29662306a36Sopenharmony_ci			return;
29762306a36Sopenharmony_ci
29862306a36Sopenharmony_ci		spin_lock(&vmpr->sr_lock);
29962306a36Sopenharmony_ci		scanned = vmpr->scanned += scanned;
30062306a36Sopenharmony_ci		reclaimed = vmpr->reclaimed += reclaimed;
30162306a36Sopenharmony_ci		if (scanned < vmpressure_win) {
30262306a36Sopenharmony_ci			spin_unlock(&vmpr->sr_lock);
30362306a36Sopenharmony_ci			return;
30462306a36Sopenharmony_ci		}
30562306a36Sopenharmony_ci		vmpr->scanned = vmpr->reclaimed = 0;
30662306a36Sopenharmony_ci		spin_unlock(&vmpr->sr_lock);
30762306a36Sopenharmony_ci
30862306a36Sopenharmony_ci		level = vmpressure_calc_level(scanned, reclaimed);
30962306a36Sopenharmony_ci
31062306a36Sopenharmony_ci		if (level > VMPRESSURE_LOW) {
31162306a36Sopenharmony_ci			/*
31262306a36Sopenharmony_ci			 * Let the socket buffer allocator know that
31362306a36Sopenharmony_ci			 * we are having trouble reclaiming LRU pages.
31462306a36Sopenharmony_ci			 *
31562306a36Sopenharmony_ci			 * For hysteresis keep the pressure state
31662306a36Sopenharmony_ci			 * asserted for a second in which subsequent
31762306a36Sopenharmony_ci			 * pressure events can occur.
31862306a36Sopenharmony_ci			 */
31962306a36Sopenharmony_ci			WRITE_ONCE(memcg->socket_pressure, jiffies + HZ);
32062306a36Sopenharmony_ci		}
32162306a36Sopenharmony_ci	}
32262306a36Sopenharmony_ci}
32362306a36Sopenharmony_ci
32462306a36Sopenharmony_ci/**
32562306a36Sopenharmony_ci * vmpressure_prio() - Account memory pressure through reclaimer priority level
32662306a36Sopenharmony_ci * @gfp:	reclaimer's gfp mask
32762306a36Sopenharmony_ci * @memcg:	cgroup memory controller handle
32862306a36Sopenharmony_ci * @prio:	reclaimer's priority
32962306a36Sopenharmony_ci *
33062306a36Sopenharmony_ci * This function should be called from the reclaim path every time when
33162306a36Sopenharmony_ci * the vmscan's reclaiming priority (scanning depth) changes.
33262306a36Sopenharmony_ci *
33362306a36Sopenharmony_ci * This function does not return any value.
33462306a36Sopenharmony_ci */
33562306a36Sopenharmony_civoid vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
33662306a36Sopenharmony_ci{
33762306a36Sopenharmony_ci	/*
33862306a36Sopenharmony_ci	 * We only use prio for accounting critical level. For more info
33962306a36Sopenharmony_ci	 * see comment for vmpressure_level_critical_prio variable above.
34062306a36Sopenharmony_ci	 */
34162306a36Sopenharmony_ci	if (prio > vmpressure_level_critical_prio)
34262306a36Sopenharmony_ci		return;
34362306a36Sopenharmony_ci
34462306a36Sopenharmony_ci	/*
34562306a36Sopenharmony_ci	 * OK, the prio is below the threshold, updating vmpressure
34662306a36Sopenharmony_ci	 * information before shrinker dives into long shrinking of long
34762306a36Sopenharmony_ci	 * range vmscan. Passing scanned = vmpressure_win, reclaimed = 0
34862306a36Sopenharmony_ci	 * to the vmpressure() basically means that we signal 'critical'
34962306a36Sopenharmony_ci	 * level.
35062306a36Sopenharmony_ci	 */
35162306a36Sopenharmony_ci	vmpressure(gfp, memcg, true, vmpressure_win, 0);
35262306a36Sopenharmony_ci}
35362306a36Sopenharmony_ci
/*
 * Upper bound on how much of the user-supplied argument string is copied
 * by vmpressure_register_event(): the longest level name, the longest mode
 * name, plus two more characters. One of those covers the separating
 * comma. NOTE(review): the other presumably lets an over-long string keep
 * one excess character so that it fails to match instead of being
 * truncated into a valid spec; confirm.
 */
#define MAX_VMPRESSURE_ARGS_LEN	(strlen("critical") + strlen("hierarchy") + 2)
35562306a36Sopenharmony_ci
35662306a36Sopenharmony_ci/**
35762306a36Sopenharmony_ci * vmpressure_register_event() - Bind vmpressure notifications to an eventfd
35862306a36Sopenharmony_ci * @memcg:	memcg that is interested in vmpressure notifications
35962306a36Sopenharmony_ci * @eventfd:	eventfd context to link notifications with
36062306a36Sopenharmony_ci * @args:	event arguments (pressure level threshold, optional mode)
36162306a36Sopenharmony_ci *
36262306a36Sopenharmony_ci * This function associates eventfd context with the vmpressure
36362306a36Sopenharmony_ci * infrastructure, so that the notifications will be delivered to the
36462306a36Sopenharmony_ci * @eventfd. The @args parameter is a comma-delimited string that denotes a
36562306a36Sopenharmony_ci * pressure level threshold (one of vmpressure_str_levels, i.e. "low", "medium",
36662306a36Sopenharmony_ci * or "critical") and an optional mode (one of vmpressure_str_modes, i.e.
36762306a36Sopenharmony_ci * "hierarchy" or "local").
36862306a36Sopenharmony_ci *
36962306a36Sopenharmony_ci * To be used as memcg event method.
37062306a36Sopenharmony_ci *
37162306a36Sopenharmony_ci * Return: 0 on success, -ENOMEM on memory failure or -EINVAL if @args could
37262306a36Sopenharmony_ci * not be parsed.
37362306a36Sopenharmony_ci */
37462306a36Sopenharmony_ciint vmpressure_register_event(struct mem_cgroup *memcg,
37562306a36Sopenharmony_ci			      struct eventfd_ctx *eventfd, const char *args)
37662306a36Sopenharmony_ci{
37762306a36Sopenharmony_ci	struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
37862306a36Sopenharmony_ci	struct vmpressure_event *ev;
37962306a36Sopenharmony_ci	enum vmpressure_modes mode = VMPRESSURE_NO_PASSTHROUGH;
38062306a36Sopenharmony_ci	enum vmpressure_levels level;
38162306a36Sopenharmony_ci	char *spec, *spec_orig;
38262306a36Sopenharmony_ci	char *token;
38362306a36Sopenharmony_ci	int ret = 0;
38462306a36Sopenharmony_ci
38562306a36Sopenharmony_ci	spec_orig = spec = kstrndup(args, MAX_VMPRESSURE_ARGS_LEN, GFP_KERNEL);
38662306a36Sopenharmony_ci	if (!spec)
38762306a36Sopenharmony_ci		return -ENOMEM;
38862306a36Sopenharmony_ci
38962306a36Sopenharmony_ci	/* Find required level */
39062306a36Sopenharmony_ci	token = strsep(&spec, ",");
39162306a36Sopenharmony_ci	ret = match_string(vmpressure_str_levels, VMPRESSURE_NUM_LEVELS, token);
39262306a36Sopenharmony_ci	if (ret < 0)
39362306a36Sopenharmony_ci		goto out;
39462306a36Sopenharmony_ci	level = ret;
39562306a36Sopenharmony_ci
39662306a36Sopenharmony_ci	/* Find optional mode */
39762306a36Sopenharmony_ci	token = strsep(&spec, ",");
39862306a36Sopenharmony_ci	if (token) {
39962306a36Sopenharmony_ci		ret = match_string(vmpressure_str_modes, VMPRESSURE_NUM_MODES, token);
40062306a36Sopenharmony_ci		if (ret < 0)
40162306a36Sopenharmony_ci			goto out;
40262306a36Sopenharmony_ci		mode = ret;
40362306a36Sopenharmony_ci	}
40462306a36Sopenharmony_ci
40562306a36Sopenharmony_ci	ev = kzalloc(sizeof(*ev), GFP_KERNEL);
40662306a36Sopenharmony_ci	if (!ev) {
40762306a36Sopenharmony_ci		ret = -ENOMEM;
40862306a36Sopenharmony_ci		goto out;
40962306a36Sopenharmony_ci	}
41062306a36Sopenharmony_ci
41162306a36Sopenharmony_ci	ev->efd = eventfd;
41262306a36Sopenharmony_ci	ev->level = level;
41362306a36Sopenharmony_ci	ev->mode = mode;
41462306a36Sopenharmony_ci
41562306a36Sopenharmony_ci	mutex_lock(&vmpr->events_lock);
41662306a36Sopenharmony_ci	list_add(&ev->node, &vmpr->events);
41762306a36Sopenharmony_ci	mutex_unlock(&vmpr->events_lock);
41862306a36Sopenharmony_ci	ret = 0;
41962306a36Sopenharmony_ciout:
42062306a36Sopenharmony_ci	kfree(spec_orig);
42162306a36Sopenharmony_ci	return ret;
42262306a36Sopenharmony_ci}
42362306a36Sopenharmony_ci
42462306a36Sopenharmony_ci/**
42562306a36Sopenharmony_ci * vmpressure_unregister_event() - Unbind eventfd from vmpressure
42662306a36Sopenharmony_ci * @memcg:	memcg handle
42762306a36Sopenharmony_ci * @eventfd:	eventfd context that was used to link vmpressure with the @cg
42862306a36Sopenharmony_ci *
42962306a36Sopenharmony_ci * This function does internal manipulations to detach the @eventfd from
43062306a36Sopenharmony_ci * the vmpressure notifications, and then frees internal resources
43162306a36Sopenharmony_ci * associated with the @eventfd (but the @eventfd itself is not freed).
43262306a36Sopenharmony_ci *
43362306a36Sopenharmony_ci * To be used as memcg event method.
43462306a36Sopenharmony_ci */
43562306a36Sopenharmony_civoid vmpressure_unregister_event(struct mem_cgroup *memcg,
43662306a36Sopenharmony_ci				 struct eventfd_ctx *eventfd)
43762306a36Sopenharmony_ci{
43862306a36Sopenharmony_ci	struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
43962306a36Sopenharmony_ci	struct vmpressure_event *ev;
44062306a36Sopenharmony_ci
44162306a36Sopenharmony_ci	mutex_lock(&vmpr->events_lock);
44262306a36Sopenharmony_ci	list_for_each_entry(ev, &vmpr->events, node) {
44362306a36Sopenharmony_ci		if (ev->efd != eventfd)
44462306a36Sopenharmony_ci			continue;
44562306a36Sopenharmony_ci		list_del(&ev->node);
44662306a36Sopenharmony_ci		kfree(ev);
44762306a36Sopenharmony_ci		break;
44862306a36Sopenharmony_ci	}
44962306a36Sopenharmony_ci	mutex_unlock(&vmpr->events_lock);
45062306a36Sopenharmony_ci}
45162306a36Sopenharmony_ci
45262306a36Sopenharmony_ci/**
45362306a36Sopenharmony_ci * vmpressure_init() - Initialize vmpressure control structure
45462306a36Sopenharmony_ci * @vmpr:	Structure to be initialized
45562306a36Sopenharmony_ci *
45662306a36Sopenharmony_ci * This function should be called on every allocated vmpressure structure
45762306a36Sopenharmony_ci * before any usage.
45862306a36Sopenharmony_ci */
45962306a36Sopenharmony_civoid vmpressure_init(struct vmpressure *vmpr)
46062306a36Sopenharmony_ci{
46162306a36Sopenharmony_ci	spin_lock_init(&vmpr->sr_lock);
46262306a36Sopenharmony_ci	mutex_init(&vmpr->events_lock);
46362306a36Sopenharmony_ci	INIT_LIST_HEAD(&vmpr->events);
46462306a36Sopenharmony_ci	INIT_WORK(&vmpr->work, vmpressure_work_fn);
46562306a36Sopenharmony_ci}
46662306a36Sopenharmony_ci
46762306a36Sopenharmony_ci/**
46862306a36Sopenharmony_ci * vmpressure_cleanup() - shuts down vmpressure control structure
46962306a36Sopenharmony_ci * @vmpr:	Structure to be cleaned up
47062306a36Sopenharmony_ci *
47162306a36Sopenharmony_ci * This function should be called before the structure in which it is
47262306a36Sopenharmony_ci * embedded is cleaned up.
47362306a36Sopenharmony_ci */
47462306a36Sopenharmony_civoid vmpressure_cleanup(struct vmpressure *vmpr)
47562306a36Sopenharmony_ci{
47662306a36Sopenharmony_ci	/*
47762306a36Sopenharmony_ci	 * Make sure there is no pending work before eventfd infrastructure
47862306a36Sopenharmony_ci	 * goes away.
47962306a36Sopenharmony_ci	 */
48062306a36Sopenharmony_ci	flush_work(&vmpr->work);
48162306a36Sopenharmony_ci}
482