162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-only 262306a36Sopenharmony_ci/* 362306a36Sopenharmony_ci * Linux VM pressure 462306a36Sopenharmony_ci * 562306a36Sopenharmony_ci * Copyright 2012 Linaro Ltd. 662306a36Sopenharmony_ci * Anton Vorontsov <anton.vorontsov@linaro.org> 762306a36Sopenharmony_ci * 862306a36Sopenharmony_ci * Based on ideas from Andrew Morton, David Rientjes, KOSAKI Motohiro, 962306a36Sopenharmony_ci * Leonid Moiseichuk, Mel Gorman, Minchan Kim and Pekka Enberg. 1062306a36Sopenharmony_ci */ 1162306a36Sopenharmony_ci 1262306a36Sopenharmony_ci#include <linux/cgroup.h> 1362306a36Sopenharmony_ci#include <linux/fs.h> 1462306a36Sopenharmony_ci#include <linux/log2.h> 1562306a36Sopenharmony_ci#include <linux/sched.h> 1662306a36Sopenharmony_ci#include <linux/mm.h> 1762306a36Sopenharmony_ci#include <linux/vmstat.h> 1862306a36Sopenharmony_ci#include <linux/eventfd.h> 1962306a36Sopenharmony_ci#include <linux/slab.h> 2062306a36Sopenharmony_ci#include <linux/swap.h> 2162306a36Sopenharmony_ci#include <linux/printk.h> 2262306a36Sopenharmony_ci#include <linux/vmpressure.h> 2362306a36Sopenharmony_ci 2462306a36Sopenharmony_ci/* 2562306a36Sopenharmony_ci * The window size (vmpressure_win) is the number of scanned pages before 2662306a36Sopenharmony_ci * we try to analyze scanned/reclaimed ratio. So the window is used as a 2762306a36Sopenharmony_ci * rate-limit tunable for the "low" level notification, and also for 2862306a36Sopenharmony_ci * averaging the ratio for medium/critical levels. Using small window 2962306a36Sopenharmony_ci * sizes can cause lot of false positives, but too big window size will 3062306a36Sopenharmony_ci * delay the notifications. 3162306a36Sopenharmony_ci * 3262306a36Sopenharmony_ci * As the vmscan reclaimer logic works with chunks which are multiple of 3362306a36Sopenharmony_ci * SWAP_CLUSTER_MAX, it makes sense to use it for the window size as well. 3462306a36Sopenharmony_ci * 3562306a36Sopenharmony_ci * TODO: Make the window size depend on machine size, as we do for vmstat 3662306a36Sopenharmony_ci * thresholds. Currently we set it to 512 pages (2MB for 4KB pages). 3762306a36Sopenharmony_ci */ 3862306a36Sopenharmony_cistatic const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16; 3962306a36Sopenharmony_ci 4062306a36Sopenharmony_ci/* 4162306a36Sopenharmony_ci * These thresholds are used when we account memory pressure through 4262306a36Sopenharmony_ci * scanned/reclaimed ratio. The current values were chosen empirically. In 4362306a36Sopenharmony_ci * essence, they are percents: the higher the value, the more number 4462306a36Sopenharmony_ci * unsuccessful reclaims there were. 4562306a36Sopenharmony_ci */ 4662306a36Sopenharmony_cistatic const unsigned int vmpressure_level_med = 60; 4762306a36Sopenharmony_cistatic const unsigned int vmpressure_level_critical = 95; 4862306a36Sopenharmony_ci 4962306a36Sopenharmony_ci/* 5062306a36Sopenharmony_ci * When there are too little pages left to scan, vmpressure() may miss the 5162306a36Sopenharmony_ci * critical pressure as number of pages will be less than "window size". 5262306a36Sopenharmony_ci * However, in that case the vmscan priority will raise fast as the 5362306a36Sopenharmony_ci * reclaimer will try to scan LRUs more deeply. 5462306a36Sopenharmony_ci * 5562306a36Sopenharmony_ci * The vmscan logic considers these special priorities: 5662306a36Sopenharmony_ci * 5762306a36Sopenharmony_ci * prio == DEF_PRIORITY (12): reclaimer starts with that value 5862306a36Sopenharmony_ci * prio <= DEF_PRIORITY - 2 : kswapd becomes somewhat overwhelmed 5962306a36Sopenharmony_ci * prio == 0 : close to OOM, kernel scans every page in an lru 6062306a36Sopenharmony_ci * 6162306a36Sopenharmony_ci * Any value in this range is acceptable for this tunable (i.e. from 12 to 6262306a36Sopenharmony_ci * 0). Current value for the vmpressure_level_critical_prio is chosen 6362306a36Sopenharmony_ci * empirically, but the number, in essence, means that we consider 6462306a36Sopenharmony_ci * critical level when scanning depth is ~10% of the lru size (vmscan 6562306a36Sopenharmony_ci * scans 'lru_size >> prio' pages, so it is actually 12.5%, or one 6662306a36Sopenharmony_ci * eights). 6762306a36Sopenharmony_ci */ 6862306a36Sopenharmony_cistatic const unsigned int vmpressure_level_critical_prio = ilog2(100 / 10); 6962306a36Sopenharmony_ci 7062306a36Sopenharmony_cistatic struct vmpressure *work_to_vmpressure(struct work_struct *work) 7162306a36Sopenharmony_ci{ 7262306a36Sopenharmony_ci return container_of(work, struct vmpressure, work); 7362306a36Sopenharmony_ci} 7462306a36Sopenharmony_ci 7562306a36Sopenharmony_cistatic struct vmpressure *vmpressure_parent(struct vmpressure *vmpr) 7662306a36Sopenharmony_ci{ 7762306a36Sopenharmony_ci struct mem_cgroup *memcg = vmpressure_to_memcg(vmpr); 7862306a36Sopenharmony_ci 7962306a36Sopenharmony_ci memcg = parent_mem_cgroup(memcg); 8062306a36Sopenharmony_ci if (!memcg) 8162306a36Sopenharmony_ci return NULL; 8262306a36Sopenharmony_ci return memcg_to_vmpressure(memcg); 8362306a36Sopenharmony_ci} 8462306a36Sopenharmony_ci 8562306a36Sopenharmony_cienum vmpressure_levels { 8662306a36Sopenharmony_ci VMPRESSURE_LOW = 0, 8762306a36Sopenharmony_ci VMPRESSURE_MEDIUM, 8862306a36Sopenharmony_ci VMPRESSURE_CRITICAL, 8962306a36Sopenharmony_ci VMPRESSURE_NUM_LEVELS, 9062306a36Sopenharmony_ci}; 9162306a36Sopenharmony_ci 9262306a36Sopenharmony_cienum vmpressure_modes { 9362306a36Sopenharmony_ci VMPRESSURE_NO_PASSTHROUGH = 0, 9462306a36Sopenharmony_ci VMPRESSURE_HIERARCHY, 9562306a36Sopenharmony_ci VMPRESSURE_LOCAL, 9662306a36Sopenharmony_ci VMPRESSURE_NUM_MODES, 9762306a36Sopenharmony_ci}; 9862306a36Sopenharmony_ci 9962306a36Sopenharmony_cistatic const char * const vmpressure_str_levels[] = { 10062306a36Sopenharmony_ci [VMPRESSURE_LOW] = "low", 10162306a36Sopenharmony_ci [VMPRESSURE_MEDIUM] = "medium", 10262306a36Sopenharmony_ci [VMPRESSURE_CRITICAL] = "critical", 10362306a36Sopenharmony_ci}; 10462306a36Sopenharmony_ci 10562306a36Sopenharmony_cistatic const char * const vmpressure_str_modes[] = { 10662306a36Sopenharmony_ci [VMPRESSURE_NO_PASSTHROUGH] = "default", 10762306a36Sopenharmony_ci [VMPRESSURE_HIERARCHY] = "hierarchy", 10862306a36Sopenharmony_ci [VMPRESSURE_LOCAL] = "local", 10962306a36Sopenharmony_ci}; 11062306a36Sopenharmony_ci 11162306a36Sopenharmony_cistatic enum vmpressure_levels vmpressure_level(unsigned long pressure) 11262306a36Sopenharmony_ci{ 11362306a36Sopenharmony_ci if (pressure >= vmpressure_level_critical) 11462306a36Sopenharmony_ci return VMPRESSURE_CRITICAL; 11562306a36Sopenharmony_ci else if (pressure >= vmpressure_level_med) 11662306a36Sopenharmony_ci return VMPRESSURE_MEDIUM; 11762306a36Sopenharmony_ci return VMPRESSURE_LOW; 11862306a36Sopenharmony_ci} 11962306a36Sopenharmony_ci 12062306a36Sopenharmony_cistatic enum vmpressure_levels vmpressure_calc_level(unsigned long scanned, 12162306a36Sopenharmony_ci unsigned long reclaimed) 12262306a36Sopenharmony_ci{ 12362306a36Sopenharmony_ci unsigned long scale = scanned + reclaimed; 12462306a36Sopenharmony_ci unsigned long pressure = 0; 12562306a36Sopenharmony_ci 12662306a36Sopenharmony_ci /* 12762306a36Sopenharmony_ci * reclaimed can be greater than scanned for things such as reclaimed 12862306a36Sopenharmony_ci * slab pages. shrink_node() just adds reclaimed pages without a 12962306a36Sopenharmony_ci * related increment to scanned pages. 13062306a36Sopenharmony_ci */ 13162306a36Sopenharmony_ci if (reclaimed >= scanned) 13262306a36Sopenharmony_ci goto out; 13362306a36Sopenharmony_ci /* 13462306a36Sopenharmony_ci * We calculate the ratio (in percents) of how many pages were 13562306a36Sopenharmony_ci * scanned vs. reclaimed in a given time frame (window). Note that 13662306a36Sopenharmony_ci * time is in VM reclaimer's "ticks", i.e. number of pages 13762306a36Sopenharmony_ci * scanned. This makes it possible to set desired reaction time 13862306a36Sopenharmony_ci * and serves as a ratelimit. 13962306a36Sopenharmony_ci */ 14062306a36Sopenharmony_ci pressure = scale - (reclaimed * scale / scanned); 14162306a36Sopenharmony_ci pressure = pressure * 100 / scale; 14262306a36Sopenharmony_ci 14362306a36Sopenharmony_ciout: 14462306a36Sopenharmony_ci pr_debug("%s: %3lu (s: %lu r: %lu)\n", __func__, pressure, 14562306a36Sopenharmony_ci scanned, reclaimed); 14662306a36Sopenharmony_ci 14762306a36Sopenharmony_ci return vmpressure_level(pressure); 14862306a36Sopenharmony_ci} 14962306a36Sopenharmony_ci 15062306a36Sopenharmony_cistruct vmpressure_event { 15162306a36Sopenharmony_ci struct eventfd_ctx *efd; 15262306a36Sopenharmony_ci enum vmpressure_levels level; 15362306a36Sopenharmony_ci enum vmpressure_modes mode; 15462306a36Sopenharmony_ci struct list_head node; 15562306a36Sopenharmony_ci}; 15662306a36Sopenharmony_ci 15762306a36Sopenharmony_cistatic bool vmpressure_event(struct vmpressure *vmpr, 15862306a36Sopenharmony_ci const enum vmpressure_levels level, 15962306a36Sopenharmony_ci bool ancestor, bool signalled) 16062306a36Sopenharmony_ci{ 16162306a36Sopenharmony_ci struct vmpressure_event *ev; 16262306a36Sopenharmony_ci bool ret = false; 16362306a36Sopenharmony_ci 16462306a36Sopenharmony_ci mutex_lock(&vmpr->events_lock); 16562306a36Sopenharmony_ci list_for_each_entry(ev, &vmpr->events, node) { 16662306a36Sopenharmony_ci if (ancestor && ev->mode == VMPRESSURE_LOCAL) 16762306a36Sopenharmony_ci continue; 16862306a36Sopenharmony_ci if (signalled && ev->mode == VMPRESSURE_NO_PASSTHROUGH) 16962306a36Sopenharmony_ci continue; 17062306a36Sopenharmony_ci if (level < ev->level) 17162306a36Sopenharmony_ci continue; 17262306a36Sopenharmony_ci eventfd_signal(ev->efd, 1); 17362306a36Sopenharmony_ci ret = true; 17462306a36Sopenharmony_ci } 17562306a36Sopenharmony_ci mutex_unlock(&vmpr->events_lock); 17662306a36Sopenharmony_ci 17762306a36Sopenharmony_ci return ret; 17862306a36Sopenharmony_ci} 17962306a36Sopenharmony_ci 18062306a36Sopenharmony_cistatic void vmpressure_work_fn(struct work_struct *work) 18162306a36Sopenharmony_ci{ 18262306a36Sopenharmony_ci struct vmpressure *vmpr = work_to_vmpressure(work); 18362306a36Sopenharmony_ci unsigned long scanned; 18462306a36Sopenharmony_ci unsigned long reclaimed; 18562306a36Sopenharmony_ci enum vmpressure_levels level; 18662306a36Sopenharmony_ci bool ancestor = false; 18762306a36Sopenharmony_ci bool signalled = false; 18862306a36Sopenharmony_ci 18962306a36Sopenharmony_ci spin_lock(&vmpr->sr_lock); 19062306a36Sopenharmony_ci /* 19162306a36Sopenharmony_ci * Several contexts might be calling vmpressure(), so it is 19262306a36Sopenharmony_ci * possible that the work was rescheduled again before the old 19362306a36Sopenharmony_ci * work context cleared the counters. In that case we will run 19462306a36Sopenharmony_ci * just after the old work returns, but then scanned might be zero 19562306a36Sopenharmony_ci * here. No need for any locks here since we don't care if 19662306a36Sopenharmony_ci * vmpr->reclaimed is in sync. 19762306a36Sopenharmony_ci */ 19862306a36Sopenharmony_ci scanned = vmpr->tree_scanned; 19962306a36Sopenharmony_ci if (!scanned) { 20062306a36Sopenharmony_ci spin_unlock(&vmpr->sr_lock); 20162306a36Sopenharmony_ci return; 20262306a36Sopenharmony_ci } 20362306a36Sopenharmony_ci 20462306a36Sopenharmony_ci reclaimed = vmpr->tree_reclaimed; 20562306a36Sopenharmony_ci vmpr->tree_scanned = 0; 20662306a36Sopenharmony_ci vmpr->tree_reclaimed = 0; 20762306a36Sopenharmony_ci spin_unlock(&vmpr->sr_lock); 20862306a36Sopenharmony_ci 20962306a36Sopenharmony_ci level = vmpressure_calc_level(scanned, reclaimed); 21062306a36Sopenharmony_ci 21162306a36Sopenharmony_ci do { 21262306a36Sopenharmony_ci if (vmpressure_event(vmpr, level, ancestor, signalled)) 21362306a36Sopenharmony_ci signalled = true; 21462306a36Sopenharmony_ci ancestor = true; 21562306a36Sopenharmony_ci } while ((vmpr = vmpressure_parent(vmpr))); 21662306a36Sopenharmony_ci} 21762306a36Sopenharmony_ci 21862306a36Sopenharmony_ci/** 21962306a36Sopenharmony_ci * vmpressure() - Account memory pressure through scanned/reclaimed ratio 22062306a36Sopenharmony_ci * @gfp: reclaimer's gfp mask 22162306a36Sopenharmony_ci * @memcg: cgroup memory controller handle 22262306a36Sopenharmony_ci * @tree: legacy subtree mode 22362306a36Sopenharmony_ci * @scanned: number of pages scanned 22462306a36Sopenharmony_ci * @reclaimed: number of pages reclaimed 22562306a36Sopenharmony_ci * 22662306a36Sopenharmony_ci * This function should be called from the vmscan reclaim path to account 22762306a36Sopenharmony_ci * "instantaneous" memory pressure (scanned/reclaimed ratio). The raw 22862306a36Sopenharmony_ci * pressure index is then further refined and averaged over time. 22962306a36Sopenharmony_ci * 23062306a36Sopenharmony_ci * If @tree is set, vmpressure is in traditional userspace reporting 23162306a36Sopenharmony_ci * mode: @memcg is considered the pressure root and userspace is 23262306a36Sopenharmony_ci * notified of the entire subtree's reclaim efficiency. 23362306a36Sopenharmony_ci * 23462306a36Sopenharmony_ci * If @tree is not set, reclaim efficiency is recorded for @memcg, and 23562306a36Sopenharmony_ci * only in-kernel users are notified. 23662306a36Sopenharmony_ci * 23762306a36Sopenharmony_ci * This function does not return any value. 23862306a36Sopenharmony_ci */ 23962306a36Sopenharmony_civoid vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, 24062306a36Sopenharmony_ci unsigned long scanned, unsigned long reclaimed) 24162306a36Sopenharmony_ci{ 24262306a36Sopenharmony_ci struct vmpressure *vmpr; 24362306a36Sopenharmony_ci 24462306a36Sopenharmony_ci if (mem_cgroup_disabled()) 24562306a36Sopenharmony_ci return; 24662306a36Sopenharmony_ci 24762306a36Sopenharmony_ci /* 24862306a36Sopenharmony_ci * The in-kernel users only care about the reclaim efficiency 24962306a36Sopenharmony_ci * for this @memcg rather than the whole subtree, and there 25062306a36Sopenharmony_ci * isn't and won't be any in-kernel user in a legacy cgroup. 25162306a36Sopenharmony_ci */ 25262306a36Sopenharmony_ci if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !tree) 25362306a36Sopenharmony_ci return; 25462306a36Sopenharmony_ci 25562306a36Sopenharmony_ci vmpr = memcg_to_vmpressure(memcg); 25662306a36Sopenharmony_ci 25762306a36Sopenharmony_ci /* 25862306a36Sopenharmony_ci * Here we only want to account pressure that userland is able to 25962306a36Sopenharmony_ci * help us with. For example, suppose that DMA zone is under 26062306a36Sopenharmony_ci * pressure; if we notify userland about that kind of pressure, 26162306a36Sopenharmony_ci * then it will be mostly a waste as it will trigger unnecessary 26262306a36Sopenharmony_ci * freeing of memory by userland (since userland is more likely to 26362306a36Sopenharmony_ci * have HIGHMEM/MOVABLE pages instead of the DMA fallback). That 26462306a36Sopenharmony_ci * is why we include only movable, highmem and FS/IO pages. 26562306a36Sopenharmony_ci * Indirect reclaim (kswapd) sets sc->gfp_mask to GFP_KERNEL, so 26662306a36Sopenharmony_ci * we account it too. 26762306a36Sopenharmony_ci */ 26862306a36Sopenharmony_ci if (!(gfp & (__GFP_HIGHMEM | __GFP_MOVABLE | __GFP_IO | __GFP_FS))) 26962306a36Sopenharmony_ci return; 27062306a36Sopenharmony_ci 27162306a36Sopenharmony_ci /* 27262306a36Sopenharmony_ci * If we got here with no pages scanned, then that is an indicator 27362306a36Sopenharmony_ci * that reclaimer was unable to find any shrinkable LRUs at the 27462306a36Sopenharmony_ci * current scanning depth. But it does not mean that we should 27562306a36Sopenharmony_ci * report the critical pressure, yet. If the scanning priority 27662306a36Sopenharmony_ci * (scanning depth) goes too high (deep), we will be notified 27762306a36Sopenharmony_ci * through vmpressure_prio(). But so far, keep calm. 27862306a36Sopenharmony_ci */ 27962306a36Sopenharmony_ci if (!scanned) 28062306a36Sopenharmony_ci return; 28162306a36Sopenharmony_ci 28262306a36Sopenharmony_ci if (tree) { 28362306a36Sopenharmony_ci spin_lock(&vmpr->sr_lock); 28462306a36Sopenharmony_ci scanned = vmpr->tree_scanned += scanned; 28562306a36Sopenharmony_ci vmpr->tree_reclaimed += reclaimed; 28662306a36Sopenharmony_ci spin_unlock(&vmpr->sr_lock); 28762306a36Sopenharmony_ci 28862306a36Sopenharmony_ci if (scanned < vmpressure_win) 28962306a36Sopenharmony_ci return; 29062306a36Sopenharmony_ci schedule_work(&vmpr->work); 29162306a36Sopenharmony_ci } else { 29262306a36Sopenharmony_ci enum vmpressure_levels level; 29362306a36Sopenharmony_ci 29462306a36Sopenharmony_ci /* For now, no users for root-level efficiency */ 29562306a36Sopenharmony_ci if (!memcg || mem_cgroup_is_root(memcg)) 29662306a36Sopenharmony_ci return; 29762306a36Sopenharmony_ci 29862306a36Sopenharmony_ci spin_lock(&vmpr->sr_lock); 29962306a36Sopenharmony_ci scanned = vmpr->scanned += scanned; 30062306a36Sopenharmony_ci reclaimed = vmpr->reclaimed += reclaimed; 30162306a36Sopenharmony_ci if (scanned < vmpressure_win) { 30262306a36Sopenharmony_ci spin_unlock(&vmpr->sr_lock); 30362306a36Sopenharmony_ci return; 30462306a36Sopenharmony_ci } 30562306a36Sopenharmony_ci vmpr->scanned = vmpr->reclaimed = 0; 30662306a36Sopenharmony_ci spin_unlock(&vmpr->sr_lock); 30762306a36Sopenharmony_ci 30862306a36Sopenharmony_ci level = vmpressure_calc_level(scanned, reclaimed); 30962306a36Sopenharmony_ci 31062306a36Sopenharmony_ci if (level > VMPRESSURE_LOW) { 31162306a36Sopenharmony_ci /* 31262306a36Sopenharmony_ci * Let the socket buffer allocator know that 31362306a36Sopenharmony_ci * we are having trouble reclaiming LRU pages. 31462306a36Sopenharmony_ci * 31562306a36Sopenharmony_ci * For hysteresis keep the pressure state 31662306a36Sopenharmony_ci * asserted for a second in which subsequent 31762306a36Sopenharmony_ci * pressure events can occur. 31862306a36Sopenharmony_ci */ 31962306a36Sopenharmony_ci WRITE_ONCE(memcg->socket_pressure, jiffies + HZ); 32062306a36Sopenharmony_ci } 32162306a36Sopenharmony_ci } 32262306a36Sopenharmony_ci} 32362306a36Sopenharmony_ci 32462306a36Sopenharmony_ci/** 32562306a36Sopenharmony_ci * vmpressure_prio() - Account memory pressure through reclaimer priority level 32662306a36Sopenharmony_ci * @gfp: reclaimer's gfp mask 32762306a36Sopenharmony_ci * @memcg: cgroup memory controller handle 32862306a36Sopenharmony_ci * @prio: reclaimer's priority 32962306a36Sopenharmony_ci * 33062306a36Sopenharmony_ci * This function should be called from the reclaim path every time when 33162306a36Sopenharmony_ci * the vmscan's reclaiming priority (scanning depth) changes. 33262306a36Sopenharmony_ci * 33362306a36Sopenharmony_ci * This function does not return any value. 33462306a36Sopenharmony_ci */ 33562306a36Sopenharmony_civoid vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) 33662306a36Sopenharmony_ci{ 33762306a36Sopenharmony_ci /* 33862306a36Sopenharmony_ci * We only use prio for accounting critical level. For more info 33962306a36Sopenharmony_ci * see comment for vmpressure_level_critical_prio variable above. 34062306a36Sopenharmony_ci */ 34162306a36Sopenharmony_ci if (prio > vmpressure_level_critical_prio) 34262306a36Sopenharmony_ci return; 34362306a36Sopenharmony_ci 34462306a36Sopenharmony_ci /* 34562306a36Sopenharmony_ci * OK, the prio is below the threshold, updating vmpressure 34662306a36Sopenharmony_ci * information before shrinker dives into long shrinking of long 34762306a36Sopenharmony_ci * range vmscan. Passing scanned = vmpressure_win, reclaimed = 0 34862306a36Sopenharmony_ci * to the vmpressure() basically means that we signal 'critical' 34962306a36Sopenharmony_ci * level. 35062306a36Sopenharmony_ci */ 35162306a36Sopenharmony_ci vmpressure(gfp, memcg, true, vmpressure_win, 0); 35262306a36Sopenharmony_ci} 35362306a36Sopenharmony_ci 35462306a36Sopenharmony_ci#define MAX_VMPRESSURE_ARGS_LEN (strlen("critical") + strlen("hierarchy") + 2) 35562306a36Sopenharmony_ci 35662306a36Sopenharmony_ci/** 35762306a36Sopenharmony_ci * vmpressure_register_event() - Bind vmpressure notifications to an eventfd 35862306a36Sopenharmony_ci * @memcg: memcg that is interested in vmpressure notifications 35962306a36Sopenharmony_ci * @eventfd: eventfd context to link notifications with 36062306a36Sopenharmony_ci * @args: event arguments (pressure level threshold, optional mode) 36162306a36Sopenharmony_ci * 36262306a36Sopenharmony_ci * This function associates eventfd context with the vmpressure 36362306a36Sopenharmony_ci * infrastructure, so that the notifications will be delivered to the 36462306a36Sopenharmony_ci * @eventfd. The @args parameter is a comma-delimited string that denotes a 36562306a36Sopenharmony_ci * pressure level threshold (one of vmpressure_str_levels, i.e. "low", "medium", 36662306a36Sopenharmony_ci * or "critical") and an optional mode (one of vmpressure_str_modes, i.e. 36762306a36Sopenharmony_ci * "hierarchy" or "local"). 36862306a36Sopenharmony_ci * 36962306a36Sopenharmony_ci * To be used as memcg event method. 37062306a36Sopenharmony_ci * 37162306a36Sopenharmony_ci * Return: 0 on success, -ENOMEM on memory failure or -EINVAL if @args could 37262306a36Sopenharmony_ci * not be parsed. 37362306a36Sopenharmony_ci */ 37462306a36Sopenharmony_ciint vmpressure_register_event(struct mem_cgroup *memcg, 37562306a36Sopenharmony_ci struct eventfd_ctx *eventfd, const char *args) 37662306a36Sopenharmony_ci{ 37762306a36Sopenharmony_ci struct vmpressure *vmpr = memcg_to_vmpressure(memcg); 37862306a36Sopenharmony_ci struct vmpressure_event *ev; 37962306a36Sopenharmony_ci enum vmpressure_modes mode = VMPRESSURE_NO_PASSTHROUGH; 38062306a36Sopenharmony_ci enum vmpressure_levels level; 38162306a36Sopenharmony_ci char *spec, *spec_orig; 38262306a36Sopenharmony_ci char *token; 38362306a36Sopenharmony_ci int ret = 0; 38462306a36Sopenharmony_ci 38562306a36Sopenharmony_ci spec_orig = spec = kstrndup(args, MAX_VMPRESSURE_ARGS_LEN, GFP_KERNEL); 38662306a36Sopenharmony_ci if (!spec) 38762306a36Sopenharmony_ci return -ENOMEM; 38862306a36Sopenharmony_ci 38962306a36Sopenharmony_ci /* Find required level */ 39062306a36Sopenharmony_ci token = strsep(&spec, ","); 39162306a36Sopenharmony_ci ret = match_string(vmpressure_str_levels, VMPRESSURE_NUM_LEVELS, token); 39262306a36Sopenharmony_ci if (ret < 0) 39362306a36Sopenharmony_ci goto out; 39462306a36Sopenharmony_ci level = ret; 39562306a36Sopenharmony_ci 39662306a36Sopenharmony_ci /* Find optional mode */ 39762306a36Sopenharmony_ci token = strsep(&spec, ","); 39862306a36Sopenharmony_ci if (token) { 39962306a36Sopenharmony_ci ret = match_string(vmpressure_str_modes, VMPRESSURE_NUM_MODES, token); 40062306a36Sopenharmony_ci if (ret < 0) 40162306a36Sopenharmony_ci goto out; 40262306a36Sopenharmony_ci mode = ret; 40362306a36Sopenharmony_ci } 40462306a36Sopenharmony_ci 40562306a36Sopenharmony_ci ev = kzalloc(sizeof(*ev), GFP_KERNEL); 40662306a36Sopenharmony_ci if (!ev) { 40762306a36Sopenharmony_ci ret = -ENOMEM; 40862306a36Sopenharmony_ci goto out; 40962306a36Sopenharmony_ci } 41062306a36Sopenharmony_ci 41162306a36Sopenharmony_ci ev->efd = eventfd; 41262306a36Sopenharmony_ci ev->level = level; 41362306a36Sopenharmony_ci ev->mode = mode; 41462306a36Sopenharmony_ci 41562306a36Sopenharmony_ci mutex_lock(&vmpr->events_lock); 41662306a36Sopenharmony_ci list_add(&ev->node, &vmpr->events); 41762306a36Sopenharmony_ci mutex_unlock(&vmpr->events_lock); 41862306a36Sopenharmony_ci ret = 0; 41962306a36Sopenharmony_ciout: 42062306a36Sopenharmony_ci kfree(spec_orig); 42162306a36Sopenharmony_ci return ret; 42262306a36Sopenharmony_ci} 42362306a36Sopenharmony_ci 42462306a36Sopenharmony_ci/** 42562306a36Sopenharmony_ci * vmpressure_unregister_event() - Unbind eventfd from vmpressure 42662306a36Sopenharmony_ci * @memcg: memcg handle 42762306a36Sopenharmony_ci * @eventfd: eventfd context that was used to link vmpressure with the @cg 42862306a36Sopenharmony_ci * 42962306a36Sopenharmony_ci * This function does internal manipulations to detach the @eventfd from 43062306a36Sopenharmony_ci * the vmpressure notifications, and then frees internal resources 43162306a36Sopenharmony_ci * associated with the @eventfd (but the @eventfd itself is not freed). 43262306a36Sopenharmony_ci * 43362306a36Sopenharmony_ci * To be used as memcg event method. 43462306a36Sopenharmony_ci */ 43562306a36Sopenharmony_civoid vmpressure_unregister_event(struct mem_cgroup *memcg, 43662306a36Sopenharmony_ci struct eventfd_ctx *eventfd) 43762306a36Sopenharmony_ci{ 43862306a36Sopenharmony_ci struct vmpressure *vmpr = memcg_to_vmpressure(memcg); 43962306a36Sopenharmony_ci struct vmpressure_event *ev; 44062306a36Sopenharmony_ci 44162306a36Sopenharmony_ci mutex_lock(&vmpr->events_lock); 44262306a36Sopenharmony_ci list_for_each_entry(ev, &vmpr->events, node) { 44362306a36Sopenharmony_ci if (ev->efd != eventfd) 44462306a36Sopenharmony_ci continue; 44562306a36Sopenharmony_ci list_del(&ev->node); 44662306a36Sopenharmony_ci kfree(ev); 44762306a36Sopenharmony_ci break; 44862306a36Sopenharmony_ci } 44962306a36Sopenharmony_ci mutex_unlock(&vmpr->events_lock); 45062306a36Sopenharmony_ci} 45162306a36Sopenharmony_ci 45262306a36Sopenharmony_ci/** 45362306a36Sopenharmony_ci * vmpressure_init() - Initialize vmpressure control structure 45462306a36Sopenharmony_ci * @vmpr: Structure to be initialized 45562306a36Sopenharmony_ci * 45662306a36Sopenharmony_ci * This function should be called on every allocated vmpressure structure 45762306a36Sopenharmony_ci * before any usage. 45862306a36Sopenharmony_ci */ 45962306a36Sopenharmony_civoid vmpressure_init(struct vmpressure *vmpr) 46062306a36Sopenharmony_ci{ 46162306a36Sopenharmony_ci spin_lock_init(&vmpr->sr_lock); 46262306a36Sopenharmony_ci mutex_init(&vmpr->events_lock); 46362306a36Sopenharmony_ci INIT_LIST_HEAD(&vmpr->events); 46462306a36Sopenharmony_ci INIT_WORK(&vmpr->work, vmpressure_work_fn); 46562306a36Sopenharmony_ci} 46662306a36Sopenharmony_ci 46762306a36Sopenharmony_ci/** 46862306a36Sopenharmony_ci * vmpressure_cleanup() - shuts down vmpressure control structure 46962306a36Sopenharmony_ci * @vmpr: Structure to be cleaned up 47062306a36Sopenharmony_ci * 47162306a36Sopenharmony_ci * This function should be called before the structure in which it is 47262306a36Sopenharmony_ci * embedded is cleaned up. 47362306a36Sopenharmony_ci */ 47462306a36Sopenharmony_civoid vmpressure_cleanup(struct vmpressure *vmpr) 47562306a36Sopenharmony_ci{ 47662306a36Sopenharmony_ci /* 47762306a36Sopenharmony_ci * Make sure there is no pending work before eventfd infrastructure 47862306a36Sopenharmony_ci * goes away. 47962306a36Sopenharmony_ci */ 48062306a36Sopenharmony_ci flush_work(&vmpr->work); 48162306a36Sopenharmony_ci} 482