18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-only 28c2ecf20Sopenharmony_ci/* 38c2ecf20Sopenharmony_ci * Linux VM pressure 48c2ecf20Sopenharmony_ci * 58c2ecf20Sopenharmony_ci * Copyright 2012 Linaro Ltd. 68c2ecf20Sopenharmony_ci * Anton Vorontsov <anton.vorontsov@linaro.org> 78c2ecf20Sopenharmony_ci * 88c2ecf20Sopenharmony_ci * Based on ideas from Andrew Morton, David Rientjes, KOSAKI Motohiro, 98c2ecf20Sopenharmony_ci * Leonid Moiseichuk, Mel Gorman, Minchan Kim and Pekka Enberg. 108c2ecf20Sopenharmony_ci */ 118c2ecf20Sopenharmony_ci 128c2ecf20Sopenharmony_ci#include <linux/cgroup.h> 138c2ecf20Sopenharmony_ci#include <linux/fs.h> 148c2ecf20Sopenharmony_ci#include <linux/log2.h> 158c2ecf20Sopenharmony_ci#include <linux/sched.h> 168c2ecf20Sopenharmony_ci#include <linux/mm.h> 178c2ecf20Sopenharmony_ci#include <linux/vmstat.h> 188c2ecf20Sopenharmony_ci#include <linux/eventfd.h> 198c2ecf20Sopenharmony_ci#include <linux/slab.h> 208c2ecf20Sopenharmony_ci#include <linux/swap.h> 218c2ecf20Sopenharmony_ci#include <linux/printk.h> 228c2ecf20Sopenharmony_ci#include <linux/vmpressure.h> 238c2ecf20Sopenharmony_ci 248c2ecf20Sopenharmony_ci/* 258c2ecf20Sopenharmony_ci * The window size (vmpressure_win) is the number of scanned pages before 268c2ecf20Sopenharmony_ci * we try to analyze scanned/reclaimed ratio. So the window is used as a 278c2ecf20Sopenharmony_ci * rate-limit tunable for the "low" level notification, and also for 288c2ecf20Sopenharmony_ci * averaging the ratio for medium/critical levels. Using small window 298c2ecf20Sopenharmony_ci * sizes can cause lot of false positives, but too big window size will 308c2ecf20Sopenharmony_ci * delay the notifications. 318c2ecf20Sopenharmony_ci * 328c2ecf20Sopenharmony_ci * As the vmscan reclaimer logic works with chunks which are multiple of 338c2ecf20Sopenharmony_ci * SWAP_CLUSTER_MAX, it makes sense to use it for the window size as well. 348c2ecf20Sopenharmony_ci * 358c2ecf20Sopenharmony_ci * TODO: Make the window size depend on machine size, as we do for vmstat 368c2ecf20Sopenharmony_ci * thresholds. Currently we set it to 512 pages (2MB for 4KB pages). 378c2ecf20Sopenharmony_ci */ 388c2ecf20Sopenharmony_cistatic const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16; 398c2ecf20Sopenharmony_ci 408c2ecf20Sopenharmony_ci/* 418c2ecf20Sopenharmony_ci * These thresholds are used when we account memory pressure through 428c2ecf20Sopenharmony_ci * scanned/reclaimed ratio. The current values were chosen empirically. In 438c2ecf20Sopenharmony_ci * essence, they are percents: the higher the value, the more number 448c2ecf20Sopenharmony_ci * unsuccessful reclaims there were. 458c2ecf20Sopenharmony_ci */ 468c2ecf20Sopenharmony_cistatic const unsigned int vmpressure_level_med = 60; 478c2ecf20Sopenharmony_cistatic const unsigned int vmpressure_level_critical = 95; 488c2ecf20Sopenharmony_ci 498c2ecf20Sopenharmony_ci/* 508c2ecf20Sopenharmony_ci * When there are too little pages left to scan, vmpressure() may miss the 518c2ecf20Sopenharmony_ci * critical pressure as number of pages will be less than "window size". 528c2ecf20Sopenharmony_ci * However, in that case the vmscan priority will raise fast as the 538c2ecf20Sopenharmony_ci * reclaimer will try to scan LRUs more deeply. 548c2ecf20Sopenharmony_ci * 558c2ecf20Sopenharmony_ci * The vmscan logic considers these special priorities: 568c2ecf20Sopenharmony_ci * 578c2ecf20Sopenharmony_ci * prio == DEF_PRIORITY (12): reclaimer starts with that value 588c2ecf20Sopenharmony_ci * prio <= DEF_PRIORITY - 2 : kswapd becomes somewhat overwhelmed 598c2ecf20Sopenharmony_ci * prio == 0 : close to OOM, kernel scans every page in an lru 608c2ecf20Sopenharmony_ci * 618c2ecf20Sopenharmony_ci * Any value in this range is acceptable for this tunable (i.e. from 12 to 628c2ecf20Sopenharmony_ci * 0). Current value for the vmpressure_level_critical_prio is chosen 638c2ecf20Sopenharmony_ci * empirically, but the number, in essence, means that we consider 648c2ecf20Sopenharmony_ci * critical level when scanning depth is ~10% of the lru size (vmscan 658c2ecf20Sopenharmony_ci * scans 'lru_size >> prio' pages, so it is actually 12.5%, or one 668c2ecf20Sopenharmony_ci * eights). 678c2ecf20Sopenharmony_ci */ 688c2ecf20Sopenharmony_cistatic const unsigned int vmpressure_level_critical_prio = ilog2(100 / 10); 698c2ecf20Sopenharmony_ci 708c2ecf20Sopenharmony_cistatic struct vmpressure *work_to_vmpressure(struct work_struct *work) 718c2ecf20Sopenharmony_ci{ 728c2ecf20Sopenharmony_ci return container_of(work, struct vmpressure, work); 738c2ecf20Sopenharmony_ci} 748c2ecf20Sopenharmony_ci 758c2ecf20Sopenharmony_cistatic struct vmpressure *vmpressure_parent(struct vmpressure *vmpr) 768c2ecf20Sopenharmony_ci{ 778c2ecf20Sopenharmony_ci struct cgroup_subsys_state *css = vmpressure_to_css(vmpr); 788c2ecf20Sopenharmony_ci struct mem_cgroup *memcg = mem_cgroup_from_css(css); 798c2ecf20Sopenharmony_ci 808c2ecf20Sopenharmony_ci memcg = parent_mem_cgroup(memcg); 818c2ecf20Sopenharmony_ci if (!memcg) 828c2ecf20Sopenharmony_ci return NULL; 838c2ecf20Sopenharmony_ci return memcg_to_vmpressure(memcg); 848c2ecf20Sopenharmony_ci} 858c2ecf20Sopenharmony_ci 868c2ecf20Sopenharmony_cienum vmpressure_levels { 878c2ecf20Sopenharmony_ci VMPRESSURE_LOW = 0, 888c2ecf20Sopenharmony_ci VMPRESSURE_MEDIUM, 898c2ecf20Sopenharmony_ci VMPRESSURE_CRITICAL, 908c2ecf20Sopenharmony_ci VMPRESSURE_NUM_LEVELS, 918c2ecf20Sopenharmony_ci}; 928c2ecf20Sopenharmony_ci 938c2ecf20Sopenharmony_cienum vmpressure_modes { 948c2ecf20Sopenharmony_ci VMPRESSURE_NO_PASSTHROUGH = 0, 958c2ecf20Sopenharmony_ci VMPRESSURE_HIERARCHY, 968c2ecf20Sopenharmony_ci VMPRESSURE_LOCAL, 978c2ecf20Sopenharmony_ci VMPRESSURE_NUM_MODES, 988c2ecf20Sopenharmony_ci}; 998c2ecf20Sopenharmony_ci 1008c2ecf20Sopenharmony_cistatic const char * const vmpressure_str_levels[] = { 1018c2ecf20Sopenharmony_ci [VMPRESSURE_LOW] = "low", 1028c2ecf20Sopenharmony_ci [VMPRESSURE_MEDIUM] = "medium", 1038c2ecf20Sopenharmony_ci [VMPRESSURE_CRITICAL] = "critical", 1048c2ecf20Sopenharmony_ci}; 1058c2ecf20Sopenharmony_ci 1068c2ecf20Sopenharmony_cistatic const char * const vmpressure_str_modes[] = { 1078c2ecf20Sopenharmony_ci [VMPRESSURE_NO_PASSTHROUGH] = "default", 1088c2ecf20Sopenharmony_ci [VMPRESSURE_HIERARCHY] = "hierarchy", 1098c2ecf20Sopenharmony_ci [VMPRESSURE_LOCAL] = "local", 1108c2ecf20Sopenharmony_ci}; 1118c2ecf20Sopenharmony_ci 1128c2ecf20Sopenharmony_cistatic enum vmpressure_levels vmpressure_level(unsigned long pressure) 1138c2ecf20Sopenharmony_ci{ 1148c2ecf20Sopenharmony_ci if (pressure >= vmpressure_level_critical) 1158c2ecf20Sopenharmony_ci return VMPRESSURE_CRITICAL; 1168c2ecf20Sopenharmony_ci else if (pressure >= vmpressure_level_med) 1178c2ecf20Sopenharmony_ci return VMPRESSURE_MEDIUM; 1188c2ecf20Sopenharmony_ci return VMPRESSURE_LOW; 1198c2ecf20Sopenharmony_ci} 1208c2ecf20Sopenharmony_ci 1218c2ecf20Sopenharmony_cistatic enum vmpressure_levels vmpressure_calc_level(unsigned long scanned, 1228c2ecf20Sopenharmony_ci unsigned long reclaimed) 1238c2ecf20Sopenharmony_ci{ 1248c2ecf20Sopenharmony_ci unsigned long scale = scanned + reclaimed; 1258c2ecf20Sopenharmony_ci unsigned long pressure = 0; 1268c2ecf20Sopenharmony_ci 1278c2ecf20Sopenharmony_ci /* 1288c2ecf20Sopenharmony_ci * reclaimed can be greater than scanned for things such as reclaimed 1298c2ecf20Sopenharmony_ci * slab pages. shrink_node() just adds reclaimed pages without a 1308c2ecf20Sopenharmony_ci * related increment to scanned pages. 1318c2ecf20Sopenharmony_ci */ 1328c2ecf20Sopenharmony_ci if (reclaimed >= scanned) 1338c2ecf20Sopenharmony_ci goto out; 1348c2ecf20Sopenharmony_ci /* 1358c2ecf20Sopenharmony_ci * We calculate the ratio (in percents) of how many pages were 1368c2ecf20Sopenharmony_ci * scanned vs. reclaimed in a given time frame (window). Note that 1378c2ecf20Sopenharmony_ci * time is in VM reclaimer's "ticks", i.e. number of pages 1388c2ecf20Sopenharmony_ci * scanned. This makes it possible to set desired reaction time 1398c2ecf20Sopenharmony_ci * and serves as a ratelimit. 1408c2ecf20Sopenharmony_ci */ 1418c2ecf20Sopenharmony_ci pressure = scale - (reclaimed * scale / scanned); 1428c2ecf20Sopenharmony_ci pressure = pressure * 100 / scale; 1438c2ecf20Sopenharmony_ci 1448c2ecf20Sopenharmony_ciout: 1458c2ecf20Sopenharmony_ci pr_debug("%s: %3lu (s: %lu r: %lu)\n", __func__, pressure, 1468c2ecf20Sopenharmony_ci scanned, reclaimed); 1478c2ecf20Sopenharmony_ci 1488c2ecf20Sopenharmony_ci return vmpressure_level(pressure); 1498c2ecf20Sopenharmony_ci} 1508c2ecf20Sopenharmony_ci 1518c2ecf20Sopenharmony_cistruct vmpressure_event { 1528c2ecf20Sopenharmony_ci struct eventfd_ctx *efd; 1538c2ecf20Sopenharmony_ci enum vmpressure_levels level; 1548c2ecf20Sopenharmony_ci enum vmpressure_modes mode; 1558c2ecf20Sopenharmony_ci struct list_head node; 1568c2ecf20Sopenharmony_ci}; 1578c2ecf20Sopenharmony_ci 1588c2ecf20Sopenharmony_cistatic bool vmpressure_event(struct vmpressure *vmpr, 1598c2ecf20Sopenharmony_ci const enum vmpressure_levels level, 1608c2ecf20Sopenharmony_ci bool ancestor, bool signalled) 1618c2ecf20Sopenharmony_ci{ 1628c2ecf20Sopenharmony_ci struct vmpressure_event *ev; 1638c2ecf20Sopenharmony_ci bool ret = false; 1648c2ecf20Sopenharmony_ci 1658c2ecf20Sopenharmony_ci mutex_lock(&vmpr->events_lock); 1668c2ecf20Sopenharmony_ci list_for_each_entry(ev, &vmpr->events, node) { 1678c2ecf20Sopenharmony_ci if (ancestor && ev->mode == VMPRESSURE_LOCAL) 1688c2ecf20Sopenharmony_ci continue; 1698c2ecf20Sopenharmony_ci if (signalled && ev->mode == VMPRESSURE_NO_PASSTHROUGH) 1708c2ecf20Sopenharmony_ci continue; 1718c2ecf20Sopenharmony_ci if (level < ev->level) 1728c2ecf20Sopenharmony_ci continue; 1738c2ecf20Sopenharmony_ci eventfd_signal(ev->efd, 1); 1748c2ecf20Sopenharmony_ci ret = true; 1758c2ecf20Sopenharmony_ci } 1768c2ecf20Sopenharmony_ci mutex_unlock(&vmpr->events_lock); 1778c2ecf20Sopenharmony_ci 1788c2ecf20Sopenharmony_ci return ret; 1798c2ecf20Sopenharmony_ci} 1808c2ecf20Sopenharmony_ci 1818c2ecf20Sopenharmony_cistatic void vmpressure_work_fn(struct work_struct *work) 1828c2ecf20Sopenharmony_ci{ 1838c2ecf20Sopenharmony_ci struct vmpressure *vmpr = work_to_vmpressure(work); 1848c2ecf20Sopenharmony_ci unsigned long scanned; 1858c2ecf20Sopenharmony_ci unsigned long reclaimed; 1868c2ecf20Sopenharmony_ci enum vmpressure_levels level; 1878c2ecf20Sopenharmony_ci bool ancestor = false; 1888c2ecf20Sopenharmony_ci bool signalled = false; 1898c2ecf20Sopenharmony_ci 1908c2ecf20Sopenharmony_ci spin_lock(&vmpr->sr_lock); 1918c2ecf20Sopenharmony_ci /* 1928c2ecf20Sopenharmony_ci * Several contexts might be calling vmpressure(), so it is 1938c2ecf20Sopenharmony_ci * possible that the work was rescheduled again before the old 1948c2ecf20Sopenharmony_ci * work context cleared the counters. In that case we will run 1958c2ecf20Sopenharmony_ci * just after the old work returns, but then scanned might be zero 1968c2ecf20Sopenharmony_ci * here. No need for any locks here since we don't care if 1978c2ecf20Sopenharmony_ci * vmpr->reclaimed is in sync. 1988c2ecf20Sopenharmony_ci */ 1998c2ecf20Sopenharmony_ci scanned = vmpr->tree_scanned; 2008c2ecf20Sopenharmony_ci if (!scanned) { 2018c2ecf20Sopenharmony_ci spin_unlock(&vmpr->sr_lock); 2028c2ecf20Sopenharmony_ci return; 2038c2ecf20Sopenharmony_ci } 2048c2ecf20Sopenharmony_ci 2058c2ecf20Sopenharmony_ci reclaimed = vmpr->tree_reclaimed; 2068c2ecf20Sopenharmony_ci vmpr->tree_scanned = 0; 2078c2ecf20Sopenharmony_ci vmpr->tree_reclaimed = 0; 2088c2ecf20Sopenharmony_ci spin_unlock(&vmpr->sr_lock); 2098c2ecf20Sopenharmony_ci 2108c2ecf20Sopenharmony_ci level = vmpressure_calc_level(scanned, reclaimed); 2118c2ecf20Sopenharmony_ci 2128c2ecf20Sopenharmony_ci do { 2138c2ecf20Sopenharmony_ci if (vmpressure_event(vmpr, level, ancestor, signalled)) 2148c2ecf20Sopenharmony_ci signalled = true; 2158c2ecf20Sopenharmony_ci ancestor = true; 2168c2ecf20Sopenharmony_ci } while ((vmpr = vmpressure_parent(vmpr))); 2178c2ecf20Sopenharmony_ci} 2188c2ecf20Sopenharmony_ci 2198c2ecf20Sopenharmony_ci/** 2208c2ecf20Sopenharmony_ci * vmpressure() - Account memory pressure through scanned/reclaimed ratio 2218c2ecf20Sopenharmony_ci * @gfp: reclaimer's gfp mask 2228c2ecf20Sopenharmony_ci * @memcg: cgroup memory controller handle 2238c2ecf20Sopenharmony_ci * @tree: legacy subtree mode 2248c2ecf20Sopenharmony_ci * @scanned: number of pages scanned 2258c2ecf20Sopenharmony_ci * @reclaimed: number of pages reclaimed 2268c2ecf20Sopenharmony_ci * 2278c2ecf20Sopenharmony_ci * This function should be called from the vmscan reclaim path to account 2288c2ecf20Sopenharmony_ci * "instantaneous" memory pressure (scanned/reclaimed ratio). The raw 2298c2ecf20Sopenharmony_ci * pressure index is then further refined and averaged over time. 2308c2ecf20Sopenharmony_ci * 2318c2ecf20Sopenharmony_ci * If @tree is set, vmpressure is in traditional userspace reporting 2328c2ecf20Sopenharmony_ci * mode: @memcg is considered the pressure root and userspace is 2338c2ecf20Sopenharmony_ci * notified of the entire subtree's reclaim efficiency. 2348c2ecf20Sopenharmony_ci * 2358c2ecf20Sopenharmony_ci * If @tree is not set, reclaim efficiency is recorded for @memcg, and 2368c2ecf20Sopenharmony_ci * only in-kernel users are notified. 2378c2ecf20Sopenharmony_ci * 2388c2ecf20Sopenharmony_ci * This function does not return any value. 2398c2ecf20Sopenharmony_ci */ 2408c2ecf20Sopenharmony_civoid vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, 2418c2ecf20Sopenharmony_ci unsigned long scanned, unsigned long reclaimed) 2428c2ecf20Sopenharmony_ci{ 2438c2ecf20Sopenharmony_ci struct vmpressure *vmpr = memcg_to_vmpressure(memcg); 2448c2ecf20Sopenharmony_ci 2458c2ecf20Sopenharmony_ci /* 2468c2ecf20Sopenharmony_ci * Here we only want to account pressure that userland is able to 2478c2ecf20Sopenharmony_ci * help us with. For example, suppose that DMA zone is under 2488c2ecf20Sopenharmony_ci * pressure; if we notify userland about that kind of pressure, 2498c2ecf20Sopenharmony_ci * then it will be mostly a waste as it will trigger unnecessary 2508c2ecf20Sopenharmony_ci * freeing of memory by userland (since userland is more likely to 2518c2ecf20Sopenharmony_ci * have HIGHMEM/MOVABLE pages instead of the DMA fallback). That 2528c2ecf20Sopenharmony_ci * is why we include only movable, highmem and FS/IO pages. 2538c2ecf20Sopenharmony_ci * Indirect reclaim (kswapd) sets sc->gfp_mask to GFP_KERNEL, so 2548c2ecf20Sopenharmony_ci * we account it too. 2558c2ecf20Sopenharmony_ci */ 2568c2ecf20Sopenharmony_ci if (!(gfp & (__GFP_HIGHMEM | __GFP_MOVABLE | __GFP_IO | __GFP_FS))) 2578c2ecf20Sopenharmony_ci return; 2588c2ecf20Sopenharmony_ci 2598c2ecf20Sopenharmony_ci /* 2608c2ecf20Sopenharmony_ci * If we got here with no pages scanned, then that is an indicator 2618c2ecf20Sopenharmony_ci * that reclaimer was unable to find any shrinkable LRUs at the 2628c2ecf20Sopenharmony_ci * current scanning depth. But it does not mean that we should 2638c2ecf20Sopenharmony_ci * report the critical pressure, yet. If the scanning priority 2648c2ecf20Sopenharmony_ci * (scanning depth) goes too high (deep), we will be notified 2658c2ecf20Sopenharmony_ci * through vmpressure_prio(). But so far, keep calm. 2668c2ecf20Sopenharmony_ci */ 2678c2ecf20Sopenharmony_ci if (!scanned) 2688c2ecf20Sopenharmony_ci return; 2698c2ecf20Sopenharmony_ci 2708c2ecf20Sopenharmony_ci if (tree) { 2718c2ecf20Sopenharmony_ci spin_lock(&vmpr->sr_lock); 2728c2ecf20Sopenharmony_ci scanned = vmpr->tree_scanned += scanned; 2738c2ecf20Sopenharmony_ci vmpr->tree_reclaimed += reclaimed; 2748c2ecf20Sopenharmony_ci spin_unlock(&vmpr->sr_lock); 2758c2ecf20Sopenharmony_ci 2768c2ecf20Sopenharmony_ci if (scanned < vmpressure_win) 2778c2ecf20Sopenharmony_ci return; 2788c2ecf20Sopenharmony_ci schedule_work(&vmpr->work); 2798c2ecf20Sopenharmony_ci } else { 2808c2ecf20Sopenharmony_ci enum vmpressure_levels level; 2818c2ecf20Sopenharmony_ci 2828c2ecf20Sopenharmony_ci /* For now, no users for root-level efficiency */ 2838c2ecf20Sopenharmony_ci if (!memcg || mem_cgroup_is_root(memcg)) 2848c2ecf20Sopenharmony_ci return; 2858c2ecf20Sopenharmony_ci 2868c2ecf20Sopenharmony_ci spin_lock(&vmpr->sr_lock); 2878c2ecf20Sopenharmony_ci scanned = vmpr->scanned += scanned; 2888c2ecf20Sopenharmony_ci reclaimed = vmpr->reclaimed += reclaimed; 2898c2ecf20Sopenharmony_ci if (scanned < vmpressure_win) { 2908c2ecf20Sopenharmony_ci spin_unlock(&vmpr->sr_lock); 2918c2ecf20Sopenharmony_ci return; 2928c2ecf20Sopenharmony_ci } 2938c2ecf20Sopenharmony_ci vmpr->scanned = vmpr->reclaimed = 0; 2948c2ecf20Sopenharmony_ci spin_unlock(&vmpr->sr_lock); 2958c2ecf20Sopenharmony_ci 2968c2ecf20Sopenharmony_ci level = vmpressure_calc_level(scanned, reclaimed); 2978c2ecf20Sopenharmony_ci 2988c2ecf20Sopenharmony_ci if (level > VMPRESSURE_LOW) { 2998c2ecf20Sopenharmony_ci /* 3008c2ecf20Sopenharmony_ci * Let the socket buffer allocator know that 3018c2ecf20Sopenharmony_ci * we are having trouble reclaiming LRU pages. 3028c2ecf20Sopenharmony_ci * 3038c2ecf20Sopenharmony_ci * For hysteresis keep the pressure state 3048c2ecf20Sopenharmony_ci * asserted for a second in which subsequent 3058c2ecf20Sopenharmony_ci * pressure events can occur. 3068c2ecf20Sopenharmony_ci */ 3078c2ecf20Sopenharmony_ci memcg->socket_pressure = jiffies + HZ; 3088c2ecf20Sopenharmony_ci } 3098c2ecf20Sopenharmony_ci } 3108c2ecf20Sopenharmony_ci} 3118c2ecf20Sopenharmony_ci 3128c2ecf20Sopenharmony_ci/** 3138c2ecf20Sopenharmony_ci * vmpressure_prio() - Account memory pressure through reclaimer priority level 3148c2ecf20Sopenharmony_ci * @gfp: reclaimer's gfp mask 3158c2ecf20Sopenharmony_ci * @memcg: cgroup memory controller handle 3168c2ecf20Sopenharmony_ci * @prio: reclaimer's priority 3178c2ecf20Sopenharmony_ci * 3188c2ecf20Sopenharmony_ci * This function should be called from the reclaim path every time when 3198c2ecf20Sopenharmony_ci * the vmscan's reclaiming priority (scanning depth) changes. 3208c2ecf20Sopenharmony_ci * 3218c2ecf20Sopenharmony_ci * This function does not return any value. 3228c2ecf20Sopenharmony_ci */ 3238c2ecf20Sopenharmony_civoid vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) 3248c2ecf20Sopenharmony_ci{ 3258c2ecf20Sopenharmony_ci /* 3268c2ecf20Sopenharmony_ci * We only use prio for accounting critical level. For more info 3278c2ecf20Sopenharmony_ci * see comment for vmpressure_level_critical_prio variable above. 3288c2ecf20Sopenharmony_ci */ 3298c2ecf20Sopenharmony_ci if (prio > vmpressure_level_critical_prio) 3308c2ecf20Sopenharmony_ci return; 3318c2ecf20Sopenharmony_ci 3328c2ecf20Sopenharmony_ci /* 3338c2ecf20Sopenharmony_ci * OK, the prio is below the threshold, updating vmpressure 3348c2ecf20Sopenharmony_ci * information before shrinker dives into long shrinking of long 3358c2ecf20Sopenharmony_ci * range vmscan. Passing scanned = vmpressure_win, reclaimed = 0 3368c2ecf20Sopenharmony_ci * to the vmpressure() basically means that we signal 'critical' 3378c2ecf20Sopenharmony_ci * level. 3388c2ecf20Sopenharmony_ci */ 3398c2ecf20Sopenharmony_ci vmpressure(gfp, memcg, true, vmpressure_win, 0); 3408c2ecf20Sopenharmony_ci} 3418c2ecf20Sopenharmony_ci 3428c2ecf20Sopenharmony_ci#define MAX_VMPRESSURE_ARGS_LEN (strlen("critical") + strlen("hierarchy") + 2) 3438c2ecf20Sopenharmony_ci 3448c2ecf20Sopenharmony_ci/** 3458c2ecf20Sopenharmony_ci * vmpressure_register_event() - Bind vmpressure notifications to an eventfd 3468c2ecf20Sopenharmony_ci * @memcg: memcg that is interested in vmpressure notifications 3478c2ecf20Sopenharmony_ci * @eventfd: eventfd context to link notifications with 3488c2ecf20Sopenharmony_ci * @args: event arguments (pressure level threshold, optional mode) 3498c2ecf20Sopenharmony_ci * 3508c2ecf20Sopenharmony_ci * This function associates eventfd context with the vmpressure 3518c2ecf20Sopenharmony_ci * infrastructure, so that the notifications will be delivered to the 3528c2ecf20Sopenharmony_ci * @eventfd. The @args parameter is a comma-delimited string that denotes a 3538c2ecf20Sopenharmony_ci * pressure level threshold (one of vmpressure_str_levels, i.e. "low", "medium", 3548c2ecf20Sopenharmony_ci * or "critical") and an optional mode (one of vmpressure_str_modes, i.e. 3558c2ecf20Sopenharmony_ci * "hierarchy" or "local"). 3568c2ecf20Sopenharmony_ci * 3578c2ecf20Sopenharmony_ci * To be used as memcg event method. 3588c2ecf20Sopenharmony_ci * 3598c2ecf20Sopenharmony_ci * Return: 0 on success, -ENOMEM on memory failure or -EINVAL if @args could 3608c2ecf20Sopenharmony_ci * not be parsed. 3618c2ecf20Sopenharmony_ci */ 3628c2ecf20Sopenharmony_ciint vmpressure_register_event(struct mem_cgroup *memcg, 3638c2ecf20Sopenharmony_ci struct eventfd_ctx *eventfd, const char *args) 3648c2ecf20Sopenharmony_ci{ 3658c2ecf20Sopenharmony_ci struct vmpressure *vmpr = memcg_to_vmpressure(memcg); 3668c2ecf20Sopenharmony_ci struct vmpressure_event *ev; 3678c2ecf20Sopenharmony_ci enum vmpressure_modes mode = VMPRESSURE_NO_PASSTHROUGH; 3688c2ecf20Sopenharmony_ci enum vmpressure_levels level; 3698c2ecf20Sopenharmony_ci char *spec, *spec_orig; 3708c2ecf20Sopenharmony_ci char *token; 3718c2ecf20Sopenharmony_ci int ret = 0; 3728c2ecf20Sopenharmony_ci 3738c2ecf20Sopenharmony_ci spec_orig = spec = kstrndup(args, MAX_VMPRESSURE_ARGS_LEN, GFP_KERNEL); 3748c2ecf20Sopenharmony_ci if (!spec) 3758c2ecf20Sopenharmony_ci return -ENOMEM; 3768c2ecf20Sopenharmony_ci 3778c2ecf20Sopenharmony_ci /* Find required level */ 3788c2ecf20Sopenharmony_ci token = strsep(&spec, ","); 3798c2ecf20Sopenharmony_ci ret = match_string(vmpressure_str_levels, VMPRESSURE_NUM_LEVELS, token); 3808c2ecf20Sopenharmony_ci if (ret < 0) 3818c2ecf20Sopenharmony_ci goto out; 3828c2ecf20Sopenharmony_ci level = ret; 3838c2ecf20Sopenharmony_ci 3848c2ecf20Sopenharmony_ci /* Find optional mode */ 3858c2ecf20Sopenharmony_ci token = strsep(&spec, ","); 3868c2ecf20Sopenharmony_ci if (token) { 3878c2ecf20Sopenharmony_ci ret = match_string(vmpressure_str_modes, VMPRESSURE_NUM_MODES, token); 3888c2ecf20Sopenharmony_ci if (ret < 0) 3898c2ecf20Sopenharmony_ci goto out; 3908c2ecf20Sopenharmony_ci mode = ret; 3918c2ecf20Sopenharmony_ci } 3928c2ecf20Sopenharmony_ci 3938c2ecf20Sopenharmony_ci ev = kzalloc(sizeof(*ev), GFP_KERNEL); 3948c2ecf20Sopenharmony_ci if (!ev) { 3958c2ecf20Sopenharmony_ci ret = -ENOMEM; 3968c2ecf20Sopenharmony_ci goto out; 3978c2ecf20Sopenharmony_ci } 3988c2ecf20Sopenharmony_ci 3998c2ecf20Sopenharmony_ci ev->efd = eventfd; 4008c2ecf20Sopenharmony_ci ev->level = level; 4018c2ecf20Sopenharmony_ci ev->mode = mode; 4028c2ecf20Sopenharmony_ci 4038c2ecf20Sopenharmony_ci mutex_lock(&vmpr->events_lock); 4048c2ecf20Sopenharmony_ci list_add(&ev->node, &vmpr->events); 4058c2ecf20Sopenharmony_ci mutex_unlock(&vmpr->events_lock); 4068c2ecf20Sopenharmony_ci ret = 0; 4078c2ecf20Sopenharmony_ciout: 4088c2ecf20Sopenharmony_ci kfree(spec_orig); 4098c2ecf20Sopenharmony_ci return ret; 4108c2ecf20Sopenharmony_ci} 4118c2ecf20Sopenharmony_ci 4128c2ecf20Sopenharmony_ci/** 4138c2ecf20Sopenharmony_ci * vmpressure_unregister_event() - Unbind eventfd from vmpressure 4148c2ecf20Sopenharmony_ci * @memcg: memcg handle 4158c2ecf20Sopenharmony_ci * @eventfd: eventfd context that was used to link vmpressure with the @cg 4168c2ecf20Sopenharmony_ci * 4178c2ecf20Sopenharmony_ci * This function does internal manipulations to detach the @eventfd from 4188c2ecf20Sopenharmony_ci * the vmpressure notifications, and then frees internal resources 4198c2ecf20Sopenharmony_ci * associated with the @eventfd (but the @eventfd itself is not freed). 4208c2ecf20Sopenharmony_ci * 4218c2ecf20Sopenharmony_ci * To be used as memcg event method. 4228c2ecf20Sopenharmony_ci */ 4238c2ecf20Sopenharmony_civoid vmpressure_unregister_event(struct mem_cgroup *memcg, 4248c2ecf20Sopenharmony_ci struct eventfd_ctx *eventfd) 4258c2ecf20Sopenharmony_ci{ 4268c2ecf20Sopenharmony_ci struct vmpressure *vmpr = memcg_to_vmpressure(memcg); 4278c2ecf20Sopenharmony_ci struct vmpressure_event *ev; 4288c2ecf20Sopenharmony_ci 4298c2ecf20Sopenharmony_ci mutex_lock(&vmpr->events_lock); 4308c2ecf20Sopenharmony_ci list_for_each_entry(ev, &vmpr->events, node) { 4318c2ecf20Sopenharmony_ci if (ev->efd != eventfd) 4328c2ecf20Sopenharmony_ci continue; 4338c2ecf20Sopenharmony_ci list_del(&ev->node); 4348c2ecf20Sopenharmony_ci kfree(ev); 4358c2ecf20Sopenharmony_ci break; 4368c2ecf20Sopenharmony_ci } 4378c2ecf20Sopenharmony_ci mutex_unlock(&vmpr->events_lock); 4388c2ecf20Sopenharmony_ci} 4398c2ecf20Sopenharmony_ci 4408c2ecf20Sopenharmony_ci/** 4418c2ecf20Sopenharmony_ci * vmpressure_init() - Initialize vmpressure control structure 4428c2ecf20Sopenharmony_ci * @vmpr: Structure to be initialized 4438c2ecf20Sopenharmony_ci * 4448c2ecf20Sopenharmony_ci * This function should be called on every allocated vmpressure structure 4458c2ecf20Sopenharmony_ci * before any usage. 4468c2ecf20Sopenharmony_ci */ 4478c2ecf20Sopenharmony_civoid vmpressure_init(struct vmpressure *vmpr) 4488c2ecf20Sopenharmony_ci{ 4498c2ecf20Sopenharmony_ci spin_lock_init(&vmpr->sr_lock); 4508c2ecf20Sopenharmony_ci mutex_init(&vmpr->events_lock); 4518c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&vmpr->events); 4528c2ecf20Sopenharmony_ci INIT_WORK(&vmpr->work, vmpressure_work_fn); 4538c2ecf20Sopenharmony_ci} 4548c2ecf20Sopenharmony_ci 4558c2ecf20Sopenharmony_ci/** 4568c2ecf20Sopenharmony_ci * vmpressure_cleanup() - shuts down vmpressure control structure 4578c2ecf20Sopenharmony_ci * @vmpr: Structure to be cleaned up 4588c2ecf20Sopenharmony_ci * 4598c2ecf20Sopenharmony_ci * This function should be called before the structure in which it is 4608c2ecf20Sopenharmony_ci * embedded is cleaned up. 4618c2ecf20Sopenharmony_ci */ 4628c2ecf20Sopenharmony_civoid vmpressure_cleanup(struct vmpressure *vmpr) 4638c2ecf20Sopenharmony_ci{ 4648c2ecf20Sopenharmony_ci /* 4658c2ecf20Sopenharmony_ci * Make sure there is no pending work before eventfd infrastructure 4668c2ecf20Sopenharmony_ci * goes away. 4678c2ecf20Sopenharmony_ci */ 4688c2ecf20Sopenharmony_ci flush_work(&vmpr->work); 4698c2ecf20Sopenharmony_ci} 470