// SPDX-License-Identifier: GPL-2.0
/*
 * Workingset detection
 *
 * Copyright (C) 2013 Red Hat, Inc., Johannes Weiner
 */

#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/writeback.h>
#include <linux/shmem_fs.h>
#include <linux/pagemap.h>
#include <linux/atomic.h>
#include <linux/module.h>
#include <linux/swap.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/mm.h>

/*
 * Double CLOCK lists
 *
 * Per node, two clock lists are maintained for file pages: the
 * inactive and the active list. Freshly faulted pages start out at
 * the head of the inactive list and page reclaim scans pages from the
 * tail. Pages that are accessed multiple times on the inactive list
 * are promoted to the active list, to protect them from reclaim,
 * whereas active pages are demoted to the inactive list when the
 * active list grows too big.
 *
 *   fault ------------------------+
 *                                 |
 *              +--------------+   |            +-------------+
 *   reclaim <- |   inactive   | <-+-- demotion |    active   | <--+
 *              +--------------+                +-------------+    |
 *                     |                                           |
 *                     +-------------- promotion ------------------+
 *
 *
 *		Access frequency and refault distance
 *
 * A workload is thrashing when its pages are frequently used but they
 * are evicted from the inactive list every time before another access
 * would have promoted them to the active list.
 *
 * In cases where the average access distance between thrashing pages
 * is bigger than the size of memory there is nothing that can be
 * done - the thrashing set could never fit into memory under any
 * circumstance.
 *
 * However, the average access distance could be bigger than the
 * inactive list, yet smaller than the size of memory. In this case,
 * the set could fit into memory if it weren't for the currently
 * active pages - which may be used more, hopefully less frequently:
 *
 *      +-memory available to cache-+
 *      |                           |
 *      +-inactive------+-active----+
 *  a b | c d e f g h i | J K L M N |
 *      +---------------+-----------+
 *
 * It is prohibitively expensive to accurately track access frequency
 * of pages. But a reasonable approximation can be made to measure
 * thrashing on the inactive list, after which refaulting pages can be
 * activated optimistically to compete with the existing active pages.
 *
 * Approximating inactive page access frequency - Observations:
 *
 * 1. When a page is accessed for the first time, it is added to the
 *    head of the inactive list, slides every existing inactive page
 *    towards the tail by one slot, and pushes the current tail page
 *    out of memory.
 *
 * 2. When a page is accessed for the second time, it is promoted to
 *    the active list, shrinking the inactive list by one slot. This
 *    also slides all inactive pages that were faulted into the cache
 *    more recently than the activated page towards the tail of the
 *    inactive list.
 *
 * Thus:
 *
 * 1. The sum of evictions and activations between any two points in
 *    time indicates the minimum number of inactive pages accessed in
 *    between.
 *
 * 2. Moving one inactive page N page slots towards the tail of the
 *    list requires at least N inactive page accesses.
 *
 * Combining these:
 *
 * 1. When a page is finally evicted from memory, the number of
 *    inactive pages accessed while the page was in cache is at least
 *    the number of page slots on the inactive list.
 *
 * 2. In addition, measuring the sum of evictions and activations (E)
 *    at the time of a page's eviction, and comparing it to another
 *    reading (R) at the time the page faults back into memory tells
 *    the minimum number of accesses while the page was not cached.
 *    This is called the refault distance.
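 *
 * For example (illustrative numbers, not a measurement): if the
 * eviction/activation counter reads E = 1000 when a page is evicted
 * and R = 1400 when it faults back in, then at least 400 inactive
 * page accesses happened while the page was out of cache, and its
 * refault distance is R - E = 400.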
 *
 * Because the first access of the page was the fault and the second
 * access the refault, we combine the in-cache distance with the
 * out-of-cache distance to get the complete minimum access distance
 * of this page:
 *
 *      NR_inactive + (R - E)
 *
 * And knowing the minimum access distance of a page, we can easily
 * tell if the page would be able to stay in cache assuming all page
 * slots in the cache were available:
 *
 *   NR_inactive + (R - E) <= NR_inactive + NR_active
 *
 * which can be further simplified to
 *
 *   (R - E) <= NR_active
 *
 * Put into words, the refault distance (out-of-cache) can be seen as
 * a deficit in inactive list space (in-cache). If the inactive list
 * had (R - E) more page slots, the page would not have been evicted
 * in between accesses, but activated instead. And on a full system,
 * the only thing eating into inactive list space is active pages.
 *
 *
 *		Refaulting inactive pages
 *
 * All that is known about the active list is that the pages have been
 * accessed more than once in the past. This means that at any given
 * time there is actually a good chance that pages on the active list
 * are no longer in active use.
 *
 * So when a refault distance of (R - E) is observed and there are at
 * least (R - E) active pages, the refaulting page is activated
 * optimistically in the hope that (R - E) active pages are actually
 * used less frequently than the refaulting page - or even not used at
 * all anymore.
 *
 * That means if inactive cache is refaulting with a suitable refault
 * distance, we assume the cache workingset is transitioning and put
 * pressure on the current active list.
 *
 * If this is wrong and demotion kicks in, the pages which are truly
 * used more frequently will be reactivated while the less frequently
 * used ones will be evicted from memory.
 *
 * But if this is right, the stale pages will be pushed out of memory
 * and the used pages get to stay in cache.
 *
 *		Refaulting active pages
 *
 * If on the other hand the refaulting pages have recently been
 * deactivated, it means that the active list is no longer protecting
 * actively used cache from reclaim. The cache is NOT transitioning to
 * a different workingset; the existing workingset is thrashing in the
 * space allocated to the page cache.
 *
 *
 *		Implementation
 *
 * For each node's LRU lists, a counter for inactive evictions and
 * activations is maintained (node->nonresident_age).
 *
 * On eviction, a snapshot of this counter (along with some bits to
 * identify the node) is stored in the now empty page cache
 * slot of the evicted page. This is called a shadow entry.
 *
 * On cache misses for which there are shadow entries, an eligible
 * refault distance will immediately activate the refaulting page.
 */

#define EVICTION_SHIFT	((BITS_PER_LONG - BITS_PER_XA_VALUE) +	\
			 1 + NODES_SHIFT + MEM_CGROUP_ID_SHIFT)
#define EVICTION_MASK	(~0UL >> EVICTION_SHIFT)

/*
 * Eviction timestamps need to be able to cover the full range of
 * actionable refaults. However, bits are tight in the xarray
 * entry, and after storing the identifier for the lruvec there might
 * not be enough left to represent every single actionable refault. In
 * that case, we have to sacrifice granularity for distance, and group
 * evictions into coarser buckets by shaving off lower timestamp bits.
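 *
 * The resulting shadow entry is laid out as follows (high bits to
 * low; see pack_shadow() below), with one additional bit consumed by
 * the xarray value tag:
 *
 *   eviction timestamp >> bucket_order | memcg ID | node ID | workingset bit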
 */
static unsigned int bucket_order __read_mostly;

static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
			 bool workingset)
{
	eviction >>= bucket_order;
	eviction &= EVICTION_MASK;
	eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
	eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
	eviction = (eviction << 1) | workingset;

	return xa_mk_value(eviction);
}

static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
			  unsigned long *evictionp, bool *workingsetp)
{
	unsigned long entry = xa_to_value(shadow);
	int memcgid, nid;
	bool workingset;

	workingset = entry & 1;
	entry >>= 1;
	nid = entry & ((1UL << NODES_SHIFT) - 1);
	entry >>= NODES_SHIFT;
	memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
	entry >>= MEM_CGROUP_ID_SHIFT;

	*memcgidp = memcgid;
	*pgdat = NODE_DATA(nid);
	*evictionp = entry << bucket_order;
	*workingsetp = workingset;
}

/**
 * workingset_age_nonresident - age non-resident entries as LRU ages
 * @lruvec: the lruvec that was aged
 * @nr_pages: the number of pages to count
 *
 * As in-memory pages are aged, non-resident pages need to be aged as
 * well, in order for the refault distances later on to be comparable
 * to the in-memory dimensions. This function allows reclaim and LRU
 * operations to drive the non-resident aging along in parallel.
 */
void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages)
{
	/*
	 * Reclaiming a cgroup means reclaiming all its children in a
	 * round-robin fashion. That means that each cgroup has an LRU
	 * order that is composed of the LRU orders of its child
	 * cgroups; and every page has an LRU position not just in the
	 * cgroup that owns it, but in all of that group's ancestors.
	 *
	 * So when the physical inactive list of a leaf cgroup ages,
	 * the virtual inactive lists of all its parents, including
	 * the root cgroup's, age as well.
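	 *
	 * For example, when a page owned by cgroup /A/B is reclaimed,
	 * the nonresident age of /A/B, /A and the root all advance by
	 * the same amount.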
	 */
	do {
		atomic_long_add(nr_pages, &lruvec->nonresident_age);
	} while ((lruvec = parent_lruvec(lruvec)));
}

/**
 * workingset_eviction - note the eviction of a page from memory
 * @target_memcg: the cgroup that is causing the reclaim
 * @page: the page being evicted
 *
 * Returns a shadow entry to be stored in @page->mapping->i_pages in place
 * of the evicted @page so that a later refault can be detected.
 */
void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg)
{
	struct pglist_data *pgdat = page_pgdat(page);
	unsigned long eviction;
	struct lruvec *lruvec;
	int memcgid;

	/* Page is fully exclusive and pins page->mem_cgroup */
	VM_BUG_ON_PAGE(PageLRU(page), page);
	VM_BUG_ON_PAGE(page_count(page), page);
	VM_BUG_ON_PAGE(!PageLocked(page), page);

	lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
#ifdef CONFIG_HYPERHOLD_FILE_LRU
	if (!is_prot_page(page) && page_is_file_lru(page)) {
		lruvec = node_lruvec(pgdat);
		workingset_age_nonresident(lruvec, thp_nr_pages(page));
	} else {
		workingset_age_nonresident(lruvec, thp_nr_pages(page));
	}
#else
	workingset_age_nonresident(lruvec, thp_nr_pages(page));
#endif
	/* XXX: target_memcg can be NULL, go through lruvec */
	memcgid = mem_cgroup_id(lruvec_memcg(lruvec));
	eviction = atomic_long_read(&lruvec->nonresident_age);
	return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page));
}

/**
 * workingset_refault - evaluate the refault of a previously evicted page
 * @page: the freshly allocated replacement page
 * @shadow: shadow entry of the evicted page
 *
 * Calculates and evaluates the refault distance of the previously
 * evicted page in the context of the node and the memcg whose memory
 * pressure caused the eviction.
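 *
 * If the refault distance is short enough, the page is activated,
 * and if it was part of the workingset before eviction, it is marked
 * as workingset again.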
 */
void workingset_refault(struct page *page, void *shadow)
{
	bool file = page_is_file_lru(page);
	struct mem_cgroup *eviction_memcg;
	struct lruvec *eviction_lruvec;
	unsigned long refault_distance;
	unsigned long workingset_size;
	struct pglist_data *pgdat;
	struct mem_cgroup *memcg;
	unsigned long eviction;
	struct lruvec *lruvec;
	unsigned long refault;
	bool workingset;
	int memcgid;

	unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset);

	rcu_read_lock();
	/*
	 * Look up the memcg associated with the stored ID. It might
	 * have been deleted since the page's eviction.
	 *
	 * Note that in rare events the ID could have been recycled
	 * for a new cgroup that refaults a shared page. This is
	 * impossible to tell from the available data. However, this
	 * should be a rare and limited disturbance, and activations
	 * are always speculative anyway. Ultimately, it's the aging
	 * algorithm's job to shake out the minimum access frequency
	 * for the active cache.
	 *
	 * XXX: On !CONFIG_MEMCG, this will always return NULL; it
	 * would be better if the root_mem_cgroup existed in all
	 * configurations instead.
	 */
#ifdef CONFIG_HYPERHOLD_FILE_LRU
	if (memcgid != -1) {
		eviction_memcg = mem_cgroup_from_id(memcgid);
		if (!mem_cgroup_disabled() && !eviction_memcg)
			goto out;
	} else {
		/*
		 * Assumed semantics: a memcg ID of -1 marks an
		 * eviction that was accounted to the node lruvec.
		 * Fall back to NULL so eviction_memcg is never read
		 * uninitialized below.
		 */
		eviction_memcg = NULL;
	}
#else
	eviction_memcg = mem_cgroup_from_id(memcgid);
	if (!mem_cgroup_disabled() && !eviction_memcg)
		goto out;
#endif
	eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat);
	refault = atomic_long_read(&eviction_lruvec->nonresident_age);

	/*
	 * Calculate the refault distance
	 *
	 * The unsigned subtraction here gives an accurate distance
	 * across nonresident_age overflows in most cases. There is a
	 * special case: usually, shadow entries have a short lifetime
	 * and are either refaulted or reclaimed along with the inode
	 * before they get too old. But it is not impossible for the
	 * nonresident_age to lap a shadow entry in the field, which
	 * can then result in a false small refault distance, leading
	 * to a false activation should this old entry actually
	 * refault again. However, earlier kernels used to deactivate
	 * unconditionally with *every* reclaim invocation for the
	 * longest time, so the occasional inappropriate activation
	 * leading to pressure on the active list is not a problem.
	 */
	refault_distance = (refault - eviction) & EVICTION_MASK;

	/*
	 * The activation decision for this page is made at the level
	 * where the eviction occurred, as that is where the LRU order
	 * during page reclaim is being determined.
	 *
	 * However, the cgroup that will own the page is the one that
	 * is actually experiencing the refault event.
	 */
	memcg = page_memcg(page);
	lruvec = mem_cgroup_lruvec(memcg, pgdat);
#ifdef CONFIG_HYPERHOLD_FILE_LRU
	if (!is_prot_page(page) && file)
		inc_lruvec_state(node_lruvec(pgdat),
				 WORKINGSET_REFAULT_BASE + file);
	else
		inc_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file);
#else
	inc_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file);
#endif

	/*
	 * Compare the distance to the existing workingset size. We
	 * don't activate pages that couldn't stay resident even if
	 * all the memory was available to the workingset. Whether
	 * workingset competition needs to consider anon or not depends
	 * on having swap.
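	 *
	 * Summarizing the decision below: a refaulting page always
	 * competes with the active file list; an anon refault also
	 * competes with the inactive file list; and when swap is
	 * available, active anon is added as well, plus inactive
	 * anon for file refaults.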
	 */
#ifdef CONFIG_HYPERHOLD_FILE_LRU
	workingset_size = lruvec_page_state(node_lruvec(pgdat), NR_ACTIVE_FILE);
#else
	workingset_size = lruvec_page_state(eviction_lruvec, NR_ACTIVE_FILE);
#endif

	if (!file) {
#ifdef CONFIG_HYPERHOLD_FILE_LRU
		workingset_size += lruvec_page_state(node_lruvec(pgdat),
						     NR_INACTIVE_FILE);
#else
		workingset_size += lruvec_page_state(eviction_lruvec,
						     NR_INACTIVE_FILE);
#endif
	}
	if (mem_cgroup_get_nr_swap_pages(memcg) > 0) {
		workingset_size += lruvec_page_state(eviction_lruvec,
						     NR_ACTIVE_ANON);
		if (file) {
			workingset_size += lruvec_page_state(eviction_lruvec,
							     NR_INACTIVE_ANON);
		}
	}
	if (refault_distance > workingset_size)
		goto out;

	SetPageActive(page);
#ifdef CONFIG_HYPERHOLD_FILE_LRU
	if (!is_prot_page(page) && file) {
		workingset_age_nonresident(node_lruvec(pgdat),
					   thp_nr_pages(page));
		inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file);
	} else {
		workingset_age_nonresident(lruvec, thp_nr_pages(page));
		inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file);
	}
#else
	workingset_age_nonresident(lruvec, thp_nr_pages(page));
	inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file);
#endif

	/* Page was active prior to eviction */
	if (workingset) {
		SetPageWorkingset(page);
		/* XXX: Move to lru_cache_add() when it supports new vs putback */
		spin_lock_irq(&page_pgdat(page)->lru_lock);
		lru_note_cost_page(page);
		spin_unlock_irq(&page_pgdat(page)->lru_lock);
#ifdef CONFIG_HYPERHOLD_FILE_LRU
		if (!is_prot_page(page) && file)
			inc_lruvec_state(node_lruvec(pgdat), WORKINGSET_RESTORE_BASE + file);
		else
			inc_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file);
#else
		inc_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file);
#endif
	}
out:
	rcu_read_unlock();
}

/**
 * workingset_activation - note a page activation
 * @page: page that is being activated
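 *
 * Counts the activation as non-resident aging on the page's lruvec
 * hierarchy, so that refault distances stay comparable to in-memory
 * LRU movement.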
 */
void workingset_activation(struct page *page)
{
	struct mem_cgroup *memcg;
	struct lruvec *lruvec;

	rcu_read_lock();
	/*
	 * Filter non-memcg pages here, e.g. unmap can call
	 * mark_page_accessed() on VDSO pages.
	 *
	 * XXX: See workingset_refault() - this should return
	 * root_mem_cgroup even for !CONFIG_MEMCG.
	 */
	memcg = page_memcg_rcu(page);
	if (!mem_cgroup_disabled() && !memcg)
		goto out;
	lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
#ifdef CONFIG_HYPERHOLD_FILE_LRU
	if (!is_prot_page(page) && page_is_file_lru(page)) {
		lruvec = node_lruvec(page_pgdat(page));
		workingset_age_nonresident(lruvec, thp_nr_pages(page));
	} else {
		workingset_age_nonresident(lruvec, thp_nr_pages(page));
	}
#else
	workingset_age_nonresident(lruvec, thp_nr_pages(page));
#endif
out:
	rcu_read_unlock();
}

/*
 * Shadow entries reflect the share of the working set that does not
 * fit into memory, so their number depends on the access pattern of
 * the workload. In most cases, they will refault or get reclaimed
 * along with the inode, but a (malicious) workload that streams
 * through files with a total size several times that of available
 * memory, while preventing the inodes from being reclaimed, can
 * create excessive amounts of shadow nodes. To keep a lid on this,
 * track shadow nodes and reclaim them when they grow way past the
 * point where they would still be useful.
 */

static struct list_lru shadow_nodes;

void workingset_update_node(struct xa_node *node)
{
	/*
	 * Track non-empty nodes that contain only shadow entries;
	 * unlink those that contain pages or are being freed.
	 *
	 * Avoid acquiring the list_lru lock when the nodes are
	 * already where they should be. The list_empty() test is safe
	 * as node->private_list is protected by the i_pages lock.
	 */
	VM_WARN_ON_ONCE(!irqs_disabled());  /* For __inc_lruvec_page_state */

	if (node->count && node->count == node->nr_values) {
		if (list_empty(&node->private_list)) {
			list_lru_add(&shadow_nodes, &node->private_list);
			__inc_lruvec_slab_state(node, WORKINGSET_NODES);
		}
	} else {
		if (!list_empty(&node->private_list)) {
			list_lru_del(&shadow_nodes, &node->private_list);
			__dec_lruvec_slab_state(node, WORKINGSET_NODES);
		}
	}
}

static unsigned long count_shadow_nodes(struct shrinker *shrinker,
					struct shrink_control *sc)
{
	unsigned long max_nodes;
	unsigned long nodes;
	unsigned long pages;

	nodes = list_lru_shrink_count(&shadow_nodes, sc);

	/*
	 * Approximate a reasonable limit for the nodes
	 * containing shadow entries. We don't need to keep more
	 * shadow entries than possible pages on the active list,
	 * since refault distances bigger than that are dismissed.
	 *
	 * The size of the active list converges toward 100% of
	 * overall page cache as memory grows, with only a tiny
	 * inactive list. Assume the total cache size for that.
	 *
	 * Nodes might be sparsely populated, with only one shadow
	 * entry in the extreme case. Obviously, we cannot keep one
	 * node for every eligible shadow entry, so compromise on a
	 * worst-case density of 1/8th. Below that, not all eligible
	 * refaults can be detected anymore.
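	 *
	 * Worked arithmetic for that limit (assuming 4K pages and
	 * 64-bit, i.e. XA_CHUNK_SHIFT of 6): one node is allowed per
	 * 8 pages of cache, and each node costs roughly
	 * PAGE_SIZE / 7 = 585 bytes, so the cap works out to
	 * 585 / (8 * 4096), i.e. the ~1.8% of memory noted below.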
	 *
	 * On 64-bit with 7 xa_nodes per page and 64 slots
	 * each, this will reclaim shadow entries when they consume
	 * ~1.8% of available memory:
	 *
	 * PAGE_SIZE / xa_nodes / node_entries * 8 / PAGE_SIZE
	 */
#ifdef CONFIG_MEMCG
#ifndef CONFIG_HYPERHOLD_FILE_LRU
	if (sc->memcg) {
		struct lruvec *lruvec;
		int i;

		lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid));
		for (pages = 0, i = 0; i < NR_LRU_LISTS; i++)
			pages += lruvec_page_state_local(lruvec,
							 NR_LRU_BASE + i);
		pages += lruvec_page_state_local(
			lruvec, NR_SLAB_RECLAIMABLE_B) >> PAGE_SHIFT;
		pages += lruvec_page_state_local(
			lruvec, NR_SLAB_UNRECLAIMABLE_B) >> PAGE_SHIFT;
	} else
#endif
#endif
		pages = node_present_pages(sc->nid);

	max_nodes = pages >> (XA_CHUNK_SHIFT - 3);

	if (!nodes)
		return SHRINK_EMPTY;

	if (nodes <= max_nodes)
		return 0;
	return nodes - max_nodes;
}

static enum lru_status shadow_lru_isolate(struct list_head *item,
					  struct list_lru_one *lru,
					  spinlock_t *lru_lock,
					  void *arg) __must_hold(lru_lock)
{
	struct xa_node *node = container_of(item, struct xa_node, private_list);
	struct address_space *mapping;
	int ret;

	/*
	 * Page cache insertions and deletions synchronously maintain
	 * the shadow node LRU under the i_pages lock and the
	 * lru_lock. Because the page cache tree is emptied before
	 * the inode can be destroyed, holding the lru_lock pins any
	 * address_space that has nodes on the LRU.
	 *
	 * We can then safely transition to the i_pages lock to
	 * pin only the address_space of the particular node we want
	 * to reclaim, take the node off-LRU, and drop the lru_lock.
	 */

	mapping = container_of(node->array, struct address_space, i_pages);

	/* Coming from the list, invert the lock order */
	if (!xa_trylock(&mapping->i_pages)) {
		spin_unlock_irq(lru_lock);
		ret = LRU_RETRY;
		goto out;
	}

	list_lru_isolate(lru, item);
	__dec_lruvec_slab_state(node, WORKINGSET_NODES);

	spin_unlock(lru_lock);

	/*
	 * The nodes should only contain one or more shadow entries,
	 * no pages, so we expect to be able to remove them all and
	 * delete and free the empty node afterwards.
	 */
	if (WARN_ON_ONCE(!node->nr_values))
		goto out_invalid;
	if (WARN_ON_ONCE(node->count != node->nr_values))
		goto out_invalid;
	mapping->nrexceptional -= node->nr_values;
	xa_delete_node(node, workingset_update_node);
	__inc_lruvec_slab_state(node, WORKINGSET_NODERECLAIM);

out_invalid:
	xa_unlock_irq(&mapping->i_pages);
	ret = LRU_REMOVED_RETRY;
out:
	cond_resched();
	spin_lock_irq(lru_lock);
	return ret;
}

static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
				       struct shrink_control *sc)
{
	/* list_lru lock nests inside the IRQ-safe i_pages lock */
	return list_lru_shrink_walk_irq(&shadow_nodes, sc, shadow_lru_isolate,
					NULL);
}

static struct shrinker workingset_shadow_shrinker = {
	.count_objects = count_shadow_nodes,
	.scan_objects = scan_shadow_nodes,
	.seeks = 0, /* ->count reports only fully expendable nodes */
	.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE,
};

/*
 * Our list_lru->lock is IRQ-safe as it nests inside the IRQ-safe
 * i_pages lock.
 */
static struct lock_class_key shadow_nodes_key;

static int __init workingset_init(void)
{
	unsigned int timestamp_bits;
	unsigned int max_order;
	int ret;

	BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT);
	/*
	 * Calculate the eviction bucket size to cover the longest
	 * actionable refault distance, which is currently half of
	 * memory (totalram_pages/2). However, memory hotplug may add
	 * some more pages at runtime, so keep working with up to
	 * double the initial memory by using totalram_pages as-is.
	 */
	timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT;
	max_order = fls_long(totalram_pages() - 1);
	if (max_order > timestamp_bits)
		bucket_order = max_order - timestamp_bits;
	pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
		timestamp_bits, max_order, bucket_order);

	ret = prealloc_shrinker(&workingset_shadow_shrinker);
	if (ret)
		goto err;
	ret = __list_lru_init(&shadow_nodes, true, &shadow_nodes_key,
			      &workingset_shadow_shrinker);
	if (ret)
		goto err_list_lru;
	register_shrinker_prepared(&workingset_shadow_shrinker);
	return 0;
err_list_lru:
	free_prealloced_shrinker(&workingset_shadow_shrinker);
err:
	return ret;
}
module_init(workingset_init);
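
/*
 * Illustrative call flow (a simplified sketch; the callers named
 * here live elsewhere in mm/ and this list is not exhaustive):
 *
 *  eviction:   __remove_mapping() -> workingset_eviction()
 *                  store a shadow entry in the vacated cache slot
 *
 *  refault:    page cache fault path -> workingset_refault()
 *                  compute the refault distance, maybe activate
 *
 *  activation: mark_page_accessed() -> workingset_activation()
 *                  advance the non-resident age of the lruvec hierarchy
 */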