162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0
262306a36Sopenharmony_ci/*
362306a36Sopenharmony_ci * Workingset detection
462306a36Sopenharmony_ci *
562306a36Sopenharmony_ci * Copyright (C) 2013 Red Hat, Inc., Johannes Weiner
662306a36Sopenharmony_ci */
762306a36Sopenharmony_ci
862306a36Sopenharmony_ci#include <linux/memcontrol.h>
962306a36Sopenharmony_ci#include <linux/mm_inline.h>
1062306a36Sopenharmony_ci#include <linux/writeback.h>
1162306a36Sopenharmony_ci#include <linux/shmem_fs.h>
1262306a36Sopenharmony_ci#include <linux/pagemap.h>
1362306a36Sopenharmony_ci#include <linux/atomic.h>
1462306a36Sopenharmony_ci#include <linux/module.h>
1562306a36Sopenharmony_ci#include <linux/swap.h>
1662306a36Sopenharmony_ci#include <linux/dax.h>
1762306a36Sopenharmony_ci#include <linux/fs.h>
1862306a36Sopenharmony_ci#include <linux/mm.h>
1962306a36Sopenharmony_ci
2062306a36Sopenharmony_ci/*
2162306a36Sopenharmony_ci *		Double CLOCK lists
2262306a36Sopenharmony_ci *
2362306a36Sopenharmony_ci * Per node, two clock lists are maintained for file pages: the
2462306a36Sopenharmony_ci * inactive and the active list.  Freshly faulted pages start out at
2562306a36Sopenharmony_ci * the head of the inactive list and page reclaim scans pages from the
2662306a36Sopenharmony_ci * tail.  Pages that are accessed multiple times on the inactive list
2762306a36Sopenharmony_ci * are promoted to the active list, to protect them from reclaim,
2862306a36Sopenharmony_ci * whereas active pages are demoted to the inactive list when the
2962306a36Sopenharmony_ci * active list grows too big.
3062306a36Sopenharmony_ci *
3162306a36Sopenharmony_ci *   fault ------------------------+
3262306a36Sopenharmony_ci *                                 |
3362306a36Sopenharmony_ci *              +--------------+   |            +-------------+
3462306a36Sopenharmony_ci *   reclaim <- |   inactive   | <-+-- demotion |    active   | <--+
3562306a36Sopenharmony_ci *              +--------------+                +-------------+    |
3662306a36Sopenharmony_ci *                     |                                           |
3762306a36Sopenharmony_ci *                     +-------------- promotion ------------------+
3862306a36Sopenharmony_ci *
3962306a36Sopenharmony_ci *
4062306a36Sopenharmony_ci *		Access frequency and refault distance
4162306a36Sopenharmony_ci *
4262306a36Sopenharmony_ci * A workload is thrashing when its pages are frequently used but they
4362306a36Sopenharmony_ci * are evicted from the inactive list every time before another access
4462306a36Sopenharmony_ci * would have promoted them to the active list.
4562306a36Sopenharmony_ci *
4662306a36Sopenharmony_ci * In cases where the average access distance between thrashing pages
4762306a36Sopenharmony_ci * is bigger than the size of memory there is nothing that can be
4862306a36Sopenharmony_ci * done - the thrashing set could never fit into memory under any
4962306a36Sopenharmony_ci * circumstance.
5062306a36Sopenharmony_ci *
5162306a36Sopenharmony_ci * However, the average access distance could be bigger than the
5262306a36Sopenharmony_ci * inactive list, yet smaller than the size of memory.  In this case,
5362306a36Sopenharmony_ci * the set could fit into memory if it weren't for the currently
5462306a36Sopenharmony_ci * active pages - which may be used more, hopefully less frequently:
5562306a36Sopenharmony_ci *
5662306a36Sopenharmony_ci *      +-memory available to cache-+
5762306a36Sopenharmony_ci *      |                           |
5862306a36Sopenharmony_ci *      +-inactive------+-active----+
5962306a36Sopenharmony_ci *  a b | c d e f g h i | J K L M N |
6062306a36Sopenharmony_ci *      +---------------+-----------+
6162306a36Sopenharmony_ci *
6262306a36Sopenharmony_ci * It is prohibitively expensive to accurately track access frequency
6362306a36Sopenharmony_ci * of pages.  But a reasonable approximation can be made to measure
6462306a36Sopenharmony_ci * thrashing on the inactive list, after which refaulting pages can be
6562306a36Sopenharmony_ci * activated optimistically to compete with the existing active pages.
6662306a36Sopenharmony_ci *
6762306a36Sopenharmony_ci * Approximating inactive page access frequency - Observations:
6862306a36Sopenharmony_ci *
6962306a36Sopenharmony_ci * 1. When a page is accessed for the first time, it is added to the
7062306a36Sopenharmony_ci *    head of the inactive list, slides every existing inactive page
7162306a36Sopenharmony_ci *    towards the tail by one slot, and pushes the current tail page
7262306a36Sopenharmony_ci *    out of memory.
7362306a36Sopenharmony_ci *
7462306a36Sopenharmony_ci * 2. When a page is accessed for the second time, it is promoted to
7562306a36Sopenharmony_ci *    the active list, shrinking the inactive list by one slot.  This
7662306a36Sopenharmony_ci *    also slides all inactive pages that were faulted into the cache
7762306a36Sopenharmony_ci *    more recently than the activated page towards the tail of the
7862306a36Sopenharmony_ci *    inactive list.
7962306a36Sopenharmony_ci *
8062306a36Sopenharmony_ci * Thus:
8162306a36Sopenharmony_ci *
8262306a36Sopenharmony_ci * 1. The sum of evictions and activations between any two points in
8362306a36Sopenharmony_ci *    time indicate the minimum number of inactive pages accessed in
8462306a36Sopenharmony_ci *    between.
8562306a36Sopenharmony_ci *
8662306a36Sopenharmony_ci * 2. Moving one inactive page N page slots towards the tail of the
8762306a36Sopenharmony_ci *    list requires at least N inactive page accesses.
8862306a36Sopenharmony_ci *
8962306a36Sopenharmony_ci * Combining these:
9062306a36Sopenharmony_ci *
9162306a36Sopenharmony_ci * 1. When a page is finally evicted from memory, the number of
9262306a36Sopenharmony_ci *    inactive pages accessed while the page was in cache is at least
9362306a36Sopenharmony_ci *    the number of page slots on the inactive list.
9462306a36Sopenharmony_ci *
9562306a36Sopenharmony_ci * 2. In addition, measuring the sum of evictions and activations (E)
9662306a36Sopenharmony_ci *    at the time of a page's eviction, and comparing it to another
9762306a36Sopenharmony_ci *    reading (R) at the time the page faults back into memory tells
9862306a36Sopenharmony_ci *    the minimum number of accesses while the page was not cached.
9962306a36Sopenharmony_ci *    This is called the refault distance.
10062306a36Sopenharmony_ci *
10162306a36Sopenharmony_ci * Because the first access of the page was the fault and the second
10262306a36Sopenharmony_ci * access the refault, we combine the in-cache distance with the
10362306a36Sopenharmony_ci * out-of-cache distance to get the complete minimum access distance
10462306a36Sopenharmony_ci * of this page:
10562306a36Sopenharmony_ci *
10662306a36Sopenharmony_ci *      NR_inactive + (R - E)
10762306a36Sopenharmony_ci *
10862306a36Sopenharmony_ci * And knowing the minimum access distance of a page, we can easily
10962306a36Sopenharmony_ci * tell if the page would be able to stay in cache assuming all page
11062306a36Sopenharmony_ci * slots in the cache were available:
11162306a36Sopenharmony_ci *
11262306a36Sopenharmony_ci *   NR_inactive + (R - E) <= NR_inactive + NR_active
11362306a36Sopenharmony_ci *
 * If we have swap, we should also consider NR_inactive_anon and
 * NR_active_anon, so for page cache and anonymous pages respectively:
11662306a36Sopenharmony_ci *
11762306a36Sopenharmony_ci *   NR_inactive_file + (R - E) <= NR_inactive_file + NR_active_file
11862306a36Sopenharmony_ci *   + NR_inactive_anon + NR_active_anon
11962306a36Sopenharmony_ci *
12062306a36Sopenharmony_ci *   NR_inactive_anon + (R - E) <= NR_inactive_anon + NR_active_anon
12162306a36Sopenharmony_ci *   + NR_inactive_file + NR_active_file
12262306a36Sopenharmony_ci *
12362306a36Sopenharmony_ci * Which can be further simplified to:
12462306a36Sopenharmony_ci *
12562306a36Sopenharmony_ci *   (R - E) <= NR_active_file + NR_inactive_anon + NR_active_anon
12662306a36Sopenharmony_ci *
12762306a36Sopenharmony_ci *   (R - E) <= NR_active_anon + NR_inactive_file + NR_active_file
12862306a36Sopenharmony_ci *
12962306a36Sopenharmony_ci * Put into words, the refault distance (out-of-cache) can be seen as
13062306a36Sopenharmony_ci * a deficit in inactive list space (in-cache).  If the inactive list
13162306a36Sopenharmony_ci * had (R - E) more page slots, the page would not have been evicted
13262306a36Sopenharmony_ci * in between accesses, but activated instead.  And on a full system,
13362306a36Sopenharmony_ci * the only thing eating into inactive list space is active pages.
13462306a36Sopenharmony_ci *
13562306a36Sopenharmony_ci *
13662306a36Sopenharmony_ci *		Refaulting inactive pages
13762306a36Sopenharmony_ci *
13862306a36Sopenharmony_ci * All that is known about the active list is that the pages have been
13962306a36Sopenharmony_ci * accessed more than once in the past.  This means that at any given
14062306a36Sopenharmony_ci * time there is actually a good chance that pages on the active list
14162306a36Sopenharmony_ci * are no longer in active use.
14262306a36Sopenharmony_ci *
14362306a36Sopenharmony_ci * So when a refault distance of (R - E) is observed and there are at
14462306a36Sopenharmony_ci * least (R - E) pages in the userspace workingset, the refaulting page
14562306a36Sopenharmony_ci * is activated optimistically in the hope that (R - E) pages are actually
14662306a36Sopenharmony_ci * used less frequently than the refaulting page - or even not used at
14762306a36Sopenharmony_ci * all anymore.
14862306a36Sopenharmony_ci *
14962306a36Sopenharmony_ci * That means if inactive cache is refaulting with a suitable refault
15062306a36Sopenharmony_ci * distance, we assume the cache workingset is transitioning and put
15162306a36Sopenharmony_ci * pressure on the current workingset.
15262306a36Sopenharmony_ci *
 * If this is wrong and demotion kicks in, the pages which are truly
 * used more frequently will be reactivated while the less frequently
 * used ones will be evicted from memory.
15662306a36Sopenharmony_ci *
15762306a36Sopenharmony_ci * But if this is right, the stale pages will be pushed out of memory
15862306a36Sopenharmony_ci * and the used pages get to stay in cache.
15962306a36Sopenharmony_ci *
16062306a36Sopenharmony_ci *		Refaulting active pages
16162306a36Sopenharmony_ci *
16262306a36Sopenharmony_ci * If on the other hand the refaulting pages have recently been
16362306a36Sopenharmony_ci * deactivated, it means that the active list is no longer protecting
16462306a36Sopenharmony_ci * actively used cache from reclaim. The cache is NOT transitioning to
16562306a36Sopenharmony_ci * a different workingset; the existing workingset is thrashing in the
16662306a36Sopenharmony_ci * space allocated to the page cache.
16762306a36Sopenharmony_ci *
16862306a36Sopenharmony_ci *
16962306a36Sopenharmony_ci *		Implementation
17062306a36Sopenharmony_ci *
17162306a36Sopenharmony_ci * For each node's LRU lists, a counter for inactive evictions and
17262306a36Sopenharmony_ci * activations is maintained (node->nonresident_age).
17362306a36Sopenharmony_ci *
17462306a36Sopenharmony_ci * On eviction, a snapshot of this counter (along with some bits to
17562306a36Sopenharmony_ci * identify the node) is stored in the now empty page cache
17662306a36Sopenharmony_ci * slot of the evicted page.  This is called a shadow entry.
17762306a36Sopenharmony_ci *
17862306a36Sopenharmony_ci * On cache misses for which there are shadow entries, an eligible
17962306a36Sopenharmony_ci * refault distance will immediately activate the refaulting page.
18062306a36Sopenharmony_ci */
18162306a36Sopenharmony_ci
/* Number of low shadow-entry bits reserved for the PG_workingset flag. */
#define WORKINGSET_SHIFT 1
/*
 * Total number of shadow-entry bits NOT available to the eviction
 * timestamp: the xarray value tag, the workingset flag, the node id
 * and the memcg id.
 */
#define EVICTION_SHIFT	((BITS_PER_LONG - BITS_PER_XA_VALUE) +	\
			 WORKINGSET_SHIFT + NODES_SHIFT + \
			 MEM_CGROUP_ID_SHIFT)
/* Largest eviction timestamp that still fits in a shadow entry. */
#define EVICTION_MASK	(~0UL >> EVICTION_SHIFT)

/*
 * Eviction timestamps need to be able to cover the full range of
 * actionable refaults. However, bits are tight in the xarray
 * entry, and after storing the identifier for the lruvec there might
 * not be enough left to represent every single actionable refault. In
 * that case, we have to sacrifice granularity for distance, and group
 * evictions into coarser buckets by shaving off lower timestamp bits.
 */
static unsigned int bucket_order __read_mostly;
19762306a36Sopenharmony_ci
19862306a36Sopenharmony_cistatic void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
19962306a36Sopenharmony_ci			 bool workingset)
20062306a36Sopenharmony_ci{
20162306a36Sopenharmony_ci	eviction &= EVICTION_MASK;
20262306a36Sopenharmony_ci	eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
20362306a36Sopenharmony_ci	eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
20462306a36Sopenharmony_ci	eviction = (eviction << WORKINGSET_SHIFT) | workingset;
20562306a36Sopenharmony_ci
20662306a36Sopenharmony_ci	return xa_mk_value(eviction);
20762306a36Sopenharmony_ci}
20862306a36Sopenharmony_ci
20962306a36Sopenharmony_cistatic void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
21062306a36Sopenharmony_ci			  unsigned long *evictionp, bool *workingsetp)
21162306a36Sopenharmony_ci{
21262306a36Sopenharmony_ci	unsigned long entry = xa_to_value(shadow);
21362306a36Sopenharmony_ci	int memcgid, nid;
21462306a36Sopenharmony_ci	bool workingset;
21562306a36Sopenharmony_ci
21662306a36Sopenharmony_ci	workingset = entry & ((1UL << WORKINGSET_SHIFT) - 1);
21762306a36Sopenharmony_ci	entry >>= WORKINGSET_SHIFT;
21862306a36Sopenharmony_ci	nid = entry & ((1UL << NODES_SHIFT) - 1);
21962306a36Sopenharmony_ci	entry >>= NODES_SHIFT;
22062306a36Sopenharmony_ci	memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
22162306a36Sopenharmony_ci	entry >>= MEM_CGROUP_ID_SHIFT;
22262306a36Sopenharmony_ci
22362306a36Sopenharmony_ci	*memcgidp = memcgid;
22462306a36Sopenharmony_ci	*pgdat = NODE_DATA(nid);
22562306a36Sopenharmony_ci	*evictionp = entry;
22662306a36Sopenharmony_ci	*workingsetp = workingset;
22762306a36Sopenharmony_ci}
22862306a36Sopenharmony_ci
#ifdef CONFIG_LRU_GEN

/*
 * MGLRU counterpart of workingset_eviction(): build a shadow entry
 * whose token combines the lruvec's oldest generation (min_seq) with
 * the folio's reference count, and account the eviction in the
 * generation/tier histogram used for refault feedback.
 */
static void *lru_gen_eviction(struct folio *folio)
{
	int hist;
	unsigned long token;
	unsigned long min_seq;
	struct lruvec *lruvec;
	struct lru_gen_folio *lrugen;
	int type = folio_is_file_lru(folio);
	int delta = folio_nr_pages(folio);
	int refs = folio_lru_refs(folio);
	int tier = lru_tier_from_refs(refs);
	struct mem_cgroup *memcg = folio_memcg(folio);
	struct pglist_data *pgdat = folio_pgdat(folio);

	/* seq + refs must fit in the timestamp bits left by EVICTION_SHIFT */
	BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > BITS_PER_LONG - EVICTION_SHIFT);

	lruvec = mem_cgroup_lruvec(memcg, pgdat);
	lrugen = &lruvec->lrugen;
	min_seq = READ_ONCE(lrugen->min_seq[type]);
	/* low LRU_REFS_WIDTH bits carry refs - 1, clamped at zero */
	token = (min_seq << LRU_REFS_WIDTH) | max(refs - 1, 0);

	hist = lru_hist_from_seq(min_seq);
	atomic_long_add(delta, &lrugen->evicted[hist][type][tier]);

	/* nonzero refs doubles as the workingset bit in the shadow */
	return pack_shadow(mem_cgroup_id(memcg), pgdat, token, refs);
}
25762306a36Sopenharmony_ci
/*
 * Tests if the shadow entry is for a folio that was recently evicted.
 * Fills in @lruvec, @token, @workingset with the values unpacked from shadow.
 *
 * "Recent" means the generation recorded in the token still matches
 * the lruvec's current oldest generation (min_seq) for this LRU type.
 */
static bool lru_gen_test_recent(void *shadow, bool file, struct lruvec **lruvec,
				unsigned long *token, bool *workingset)
{
	int memcg_id;
	unsigned long min_seq;
	struct mem_cgroup *memcg;
	struct pglist_data *pgdat;

	unpack_shadow(shadow, &memcg_id, &pgdat, token, workingset);

	/* NOTE: memcg may be NULL if the id was recycled/offlined; the
	 * resulting lruvec then falls back per mem_cgroup_lruvec(). */
	memcg = mem_cgroup_from_id(memcg_id);
	*lruvec = mem_cgroup_lruvec(memcg, pgdat);

	min_seq = READ_ONCE((*lruvec)->lrugen.min_seq[file]);
	/* compare only the seq bits of the token, modulo the stored width */
	return (*token >> LRU_REFS_WIDTH) == (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH));
}
27862306a36Sopenharmony_ci
/*
 * MGLRU counterpart of workingset_refault(): account the refault and,
 * if the shadow entry is still recent, credit the folio's access tier
 * and possibly restore its workingset status.
 */
static void lru_gen_refault(struct folio *folio, void *shadow)
{
	bool recent;
	int hist, tier, refs;
	bool workingset;
	unsigned long token;
	struct lruvec *lruvec;
	struct lru_gen_folio *lrugen;
	int type = folio_is_file_lru(folio);
	int delta = folio_nr_pages(folio);

	/* RCU protects the memcg/lruvec looked up from the shadow entry */
	rcu_read_lock();

	recent = lru_gen_test_recent(shadow, type, &lruvec, &token, &workingset);
	/* only count refaults against the lruvec the folio now belongs to */
	if (lruvec != folio_lruvec(folio))
		goto unlock;

	mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta);

	if (!recent)
		goto unlock;

	lrugen = &lruvec->lrugen;

	hist = lru_hist_from_seq(READ_ONCE(lrugen->min_seq[type]));
	/* see the comment in folio_lru_refs() */
	refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + workingset;
	tier = lru_tier_from_refs(refs);

	atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]);
	mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta);

	/*
	 * Count the following two cases as stalls:
	 * 1. For pages accessed through page tables, hotter pages pushed out
	 *    hot pages which refaulted immediately.
	 * 2. For pages accessed multiple times through file descriptors,
	 *    they would have been protected by sort_folio().
	 */
	if (lru_gen_in_fault() || refs >= BIT(LRU_REFS_WIDTH) - 1) {
		/* saturate refs and set PG_workingset in one atomic update */
		set_mask_bits(&folio->flags, 0, LRU_REFS_MASK | BIT(PG_workingset));
		mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta);
	}
unlock:
	rcu_read_unlock();
}
32562306a36Sopenharmony_ci
#else /* !CONFIG_LRU_GEN */

/* No-op stub: without MGLRU, no lru_gen shadow entry is produced. */
static void *lru_gen_eviction(struct folio *folio)
{
	return NULL;
}

/* No-op stub: nothing is ever "recent" for MGLRU when it is compiled out. */
static bool lru_gen_test_recent(void *shadow, bool file, struct lruvec **lruvec,
				unsigned long *token, bool *workingset)
{
	return false;
}

/* No-op stub: refault accounting is handled by the classic path instead. */
static void lru_gen_refault(struct folio *folio, void *shadow)
{
}

#endif /* CONFIG_LRU_GEN */
34462306a36Sopenharmony_ci
34562306a36Sopenharmony_ci/**
34662306a36Sopenharmony_ci * workingset_age_nonresident - age non-resident entries as LRU ages
34762306a36Sopenharmony_ci * @lruvec: the lruvec that was aged
34862306a36Sopenharmony_ci * @nr_pages: the number of pages to count
34962306a36Sopenharmony_ci *
35062306a36Sopenharmony_ci * As in-memory pages are aged, non-resident pages need to be aged as
35162306a36Sopenharmony_ci * well, in order for the refault distances later on to be comparable
35262306a36Sopenharmony_ci * to the in-memory dimensions. This function allows reclaim and LRU
35362306a36Sopenharmony_ci * operations to drive the non-resident aging along in parallel.
35462306a36Sopenharmony_ci */
35562306a36Sopenharmony_civoid workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages)
35662306a36Sopenharmony_ci{
35762306a36Sopenharmony_ci	/*
35862306a36Sopenharmony_ci	 * Reclaiming a cgroup means reclaiming all its children in a
35962306a36Sopenharmony_ci	 * round-robin fashion. That means that each cgroup has an LRU
36062306a36Sopenharmony_ci	 * order that is composed of the LRU orders of its child
36162306a36Sopenharmony_ci	 * cgroups; and every page has an LRU position not just in the
36262306a36Sopenharmony_ci	 * cgroup that owns it, but in all of that group's ancestors.
36362306a36Sopenharmony_ci	 *
36462306a36Sopenharmony_ci	 * So when the physical inactive list of a leaf cgroup ages,
36562306a36Sopenharmony_ci	 * the virtual inactive lists of all its parents, including
36662306a36Sopenharmony_ci	 * the root cgroup's, age as well.
36762306a36Sopenharmony_ci	 */
36862306a36Sopenharmony_ci	do {
36962306a36Sopenharmony_ci		atomic_long_add(nr_pages, &lruvec->nonresident_age);
37062306a36Sopenharmony_ci	} while ((lruvec = parent_lruvec(lruvec)));
37162306a36Sopenharmony_ci}
37262306a36Sopenharmony_ci
/**
 * workingset_eviction - note the eviction of a folio from memory
 * @target_memcg: the cgroup that is causing the reclaim
 * @folio: the folio being evicted
 *
 * Return: a shadow entry to be stored in @folio->mapping->i_pages in place
 * of the evicted @folio so that a later refault can be detected.
 */
void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg)
{
	struct pglist_data *pgdat = folio_pgdat(folio);
	unsigned long eviction;
	struct lruvec *lruvec;
	int memcgid;

	/* Folio is fully exclusive and pins folio's memory cgroup pointer */
	VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
	VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

	/* MGLRU uses its own generation-based shadow encoding */
	if (lru_gen_enabled())
		return lru_gen_eviction(folio);

	lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
	/* XXX: target_memcg can be NULL, go through lruvec */
	memcgid = mem_cgroup_id(lruvec_memcg(lruvec));
	/* snapshot of the nonresident clock, coarsened by bucket_order */
	eviction = atomic_long_read(&lruvec->nonresident_age);
	eviction >>= bucket_order;
#ifdef CONFIG_HYPERHOLD_FILE_LRU
	/*
	 * NOTE(review): for unprotected file pages, aging is applied to
	 * the folio's own lruvec rather than the reclaim target's --
	 * presumably because hyperhold keeps file pages on a node-level
	 * LRU; confirm against the hyperhold file-LRU implementation.
	 */
	if (!is_prot_page(folio_page(folio, 0)) && page_is_file_lru(folio_page(folio, 0))) {
		lruvec = folio_lruvec(folio);
		workingset_age_nonresident(lruvec, folio_nr_pages(folio));
	} else {
		workingset_age_nonresident(lruvec, folio_nr_pages(folio));
	}
#else
	workingset_age_nonresident(lruvec, folio_nr_pages(folio));
#endif
	return pack_shadow(memcgid, pgdat, eviction,
				folio_test_workingset(folio));
}
41462306a36Sopenharmony_ci
41562306a36Sopenharmony_ci/**
41662306a36Sopenharmony_ci * workingset_test_recent - tests if the shadow entry is for a folio that was
41762306a36Sopenharmony_ci * recently evicted. Also fills in @workingset with the value unpacked from
41862306a36Sopenharmony_ci * shadow.
41962306a36Sopenharmony_ci * @shadow: the shadow entry to be tested.
42062306a36Sopenharmony_ci * @file: whether the corresponding folio is from the file lru.
42162306a36Sopenharmony_ci * @workingset: where the workingset value unpacked from shadow should
42262306a36Sopenharmony_ci * be stored.
42362306a36Sopenharmony_ci *
42462306a36Sopenharmony_ci * Return: true if the shadow is for a recently evicted folio; false otherwise.
42562306a36Sopenharmony_ci */
42662306a36Sopenharmony_cibool workingset_test_recent(void *shadow, bool file, bool *workingset)
42762306a36Sopenharmony_ci{
42862306a36Sopenharmony_ci	struct mem_cgroup *eviction_memcg;
42962306a36Sopenharmony_ci	struct lruvec *eviction_lruvec;
43062306a36Sopenharmony_ci	unsigned long refault_distance;
43162306a36Sopenharmony_ci	unsigned long workingset_size;
43262306a36Sopenharmony_ci	unsigned long refault;
43362306a36Sopenharmony_ci	int memcgid;
43462306a36Sopenharmony_ci	struct pglist_data *pgdat;
43562306a36Sopenharmony_ci	unsigned long eviction;
43662306a36Sopenharmony_ci
43762306a36Sopenharmony_ci	if (lru_gen_enabled())
43862306a36Sopenharmony_ci		return lru_gen_test_recent(shadow, file, &eviction_lruvec, &eviction, workingset);
43962306a36Sopenharmony_ci
44062306a36Sopenharmony_ci	unpack_shadow(shadow, &memcgid, &pgdat, &eviction, workingset);
44162306a36Sopenharmony_ci	eviction <<= bucket_order;
44262306a36Sopenharmony_ci
44362306a36Sopenharmony_ci	/*
44462306a36Sopenharmony_ci	 * Look up the memcg associated with the stored ID. It might
44562306a36Sopenharmony_ci	 * have been deleted since the folio's eviction.
44662306a36Sopenharmony_ci	 *
44762306a36Sopenharmony_ci	 * Note that in rare events the ID could have been recycled
44862306a36Sopenharmony_ci	 * for a new cgroup that refaults a shared folio. This is
44962306a36Sopenharmony_ci	 * impossible to tell from the available data. However, this
45062306a36Sopenharmony_ci	 * should be a rare and limited disturbance, and activations
45162306a36Sopenharmony_ci	 * are always speculative anyway. Ultimately, it's the aging
45262306a36Sopenharmony_ci	 * algorithm's job to shake out the minimum access frequency
45362306a36Sopenharmony_ci	 * for the active cache.
45462306a36Sopenharmony_ci	 *
45562306a36Sopenharmony_ci	 * XXX: On !CONFIG_MEMCG, this will always return NULL; it
45662306a36Sopenharmony_ci	 * would be better if the root_mem_cgroup existed in all
45762306a36Sopenharmony_ci	 * configurations instead.
45862306a36Sopenharmony_ci	 */
45962306a36Sopenharmony_ci#ifdef CONFIG_HYPERHOLD_FILE_LRU
46062306a36Sopenharmony_ci	if (memcgid != -1) {
46162306a36Sopenharmony_ci		eviction_memcg = mem_cgroup_from_id(memcgid);
46262306a36Sopenharmony_ci		if (!mem_cgroup_disabled() && !eviction_memcg)
46362306a36Sopenharmony_ci			return false;
46462306a36Sopenharmony_ci	}
46562306a36Sopenharmony_ci#else
46662306a36Sopenharmony_ci	eviction_memcg = mem_cgroup_from_id(memcgid);
46762306a36Sopenharmony_ci	if (!mem_cgroup_disabled() && !eviction_memcg)
46862306a36Sopenharmony_ci		return false;
46962306a36Sopenharmony_ci#endif
47062306a36Sopenharmony_ci
47162306a36Sopenharmony_ci	eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat);
47262306a36Sopenharmony_ci	refault = atomic_long_read(&eviction_lruvec->nonresident_age);
47362306a36Sopenharmony_ci
47462306a36Sopenharmony_ci	/*
47562306a36Sopenharmony_ci	 * Calculate the refault distance
47662306a36Sopenharmony_ci	 *
47762306a36Sopenharmony_ci	 * The unsigned subtraction here gives an accurate distance
47862306a36Sopenharmony_ci	 * across nonresident_age overflows in most cases. There is a
47962306a36Sopenharmony_ci	 * special case: usually, shadow entries have a short lifetime
48062306a36Sopenharmony_ci	 * and are either refaulted or reclaimed along with the inode
48162306a36Sopenharmony_ci	 * before they get too old.  But it is not impossible for the
48262306a36Sopenharmony_ci	 * nonresident_age to lap a shadow entry in the field, which
48362306a36Sopenharmony_ci	 * can then result in a false small refault distance, leading
48462306a36Sopenharmony_ci	 * to a false activation should this old entry actually
48562306a36Sopenharmony_ci	 * refault again.  However, earlier kernels used to deactivate
48662306a36Sopenharmony_ci	 * unconditionally with *every* reclaim invocation for the
48762306a36Sopenharmony_ci	 * longest time, so the occasional inappropriate activation
48862306a36Sopenharmony_ci	 * leading to pressure on the active list is not a problem.
48962306a36Sopenharmony_ci	 */
49062306a36Sopenharmony_ci	refault_distance = (refault - eviction) & EVICTION_MASK;
49162306a36Sopenharmony_ci
49262306a36Sopenharmony_ci	/*
49362306a36Sopenharmony_ci	 * Compare the distance to the existing workingset size. We
49462306a36Sopenharmony_ci	 * don't activate pages that couldn't stay resident even if
49562306a36Sopenharmony_ci	 * all the memory was available to the workingset. Whether
49662306a36Sopenharmony_ci	 * workingset competition needs to consider anon or not depends
49762306a36Sopenharmony_ci	 * on having free swap space.
49862306a36Sopenharmony_ci	 */
49962306a36Sopenharmony_ci#ifdef CONFIG_HYPERHOLD_FILE_LRU
50062306a36Sopenharmony_ci	workingset_size = lruvec_page_state(node_lruvec(pgdat), NR_ACTIVE_FILE);
50162306a36Sopenharmony_ci#else
50262306a36Sopenharmony_ci	workingset_size = lruvec_page_state(eviction_lruvec, NR_ACTIVE_FILE);
50362306a36Sopenharmony_ci#endif
50462306a36Sopenharmony_ci
50562306a36Sopenharmony_ci	if (!file) {
50662306a36Sopenharmony_ci#ifdef CONFIG_HYPERHOLD_FILE_LRU
50762306a36Sopenharmony_ci		workingset_size += lruvec_page_state(node_lruvec(pgdat),
50862306a36Sopenharmony_ci						     NR_INACTIVE_FILE);
50962306a36Sopenharmony_ci#else
51062306a36Sopenharmony_ci
51162306a36Sopenharmony_ci		workingset_size += lruvec_page_state(eviction_lruvec,
51262306a36Sopenharmony_ci						     NR_INACTIVE_FILE);
51362306a36Sopenharmony_ci#endif
51462306a36Sopenharmony_ci	}
51562306a36Sopenharmony_ci	if (mem_cgroup_get_nr_swap_pages(eviction_memcg) > 0) {
51662306a36Sopenharmony_ci		workingset_size += lruvec_page_state(eviction_lruvec,
51762306a36Sopenharmony_ci						     NR_ACTIVE_ANON);
51862306a36Sopenharmony_ci		if (file) {
51962306a36Sopenharmony_ci			workingset_size += lruvec_page_state(eviction_lruvec,
52062306a36Sopenharmony_ci						     NR_INACTIVE_ANON);
52162306a36Sopenharmony_ci		}
52262306a36Sopenharmony_ci	}
52362306a36Sopenharmony_ci
52462306a36Sopenharmony_ci	return refault_distance <= workingset_size;
52562306a36Sopenharmony_ci}
52662306a36Sopenharmony_ci
/**
 * workingset_refault - Evaluate the refault of a previously evicted folio.
 * @folio: The freshly allocated replacement folio.
 * @shadow: Shadow entry of the evicted folio.
 *
 * Calculates and evaluates the refault distance of the previously
 * evicted folio in the context of the node and the memcg whose memory
 * pressure caused the eviction.  On a qualifying refault the folio is
 * marked active (and possibly workingset) and the per-lruvec
 * WORKINGSET_{REFAULT,ACTIVATE,RESTORE} counters are bumped.
 */
void workingset_refault(struct folio *folio, void *shadow)
{
	bool file = folio_is_file_lru(folio);
	struct pglist_data *pgdat;
	struct mem_cgroup *memcg;
	struct lruvec *lruvec;
	bool workingset;
	long nr;

	/* MGLRU has its own refault bookkeeping; hand off entirely. */
	if (lru_gen_enabled()) {
		lru_gen_refault(folio, shadow);
		return;
	}

	/* Flush stats (and potentially sleep) before holding RCU read lock */
	mem_cgroup_flush_stats_ratelimited();

	rcu_read_lock();

	/*
	 * The activation decision for this folio is made at the level
	 * where the eviction occurred, as that is where the LRU order
	 * during folio reclaim is being determined.
	 *
	 * However, the cgroup that will own the folio is the one that
	 * is actually experiencing the refault event.
	 */
	nr = folio_nr_pages(folio);
	memcg = folio_memcg(folio);
	pgdat = folio_pgdat(folio);
	lruvec = mem_cgroup_lruvec(memcg, pgdat);

#ifdef CONFIG_HYPERHOLD_FILE_LRU
	/*
	 * Hyperhold: unprotected file folios are accounted on the
	 * node-level lruvec rather than the memcg lruvec — presumably
	 * because file pages live on a node-wide LRU in this config
	 * (TODO confirm against the hyperhold design).
	 */
	if (!is_prot_page(folio_page(folio, 0)) && file)
		mod_lruvec_state(node_lruvec(pgdat),
		    WORKINGSET_REFAULT_BASE + file, folio_nr_pages(folio));
	else
		mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr);
#else
	mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr);
#endif

	/* Refault distance too large to indicate a thrashing workingset? */
	if (!workingset_test_recent(shadow, file, &workingset))
		goto out;

	folio_set_active(folio);
#ifdef CONFIG_HYPERHOLD_FILE_LRU
	if (!is_prot_page(folio_page(folio, 0)) && file) {
		workingset_age_nonresident(node_lruvec(pgdat),
					   folio_nr_pages(folio));
		/*
		 * NOTE(review): REFAULT (above) and RESTORE (below) are
		 * accounted on node_lruvec(pgdat) for this case, but
		 * ACTIVATE here goes to the memcg lruvec — confirm
		 * whether this asymmetry is intentional.
		 */
		mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file, folio_nr_pages(folio));
	} else {
		workingset_age_nonresident(lruvec, nr);
		mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file, nr);
	}
#else
	workingset_age_nonresident(lruvec, nr);
	mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file, nr);
#endif

	/* Folio was active prior to eviction */
	if (workingset) {
		folio_set_workingset(folio);
		/*
		 * XXX: Move to folio_add_lru() when it supports new vs
		 * putback
		 */
		lru_note_cost_refault(folio);
#ifdef CONFIG_HYPERHOLD_FILE_LRU
		if (!is_prot_page(folio_page(folio, 0)) && file)
			mod_lruvec_state(node_lruvec(pgdat), WORKINGSET_RESTORE_BASE + file, folio_nr_pages(folio));
		else
			mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr);
#else
		mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr);
#endif
	}
out:
	rcu_read_unlock();
}
61662306a36Sopenharmony_ci
/**
 * workingset_activation - note a page activation
 * @folio: Folio that is being activated.
 *
 * Ages the non-resident part of the folio's lruvec so that eviction
 * timestamps keep pace with activations.
 */
void workingset_activation(struct folio *folio)
{
	struct mem_cgroup *memcg;

	rcu_read_lock();
	/*
	 * Filter non-memcg pages here, e.g. unmap can call
	 * mark_page_accessed() on VDSO pages.
	 *
	 * XXX: See workingset_refault() - this should return
	 * root_mem_cgroup even for !CONFIG_MEMCG.
	 */
	memcg = folio_memcg_rcu(folio);
	if (!mem_cgroup_disabled() && !memcg)
		goto out;
	/*
	 * The former CONFIG_HYPERHOLD_FILE_LRU conditional here was dead
	 * code: both arms performed this exact call (one merely went
	 * through a local variable), so the branch, its predicate calls
	 * and the otherwise-unused local were removed.  Behavior is
	 * unchanged in both configurations.
	 *
	 * NOTE(review): if the hyperhold build intended the
	 * unprotected-file case to age the node-level lruvec instead
	 * (as workingset_refault() does), that was never what this code
	 * did - confirm against the hyperhold design before
	 * reintroducing a branch here.
	 */
	workingset_age_nonresident(folio_lruvec(folio), folio_nr_pages(folio));
out:
	rcu_read_unlock();
}
65062306a36Sopenharmony_ci
/*
 * Shadow entries reflect the share of the working set that does not
 * fit into memory, so their number depends on the access pattern of
 * the workload.  In most cases, they will refault or get reclaimed
 * along with the inode, but a (malicious) workload that streams
 * through files with a total size several times that of available
 * memory, while preventing the inodes from being reclaimed, can
 * create excessive amounts of shadow nodes.  To keep a lid on this,
 * track shadow nodes and reclaim them when they grow way past the
 * point where they would still be useful.
 */

/* LRU of xarray nodes containing only shadow entries; maintained by
 * workingset_update_node() and reclaimed by the shrinker below. */
struct list_lru shadow_nodes;
66462306a36Sopenharmony_ci
/*
 * workingset_update_node - keep the shadow-node LRU in sync with an xa_node
 * @node: page cache xarray node whose entry population just changed
 *
 * Called with the owning mapping's i_pages lock held (asserted below).
 */
void workingset_update_node(struct xa_node *node)
{
	struct address_space *mapping;

	/*
	 * Track non-empty nodes that contain only shadow entries;
	 * unlink those that contain pages or are being freed.
	 *
	 * Avoid acquiring the list_lru lock when the nodes are
	 * already where they should be. The list_empty() test is safe
	 * as node->private_list is protected by the i_pages lock.
	 */
	mapping = container_of(node->array, struct address_space, i_pages);
	lockdep_assert_held(&mapping->i_pages.xa_lock);

	/* count == nr_values: node is non-empty and holds only shadow entries */
	if (node->count && node->count == node->nr_values) {
		if (list_empty(&node->private_list)) {
			list_lru_add(&shadow_nodes, &node->private_list);
			__inc_lruvec_kmem_state(node, WORKINGSET_NODES);
		}
	} else {
		if (!list_empty(&node->private_list)) {
			list_lru_del(&shadow_nodes, &node->private_list);
			__dec_lruvec_kmem_state(node, WORKINGSET_NODES);
		}
	}
}
69262306a36Sopenharmony_ci
/*
 * count_shadow_nodes - shrinker ->count_objects for the shadow-node LRU
 *
 * Returns the number of shadow nodes exceeding a limit derived from the
 * memory available to the node (or memcg), or SHRINK_EMPTY when the LRU
 * holds nothing.
 */
static unsigned long count_shadow_nodes(struct shrinker *shrinker,
					struct shrink_control *sc)
{
	unsigned long max_nodes;
	unsigned long nodes;
	unsigned long pages;

	nodes = list_lru_shrink_count(&shadow_nodes, sc);
	if (!nodes)
		return SHRINK_EMPTY;

	/*
	 * Approximate a reasonable limit for the nodes
	 * containing shadow entries. We don't need to keep more
	 * shadow entries than possible pages on the active list,
	 * since refault distances bigger than that are dismissed.
	 *
	 * The size of the active list converges toward 100% of
	 * overall page cache as memory grows, with only a tiny
	 * inactive list. Assume the total cache size for that.
	 *
	 * Nodes might be sparsely populated, with only one shadow
	 * entry in the extreme case. Obviously, we cannot keep one
	 * node for every eligible shadow entry, so compromise on a
	 * worst-case density of 1/8th. Below that, not all eligible
	 * refaults can be detected anymore.
	 *
	 * On 64-bit with 7 xa_nodes per page and 64 slots
	 * each, this will reclaim shadow entries when they consume
	 * ~1.8% of available memory:
	 *
	 * PAGE_SIZE / xa_nodes / node_entries * 8 / PAGE_SIZE
	 */
#ifdef CONFIG_MEMCG
#ifndef CONFIG_HYPERHOLD_FILE_LRU
	/*
	 * NOTE(review): with CONFIG_HYPERHOLD_FILE_LRU this per-memcg
	 * sizing is compiled out and whole-node present pages are used
	 * even for memcg-aware shrink requests — presumably because file
	 * pages are kept on a node-level LRU in that configuration;
	 * confirm against the hyperhold design.
	 */
	if (sc->memcg) {
		struct lruvec *lruvec;
		int i;

		mem_cgroup_flush_stats();
		lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid));
		for (pages = 0, i = 0; i < NR_LRU_LISTS; i++)
			pages += lruvec_page_state_local(lruvec,
							 NR_LRU_BASE + i);
		pages += lruvec_page_state_local(
			lruvec, NR_SLAB_RECLAIMABLE_B) >> PAGE_SHIFT;
		pages += lruvec_page_state_local(
			lruvec, NR_SLAB_UNRECLAIMABLE_B) >> PAGE_SHIFT;
	} else
#endif
#endif
		pages = node_present_pages(sc->nid);

	/* 1/8th worst-case node density; see block comment above */
	max_nodes = pages >> (XA_CHUNK_SHIFT - 3);

	if (nodes <= max_nodes)
		return 0;
	return nodes - max_nodes;
}
75262306a36Sopenharmony_ci
/*
 * shadow_lru_isolate - list_lru walk callback that reclaims one shadow node
 *
 * Attempts to take the node's i_pages lock (and i_lock for page cache
 * mappings) without blocking, detaches the node from the LRU and frees
 * it.  Returns LRU_RETRY when a trylock fails, LRU_REMOVED_RETRY on
 * success; either way the lru_lock is dropped and re-taken, so the
 * caller must restart its list position.
 */
static enum lru_status shadow_lru_isolate(struct list_head *item,
					  struct list_lru_one *lru,
					  spinlock_t *lru_lock,
					  void *arg) __must_hold(lru_lock)
{
	struct xa_node *node = container_of(item, struct xa_node, private_list);
	struct address_space *mapping;
	int ret;

	/*
	 * Page cache insertions and deletions synchronously maintain
	 * the shadow node LRU under the i_pages lock and the
	 * lru_lock.  Because the page cache tree is emptied before
	 * the inode can be destroyed, holding the lru_lock pins any
	 * address_space that has nodes on the LRU.
	 *
	 * We can then safely transition to the i_pages lock to
	 * pin only the address_space of the particular node we want
	 * to reclaim, take the node off-LRU, and drop the lru_lock.
	 */

	mapping = container_of(node->array, struct address_space, i_pages);

	/* Coming from the list, invert the lock order */
	if (!xa_trylock(&mapping->i_pages)) {
		spin_unlock_irq(lru_lock);
		ret = LRU_RETRY;
		goto out;
	}

	/* For page cache we need to hold i_lock */
	if (mapping->host != NULL) {
		if (!spin_trylock(&mapping->host->i_lock)) {
			xa_unlock(&mapping->i_pages);
			spin_unlock_irq(lru_lock);
			ret = LRU_RETRY;
			goto out;
		}
	}

	list_lru_isolate(lru, item);
	__dec_lruvec_kmem_state(node, WORKINGSET_NODES);

	spin_unlock(lru_lock);

	/*
	 * The nodes should only contain one or more shadow entries,
	 * no pages, so we expect to be able to remove them all and
	 * delete and free the empty node afterwards.
	 */
	if (WARN_ON_ONCE(!node->nr_values))
		goto out_invalid;
	if (WARN_ON_ONCE(node->count != node->nr_values))
		goto out_invalid;
	xa_delete_node(node, workingset_update_node);
	__inc_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM);

	/* Success path intentionally falls through: same unlock sequence. */
out_invalid:
	xa_unlock_irq(&mapping->i_pages);
	if (mapping->host != NULL) {
		if (mapping_shrinkable(mapping))
			inode_add_lru(mapping->host);
		spin_unlock(&mapping->host->i_lock);
	}
	ret = LRU_REMOVED_RETRY;
out:
	cond_resched();
	/* Re-take the lru_lock the walker expects to still hold. */
	spin_lock_irq(lru_lock);
	return ret;
}
82362306a36Sopenharmony_ci
82462306a36Sopenharmony_cistatic unsigned long scan_shadow_nodes(struct shrinker *shrinker,
82562306a36Sopenharmony_ci				       struct shrink_control *sc)
82662306a36Sopenharmony_ci{
82762306a36Sopenharmony_ci	/* list_lru lock nests inside the IRQ-safe i_pages lock */
82862306a36Sopenharmony_ci	return list_lru_shrink_walk_irq(&shadow_nodes, sc, shadow_lru_isolate,
82962306a36Sopenharmony_ci					NULL);
83062306a36Sopenharmony_ci}
83162306a36Sopenharmony_ci
/*
 * Shrinker for xarray nodes that hold only shadow entries.  NUMA and
 * memcg awareness match the shadow_nodes list_lru they are kept on.
 */
static struct shrinker workingset_shadow_shrinker = {
	.count_objects = count_shadow_nodes,
	.scan_objects = scan_shadow_nodes,
	.seeks = 0, /* ->count reports only fully expendable nodes */
	.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE,
};
83862306a36Sopenharmony_ci
/*
 * Our list_lru->lock is IRQ-safe as it nests inside the IRQ-safe
 * i_pages lock.
 */
/* Dedicated lockdep class for shadow_nodes' internal lock. */
static struct lock_class_key shadow_nodes_key;
84462306a36Sopenharmony_ci
/*
 * workingset_init - boot-time setup of eviction timestamp bucketing and
 * the shadow-node shrinker/list_lru pair.
 *
 * Returns 0 on success or a negative errno from shrinker/list_lru setup.
 */
static int __init workingset_init(void)
{
	unsigned int timestamp_bits;
	unsigned int max_order;
	int ret;

	BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT);
	/*
	 * Calculate the eviction bucket size to cover the longest
	 * actionable refault distance, which is currently half of
	 * memory (totalram_pages/2). However, memory hotplug may add
	 * some more pages at runtime, so keep working with up to
	 * double the initial memory by using totalram_pages as-is.
	 */
	timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT;
	max_order = fls_long(totalram_pages() - 1);
	if (max_order > timestamp_bits)
		bucket_order = max_order - timestamp_bits;
	pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
	       timestamp_bits, max_order, bucket_order);

	/* Preallocate the shrinker so the list_lru can register against it. */
	ret = prealloc_shrinker(&workingset_shadow_shrinker, "mm-shadow");
	if (ret)
		goto err;
	ret = __list_lru_init(&shadow_nodes, true, &shadow_nodes_key,
			      &workingset_shadow_shrinker);
	if (ret)
		goto err_list_lru;
	/* Only make the shrinker visible once shadow_nodes is usable. */
	register_shrinker_prepared(&workingset_shadow_shrinker);
	return 0;
err_list_lru:
	free_prealloced_shrinker(&workingset_shadow_shrinker);
err:
	return ret;
}
module_init(workingset_init);
881