// SPDX-License-Identifier: GPL-2.0
/*
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Swap reorganised 29.12.95, Stephen Tweedie.
 *  kswapd added: 7.1.96  sct
 *  Removed kswapd_ctl limits, and swap out as many pages as needed
 *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
 *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
 *  Multiqueue VM started 5.8.00, Rik van Riel.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/module.h>
#include <linux/gfp.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/vmpressure.h>
#include <linux/vmstat.h>
#include <linux/file.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>	/* for buffer_heads_over_limit */
#include <linux/mm_inline.h>
#include <linux/backing-dev.h>
#include <linux/rmap.h>
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/compaction.h>
#include <linux/notifier.h>
#include <linux/rwsem.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/memcontrol.h>
#include <linux/migrate.h>
#include <linux/delayacct.h>
#include <linux/sysctl.h>
#include <linux/memory-tiers.h>
#include <linux/oom.h>
#include <linux/pagevec.h>
#include <linux/prefetch.h>
#include <linux/printk.h>
#include <linux/dax.h>
#include <linux/psi.h>
#include <linux/pagewalk.h>
#include <linux/shmem_fs.h>
#include <linux/ctype.h>
#include <linux/debugfs.h>
#include <linux/khugepaged.h>
#include <linux/rculist_nulls.h>
#include <linux/random.h>

#include <asm/tlbflush.h>
#include <asm/div64.h>

#include <linux/swapops.h>
#include <linux/balloon_compaction.h>
#include <linux/sched/sysctl.h>

#include "internal.h"
#include "swap.h"

#define CREATE_TRACE_POINTS
#include <trace/events/vmscan.h>

#ifdef CONFIG_HYPERHOLD_FILE_LRU
#include <linux/memcg_policy.h>
#endif
#ifdef CONFIG_RECLAIM_ACCT
#include <linux/reclaim_acct.h>
#endif

#ifdef ARCH_HAS_PREFETCHW
#define prefetchw_prev_lru_folio(_folio, _base, _field)			\
	do {								\
		if ((_folio)->lru.prev != _base) {			\
			struct folio *prev;				\
									\
			prev = lru_to_folio(&(_folio->lru));		\
			prefetchw(&prev->_field);			\
		}							\
	} while (0)
#else
#define prefetchw_prev_lru_folio(_folio, _base, _field) do { } while (0)
#endif

#ifdef CONFIG_HYPERHOLD_FILE_LRU
unsigned int enough_inactive_file = 1;
#endif

/*
 * From 0 .. 200.  Higher means more swappy.
 */
int vm_swappiness = 60;

LIST_HEAD(shrinker_list);
DECLARE_RWSEM(shrinker_rwsem);

#ifdef CONFIG_MEMCG
static int shrinker_nr_max;

/* The shrinker_info is expanded in batches of BITS_PER_LONG */
static inline int shrinker_map_size(int nr_items)
{
	return (DIV_ROUND_UP(nr_items, BITS_PER_LONG) * sizeof(unsigned long));
}

static inline int shrinker_defer_size(int nr_items)
{
	return (round_up(nr_items, BITS_PER_LONG) * sizeof(atomic_long_t));
}
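
/*
 * Worked example (illustrative only, assuming a 64-bit build where
 * BITS_PER_LONG == 64 and sizeof(atomic_long_t) == 8): for
 * nr_items == 100, shrinker_map_size() needs two longs of bitmap and
 * returns 16 bytes, while shrinker_defer_size() rounds up to 128
 * slots and returns 1024 bytes for the deferred-work array.
 */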

static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
						     int nid)
{
	return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info,
					 lockdep_is_held(&shrinker_rwsem));
}

static int expand_one_shrinker_info(struct mem_cgroup *memcg,
				    int map_size, int defer_size,
				    int old_map_size, int old_defer_size,
				    int new_nr_max)
{
	struct shrinker_info *new, *old;
	struct mem_cgroup_per_node *pn;
	int nid;
	int size = map_size + defer_size;

	for_each_node(nid) {
		pn = memcg->nodeinfo[nid];
		old = shrinker_info_protected(memcg, nid);
		/* Not yet online memcg */
		if (!old)
			return 0;

		/* Already expanded this shrinker_info */
		if (new_nr_max <= old->map_nr_max)
			continue;

		new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid);
		if (!new)
			return -ENOMEM;

		new->nr_deferred = (atomic_long_t *)(new + 1);
		new->map = (void *)new->nr_deferred + defer_size;
		new->map_nr_max = new_nr_max;

		/* map: set all old bits, clear all new bits */
		memset(new->map, (int)0xff, old_map_size);
		memset((void *)new->map + old_map_size, 0, map_size - old_map_size);
		/* nr_deferred: copy old values, clear all new values */
		memcpy(new->nr_deferred, old->nr_deferred, old_defer_size);
		memset((void *)new->nr_deferred + old_defer_size, 0,
		       defer_size - old_defer_size);

		rcu_assign_pointer(pn->shrinker_info, new);
		kvfree_rcu(old, rcu);
	}

	return 0;
}
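
/*
 * Sketch of the single allocation made above (sizes assume a 64-bit
 * build with new_nr_max == 64):
 *
 *	+----------------------+----------------------+----------------+
 *	| struct shrinker_info | nr_deferred[64]      | map bitmap     |
 *	| header               | defer_size 512 bytes | map_size 8 B   |
 *	+----------------------+----------------------+----------------+
 */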

void free_shrinker_info(struct mem_cgroup *memcg)
{
	struct mem_cgroup_per_node *pn;
	struct shrinker_info *info;
	int nid;

	for_each_node(nid) {
		pn = memcg->nodeinfo[nid];
		info = rcu_dereference_protected(pn->shrinker_info, true);
		kvfree(info);
		rcu_assign_pointer(pn->shrinker_info, NULL);
	}
}

int alloc_shrinker_info(struct mem_cgroup *memcg)
{
	struct shrinker_info *info;
	int nid, size, ret = 0;
	int map_size, defer_size = 0;

	down_write(&shrinker_rwsem);
	map_size = shrinker_map_size(shrinker_nr_max);
	defer_size = shrinker_defer_size(shrinker_nr_max);
	size = map_size + defer_size;
	for_each_node(nid) {
		info = kvzalloc_node(sizeof(*info) + size, GFP_KERNEL, nid);
		if (!info) {
			free_shrinker_info(memcg);
			ret = -ENOMEM;
			break;
		}
		info->nr_deferred = (atomic_long_t *)(info + 1);
		info->map = (void *)info->nr_deferred + defer_size;
		info->map_nr_max = shrinker_nr_max;
		rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info);
	}
	up_write(&shrinker_rwsem);

	return ret;
}

static int expand_shrinker_info(int new_id)
{
	int ret = 0;
	int new_nr_max = round_up(new_id + 1, BITS_PER_LONG);
	int map_size, defer_size = 0;
	int old_map_size, old_defer_size = 0;
	struct mem_cgroup *memcg;

	if (!root_mem_cgroup)
		goto out;

	lockdep_assert_held(&shrinker_rwsem);

	map_size = shrinker_map_size(new_nr_max);
	defer_size = shrinker_defer_size(new_nr_max);
	old_map_size = shrinker_map_size(shrinker_nr_max);
	old_defer_size = shrinker_defer_size(shrinker_nr_max);

	memcg = mem_cgroup_iter(NULL, NULL, NULL);
	do {
		ret = expand_one_shrinker_info(memcg, map_size, defer_size,
					       old_map_size, old_defer_size,
					       new_nr_max);
		if (ret) {
			mem_cgroup_iter_break(NULL, memcg);
			goto out;
		}
	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
out:
	if (!ret)
		shrinker_nr_max = new_nr_max;

	return ret;
}

void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
{
	if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
		struct shrinker_info *info;

		rcu_read_lock();
		info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
		if (!WARN_ON_ONCE(shrinker_id >= info->map_nr_max)) {
			/* Pairs with smp mb in shrink_slab() */
			smp_mb__before_atomic();
			set_bit(shrinker_id, info->map);
		}
		rcu_read_unlock();
	}
}

static DEFINE_IDR(shrinker_idr);

static int prealloc_memcg_shrinker(struct shrinker *shrinker)
{
	int id, ret = -ENOMEM;

	if (mem_cgroup_disabled())
		return -ENOSYS;

	down_write(&shrinker_rwsem);
	/* This may call shrinker, so it must use down_read_trylock() */
	id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL);
	if (id < 0)
		goto unlock;

	if (id >= shrinker_nr_max) {
		if (expand_shrinker_info(id)) {
			idr_remove(&shrinker_idr, id);
			goto unlock;
		}
	}
	shrinker->id = id;
	ret = 0;
unlock:
	up_write(&shrinker_rwsem);
	return ret;
}

static void unregister_memcg_shrinker(struct shrinker *shrinker)
{
	int id = shrinker->id;

	BUG_ON(id < 0);

	lockdep_assert_held(&shrinker_rwsem);

	idr_remove(&shrinker_idr, id);
}

static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
				   struct mem_cgroup *memcg)
{
	struct shrinker_info *info;

	info = shrinker_info_protected(memcg, nid);
	return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0);
}

static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
				  struct mem_cgroup *memcg)
{
	struct shrinker_info *info;

	info = shrinker_info_protected(memcg, nid);
	return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]);
}

void reparent_shrinker_deferred(struct mem_cgroup *memcg)
{
	int i, nid;
	long nr;
	struct mem_cgroup *parent;
	struct shrinker_info *child_info, *parent_info;

	parent = parent_mem_cgroup(memcg);
	if (!parent)
		parent = root_mem_cgroup;

	/* Prevent from concurrent shrinker_info expand */
	down_read(&shrinker_rwsem);
	for_each_node(nid) {
		child_info = shrinker_info_protected(memcg, nid);
		parent_info = shrinker_info_protected(parent, nid);
		for (i = 0; i < child_info->map_nr_max; i++) {
			nr = atomic_long_read(&child_info->nr_deferred[i]);
			atomic_long_add(nr, &parent_info->nr_deferred[i]);
		}
	}
	up_read(&shrinker_rwsem);
}

/* Returns true for reclaim through cgroup limits or cgroup interfaces. */
bool cgroup_reclaim(struct scan_control *sc)
{
	return sc->target_mem_cgroup;
}

/*
 * Returns true for reclaim on the root cgroup. This is true for direct
 * allocator reclaim and reclaim through cgroup interfaces on the root cgroup.
 */
static bool root_reclaim(struct scan_control *sc)
{
	return !sc->target_mem_cgroup || mem_cgroup_is_root(sc->target_mem_cgroup);
}

/**
 * writeback_throttling_sane - is the usual dirty throttling mechanism available?
 * @sc: scan_control in question
 *
 * The normal page dirty throttling mechanism in balance_dirty_pages() is
 * completely broken with the legacy memcg and direct stalling in
 * shrink_folio_list() is used for throttling instead, which lacks all the
 * niceties such as fairness, adaptive pausing, bandwidth proportional
 * allocation and configurability.
 *
 * This function tests whether the vmscan currently in progress can assume
 * that the normal dirty throttling mechanism is operational.
 */
bool writeback_throttling_sane(struct scan_control *sc)
{
	if (!cgroup_reclaim(sc))
		return true;
#ifdef CONFIG_CGROUP_WRITEBACK
	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
		return true;
#endif
	return false;
}
#else
static int prealloc_memcg_shrinker(struct shrinker *shrinker)
{
	return -ENOSYS;
}

static void unregister_memcg_shrinker(struct shrinker *shrinker)
{
}

static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
				   struct mem_cgroup *memcg)
{
	return 0;
}

static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
				  struct mem_cgroup *memcg)
{
	return 0;
}

bool cgroup_reclaim(struct scan_control *sc)
{
	return false;
}

static bool root_reclaim(struct scan_control *sc)
{
	return true;
}

bool writeback_throttling_sane(struct scan_control *sc)
{
	return true;
}
#endif

static void set_task_reclaim_state(struct task_struct *task,
				   struct reclaim_state *rs)
{
	/* Check for an overwrite */
	WARN_ON_ONCE(rs && task->reclaim_state);

	/* Check for the nulling of an already-nulled member */
	WARN_ON_ONCE(!rs && !task->reclaim_state);

	task->reclaim_state = rs;
}
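
/*
 * Typical calling pattern (a sketch: the reclaim entry points in this
 * file bracket their work like this so that pages freed outside the
 * LRUs can be credited via flush_reclaim_state() below):
 *
 *	struct reclaim_state rs = { .reclaimed = 0 };
 *
 *	set_task_reclaim_state(current, &rs);
 *	...reclaim; shrinkers account freed pages in current->reclaim_state...
 *	set_task_reclaim_state(current, NULL);
 */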

/*
 * flush_reclaim_state(): add pages reclaimed outside of LRU-based reclaim to
 * scan_control->nr_reclaimed.
 */
static void flush_reclaim_state(struct scan_control *sc)
{
	/*
	 * Currently, reclaim_state->reclaimed includes three types of pages
	 * freed outside of vmscan:
	 * (1) Slab pages.
	 * (2) Clean file pages from pruned inodes (on highmem systems).
	 * (3) XFS freed buffer pages.
	 *
	 * For all of these cases, we cannot universally link the pages to a
	 * single memcg. For example, a memcg-aware shrinker can free one object
	 * charged to the target memcg, causing an entire page to be freed.
	 * If we count the entire page as reclaimed from the memcg, we end up
	 * overestimating the reclaimed amount (potentially under-reclaiming).
	 *
	 * Only count such pages for global reclaim to prevent under-reclaiming
	 * from the target memcg; preventing unnecessary retries during memcg
	 * charging and false positives from proactive reclaim.
	 *
	 * For uncommon cases where the freed pages were actually mostly
	 * charged to the target memcg, we end up underestimating the reclaimed
	 * amount. This should be fine. The freed pages will be uncharged
	 * anyway, even if they are not counted here properly, and we will be
	 * able to make forward progress in charging (which is usually in a
	 * retry loop).
	 *
	 * We can go one step further, and report the uncharged objcg pages in
	 * memcg reclaim, to make reporting more accurate and reduce
	 * underestimation, but it's probably not worth the complexity for now.
	 */
	if (current->reclaim_state && root_reclaim(sc)) {
		sc->nr_reclaimed += current->reclaim_state->reclaimed;
		current->reclaim_state->reclaimed = 0;
	}
}

static long xchg_nr_deferred(struct shrinker *shrinker,
			     struct shrink_control *sc)
{
	int nid = sc->nid;

	if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
		nid = 0;

	if (sc->memcg &&
	    (shrinker->flags & SHRINKER_MEMCG_AWARE))
		return xchg_nr_deferred_memcg(nid, shrinker,
					      sc->memcg);

	return atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
}


static long add_nr_deferred(long nr, struct shrinker *shrinker,
			    struct shrink_control *sc)
{
	int nid = sc->nid;

	if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
		nid = 0;

	if (sc->memcg &&
	    (shrinker->flags & SHRINKER_MEMCG_AWARE))
		return add_nr_deferred_memcg(nr, nid, shrinker,
					     sc->memcg);

	return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]);
}

static bool can_demote(int nid, struct scan_control *sc)
{
	if (!numa_demotion_enabled)
		return false;
	if (sc && sc->no_demotion)
		return false;
	if (next_demotion_node(nid) == NUMA_NO_NODE)
		return false;

	return true;
}

static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
					  int nid,
					  struct scan_control *sc)
{
	if (memcg == NULL) {
		/*
		 * For non-memcg reclaim, is there
		 * space in any swap device?
		 */
		if (get_nr_swap_pages() > 0)
			return true;
	} else {
		/* Is the memcg below its swap limit? */
		if (mem_cgroup_get_nr_swap_pages(memcg) > 0)
			return true;
	}

	/*
	 * The page cannot be swapped.
	 *
	 * Can it be reclaimed from this node via demotion?
	 */
	return can_demote(nid, sc);
}

/*
 * This misses isolated folios, which are not accounted for here to keep
 * the counters cheap. As the data only determines whether reclaim or
 * compaction continues, it is not expected that isolated folios will be
 * a dominating factor.
 */
unsigned long zone_reclaimable_pages(struct zone *zone)
{
	unsigned long nr;

	nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) +
		zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE);
	if (can_reclaim_anon_pages(NULL, zone_to_nid(zone), NULL))
		nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) +
			zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON);

	return nr;
}

/**
 * lruvec_lru_size - Returns the number of pages on the given LRU list.
 * @lruvec: lru vector
 * @lru: lru to use
 * @zone_idx: zones to consider (use MAX_NR_ZONES - 1 for the whole LRU list)
 */
unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru,
				     int zone_idx)
{
	unsigned long size = 0;
	int zid;

#ifdef CONFIG_HYPERHOLD_FILE_LRU
	if (!mem_cgroup_disabled() && is_node_lruvec(lruvec)) {
		for (zid = 0; zid <= zone_idx && zid < MAX_NR_ZONES; zid++) {
			struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid];

			if (!managed_zone(zone))
				continue;

			size += zone_page_state(zone, NR_ZONE_LRU_BASE + lru);
		}

		return size;
	}
#endif

	for (zid = 0; zid <= zone_idx; zid++) {
		struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid];

		if (!managed_zone(zone))
			continue;

		if (!mem_cgroup_disabled())
			size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
		else
			size += zone_page_state(zone, NR_ZONE_LRU_BASE + lru);
	}
	return size;
}
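
/*
 * Example (illustrative): the total size of a lruvec's inactive file
 * list, across all zones on the node, is
 *
 *	lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES - 1);
 *
 * while a smaller @zone_idx limits the count to the zones usable by a
 * particular allocation context.
 */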

/*
 * Add a shrinker callback to be called from the vm.
 */
static int __prealloc_shrinker(struct shrinker *shrinker)
{
	unsigned int size;
	int err;

	if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
		err = prealloc_memcg_shrinker(shrinker);
		if (err != -ENOSYS)
			return err;

		shrinker->flags &= ~SHRINKER_MEMCG_AWARE;
	}

	size = sizeof(*shrinker->nr_deferred);
	if (shrinker->flags & SHRINKER_NUMA_AWARE)
		size *= nr_node_ids;

	shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
	if (!shrinker->nr_deferred)
		return -ENOMEM;

	return 0;
}

#ifdef CONFIG_SHRINKER_DEBUG
int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...)
{
	va_list ap;
	int err;

	va_start(ap, fmt);
	shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap);
	va_end(ap);
	if (!shrinker->name)
		return -ENOMEM;

	err = __prealloc_shrinker(shrinker);
	if (err) {
		kfree_const(shrinker->name);
		shrinker->name = NULL;
	}

	return err;
}
#else
int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...)
{
	return __prealloc_shrinker(shrinker);
}
#endif

void free_prealloced_shrinker(struct shrinker *shrinker)
{
#ifdef CONFIG_SHRINKER_DEBUG
	kfree_const(shrinker->name);
	shrinker->name = NULL;
#endif
	if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
		down_write(&shrinker_rwsem);
		unregister_memcg_shrinker(shrinker);
		up_write(&shrinker_rwsem);
		return;
	}

	kfree(shrinker->nr_deferred);
	shrinker->nr_deferred = NULL;
}

void register_shrinker_prepared(struct shrinker *shrinker)
{
	down_write(&shrinker_rwsem);
	list_add_tail(&shrinker->list, &shrinker_list);
	shrinker->flags |= SHRINKER_REGISTERED;
	shrinker_debugfs_add(shrinker);
	up_write(&shrinker_rwsem);
}

static int __register_shrinker(struct shrinker *shrinker)
{
	int err = __prealloc_shrinker(shrinker);

	if (err)
		return err;
	register_shrinker_prepared(shrinker);
	return 0;
}

#ifdef CONFIG_SHRINKER_DEBUG
int register_shrinker(struct shrinker *shrinker, const char *fmt, ...)
{
	va_list ap;
	int err;

	va_start(ap, fmt);
	shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap);
	va_end(ap);
	if (!shrinker->name)
		return -ENOMEM;

	err = __register_shrinker(shrinker);
	if (err) {
		kfree_const(shrinker->name);
		shrinker->name = NULL;
	}
	return err;
}
#else
int register_shrinker(struct shrinker *shrinker, const char *fmt, ...)
{
	return __register_shrinker(shrinker);
}
#endif
EXPORT_SYMBOL(register_shrinker);
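
/*
 * Minimal registration sketch (illustrative only; "my-cache" and the
 * my_cache_* helpers are hypothetical). my_count() returns the number
 * of freeable objects (or SHRINK_EMPTY), and my_scan() frees up to
 * sc->nr_to_scan objects and returns the count freed (or SHRINK_STOP):
 *
 *	static unsigned long my_count(struct shrinker *s,
 *				      struct shrink_control *sc)
 *	{
 *		return my_cache_nr_objects();
 *	}
 *
 *	static unsigned long my_scan(struct shrinker *s,
 *				     struct shrink_control *sc)
 *	{
 *		return my_cache_free(sc->nr_to_scan);
 *	}
 *
 *	static struct shrinker my_shrinker = {
 *		.count_objects = my_count,
 *		.scan_objects = my_scan,
 *		.seeks = DEFAULT_SEEKS,
 *	};
 *
 *	err = register_shrinker(&my_shrinker, "my-cache");
 */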

/*
 * Remove one
 */
void unregister_shrinker(struct shrinker *shrinker)
{
	struct dentry *debugfs_entry;
	int debugfs_id;

	if (!(shrinker->flags & SHRINKER_REGISTERED))
		return;

	down_write(&shrinker_rwsem);
	list_del(&shrinker->list);
	shrinker->flags &= ~SHRINKER_REGISTERED;
	if (shrinker->flags & SHRINKER_MEMCG_AWARE)
		unregister_memcg_shrinker(shrinker);
	debugfs_entry = shrinker_debugfs_detach(shrinker, &debugfs_id);
	up_write(&shrinker_rwsem);

	shrinker_debugfs_remove(debugfs_entry, debugfs_id);

	kfree(shrinker->nr_deferred);
	shrinker->nr_deferred = NULL;
}
EXPORT_SYMBOL(unregister_shrinker);

/**
 * synchronize_shrinkers - Wait for all running shrinkers to complete.
 *
 * This is equivalent to calling unregister_shrinker() and register_shrinker(),
 * but atomically and with less overhead. This is useful to guarantee that all
 * shrinker invocations have seen an update, before freeing memory, similar to
 * rcu.
 */
void synchronize_shrinkers(void)
{
	down_write(&shrinker_rwsem);
	up_write(&shrinker_rwsem);
}
EXPORT_SYMBOL(synchronize_shrinkers);

#define SHRINK_BATCH 128

static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
				    struct shrinker *shrinker, int priority)
{
	unsigned long freed = 0;
	unsigned long long delta;
	long total_scan;
	long freeable;
	long nr;
	long new_nr;
	long batch_size = shrinker->batch ? shrinker->batch
					  : SHRINK_BATCH;
	long scanned = 0, next_deferred;

	freeable = shrinker->count_objects(shrinker, shrinkctl);
	if (freeable == 0 || freeable == SHRINK_EMPTY)
		return freeable;

	/*
	 * copy the current shrinker scan count into a local variable
	 * and zero it so that other concurrent shrinker invocations
	 * don't also do this scanning work.
	 */
	nr = xchg_nr_deferred(shrinker, shrinkctl);

	if (shrinker->seeks) {
		delta = freeable >> priority;
		delta *= 4;
		do_div(delta, shrinker->seeks);
	} else {
		/*
		 * These objects don't require any IO to create. Trim
		 * them aggressively under memory pressure to keep
		 * them from causing refetches in the IO caches.
		 */
		delta = freeable / 2;
	}

	total_scan = nr >> priority;
	total_scan += delta;
	total_scan = min(total_scan, (2 * freeable));

	trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
				   freeable, delta, total_scan, priority);

	/*
	 * Normally, we should not scan less than batch_size objects in one
	 * pass to avoid too frequent shrinker calls, but if the slab has less
	 * than batch_size objects in total and we are really tight on memory,
	 * we will try to reclaim all available objects, otherwise we can end
	 * up failing allocations although there are plenty of reclaimable
	 * objects spread over several slabs with usage less than the
	 * batch_size.
	 *
	 * We detect the "tight on memory" situations by looking at the total
	 * number of objects we want to scan (total_scan). If it is greater
	 * than the total number of objects on slab (freeable), we must be
	 * scanning at high prio and therefore should try to reclaim as much as
	 * possible.
	 */
	while (total_scan >= batch_size ||
	       total_scan >= freeable) {
		unsigned long ret;
		unsigned long nr_to_scan = min(batch_size, total_scan);

		shrinkctl->nr_to_scan = nr_to_scan;
		shrinkctl->nr_scanned = nr_to_scan;
		ret = shrinker->scan_objects(shrinker, shrinkctl);
		if (ret == SHRINK_STOP)
			break;
		freed += ret;

		count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned);
		total_scan -= shrinkctl->nr_scanned;
		scanned += shrinkctl->nr_scanned;

		cond_resched();
	}

	/*
	 * The deferred work is increased by any new work (delta) that wasn't
	 * done, decreased by old deferred work that was done now.
	 *
	 * And it is capped to two times of the freeable items.
	 */
	next_deferred = max_t(long, (nr + delta - scanned), 0);
	next_deferred = min(next_deferred, (2 * freeable));

	/*
	 * move the unused scan count back into the shrinker in a
	 * manner that handles concurrent updates.
	 */
	new_nr = add_nr_deferred(next_deferred, shrinker, shrinkctl);

	trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan);
	return freed;
}
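
/*
 * Worked example of the scan-target arithmetic above (illustrative
 * numbers): with freeable == 10000, priority == 12 and
 * seeks == DEFAULT_SEEKS (2), delta == (10000 >> 12) * 4 / 2 == 4, so
 * only a handful of objects are scanned per call under light pressure.
 * At priority == 0 the same cache gives delta == 20000, and the
 * min(total_scan, 2 * freeable) clamp keeps total_scan at 20000 even
 * if deferred work has piled up.
 */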

#ifdef CONFIG_MEMCG
static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
			struct mem_cgroup *memcg, int priority)
{
	struct shrinker_info *info;
	unsigned long ret, freed = 0;
	int i;

	if (!mem_cgroup_online(memcg))
		return 0;

	if (!down_read_trylock(&shrinker_rwsem))
		return 0;

	info = shrinker_info_protected(memcg, nid);
	if (unlikely(!info))
		goto unlock;

	for_each_set_bit(i, info->map, info->map_nr_max) {
		struct shrink_control sc = {
			.gfp_mask = gfp_mask,
			.nid = nid,
			.memcg = memcg,
		};
		struct shrinker *shrinker;

		shrinker = idr_find(&shrinker_idr, i);
		if (unlikely(!shrinker || !(shrinker->flags & SHRINKER_REGISTERED))) {
			if (!shrinker)
				clear_bit(i, info->map);
			continue;
		}

		/* Call non-slab shrinkers even though kmem is disabled */
		if (!memcg_kmem_online() &&
		    !(shrinker->flags & SHRINKER_NONSLAB))
			continue;

		ret = do_shrink_slab(&sc, shrinker, priority);
		if (ret == SHRINK_EMPTY) {
			clear_bit(i, info->map);
			/*
			 * After the shrinker reported that it had no objects to
			 * free, but before we cleared the corresponding bit in
			 * the memcg shrinker map, a new object might have been
			 * added. To make sure, we have the bit set in this
			 * case, we invoke the shrinker one more time and reset
			 * the bit if it reports that it is not empty anymore.
			 * The memory barrier here pairs with the barrier in
			 * set_shrinker_bit():
			 *
			 * list_lru_add()     shrink_slab_memcg()
			 *   list_add_tail()    clear_bit()
			 *   <MB>               <MB>
			 *   set_bit()          do_shrink_slab()
			 */
			smp_mb__after_atomic();
			ret = do_shrink_slab(&sc, shrinker, priority);
			if (ret == SHRINK_EMPTY)
				ret = 0;
			else
				set_shrinker_bit(memcg, nid, i);
		}
		freed += ret;

		if (rwsem_is_contended(&shrinker_rwsem)) {
			freed = freed ? : 1;
			break;
		}
	}
unlock:
	up_read(&shrinker_rwsem);
	return freed;
}
#else /* CONFIG_MEMCG */
static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
			struct mem_cgroup *memcg, int priority)
{
	return 0;
}
#endif /* CONFIG_MEMCG */

/**
 * shrink_slab - shrink slab caches
 * @gfp_mask: allocation context
 * @nid: node whose slab caches to target
 * @memcg: memory cgroup whose slab caches to target
 * @priority: the reclaim priority
 *
 * Call the shrink functions to age shrinkable caches.
 *
 * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set,
 * unaware shrinkers will receive a node id of 0 instead.
 *
 * @memcg specifies the memory cgroup to target. Unaware shrinkers
 * are called only if it is the root cgroup.
 *
 * @priority is sc->priority, we take the number of objects and >> by priority
 * in order to get the scan target.
 *
 * Returns the number of reclaimed slab objects.
 */
unsigned long shrink_slab(gfp_t gfp_mask, int nid,
				 struct mem_cgroup *memcg,
				 int priority)
{
	unsigned long ret, freed = 0;
	struct shrinker *shrinker;

	/*
	 * The root memcg might be allocated even though memcg is disabled
	 * via "cgroup_disable=memory" boot parameter.  This could make
	 * mem_cgroup_is_root() return false, then just run memcg slab
	 * shrink, but skip global shrink.  This may result in premature
	 * oom.
	 */
	if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
		return shrink_slab_memcg(gfp_mask, nid, memcg, priority);

	if (!down_read_trylock(&shrinker_rwsem))
		goto out;

	list_for_each_entry(shrinker, &shrinker_list, list) {
		struct shrink_control sc = {
			.gfp_mask = gfp_mask,
			.nid = nid,
			.memcg = memcg,
		};

		ret = do_shrink_slab(&sc, shrinker, priority);
		if (ret == SHRINK_EMPTY)
			ret = 0;
		freed += ret;
		/*
		 * Bail out if someone wants to register a new shrinker to
		 * prevent the registration from being stalled for long periods
		 * by parallel ongoing shrinking.
		 */
		if (rwsem_is_contended(&shrinker_rwsem)) {
			freed = freed ? : 1;
			break;
		}
	}

	up_read(&shrinker_rwsem);
out:
	cond_resched();
	return freed;
}

static unsigned long drop_slab_node(int nid)
{
	unsigned long freed = 0;
	struct mem_cgroup *memcg = NULL;

	memcg = mem_cgroup_iter(NULL, NULL, NULL);
	do {
		freed += shrink_slab(GFP_KERNEL, nid, memcg, 0);
	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);

	return freed;
}

void drop_slab(void)
{
	int nid;
	int shift = 0;
	unsigned long freed;

	do {
		freed = 0;
		for_each_online_node(nid) {
			if (fatal_signal_pending(current))
				return;

			freed += drop_slab_node(nid);
		}
	} while ((freed >> shift++) > 1);
}
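
/*
 * The loop above keeps going only while the passes stay productive:
 * the bar doubles every pass, since (freed >> shift) must exceed 1.
 * Illustrative numbers: passes freeing 1000, 500, 10 and 2 objects
 * stop after the fourth pass, as 2 >> 3 == 0.
 */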

static int reclaimer_offset(void)
{
	BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD !=
			PGDEMOTE_DIRECT - PGDEMOTE_KSWAPD);
	BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD !=
			PGSCAN_DIRECT - PGSCAN_KSWAPD);
	BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD !=
			PGDEMOTE_KHUGEPAGED - PGDEMOTE_KSWAPD);
	BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD !=
			PGSCAN_KHUGEPAGED - PGSCAN_KSWAPD);

	if (current_is_kswapd())
		return 0;
	if (current_is_khugepaged())
		return PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD;
	return PGSTEAL_DIRECT - PGSTEAL_KSWAPD;
}
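
/*
 * The per-context vmstat counters sit at fixed offsets from their
 * kswapd variant, so callers elsewhere in this file can write, e.g.:
 *
 *	__count_vm_events(PGSTEAL_KSWAPD + reclaimer_offset(), nr);
 *
 * and have the event credited to kswapd, direct reclaim or khugepaged
 * as appropriate.
 */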

static inline int is_page_cache_freeable(struct folio *folio)
{
	/*
	 * A freeable page cache folio is referenced only by the caller
	 * that isolated the folio, the page cache and optional filesystem
	 * private data at folio->private.
	 */
	return folio_ref_count(folio) - folio_test_private(folio) ==
		1 + folio_nr_pages(folio);
}
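
/*
 * Worked example (illustrative): an order-0 folio in the page cache,
 * with no private data, held by one isolating caller has
 * folio_ref_count() == 2 (isolator + page cache) and
 * folio_test_private() == 0, so 2 - 0 == 1 + 1 and the folio is
 * freeable.  Any extra reference, e.g. a racing page cache lookup,
 * breaks the equality and the folio is kept.
 */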

/*
 * We detected a synchronous write error writing a folio out.  Probably
 * -ENOSPC.  We need to propagate that into the address_space for a subsequent
 * fsync(), msync() or close().
 *
 * The tricky part is that after writepage we cannot touch the mapping: nothing
 * prevents it from being freed up.  But we have a ref on the folio and once
 * that folio is locked, the mapping is pinned.
 *
 * We're allowed to run sleeping folio_lock() here because we know the caller has
 * __GFP_FS.
 */
static void handle_write_error(struct address_space *mapping,
				struct folio *folio, int error)
{
	folio_lock(folio);
	if (folio_mapping(folio) == mapping)
		mapping_set_error(mapping, error);
	folio_unlock(folio);
}

static bool skip_throttle_noprogress(pg_data_t *pgdat)
{
	int reclaimable = 0, write_pending = 0;
	int i;

	/*
	 * If kswapd is disabled, reschedule if necessary but do not
	 * throttle as the system is likely near OOM.
	 */
	if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
		return true;

	/*
	 * If there are a lot of dirty/writeback folios then do not
	 * throttle as throttling will occur when the folios cycle
	 * towards the end of the LRU if still under writeback.
	 */
	for (i = 0; i < MAX_NR_ZONES; i++) {
		struct zone *zone = pgdat->node_zones + i;

		if (!managed_zone(zone))
			continue;

		reclaimable += zone_reclaimable_pages(zone);
		write_pending += zone_page_state_snapshot(zone,
						  NR_ZONE_WRITE_PENDING);
	}
	if (2 * write_pending <= reclaimable)
		return true;

	return false;
}
112162306a36Sopenharmony_ci
112262306a36Sopenharmony_civoid reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason)
112362306a36Sopenharmony_ci{
112462306a36Sopenharmony_ci	wait_queue_head_t *wqh = &pgdat->reclaim_wait[reason];
112562306a36Sopenharmony_ci	long timeout, ret;
112662306a36Sopenharmony_ci	DEFINE_WAIT(wait);
112762306a36Sopenharmony_ci
112862306a36Sopenharmony_ci	/*
112962306a36Sopenharmony_ci	 * Do not throttle user workers, kthreads other than kswapd or
113062306a36Sopenharmony_ci	 * workqueues. They may be required for reclaim to make
113162306a36Sopenharmony_ci	 * forward progress (e.g. journalling workqueues or kthreads).
113262306a36Sopenharmony_ci	 */
113362306a36Sopenharmony_ci	if (!current_is_kswapd() &&
113462306a36Sopenharmony_ci	    current->flags & (PF_USER_WORKER|PF_KTHREAD)) {
113562306a36Sopenharmony_ci		cond_resched();
113662306a36Sopenharmony_ci		return;
113762306a36Sopenharmony_ci	}
113862306a36Sopenharmony_ci
113962306a36Sopenharmony_ci	/*
114062306a36Sopenharmony_ci	 * These figures are pulled out of thin air.
114162306a36Sopenharmony_ci	 * VMSCAN_THROTTLE_ISOLATED is a transient condition based on too many
114262306a36Sopenharmony_ci	 * parallel reclaimers which is a short-lived event so the timeout is
114362306a36Sopenharmony_ci	 * short. Failing to make progress or waiting on writeback are
114462306a36Sopenharmony_ci	 * potentially long-lived events so use a longer timeout. This is shaky
114562306a36Sopenharmony_ci	 * logic as a failure to make progress could be due to anything from
114662306a36Sopenharmony_ci	 * writeback to a slow device to excessive referenced folios at the tail
114762306a36Sopenharmony_ci	 * of the inactive LRU.
114862306a36Sopenharmony_ci	 */
114962306a36Sopenharmony_ci	switch(reason) {
115062306a36Sopenharmony_ci	case VMSCAN_THROTTLE_WRITEBACK:
115162306a36Sopenharmony_ci		timeout = HZ/10;
115262306a36Sopenharmony_ci
115362306a36Sopenharmony_ci		if (atomic_inc_return(&pgdat->nr_writeback_throttled) == 1) {
115462306a36Sopenharmony_ci			WRITE_ONCE(pgdat->nr_reclaim_start,
115562306a36Sopenharmony_ci				node_page_state(pgdat, NR_THROTTLED_WRITTEN));
115662306a36Sopenharmony_ci		}
115762306a36Sopenharmony_ci
115862306a36Sopenharmony_ci		break;
115962306a36Sopenharmony_ci	case VMSCAN_THROTTLE_CONGESTED:
116062306a36Sopenharmony_ci		fallthrough;
116162306a36Sopenharmony_ci	case VMSCAN_THROTTLE_NOPROGRESS:
116262306a36Sopenharmony_ci		if (skip_throttle_noprogress(pgdat)) {
116362306a36Sopenharmony_ci			cond_resched();
116462306a36Sopenharmony_ci			return;
116562306a36Sopenharmony_ci		}
116662306a36Sopenharmony_ci
116762306a36Sopenharmony_ci		timeout = 1;
116862306a36Sopenharmony_ci
116962306a36Sopenharmony_ci		break;
117062306a36Sopenharmony_ci	case VMSCAN_THROTTLE_ISOLATED:
117162306a36Sopenharmony_ci		timeout = HZ/50;
117262306a36Sopenharmony_ci		break;
117362306a36Sopenharmony_ci	default:
117462306a36Sopenharmony_ci		WARN_ON_ONCE(1);
117562306a36Sopenharmony_ci		timeout = HZ;
117662306a36Sopenharmony_ci		break;
117762306a36Sopenharmony_ci	}
117862306a36Sopenharmony_ci
117962306a36Sopenharmony_ci	prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
118062306a36Sopenharmony_ci	ret = schedule_timeout(timeout);
118162306a36Sopenharmony_ci	finish_wait(wqh, &wait);
118262306a36Sopenharmony_ci
118362306a36Sopenharmony_ci	if (reason == VMSCAN_THROTTLE_WRITEBACK)
118462306a36Sopenharmony_ci		atomic_dec(&pgdat->nr_writeback_throttled);
118562306a36Sopenharmony_ci
118662306a36Sopenharmony_ci	trace_mm_vmscan_throttled(pgdat->node_id, jiffies_to_usecs(timeout),
118762306a36Sopenharmony_ci				jiffies_to_usecs(timeout - ret),
118862306a36Sopenharmony_ci				reason);
118962306a36Sopenharmony_ci}
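
/*
 * Note on callers (see the uses elsewhere in this file): reclaimers
 * pass VMSCAN_THROTTLE_ISOLATED when too many folios are isolated by
 * parallel reclaimers, VMSCAN_THROTTLE_WRITEBACK when the tail of the
 * LRU is full of folios still under writeback, and
 * VMSCAN_THROTTLE_CONGESTED/_NOPROGRESS when reclaim keeps failing to
 * make progress.
 */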

/*
 * Account for folios written if tasks are throttled waiting on dirty
 * folios to clean. If enough folios have been cleaned since throttling
 * started then wake up the throttled tasks.
 */
void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio,
							int nr_throttled)
{
	unsigned long nr_written;

	node_stat_add_folio(folio, NR_THROTTLED_WRITTEN);

	/*
	 * This is an inaccurate read as the per-cpu deltas may not
	 * be synchronised. However, given that the system is
	 * writeback throttled, it is not worth taking the penalty
	 * of getting an accurate count. At worst, the throttle
	 * timeout guarantees forward progress.
	 */
	nr_written = node_page_state(pgdat, NR_THROTTLED_WRITTEN) -
		READ_ONCE(pgdat->nr_reclaim_start);

	if (nr_written > SWAP_CLUSTER_MAX * nr_throttled)
		wake_up(&pgdat->reclaim_wait[VMSCAN_THROTTLE_WRITEBACK]);
}
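
/*
 * Example of the wakeup arithmetic above: with four throttled tasks
 * (nr_throttled == 4) and SWAP_CLUSTER_MAX of 32, the throttled tasks
 * are woken once more than 128 folios have completed writeback since
 * throttling began, rather than waiting out the full timeout.
 */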

/* possible outcome of pageout() */
typedef enum {
	/* failed to write folio out, folio is locked */
	PAGE_KEEP,
	/* move folio to the active list, folio is locked */
	PAGE_ACTIVATE,
	/* folio has been sent to the disk successfully, folio is unlocked */
	PAGE_SUCCESS,
	/* folio is clean and locked */
	PAGE_CLEAN,
} pageout_t;

/*
 * pageout is called by shrink_folio_list() for each dirty folio.
 * Calls ->writepage().
 */
static pageout_t pageout(struct folio *folio, struct address_space *mapping,
			 struct swap_iocb **plug)
{
	/*
	 * If the folio is dirty, only perform writeback if that write
	 * will be non-blocking, to prevent this allocation from being
	 * stalled by pagecache activity.  Note that there may still be
	 * stalls if we need to run get_block(); we could test
	 * PagePrivate for that.
	 *
	 * If this process is currently in __generic_file_write_iter() against
	 * this folio's queue, we can perform writeback even if that
	 * will block.
	 *
	 * If the folio is swapcache, write it back even if that would
	 * block, for some throttling. This happens by accident, because
	 * swap_backing_dev_info is bust: it doesn't reflect the
	 * congestion state of the swapdevs.  Easy to fix, if needed.
	 */
	if (!is_page_cache_freeable(folio))
		return PAGE_KEEP;
	if (!mapping) {
		/*
		 * Some data journaling orphaned folios can have
		 * folio->mapping == NULL while being dirty with clean buffers.
		 */
		if (folio_test_private(folio)) {
			if (try_to_free_buffers(folio)) {
				folio_clear_dirty(folio);
				pr_info("%s: orphaned folio\n", __func__);
				return PAGE_CLEAN;
			}
		}
		return PAGE_KEEP;
	}
	if (mapping->a_ops->writepage == NULL)
		return PAGE_ACTIVATE;

	if (folio_clear_dirty_for_io(folio)) {
		int res;
		struct writeback_control wbc = {
			.sync_mode = WB_SYNC_NONE,
			.nr_to_write = SWAP_CLUSTER_MAX,
			.range_start = 0,
			.range_end = LLONG_MAX,
			.for_reclaim = 1,
			.swap_plug = plug,
		};

		folio_set_reclaim(folio);
		res = mapping->a_ops->writepage(&folio->page, &wbc);
		if (res < 0)
			handle_write_error(mapping, folio, res);
		if (res == AOP_WRITEPAGE_ACTIVATE) {
			folio_clear_reclaim(folio);
			return PAGE_ACTIVATE;
		}

		if (!folio_test_writeback(folio)) {
			/* synchronous write or broken a_ops? */
			folio_clear_reclaim(folio);
		}
		trace_mm_vmscan_write_folio(folio);
		node_stat_add_folio(folio, NR_VMSCAN_WRITE);
		return PAGE_SUCCESS;
	}

	return PAGE_CLEAN;
}

/*
 * Same as remove_mapping, but if the folio is removed from the mapping, it
 * gets returned with a refcount of 0.
 */
static int __remove_mapping(struct address_space *mapping, struct folio *folio,
			    bool reclaimed, struct mem_cgroup *target_memcg)
{
	int refcount;
	void *shadow = NULL;

	BUG_ON(!folio_test_locked(folio));
	BUG_ON(mapping != folio_mapping(folio));

	if (!folio_test_swapcache(folio))
		spin_lock(&mapping->host->i_lock);
	xa_lock_irq(&mapping->i_pages);
	/*
	 * The non-racy check for a busy folio.
	 *
	 * Must be careful with the order of the tests. When someone has
	 * a ref to the folio, it may be possible that they dirty it then
	 * drop the reference. So if the dirty flag is tested before the
	 * refcount here, then the following race may occur:
	 *
	 * get_user_pages(&page);
	 * [user mapping goes away]
	 * write_to(page);
	 *				!folio_test_dirty(folio)    [good]
	 * folio_set_dirty(folio);
	 * folio_put(folio);
	 *				!refcount(folio)   [good, discard it]
	 *
	 * [oops, our write_to data is lost]
	 *
	 * Reversing the order of the tests ensures such a situation cannot
	 * escape unnoticed. The smp_rmb is needed to ensure the folio->flags
	 * load is not satisfied before that of folio->_refcount.
	 *
	 * Note that if the dirty flag is always set via folio_mark_dirty,
	 * and thus under the i_pages lock, then this ordering is not required.
	 */
	refcount = 1 + folio_nr_pages(folio);
	if (!folio_ref_freeze(folio, refcount))
		goto cannot_free;
	/* note: atomic_cmpxchg in folio_ref_freeze provides the smp_rmb */
	if (unlikely(folio_test_dirty(folio))) {
		folio_ref_unfreeze(folio, refcount);
		goto cannot_free;
	}

	if (folio_test_swapcache(folio)) {
		swp_entry_t swap = folio->swap;

		if (reclaimed && !mapping_exiting(mapping))
			shadow = workingset_eviction(folio, target_memcg);
		__delete_from_swap_cache(folio, swap, shadow);
		mem_cgroup_swapout(folio, swap);
		xa_unlock_irq(&mapping->i_pages);
		put_swap_folio(folio, swap);
	} else {
		void (*free_folio)(struct folio *);

		free_folio = mapping->a_ops->free_folio;
		/*
		 * Remember a shadow entry for reclaimed file cache in
		 * order to detect refaults, thus thrashing, later on.
		 *
		 * But don't store shadows in an address space that is
		 * already exiting.  This is not just an optimization,
		 * inode reclaim needs to empty out the radix tree or
		 * the nodes are lost.  Don't plant shadows behind its
		 * back.
		 *
		 * We also don't store shadows for DAX mappings because the
		 * only page cache folios found in these are zero pages
		 * covering holes, and because we don't want to mix DAX
		 * exceptional entries and shadow exceptional entries in the
		 * same address_space.
		 */
		if (reclaimed && folio_is_file_lru(folio) &&
		    !mapping_exiting(mapping) && !dax_mapping(mapping))
			shadow = workingset_eviction(folio, target_memcg);
		__filemap_remove_folio(folio, shadow);
		xa_unlock_irq(&mapping->i_pages);
		if (mapping_shrinkable(mapping))
			inode_add_lru(mapping->host);
		spin_unlock(&mapping->host->i_lock);

		if (free_folio)
			free_folio(folio);
	}

	return 1;

cannot_free:
	xa_unlock_irq(&mapping->i_pages);
	if (!folio_test_swapcache(folio))
		spin_unlock(&mapping->host->i_lock);
	return 0;
}
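
/*
 * Freeze-count example for __remove_mapping(): an order-0 folio must
 * have exactly two references (the isolating caller plus the page
 * cache) for folio_ref_freeze(folio, 2) to succeed; an order-2 folio
 * needs five (1 + folio_nr_pages()). Any extra reference means the
 * folio is in use elsewhere, and removal bails out via cannot_free.
 */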

/**
 * remove_mapping() - Attempt to remove a folio from its mapping.
 * @mapping: The address space.
 * @folio: The folio to remove.
 *
 * If the folio is dirty, under writeback or if someone else has a ref
 * on it, removal will fail.
 * Return: The number of pages removed from the mapping.  0 if the folio
 * could not be removed.
 * Context: The caller should have a single refcount on the folio and
 * hold its lock.
 */
long remove_mapping(struct address_space *mapping, struct folio *folio)
{
	if (__remove_mapping(mapping, folio, false, NULL)) {
		/*
		 * Unfreezing the refcount with 1 effectively
		 * drops the pagecache ref for us without requiring another
		 * atomic operation.
		 */
		folio_ref_unfreeze(folio, 1);
		return folio_nr_pages(folio);
	}
	return 0;
}

/**
 * folio_putback_lru - Put previously isolated folio onto appropriate LRU list.
 * @folio: Folio to be returned to an LRU list.
 *
 * Add previously isolated @folio to appropriate LRU list.
 * The folio may still be unevictable for other reasons.
 *
 * Context: lru_lock must not be held, interrupts must be enabled.
 */
void folio_putback_lru(struct folio *folio)
{
	folio_add_lru(folio);
	folio_put(folio);		/* drop ref from isolate */
}

enum folio_references {
	FOLIOREF_RECLAIM,
	FOLIOREF_RECLAIM_CLEAN,
	FOLIOREF_RECLAIM_PURGEABLE,
	FOLIOREF_KEEP,
	FOLIOREF_ACTIVATE,
};

static enum folio_references folio_check_references(struct folio *folio,
						  struct scan_control *sc)
{
	int referenced_ptes, referenced_folio;
	unsigned long vm_flags;

	referenced_ptes = folio_referenced(folio, 1, sc->target_mem_cgroup,
					   &vm_flags);
	referenced_folio = folio_test_clear_referenced(folio);

	/*
	 * The supposedly reclaimable folio was found to be in a VM_LOCKED vma.
	 * Let the folio, now marked Mlocked, be moved to the unevictable list.
	 */
	if (vm_flags & VM_LOCKED)
		return FOLIOREF_ACTIVATE;

	/* rmap lock contention: rotate */
	if (referenced_ptes == -1)
		return FOLIOREF_KEEP;

#ifdef CONFIG_MEM_PURGEABLE
	if (vm_flags & VM_PURGEABLE)
		return FOLIOREF_RECLAIM_PURGEABLE;
#endif

	if (referenced_ptes) {
		/*
		 * All mapped folios start out with page table
		 * references from the instantiating fault, so we need
		 * to look twice if a mapped file/anon folio is used more
		 * than once.
		 *
		 * Mark it and spare it for another trip around the
		 * inactive list.  Another page table reference will
		 * lead to its activation.
		 *
		 * Note: the mark is set for activated folios as well
		 * so that recently deactivated but used folios are
		 * quickly recovered.
		 */
		folio_set_referenced(folio);

		if (referenced_folio || referenced_ptes > 1)
			return FOLIOREF_ACTIVATE;

		/*
		 * Activate file-backed executable folios after first usage.
		 */
		if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio))
			return FOLIOREF_ACTIVATE;

		return FOLIOREF_KEEP;
	}

	/* Reclaim if clean, defer dirty folios to writeback */
	if (referenced_folio && folio_is_file_lru(folio))
		return FOLIOREF_RECLAIM_CLEAN;

	return FOLIOREF_RECLAIM;
}
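
/*
 * Outcome summary for folio_check_references(): VM_LOCKED folios are
 * activated so they move to the unevictable list; rmap lock contention
 * keeps (rotates) the folio; VM_PURGEABLE folios are reclaimed
 * immediately; folios with referenced PTEs are kept once and activated
 * on repeated use (or on first use for executable file folios); clean
 * referenced file folios become FOLIOREF_RECLAIM_CLEAN; everything
 * else is reclaimed.
 */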

/* Check if a folio is dirty or under writeback */
static void folio_check_dirty_writeback(struct folio *folio,
				       bool *dirty, bool *writeback)
{
	struct address_space *mapping;

	/*
	 * Anonymous folios are not handled by flushers and must be written
	 * from reclaim context. Do not stall reclaim based on them.
	 * MADV_FREE anonymous folios are put on the inactive file list
	 * too, so they could be mistaken for file LRU folios; a further
	 * anon test is needed.
	 */
	if (!folio_is_file_lru(folio) ||
	    (folio_test_anon(folio) && !folio_test_swapbacked(folio))) {
		*dirty = false;
		*writeback = false;
		return;
	}

	/* By default assume that the folio flags are accurate */
	*dirty = folio_test_dirty(folio);
	*writeback = folio_test_writeback(folio);

	/* Verify dirty/writeback state if the filesystem supports it */
	if (!folio_test_private(folio))
		return;

	mapping = folio_mapping(folio);
	if (mapping && mapping->a_ops->is_dirty_writeback)
		mapping->a_ops->is_dirty_writeback(folio, dirty, writeback);
}

static struct folio *alloc_demote_folio(struct folio *src,
		unsigned long private)
{
	struct folio *dst;
	nodemask_t *allowed_mask;
	struct migration_target_control *mtc;

	mtc = (struct migration_target_control *)private;

	allowed_mask = mtc->nmask;
	/*
	 * Make sure we allocate from the target node first, also trying to
	 * demote or reclaim pages from the target node via kswapd if we are
	 * low on free memory on the target node. If we don't do this, and
	 * there is free memory on a slower (lower) memory tier, we would
	 * start allocating pages from the slower tier without even forcing
	 * a demotion of cold pages from the target tier. This can result
	 * in the kernel placing hot pages on slower (lower) memory tiers.
	 */
	mtc->nmask = NULL;
	mtc->gfp_mask |= __GFP_THISNODE;
	dst = alloc_migration_target(src, (unsigned long)mtc);
	if (dst)
		return dst;

	mtc->gfp_mask &= ~__GFP_THISNODE;
	mtc->nmask = allowed_mask;

	return alloc_migration_target(src, (unsigned long)mtc);
}
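
/*
 * alloc_demote_folio() is the allocation callback handed to
 * migrate_pages() by demote_folio_list() below: the first attempt is
 * pinned to the target node with __GFP_THISNODE so kswapd there can
 * react to the pressure; only if that fails does it fall back to the
 * wider allowed_mask.
 */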

/*
 * Take folios on @demote_folios and attempt to demote them to another node.
 * Folios which are not demoted are left on @demote_folios.
 */
static unsigned int demote_folio_list(struct list_head *demote_folios,
				     struct pglist_data *pgdat)
{
	int target_nid = next_demotion_node(pgdat->node_id);
	unsigned int nr_succeeded;
	nodemask_t allowed_mask;

	struct migration_target_control mtc = {
		/*
		 * Allocate from the target node, or fail quickly and
		 * quietly. When this happens, the folio will likely just
		 * be discarded instead of migrated.
		 */
		.gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | __GFP_NOWARN |
			__GFP_NOMEMALLOC | GFP_NOWAIT,
		.nid = target_nid,
		.nmask = &allowed_mask
	};

	if (list_empty(demote_folios))
		return 0;

	if (target_nid == NUMA_NO_NODE)
		return 0;

	node_get_allowed_targets(pgdat, &allowed_mask);

	/* Demotion ignores all cpuset and mempolicy settings */
	migrate_pages(demote_folios, alloc_demote_folio, NULL,
		      (unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION,
		      &nr_succeeded);

	__count_vm_events(PGDEMOTE_KSWAPD + reclaimer_offset(), nr_succeeded);

	return nr_succeeded;
}

static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask)
{
	if (gfp_mask & __GFP_FS)
		return true;
	if (!folio_test_swapcache(folio) || !(gfp_mask & __GFP_IO))
		return false;
	/*
	 * We can "enter_fs" for swap-cache with only __GFP_IO
	 * providing this isn't SWP_FS_OPS.
	 * ->flags can be updated non-atomically (scan_swap_map_slots),
	 * but that will never affect SWP_FS_OPS, so the data_race
	 * is safe.
	 */
	return !data_race(folio_swap_flags(folio) & SWP_FS_OPS);
}
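
/*
 * Example: a GFP_NOIO caller (neither __GFP_IO nor __GFP_FS set) can
 * never enter the filesystem here, even for swapcache folios. A
 * GFP_NOFS caller may still write swapcache folios, provided the swap
 * device is not SWP_FS_OPS (i.e. swap is not going through a
 * filesystem).
 */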
163662306a36Sopenharmony_ci
163762306a36Sopenharmony_ci/*
163862306a36Sopenharmony_ci * shrink_folio_list() returns the number of reclaimed pages
163962306a36Sopenharmony_ci */
164062306a36Sopenharmony_ciunsigned int shrink_folio_list(struct list_head *folio_list,
164162306a36Sopenharmony_ci		struct pglist_data *pgdat, struct scan_control *sc,
164262306a36Sopenharmony_ci		struct reclaim_stat *stat, bool ignore_references)
164362306a36Sopenharmony_ci{
164462306a36Sopenharmony_ci	LIST_HEAD(ret_folios);
164562306a36Sopenharmony_ci	LIST_HEAD(free_folios);
164662306a36Sopenharmony_ci	LIST_HEAD(demote_folios);
164762306a36Sopenharmony_ci	unsigned int nr_reclaimed = 0;
164862306a36Sopenharmony_ci	unsigned int pgactivate = 0;
164962306a36Sopenharmony_ci	bool do_demote_pass;
165062306a36Sopenharmony_ci	struct swap_iocb *plug = NULL;
165162306a36Sopenharmony_ci
165262306a36Sopenharmony_ci	memset(stat, 0, sizeof(*stat));
165362306a36Sopenharmony_ci	cond_resched();
165462306a36Sopenharmony_ci	do_demote_pass = can_demote(pgdat->node_id, sc);
165562306a36Sopenharmony_ci
165662306a36Sopenharmony_ciretry:
165762306a36Sopenharmony_ci	while (!list_empty(folio_list)) {
165862306a36Sopenharmony_ci		struct address_space *mapping;
165962306a36Sopenharmony_ci		struct folio *folio;
166062306a36Sopenharmony_ci		enum folio_references references = FOLIOREF_RECLAIM;
166162306a36Sopenharmony_ci		bool dirty, writeback;
166262306a36Sopenharmony_ci		unsigned int nr_pages;
166362306a36Sopenharmony_ci
166462306a36Sopenharmony_ci		cond_resched();
166562306a36Sopenharmony_ci
166662306a36Sopenharmony_ci		folio = lru_to_folio(folio_list);
166762306a36Sopenharmony_ci		list_del(&folio->lru);
166862306a36Sopenharmony_ci
166962306a36Sopenharmony_ci		if (!folio_trylock(folio))
167062306a36Sopenharmony_ci			goto keep;
167162306a36Sopenharmony_ci
167262306a36Sopenharmony_ci		VM_BUG_ON_FOLIO(folio_test_active(folio), folio);
167362306a36Sopenharmony_ci
167462306a36Sopenharmony_ci		nr_pages = folio_nr_pages(folio);
167562306a36Sopenharmony_ci
167662306a36Sopenharmony_ci		/* Account the number of base pages */
167762306a36Sopenharmony_ci		sc->nr_scanned += nr_pages;
167862306a36Sopenharmony_ci
167962306a36Sopenharmony_ci		if (unlikely(!folio_evictable(folio)))
168062306a36Sopenharmony_ci			goto activate_locked;
168162306a36Sopenharmony_ci
168262306a36Sopenharmony_ci		if (!sc->may_unmap && folio_mapped(folio))
168362306a36Sopenharmony_ci			goto keep_locked;
168462306a36Sopenharmony_ci
168562306a36Sopenharmony_ci		/* folio_update_gen() tried to promote this page? */
168662306a36Sopenharmony_ci		if (lru_gen_enabled() && !ignore_references &&
168762306a36Sopenharmony_ci		    folio_mapped(folio) && folio_test_referenced(folio))
168862306a36Sopenharmony_ci			goto keep_locked;
168962306a36Sopenharmony_ci
169062306a36Sopenharmony_ci		/*
169162306a36Sopenharmony_ci		 * The number of dirty pages determines if a node is marked
169262306a36Sopenharmony_ci		 * reclaim_congested. kswapd will stall and start writing
169362306a36Sopenharmony_ci		 * folios if the tail of the LRU is all dirty unqueued folios.
169462306a36Sopenharmony_ci		 */
169562306a36Sopenharmony_ci		folio_check_dirty_writeback(folio, &dirty, &writeback);
169662306a36Sopenharmony_ci		if (dirty || writeback)
169762306a36Sopenharmony_ci			stat->nr_dirty += nr_pages;
169862306a36Sopenharmony_ci
169962306a36Sopenharmony_ci		if (dirty && !writeback)
170062306a36Sopenharmony_ci			stat->nr_unqueued_dirty += nr_pages;
170162306a36Sopenharmony_ci
170262306a36Sopenharmony_ci		/*
170362306a36Sopenharmony_ci		 * Treat this folio as congested if folios are cycling
170462306a36Sopenharmony_ci		 * through the LRU so quickly that the folios marked
170562306a36Sopenharmony_ci		 * for immediate reclaim are making it to the end of
170662306a36Sopenharmony_ci		 * the LRU a second time.
170762306a36Sopenharmony_ci		 */
170862306a36Sopenharmony_ci		if (writeback && folio_test_reclaim(folio))
170962306a36Sopenharmony_ci			stat->nr_congested += nr_pages;
171062306a36Sopenharmony_ci
171162306a36Sopenharmony_ci		/*
171262306a36Sopenharmony_ci		 * If a folio at the tail of the LRU is under writeback, there
171362306a36Sopenharmony_ci		 * are three cases to consider.
171462306a36Sopenharmony_ci		 *
171562306a36Sopenharmony_ci		 * 1) If reclaim is encountering an excessive number
171662306a36Sopenharmony_ci		 *    of folios under writeback and this folio has both
171762306a36Sopenharmony_ci		 *    the writeback and reclaim flags set, then it
171862306a36Sopenharmony_ci		 *    indicates that folios are being queued for I/O but
171962306a36Sopenharmony_ci		 *    are being recycled through the LRU before the I/O
172062306a36Sopenharmony_ci		 *    can complete. Waiting on the folio itself risks an
172162306a36Sopenharmony_ci		 *    indefinite stall if it is impossible to writeback
172262306a36Sopenharmony_ci		 *    the folio due to I/O error or disconnected storage
172362306a36Sopenharmony_ci		 *    so instead note that the LRU is being scanned too
172462306a36Sopenharmony_ci		 *    quickly and the caller can stall after the folio
172562306a36Sopenharmony_ci		 *    list has been processed.
172662306a36Sopenharmony_ci		 *
172762306a36Sopenharmony_ci		 * 2) Global or new memcg reclaim encounters a folio that is
172862306a36Sopenharmony_ci		 *    not marked for immediate reclaim, or the caller does not
172962306a36Sopenharmony_ci		 *    have __GFP_FS (or __GFP_IO if it's simply going to swap,
173062306a36Sopenharmony_ci		 *    not to fs). In this case mark the folio for immediate
173162306a36Sopenharmony_ci		 *    reclaim and continue scanning.
173262306a36Sopenharmony_ci		 *
173362306a36Sopenharmony_ci		 *    Require may_enter_fs() because we would wait on fs, which
173462306a36Sopenharmony_ci		 *    may not have submitted I/O yet. And the loop driver might
173562306a36Sopenharmony_ci		 *    enter reclaim, and deadlock if it waits on a folio for
173662306a36Sopenharmony_ci		 *    which it is needed to do the write (loop masks off
173762306a36Sopenharmony_ci		 *    __GFP_IO|__GFP_FS for this reason); but more thought
173862306a36Sopenharmony_ci		 *    would probably show more reasons.
173962306a36Sopenharmony_ci		 *
174062306a36Sopenharmony_ci		 * 3) Legacy memcg encounters a folio that already has the
174162306a36Sopenharmony_ci		 *    reclaim flag set. memcg does not have any dirty folio
174262306a36Sopenharmony_ci		 *    throttling so we could easily OOM just because too many
174362306a36Sopenharmony_ci		 *    folios are in writeback and there is nothing else to
174462306a36Sopenharmony_ci		 *    reclaim. Wait for the writeback to complete.
174562306a36Sopenharmony_ci		 *
174662306a36Sopenharmony_ci		 * In cases 1) and 2) we activate the folios to get them out of
174762306a36Sopenharmony_ci		 * the way while we continue scanning for clean folios on the
174862306a36Sopenharmony_ci		 * inactive list and refilling from the active list. The
174962306a36Sopenharmony_ci		 * observation here is that waiting for disk writes is more
175062306a36Sopenharmony_ci		 * expensive than potentially causing reloads down the line.
175162306a36Sopenharmony_ci		 * Since they're marked for immediate reclaim, they won't put
175262306a36Sopenharmony_ci		 * memory pressure on the cache working set any longer than it
175362306a36Sopenharmony_ci		 * takes to write them to disk.
175462306a36Sopenharmony_ci		 */
175562306a36Sopenharmony_ci		if (folio_test_writeback(folio)) {
175662306a36Sopenharmony_ci			/* Case 1 above */
175762306a36Sopenharmony_ci			if (current_is_kswapd() &&
175862306a36Sopenharmony_ci			    folio_test_reclaim(folio) &&
175962306a36Sopenharmony_ci			    test_bit(PGDAT_WRITEBACK, &pgdat->flags)) {
176062306a36Sopenharmony_ci				stat->nr_immediate += nr_pages;
176162306a36Sopenharmony_ci				goto activate_locked;
176262306a36Sopenharmony_ci
176362306a36Sopenharmony_ci			/* Case 2 above */
176462306a36Sopenharmony_ci			} else if (writeback_throttling_sane(sc) ||
176562306a36Sopenharmony_ci			    !folio_test_reclaim(folio) ||
176662306a36Sopenharmony_ci			    !may_enter_fs(folio, sc->gfp_mask)) {
176762306a36Sopenharmony_ci				/*
176862306a36Sopenharmony_ci				 * This is slightly racy -
176962306a36Sopenharmony_ci				 * folio_end_writeback() might have
177062306a36Sopenharmony_ci				 * just cleared the reclaim flag, then
177162306a36Sopenharmony_ci				 * setting the reclaim flag here ends up
177262306a36Sopenharmony_ci				 * interpreted as the readahead flag - but
177362306a36Sopenharmony_ci				 * that does not matter enough to care.
177462306a36Sopenharmony_ci				 * What we do want is for this folio to
177562306a36Sopenharmony_ci				 * have the reclaim flag set next time
177662306a36Sopenharmony_ci				 * memcg reclaim reaches the tests above,
177762306a36Sopenharmony_ci				 * so it will then wait for writeback to
177862306a36Sopenharmony_ci				 * avoid OOM; and it's also appropriate
177962306a36Sopenharmony_ci				 * in global reclaim.
178062306a36Sopenharmony_ci				 */
178162306a36Sopenharmony_ci				folio_set_reclaim(folio);
178262306a36Sopenharmony_ci				stat->nr_writeback += nr_pages;
178362306a36Sopenharmony_ci				goto activate_locked;
178462306a36Sopenharmony_ci
178562306a36Sopenharmony_ci			/* Case 3 above */
178662306a36Sopenharmony_ci			} else {
178762306a36Sopenharmony_ci				folio_unlock(folio);
178862306a36Sopenharmony_ci				folio_wait_writeback(folio);
178962306a36Sopenharmony_ci				/* then go back and try same folio again */
179062306a36Sopenharmony_ci				list_add_tail(&folio->lru, folio_list);
179162306a36Sopenharmony_ci				continue;
179262306a36Sopenharmony_ci			}
179362306a36Sopenharmony_ci		}
179462306a36Sopenharmony_ci
179562306a36Sopenharmony_ci		if (!ignore_references)
179662306a36Sopenharmony_ci			references = folio_check_references(folio, sc);
179762306a36Sopenharmony_ci
179862306a36Sopenharmony_ci		switch (references) {
179962306a36Sopenharmony_ci		case FOLIOREF_ACTIVATE:
180062306a36Sopenharmony_ci			goto activate_locked;
180162306a36Sopenharmony_ci		case FOLIOREF_KEEP:
180262306a36Sopenharmony_ci			stat->nr_ref_keep += nr_pages;
180362306a36Sopenharmony_ci			goto keep_locked;
180462306a36Sopenharmony_ci		case FOLIOREF_RECLAIM:
180562306a36Sopenharmony_ci		case FOLIOREF_RECLAIM_CLEAN:
180662306a36Sopenharmony_ci		case FOLIOREF_RECLAIM_PURGEABLE:
180762306a36Sopenharmony_ci			; /* try to reclaim the folio below */
180862306a36Sopenharmony_ci		}
180962306a36Sopenharmony_ci
181062306a36Sopenharmony_ci		/*
181162306a36Sopenharmony_ci		 * Before reclaiming the folio, try to relocate
181262306a36Sopenharmony_ci		 * its contents to another node.
181362306a36Sopenharmony_ci		 */
181462306a36Sopenharmony_ci		if (do_demote_pass &&
181562306a36Sopenharmony_ci		    (thp_migration_supported() || !folio_test_large(folio))) {
181662306a36Sopenharmony_ci			list_add(&folio->lru, &demote_folios);
181762306a36Sopenharmony_ci			folio_unlock(folio);
181862306a36Sopenharmony_ci			continue;
181962306a36Sopenharmony_ci		}
182062306a36Sopenharmony_ci
182162306a36Sopenharmony_ci		/*
182262306a36Sopenharmony_ci		 * Anonymous process memory has backing store?
182362306a36Sopenharmony_ci		 * Try to allocate it some swap space here.
182462306a36Sopenharmony_ci		 * Lazyfree folio could be freed directly
182562306a36Sopenharmony_ci		 */
182662306a36Sopenharmony_ci		if (folio_test_anon(folio) && folio_test_swapbacked(folio)) {
182762306a36Sopenharmony_ci			if (!folio_test_swapcache(folio) && references != FOLIOREF_RECLAIM_PURGEABLE) {
182862306a36Sopenharmony_ci				if (!(sc->gfp_mask & __GFP_IO))
182962306a36Sopenharmony_ci					goto keep_locked;
183062306a36Sopenharmony_ci				if (folio_maybe_dma_pinned(folio))
183162306a36Sopenharmony_ci					goto keep_locked;
183262306a36Sopenharmony_ci				if (folio_test_large(folio)) {
183362306a36Sopenharmony_ci					/* cannot split folio, skip it */
183462306a36Sopenharmony_ci					if (!can_split_folio(folio, NULL))
183562306a36Sopenharmony_ci						goto activate_locked;
183662306a36Sopenharmony_ci					/*
183762306a36Sopenharmony_ci					 * Split folios without a PMD map right
183862306a36Sopenharmony_ci					 * away. Chances are some or all of the
183962306a36Sopenharmony_ci					 * tail pages can be freed without IO.
184062306a36Sopenharmony_ci					 */
184162306a36Sopenharmony_ci					if (!folio_entire_mapcount(folio) &&
184262306a36Sopenharmony_ci					    split_folio_to_list(folio,
184362306a36Sopenharmony_ci								folio_list))
184462306a36Sopenharmony_ci						goto activate_locked;
184562306a36Sopenharmony_ci				}
184662306a36Sopenharmony_ci				if (!add_to_swap(folio)) {
184762306a36Sopenharmony_ci					if (!folio_test_large(folio))
184862306a36Sopenharmony_ci						goto activate_locked_split;
184962306a36Sopenharmony_ci					/* Fallback to swap normal pages */
185062306a36Sopenharmony_ci					if (split_folio_to_list(folio,
185162306a36Sopenharmony_ci								folio_list))
185262306a36Sopenharmony_ci						goto activate_locked;
185362306a36Sopenharmony_ci#ifdef CONFIG_TRANSPARENT_HUGEPAGE
185462306a36Sopenharmony_ci					count_vm_event(THP_SWPOUT_FALLBACK);
185562306a36Sopenharmony_ci#endif
185662306a36Sopenharmony_ci					if (!add_to_swap(folio))
185762306a36Sopenharmony_ci						goto activate_locked_split;
185862306a36Sopenharmony_ci				}
185962306a36Sopenharmony_ci			}
186062306a36Sopenharmony_ci		} else if (folio_test_swapbacked(folio) &&
186162306a36Sopenharmony_ci			   folio_test_large(folio)) {
186262306a36Sopenharmony_ci			/* Split shmem folio */
186362306a36Sopenharmony_ci			if (split_folio_to_list(folio, folio_list))
186462306a36Sopenharmony_ci				goto keep_locked;
186562306a36Sopenharmony_ci		}
186662306a36Sopenharmony_ci
186762306a36Sopenharmony_ci		/*
186862306a36Sopenharmony_ci		 * If the folio was split above, the tail pages will make
186962306a36Sopenharmony_ci		 * their own pass through this function and be accounted
187062306a36Sopenharmony_ci		 * then.
187162306a36Sopenharmony_ci		 */
187262306a36Sopenharmony_ci		if ((nr_pages > 1) && !folio_test_large(folio)) {
187362306a36Sopenharmony_ci			sc->nr_scanned -= (nr_pages - 1);
187462306a36Sopenharmony_ci			nr_pages = 1;
187562306a36Sopenharmony_ci		}
187662306a36Sopenharmony_ci
187762306a36Sopenharmony_ci		/*
187862306a36Sopenharmony_ci		 * The folio is mapped into the page tables of one or more
187962306a36Sopenharmony_ci		 * processes. Try to unmap it here.
188062306a36Sopenharmony_ci		 */
188162306a36Sopenharmony_ci		if (folio_mapped(folio)) {
188262306a36Sopenharmony_ci			enum ttu_flags flags = TTU_BATCH_FLUSH;
188362306a36Sopenharmony_ci			bool was_swapbacked = folio_test_swapbacked(folio);
188462306a36Sopenharmony_ci
188562306a36Sopenharmony_ci			if (folio_test_pmd_mappable(folio))
188662306a36Sopenharmony_ci				flags |= TTU_SPLIT_HUGE_PMD;
188762306a36Sopenharmony_ci
188862306a36Sopenharmony_ci			try_to_unmap(folio, flags);
188962306a36Sopenharmony_ci			if (folio_mapped(folio)) {
189062306a36Sopenharmony_ci				stat->nr_unmap_fail += nr_pages;
189162306a36Sopenharmony_ci				if (!was_swapbacked &&
189262306a36Sopenharmony_ci				    folio_test_swapbacked(folio))
189362306a36Sopenharmony_ci					stat->nr_lazyfree_fail += nr_pages;
189462306a36Sopenharmony_ci				goto activate_locked;
189562306a36Sopenharmony_ci			}
189662306a36Sopenharmony_ci		}
189762306a36Sopenharmony_ci
189862306a36Sopenharmony_ci		/*
189962306a36Sopenharmony_ci		 * Folio is unmapped now so it cannot be newly pinned anymore.
190062306a36Sopenharmony_ci		 * No point in trying to reclaim folio if it is pinned.
190162306a36Sopenharmony_ci		 * Furthermore we don't want to reclaim underlying fs metadata
190262306a36Sopenharmony_ci		 * if the folio is pinned and thus potentially modified by the
190362306a36Sopenharmony_ci		 * pinning process as that may upset the filesystem.
190462306a36Sopenharmony_ci		 */
190562306a36Sopenharmony_ci		if (folio_maybe_dma_pinned(folio))
190662306a36Sopenharmony_ci			goto activate_locked;
190762306a36Sopenharmony_ci
190862306a36Sopenharmony_ci		mapping = folio_mapping(folio);
190962306a36Sopenharmony_ci		if (folio_test_dirty(folio) && references != FOLIOREF_RECLAIM_PURGEABLE) {
191062306a36Sopenharmony_ci			/*
191162306a36Sopenharmony_ci			 * Only kswapd can writeback filesystem folios
191262306a36Sopenharmony_ci			 * to avoid risk of stack overflow. But avoid
191362306a36Sopenharmony_ci			 * injecting inefficient single-folio I/O into
191462306a36Sopenharmony_ci			 * flusher writeback as much as possible: only
191562306a36Sopenharmony_ci			 * write folios when we've encountered many
191662306a36Sopenharmony_ci			 * dirty folios, and when we've already scanned
191762306a36Sopenharmony_ci			 * the rest of the LRU for clean folios and see
191862306a36Sopenharmony_ci			 * the same dirty folios again (with the reclaim
191962306a36Sopenharmony_ci			 * flag set).
192062306a36Sopenharmony_ci			 */
192162306a36Sopenharmony_ci			if (folio_is_file_lru(folio) &&
192262306a36Sopenharmony_ci			    (!current_is_kswapd() ||
192362306a36Sopenharmony_ci			     !folio_test_reclaim(folio) ||
192462306a36Sopenharmony_ci			     !test_bit(PGDAT_DIRTY, &pgdat->flags))) {
192562306a36Sopenharmony_ci				/*
192662306a36Sopenharmony_ci				 * Immediately reclaim when written back.
192762306a36Sopenharmony_ci				 * Similar in principle to folio_deactivate()
192862306a36Sopenharmony_ci				 * except we already have the folio isolated
192962306a36Sopenharmony_ci				 * and know it's dirty
193062306a36Sopenharmony_ci				 */
193162306a36Sopenharmony_ci				node_stat_mod_folio(folio, NR_VMSCAN_IMMEDIATE,
193262306a36Sopenharmony_ci						nr_pages);
193362306a36Sopenharmony_ci				folio_set_reclaim(folio);
193462306a36Sopenharmony_ci
193562306a36Sopenharmony_ci				goto activate_locked;
193662306a36Sopenharmony_ci			}
193762306a36Sopenharmony_ci
193862306a36Sopenharmony_ci			if (references == FOLIOREF_RECLAIM_CLEAN)
193962306a36Sopenharmony_ci				goto keep_locked;
194062306a36Sopenharmony_ci			if (!may_enter_fs(folio, sc->gfp_mask))
194162306a36Sopenharmony_ci				goto keep_locked;
194262306a36Sopenharmony_ci			if (!sc->may_writepage)
194362306a36Sopenharmony_ci				goto keep_locked;
194462306a36Sopenharmony_ci
194562306a36Sopenharmony_ci			/*
194662306a36Sopenharmony_ci			 * Folio is dirty. Flush the TLB if a writable entry
194762306a36Sopenharmony_ci			 * potentially exists to avoid CPU writes after I/O
194862306a36Sopenharmony_ci			 * starts and then write it out here.
194962306a36Sopenharmony_ci			 */
195062306a36Sopenharmony_ci			try_to_unmap_flush_dirty();
195162306a36Sopenharmony_ci			switch (pageout(folio, mapping, &plug)) {
195262306a36Sopenharmony_ci			case PAGE_KEEP:
195362306a36Sopenharmony_ci				goto keep_locked;
195462306a36Sopenharmony_ci			case PAGE_ACTIVATE:
195562306a36Sopenharmony_ci				goto activate_locked;
195662306a36Sopenharmony_ci			case PAGE_SUCCESS:
195762306a36Sopenharmony_ci				stat->nr_pageout += nr_pages;
195862306a36Sopenharmony_ci
195962306a36Sopenharmony_ci				if (folio_test_writeback(folio))
196062306a36Sopenharmony_ci					goto keep;
196162306a36Sopenharmony_ci				if (folio_test_dirty(folio))
196262306a36Sopenharmony_ci					goto keep;
196362306a36Sopenharmony_ci
196462306a36Sopenharmony_ci				/*
196562306a36Sopenharmony_ci				 * A synchronous write - probably a ramdisk.  Go
196662306a36Sopenharmony_ci				 * ahead and try to reclaim the folio.
196762306a36Sopenharmony_ci				 */
196862306a36Sopenharmony_ci				if (!folio_trylock(folio))
196962306a36Sopenharmony_ci					goto keep;
197062306a36Sopenharmony_ci				if (folio_test_dirty(folio) ||
197162306a36Sopenharmony_ci				    folio_test_writeback(folio))
197262306a36Sopenharmony_ci					goto keep_locked;
197362306a36Sopenharmony_ci				mapping = folio_mapping(folio);
197462306a36Sopenharmony_ci				fallthrough;
197562306a36Sopenharmony_ci			case PAGE_CLEAN:
197662306a36Sopenharmony_ci				; /* try to free the folio below */
197762306a36Sopenharmony_ci			}
197862306a36Sopenharmony_ci		}
197962306a36Sopenharmony_ci
198062306a36Sopenharmony_ci		/*
198162306a36Sopenharmony_ci		 * If the folio has buffers, try to free the buffer
198262306a36Sopenharmony_ci		 * mappings associated with this folio. If we succeed
198362306a36Sopenharmony_ci		 * we try to free the folio as well.
198462306a36Sopenharmony_ci		 *
198562306a36Sopenharmony_ci		 * We do this even if the folio is dirty.
198662306a36Sopenharmony_ci		 * filemap_release_folio() does not perform I/O, but it
198762306a36Sopenharmony_ci		 * is possible for a folio to have the dirty flag set,
198862306a36Sopenharmony_ci		 * but it is actually clean (all its buffers are clean).
198962306a36Sopenharmony_ci		 * This happens if the buffers were written out directly,
199062306a36Sopenharmony_ci		 * with submit_bh(). ext3 will do this, as well as
199162306a36Sopenharmony_ci		 * the blockdev mapping.  filemap_release_folio() will
199262306a36Sopenharmony_ci		 * discover that cleanness and will drop the buffers
199362306a36Sopenharmony_ci		 * and mark the folio clean - it can be freed.
199462306a36Sopenharmony_ci		 *
199562306a36Sopenharmony_ci		 * Rarely, folios can have buffers and no ->mapping.
199662306a36Sopenharmony_ci		 * These are the folios which were not successfully
199762306a36Sopenharmony_ci		 * invalidated in truncate_cleanup_folio().  We try to
199862306a36Sopenharmony_ci		 * drop those buffers here and if that worked, and the
199962306a36Sopenharmony_ci		 * folio is no longer mapped into process address space
200062306a36Sopenharmony_ci		 * (refcount == 1) it can be freed.  Otherwise, leave
200162306a36Sopenharmony_ci		 * the folio on the LRU so it is swappable.
200262306a36Sopenharmony_ci		 */
200362306a36Sopenharmony_ci		if (folio_needs_release(folio)) {
200462306a36Sopenharmony_ci			if (!filemap_release_folio(folio, sc->gfp_mask))
200562306a36Sopenharmony_ci				goto activate_locked;
200662306a36Sopenharmony_ci			if (!mapping && folio_ref_count(folio) == 1) {
200762306a36Sopenharmony_ci				folio_unlock(folio);
200862306a36Sopenharmony_ci				if (folio_put_testzero(folio))
200962306a36Sopenharmony_ci					goto free_it;
201062306a36Sopenharmony_ci				else {
201162306a36Sopenharmony_ci					/*
201262306a36Sopenharmony_ci					 * rare race with speculative reference.
201362306a36Sopenharmony_ci					 * the speculative reference will free
201462306a36Sopenharmony_ci					 * this folio shortly, so we may
201562306a36Sopenharmony_ci					 * increment nr_reclaimed here (and
201662306a36Sopenharmony_ci					 * leave it off the LRU).
201762306a36Sopenharmony_ci					 */
201862306a36Sopenharmony_ci					nr_reclaimed += nr_pages;
201962306a36Sopenharmony_ci					continue;
202062306a36Sopenharmony_ci				}
202162306a36Sopenharmony_ci			}
202262306a36Sopenharmony_ci		}
202362306a36Sopenharmony_ci
202462306a36Sopenharmony_ci		if (folio_test_anon(folio) && (!folio_test_swapbacked(folio) || references == FOLIOREF_RECLAIM_PURGEABLE)) {
202562306a36Sopenharmony_ci			/* follow __remove_mapping for reference */
202662306a36Sopenharmony_ci			if (!folio_ref_freeze(folio, 1))
202762306a36Sopenharmony_ci				goto keep_locked;
202862306a36Sopenharmony_ci
202962306a36Sopenharmony_ci			/*
203062306a36Sopenharmony_ci			 * The folio has only one reference left, which is
203162306a36Sopenharmony_ci			 * from the isolation. After the caller puts the
203262306a36Sopenharmony_ci			 * folio back on the lru and drops the reference, the
203362306a36Sopenharmony_ci			 * folio will be freed anyway. It doesn't matter
203462306a36Sopenharmony_ci			 * which lru it goes on. So we don't bother checking
203562306a36Sopenharmony_ci			 * the dirty flag here.
203662306a36Sopenharmony_ci			 */
203762306a36Sopenharmony_ci			count_vm_events(PGLAZYFREED, nr_pages);
203862306a36Sopenharmony_ci			count_memcg_folio_events(folio, PGLAZYFREED, nr_pages);
203962306a36Sopenharmony_ci		} else if (!mapping || !__remove_mapping(mapping, folio, true,
204062306a36Sopenharmony_ci							 sc->target_mem_cgroup))
204162306a36Sopenharmony_ci			goto keep_locked;
204262306a36Sopenharmony_ci
204362306a36Sopenharmony_ci		folio_unlock(folio);
204462306a36Sopenharmony_cifree_it:
204562306a36Sopenharmony_ci		/*
204662306a36Sopenharmony_ci		 * Folio may get swapped out as a whole, need to account
204762306a36Sopenharmony_ci		 * all pages in it.
204862306a36Sopenharmony_ci		 */
204962306a36Sopenharmony_ci		nr_reclaimed += nr_pages;
205062306a36Sopenharmony_ci
205162306a36Sopenharmony_ci		/*
205262306a36Sopenharmony_ci		 * Is there need to periodically free_folio_list? It would
205362306a36Sopenharmony_ci		 * appear not as the counts should be low
205462306a36Sopenharmony_ci		 */
205562306a36Sopenharmony_ci		if (unlikely(folio_test_large(folio)))
205662306a36Sopenharmony_ci			destroy_large_folio(folio);
205762306a36Sopenharmony_ci		else
205862306a36Sopenharmony_ci			list_add(&folio->lru, &free_folios);
205962306a36Sopenharmony_ci		continue;
206062306a36Sopenharmony_ci
206162306a36Sopenharmony_ciactivate_locked_split:
206262306a36Sopenharmony_ci		/*
206362306a36Sopenharmony_ci		 * The tail pages that are failed to add into swap cache
206462306a36Sopenharmony_ci		 * reach here.  Fixup nr_scanned and nr_pages.
206562306a36Sopenharmony_ci		 */
		if (nr_pages > 1) {
			sc->nr_scanned -= (nr_pages - 1);
			nr_pages = 1;
		}
activate_locked:
		/* Not a candidate for swapping, so reclaim swap space. */
		if (folio_test_swapcache(folio) &&
		    (mem_cgroup_swap_full(folio) || folio_test_mlocked(folio)))
			folio_free_swap(folio);
		VM_BUG_ON_FOLIO(folio_test_active(folio), folio);
		if (!folio_test_mlocked(folio)) {
			int type = folio_is_file_lru(folio);
			folio_set_active(folio);
			stat->nr_activate[type] += nr_pages;
			count_memcg_folio_events(folio, PGACTIVATE, nr_pages);
		}
keep_locked:
		folio_unlock(folio);
keep:
		list_add(&folio->lru, &ret_folios);
		VM_BUG_ON_FOLIO(folio_test_lru(folio) ||
				folio_test_unevictable(folio), folio);
	}
	/* 'folio_list' is always empty here */

	/* Migrate folios selected for demotion */
	nr_reclaimed += demote_folio_list(&demote_folios, pgdat);
	/* Folios that could not be demoted are still in @demote_folios */
	if (!list_empty(&demote_folios)) {
		/* Folios which weren't demoted go back on @folio_list */
		list_splice_init(&demote_folios, folio_list);

		/*
		 * goto retry to reclaim the undemoted folios in folio_list if
		 * desired.
		 *
		 * Reclaiming directly from top tier nodes is not often desired
		 * due to it breaking the LRU ordering: in general memory
		 * should be reclaimed from lower tier nodes and demoted from
		 * top tier nodes.
		 *
		 * However, disabling reclaim from top tier nodes entirely
		 * would cause ooms in edge scenarios where lower tier memory
		 * is unreclaimable for whatever reason, e.g. memory being
		 * mlocked or too hot to reclaim. We can disable reclaim
		 * from top tier nodes in proactive reclaim though, as that is
		 * not real memory pressure.
		 */
		if (!sc->proactive) {
			do_demote_pass = false;
			goto retry;
		}
	}

	pgactivate = stat->nr_activate[0] + stat->nr_activate[1];

	mem_cgroup_uncharge_list(&free_folios);
	try_to_unmap_flush();
	free_unref_page_list(&free_folios);

	list_splice(&ret_folios, folio_list);
	count_vm_events(PGACTIVATE, pgactivate);

	if (plug)
		swap_write_unplug(plug);
	return nr_reclaimed;
}
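
/*
 * A minimal sketch of the demote-then-retry policy implemented above. The
 * try_reclaim() helper and the undemoted list below are hypothetical
 * placeholders, not kernel APIs; only the control flow is real:
 *
 *	bool do_demote = can_demote(pgdat->node_id, sc);
 * retry:
 *	nr_reclaimed += try_reclaim(folio_list, do_demote);
 *	if (!list_empty(&undemoted) && !sc->proactive) {
 *		do_demote = false;
 *		goto retry;
 *	}
 *
 * The second pass reclaims the undemoted folios in place rather than
 * moving them down the memory tiers.
 */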

unsigned int reclaim_clean_pages_from_list(struct zone *zone,
					   struct list_head *folio_list)
{
	struct scan_control sc = {
		.gfp_mask = GFP_KERNEL,
		.may_unmap = 1,
	};
	struct reclaim_stat stat;
	unsigned int nr_reclaimed;
	struct folio *folio, *next;
	LIST_HEAD(clean_folios);
	unsigned int noreclaim_flag;

	list_for_each_entry_safe(folio, next, folio_list, lru) {
		if (!folio_test_hugetlb(folio) && folio_is_file_lru(folio) &&
		    !folio_test_dirty(folio) && !__folio_test_movable(folio) &&
		    !folio_test_unevictable(folio)) {
			folio_clear_active(folio);
			list_move(&folio->lru, &clean_folios);
		}
	}

	/*
	 * We should be safe here since we are only dealing with file pages and
	 * we are not kswapd and therefore cannot write dirty file pages. But
	 * call memalloc_noreclaim_save() anyway, just in case these conditions
	 * change in the future.
	 */
	noreclaim_flag = memalloc_noreclaim_save();
	nr_reclaimed = shrink_folio_list(&clean_folios, zone->zone_pgdat, &sc,
					&stat, true);
	memalloc_noreclaim_restore(noreclaim_flag);

	list_splice(&clean_folios, folio_list);
	mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE,
			    -(long)nr_reclaimed);
	/*
	 * Since lazyfree pages are isolated from the file LRU from the
	 * beginning, they will rotate back to the anonymous LRU if the
	 * discard fails, so the isolated counts would be mismatched.
	 * Compensate the isolated count for both LRU lists.
	 */
	mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON,
			    stat.nr_lazyfree_fail);
	mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE,
			    -(long)stat.nr_lazyfree_fail);
	return nr_reclaimed;
}
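
/*
 * Worked example of the accounting above, with illustrative numbers:
 * suppose 32 clean file folios were isolated, 30 were reclaimed, and the
 * 2 that remain are lazyfree folios whose discard failed and which were
 * rotated back to the anon LRU. NR_ISOLATED_FILE is decremented by 30
 * here and by 2 more via nr_lazyfree_fail, while NR_ISOLATED_ANON is
 * incremented by 2, so the caller's later putback, which decrements the
 * anon counter for those folios, leaves both counters balanced.
 */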

/*
 * Update LRU sizes after isolating pages. The LRU size updates must
 * be complete before mem_cgroup_update_lru_size due to a sanity check.
 */
static __always_inline void update_lru_sizes(struct lruvec *lruvec,
			enum lru_list lru, unsigned long *nr_zone_taken)
{
	int zid;

	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
		if (!nr_zone_taken[zid])
			continue;

		update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]);
	}
}

#ifdef CONFIG_CMA
/*
 * It is a waste of effort to scan and reclaim CMA pages if they are not
 * available to the current allocation context. Kswapd cannot be enrolled
 * here because it cannot distinguish this scenario, as it scans with
 * sc->gfp_mask = GFP_KERNEL.
 */
static bool skip_cma(struct folio *folio, struct scan_control *sc)
{
	return !current_is_kswapd() &&
			gfp_migratetype(sc->gfp_mask) != MIGRATE_MOVABLE &&
			get_pageblock_migratetype(&folio->page) == MIGRATE_CMA;
}
#else
static bool skip_cma(struct folio *folio, struct scan_control *sc)
{
	return false;
}
#endif

/*
 * Isolate folios from the lruvec and fill the @dst list, scanning up to
 * nr_to_scan entries.
 *
 * lruvec->lru_lock is heavily contended.  Some of the functions that
 * shrink the lists perform better by taking out a batch of pages
 * and working on them outside the LRU lock.
 *
 * For pagecache intensive workloads, this function is the hottest
 * spot in the kernel (apart from copy_*_user functions).
 *
 * lruvec->lru_lock must be held before calling this function.
 *
 * @nr_to_scan:	The number of eligible pages to look through on the list.
 * @lruvec:	The LRU vector to pull pages from.
 * @dst:	The temp list to put pages on to.
 * @nr_scanned:	The number of pages that were scanned.
 * @sc:		The scan_control struct for this reclaim session
 * @lru:	LRU list id for isolating
 *
 * Returns how many pages were moved onto *@dst.
 */
unsigned long isolate_lru_folios(unsigned long nr_to_scan,
		struct lruvec *lruvec, struct list_head *dst,
		unsigned long *nr_scanned, struct scan_control *sc,
		enum lru_list lru)
{
	struct list_head *src = &lruvec->lists[lru];
	unsigned long nr_taken = 0;
	unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 };
	unsigned long nr_skipped[MAX_NR_ZONES] = { 0, };
	unsigned long skipped = 0;
	unsigned long scan, total_scan, nr_pages;
	LIST_HEAD(folios_skipped);

	total_scan = 0;
	scan = 0;
	while (scan < nr_to_scan && !list_empty(src)) {
		struct list_head *move_to = src;
		struct folio *folio;

		folio = lru_to_folio(src);
		prefetchw_prev_lru_folio(folio, src, flags);

		nr_pages = folio_nr_pages(folio);
		total_scan += nr_pages;

		if (folio_zonenum(folio) > sc->reclaim_idx ||
				skip_cma(folio, sc)) {
			nr_skipped[folio_zonenum(folio)] += nr_pages;
			move_to = &folios_skipped;
			goto move;
		}

		/*
		 * Do not count skipped folios because that makes the function
		 * return with no isolated folios if the LRU mostly contains
		 * ineligible folios.  This causes the VM to not reclaim any
		 * folios, triggering a premature OOM.
		 * Account all pages in a folio.
		 */
		scan += nr_pages;

		if (!folio_test_lru(folio))
			goto move;
		if (!sc->may_unmap && folio_mapped(folio))
			goto move;

		/*
		 * Be careful not to clear the lru flag until after we're
		 * sure the folio is not being freed elsewhere -- the
		 * folio release code relies on it.
		 */
		if (unlikely(!folio_try_get(folio)))
			goto move;

		if (!folio_test_clear_lru(folio)) {
			/* Another thread is already isolating this folio */
			folio_put(folio);
			goto move;
		}

		nr_taken += nr_pages;
		nr_zone_taken[folio_zonenum(folio)] += nr_pages;
		move_to = dst;
move:
		list_move(&folio->lru, move_to);
	}

	/*
	 * Splice any skipped folios to the start of the LRU list. Note that
	 * this disrupts the LRU order when reclaiming for lower zones but
	 * we cannot splice to the tail. If we did then the SWAP_CLUSTER_MAX
	 * scanning would soon rescan the same folios to skip and waste lots
	 * of cpu cycles.
	 */
	if (!list_empty(&folios_skipped)) {
		int zid;

		list_splice(&folios_skipped, src);
		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
			if (!nr_skipped[zid])
				continue;

			__count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]);
			skipped += nr_skipped[zid];
		}
	}
	*nr_scanned = total_scan;
	trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan,
				    total_scan, skipped, nr_taken,
				    sc->may_unmap ? 0 : ISOLATE_UNMAPPED, lru);
	update_lru_sizes(lruvec, lru, nr_zone_taken);
	return nr_taken;
}
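
/*
 * Illustrative sketch of the isolation handshake in the loop above. The
 * fragment below is a hypothetical caller, not compiled here, but the
 * ordering is the one the code relies on: pin the folio with a reference
 * first, then try to win the race for the LRU flag, and only treat the
 * folio as isolated when both steps succeed.
 *
 *	if (folio_try_get(folio)) {
 *		if (folio_test_clear_lru(folio))
 *			list_move(&folio->lru, dst);
 *		else
 *			folio_put(folio);
 *	}
 *
 * Skipping the reference would let a concurrent free reuse the folio;
 * skipping the flag test would let two threads isolate the same folio.
 */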

/**
 * folio_isolate_lru() - Try to isolate a folio from its LRU list.
 * @folio: Folio to isolate from its LRU list.
 *
 * Isolate a @folio from an LRU list and adjust the vmstat statistic
 * corresponding to whatever LRU list the folio was on.
 *
 * The folio will have its LRU flag cleared.  If it was found on the
 * active list, it will have the Active flag set.  If it was found on the
 * unevictable list, it will have the Unevictable flag set.  These flags
 * may need to be cleared by the caller before letting the page go.
 *
 * Context:
 *
 * (1) Must be called with an elevated refcount on the folio. This is a
 *     fundamental difference from isolate_lru_folios() (which is called
 *     without a stable reference).
 * (2) The lru_lock must not be held.
 * (3) Interrupts must be enabled.
 *
 * Return: true if the folio was removed from an LRU list.
 * false if the folio was not on an LRU list.
 */
bool folio_isolate_lru(struct folio *folio)
{
	bool ret = false;

	VM_BUG_ON_FOLIO(!folio_ref_count(folio), folio);

	if (folio_test_clear_lru(folio)) {
		struct lruvec *lruvec;

		folio_get(folio);
		lruvec = folio_lruvec_lock_irq(folio);
		lruvec_del_folio(lruvec, folio);
		unlock_page_lruvec_irq(lruvec);
		ret = true;
	}

	return ret;
}
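
/*
 * Example usage, as a hypothetical caller (illustrative, not compiled
 * here): the caller must already hold its own reference, and a successful
 * isolation is undone with folio_putback_lru(), which also drops the
 * reference that folio_isolate_lru() took internally.
 *
 *	folio_get(folio);
 *	if (folio_isolate_lru(folio)) {
 *		... work on the folio while it is off the LRU ...
 *		folio_putback_lru(folio);
 *	}
 *	folio_put(folio);
 */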

/*
 * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
 * then get rescheduled. When a massive number of tasks are doing page
 * allocation, such sleeping direct reclaimers may keep piling up on each CPU;
 * the LRU list will shrink and be scanned faster than necessary, leading to
 * unnecessary swapping, thrashing and OOM.
 */
static int too_many_isolated(struct pglist_data *pgdat, int file,
		struct scan_control *sc)
{
	unsigned long inactive, isolated;
	bool too_many;

	if (current_is_kswapd())
		return 0;

	if (!writeback_throttling_sane(sc))
		return 0;

	if (file) {
		inactive = node_page_state(pgdat, NR_INACTIVE_FILE);
		isolated = node_page_state(pgdat, NR_ISOLATED_FILE);
	} else {
		inactive = node_page_state(pgdat, NR_INACTIVE_ANON);
		isolated = node_page_state(pgdat, NR_ISOLATED_ANON);
	}

	/*
	 * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so
	 * that they won't get blocked by normal direct-reclaimers, which
	 * could form a circular deadlock.
	 */
	if (gfp_has_io_fs(sc->gfp_mask))
		inactive >>= 3;

	too_many = isolated > inactive;

	/* Wake up tasks throttled due to too_many_isolated. */
	if (!too_many)
		wake_throttle_isolated(pgdat);

	return too_many;
}
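
/*
 * Worked example of the check above, with illustrative numbers: with
 * 8192 inactive file pages and 1200 already isolated, a GFP_KERNEL
 * direct reclaimer compares 1200 against 8192 >> 3 == 1024 and is
 * throttled, while a GFP_NOFS caller compares 1200 against the full
 * 8192 and may keep isolating. The laxer limit prevents a filesystem
 * holding locks from being throttled behind reclaimers that may in
 * turn be waiting on those locks.
 */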

/*
 * move_folios_to_lru() moves folios from a private @list to the appropriate
 * LRU list. On return, @list is reused as a list of folios to be freed by
 * the caller.
 *
 * Returns the number of pages moved to the given lruvec.
 */
unsigned int move_folios_to_lru(struct lruvec *lruvec,
		struct list_head *list)
{
	int nr_pages, nr_moved = 0;
	LIST_HEAD(folios_to_free);
#ifdef CONFIG_HYPERHOLD_FILE_LRU
	bool prot;
	bool file;
#endif

	while (!list_empty(list)) {
		struct folio *folio = lru_to_folio(list);

		VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
		list_del(&folio->lru);
		if (unlikely(!folio_evictable(folio))) {
			spin_unlock_irq(&lruvec->lru_lock);
			folio_putback_lru(folio);
			spin_lock_irq(&lruvec->lru_lock);
			continue;
		}

		/*
		 * The folio_set_lru needs to be kept here for list integrity.
		 * Otherwise:
		 *   #0 move_folios_to_lru             #1 release_pages
		 *   if (!folio_put_testzero())
		 *				      if (folio_put_testzero())
		 *				        !lru //skip lru_lock
		 *     folio_set_lru()
		 *     list_add(&folio->lru,)
		 *                                        list_add(&folio->lru,)
		 */
		folio_set_lru(folio);

		if (unlikely(folio_put_testzero(folio))) {
			__folio_clear_lru_flags(folio);

			if (unlikely(folio_test_large(folio))) {
				spin_unlock_irq(&lruvec->lru_lock);
				destroy_large_folio(folio);
				spin_lock_irq(&lruvec->lru_lock);
			} else
				list_add(&folio->lru, &folios_to_free);

			continue;
		}

		/*
		 * All pages were isolated from the same lruvec (and isolation
		 * inhibits memcg migration).
		 */
		VM_BUG_ON_FOLIO(!folio_matches_lruvec(folio, lruvec), folio);
		lruvec_add_folio(lruvec, folio);
		nr_pages = folio_nr_pages(folio);
		nr_moved += nr_pages;
#ifdef CONFIG_HYPERHOLD_FILE_LRU
		if (folio_test_active(folio)) {
			prot = is_prot_page(folio_page(folio, 0));
			file = page_is_file_lru(folio_page(folio, 0));
			if (!prot && file) {
				lruvec = folio_lruvec(folio);
				workingset_age_nonresident(lruvec,
							   nr_pages);
			} else {
				workingset_age_nonresident(lruvec,
							   nr_pages);
			}
		}
#else
		if (folio_test_active(folio))
			workingset_age_nonresident(lruvec, nr_pages);
#endif
	}

	/*
	 * To save our caller's stack, now use the input list for pages to
	 * free.
	 */
	list_splice(&folios_to_free, list);

	return nr_moved;
}

/*
 * If a kernel thread (such as nfsd for loop-back mounts) services a backing
 * device by writing to the page cache, it sets PF_LOCAL_THROTTLE. In this
 * case we should not throttle.  Otherwise it is safe to do so.
 */
int current_may_throttle(void)
{
	return !(current->flags & PF_LOCAL_THROTTLE);
}

/*
 * shrink_inactive_list() is a helper for shrink_node().  It returns the
 * number of reclaimed pages.
 */
unsigned long shrink_inactive_list(unsigned long nr_to_scan,
		struct lruvec *lruvec, struct scan_control *sc,
		enum lru_list lru)
{
	LIST_HEAD(folio_list);
	unsigned long nr_scanned;
	unsigned int nr_reclaimed = 0;
	unsigned long nr_taken;
	struct reclaim_stat stat;
	bool file = is_file_lru(lru);
	enum vm_event_item item;
	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
	bool stalled = false;

	while (unlikely(too_many_isolated(pgdat, file, sc))) {
		if (stalled)
			return 0;

#ifdef CONFIG_HYPERHOLD_FILE_LRU
		sc->isolate_count++;
#endif
		/* wait a bit for the reclaimer. */
		stalled = true;
		reclaim_throttle(pgdat, VMSCAN_THROTTLE_ISOLATED);

		/* We are about to die and free our memory. Return now. */
		if (fatal_signal_pending(current))
			return SWAP_CLUSTER_MAX;
	}

	lru_add_drain();

	spin_lock_irq(&lruvec->lru_lock);

	nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &folio_list,
				     &nr_scanned, sc, lru);

	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
	item = PGSCAN_KSWAPD + reclaimer_offset();
	if (!cgroup_reclaim(sc))
		__count_vm_events(item, nr_scanned);
	__count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
	__count_vm_events(PGSCAN_ANON + file, nr_scanned);

	spin_unlock_irq(&lruvec->lru_lock);

	if (nr_taken == 0)
		return 0;

	nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false);

	spin_lock_irq(&lruvec->lru_lock);
	move_folios_to_lru(lruvec, &folio_list);

	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
	item = PGSTEAL_KSWAPD + reclaimer_offset();
	if (!cgroup_reclaim(sc))
		__count_vm_events(item, nr_reclaimed);
	__count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
	__count_vm_events(PGSTEAL_ANON + file, nr_reclaimed);
	spin_unlock_irq(&lruvec->lru_lock);

#ifdef CONFIG_HYPERHOLD_FILE_LRU
	if (file)
		lru_note_cost(node_lruvec(pgdat), file, stat.nr_pageout, nr_scanned - nr_reclaimed);
	else
		lru_note_cost(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed);
#else
	lru_note_cost(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed);
#endif
	mem_cgroup_uncharge_list(&folio_list);
	free_unref_page_list(&folio_list);

	/*
	 * If dirty folios are scanned that are not queued for IO, it
	 * implies that flushers are not doing their job. This can
	 * happen when memory pressure pushes dirty folios to the end of
	 * the LRU before the dirty limits are breached and the dirty
	 * data has expired. It can also happen when the proportion of
	 * dirty folios grows not through writes but through memory
	 * pressure reclaiming all the clean cache. And in some cases,
	 * the flushers simply cannot keep up with the allocation
	 * rate. Nudge the flusher threads in case they are asleep.
	 */
	if (stat.nr_unqueued_dirty == nr_taken) {
		wakeup_flusher_threads(WB_REASON_VMSCAN);
		/*
		 * For cgroupv1 dirty throttling is achieved by waking up
		 * the kernel flusher here and later waiting on folios
		 * which are in writeback to finish (see shrink_folio_list()).
		 *
		 * Flusher may not be able to issue writeback quickly
		 * enough for cgroupv1 writeback throttling to work
		 * on a large system.
		 */
		if (!writeback_throttling_sane(sc))
			reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK);
	}

	sc->nr.dirty += stat.nr_dirty;
	sc->nr.congested += stat.nr_congested;
	sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
	sc->nr.writeback += stat.nr_writeback;
	sc->nr.immediate += stat.nr_immediate;
	sc->nr.taken += nr_taken;
	if (file)
		sc->nr.file_taken += nr_taken;

	trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
			nr_scanned, nr_reclaimed, &stat, sc->priority, file);
	return nr_reclaimed;
}

/*
 * shrink_active_list() moves folios from the active LRU to the inactive LRU.
 *
 * We move them the other way if the folio is referenced by one or more
 * processes.
 *
 * If the folios are mostly unmapped, the processing is fast and it is
 * appropriate to hold lru_lock across the whole operation.  But if
 * the folios are mapped, the processing is slow (folio_referenced()), so
 * we should drop lru_lock around each folio.  It's impossible to balance
 * this, so instead we remove the folios from the LRU while processing them.
 * It is safe to rely on the active flag against the non-LRU folios in here
 * because nobody will play with that bit on a non-LRU folio.
 *
 * The downside is that we have to touch folio->_refcount against each folio.
 * But we had to alter folio->flags anyway.
 */
void shrink_active_list(unsigned long nr_to_scan,
			       struct lruvec *lruvec,
			       struct scan_control *sc,
			       enum lru_list lru)
{
	unsigned long nr_taken;
	unsigned long nr_scanned;
	unsigned long vm_flags;
	LIST_HEAD(l_hold);	/* The folios which were snipped off */
	LIST_HEAD(l_active);
	LIST_HEAD(l_inactive);
	unsigned nr_deactivate, nr_activate;
	unsigned nr_rotated = 0;
	int file = is_file_lru(lru);
	struct pglist_data *pgdat = lruvec_pgdat(lruvec);

	lru_add_drain();

	spin_lock_irq(&lruvec->lru_lock);

	nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &l_hold,
				     &nr_scanned, sc, lru);

	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);

	if (!cgroup_reclaim(sc))
		__count_vm_events(PGREFILL, nr_scanned);
	__count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);

	spin_unlock_irq(&lruvec->lru_lock);

	while (!list_empty(&l_hold)) {
		struct folio *folio;

		cond_resched();
		folio = lru_to_folio(&l_hold);
		list_del(&folio->lru);

		if (unlikely(!folio_evictable(folio))) {
			folio_putback_lru(folio);
			continue;
		}

		if (unlikely(buffer_heads_over_limit)) {
			if (folio_needs_release(folio) &&
			    folio_trylock(folio)) {
				filemap_release_folio(folio, 0);
				folio_unlock(folio);
			}
		}

		/* Referenced or rmap lock contention: rotate */
		if (folio_referenced(folio, 0, sc->target_mem_cgroup,
				     &vm_flags) != 0) {
			/*
			 * Identify referenced, file-backed active folios and
			 * give them one more trip around the active list, so
			 * that executable code gets a better chance to stay in
			 * memory under moderate memory pressure.  Anon folios
			 * are not likely to be evicted by use-once streaming
			 * IO, plus the JVM can create lots of anon VM_EXEC
			 * folios, so we ignore them here.
			 */
			if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio)) {
				nr_rotated += folio_nr_pages(folio);
				list_add(&folio->lru, &l_active);
				continue;
			}
		}

		folio_clear_active(folio);	/* we are de-activating */
		folio_set_workingset(folio);
		list_add(&folio->lru, &l_inactive);
	}

	/*
	 * Move folios back to the lru list.
	 */
	spin_lock_irq(&lruvec->lru_lock);

	nr_activate = move_folios_to_lru(lruvec, &l_active);
	nr_deactivate = move_folios_to_lru(lruvec, &l_inactive);
	/* Keep all free folios in l_active list */
	list_splice(&l_inactive, &l_active);

	__count_vm_events(PGDEACTIVATE, nr_deactivate);
	__count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);

	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
	spin_unlock_irq(&lruvec->lru_lock);

	if (nr_rotated)
		lru_note_cost(lruvec, file, 0, nr_rotated);
	mem_cgroup_uncharge_list(&l_active);
	free_unref_page_list(&l_active);
	trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,
			nr_deactivate, nr_rotated, sc->priority, file);
}

static unsigned int reclaim_folio_list(struct list_head *folio_list,
				      struct pglist_data *pgdat)
{
	struct reclaim_stat dummy_stat;
	unsigned int nr_reclaimed;
	struct folio *folio;
	struct scan_control sc = {
		.gfp_mask = GFP_KERNEL,
		.may_writepage = 1,
		.may_unmap = 1,
		.may_swap = 1,
		.no_demotion = 1,
	};

	nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &dummy_stat, false);
	while (!list_empty(folio_list)) {
		folio = lru_to_folio(folio_list);
		list_del(&folio->lru);
		folio_putback_lru(folio);
	}

	return nr_reclaimed;
}

unsigned long reclaim_pages(struct list_head *folio_list)
{
	int nid;
	unsigned int nr_reclaimed = 0;
	LIST_HEAD(node_folio_list);
	unsigned int noreclaim_flag;

	if (list_empty(folio_list))
		return nr_reclaimed;

	noreclaim_flag = memalloc_noreclaim_save();

	nid = folio_nid(lru_to_folio(folio_list));
	do {
		struct folio *folio = lru_to_folio(folio_list);

		if (nid == folio_nid(folio)) {
			folio_clear_active(folio);
			list_move(&folio->lru, &node_folio_list);
			continue;
		}

		nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid));
		nid = folio_nid(lru_to_folio(folio_list));
	} while (!list_empty(folio_list));

	nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid));

	memalloc_noreclaim_restore(noreclaim_flag);

	return nr_reclaimed;
}
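
/*
 * Illustrative walk-through of the batching above, with a hypothetical
 * input: for folios on nodes [0, 0, 1, 0], each contiguous same-node run
 * is gathered onto node_folio_list and reclaimed as one batch against
 * NODE_DATA(nid), i.e. three reclaim_folio_list() calls for nodes 0, 1
 * and 0 again, the last one issued by the trailing call outside the
 * loop. Batching by nid lets shrink_folio_list() work against a single
 * pgdat at a time.
 */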

unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
				 struct lruvec *lruvec, struct scan_control *sc)
{
	if (is_active_lru(lru)) {
		if (sc->may_deactivate & (1 << is_file_lru(lru)))
			shrink_active_list(nr_to_scan, lruvec, sc, lru);
		else
			sc->skipped_deactivate = 1;
		return 0;
	}

	return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
}

/*
 * The inactive anon list should be small enough that the VM never has
 * to do too much work.
 *
 * The inactive file list should be small enough to leave most memory
 * to the established workingset on the scan-resistant active list,
 * but large enough to avoid thrashing the aggregate readahead window.
 *
 * Both inactive lists should also be large enough that each inactive
 * folio has a chance to be referenced again before it is reclaimed.
 *
 * If that fails and refaulting is observed, the inactive list grows.
 *
 * The inactive_ratio is the target ratio of ACTIVE to INACTIVE folios
 * on this LRU, maintained by the pageout code. An inactive_ratio
 * of 3 means 3:1 or 25% of the folios are kept on the inactive list.
 *
 * total     target    max
 * memory    ratio     inactive
 * -------------------------------------
 *   10MB       1         5MB
 *  100MB       1        50MB
 *    1GB       3       250MB
 *   10GB      10       0.9GB
 *  100GB      31         3GB
 *    1TB     101        10GB
 *   10TB     320        32GB
 */
bool inactive_is_low(struct lruvec *lruvec, enum lru_list inactive_lru)
{
	enum lru_list active_lru = inactive_lru + LRU_ACTIVE;
	unsigned long inactive, active;
	unsigned long inactive_ratio;
	unsigned long gb;

	inactive = lruvec_page_state(lruvec, NR_LRU_BASE + inactive_lru);
	active = lruvec_page_state(lruvec, NR_LRU_BASE + active_lru);

	gb = (inactive + active) >> (30 - PAGE_SHIFT);
	if (gb)
		inactive_ratio = int_sqrt(10 * gb);
	else
		inactive_ratio = 1;

	return inactive * inactive_ratio < active;
}
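
/*
 * Worked example of the ratio above, with illustrative sizes: for a 1GB
 * LRU, gb == 1 and inactive_ratio == int_sqrt(10) == 3, so the inactive
 * list is low once it holds less than a third of the active list, i.e.
 * at most ~250MB inactive, matching the table. For 100GB, gb == 100 and
 * int_sqrt(1000) == 31, which allows roughly 3GB of inactive folios.
 */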

static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc)
{
	unsigned long file;
	struct lruvec *target_lruvec;

	if (lru_gen_enabled())
		return;

	target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);

	/*
	 * Flush the memory cgroup stats, so that we read accurate per-memcg
	 * lruvec stats for heuristics.
	 */
	mem_cgroup_flush_stats();

	/*
	 * Determine the scan balance between anon and file LRUs.
	 */
	spin_lock_irq(&target_lruvec->lru_lock);
	sc->anon_cost = target_lruvec->anon_cost;
	sc->file_cost = target_lruvec->file_cost;
	spin_unlock_irq(&target_lruvec->lru_lock);

	/*
	 * Target desirable inactive:active list ratios for the anon
	 * and file LRU lists.
	 */
	if (!sc->force_deactivate) {
		unsigned long refaults;

		/*
		 * When refaults are being observed, it means a new
		 * workingset is being established. Deactivate to get
		 * rid of any stale active pages quickly.
		 */
		refaults = lruvec_page_state(target_lruvec,
				WORKINGSET_ACTIVATE_ANON);
		if (refaults != target_lruvec->refaults[WORKINGSET_ANON] ||
			inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
			sc->may_deactivate |= DEACTIVATE_ANON;
		else
			sc->may_deactivate &= ~DEACTIVATE_ANON;

		refaults = lruvec_page_state(target_lruvec,
				WORKINGSET_ACTIVATE_FILE);
		if (refaults != target_lruvec->refaults[WORKINGSET_FILE] ||
		    inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
			sc->may_deactivate |= DEACTIVATE_FILE;
		else
			sc->may_deactivate &= ~DEACTIVATE_FILE;
	} else
		sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;

	/*
	 * If we have plenty of inactive file pages that aren't
	 * thrashing, try to reclaim those first before touching
	 * anonymous pages.
	 */
	file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
	if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
		sc->cache_trim_mode = 1;
	else
		sc->cache_trim_mode = 0;

	/*
	 * Prevent the reclaimer from falling into the cache trap: as
	 * cache pages start out inactive, every cache fault will tip
	 * the scan balance towards the file LRU.  And as the file LRU
	 * shrinks, so does the window for rotation from references.
	 * This means we have a runaway feedback loop where a tiny
	 * thrashing file LRU becomes infinitely more attractive than
	 * anon pages.  Try to detect this based on file LRU size.
	 */
	if (!cgroup_reclaim(sc)) {
		unsigned long total_high_wmark = 0;
		unsigned long free, anon;
		int z;

		free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
		file = node_page_state(pgdat, NR_ACTIVE_FILE) +
			   node_page_state(pgdat, NR_INACTIVE_FILE);

		for (z = 0; z < MAX_NR_ZONES; z++) {
			struct zone *zone = &pgdat->node_zones[z];

			if (!managed_zone(zone))
				continue;

			total_high_wmark += high_wmark_pages(zone);
		}

		/*
		 * Consider anon: if that's low too, this isn't a
		 * runaway file reclaim problem, but rather just
		 * extreme pressure. Reclaim as per usual then.
		 */
		anon = node_page_state(pgdat, NR_INACTIVE_ANON);

		sc->file_is_tiny =
			file + free <= total_high_wmark &&
			!(sc->may_deactivate & DEACTIVATE_ANON) &&
			anon >> sc->priority;
	}
}
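
/*
 * Worked example of the two heuristics above, with illustrative numbers:
 * at the initial scan priority of 12, cache_trim_mode requires
 * file >> 12 to be non-zero, i.e. at least 4096 inactive file pages
 * (16MB with 4KB pages), before reclaim prefers trimming page cache.
 * Likewise, file_is_tiny only triggers when file + free fit under the
 * summed high watermarks while anon >> 12 is still non-zero, so anon
 * scanning is forced only if there is meaningful anon memory left.
 */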

/*
 * Determine how aggressively the anon and file LRU lists should be
 * scanned.
 *
 * nr[0] = anon inactive folios to scan; nr[1] = anon active folios to scan
 * nr[2] = file inactive folios to scan; nr[3] = file active folios to scan
 */
static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
			   unsigned long *nr)
{
	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
	unsigned long anon_cost, file_cost, total_cost;
	int swappiness = mem_cgroup_swappiness(memcg);
	u64 fraction[ANON_AND_FILE];
	u64 denominator = 0;	/* gcc */
	enum scan_balance scan_balance;
	unsigned long ap, fp;
	enum lru_list lru;

	/* If we have no swap space, do not bother scanning anon folios. */
	if (!sc->may_swap || !can_reclaim_anon_pages(memcg, pgdat->node_id, sc)) {
		scan_balance = SCAN_FILE;
		goto out;
	}

	/*
	 * Global reclaim will swap to prevent OOM even with no
	 * swappiness, but memcg users want to use this knob to
	 * disable swapping for individual groups completely when
	 * using the memory controller's swap limit feature would be
	 * too expensive.
	 */
	if (cgroup_reclaim(sc) && !swappiness) {
		scan_balance = SCAN_FILE;
		goto out;
	}

	/*
	 * Do not apply any pressure balancing cleverness when the
	 * system is close to OOM, scan both anon and file equally
	 * (unless the swappiness setting disagrees with swapping).
	 */
	if (!sc->priority && swappiness) {
		scan_balance = SCAN_EQUAL;
		goto out;
	}

	/*
	 * If the system is almost out of file pages, force-scan anon.
	 */
	if (sc->file_is_tiny) {
		scan_balance = SCAN_ANON;
		goto out;
	}

	/*
	 * If there is enough inactive page cache, we do not reclaim
	 * anything from the anonymous workingset right now.
	 */
	if (sc->cache_trim_mode) {
		scan_balance = SCAN_FILE;
		goto out;
	}

	scan_balance = SCAN_FRACT;
	/*
	 * Calculate the pressure balance between anon and file pages.
	 *
	 * The amount of pressure we put on each LRU is inversely
	 * proportional to the cost of reclaiming each list, as
	 * determined by the share of pages that are refaulting, times
	 * the relative IO cost of bringing back a swapped out
	 * anonymous page vs reloading a filesystem page (swappiness).
	 *
	 * We limit that influence, though, to ensure no list gets left
	 * behind completely: at least a third of the pressure is
	 * applied before swappiness.
	 *
	 * With swappiness at 100, anon and file have equal IO cost.
	 */
	total_cost = sc->anon_cost + sc->file_cost;
	anon_cost = total_cost + sc->anon_cost;
	file_cost = total_cost + sc->file_cost;
	total_cost = anon_cost + file_cost;

	ap = swappiness * (total_cost + 1);
	ap /= anon_cost + 1;

	fp = (200 - swappiness) * (total_cost + 1);
	fp /= file_cost + 1;

	fraction[0] = ap;
	fraction[1] = fp;
	denominator = ap + fp;
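
	/*
	 * Worked example with illustrative numbers: with swappiness == 60,
	 * sc->anon_cost == 100 and sc->file_cost == 300, the biased costs
	 * are anon_cost == 500, file_cost == 700 and total_cost == 1200,
	 * giving ap == 60 * 1201 / 501 == 143 and
	 * fp == 140 * 1201 / 701 == 239. Anon then receives
	 * 143 / (143 + 239), i.e. roughly 37% of the scan pressure.
	 */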
307262306a36Sopenharmony_ciout:
307362306a36Sopenharmony_ci	for_each_evictable_lru(lru) {
307462306a36Sopenharmony_ci		int file = is_file_lru(lru);
307562306a36Sopenharmony_ci		unsigned long lruvec_size;
307662306a36Sopenharmony_ci		unsigned long low, min;
307762306a36Sopenharmony_ci		unsigned long scan;
307862306a36Sopenharmony_ci
307962306a36Sopenharmony_ci		lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
308062306a36Sopenharmony_ci		mem_cgroup_protection(sc->target_mem_cgroup, memcg,
308162306a36Sopenharmony_ci				      &min, &low);
308262306a36Sopenharmony_ci
308362306a36Sopenharmony_ci		if (min || low) {
308462306a36Sopenharmony_ci			/*
308562306a36Sopenharmony_ci			 * Scale a cgroup's reclaim pressure by proportioning
308662306a36Sopenharmony_ci			 * its current usage to its memory.low or memory.min
308762306a36Sopenharmony_ci			 * setting.
308862306a36Sopenharmony_ci			 *
308962306a36Sopenharmony_ci			 * This is important, as otherwise scanning aggression
309062306a36Sopenharmony_ci			 * becomes extremely binary -- from nothing as we
309162306a36Sopenharmony_ci			 * approach the memory protection threshold, to totally
309262306a36Sopenharmony_ci			 * nominal as we exceed it.  This results in requiring
309362306a36Sopenharmony_ci			 * setting extremely liberal protection thresholds. It
309462306a36Sopenharmony_ci			 * also means we simply get no protection at all if we
309562306a36Sopenharmony_ci			 * set it too low, which is not ideal.
309662306a36Sopenharmony_ci			 *
309762306a36Sopenharmony_ci			 * If there is any protection in place, we reduce scan
309862306a36Sopenharmony_ci			 * pressure by how much of the total memory used is
309962306a36Sopenharmony_ci			 * within protection thresholds.
310062306a36Sopenharmony_ci			 *
310162306a36Sopenharmony_ci			 * There is one special case: in the first reclaim pass,
310262306a36Sopenharmony_ci			 * we skip over all groups that are within their low
310362306a36Sopenharmony_ci			 * protection. If that fails to reclaim enough pages to
310462306a36Sopenharmony_ci			 * satisfy the reclaim goal, we come back and override
310562306a36Sopenharmony_ci			 * the best-effort low protection. However, we still
310662306a36Sopenharmony_ci			 * ideally want to honor how well-behaved groups are in
310762306a36Sopenharmony_ci			 * that case instead of simply punishing them all
310862306a36Sopenharmony_ci			 * equally. As such, we reclaim them based on how much
310962306a36Sopenharmony_ci			 * memory they are using, reducing the scan pressure
311062306a36Sopenharmony_ci			 * again by how much of the total memory used is under
311162306a36Sopenharmony_ci			 * hard protection.
311262306a36Sopenharmony_ci			 */
311362306a36Sopenharmony_ci			unsigned long cgroup_size = mem_cgroup_size(memcg);
311462306a36Sopenharmony_ci			unsigned long protection;
311562306a36Sopenharmony_ci
311662306a36Sopenharmony_ci			/* memory.low scaling, make sure we retry before OOM */
311762306a36Sopenharmony_ci			if (!sc->memcg_low_reclaim && low > min) {
311862306a36Sopenharmony_ci				protection = low;
311962306a36Sopenharmony_ci				sc->memcg_low_skipped = 1;
312062306a36Sopenharmony_ci			} else {
312162306a36Sopenharmony_ci				protection = min;
312262306a36Sopenharmony_ci			}
312362306a36Sopenharmony_ci
312462306a36Sopenharmony_ci			/* Avoid TOCTOU with earlier protection check */
312562306a36Sopenharmony_ci			cgroup_size = max(cgroup_size, protection);
312662306a36Sopenharmony_ci
312762306a36Sopenharmony_ci			scan = lruvec_size - lruvec_size * protection /
312862306a36Sopenharmony_ci				(cgroup_size + 1);
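			/*
			 * A sketch with made-up numbers: lruvec_size == 8000
			 * and protection == 6000 out of cgroup_size == 10000
			 * pages give scan = 8000 - 8000 * 6000 / 10001 = 3201:
			 * ~60% of the usage is protected, so scan pressure
			 * drops by roughly that share.
			 */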
312962306a36Sopenharmony_ci
313062306a36Sopenharmony_ci			/*
313162306a36Sopenharmony_ci			 * Minimally target SWAP_CLUSTER_MAX pages to keep
313262306a36Sopenharmony_ci			 * reclaim moving forwards, avoiding decrementing
313362306a36Sopenharmony_ci			 * sc->priority further than desirable.
313462306a36Sopenharmony_ci			 */
313562306a36Sopenharmony_ci			scan = max(scan, SWAP_CLUSTER_MAX);
313662306a36Sopenharmony_ci		} else {
313762306a36Sopenharmony_ci			scan = lruvec_size;
313862306a36Sopenharmony_ci		}
313962306a36Sopenharmony_ci
314062306a36Sopenharmony_ci		scan >>= sc->priority;
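		/*
		 * E.g. at the default scan priority (DEF_PRIORITY == 12),
		 * only lruvec_size >> 12, i.e. ~1/4096 of the list, is
		 * targeted per pass; each priority drop doubles that.
		 */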
314162306a36Sopenharmony_ci
314262306a36Sopenharmony_ci		/*
314362306a36Sopenharmony_ci		 * If the cgroup's already been deleted, make sure to
314462306a36Sopenharmony_ci		 * scrape out the remaining cache.
314562306a36Sopenharmony_ci		 */
314662306a36Sopenharmony_ci		if (!scan && !mem_cgroup_online(memcg))
314762306a36Sopenharmony_ci			scan = min(lruvec_size, SWAP_CLUSTER_MAX);
314862306a36Sopenharmony_ci
314962306a36Sopenharmony_ci		switch (scan_balance) {
315062306a36Sopenharmony_ci		case SCAN_EQUAL:
315162306a36Sopenharmony_ci			/* Scan lists relative to size */
315262306a36Sopenharmony_ci			break;
315362306a36Sopenharmony_ci		case SCAN_FRACT:
315462306a36Sopenharmony_ci			/*
315562306a36Sopenharmony_ci			 * Scan types proportional to swappiness and
315662306a36Sopenharmony_ci			 * their relative recent reclaim efficiency.
315762306a36Sopenharmony_ci			 * Make sure we don't miss the last page on
315862306a36Sopenharmony_ci			 * the offlined memory cgroups because of a
315962306a36Sopenharmony_ci			 * round-off error.
316062306a36Sopenharmony_ci			 */
316162306a36Sopenharmony_ci			scan = mem_cgroup_online(memcg) ?
316262306a36Sopenharmony_ci			       div64_u64(scan * fraction[file], denominator) :
316362306a36Sopenharmony_ci			       DIV64_U64_ROUND_UP(scan * fraction[file],
316462306a36Sopenharmony_ci						  denominator);
316562306a36Sopenharmony_ci			break;
316662306a36Sopenharmony_ci		case SCAN_FILE:
316762306a36Sopenharmony_ci		case SCAN_ANON:
316862306a36Sopenharmony_ci			/* Scan one type exclusively */
316962306a36Sopenharmony_ci			if ((scan_balance == SCAN_FILE) != file)
317062306a36Sopenharmony_ci				scan = 0;
317162306a36Sopenharmony_ci			break;
317262306a36Sopenharmony_ci		default:
317362306a36Sopenharmony_ci			/* Look ma, no brain */
317462306a36Sopenharmony_ci			BUG();
317562306a36Sopenharmony_ci		}
317662306a36Sopenharmony_ci
317762306a36Sopenharmony_ci		nr[lru] = scan;
317862306a36Sopenharmony_ci	}
317962306a36Sopenharmony_ci}
318062306a36Sopenharmony_ci
318162306a36Sopenharmony_ci/*
318262306a36Sopenharmony_ci * Anonymous LRU management is a waste if there is
318362306a36Sopenharmony_ci * ultimately no way to reclaim the memory.
318462306a36Sopenharmony_ci */
318562306a36Sopenharmony_cistatic bool can_age_anon_pages(struct pglist_data *pgdat,
318662306a36Sopenharmony_ci			       struct scan_control *sc)
318762306a36Sopenharmony_ci{
318862306a36Sopenharmony_ci	/* Aging the anon LRU is valuable if swap is present: */
318962306a36Sopenharmony_ci	if (total_swap_pages > 0)
319062306a36Sopenharmony_ci		return true;
319162306a36Sopenharmony_ci
319262306a36Sopenharmony_ci	/* Also valuable if anon pages can be demoted: */
319362306a36Sopenharmony_ci	return can_demote(pgdat->node_id, sc);
319462306a36Sopenharmony_ci}
319562306a36Sopenharmony_ci
319662306a36Sopenharmony_ci#ifdef CONFIG_LRU_GEN
319762306a36Sopenharmony_ci
319862306a36Sopenharmony_ci#ifdef CONFIG_LRU_GEN_ENABLED
319962306a36Sopenharmony_ciDEFINE_STATIC_KEY_ARRAY_TRUE(lru_gen_caps, NR_LRU_GEN_CAPS);
320062306a36Sopenharmony_ci#define get_cap(cap)	static_branch_likely(&lru_gen_caps[cap])
320162306a36Sopenharmony_ci#else
320262306a36Sopenharmony_ciDEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS);
320362306a36Sopenharmony_ci#define get_cap(cap)	static_branch_unlikely(&lru_gen_caps[cap])
320462306a36Sopenharmony_ci#endif
320562306a36Sopenharmony_ci
320662306a36Sopenharmony_cistatic bool should_walk_mmu(void)
320762306a36Sopenharmony_ci{
320862306a36Sopenharmony_ci	return arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK);
320962306a36Sopenharmony_ci}
321062306a36Sopenharmony_ci
321162306a36Sopenharmony_cistatic bool should_clear_pmd_young(void)
321262306a36Sopenharmony_ci{
321362306a36Sopenharmony_ci	return arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG);
321462306a36Sopenharmony_ci}
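/*
 * Note: both helpers above combine a hardware capability with a runtime
 * switch. Per the MGLRU documentation, the LRU_GEN_* caps can be toggled
 * from userspace via /sys/kernel/mm/lru_gen/enabled, so a CPU that sets
 * the accessed bit in hardware still only gets page table walks while the
 * corresponding cap is left enabled.
 */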
321562306a36Sopenharmony_ci
321662306a36Sopenharmony_ci/******************************************************************************
321762306a36Sopenharmony_ci *                          shorthand helpers
321862306a36Sopenharmony_ci ******************************************************************************/
321962306a36Sopenharmony_ci
322062306a36Sopenharmony_ci#define LRU_REFS_FLAGS	(BIT(PG_referenced) | BIT(PG_workingset))
322162306a36Sopenharmony_ci
322262306a36Sopenharmony_ci#define DEFINE_MAX_SEQ(lruvec)						\
322362306a36Sopenharmony_ci	unsigned long max_seq = READ_ONCE((lruvec)->lrugen.max_seq)
322462306a36Sopenharmony_ci
322562306a36Sopenharmony_ci#define DEFINE_MIN_SEQ(lruvec)						\
322662306a36Sopenharmony_ci	unsigned long min_seq[ANON_AND_FILE] = {			\
322762306a36Sopenharmony_ci		READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_ANON]),	\
322862306a36Sopenharmony_ci		READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_FILE]),	\
322962306a36Sopenharmony_ci	}
323062306a36Sopenharmony_ci
323162306a36Sopenharmony_ci#define for_each_gen_type_zone(gen, type, zone)				\
323262306a36Sopenharmony_ci	for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++)			\
323362306a36Sopenharmony_ci		for ((type) = 0; (type) < ANON_AND_FILE; (type)++)	\
323462306a36Sopenharmony_ci			for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
323562306a36Sopenharmony_ci
323662306a36Sopenharmony_ci#define get_memcg_gen(seq)	((seq) % MEMCG_NR_GENS)
323762306a36Sopenharmony_ci#define get_memcg_bin(bin)	((bin) % MEMCG_NR_BINS)
323862306a36Sopenharmony_ci
323962306a36Sopenharmony_cistatic struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid)
324062306a36Sopenharmony_ci{
324162306a36Sopenharmony_ci	struct pglist_data *pgdat = NODE_DATA(nid);
324262306a36Sopenharmony_ci
324362306a36Sopenharmony_ci#ifdef CONFIG_MEMCG
324462306a36Sopenharmony_ci	if (memcg) {
324562306a36Sopenharmony_ci		struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec;
324662306a36Sopenharmony_ci
324762306a36Sopenharmony_ci		/* see the comment in mem_cgroup_lruvec() */
324862306a36Sopenharmony_ci		if (!lruvec->pgdat)
324962306a36Sopenharmony_ci			lruvec->pgdat = pgdat;
325062306a36Sopenharmony_ci
325162306a36Sopenharmony_ci		return lruvec;
325262306a36Sopenharmony_ci	}
325362306a36Sopenharmony_ci#endif
325462306a36Sopenharmony_ci	VM_WARN_ON_ONCE(!mem_cgroup_disabled());
325562306a36Sopenharmony_ci
325662306a36Sopenharmony_ci	return &pgdat->__lruvec;
325762306a36Sopenharmony_ci}
325862306a36Sopenharmony_ci
325962306a36Sopenharmony_cistatic int get_swappiness(struct lruvec *lruvec, struct scan_control *sc)
326062306a36Sopenharmony_ci{
326162306a36Sopenharmony_ci	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
326262306a36Sopenharmony_ci	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
326362306a36Sopenharmony_ci
326462306a36Sopenharmony_ci	if (!sc->may_swap)
326562306a36Sopenharmony_ci		return 0;
326662306a36Sopenharmony_ci
326762306a36Sopenharmony_ci	if (!can_demote(pgdat->node_id, sc) &&
326862306a36Sopenharmony_ci	    mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH)
326962306a36Sopenharmony_ci		return 0;
327062306a36Sopenharmony_ci
327162306a36Sopenharmony_ci	return mem_cgroup_swappiness(memcg);
327262306a36Sopenharmony_ci}
327362306a36Sopenharmony_ci
327462306a36Sopenharmony_cistatic int get_nr_gens(struct lruvec *lruvec, int type)
327562306a36Sopenharmony_ci{
327662306a36Sopenharmony_ci	return lruvec->lrugen.max_seq - lruvec->lrugen.min_seq[type] + 1;
327762306a36Sopenharmony_ci}
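/* e.g. get_nr_gens() with max_seq == 7 and min_seq == 4 returns 4 (seqs 4..7) */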
327862306a36Sopenharmony_ci
327962306a36Sopenharmony_cistatic bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
328062306a36Sopenharmony_ci{
328162306a36Sopenharmony_ci	/* see the comment on lru_gen_folio */
328262306a36Sopenharmony_ci	return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS &&
328362306a36Sopenharmony_ci	       get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) &&
328462306a36Sopenharmony_ci	       get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS;
328562306a36Sopenharmony_ci}
328662306a36Sopenharmony_ci
328762306a36Sopenharmony_ci/******************************************************************************
328862306a36Sopenharmony_ci *                          Bloom filters
328962306a36Sopenharmony_ci ******************************************************************************/
329062306a36Sopenharmony_ci
329162306a36Sopenharmony_ci/*
329262306a36Sopenharmony_ci * Bloom filters with m=1<<15, k=2 and false positive rates of ~1/5 when
329362306a36Sopenharmony_ci * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of
329462306a36Sopenharmony_ci * bits in a bitmap, k is the number of hash functions and n is the number of
329562306a36Sopenharmony_ci * inserted items.
329662306a36Sopenharmony_ci *
329762306a36Sopenharmony_ci * Page table walkers use one of the two filters to reduce their search space.
329862306a36Sopenharmony_ci * To get rid of non-leaf entries that no longer have enough leaf entries, the
329962306a36Sopenharmony_ci * aging uses the double-buffering technique to flip to the other filter each
330062306a36Sopenharmony_ci * time it produces a new generation. For non-leaf entries that have enough
330162306a36Sopenharmony_ci * leaf entries, the aging carries them over to the next generation in
330262306a36Sopenharmony_ci * walk_pmd_range(); the eviction also reports them when walking the rmap
330362306a36Sopenharmony_ci * in lru_gen_look_around().
330462306a36Sopenharmony_ci *
330562306a36Sopenharmony_ci * For future optimizations:
330662306a36Sopenharmony_ci * 1. It's not necessary to keep both filters all the time. The spare one can be
330762306a36Sopenharmony_ci *    freed after the RCU grace period and reallocated if needed again.
330862306a36Sopenharmony_ci * 2. When reallocating, it's worth scaling its size according to the number
330962306a36Sopenharmony_ci *    of inserted entries in the other filter, to reduce the memory overhead on
331062306a36Sopenharmony_ci *    small systems and false positives on large systems.
331162306a36Sopenharmony_ci * 3. Jenkins' hash function is an alternative to Knuth's.
331262306a36Sopenharmony_ci */
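/*
 * Sanity check of the rates quoted above, using the standard approximation
 * p ~= (1 - e^(-k*n/m))^k with m = 1<<15 = 32768 and k = 2:
 *   n = 10,000: (1 - e^(-0.61))^2 ~= 0.21 ~= 1/5
 *   n = 20,000: (1 - e^(-1.22))^2 ~= 0.50 ~= 1/2
 */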
331362306a36Sopenharmony_ci#define BLOOM_FILTER_SHIFT	15
331462306a36Sopenharmony_ci
331562306a36Sopenharmony_cistatic inline int filter_gen_from_seq(unsigned long seq)
331662306a36Sopenharmony_ci{
331762306a36Sopenharmony_ci	return seq % NR_BLOOM_FILTERS;
331862306a36Sopenharmony_ci}
331962306a36Sopenharmony_ci
332062306a36Sopenharmony_cistatic void get_item_key(void *item, int *key)
332162306a36Sopenharmony_ci{
332262306a36Sopenharmony_ci	u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2);
332362306a36Sopenharmony_ci
332462306a36Sopenharmony_ci	BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32));
332562306a36Sopenharmony_ci
332662306a36Sopenharmony_ci	key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1);
332762306a36Sopenharmony_ci	key[1] = hash >> BLOOM_FILTER_SHIFT;
332862306a36Sopenharmony_ci}
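/*
 * hash_ptr(item, 30) above returns a 30-bit value, so key[0] is bits 0..14
 * and key[1] is bits 15..29; each indexes one of the 1<<15 filter bits.
 */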
332962306a36Sopenharmony_ci
333062306a36Sopenharmony_cistatic bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
333162306a36Sopenharmony_ci{
333262306a36Sopenharmony_ci	int key[2];
333362306a36Sopenharmony_ci	unsigned long *filter;
333462306a36Sopenharmony_ci	int gen = filter_gen_from_seq(seq);
333562306a36Sopenharmony_ci
333662306a36Sopenharmony_ci	filter = READ_ONCE(lruvec->mm_state.filters[gen]);
333762306a36Sopenharmony_ci	if (!filter)
333862306a36Sopenharmony_ci		return true;
333962306a36Sopenharmony_ci
334062306a36Sopenharmony_ci	get_item_key(item, key);
334162306a36Sopenharmony_ci
334262306a36Sopenharmony_ci	return test_bit(key[0], filter) && test_bit(key[1], filter);
334362306a36Sopenharmony_ci}
334462306a36Sopenharmony_ci
334562306a36Sopenharmony_cistatic void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
334662306a36Sopenharmony_ci{
334762306a36Sopenharmony_ci	int key[2];
334862306a36Sopenharmony_ci	unsigned long *filter;
334962306a36Sopenharmony_ci	int gen = filter_gen_from_seq(seq);
335062306a36Sopenharmony_ci
335162306a36Sopenharmony_ci	filter = READ_ONCE(lruvec->mm_state.filters[gen]);
335262306a36Sopenharmony_ci	if (!filter)
335362306a36Sopenharmony_ci		return;
335462306a36Sopenharmony_ci
335562306a36Sopenharmony_ci	get_item_key(item, key);
335662306a36Sopenharmony_ci
335762306a36Sopenharmony_ci	if (!test_bit(key[0], filter))
335862306a36Sopenharmony_ci		set_bit(key[0], filter);
335962306a36Sopenharmony_ci	if (!test_bit(key[1], filter))
336062306a36Sopenharmony_ci		set_bit(key[1], filter);
336162306a36Sopenharmony_ci}
336262306a36Sopenharmony_ci
336362306a36Sopenharmony_cistatic void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq)
336462306a36Sopenharmony_ci{
336562306a36Sopenharmony_ci	unsigned long *filter;
336662306a36Sopenharmony_ci	int gen = filter_gen_from_seq(seq);
336762306a36Sopenharmony_ci
336862306a36Sopenharmony_ci	filter = lruvec->mm_state.filters[gen];
336962306a36Sopenharmony_ci	if (filter) {
337062306a36Sopenharmony_ci		bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT));
337162306a36Sopenharmony_ci		return;
337262306a36Sopenharmony_ci	}
337362306a36Sopenharmony_ci
337462306a36Sopenharmony_ci	filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT),
337562306a36Sopenharmony_ci			       __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
337662306a36Sopenharmony_ci	WRITE_ONCE(lruvec->mm_state.filters[gen], filter);
337762306a36Sopenharmony_ci}
337862306a36Sopenharmony_ci
337962306a36Sopenharmony_ci/******************************************************************************
338062306a36Sopenharmony_ci *                          mm_struct list
338162306a36Sopenharmony_ci ******************************************************************************/
338262306a36Sopenharmony_ci
338362306a36Sopenharmony_cistatic struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg)
338462306a36Sopenharmony_ci{
338562306a36Sopenharmony_ci	static struct lru_gen_mm_list mm_list = {
338662306a36Sopenharmony_ci		.fifo = LIST_HEAD_INIT(mm_list.fifo),
338762306a36Sopenharmony_ci		.lock = __SPIN_LOCK_UNLOCKED(mm_list.lock),
338862306a36Sopenharmony_ci	};
338962306a36Sopenharmony_ci
339062306a36Sopenharmony_ci#ifdef CONFIG_MEMCG
339162306a36Sopenharmony_ci	if (memcg)
339262306a36Sopenharmony_ci		return &memcg->mm_list;
339362306a36Sopenharmony_ci#endif
339462306a36Sopenharmony_ci	VM_WARN_ON_ONCE(!mem_cgroup_disabled());
339562306a36Sopenharmony_ci
339662306a36Sopenharmony_ci	return &mm_list;
339762306a36Sopenharmony_ci}
339862306a36Sopenharmony_ci
339962306a36Sopenharmony_civoid lru_gen_add_mm(struct mm_struct *mm)
340062306a36Sopenharmony_ci{
340162306a36Sopenharmony_ci	int nid;
340262306a36Sopenharmony_ci	struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm);
340362306a36Sopenharmony_ci	struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
340462306a36Sopenharmony_ci
340562306a36Sopenharmony_ci	VM_WARN_ON_ONCE(!list_empty(&mm->lru_gen.list));
340662306a36Sopenharmony_ci#ifdef CONFIG_MEMCG
340762306a36Sopenharmony_ci	VM_WARN_ON_ONCE(mm->lru_gen.memcg);
340862306a36Sopenharmony_ci	mm->lru_gen.memcg = memcg;
340962306a36Sopenharmony_ci#endif
341062306a36Sopenharmony_ci	spin_lock(&mm_list->lock);
341162306a36Sopenharmony_ci
341262306a36Sopenharmony_ci	for_each_node_state(nid, N_MEMORY) {
341362306a36Sopenharmony_ci		struct lruvec *lruvec = get_lruvec(memcg, nid);
341462306a36Sopenharmony_ci
341562306a36Sopenharmony_ci		/* the first addition since the last iteration */
341662306a36Sopenharmony_ci		if (lruvec->mm_state.tail == &mm_list->fifo)
341762306a36Sopenharmony_ci			lruvec->mm_state.tail = &mm->lru_gen.list;
341862306a36Sopenharmony_ci	}
341962306a36Sopenharmony_ci
342062306a36Sopenharmony_ci	list_add_tail(&mm->lru_gen.list, &mm_list->fifo);
342162306a36Sopenharmony_ci
342262306a36Sopenharmony_ci	spin_unlock(&mm_list->lock);
342362306a36Sopenharmony_ci}
342462306a36Sopenharmony_ci
342562306a36Sopenharmony_civoid lru_gen_del_mm(struct mm_struct *mm)
342662306a36Sopenharmony_ci{
342762306a36Sopenharmony_ci	int nid;
342862306a36Sopenharmony_ci	struct lru_gen_mm_list *mm_list;
342962306a36Sopenharmony_ci	struct mem_cgroup *memcg = NULL;
343062306a36Sopenharmony_ci
343162306a36Sopenharmony_ci	if (list_empty(&mm->lru_gen.list))
343262306a36Sopenharmony_ci		return;
343362306a36Sopenharmony_ci
343462306a36Sopenharmony_ci#ifdef CONFIG_MEMCG
343562306a36Sopenharmony_ci	memcg = mm->lru_gen.memcg;
343662306a36Sopenharmony_ci#endif
343762306a36Sopenharmony_ci	mm_list = get_mm_list(memcg);
343862306a36Sopenharmony_ci
343962306a36Sopenharmony_ci	spin_lock(&mm_list->lock);
344062306a36Sopenharmony_ci
344162306a36Sopenharmony_ci	for_each_node(nid) {
344262306a36Sopenharmony_ci		struct lruvec *lruvec = get_lruvec(memcg, nid);
344362306a36Sopenharmony_ci
344462306a36Sopenharmony_ci		/* where the current iteration continues after */
344562306a36Sopenharmony_ci		if (lruvec->mm_state.head == &mm->lru_gen.list)
344662306a36Sopenharmony_ci			lruvec->mm_state.head = lruvec->mm_state.head->prev;
344762306a36Sopenharmony_ci
344862306a36Sopenharmony_ci		/* where the last iteration ended before */
344962306a36Sopenharmony_ci		if (lruvec->mm_state.tail == &mm->lru_gen.list)
345062306a36Sopenharmony_ci			lruvec->mm_state.tail = lruvec->mm_state.tail->next;
345162306a36Sopenharmony_ci	}
345262306a36Sopenharmony_ci
345362306a36Sopenharmony_ci	list_del_init(&mm->lru_gen.list);
345462306a36Sopenharmony_ci
345562306a36Sopenharmony_ci	spin_unlock(&mm_list->lock);
345662306a36Sopenharmony_ci
345762306a36Sopenharmony_ci#ifdef CONFIG_MEMCG
345862306a36Sopenharmony_ci	mem_cgroup_put(mm->lru_gen.memcg);
345962306a36Sopenharmony_ci	mm->lru_gen.memcg = NULL;
346062306a36Sopenharmony_ci#endif
346162306a36Sopenharmony_ci}
346262306a36Sopenharmony_ci
346362306a36Sopenharmony_ci#ifdef CONFIG_MEMCG
346462306a36Sopenharmony_civoid lru_gen_migrate_mm(struct mm_struct *mm)
346562306a36Sopenharmony_ci{
346662306a36Sopenharmony_ci	struct mem_cgroup *memcg;
346762306a36Sopenharmony_ci	struct task_struct *task = rcu_dereference_protected(mm->owner, true);
346862306a36Sopenharmony_ci
346962306a36Sopenharmony_ci	VM_WARN_ON_ONCE(task->mm != mm);
347062306a36Sopenharmony_ci	lockdep_assert_held(&task->alloc_lock);
347162306a36Sopenharmony_ci
347262306a36Sopenharmony_ci	/* for mm_update_next_owner() */
347362306a36Sopenharmony_ci	if (mem_cgroup_disabled())
347462306a36Sopenharmony_ci		return;
347562306a36Sopenharmony_ci
347662306a36Sopenharmony_ci	/* migration can happen before addition */
347762306a36Sopenharmony_ci	if (!mm->lru_gen.memcg)
347862306a36Sopenharmony_ci		return;
347962306a36Sopenharmony_ci
348062306a36Sopenharmony_ci	rcu_read_lock();
348162306a36Sopenharmony_ci	memcg = mem_cgroup_from_task(task);
348262306a36Sopenharmony_ci	rcu_read_unlock();
348362306a36Sopenharmony_ci	if (memcg == mm->lru_gen.memcg)
348462306a36Sopenharmony_ci		return;
348562306a36Sopenharmony_ci
348662306a36Sopenharmony_ci	VM_WARN_ON_ONCE(list_empty(&mm->lru_gen.list));
348762306a36Sopenharmony_ci
348862306a36Sopenharmony_ci	lru_gen_del_mm(mm);
348962306a36Sopenharmony_ci	lru_gen_add_mm(mm);
349062306a36Sopenharmony_ci}
349162306a36Sopenharmony_ci#endif
349262306a36Sopenharmony_ci
349362306a36Sopenharmony_cistatic void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, bool last)
349462306a36Sopenharmony_ci{
349562306a36Sopenharmony_ci	int i;
349662306a36Sopenharmony_ci	int hist;
349762306a36Sopenharmony_ci
349862306a36Sopenharmony_ci	lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock);
349962306a36Sopenharmony_ci
350062306a36Sopenharmony_ci	if (walk) {
350162306a36Sopenharmony_ci		hist = lru_hist_from_seq(walk->max_seq);
350262306a36Sopenharmony_ci
350362306a36Sopenharmony_ci		for (i = 0; i < NR_MM_STATS; i++) {
350462306a36Sopenharmony_ci			WRITE_ONCE(lruvec->mm_state.stats[hist][i],
350562306a36Sopenharmony_ci				   lruvec->mm_state.stats[hist][i] + walk->mm_stats[i]);
350662306a36Sopenharmony_ci			walk->mm_stats[i] = 0;
350762306a36Sopenharmony_ci		}
350862306a36Sopenharmony_ci	}
350962306a36Sopenharmony_ci
351062306a36Sopenharmony_ci	if (NR_HIST_GENS > 1 && last) {
351162306a36Sopenharmony_ci		hist = lru_hist_from_seq(lruvec->mm_state.seq + 1);
351262306a36Sopenharmony_ci
351362306a36Sopenharmony_ci		for (i = 0; i < NR_MM_STATS; i++)
351462306a36Sopenharmony_ci			WRITE_ONCE(lruvec->mm_state.stats[hist][i], 0);
351562306a36Sopenharmony_ci	}
351662306a36Sopenharmony_ci}
351762306a36Sopenharmony_ci
351862306a36Sopenharmony_cistatic bool should_skip_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk)
351962306a36Sopenharmony_ci{
352062306a36Sopenharmony_ci	int type;
352162306a36Sopenharmony_ci	unsigned long size = 0;
352262306a36Sopenharmony_ci	struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
352362306a36Sopenharmony_ci	int key = pgdat->node_id % BITS_PER_TYPE(mm->lru_gen.bitmap);
352462306a36Sopenharmony_ci
352562306a36Sopenharmony_ci	if (!walk->force_scan && !test_bit(key, &mm->lru_gen.bitmap))
352662306a36Sopenharmony_ci		return true;
352762306a36Sopenharmony_ci
352862306a36Sopenharmony_ci	clear_bit(key, &mm->lru_gen.bitmap);
352962306a36Sopenharmony_ci
353062306a36Sopenharmony_ci	for (type = !walk->can_swap; type < ANON_AND_FILE; type++) {
353162306a36Sopenharmony_ci		size += type ? get_mm_counter(mm, MM_FILEPAGES) :
353262306a36Sopenharmony_ci			       get_mm_counter(mm, MM_ANONPAGES) +
353362306a36Sopenharmony_ci			       get_mm_counter(mm, MM_SHMEMPAGES);
353462306a36Sopenharmony_ci	}
353562306a36Sopenharmony_ci
353662306a36Sopenharmony_ci	if (size < MIN_LRU_BATCH)
353762306a36Sopenharmony_ci		return true;
353862306a36Sopenharmony_ci
353962306a36Sopenharmony_ci	return !mmget_not_zero(mm);
354062306a36Sopenharmony_ci}
354162306a36Sopenharmony_ci
354262306a36Sopenharmony_cistatic bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk,
354362306a36Sopenharmony_ci			    struct mm_struct **iter)
354462306a36Sopenharmony_ci{
354562306a36Sopenharmony_ci	bool first = false;
354662306a36Sopenharmony_ci	bool last = false;
354762306a36Sopenharmony_ci	struct mm_struct *mm = NULL;
354862306a36Sopenharmony_ci	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
354962306a36Sopenharmony_ci	struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
355062306a36Sopenharmony_ci	struct lru_gen_mm_state *mm_state = &lruvec->mm_state;
355162306a36Sopenharmony_ci
355262306a36Sopenharmony_ci	/*
355362306a36Sopenharmony_ci	 * mm_state->seq is incremented after each iteration of mm_list. There
355462306a36Sopenharmony_ci	 * are three interesting cases for this page table walker:
355562306a36Sopenharmony_ci	 * 1. It tries to start a new iteration with a stale max_seq: there is
355662306a36Sopenharmony_ci	 *    nothing left to do.
355762306a36Sopenharmony_ci	 * 2. It started the next iteration: it needs to reset the Bloom filter
355862306a36Sopenharmony_ci	 *    so that a fresh set of PTE tables can be recorded.
355962306a36Sopenharmony_ci	 * 3. It ended the current iteration: it needs to reset the mm stats
356062306a36Sopenharmony_ci	 *    counters and tell its caller to increment max_seq.
356162306a36Sopenharmony_ci	 */
356262306a36Sopenharmony_ci	spin_lock(&mm_list->lock);
356362306a36Sopenharmony_ci
356462306a36Sopenharmony_ci	VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->max_seq);
356562306a36Sopenharmony_ci
356662306a36Sopenharmony_ci	if (walk->max_seq <= mm_state->seq)
356762306a36Sopenharmony_ci		goto done;
356862306a36Sopenharmony_ci
356962306a36Sopenharmony_ci	if (!mm_state->head)
357062306a36Sopenharmony_ci		mm_state->head = &mm_list->fifo;
357162306a36Sopenharmony_ci
357262306a36Sopenharmony_ci	if (mm_state->head == &mm_list->fifo)
357362306a36Sopenharmony_ci		first = true;
357462306a36Sopenharmony_ci
357562306a36Sopenharmony_ci	do {
357662306a36Sopenharmony_ci		mm_state->head = mm_state->head->next;
357762306a36Sopenharmony_ci		if (mm_state->head == &mm_list->fifo) {
357862306a36Sopenharmony_ci			WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
357962306a36Sopenharmony_ci			last = true;
358062306a36Sopenharmony_ci			break;
358162306a36Sopenharmony_ci		}
358262306a36Sopenharmony_ci
358362306a36Sopenharmony_ci		/* force scan for those added after the last iteration */
358462306a36Sopenharmony_ci		if (!mm_state->tail || mm_state->tail == mm_state->head) {
358562306a36Sopenharmony_ci			mm_state->tail = mm_state->head->next;
358662306a36Sopenharmony_ci			walk->force_scan = true;
358762306a36Sopenharmony_ci		}
358862306a36Sopenharmony_ci
358962306a36Sopenharmony_ci		mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list);
359062306a36Sopenharmony_ci		if (should_skip_mm(mm, walk))
359162306a36Sopenharmony_ci			mm = NULL;
359262306a36Sopenharmony_ci	} while (!mm);
359362306a36Sopenharmony_cidone:
359462306a36Sopenharmony_ci	if (*iter || last)
359562306a36Sopenharmony_ci		reset_mm_stats(lruvec, walk, last);
359662306a36Sopenharmony_ci
359762306a36Sopenharmony_ci	spin_unlock(&mm_list->lock);
359862306a36Sopenharmony_ci
359962306a36Sopenharmony_ci	if (mm && first)
360062306a36Sopenharmony_ci		reset_bloom_filter(lruvec, walk->max_seq + 1);
360162306a36Sopenharmony_ci
360262306a36Sopenharmony_ci	if (*iter)
360362306a36Sopenharmony_ci		mmput_async(*iter);
360462306a36Sopenharmony_ci
360562306a36Sopenharmony_ci	*iter = mm;
360662306a36Sopenharmony_ci
360762306a36Sopenharmony_ci	return last;
360862306a36Sopenharmony_ci}
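/*
 * A sketch of the expected caller pattern (illustrative, not a verbatim
 * copy of the aging code elsewhere in this file):
 *
 *	struct mm_struct *mm = NULL;
 *
 *	do {
 *		last = iterate_mm_list(lruvec, walk, &mm);
 *		if (mm)
 *			walk_mm(lruvec, mm, walk);
 *	} while (mm);
 *
 * iterate_mm_list() releases the previous mm and returns the next one
 * pinned, so the loop needs no explicit mmput().
 */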
360962306a36Sopenharmony_ci
361062306a36Sopenharmony_cistatic bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq)
361162306a36Sopenharmony_ci{
361262306a36Sopenharmony_ci	bool success = false;
361362306a36Sopenharmony_ci	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
361462306a36Sopenharmony_ci	struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
361562306a36Sopenharmony_ci	struct lru_gen_mm_state *mm_state = &lruvec->mm_state;
361662306a36Sopenharmony_ci
361762306a36Sopenharmony_ci	spin_lock(&mm_list->lock);
361862306a36Sopenharmony_ci
361962306a36Sopenharmony_ci	VM_WARN_ON_ONCE(mm_state->seq + 1 < max_seq);
362062306a36Sopenharmony_ci
362162306a36Sopenharmony_ci	if (max_seq > mm_state->seq) {
362262306a36Sopenharmony_ci		mm_state->head = NULL;
362362306a36Sopenharmony_ci		mm_state->tail = NULL;
362462306a36Sopenharmony_ci		WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
362562306a36Sopenharmony_ci		reset_mm_stats(lruvec, NULL, true);
362662306a36Sopenharmony_ci		success = true;
362762306a36Sopenharmony_ci	}
362862306a36Sopenharmony_ci
362962306a36Sopenharmony_ci	spin_unlock(&mm_list->lock);
363062306a36Sopenharmony_ci
363162306a36Sopenharmony_ci	return success;
363262306a36Sopenharmony_ci}
363362306a36Sopenharmony_ci
363462306a36Sopenharmony_ci/******************************************************************************
363562306a36Sopenharmony_ci *                          PID controller
363662306a36Sopenharmony_ci ******************************************************************************/
363762306a36Sopenharmony_ci
363862306a36Sopenharmony_ci/*
363962306a36Sopenharmony_ci * A feedback loop based on a Proportional-Integral-Derivative (PID) controller.
364062306a36Sopenharmony_ci *
364162306a36Sopenharmony_ci * The P term is refaulted/(evicted+protected) from a tier in the generation
364262306a36Sopenharmony_ci * currently being evicted; the I term is the exponential moving average of the
364362306a36Sopenharmony_ci * P term over the generations previously evicted, using the smoothing factor
364462306a36Sopenharmony_ci * 1/2; the D term isn't supported.
364562306a36Sopenharmony_ci *
364662306a36Sopenharmony_ci * The setpoint (SP) is always the first tier of one type; the process variable
364762306a36Sopenharmony_ci * (PV) is either any tier of the other type or any other tier of the same
364862306a36Sopenharmony_ci * type.
364962306a36Sopenharmony_ci *
365062306a36Sopenharmony_ci * The error is the difference between the SP and the PV; the correction is to
365162306a36Sopenharmony_ci * turn off protection when SP>PV or turn on protection when SP<PV.
365262306a36Sopenharmony_ci *
365362306a36Sopenharmony_ci * For future optimizations:
365462306a36Sopenharmony_ci * 1. The D term may discount the other two terms over time so that long-lived
365562306a36Sopenharmony_ci *    generations can resist stale information.
365662306a36Sopenharmony_ci */
365762306a36Sopenharmony_cistruct ctrl_pos {
365862306a36Sopenharmony_ci	unsigned long refaulted;
365962306a36Sopenharmony_ci	unsigned long total;
366062306a36Sopenharmony_ci	int gain;
366162306a36Sopenharmony_ci};
366262306a36Sopenharmony_ci
366362306a36Sopenharmony_cistatic void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain,
366462306a36Sopenharmony_ci			  struct ctrl_pos *pos)
366562306a36Sopenharmony_ci{
366662306a36Sopenharmony_ci	struct lru_gen_folio *lrugen = &lruvec->lrugen;
366762306a36Sopenharmony_ci	int hist = lru_hist_from_seq(lrugen->min_seq[type]);
366862306a36Sopenharmony_ci
366962306a36Sopenharmony_ci	pos->refaulted = lrugen->avg_refaulted[type][tier] +
367062306a36Sopenharmony_ci			 atomic_long_read(&lrugen->refaulted[hist][type][tier]);
367162306a36Sopenharmony_ci	pos->total = lrugen->avg_total[type][tier] +
367262306a36Sopenharmony_ci		     atomic_long_read(&lrugen->evicted[hist][type][tier]);
367362306a36Sopenharmony_ci	if (tier)
367462306a36Sopenharmony_ci		pos->total += lrugen->protected[hist][type][tier - 1];
367562306a36Sopenharmony_ci	pos->gain = gain;
367662306a36Sopenharmony_ci}
367762306a36Sopenharmony_ci
367862306a36Sopenharmony_cistatic void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover)
367962306a36Sopenharmony_ci{
368062306a36Sopenharmony_ci	int hist, tier;
368162306a36Sopenharmony_ci	struct lru_gen_folio *lrugen = &lruvec->lrugen;
368262306a36Sopenharmony_ci	bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1;
368362306a36Sopenharmony_ci	unsigned long seq = carryover ? lrugen->min_seq[type] : lrugen->max_seq + 1;
368462306a36Sopenharmony_ci
368562306a36Sopenharmony_ci	lockdep_assert_held(&lruvec->lru_lock);
368662306a36Sopenharmony_ci
368762306a36Sopenharmony_ci	if (!carryover && !clear)
368862306a36Sopenharmony_ci		return;
368962306a36Sopenharmony_ci
369062306a36Sopenharmony_ci	hist = lru_hist_from_seq(seq);
369162306a36Sopenharmony_ci
369262306a36Sopenharmony_ci	for (tier = 0; tier < MAX_NR_TIERS; tier++) {
369362306a36Sopenharmony_ci		if (carryover) {
369462306a36Sopenharmony_ci			unsigned long sum;
369562306a36Sopenharmony_ci
369662306a36Sopenharmony_ci			sum = lrugen->avg_refaulted[type][tier] +
369762306a36Sopenharmony_ci			      atomic_long_read(&lrugen->refaulted[hist][type][tier]);
369862306a36Sopenharmony_ci			WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2);
369962306a36Sopenharmony_ci
370062306a36Sopenharmony_ci			sum = lrugen->avg_total[type][tier] +
370162306a36Sopenharmony_ci			      atomic_long_read(&lrugen->evicted[hist][type][tier]);
370262306a36Sopenharmony_ci			if (tier)
370362306a36Sopenharmony_ci				sum += lrugen->protected[hist][type][tier - 1];
370462306a36Sopenharmony_ci			WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2);
370562306a36Sopenharmony_ci		}
370662306a36Sopenharmony_ci
370762306a36Sopenharmony_ci		if (clear) {
370862306a36Sopenharmony_ci			atomic_long_set(&lrugen->refaulted[hist][type][tier], 0);
370962306a36Sopenharmony_ci			atomic_long_set(&lrugen->evicted[hist][type][tier], 0);
371062306a36Sopenharmony_ci			if (tier)
371162306a36Sopenharmony_ci				WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 0);
371262306a36Sopenharmony_ci		}
371362306a36Sopenharmony_ci	}
371462306a36Sopenharmony_ci}
371562306a36Sopenharmony_ci
371662306a36Sopenharmony_cistatic bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv)
371762306a36Sopenharmony_ci{
371862306a36Sopenharmony_ci	/*
371962306a36Sopenharmony_ci	 * Return true if the PV has a limited number of refaults or a lower
372062306a36Sopenharmony_ci	 * refaulted/total than the SP.
372162306a36Sopenharmony_ci	 */
372262306a36Sopenharmony_ci	return pv->refaulted < MIN_LRU_BATCH ||
372362306a36Sopenharmony_ci	       pv->refaulted * (sp->total + MIN_LRU_BATCH) * sp->gain <=
372462306a36Sopenharmony_ci	       (sp->refaulted + 1) * pv->total * pv->gain;
372562306a36Sopenharmony_ci}
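/*
 * Worked example (made-up numbers, assuming MIN_LRU_BATCH == 64): with
 * sp = {10, 100, 1} and pv = {100, 2000, 1}, the first clause fails
 * (100 >= 64), but 100 * (100 + 64) * 1 = 16400 <= (10 + 1) * 2000 * 1
 * = 22000, so the error is positive: the PV refaults at ~5%, below the
 * SP's ~10%, and protection stays off.
 */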
372662306a36Sopenharmony_ci
372762306a36Sopenharmony_ci/******************************************************************************
372862306a36Sopenharmony_ci *                          the aging
372962306a36Sopenharmony_ci ******************************************************************************/
373062306a36Sopenharmony_ci
373162306a36Sopenharmony_ci/* promote pages accessed through page tables */
373262306a36Sopenharmony_cistatic int folio_update_gen(struct folio *folio, int gen)
373362306a36Sopenharmony_ci{
373462306a36Sopenharmony_ci	unsigned long new_flags, old_flags = READ_ONCE(folio->flags);
373562306a36Sopenharmony_ci
373662306a36Sopenharmony_ci	VM_WARN_ON_ONCE(gen >= MAX_NR_GENS);
373762306a36Sopenharmony_ci	VM_WARN_ON_ONCE(!rcu_read_lock_held());
373862306a36Sopenharmony_ci
373962306a36Sopenharmony_ci	do {
374062306a36Sopenharmony_ci		/* lru_gen_del_folio() has isolated this page? */
374162306a36Sopenharmony_ci		if (!(old_flags & LRU_GEN_MASK)) {
374262306a36Sopenharmony_ci			/* for shrink_folio_list() */
374362306a36Sopenharmony_ci			new_flags = old_flags | BIT(PG_referenced);
374462306a36Sopenharmony_ci			continue;
374562306a36Sopenharmony_ci		}
374662306a36Sopenharmony_ci
374762306a36Sopenharmony_ci		new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS);
374862306a36Sopenharmony_ci		new_flags |= (gen + 1UL) << LRU_GEN_PGOFF;
374962306a36Sopenharmony_ci	} while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
375062306a36Sopenharmony_ci
375162306a36Sopenharmony_ci	return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
375262306a36Sopenharmony_ci}
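/*
 * folio->flags stores gen + 1 in LRU_GEN_MASK so that 0 means "not on an
 * lru_gen list"; hence the +1/-1 dance above and the !(flags & LRU_GEN_MASK)
 * check for isolated folios.
 */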
375362306a36Sopenharmony_ci
375462306a36Sopenharmony_ci/* protect pages accessed multiple times through file descriptors */
375562306a36Sopenharmony_cistatic int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
375662306a36Sopenharmony_ci{
375762306a36Sopenharmony_ci	int type = folio_is_file_lru(folio);
375862306a36Sopenharmony_ci	struct lru_gen_folio *lrugen = &lruvec->lrugen;
375962306a36Sopenharmony_ci	int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
376062306a36Sopenharmony_ci	unsigned long new_flags, old_flags = READ_ONCE(folio->flags);
376162306a36Sopenharmony_ci
376262306a36Sopenharmony_ci	VM_WARN_ON_ONCE_FOLIO(!(old_flags & LRU_GEN_MASK), folio);
376362306a36Sopenharmony_ci
376462306a36Sopenharmony_ci	do {
376562306a36Sopenharmony_ci		new_gen = ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
376662306a36Sopenharmony_ci		/* folio_update_gen() has promoted this page? */
376762306a36Sopenharmony_ci		if (new_gen >= 0 && new_gen != old_gen)
376862306a36Sopenharmony_ci			return new_gen;
376962306a36Sopenharmony_ci
377062306a36Sopenharmony_ci		new_gen = (old_gen + 1) % MAX_NR_GENS;
377162306a36Sopenharmony_ci
377262306a36Sopenharmony_ci		new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS);
377362306a36Sopenharmony_ci		new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF;
377462306a36Sopenharmony_ci		/* for folio_end_writeback() */
377562306a36Sopenharmony_ci		if (reclaiming)
377662306a36Sopenharmony_ci			new_flags |= BIT(PG_reclaim);
377762306a36Sopenharmony_ci	} while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
377862306a36Sopenharmony_ci
377962306a36Sopenharmony_ci	lru_gen_update_size(lruvec, folio, old_gen, new_gen);
378062306a36Sopenharmony_ci
378162306a36Sopenharmony_ci	return new_gen;
378262306a36Sopenharmony_ci}
378362306a36Sopenharmony_ci
378462306a36Sopenharmony_cistatic void update_batch_size(struct lru_gen_mm_walk *walk, struct folio *folio,
378562306a36Sopenharmony_ci			      int old_gen, int new_gen)
378662306a36Sopenharmony_ci{
378762306a36Sopenharmony_ci	int type = folio_is_file_lru(folio);
378862306a36Sopenharmony_ci	int zone = folio_zonenum(folio);
378962306a36Sopenharmony_ci	int delta = folio_nr_pages(folio);
379062306a36Sopenharmony_ci
379162306a36Sopenharmony_ci	VM_WARN_ON_ONCE(old_gen >= MAX_NR_GENS);
379262306a36Sopenharmony_ci	VM_WARN_ON_ONCE(new_gen >= MAX_NR_GENS);
379362306a36Sopenharmony_ci
379462306a36Sopenharmony_ci	walk->batched++;
379562306a36Sopenharmony_ci
379662306a36Sopenharmony_ci	walk->nr_pages[old_gen][type][zone] -= delta;
379762306a36Sopenharmony_ci	walk->nr_pages[new_gen][type][zone] += delta;
379862306a36Sopenharmony_ci}
379962306a36Sopenharmony_ci
380062306a36Sopenharmony_cistatic void reset_batch_size(struct lruvec *lruvec, struct lru_gen_mm_walk *walk)
380162306a36Sopenharmony_ci{
380262306a36Sopenharmony_ci	int gen, type, zone;
380362306a36Sopenharmony_ci	struct lru_gen_folio *lrugen = &lruvec->lrugen;
380462306a36Sopenharmony_ci
380562306a36Sopenharmony_ci	walk->batched = 0;
380662306a36Sopenharmony_ci
380762306a36Sopenharmony_ci	for_each_gen_type_zone(gen, type, zone) {
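		/*
		 * In enum lru_list, LRU_INACTIVE_ANON == 0 and
		 * LRU_INACTIVE_FILE == 2, so type (0 for anon, 1 for file)
		 * times LRU_INACTIVE_FILE picks the matching inactive list;
		 * LRU_ACTIVE (== 1) is added below for active generations.
		 */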
380862306a36Sopenharmony_ci		enum lru_list lru = type * LRU_INACTIVE_FILE;
380962306a36Sopenharmony_ci		int delta = walk->nr_pages[gen][type][zone];
381062306a36Sopenharmony_ci
381162306a36Sopenharmony_ci		if (!delta)
381262306a36Sopenharmony_ci			continue;
381362306a36Sopenharmony_ci
381462306a36Sopenharmony_ci		walk->nr_pages[gen][type][zone] = 0;
381562306a36Sopenharmony_ci		WRITE_ONCE(lrugen->nr_pages[gen][type][zone],
381662306a36Sopenharmony_ci			   lrugen->nr_pages[gen][type][zone] + delta);
381762306a36Sopenharmony_ci
381862306a36Sopenharmony_ci		if (lru_gen_is_active(lruvec, gen))
381962306a36Sopenharmony_ci			lru += LRU_ACTIVE;
382062306a36Sopenharmony_ci		__update_lru_size(lruvec, lru, zone, delta);
382162306a36Sopenharmony_ci	}
382262306a36Sopenharmony_ci}
382362306a36Sopenharmony_ci
382462306a36Sopenharmony_cistatic int should_skip_vma(unsigned long start, unsigned long end, struct mm_walk *args)
382562306a36Sopenharmony_ci{
382662306a36Sopenharmony_ci	struct address_space *mapping;
382762306a36Sopenharmony_ci	struct vm_area_struct *vma = args->vma;
382862306a36Sopenharmony_ci	struct lru_gen_mm_walk *walk = args->private;
382962306a36Sopenharmony_ci
383062306a36Sopenharmony_ci	if (!vma_is_accessible(vma))
383162306a36Sopenharmony_ci		return true;
383262306a36Sopenharmony_ci
383362306a36Sopenharmony_ci	if (is_vm_hugetlb_page(vma))
383462306a36Sopenharmony_ci		return true;
383562306a36Sopenharmony_ci
383662306a36Sopenharmony_ci	if (!vma_has_recency(vma))
383762306a36Sopenharmony_ci		return true;
383862306a36Sopenharmony_ci
383962306a36Sopenharmony_ci	if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL))
384062306a36Sopenharmony_ci		return true;
384162306a36Sopenharmony_ci
384262306a36Sopenharmony_ci	if (vma == get_gate_vma(vma->vm_mm))
384362306a36Sopenharmony_ci		return true;
384462306a36Sopenharmony_ci
384562306a36Sopenharmony_ci	if (vma_is_anonymous(vma))
384662306a36Sopenharmony_ci		return !walk->can_swap;
384762306a36Sopenharmony_ci
384862306a36Sopenharmony_ci	if (WARN_ON_ONCE(!vma->vm_file || !vma->vm_file->f_mapping))
384962306a36Sopenharmony_ci		return true;
385062306a36Sopenharmony_ci
385162306a36Sopenharmony_ci	mapping = vma->vm_file->f_mapping;
385262306a36Sopenharmony_ci	if (mapping_unevictable(mapping))
385362306a36Sopenharmony_ci		return true;
385462306a36Sopenharmony_ci
385562306a36Sopenharmony_ci	if (shmem_mapping(mapping))
385662306a36Sopenharmony_ci		return !walk->can_swap;
385762306a36Sopenharmony_ci
385862306a36Sopenharmony_ci	/* to exclude special mappings like dax, etc. */
385962306a36Sopenharmony_ci	return !mapping->a_ops->read_folio;
386062306a36Sopenharmony_ci}
386162306a36Sopenharmony_ci
386262306a36Sopenharmony_ci/*
386362306a36Sopenharmony_ci * Some userspace memory allocators map many single-page VMAs. Instead of
386462306a36Sopenharmony_ci * returning to the PGD table for each such VMA, finish an entire PMD
386562306a36Sopenharmony_ci * table to reduce zigzags and improve cache performance.
386662306a36Sopenharmony_ci */
386762306a36Sopenharmony_cistatic bool get_next_vma(unsigned long mask, unsigned long size, struct mm_walk *args,
386862306a36Sopenharmony_ci			 unsigned long *vm_start, unsigned long *vm_end)
386962306a36Sopenharmony_ci{
387062306a36Sopenharmony_ci	unsigned long start = round_up(*vm_end, size);
387162306a36Sopenharmony_ci	unsigned long end = (start | ~mask) + 1;
387262306a36Sopenharmony_ci	VMA_ITERATOR(vmi, args->mm, start);
387362306a36Sopenharmony_ci
387462306a36Sopenharmony_ci	VM_WARN_ON_ONCE(mask & size);
387562306a36Sopenharmony_ci	VM_WARN_ON_ONCE((start & mask) != (*vm_start & mask));
387662306a36Sopenharmony_ci
387762306a36Sopenharmony_ci	for_each_vma(vmi, args->vma) {
387862306a36Sopenharmony_ci		if (end && end <= args->vma->vm_start)
387962306a36Sopenharmony_ci			return false;
388062306a36Sopenharmony_ci
388162306a36Sopenharmony_ci		if (should_skip_vma(args->vma->vm_start, args->vma->vm_end, args))
388262306a36Sopenharmony_ci			continue;
388362306a36Sopenharmony_ci
388462306a36Sopenharmony_ci		*vm_start = max(start, args->vma->vm_start);
388562306a36Sopenharmony_ci		*vm_end = min(end - 1, args->vma->vm_end - 1) + 1;
388662306a36Sopenharmony_ci
388762306a36Sopenharmony_ci		return true;
388862306a36Sopenharmony_ci	}
388962306a36Sopenharmony_ci
389062306a36Sopenharmony_ci	return false;
389162306a36Sopenharmony_ci}
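/*
 * Address arithmetic above, assuming x86-64 with 4KB pages and 2MB PMD
 * entries (illustrative): walk_pte_range() passes mask == PMD_MASK and
 * size == PAGE_SIZE, so with *vm_end == 0x201000:
 *   start = round_up(0x201000, 0x1000) = 0x201000
 *   end   = (0x201000 | ~PMD_MASK) + 1 = 0x400000
 * i.e. the walk resumes exactly where it stopped and is capped at the end
 * of the PTE table it is currently in.
 */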
389262306a36Sopenharmony_ci
389362306a36Sopenharmony_cistatic unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr)
389462306a36Sopenharmony_ci{
389562306a36Sopenharmony_ci	unsigned long pfn = pte_pfn(pte);
389662306a36Sopenharmony_ci
389762306a36Sopenharmony_ci	VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end);
389862306a36Sopenharmony_ci
389962306a36Sopenharmony_ci	if (!pte_present(pte) || is_zero_pfn(pfn))
390062306a36Sopenharmony_ci		return -1;
390162306a36Sopenharmony_ci
390262306a36Sopenharmony_ci	if (WARN_ON_ONCE(pte_devmap(pte) || pte_special(pte)))
390362306a36Sopenharmony_ci		return -1;
390462306a36Sopenharmony_ci
390562306a36Sopenharmony_ci	if (WARN_ON_ONCE(!pfn_valid(pfn)))
390662306a36Sopenharmony_ci		return -1;
390762306a36Sopenharmony_ci
390862306a36Sopenharmony_ci	return pfn;
390962306a36Sopenharmony_ci}
391062306a36Sopenharmony_ci
391162306a36Sopenharmony_ci#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
391262306a36Sopenharmony_cistatic unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr)
391362306a36Sopenharmony_ci{
391462306a36Sopenharmony_ci	unsigned long pfn = pmd_pfn(pmd);
391562306a36Sopenharmony_ci
391662306a36Sopenharmony_ci	VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end);
391762306a36Sopenharmony_ci
391862306a36Sopenharmony_ci	if (!pmd_present(pmd) || is_huge_zero_pmd(pmd))
391962306a36Sopenharmony_ci		return -1;
392062306a36Sopenharmony_ci
392162306a36Sopenharmony_ci	if (WARN_ON_ONCE(pmd_devmap(pmd)))
392262306a36Sopenharmony_ci		return -1;
392362306a36Sopenharmony_ci
392462306a36Sopenharmony_ci	if (WARN_ON_ONCE(!pfn_valid(pfn)))
392562306a36Sopenharmony_ci		return -1;
392662306a36Sopenharmony_ci
392762306a36Sopenharmony_ci	return pfn;
392862306a36Sopenharmony_ci}
392962306a36Sopenharmony_ci#endif
393062306a36Sopenharmony_ci
393162306a36Sopenharmony_cistatic struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg,
393262306a36Sopenharmony_ci				   struct pglist_data *pgdat, bool can_swap)
393362306a36Sopenharmony_ci{
393462306a36Sopenharmony_ci	struct folio *folio;
393562306a36Sopenharmony_ci
393662306a36Sopenharmony_ci	/* try to avoid unnecessary memory loads */
393762306a36Sopenharmony_ci	if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
393862306a36Sopenharmony_ci		return NULL;
393962306a36Sopenharmony_ci
394062306a36Sopenharmony_ci	folio = pfn_folio(pfn);
394162306a36Sopenharmony_ci	if (folio_nid(folio) != pgdat->node_id)
394262306a36Sopenharmony_ci		return NULL;
394362306a36Sopenharmony_ci
394462306a36Sopenharmony_ci	if (folio_memcg_rcu(folio) != memcg)
394562306a36Sopenharmony_ci		return NULL;
394662306a36Sopenharmony_ci
394762306a36Sopenharmony_ci	/* file VMAs can contain anon pages from COW */
394862306a36Sopenharmony_ci	if (!folio_is_file_lru(folio) && !can_swap)
394962306a36Sopenharmony_ci		return NULL;
395062306a36Sopenharmony_ci
395162306a36Sopenharmony_ci	return folio;
395262306a36Sopenharmony_ci}
395362306a36Sopenharmony_ci
395462306a36Sopenharmony_cistatic bool suitable_to_scan(int total, int young)
395562306a36Sopenharmony_ci{
395662306a36Sopenharmony_ci	int n = clamp_t(int, cache_line_size() / sizeof(pte_t), 2, 8);
395762306a36Sopenharmony_ci
395862306a36Sopenharmony_ci	/* suitable if the average number of young PTEs per cacheline is >=1 */
395962306a36Sopenharmony_ci	return young * n >= total;
396062306a36Sopenharmony_ci}
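/*
 * E.g. with 64-byte cachelines and 8-byte PTEs, n == 8: a batch of 512
 * scanned PTEs is suitable if at least 64 of them (one per cacheline on
 * average) were young.
 */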
396162306a36Sopenharmony_ci
396262306a36Sopenharmony_cistatic bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
396362306a36Sopenharmony_ci			   struct mm_walk *args)
396462306a36Sopenharmony_ci{
396562306a36Sopenharmony_ci	int i;
396662306a36Sopenharmony_ci	pte_t *pte;
396762306a36Sopenharmony_ci	spinlock_t *ptl;
396862306a36Sopenharmony_ci	unsigned long addr;
396962306a36Sopenharmony_ci	int total = 0;
397062306a36Sopenharmony_ci	int young = 0;
397162306a36Sopenharmony_ci	struct lru_gen_mm_walk *walk = args->private;
397262306a36Sopenharmony_ci	struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec);
397362306a36Sopenharmony_ci	struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
397462306a36Sopenharmony_ci	int old_gen, new_gen = lru_gen_from_seq(walk->max_seq);
397562306a36Sopenharmony_ci
397662306a36Sopenharmony_ci	pte = pte_offset_map_nolock(args->mm, pmd, start & PMD_MASK, &ptl);
397762306a36Sopenharmony_ci	if (!pte)
397862306a36Sopenharmony_ci		return false;
397962306a36Sopenharmony_ci	if (!spin_trylock(ptl)) {
398062306a36Sopenharmony_ci		pte_unmap(pte);
398162306a36Sopenharmony_ci		return false;
398262306a36Sopenharmony_ci	}
398362306a36Sopenharmony_ci
398462306a36Sopenharmony_ci	arch_enter_lazy_mmu_mode();
398562306a36Sopenharmony_cirestart:
398662306a36Sopenharmony_ci	for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) {
398762306a36Sopenharmony_ci		unsigned long pfn;
398862306a36Sopenharmony_ci		struct folio *folio;
398962306a36Sopenharmony_ci		pte_t ptent = ptep_get(pte + i);
399062306a36Sopenharmony_ci
399162306a36Sopenharmony_ci		total++;
399262306a36Sopenharmony_ci		walk->mm_stats[MM_LEAF_TOTAL]++;
399362306a36Sopenharmony_ci
399462306a36Sopenharmony_ci		pfn = get_pte_pfn(ptent, args->vma, addr);
399562306a36Sopenharmony_ci		if (pfn == -1)
399662306a36Sopenharmony_ci			continue;
399762306a36Sopenharmony_ci
399862306a36Sopenharmony_ci		if (!pte_young(ptent)) {
399962306a36Sopenharmony_ci			walk->mm_stats[MM_LEAF_OLD]++;
400062306a36Sopenharmony_ci			continue;
400162306a36Sopenharmony_ci		}
400262306a36Sopenharmony_ci
400362306a36Sopenharmony_ci		folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap);
400462306a36Sopenharmony_ci		if (!folio)
400562306a36Sopenharmony_ci			continue;
400662306a36Sopenharmony_ci
400762306a36Sopenharmony_ci		if (!ptep_test_and_clear_young(args->vma, addr, pte + i))
400862306a36Sopenharmony_ci			VM_WARN_ON_ONCE(true);
400962306a36Sopenharmony_ci
401062306a36Sopenharmony_ci		young++;
401162306a36Sopenharmony_ci		walk->mm_stats[MM_LEAF_YOUNG]++;
401262306a36Sopenharmony_ci
401362306a36Sopenharmony_ci		if (pte_dirty(ptent) && !folio_test_dirty(folio) &&
401462306a36Sopenharmony_ci		    !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
401562306a36Sopenharmony_ci		      !folio_test_swapcache(folio)))
401662306a36Sopenharmony_ci			folio_mark_dirty(folio);
401762306a36Sopenharmony_ci
401862306a36Sopenharmony_ci		old_gen = folio_update_gen(folio, new_gen);
401962306a36Sopenharmony_ci		if (old_gen >= 0 && old_gen != new_gen)
402062306a36Sopenharmony_ci			update_batch_size(walk, folio, old_gen, new_gen);
402162306a36Sopenharmony_ci	}
402262306a36Sopenharmony_ci
402362306a36Sopenharmony_ci	if (i < PTRS_PER_PTE && get_next_vma(PMD_MASK, PAGE_SIZE, args, &start, &end))
402462306a36Sopenharmony_ci		goto restart;
402562306a36Sopenharmony_ci
402662306a36Sopenharmony_ci	arch_leave_lazy_mmu_mode();
402762306a36Sopenharmony_ci	pte_unmap_unlock(pte, ptl);
402862306a36Sopenharmony_ci
402962306a36Sopenharmony_ci	return suitable_to_scan(total, young);
403062306a36Sopenharmony_ci}
403162306a36Sopenharmony_ci
403262306a36Sopenharmony_ci#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
403362306a36Sopenharmony_cistatic void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area_struct *vma,
403462306a36Sopenharmony_ci				  struct mm_walk *args, unsigned long *bitmap, unsigned long *first)
403562306a36Sopenharmony_ci{
403662306a36Sopenharmony_ci	int i;
403762306a36Sopenharmony_ci	pmd_t *pmd;
403862306a36Sopenharmony_ci	spinlock_t *ptl;
403962306a36Sopenharmony_ci	struct lru_gen_mm_walk *walk = args->private;
404062306a36Sopenharmony_ci	struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec);
404162306a36Sopenharmony_ci	struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
404262306a36Sopenharmony_ci	int old_gen, new_gen = lru_gen_from_seq(walk->max_seq);
404362306a36Sopenharmony_ci
404462306a36Sopenharmony_ci	VM_WARN_ON_ONCE(pud_leaf(*pud));
404562306a36Sopenharmony_ci
404662306a36Sopenharmony_ci	/* try to batch at most 1+MIN_LRU_BATCH+1 entries */
404762306a36Sopenharmony_ci	if (*first == -1) {
404862306a36Sopenharmony_ci		*first = addr;
404962306a36Sopenharmony_ci		bitmap_zero(bitmap, MIN_LRU_BATCH);
405062306a36Sopenharmony_ci		return;
405162306a36Sopenharmony_ci	}
405262306a36Sopenharmony_ci
405362306a36Sopenharmony_ci	i = addr == -1 ? 0 : pmd_index(addr) - pmd_index(*first);
405462306a36Sopenharmony_ci	if (i && i <= MIN_LRU_BATCH) {
405562306a36Sopenharmony_ci		__set_bit(i - 1, bitmap);
405662306a36Sopenharmony_ci		return;
405762306a36Sopenharmony_ci	}
405862306a36Sopenharmony_ci
405962306a36Sopenharmony_ci	pmd = pmd_offset(pud, *first);
406062306a36Sopenharmony_ci
406162306a36Sopenharmony_ci	ptl = pmd_lockptr(args->mm, pmd);
406262306a36Sopenharmony_ci	if (!spin_trylock(ptl))
406362306a36Sopenharmony_ci		goto done;
406462306a36Sopenharmony_ci
406562306a36Sopenharmony_ci	arch_enter_lazy_mmu_mode();
406662306a36Sopenharmony_ci
406762306a36Sopenharmony_ci	do {
406862306a36Sopenharmony_ci		unsigned long pfn;
406962306a36Sopenharmony_ci		struct folio *folio;
407062306a36Sopenharmony_ci
407162306a36Sopenharmony_ci		/* don't round down the first address */
407262306a36Sopenharmony_ci		addr = i ? (*first & PMD_MASK) + i * PMD_SIZE : *first;
407362306a36Sopenharmony_ci
407462306a36Sopenharmony_ci		pfn = get_pmd_pfn(pmd[i], vma, addr);
407562306a36Sopenharmony_ci		if (pfn == -1)
407662306a36Sopenharmony_ci			goto next;
407762306a36Sopenharmony_ci
407862306a36Sopenharmony_ci		if (!pmd_trans_huge(pmd[i])) {
407962306a36Sopenharmony_ci			if (should_clear_pmd_young())
408062306a36Sopenharmony_ci				pmdp_test_and_clear_young(vma, addr, pmd + i);
408162306a36Sopenharmony_ci			goto next;
408262306a36Sopenharmony_ci		}
408362306a36Sopenharmony_ci
408462306a36Sopenharmony_ci		folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap);
408562306a36Sopenharmony_ci		if (!folio)
408662306a36Sopenharmony_ci			goto next;
408762306a36Sopenharmony_ci
408862306a36Sopenharmony_ci		if (!pmdp_test_and_clear_young(vma, addr, pmd + i))
408962306a36Sopenharmony_ci			goto next;
409062306a36Sopenharmony_ci
409162306a36Sopenharmony_ci		walk->mm_stats[MM_LEAF_YOUNG]++;
409262306a36Sopenharmony_ci
409362306a36Sopenharmony_ci		if (pmd_dirty(pmd[i]) && !folio_test_dirty(folio) &&
409462306a36Sopenharmony_ci		    !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
409562306a36Sopenharmony_ci		      !folio_test_swapcache(folio)))
409662306a36Sopenharmony_ci			folio_mark_dirty(folio);
409762306a36Sopenharmony_ci
409862306a36Sopenharmony_ci		old_gen = folio_update_gen(folio, new_gen);
409962306a36Sopenharmony_ci		if (old_gen >= 0 && old_gen != new_gen)
410062306a36Sopenharmony_ci			update_batch_size(walk, folio, old_gen, new_gen);
410162306a36Sopenharmony_cinext:
410262306a36Sopenharmony_ci		i = i > MIN_LRU_BATCH ? 0 : find_next_bit(bitmap, MIN_LRU_BATCH, i) + 1;
410362306a36Sopenharmony_ci	} while (i <= MIN_LRU_BATCH);
410462306a36Sopenharmony_ci
410562306a36Sopenharmony_ci	arch_leave_lazy_mmu_mode();
410662306a36Sopenharmony_ci	spin_unlock(ptl);
410762306a36Sopenharmony_cidone:
410862306a36Sopenharmony_ci	*first = -1;
410962306a36Sopenharmony_ci}
411062306a36Sopenharmony_ci#else
411162306a36Sopenharmony_cistatic void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area_struct *vma,
411262306a36Sopenharmony_ci				  struct mm_walk *args, unsigned long *bitmap, unsigned long *first)
411362306a36Sopenharmony_ci{
411462306a36Sopenharmony_ci}
411562306a36Sopenharmony_ci#endif
411662306a36Sopenharmony_ci
411762306a36Sopenharmony_cistatic void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
411862306a36Sopenharmony_ci			   struct mm_walk *args)
411962306a36Sopenharmony_ci{
412062306a36Sopenharmony_ci	int i;
412162306a36Sopenharmony_ci	pmd_t *pmd;
412262306a36Sopenharmony_ci	unsigned long next;
412362306a36Sopenharmony_ci	unsigned long addr;
412462306a36Sopenharmony_ci	struct vm_area_struct *vma;
412562306a36Sopenharmony_ci	DECLARE_BITMAP(bitmap, MIN_LRU_BATCH);
412662306a36Sopenharmony_ci	unsigned long first = -1;
412762306a36Sopenharmony_ci	struct lru_gen_mm_walk *walk = args->private;
412862306a36Sopenharmony_ci
412962306a36Sopenharmony_ci	VM_WARN_ON_ONCE(pud_leaf(*pud));
413062306a36Sopenharmony_ci
413162306a36Sopenharmony_ci	/*
413262306a36Sopenharmony_ci	 * Finish an entire PMD in two passes: the first pass only descends to
413362306a36Sopenharmony_ci	 * PTE tables, avoiding the PMD lock; the second, if necessary, takes
413462306a36Sopenharmony_ci	 * the PMD lock to clear the accessed bit in PMD entries.
413562306a36Sopenharmony_ci	 */
413662306a36Sopenharmony_ci	pmd = pmd_offset(pud, start & PUD_MASK);
413762306a36Sopenharmony_cirestart:
413862306a36Sopenharmony_ci	/* walk_pte_range() may call get_next_vma() */
413962306a36Sopenharmony_ci	vma = args->vma;
414062306a36Sopenharmony_ci	for (i = pmd_index(start), addr = start; addr != end; i++, addr = next) {
414162306a36Sopenharmony_ci		pmd_t val = pmdp_get_lockless(pmd + i);
414262306a36Sopenharmony_ci
414362306a36Sopenharmony_ci		next = pmd_addr_end(addr, end);
414462306a36Sopenharmony_ci
414562306a36Sopenharmony_ci		if (!pmd_present(val) || is_huge_zero_pmd(val)) {
414662306a36Sopenharmony_ci			walk->mm_stats[MM_LEAF_TOTAL]++;
414762306a36Sopenharmony_ci			continue;
414862306a36Sopenharmony_ci		}
414962306a36Sopenharmony_ci
415062306a36Sopenharmony_ci#ifdef CONFIG_TRANSPARENT_HUGEPAGE
415162306a36Sopenharmony_ci		if (pmd_trans_huge(val)) {
415262306a36Sopenharmony_ci			unsigned long pfn = pmd_pfn(val);
415362306a36Sopenharmony_ci			struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
415462306a36Sopenharmony_ci
415562306a36Sopenharmony_ci			walk->mm_stats[MM_LEAF_TOTAL]++;
415662306a36Sopenharmony_ci
415762306a36Sopenharmony_ci			if (!pmd_young(val)) {
415862306a36Sopenharmony_ci				walk->mm_stats[MM_LEAF_OLD]++;
415962306a36Sopenharmony_ci				continue;
416062306a36Sopenharmony_ci			}
416162306a36Sopenharmony_ci
416262306a36Sopenharmony_ci			/* try to avoid unnecessary memory loads */
416362306a36Sopenharmony_ci			if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
416462306a36Sopenharmony_ci				continue;
416562306a36Sopenharmony_ci
416662306a36Sopenharmony_ci			walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first);
416762306a36Sopenharmony_ci			continue;
416862306a36Sopenharmony_ci		}
416962306a36Sopenharmony_ci#endif
417062306a36Sopenharmony_ci		walk->mm_stats[MM_NONLEAF_TOTAL]++;
417162306a36Sopenharmony_ci
417262306a36Sopenharmony_ci		if (should_clear_pmd_young()) {
417362306a36Sopenharmony_ci			if (!pmd_young(val))
417462306a36Sopenharmony_ci				continue;
417562306a36Sopenharmony_ci
417662306a36Sopenharmony_ci			walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first);
417762306a36Sopenharmony_ci		}
417862306a36Sopenharmony_ci
417962306a36Sopenharmony_ci		if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i))
418062306a36Sopenharmony_ci			continue;
418162306a36Sopenharmony_ci
418262306a36Sopenharmony_ci		walk->mm_stats[MM_NONLEAF_FOUND]++;
418362306a36Sopenharmony_ci
418462306a36Sopenharmony_ci		if (!walk_pte_range(&val, addr, next, args))
418562306a36Sopenharmony_ci			continue;
418662306a36Sopenharmony_ci
418762306a36Sopenharmony_ci		walk->mm_stats[MM_NONLEAF_ADDED]++;
418862306a36Sopenharmony_ci
418962306a36Sopenharmony_ci		/* carry over to the next generation */
419062306a36Sopenharmony_ci		update_bloom_filter(walk->lruvec, walk->max_seq + 1, pmd + i);
419162306a36Sopenharmony_ci	}
419262306a36Sopenharmony_ci
419362306a36Sopenharmony_ci	walk_pmd_range_locked(pud, -1, vma, args, bitmap, &first);
419462306a36Sopenharmony_ci
419562306a36Sopenharmony_ci	if (i < PTRS_PER_PMD && get_next_vma(PUD_MASK, PMD_SIZE, args, &start, &end))
419662306a36Sopenharmony_ci		goto restart;
419762306a36Sopenharmony_ci}
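
/*
 * The iteration above steps with pmd_addr_end(), which advances to the next
 * PMD boundary unless the range ends first (pud_addr_end() below is
 * analogous). Roughly, assuming illustrative x86-64 constants (the real
 * macros are arch-defined):
 *
 *	#define PMD_SHIFT	21
 *	#define PMD_SIZE	(1UL << PMD_SHIFT)
 *	#define PMD_MASK	(~(PMD_SIZE - 1))
 *
 *	static unsigned long pmd_addr_end(unsigned long addr, unsigned long end)
 *	{
 *		unsigned long boundary = (addr + PMD_SIZE) & PMD_MASK;
 *
 *		return boundary - 1 < end - 1 ? boundary : end;
 *	}
 *
 * The "- 1" on both sides keeps the comparison correct when the boundary
 * wraps to 0 at the top of the address space.
 */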
419862306a36Sopenharmony_ci
419962306a36Sopenharmony_cistatic int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end,
420062306a36Sopenharmony_ci			  struct mm_walk *args)
420162306a36Sopenharmony_ci{
420262306a36Sopenharmony_ci	int i;
420362306a36Sopenharmony_ci	pud_t *pud;
420462306a36Sopenharmony_ci	unsigned long addr;
420562306a36Sopenharmony_ci	unsigned long next;
420662306a36Sopenharmony_ci	struct lru_gen_mm_walk *walk = args->private;
420762306a36Sopenharmony_ci
420862306a36Sopenharmony_ci	VM_WARN_ON_ONCE(p4d_leaf(*p4d));
420962306a36Sopenharmony_ci
421062306a36Sopenharmony_ci	pud = pud_offset(p4d, start & P4D_MASK);
421162306a36Sopenharmony_cirestart:
421262306a36Sopenharmony_ci	for (i = pud_index(start), addr = start; addr != end; i++, addr = next) {
421362306a36Sopenharmony_ci		pud_t val = READ_ONCE(pud[i]);
421462306a36Sopenharmony_ci
421562306a36Sopenharmony_ci		next = pud_addr_end(addr, end);
421662306a36Sopenharmony_ci
421762306a36Sopenharmony_ci		if (!pud_present(val) || WARN_ON_ONCE(pud_leaf(val)))
421862306a36Sopenharmony_ci			continue;
421962306a36Sopenharmony_ci
422062306a36Sopenharmony_ci		walk_pmd_range(&val, addr, next, args);
422162306a36Sopenharmony_ci
422262306a36Sopenharmony_ci		if (need_resched() || walk->batched >= MAX_LRU_BATCH) {
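			/* stop after this PUD; resume at the next boundary */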
422362306a36Sopenharmony_ci			end = (addr | ~PUD_MASK) + 1;
422462306a36Sopenharmony_ci			goto done;
422562306a36Sopenharmony_ci		}
422662306a36Sopenharmony_ci	}
422762306a36Sopenharmony_ci
422862306a36Sopenharmony_ci	if (i < PTRS_PER_PUD && get_next_vma(P4D_MASK, PUD_SIZE, args, &start, &end))
422962306a36Sopenharmony_ci		goto restart;
423062306a36Sopenharmony_ci
423162306a36Sopenharmony_ci	end = round_up(end, P4D_SIZE);
423262306a36Sopenharmony_cidone:
423362306a36Sopenharmony_ci	if (!end || !args->vma)
423462306a36Sopenharmony_ci		return 1;
423562306a36Sopenharmony_ci
423662306a36Sopenharmony_ci	walk->next_addr = max(end, args->vma->vm_start);
423762306a36Sopenharmony_ci
423862306a36Sopenharmony_ci	return -EAGAIN;
423962306a36Sopenharmony_ci}
424062306a36Sopenharmony_ci
424162306a36Sopenharmony_cistatic void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_mm_walk *walk)
424262306a36Sopenharmony_ci{
424362306a36Sopenharmony_ci	static const struct mm_walk_ops mm_walk_ops = {
424462306a36Sopenharmony_ci		.test_walk = should_skip_vma,
424562306a36Sopenharmony_ci		.p4d_entry = walk_pud_range,
424662306a36Sopenharmony_ci		.walk_lock = PGWALK_RDLOCK,
424762306a36Sopenharmony_ci	};
424862306a36Sopenharmony_ci
424962306a36Sopenharmony_ci	int err;
425062306a36Sopenharmony_ci	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
425162306a36Sopenharmony_ci
425262306a36Sopenharmony_ci	walk->next_addr = FIRST_USER_ADDRESS;
425362306a36Sopenharmony_ci
425462306a36Sopenharmony_ci	do {
425562306a36Sopenharmony_ci		DEFINE_MAX_SEQ(lruvec);
425662306a36Sopenharmony_ci
425762306a36Sopenharmony_ci		err = -EBUSY;
425862306a36Sopenharmony_ci
425962306a36Sopenharmony_ci		/* another thread might have called inc_max_seq() */
426062306a36Sopenharmony_ci		if (walk->max_seq != max_seq)
426162306a36Sopenharmony_ci			break;
426262306a36Sopenharmony_ci
426362306a36Sopenharmony_ci		/* folio_update_gen() requires stable folio_memcg() */
426462306a36Sopenharmony_ci		if (!mem_cgroup_trylock_pages(memcg))
426562306a36Sopenharmony_ci			break;
426662306a36Sopenharmony_ci
426762306a36Sopenharmony_ci		/* the caller might be holding the lock for write */
426862306a36Sopenharmony_ci		if (mmap_read_trylock(mm)) {
426962306a36Sopenharmony_ci			err = walk_page_range(mm, walk->next_addr, ULONG_MAX, &mm_walk_ops, walk);
427062306a36Sopenharmony_ci
427162306a36Sopenharmony_ci			mmap_read_unlock(mm);
427262306a36Sopenharmony_ci		}
427362306a36Sopenharmony_ci
427462306a36Sopenharmony_ci		mem_cgroup_unlock_pages();
427562306a36Sopenharmony_ci
427662306a36Sopenharmony_ci		if (walk->batched) {
427762306a36Sopenharmony_ci			spin_lock_irq(&lruvec->lru_lock);
427862306a36Sopenharmony_ci			reset_batch_size(lruvec, walk);
427962306a36Sopenharmony_ci			spin_unlock_irq(&lruvec->lru_lock);
428062306a36Sopenharmony_ci		}
428162306a36Sopenharmony_ci
428262306a36Sopenharmony_ci		cond_resched();
428362306a36Sopenharmony_ci	} while (err == -EAGAIN);
428462306a36Sopenharmony_ci}
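
/*
 * walk_pud_range() and walk_mm() above cooperate on a bounded-work protocol:
 * the walker gives up after MAX_LRU_BATCH batched updates or when a resched
 * is due, stashes the resume point in walk->next_addr and returns -EAGAIN;
 * walk_mm() then drops the locks, flushes the batch, reschedules and restarts
 * from that address. Reduced to its essentials, the loop looks like:
 *
 *	next_addr = FIRST_USER_ADDRESS;
 *	do {
 *		err = walk_from(next_addr);	// may set next_addr, -EAGAIN
 *		flush_batched_updates();
 *		cond_resched();
 *	} while (err == -EAGAIN);
 *
 * This bounds both lock hold times and scheduling latency regardless of the
 * size of the address space.
 */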
428562306a36Sopenharmony_ci
428662306a36Sopenharmony_cistatic struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat, bool force_alloc)
428762306a36Sopenharmony_ci{
428862306a36Sopenharmony_ci	struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk;
428962306a36Sopenharmony_ci
429062306a36Sopenharmony_ci	if (pgdat && current_is_kswapd()) {
429162306a36Sopenharmony_ci		VM_WARN_ON_ONCE(walk);
429262306a36Sopenharmony_ci
429362306a36Sopenharmony_ci		walk = &pgdat->mm_walk;
429462306a36Sopenharmony_ci	} else if (!walk && force_alloc) {
429562306a36Sopenharmony_ci		VM_WARN_ON_ONCE(current_is_kswapd());
429662306a36Sopenharmony_ci
429762306a36Sopenharmony_ci		walk = kzalloc(sizeof(*walk), __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
429862306a36Sopenharmony_ci	}
429962306a36Sopenharmony_ci
430062306a36Sopenharmony_ci	current->reclaim_state->mm_walk = walk;
430162306a36Sopenharmony_ci
430262306a36Sopenharmony_ci	return walk;
430362306a36Sopenharmony_ci}
430462306a36Sopenharmony_ci
430562306a36Sopenharmony_cistatic void clear_mm_walk(void)
430662306a36Sopenharmony_ci{
430762306a36Sopenharmony_ci	struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk;
430862306a36Sopenharmony_ci
430962306a36Sopenharmony_ci	VM_WARN_ON_ONCE(walk && memchr_inv(walk->nr_pages, 0, sizeof(walk->nr_pages)));
431062306a36Sopenharmony_ci	VM_WARN_ON_ONCE(walk && memchr_inv(walk->mm_stats, 0, sizeof(walk->mm_stats)));
431162306a36Sopenharmony_ci
431262306a36Sopenharmony_ci	current->reclaim_state->mm_walk = NULL;
431362306a36Sopenharmony_ci
431462306a36Sopenharmony_ci	if (!current_is_kswapd())
431562306a36Sopenharmony_ci		kfree(walk);
431662306a36Sopenharmony_ci}
431762306a36Sopenharmony_ci
431862306a36Sopenharmony_cistatic bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
431962306a36Sopenharmony_ci{
432062306a36Sopenharmony_ci	int zone;
432162306a36Sopenharmony_ci	int remaining = MAX_LRU_BATCH;
432262306a36Sopenharmony_ci	struct lru_gen_folio *lrugen = &lruvec->lrugen;
432362306a36Sopenharmony_ci	int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
432462306a36Sopenharmony_ci
432562306a36Sopenharmony_ci	if (type == LRU_GEN_ANON && !can_swap)
432662306a36Sopenharmony_ci		goto done;
432762306a36Sopenharmony_ci
432862306a36Sopenharmony_ci	/* prevent cold/hot inversion if force_scan is true */
432962306a36Sopenharmony_ci	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
433062306a36Sopenharmony_ci		struct list_head *head = &lrugen->folios[old_gen][type][zone];
433162306a36Sopenharmony_ci
433262306a36Sopenharmony_ci		while (!list_empty(head)) {
433362306a36Sopenharmony_ci			struct folio *folio = lru_to_folio(head);
433462306a36Sopenharmony_ci
433562306a36Sopenharmony_ci			VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
433662306a36Sopenharmony_ci			VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
433762306a36Sopenharmony_ci			VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
433862306a36Sopenharmony_ci			VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);
433962306a36Sopenharmony_ci
434062306a36Sopenharmony_ci			new_gen = folio_inc_gen(lruvec, folio, false);
434162306a36Sopenharmony_ci			list_move_tail(&folio->lru, &lrugen->folios[new_gen][type][zone]);
434262306a36Sopenharmony_ci
434362306a36Sopenharmony_ci			if (!--remaining)
434462306a36Sopenharmony_ci				return false;
434562306a36Sopenharmony_ci		}
434662306a36Sopenharmony_ci	}
434762306a36Sopenharmony_cidone:
434862306a36Sopenharmony_ci	reset_ctrl_pos(lruvec, type, true);
434962306a36Sopenharmony_ci	WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1);
435062306a36Sopenharmony_ci
435162306a36Sopenharmony_ci	return true;
435262306a36Sopenharmony_ci}
435362306a36Sopenharmony_ci
435462306a36Sopenharmony_cistatic bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap)
435562306a36Sopenharmony_ci{
435662306a36Sopenharmony_ci	int gen, type, zone;
435762306a36Sopenharmony_ci	bool success = false;
435862306a36Sopenharmony_ci	struct lru_gen_folio *lrugen = &lruvec->lrugen;
435962306a36Sopenharmony_ci	DEFINE_MIN_SEQ(lruvec);
436062306a36Sopenharmony_ci
436162306a36Sopenharmony_ci	VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
436262306a36Sopenharmony_ci
436362306a36Sopenharmony_ci	/* find the oldest populated generation */
436462306a36Sopenharmony_ci	for (type = !can_swap; type < ANON_AND_FILE; type++) {
436562306a36Sopenharmony_ci		while (min_seq[type] + MIN_NR_GENS <= lrugen->max_seq) {
436662306a36Sopenharmony_ci			gen = lru_gen_from_seq(min_seq[type]);
436762306a36Sopenharmony_ci
436862306a36Sopenharmony_ci			for (zone = 0; zone < MAX_NR_ZONES; zone++) {
436962306a36Sopenharmony_ci				if (!list_empty(&lrugen->folios[gen][type][zone]))
437062306a36Sopenharmony_ci					goto next;
437162306a36Sopenharmony_ci			}
437262306a36Sopenharmony_ci
437362306a36Sopenharmony_ci			min_seq[type]++;
437462306a36Sopenharmony_ci		}
437562306a36Sopenharmony_cinext:
437662306a36Sopenharmony_ci		;
437762306a36Sopenharmony_ci	}
437862306a36Sopenharmony_ci
437962306a36Sopenharmony_ci	/* see the comment on lru_gen_folio */
438062306a36Sopenharmony_ci	if (can_swap) {
438162306a36Sopenharmony_ci		min_seq[LRU_GEN_ANON] = min(min_seq[LRU_GEN_ANON], min_seq[LRU_GEN_FILE]);
438262306a36Sopenharmony_ci		min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]);
438362306a36Sopenharmony_ci	}
438462306a36Sopenharmony_ci
438562306a36Sopenharmony_ci	for (type = !can_swap; type < ANON_AND_FILE; type++) {
438662306a36Sopenharmony_ci		if (min_seq[type] == lrugen->min_seq[type])
438762306a36Sopenharmony_ci			continue;
438862306a36Sopenharmony_ci
438962306a36Sopenharmony_ci		reset_ctrl_pos(lruvec, type, true);
439062306a36Sopenharmony_ci		WRITE_ONCE(lrugen->min_seq[type], min_seq[type]);
439162306a36Sopenharmony_ci		success = true;
439262306a36Sopenharmony_ci	}
439362306a36Sopenharmony_ci
439462306a36Sopenharmony_ci	return success;
439562306a36Sopenharmony_ci}
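
/*
 * For reference: generations form a sliding window [min_seq, max_seq] over a
 * ring of MAX_NR_GENS slots, and lru_gen_from_seq() is simply the modulo that
 * maps a sequence number onto its slot. Assuming MAX_NR_GENS == 4:
 *
 *	seq:  10  11  12  13	(min_seq == 10, max_seq == 13)
 *	gen:   2   3   0   1	(gen = seq % 4)
 *
 * Incrementing min_seq, as above, retires the coldest slot so that a later
 * inc_max_seq() can reuse it for the newest generation without the two ends
 * of the window ever overlapping.
 */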
439662306a36Sopenharmony_ci
439762306a36Sopenharmony_cistatic void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan)
439862306a36Sopenharmony_ci{
439962306a36Sopenharmony_ci	int prev, next;
440062306a36Sopenharmony_ci	int type, zone;
440162306a36Sopenharmony_ci	struct lru_gen_folio *lrugen = &lruvec->lrugen;
440262306a36Sopenharmony_cirestart:
440362306a36Sopenharmony_ci	spin_lock_irq(&lruvec->lru_lock);
440462306a36Sopenharmony_ci
440562306a36Sopenharmony_ci	VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
440662306a36Sopenharmony_ci
440762306a36Sopenharmony_ci	for (type = ANON_AND_FILE - 1; type >= 0; type--) {
440862306a36Sopenharmony_ci		if (get_nr_gens(lruvec, type) != MAX_NR_GENS)
440962306a36Sopenharmony_ci			continue;
441062306a36Sopenharmony_ci
441162306a36Sopenharmony_ci		VM_WARN_ON_ONCE(!force_scan && (type == LRU_GEN_FILE || can_swap));
441262306a36Sopenharmony_ci
441362306a36Sopenharmony_ci		if (inc_min_seq(lruvec, type, can_swap))
441462306a36Sopenharmony_ci			continue;
441562306a36Sopenharmony_ci
441662306a36Sopenharmony_ci		spin_unlock_irq(&lruvec->lru_lock);
441762306a36Sopenharmony_ci		cond_resched();
441862306a36Sopenharmony_ci		goto restart;
441962306a36Sopenharmony_ci	}
442062306a36Sopenharmony_ci
442162306a36Sopenharmony_ci	/*
442262306a36Sopenharmony_ci	 * Update the active/inactive LRU sizes for compatibility. Both sides of
442362306a36Sopenharmony_ci	 * the current max_seq need to be covered, since max_seq+1 can overlap
442462306a36Sopenharmony_ci	 * with min_seq[LRU_GEN_ANON] if swapping is constrained. And if they do
442562306a36Sopenharmony_ci	 * overlap, cold/hot inversion happens.
442662306a36Sopenharmony_ci	 */
442762306a36Sopenharmony_ci	prev = lru_gen_from_seq(lrugen->max_seq - 1);
442862306a36Sopenharmony_ci	next = lru_gen_from_seq(lrugen->max_seq + 1);
442962306a36Sopenharmony_ci
443062306a36Sopenharmony_ci	for (type = 0; type < ANON_AND_FILE; type++) {
443162306a36Sopenharmony_ci		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
443262306a36Sopenharmony_ci			enum lru_list lru = type * LRU_INACTIVE_FILE;
443362306a36Sopenharmony_ci			long delta = lrugen->nr_pages[prev][type][zone] -
443462306a36Sopenharmony_ci				     lrugen->nr_pages[next][type][zone];
443562306a36Sopenharmony_ci
443662306a36Sopenharmony_ci			if (!delta)
443762306a36Sopenharmony_ci				continue;
443862306a36Sopenharmony_ci
443962306a36Sopenharmony_ci			__update_lru_size(lruvec, lru, zone, delta);
444062306a36Sopenharmony_ci			__update_lru_size(lruvec, lru + LRU_ACTIVE, zone, -delta);
444162306a36Sopenharmony_ci		}
444262306a36Sopenharmony_ci	}
444362306a36Sopenharmony_ci
444462306a36Sopenharmony_ci	for (type = 0; type < ANON_AND_FILE; type++)
444562306a36Sopenharmony_ci		reset_ctrl_pos(lruvec, type, false);
444662306a36Sopenharmony_ci
444762306a36Sopenharmony_ci	WRITE_ONCE(lrugen->timestamps[next], jiffies);
444862306a36Sopenharmony_ci	/* make sure preceding modifications appear */
444962306a36Sopenharmony_ci	smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1);
445062306a36Sopenharmony_ci
445162306a36Sopenharmony_ci	spin_unlock_irq(&lruvec->lru_lock);
445262306a36Sopenharmony_ci}
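
/*
 * The smp_store_release() in inc_max_seq() publishes the new max_seq only
 * after the new timestamp and the reset stats are in place, so a reader that
 * loads max_seq with at least acquire semantics can never pair the new
 * sequence number with stale metadata. The publication idiom in miniature
 * (an acquire load is shown explicitly for clarity):
 *
 *	// writer
 *	timestamps[next] = jiffies;
 *	smp_store_release(&max_seq, max_seq + 1);
 *
 *	// reader
 *	seq = smp_load_acquire(&max_seq);
 *	birth = timestamps[lru_gen_from_seq(seq)];	// never stale for seq
 */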
445362306a36Sopenharmony_ci
445462306a36Sopenharmony_cistatic bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
445562306a36Sopenharmony_ci			       struct scan_control *sc, bool can_swap, bool force_scan)
445662306a36Sopenharmony_ci{
445762306a36Sopenharmony_ci	bool success;
445862306a36Sopenharmony_ci	struct lru_gen_mm_walk *walk;
445962306a36Sopenharmony_ci	struct mm_struct *mm = NULL;
446062306a36Sopenharmony_ci	struct lru_gen_folio *lrugen = &lruvec->lrugen;
446162306a36Sopenharmony_ci
446262306a36Sopenharmony_ci	VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq));
446362306a36Sopenharmony_ci
446462306a36Sopenharmony_ci	/* see the comment in iterate_mm_list() */
446562306a36Sopenharmony_ci	if (max_seq <= READ_ONCE(lruvec->mm_state.seq)) {
446662306a36Sopenharmony_ci		success = false;
446762306a36Sopenharmony_ci		goto done;
446862306a36Sopenharmony_ci	}
446962306a36Sopenharmony_ci
447062306a36Sopenharmony_ci	/*
447162306a36Sopenharmony_ci	 * If the hardware doesn't automatically set the accessed bit, fall
447262306a36Sopenharmony_ci	 * back to lru_gen_look_around(), which only clears the accessed bit
447362306a36Sopenharmony_ci	 * in a handful of PTEs. Spreading the work out over a period of time
447462306a36Sopenharmony_ci	 * is usually less efficient, but it avoids bursty page faults.
447562306a36Sopenharmony_ci	 */
447662306a36Sopenharmony_ci	if (!should_walk_mmu()) {
447762306a36Sopenharmony_ci		success = iterate_mm_list_nowalk(lruvec, max_seq);
447862306a36Sopenharmony_ci		goto done;
447962306a36Sopenharmony_ci	}
448062306a36Sopenharmony_ci
448162306a36Sopenharmony_ci	walk = set_mm_walk(NULL, true);
448262306a36Sopenharmony_ci	if (!walk) {
448362306a36Sopenharmony_ci		success = iterate_mm_list_nowalk(lruvec, max_seq);
448462306a36Sopenharmony_ci		goto done;
448562306a36Sopenharmony_ci	}
448662306a36Sopenharmony_ci
448762306a36Sopenharmony_ci	walk->lruvec = lruvec;
448862306a36Sopenharmony_ci	walk->max_seq = max_seq;
448962306a36Sopenharmony_ci	walk->can_swap = can_swap;
449062306a36Sopenharmony_ci	walk->force_scan = force_scan;
449162306a36Sopenharmony_ci
449262306a36Sopenharmony_ci	do {
449362306a36Sopenharmony_ci		success = iterate_mm_list(lruvec, walk, &mm);
449462306a36Sopenharmony_ci		if (mm)
449562306a36Sopenharmony_ci			walk_mm(lruvec, mm, walk);
449662306a36Sopenharmony_ci	} while (mm);
449762306a36Sopenharmony_cidone:
449862306a36Sopenharmony_ci	if (success)
449962306a36Sopenharmony_ci		inc_max_seq(lruvec, can_swap, force_scan);
450062306a36Sopenharmony_ci
450162306a36Sopenharmony_ci	return success;
450262306a36Sopenharmony_ci}
450362306a36Sopenharmony_ci
450462306a36Sopenharmony_ci/******************************************************************************
450562306a36Sopenharmony_ci *                          working set protection
450662306a36Sopenharmony_ci ******************************************************************************/
450762306a36Sopenharmony_ci
450862306a36Sopenharmony_cistatic bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc)
450962306a36Sopenharmony_ci{
451062306a36Sopenharmony_ci	int gen, type, zone;
451162306a36Sopenharmony_ci	unsigned long total = 0;
451262306a36Sopenharmony_ci	bool can_swap = get_swappiness(lruvec, sc);
451362306a36Sopenharmony_ci	struct lru_gen_folio *lrugen = &lruvec->lrugen;
451462306a36Sopenharmony_ci	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
451562306a36Sopenharmony_ci	DEFINE_MAX_SEQ(lruvec);
451662306a36Sopenharmony_ci	DEFINE_MIN_SEQ(lruvec);
451762306a36Sopenharmony_ci
451862306a36Sopenharmony_ci	for (type = !can_swap; type < ANON_AND_FILE; type++) {
451962306a36Sopenharmony_ci		unsigned long seq;
452062306a36Sopenharmony_ci
452162306a36Sopenharmony_ci		for (seq = min_seq[type]; seq <= max_seq; seq++) {
452262306a36Sopenharmony_ci			gen = lru_gen_from_seq(seq);
452362306a36Sopenharmony_ci
452462306a36Sopenharmony_ci			for (zone = 0; zone < MAX_NR_ZONES; zone++)
452562306a36Sopenharmony_ci				total += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
452662306a36Sopenharmony_ci		}
452762306a36Sopenharmony_ci	}
452862306a36Sopenharmony_ci
452962306a36Sopenharmony_ci	/* whether the size is big enough to be helpful */
453062306a36Sopenharmony_ci	return mem_cgroup_online(memcg) ? (total >> sc->priority) : total;
453162306a36Sopenharmony_ci}
453262306a36Sopenharmony_ci
453362306a36Sopenharmony_cistatic bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc,
453462306a36Sopenharmony_ci				  unsigned long min_ttl)
453562306a36Sopenharmony_ci{
453662306a36Sopenharmony_ci	int gen;
453762306a36Sopenharmony_ci	unsigned long birth;
453862306a36Sopenharmony_ci	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
453962306a36Sopenharmony_ci	DEFINE_MIN_SEQ(lruvec);
454062306a36Sopenharmony_ci
454162306a36Sopenharmony_ci	/* see the comment on lru_gen_folio */
454262306a36Sopenharmony_ci	gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
454362306a36Sopenharmony_ci	birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
454462306a36Sopenharmony_ci
454562306a36Sopenharmony_ci	if (time_is_after_jiffies(birth + min_ttl))
454662306a36Sopenharmony_ci		return false;
454762306a36Sopenharmony_ci
454862306a36Sopenharmony_ci	if (!lruvec_is_sizable(lruvec, sc))
454962306a36Sopenharmony_ci		return false;
455062306a36Sopenharmony_ci
455162306a36Sopenharmony_ci	mem_cgroup_calculate_protection(NULL, memcg);
455262306a36Sopenharmony_ci
455362306a36Sopenharmony_ci	return !mem_cgroup_below_min(NULL, memcg);
455462306a36Sopenharmony_ci}
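
/*
 * The min_ttl check above relies on time_is_after_jiffies(), which is safe
 * against jiffies wraparound because it compares with signed subtraction
 * rather than a direct "<":
 *
 *	#define time_after(a, b)	 ((long)((b) - (a)) < 0)
 *	#define time_is_after_jiffies(a) time_after(a, jiffies)
 *
 * So time_is_after_jiffies(birth + min_ttl) reads: the oldest generation was
 * born less than min_ttl jiffies ago, i.e. the lruvec is still within its
 * protected working-set window.
 */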
455562306a36Sopenharmony_ci
455662306a36Sopenharmony_ci/* to protect the working set of the last N jiffies */
455762306a36Sopenharmony_cistatic unsigned long lru_gen_min_ttl __read_mostly;
455862306a36Sopenharmony_ci
455962306a36Sopenharmony_cistatic void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
456062306a36Sopenharmony_ci{
456162306a36Sopenharmony_ci	struct mem_cgroup *memcg;
456262306a36Sopenharmony_ci	unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl);
456362306a36Sopenharmony_ci
456462306a36Sopenharmony_ci	VM_WARN_ON_ONCE(!current_is_kswapd());
456562306a36Sopenharmony_ci
456662306a36Sopenharmony_ci	/* check the order to exclude compaction-induced reclaim */
456762306a36Sopenharmony_ci	if (!min_ttl || sc->order || sc->priority == DEF_PRIORITY)
456862306a36Sopenharmony_ci		return;
456962306a36Sopenharmony_ci
457062306a36Sopenharmony_ci	memcg = mem_cgroup_iter(NULL, NULL, NULL);
457162306a36Sopenharmony_ci	do {
457262306a36Sopenharmony_ci		struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
457362306a36Sopenharmony_ci
457462306a36Sopenharmony_ci		if (lruvec_is_reclaimable(lruvec, sc, min_ttl)) {
457562306a36Sopenharmony_ci			mem_cgroup_iter_break(NULL, memcg);
457662306a36Sopenharmony_ci			return;
457762306a36Sopenharmony_ci		}
457862306a36Sopenharmony_ci
457962306a36Sopenharmony_ci		cond_resched();
458062306a36Sopenharmony_ci	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
458162306a36Sopenharmony_ci
458262306a36Sopenharmony_ci	/*
458362306a36Sopenharmony_ci	 * The main goal is to OOM kill if every generation from all memcgs is
458462306a36Sopenharmony_ci	 * younger than min_ttl. However, another possibility is that all
458562306a36Sopenharmony_ci	 * memcgs are either too small or below min.
458662306a36Sopenharmony_ci	 */
458762306a36Sopenharmony_ci	if (mutex_trylock(&oom_lock)) {
458862306a36Sopenharmony_ci		struct oom_control oc = {
458962306a36Sopenharmony_ci			.gfp_mask = sc->gfp_mask,
459062306a36Sopenharmony_ci		};
459162306a36Sopenharmony_ci
459262306a36Sopenharmony_ci		out_of_memory(&oc);
459362306a36Sopenharmony_ci
459462306a36Sopenharmony_ci		mutex_unlock(&oom_lock);
459562306a36Sopenharmony_ci	}
459662306a36Sopenharmony_ci}
459762306a36Sopenharmony_ci
459862306a36Sopenharmony_ci/******************************************************************************
459962306a36Sopenharmony_ci *                          rmap/PT walk feedback
460062306a36Sopenharmony_ci ******************************************************************************/
460162306a36Sopenharmony_ci
460262306a36Sopenharmony_ci/*
460362306a36Sopenharmony_ci * This function exploits spatial locality when shrink_folio_list() walks the
460462306a36Sopenharmony_ci * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. If
460562306a36Sopenharmony_ci * the scan was cacheline-efficient, it adds the PMD entry pointing to the
460662306a36Sopenharmony_ci * PTE table to the Bloom filter. This forms a feedback loop between the
460762306a36Sopenharmony_ci * eviction and the aging.
460862306a36Sopenharmony_ci */
460962306a36Sopenharmony_civoid lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
461062306a36Sopenharmony_ci{
461162306a36Sopenharmony_ci	int i;
461262306a36Sopenharmony_ci	unsigned long start;
461362306a36Sopenharmony_ci	unsigned long end;
461462306a36Sopenharmony_ci	struct lru_gen_mm_walk *walk;
461562306a36Sopenharmony_ci	int young = 0;
461662306a36Sopenharmony_ci	pte_t *pte = pvmw->pte;
461762306a36Sopenharmony_ci	unsigned long addr = pvmw->address;
461862306a36Sopenharmony_ci	struct vm_area_struct *vma = pvmw->vma;
461962306a36Sopenharmony_ci	struct folio *folio = pfn_folio(pvmw->pfn);
462062306a36Sopenharmony_ci	bool can_swap = !folio_is_file_lru(folio);
462162306a36Sopenharmony_ci	struct mem_cgroup *memcg = folio_memcg(folio);
462262306a36Sopenharmony_ci	struct pglist_data *pgdat = folio_pgdat(folio);
462362306a36Sopenharmony_ci	struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
462462306a36Sopenharmony_ci	DEFINE_MAX_SEQ(lruvec);
462562306a36Sopenharmony_ci	int old_gen, new_gen = lru_gen_from_seq(max_seq);
462662306a36Sopenharmony_ci
462762306a36Sopenharmony_ci	lockdep_assert_held(pvmw->ptl);
462862306a36Sopenharmony_ci	VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio);
462962306a36Sopenharmony_ci
463062306a36Sopenharmony_ci	if (spin_is_contended(pvmw->ptl))
463162306a36Sopenharmony_ci		return;
463262306a36Sopenharmony_ci
463362306a36Sopenharmony_ci	/* exclude special VMAs containing anon pages from COW */
463462306a36Sopenharmony_ci	if (vma->vm_flags & VM_SPECIAL)
463562306a36Sopenharmony_ci		return;
463662306a36Sopenharmony_ci
463762306a36Sopenharmony_ci	/* avoid taking the LRU lock under the PTL when possible */
463862306a36Sopenharmony_ci	walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL;
463962306a36Sopenharmony_ci
464062306a36Sopenharmony_ci	start = max(addr & PMD_MASK, vma->vm_start);
464162306a36Sopenharmony_ci	end = min(addr | ~PMD_MASK, vma->vm_end - 1) + 1;
464262306a36Sopenharmony_ci
464362306a36Sopenharmony_ci	if (end - start > MIN_LRU_BATCH * PAGE_SIZE) {
464462306a36Sopenharmony_ci		if (addr - start < MIN_LRU_BATCH * PAGE_SIZE / 2)
464562306a36Sopenharmony_ci			end = start + MIN_LRU_BATCH * PAGE_SIZE;
464662306a36Sopenharmony_ci		else if (end - addr < MIN_LRU_BATCH * PAGE_SIZE / 2)
464762306a36Sopenharmony_ci			start = end - MIN_LRU_BATCH * PAGE_SIZE;
464862306a36Sopenharmony_ci		else {
464962306a36Sopenharmony_ci			start = addr - MIN_LRU_BATCH * PAGE_SIZE / 2;
465062306a36Sopenharmony_ci			end = addr + MIN_LRU_BATCH * PAGE_SIZE / 2;
465162306a36Sopenharmony_ci		}
465262306a36Sopenharmony_ci	}
465362306a36Sopenharmony_ci
465462306a36Sopenharmony_ci	/* folio_update_gen() requires stable folio_memcg() */
465562306a36Sopenharmony_ci	if (!mem_cgroup_trylock_pages(memcg))
465662306a36Sopenharmony_ci		return;
465762306a36Sopenharmony_ci
465862306a36Sopenharmony_ci	arch_enter_lazy_mmu_mode();
465962306a36Sopenharmony_ci
466062306a36Sopenharmony_ci	pte -= (addr - start) / PAGE_SIZE;
466162306a36Sopenharmony_ci
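	/*
	 * After the rebase above, pte[i] maps start + i * PAGE_SIZE: if, say,
	 * pvmw->address is five pages above start, the base pointer moved
	 * back five entries so the loop can index the window from its first
	 * page.
	 */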
466262306a36Sopenharmony_ci	for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) {
466362306a36Sopenharmony_ci		unsigned long pfn;
466462306a36Sopenharmony_ci		pte_t ptent = ptep_get(pte + i);
466562306a36Sopenharmony_ci
466662306a36Sopenharmony_ci		pfn = get_pte_pfn(ptent, vma, addr);
466762306a36Sopenharmony_ci		if (pfn == -1)
466862306a36Sopenharmony_ci			continue;
466962306a36Sopenharmony_ci
467062306a36Sopenharmony_ci		if (!pte_young(ptent))
467162306a36Sopenharmony_ci			continue;
467262306a36Sopenharmony_ci
467362306a36Sopenharmony_ci		folio = get_pfn_folio(pfn, memcg, pgdat, can_swap);
467462306a36Sopenharmony_ci		if (!folio)
467562306a36Sopenharmony_ci			continue;
467662306a36Sopenharmony_ci
467762306a36Sopenharmony_ci		if (!ptep_test_and_clear_young(vma, addr, pte + i))
467862306a36Sopenharmony_ci			VM_WARN_ON_ONCE(true);
467962306a36Sopenharmony_ci
468062306a36Sopenharmony_ci		young++;
468162306a36Sopenharmony_ci
468262306a36Sopenharmony_ci		if (pte_dirty(ptent) && !folio_test_dirty(folio) &&
468362306a36Sopenharmony_ci		    !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
468462306a36Sopenharmony_ci		      !folio_test_swapcache(folio)))
468562306a36Sopenharmony_ci			folio_mark_dirty(folio);
468662306a36Sopenharmony_ci
468762306a36Sopenharmony_ci		if (walk) {
468862306a36Sopenharmony_ci			old_gen = folio_update_gen(folio, new_gen);
468962306a36Sopenharmony_ci			if (old_gen >= 0 && old_gen != new_gen)
469062306a36Sopenharmony_ci				update_batch_size(walk, folio, old_gen, new_gen);
469162306a36Sopenharmony_ci
469262306a36Sopenharmony_ci			continue;
469362306a36Sopenharmony_ci		}
469462306a36Sopenharmony_ci
469562306a36Sopenharmony_ci		old_gen = folio_lru_gen(folio);
469662306a36Sopenharmony_ci		if (old_gen < 0)
469762306a36Sopenharmony_ci			folio_set_referenced(folio);
469862306a36Sopenharmony_ci		else if (old_gen != new_gen)
469962306a36Sopenharmony_ci			folio_activate(folio);
470062306a36Sopenharmony_ci	}
470162306a36Sopenharmony_ci
470262306a36Sopenharmony_ci	arch_leave_lazy_mmu_mode();
470362306a36Sopenharmony_ci	mem_cgroup_unlock_pages();
470462306a36Sopenharmony_ci
470562306a36Sopenharmony_ci	/* feedback from rmap walkers to page table walkers */
470662306a36Sopenharmony_ci	if (suitable_to_scan(i, young))
470762306a36Sopenharmony_ci		update_bloom_filter(lruvec, max_seq, pvmw->pmd);
470862306a36Sopenharmony_ci}
470962306a36Sopenharmony_ci
471062306a36Sopenharmony_ci/******************************************************************************
471162306a36Sopenharmony_ci *                          memcg LRU
471262306a36Sopenharmony_ci ******************************************************************************/
471362306a36Sopenharmony_ci
471462306a36Sopenharmony_ci/* see the comment on MEMCG_NR_GENS */
471562306a36Sopenharmony_cienum {
471662306a36Sopenharmony_ci	MEMCG_LRU_NOP,
471762306a36Sopenharmony_ci	MEMCG_LRU_HEAD,
471862306a36Sopenharmony_ci	MEMCG_LRU_TAIL,
471962306a36Sopenharmony_ci	MEMCG_LRU_OLD,
472062306a36Sopenharmony_ci	MEMCG_LRU_YOUNG,
472162306a36Sopenharmony_ci};
472262306a36Sopenharmony_ci
472362306a36Sopenharmony_ci#ifdef CONFIG_MEMCG
472462306a36Sopenharmony_ci
472562306a36Sopenharmony_cistatic int lru_gen_memcg_seg(struct lruvec *lruvec)
472662306a36Sopenharmony_ci{
472762306a36Sopenharmony_ci	return READ_ONCE(lruvec->lrugen.seg);
472862306a36Sopenharmony_ci}
472962306a36Sopenharmony_ci
473062306a36Sopenharmony_cistatic void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
473162306a36Sopenharmony_ci{
473262306a36Sopenharmony_ci	int seg;
473362306a36Sopenharmony_ci	int old, new;
473462306a36Sopenharmony_ci	unsigned long flags;
473562306a36Sopenharmony_ci	int bin = get_random_u32_below(MEMCG_NR_BINS);
473662306a36Sopenharmony_ci	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
473762306a36Sopenharmony_ci
473862306a36Sopenharmony_ci	spin_lock_irqsave(&pgdat->memcg_lru.lock, flags);
473962306a36Sopenharmony_ci
474062306a36Sopenharmony_ci	VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
474162306a36Sopenharmony_ci
474262306a36Sopenharmony_ci	seg = 0;
474362306a36Sopenharmony_ci	new = old = lruvec->lrugen.gen;
474462306a36Sopenharmony_ci
474562306a36Sopenharmony_ci	/* see the comment on MEMCG_NR_GENS */
474662306a36Sopenharmony_ci	if (op == MEMCG_LRU_HEAD)
474762306a36Sopenharmony_ci		seg = MEMCG_LRU_HEAD;
474862306a36Sopenharmony_ci	else if (op == MEMCG_LRU_TAIL)
474962306a36Sopenharmony_ci		seg = MEMCG_LRU_TAIL;
475062306a36Sopenharmony_ci	else if (op == MEMCG_LRU_OLD)
475162306a36Sopenharmony_ci		new = get_memcg_gen(pgdat->memcg_lru.seq);
475262306a36Sopenharmony_ci	else if (op == MEMCG_LRU_YOUNG)
475362306a36Sopenharmony_ci		new = get_memcg_gen(pgdat->memcg_lru.seq + 1);
475462306a36Sopenharmony_ci	else
475562306a36Sopenharmony_ci		VM_WARN_ON_ONCE(true);
475662306a36Sopenharmony_ci
475762306a36Sopenharmony_ci	WRITE_ONCE(lruvec->lrugen.seg, seg);
475862306a36Sopenharmony_ci	WRITE_ONCE(lruvec->lrugen.gen, new);
475962306a36Sopenharmony_ci
476062306a36Sopenharmony_ci	hlist_nulls_del_rcu(&lruvec->lrugen.list);
476162306a36Sopenharmony_ci
476262306a36Sopenharmony_ci	if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD)
476362306a36Sopenharmony_ci		hlist_nulls_add_head_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
476462306a36Sopenharmony_ci	else
476562306a36Sopenharmony_ci		hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
476662306a36Sopenharmony_ci
476762306a36Sopenharmony_ci	pgdat->memcg_lru.nr_memcgs[old]--;
476862306a36Sopenharmony_ci	pgdat->memcg_lru.nr_memcgs[new]++;
476962306a36Sopenharmony_ci
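	/* advance the sequence once the oldest generation has fully drained */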
477062306a36Sopenharmony_ci	if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq))
477162306a36Sopenharmony_ci		WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
477262306a36Sopenharmony_ci
477362306a36Sopenharmony_ci	spin_unlock_irqrestore(&pgdat->memcg_lru.lock, flags);
477462306a36Sopenharmony_ci}
477562306a36Sopenharmony_ci
477662306a36Sopenharmony_civoid lru_gen_online_memcg(struct mem_cgroup *memcg)
477762306a36Sopenharmony_ci{
477862306a36Sopenharmony_ci	int gen;
477962306a36Sopenharmony_ci	int nid;
478062306a36Sopenharmony_ci	int bin = get_random_u32_below(MEMCG_NR_BINS);
478162306a36Sopenharmony_ci
478262306a36Sopenharmony_ci	for_each_node(nid) {
478362306a36Sopenharmony_ci		struct pglist_data *pgdat = NODE_DATA(nid);
478462306a36Sopenharmony_ci		struct lruvec *lruvec = get_lruvec(memcg, nid);
478562306a36Sopenharmony_ci
478662306a36Sopenharmony_ci		spin_lock_irq(&pgdat->memcg_lru.lock);
478762306a36Sopenharmony_ci
478862306a36Sopenharmony_ci		VM_WARN_ON_ONCE(!hlist_nulls_unhashed(&lruvec->lrugen.list));
478962306a36Sopenharmony_ci
479062306a36Sopenharmony_ci		gen = get_memcg_gen(pgdat->memcg_lru.seq);
479162306a36Sopenharmony_ci
479262306a36Sopenharmony_ci		lruvec->lrugen.gen = gen;
479362306a36Sopenharmony_ci
479462306a36Sopenharmony_ci		hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]);
479562306a36Sopenharmony_ci		pgdat->memcg_lru.nr_memcgs[gen]++;
479662306a36Sopenharmony_ci
479762306a36Sopenharmony_ci		spin_unlock_irq(&pgdat->memcg_lru.lock);
479862306a36Sopenharmony_ci	}
479962306a36Sopenharmony_ci}
480062306a36Sopenharmony_ci
480162306a36Sopenharmony_civoid lru_gen_offline_memcg(struct mem_cgroup *memcg)
480262306a36Sopenharmony_ci{
480362306a36Sopenharmony_ci	int nid;
480462306a36Sopenharmony_ci
480562306a36Sopenharmony_ci	for_each_node(nid) {
480662306a36Sopenharmony_ci		struct lruvec *lruvec = get_lruvec(memcg, nid);
480762306a36Sopenharmony_ci
480862306a36Sopenharmony_ci		lru_gen_rotate_memcg(lruvec, MEMCG_LRU_OLD);
480962306a36Sopenharmony_ci	}
481062306a36Sopenharmony_ci}
481162306a36Sopenharmony_ci
481262306a36Sopenharmony_civoid lru_gen_release_memcg(struct mem_cgroup *memcg)
481362306a36Sopenharmony_ci{
481462306a36Sopenharmony_ci	int gen;
481562306a36Sopenharmony_ci	int nid;
481662306a36Sopenharmony_ci
481762306a36Sopenharmony_ci	for_each_node(nid) {
481862306a36Sopenharmony_ci		struct pglist_data *pgdat = NODE_DATA(nid);
481962306a36Sopenharmony_ci		struct lruvec *lruvec = get_lruvec(memcg, nid);
482062306a36Sopenharmony_ci
482162306a36Sopenharmony_ci		spin_lock_irq(&pgdat->memcg_lru.lock);
482262306a36Sopenharmony_ci
482362306a36Sopenharmony_ci		if (hlist_nulls_unhashed(&lruvec->lrugen.list))
482462306a36Sopenharmony_ci			goto unlock;
482562306a36Sopenharmony_ci
482662306a36Sopenharmony_ci		gen = lruvec->lrugen.gen;
482762306a36Sopenharmony_ci
482862306a36Sopenharmony_ci		hlist_nulls_del_init_rcu(&lruvec->lrugen.list);
482962306a36Sopenharmony_ci		pgdat->memcg_lru.nr_memcgs[gen]--;
483062306a36Sopenharmony_ci
483162306a36Sopenharmony_ci		if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq))
483262306a36Sopenharmony_ci			WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
483362306a36Sopenharmony_ciunlock:
483462306a36Sopenharmony_ci		spin_unlock_irq(&pgdat->memcg_lru.lock);
483562306a36Sopenharmony_ci	}
483662306a36Sopenharmony_ci}
483762306a36Sopenharmony_ci
483862306a36Sopenharmony_civoid lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid)
483962306a36Sopenharmony_ci{
484062306a36Sopenharmony_ci	struct lruvec *lruvec = get_lruvec(memcg, nid);
484162306a36Sopenharmony_ci
484262306a36Sopenharmony_ci	/* see the comment on MEMCG_NR_GENS */
484362306a36Sopenharmony_ci	if (lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD)
484462306a36Sopenharmony_ci		lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD);
484562306a36Sopenharmony_ci}
484662306a36Sopenharmony_ci
484762306a36Sopenharmony_ci#else /* !CONFIG_MEMCG */
484862306a36Sopenharmony_ci
484962306a36Sopenharmony_cistatic int lru_gen_memcg_seg(struct lruvec *lruvec)
485062306a36Sopenharmony_ci{
485162306a36Sopenharmony_ci	return 0;
485262306a36Sopenharmony_ci}
485362306a36Sopenharmony_ci
485462306a36Sopenharmony_ci#endif
485562306a36Sopenharmony_ci
485662306a36Sopenharmony_ci/******************************************************************************
485762306a36Sopenharmony_ci *                          the eviction
485862306a36Sopenharmony_ci ******************************************************************************/
485962306a36Sopenharmony_ci
486062306a36Sopenharmony_cistatic bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_control *sc,
486162306a36Sopenharmony_ci		       int tier_idx)
486262306a36Sopenharmony_ci{
486362306a36Sopenharmony_ci	bool success;
486462306a36Sopenharmony_ci	int gen = folio_lru_gen(folio);
486562306a36Sopenharmony_ci	int type = folio_is_file_lru(folio);
486662306a36Sopenharmony_ci	int zone = folio_zonenum(folio);
486762306a36Sopenharmony_ci	int delta = folio_nr_pages(folio);
486862306a36Sopenharmony_ci	int refs = folio_lru_refs(folio);
486962306a36Sopenharmony_ci	int tier = lru_tier_from_refs(refs);
487062306a36Sopenharmony_ci	struct lru_gen_folio *lrugen = &lruvec->lrugen;
487162306a36Sopenharmony_ci
487262306a36Sopenharmony_ci	VM_WARN_ON_ONCE_FOLIO(gen >= MAX_NR_GENS, folio);
487362306a36Sopenharmony_ci
487462306a36Sopenharmony_ci	/* unevictable */
487562306a36Sopenharmony_ci	if (!folio_evictable(folio)) {
487662306a36Sopenharmony_ci		success = lru_gen_del_folio(lruvec, folio, true);
487762306a36Sopenharmony_ci		VM_WARN_ON_ONCE_FOLIO(!success, folio);
487862306a36Sopenharmony_ci		folio_set_unevictable(folio);
487962306a36Sopenharmony_ci		lruvec_add_folio(lruvec, folio);
488062306a36Sopenharmony_ci		__count_vm_events(UNEVICTABLE_PGCULLED, delta);
488162306a36Sopenharmony_ci		return true;
488262306a36Sopenharmony_ci	}
488362306a36Sopenharmony_ci
488462306a36Sopenharmony_ci	/* dirty lazyfree */
488562306a36Sopenharmony_ci	if (type == LRU_GEN_FILE && folio_test_anon(folio) && folio_test_dirty(folio)) {
488662306a36Sopenharmony_ci		success = lru_gen_del_folio(lruvec, folio, true);
488762306a36Sopenharmony_ci		VM_WARN_ON_ONCE_FOLIO(!success, folio);
488862306a36Sopenharmony_ci		folio_set_swapbacked(folio);
488962306a36Sopenharmony_ci		lruvec_add_folio_tail(lruvec, folio);
489062306a36Sopenharmony_ci		return true;
489162306a36Sopenharmony_ci	}
489262306a36Sopenharmony_ci
489362306a36Sopenharmony_ci	/* promoted */
489462306a36Sopenharmony_ci	if (gen != lru_gen_from_seq(lrugen->min_seq[type])) {
489562306a36Sopenharmony_ci		list_move(&folio->lru, &lrugen->folios[gen][type][zone]);
489662306a36Sopenharmony_ci		return true;
489762306a36Sopenharmony_ci	}
489862306a36Sopenharmony_ci
489962306a36Sopenharmony_ci	/* protected */
490062306a36Sopenharmony_ci	if (tier > tier_idx || refs == BIT(LRU_REFS_WIDTH)) {
490162306a36Sopenharmony_ci		int hist = lru_hist_from_seq(lrugen->min_seq[type]);
490262306a36Sopenharmony_ci
490362306a36Sopenharmony_ci		gen = folio_inc_gen(lruvec, folio, false);
490462306a36Sopenharmony_ci		list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]);
490562306a36Sopenharmony_ci
490662306a36Sopenharmony_ci		WRITE_ONCE(lrugen->protected[hist][type][tier - 1],
490762306a36Sopenharmony_ci			   lrugen->protected[hist][type][tier - 1] + delta);
490862306a36Sopenharmony_ci		return true;
490962306a36Sopenharmony_ci	}
491062306a36Sopenharmony_ci
491162306a36Sopenharmony_ci	/* ineligible */
491262306a36Sopenharmony_ci	if (zone > sc->reclaim_idx || skip_cma(folio, sc)) {
491362306a36Sopenharmony_ci		gen = folio_inc_gen(lruvec, folio, false);
491462306a36Sopenharmony_ci		list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]);
491562306a36Sopenharmony_ci		return true;
491662306a36Sopenharmony_ci	}
491762306a36Sopenharmony_ci
491862306a36Sopenharmony_ci	/* waiting for writeback */
491962306a36Sopenharmony_ci	if (folio_test_locked(folio) || folio_test_writeback(folio) ||
492062306a36Sopenharmony_ci	    (type == LRU_GEN_FILE && folio_test_dirty(folio))) {
492162306a36Sopenharmony_ci		gen = folio_inc_gen(lruvec, folio, true);
492262306a36Sopenharmony_ci		list_move(&folio->lru, &lrugen->folios[gen][type][zone]);
492362306a36Sopenharmony_ci		return true;
492462306a36Sopenharmony_ci	}
492562306a36Sopenharmony_ci
492662306a36Sopenharmony_ci	return false;
492762306a36Sopenharmony_ci}
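
/*
 * The tiers used by sort_folio() above group folios by accesses through file
 * descriptors. Assuming the logarithmic mapping lru_tier_from_refs(refs) ==
 * order_base_2(refs + 1), each tier covers roughly twice the reference count
 * of the previous one:
 *
 *	refs:	0	1	2-3	4-7	...
 *	tier:	0	1	2	3	...
 *
 * A folio is "protected" (moved up a generation) when its tier exceeds
 * tier_idx, i.e. when folios of its kind refault noticeably more often than
 * those in the first tier.
 */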
492862306a36Sopenharmony_ci
492962306a36Sopenharmony_cistatic bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct scan_control *sc)
493062306a36Sopenharmony_ci{
493162306a36Sopenharmony_ci	bool success;
493262306a36Sopenharmony_ci
493362306a36Sopenharmony_ci	/* swapping inhibited */
493462306a36Sopenharmony_ci	if (!(sc->gfp_mask & __GFP_IO) &&
493562306a36Sopenharmony_ci	    (folio_test_dirty(folio) ||
493662306a36Sopenharmony_ci	     (folio_test_anon(folio) && !folio_test_swapcache(folio))))
493762306a36Sopenharmony_ci		return false;
493862306a36Sopenharmony_ci
493962306a36Sopenharmony_ci	/* raced with release_pages() */
494062306a36Sopenharmony_ci	if (!folio_try_get(folio))
494162306a36Sopenharmony_ci		return false;
494262306a36Sopenharmony_ci
494362306a36Sopenharmony_ci	/* raced with another isolation */
494462306a36Sopenharmony_ci	if (!folio_test_clear_lru(folio)) {
494562306a36Sopenharmony_ci		folio_put(folio);
494662306a36Sopenharmony_ci		return false;
494762306a36Sopenharmony_ci	}
494862306a36Sopenharmony_ci
494962306a36Sopenharmony_ci	/* see the comment on MAX_NR_TIERS */
495062306a36Sopenharmony_ci	if (!folio_test_referenced(folio))
495162306a36Sopenharmony_ci		set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 0);
495262306a36Sopenharmony_ci
495362306a36Sopenharmony_ci	/* for shrink_folio_list() */
495462306a36Sopenharmony_ci	folio_clear_reclaim(folio);
495562306a36Sopenharmony_ci	folio_clear_referenced(folio);
495662306a36Sopenharmony_ci
495762306a36Sopenharmony_ci	success = lru_gen_del_folio(lruvec, folio, true);
495862306a36Sopenharmony_ci	VM_WARN_ON_ONCE_FOLIO(!success, folio);
495962306a36Sopenharmony_ci
496062306a36Sopenharmony_ci	return true;
496162306a36Sopenharmony_ci}
496262306a36Sopenharmony_ci
496362306a36Sopenharmony_cistatic int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
496462306a36Sopenharmony_ci		       int type, int tier, struct list_head *list)
496562306a36Sopenharmony_ci{
496662306a36Sopenharmony_ci	int i;
496762306a36Sopenharmony_ci	int gen;
496862306a36Sopenharmony_ci	enum vm_event_item item;
496962306a36Sopenharmony_ci	int sorted = 0;
497062306a36Sopenharmony_ci	int scanned = 0;
497162306a36Sopenharmony_ci	int isolated = 0;
497262306a36Sopenharmony_ci	int remaining = MAX_LRU_BATCH;
497362306a36Sopenharmony_ci	struct lru_gen_folio *lrugen = &lruvec->lrugen;
497462306a36Sopenharmony_ci	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
497562306a36Sopenharmony_ci
497662306a36Sopenharmony_ci	VM_WARN_ON_ONCE(!list_empty(list));
497762306a36Sopenharmony_ci
497862306a36Sopenharmony_ci	if (get_nr_gens(lruvec, type) == MIN_NR_GENS)
497962306a36Sopenharmony_ci		return 0;
498062306a36Sopenharmony_ci
498162306a36Sopenharmony_ci	gen = lru_gen_from_seq(lrugen->min_seq[type]);
498262306a36Sopenharmony_ci
498362306a36Sopenharmony_ci	for (i = MAX_NR_ZONES; i > 0; i--) {
498462306a36Sopenharmony_ci		LIST_HEAD(moved);
498562306a36Sopenharmony_ci		int skipped = 0;
498662306a36Sopenharmony_ci		int zone = (sc->reclaim_idx + i) % MAX_NR_ZONES;
498762306a36Sopenharmony_ci		struct list_head *head = &lrugen->folios[gen][type][zone];
498862306a36Sopenharmony_ci
498962306a36Sopenharmony_ci		while (!list_empty(head)) {
499062306a36Sopenharmony_ci			struct folio *folio = lru_to_folio(head);
499162306a36Sopenharmony_ci			int delta = folio_nr_pages(folio);
499262306a36Sopenharmony_ci
499362306a36Sopenharmony_ci			VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
499462306a36Sopenharmony_ci			VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
499562306a36Sopenharmony_ci			VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
499662306a36Sopenharmony_ci			VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);
499762306a36Sopenharmony_ci
499862306a36Sopenharmony_ci			scanned += delta;
499962306a36Sopenharmony_ci
500062306a36Sopenharmony_ci			if (sort_folio(lruvec, folio, sc, tier))
500162306a36Sopenharmony_ci				sorted += delta;
500262306a36Sopenharmony_ci			else if (isolate_folio(lruvec, folio, sc)) {
500362306a36Sopenharmony_ci				list_add(&folio->lru, list);
500462306a36Sopenharmony_ci				isolated += delta;
500562306a36Sopenharmony_ci			} else {
500662306a36Sopenharmony_ci				list_move(&folio->lru, &moved);
500762306a36Sopenharmony_ci				skipped += delta;
500862306a36Sopenharmony_ci			}
500962306a36Sopenharmony_ci
501062306a36Sopenharmony_ci			if (!--remaining || max(isolated, skipped) >= MIN_LRU_BATCH)
501162306a36Sopenharmony_ci				break;
501262306a36Sopenharmony_ci		}
501362306a36Sopenharmony_ci
501462306a36Sopenharmony_ci		if (skipped) {
501562306a36Sopenharmony_ci			list_splice(&moved, head);
501662306a36Sopenharmony_ci			__count_zid_vm_events(PGSCAN_SKIP, zone, skipped);
501762306a36Sopenharmony_ci		}
501862306a36Sopenharmony_ci
501962306a36Sopenharmony_ci		if (!remaining || isolated >= MIN_LRU_BATCH)
502062306a36Sopenharmony_ci			break;
502162306a36Sopenharmony_ci	}
502262306a36Sopenharmony_ci
502362306a36Sopenharmony_ci	item = PGSCAN_KSWAPD + reclaimer_offset();
502462306a36Sopenharmony_ci	if (!cgroup_reclaim(sc)) {
502562306a36Sopenharmony_ci		__count_vm_events(item, isolated);
502662306a36Sopenharmony_ci		__count_vm_events(PGREFILL, sorted);
502762306a36Sopenharmony_ci	}
502862306a36Sopenharmony_ci	__count_memcg_events(memcg, item, isolated);
502962306a36Sopenharmony_ci	__count_memcg_events(memcg, PGREFILL, sorted);
503062306a36Sopenharmony_ci	__count_vm_events(PGSCAN_ANON + type, isolated);
503162306a36Sopenharmony_ci
503262306a36Sopenharmony_ci	/*
503362306a36Sopenharmony_ci	 * There might not be any eligible folios due to reclaim_idx. Check
503462306a36Sopenharmony_ci	 * "remaining" to prevent a livelock when no progress is being made.
503562306a36Sopenharmony_ci	 */
503662306a36Sopenharmony_ci	return isolated || !remaining ? scanned : 0;
503762306a36Sopenharmony_ci}
503862306a36Sopenharmony_ci
503962306a36Sopenharmony_cistatic int get_tier_idx(struct lruvec *lruvec, int type)
504062306a36Sopenharmony_ci{
504162306a36Sopenharmony_ci	int tier;
504262306a36Sopenharmony_ci	struct ctrl_pos sp, pv;
504362306a36Sopenharmony_ci
504462306a36Sopenharmony_ci	/*
504562306a36Sopenharmony_ci	 * To leave a margin for fluctuations, use a larger gain factor (1:2).
504662306a36Sopenharmony_ci	 * This value is chosen because any other tier would have at least twice
504762306a36Sopenharmony_ci	 * as many refaults as the first tier.
504862306a36Sopenharmony_ci	 */
504962306a36Sopenharmony_ci	read_ctrl_pos(lruvec, type, 0, 1, &sp);
505062306a36Sopenharmony_ci	for (tier = 1; tier < MAX_NR_TIERS; tier++) {
505162306a36Sopenharmony_ci		read_ctrl_pos(lruvec, type, tier, 2, &pv);
505262306a36Sopenharmony_ci		if (!positive_ctrl_err(&sp, &pv))
505362306a36Sopenharmony_ci			break;
505462306a36Sopenharmony_ci	}
505562306a36Sopenharmony_ci
505662306a36Sopenharmony_ci	return tier - 1;
505762306a36Sopenharmony_ci}
505862306a36Sopenharmony_ci
505962306a36Sopenharmony_cistatic int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_idx)
506062306a36Sopenharmony_ci{
506162306a36Sopenharmony_ci	int type, tier;
506262306a36Sopenharmony_ci	struct ctrl_pos sp, pv;
506362306a36Sopenharmony_ci	int gain[ANON_AND_FILE] = { swappiness, 200 - swappiness };
506462306a36Sopenharmony_ci
506562306a36Sopenharmony_ci	/*
506662306a36Sopenharmony_ci	 * Compare the first tier of anon with that of file to determine which
506762306a36Sopenharmony_ci	 * type to scan. The other tiers of the selected type also need to be
506862306a36Sopenharmony_ci	 * compared with the first tier of the other type to determine the last
506962306a36Sopenharmony_ci	 * tier (of the selected type) to evict.
507062306a36Sopenharmony_ci	 */
507162306a36Sopenharmony_ci	read_ctrl_pos(lruvec, LRU_GEN_ANON, 0, gain[LRU_GEN_ANON], &sp);
507262306a36Sopenharmony_ci	read_ctrl_pos(lruvec, LRU_GEN_FILE, 0, gain[LRU_GEN_FILE], &pv);
507362306a36Sopenharmony_ci	type = positive_ctrl_err(&sp, &pv);
507462306a36Sopenharmony_ci
507562306a36Sopenharmony_ci	read_ctrl_pos(lruvec, !type, 0, gain[!type], &sp);
507662306a36Sopenharmony_ci	for (tier = 1; tier < MAX_NR_TIERS; tier++) {
507762306a36Sopenharmony_ci		read_ctrl_pos(lruvec, type, tier, gain[type], &pv);
507862306a36Sopenharmony_ci		if (!positive_ctrl_err(&sp, &pv))
507962306a36Sopenharmony_ci			break;
508062306a36Sopenharmony_ci	}
508162306a36Sopenharmony_ci
508262306a36Sopenharmony_ci	*tier_idx = tier - 1;
508362306a36Sopenharmony_ci
508462306a36Sopenharmony_ci	return type;
508562306a36Sopenharmony_ci}
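
/*
 * Both helpers above implement the feedback loop: read_ctrl_pos() snapshots
 * (refaulted, total, gain) for a tier, and positive_ctrl_err() compares two
 * snapshots by cross-multiplication to avoid divisions. A simplification of
 * the real predicate (which also smooths small sample sizes):
 *
 *	// keep evicting pv while its gain-weighted refault rate does not
 *	// exceed sp's, i.e. while
 *	//	refaulted_pv / total_pv * gain_sp <=
 *	//	refaulted_sp / total_sp * gain_pv
 *	return pv->refaulted * sp->total * sp->gain <=
 *	       sp->refaulted * pv->total * pv->gain;
 *
 * With sp being tier 0 at gain 1 and pv a higher tier at gain 2, eviction of
 * the higher tier stops once it refaults more than twice as often as tier 0.
 */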
508662306a36Sopenharmony_ci
508762306a36Sopenharmony_cistatic int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
508862306a36Sopenharmony_ci			  int *type_scanned, struct list_head *list)
508962306a36Sopenharmony_ci{
509062306a36Sopenharmony_ci	int i;
509162306a36Sopenharmony_ci	int type;
509262306a36Sopenharmony_ci	int scanned;
509362306a36Sopenharmony_ci	int tier = -1;
509462306a36Sopenharmony_ci	DEFINE_MIN_SEQ(lruvec);
509562306a36Sopenharmony_ci
509662306a36Sopenharmony_ci	/*
509762306a36Sopenharmony_ci	 * Try to make the obvious choice first. When anon and file are both
509862306a36Sopenharmony_ci	 * available from the same generation, interpret swappiness 1 as file
509962306a36Sopenharmony_ci	 * first and 200 as anon first.
510062306a36Sopenharmony_ci	 */
510162306a36Sopenharmony_ci	if (!swappiness)
510262306a36Sopenharmony_ci		type = LRU_GEN_FILE;
510362306a36Sopenharmony_ci	else if (min_seq[LRU_GEN_ANON] < min_seq[LRU_GEN_FILE])
510462306a36Sopenharmony_ci		type = LRU_GEN_ANON;
510562306a36Sopenharmony_ci	else if (swappiness == 1)
510662306a36Sopenharmony_ci		type = LRU_GEN_FILE;
510762306a36Sopenharmony_ci	else if (swappiness == 200)
510862306a36Sopenharmony_ci		type = LRU_GEN_ANON;
510962306a36Sopenharmony_ci	else
511062306a36Sopenharmony_ci		type = get_type_to_scan(lruvec, swappiness, &tier);
511162306a36Sopenharmony_ci
511262306a36Sopenharmony_ci	for (i = !swappiness; i < ANON_AND_FILE; i++) {
511362306a36Sopenharmony_ci		if (tier < 0)
511462306a36Sopenharmony_ci			tier = get_tier_idx(lruvec, type);
511562306a36Sopenharmony_ci
511662306a36Sopenharmony_ci		scanned = scan_folios(lruvec, sc, type, tier, list);
511762306a36Sopenharmony_ci		if (scanned)
511862306a36Sopenharmony_ci			break;
511962306a36Sopenharmony_ci
512062306a36Sopenharmony_ci		type = !type;
512162306a36Sopenharmony_ci		tier = -1;
512262306a36Sopenharmony_ci	}
512362306a36Sopenharmony_ci
512462306a36Sopenharmony_ci	*type_scanned = type;
512562306a36Sopenharmony_ci
512662306a36Sopenharmony_ci	return scanned;
512762306a36Sopenharmony_ci}
512862306a36Sopenharmony_ci
512962306a36Sopenharmony_cistatic int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
513062306a36Sopenharmony_ci{
513162306a36Sopenharmony_ci	int type;
513262306a36Sopenharmony_ci	int scanned;
	int reclaimed;
	LIST_HEAD(list);
	LIST_HEAD(clean);
	struct folio *folio;
	struct folio *next;
	enum vm_event_item item;
	struct reclaim_stat stat;
	struct lru_gen_mm_walk *walk;
	bool skip_retry = false;
	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
	struct pglist_data *pgdat = lruvec_pgdat(lruvec);

	spin_lock_irq(&lruvec->lru_lock);

	scanned = isolate_folios(lruvec, sc, swappiness, &type, &list);

	scanned += try_to_inc_min_seq(lruvec, swappiness);

	if (get_nr_gens(lruvec, !swappiness) == MIN_NR_GENS)
		scanned = 0;

	spin_unlock_irq(&lruvec->lru_lock);

	if (list_empty(&list))
		return scanned;
retry:
	reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false);
	sc->nr_reclaimed += reclaimed;

	list_for_each_entry_safe_reverse(folio, next, &list, lru) {
		if (!folio_evictable(folio)) {
			list_del(&folio->lru);
			folio_putback_lru(folio);
			continue;
		}

		if (folio_test_reclaim(folio) &&
		    (folio_test_dirty(folio) || folio_test_writeback(folio))) {
			/* restore LRU_REFS_FLAGS cleared by isolate_folio() */
			if (folio_test_workingset(folio))
				folio_set_referenced(folio);
			continue;
		}

		if (skip_retry || folio_test_active(folio) || folio_test_referenced(folio) ||
		    folio_mapped(folio) || folio_test_locked(folio) ||
		    folio_test_dirty(folio) || folio_test_writeback(folio)) {
			/* don't add rejected folios to the oldest generation */
			set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS,
				      BIT(PG_active));
			continue;
		}

		/* retry folios that may have missed folio_rotate_reclaimable() */
		list_move(&folio->lru, &clean);
		sc->nr_scanned -= folio_nr_pages(folio);
	}

	spin_lock_irq(&lruvec->lru_lock);

	move_folios_to_lru(lruvec, &list);

	walk = current->reclaim_state->mm_walk;
	if (walk && walk->batched)
		reset_batch_size(lruvec, walk);

	item = PGSTEAL_KSWAPD + reclaimer_offset();
	if (!cgroup_reclaim(sc))
		__count_vm_events(item, reclaimed);
	__count_memcg_events(memcg, item, reclaimed);
	__count_vm_events(PGSTEAL_ANON + type, reclaimed);

	spin_unlock_irq(&lruvec->lru_lock);

	mem_cgroup_uncharge_list(&list);
	free_unref_page_list(&list);

	INIT_LIST_HEAD(&list);
	list_splice_init(&clean, &list);

	if (!list_empty(&list)) {
		skip_retry = true;
		goto retry;
	}

	return scanned;
}

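/*
 * Decide whether the aging should run and estimate how many folios the
 * eviction can scan. Returns true if this lruvec is out of cold folios or its
 * generations are out of balance, i.e., the aging is due; otherwise returns
 * false with *nr_to_scan set for the eviction.
 */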
static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq,
			     struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan)
{
	int gen, type, zone;
	unsigned long old = 0;
	unsigned long young = 0;
	unsigned long total = 0;
	struct lru_gen_folio *lrugen = &lruvec->lrugen;
	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
	DEFINE_MIN_SEQ(lruvec);

	/* whether this lruvec is completely out of cold folios */
	if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) {
		*nr_to_scan = 0;
		return true;
	}

	for (type = !can_swap; type < ANON_AND_FILE; type++) {
		unsigned long seq;

		for (seq = min_seq[type]; seq <= max_seq; seq++) {
			unsigned long size = 0;

			gen = lru_gen_from_seq(seq);

			for (zone = 0; zone < MAX_NR_ZONES; zone++)
				size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);

			total += size;
			if (seq == max_seq)
				young += size;
			else if (seq + MIN_NR_GENS == max_seq)
				old += size;
		}
	}

	/* try to scrape all its memory if this memcg was deleted */
	if (!mem_cgroup_online(memcg)) {
		*nr_to_scan = total;
		return false;
	}

	*nr_to_scan = total >> sc->priority;

	/*
	 * The aging tries to be lazy to reduce the overhead, while the eviction
	 * stalls when the number of generations reaches MIN_NR_GENS. Hence, the
	 * ideal number of generations is MIN_NR_GENS+1.
	 */
	if (min_seq[!can_swap] + MIN_NR_GENS < max_seq)
		return false;

	/*
	 * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1)
	 * of the total number of pages for each generation. A reasonable range
	 * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The
	 * aging cares about the upper bound of hot pages, while the eviction
	 * cares about the lower bound of cold pages.
	 */
	if (young * MIN_NR_GENS > total)
		return true;
	if (old * (MIN_NR_GENS + 2) < total)
		return true;

	return false;
}

/*
 * For future optimizations:
 * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg
 *    reclaim.
 */
static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool can_swap)
{
	unsigned long nr_to_scan;
	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
	DEFINE_MAX_SEQ(lruvec);

	if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg))
		return -1;

	if (!should_run_aging(lruvec, max_seq, sc, can_swap, &nr_to_scan))
		return nr_to_scan;

	/* skip the aging path at the default priority */
	if (sc->priority == DEF_PRIORITY)
		return nr_to_scan;

	/* skip this lruvec as it's low on cold folios */
	return try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false) ? -1 : 0;
}

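/*
 * Abort root reclaim once enough has been reclaimed or, for order-0 kswapd,
 * once all eligible zones are above their watermarks plus a MIN_LRU_BATCH
 * margin.
 */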
static bool should_abort_scan(struct lruvec *lruvec, struct scan_control *sc)
{
	int i;
	enum zone_watermarks mark;

	/* don't abort memcg reclaim to ensure fairness */
	if (!root_reclaim(sc))
		return false;

	if (sc->nr_reclaimed >= max(sc->nr_to_reclaim, compact_gap(sc->order)))
		return true;

	/* check the order to exclude compaction-induced reclaim */
	if (!current_is_kswapd() || sc->order)
		return false;

	mark = sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING ?
	       WMARK_PROMO : WMARK_HIGH;

	for (i = 0; i <= sc->reclaim_idx; i++) {
		struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i;
		unsigned long size = wmark_pages(zone, mark) + MIN_LRU_BATCH;

		if (managed_zone(zone) && !zone_watermark_ok(zone, 0, size, sc->reclaim_idx, 0))
			return false;
	}

	/* kswapd should abort if all eligible zones are safe */
	return true;
}

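/*
 * Evict folios in batches until get_nr_to_scan() says there is nothing left
 * to do, the scan target is met, or should_abort_scan() fires. Returns true
 * if the caller should rotate this lruvec to the young memcg generation.
 */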
static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
	long nr_to_scan;
	unsigned long scanned = 0;
	int swappiness = get_swappiness(lruvec, sc);

	/* clean file folios are more likely to exist */
	if (swappiness && !(sc->gfp_mask & __GFP_IO))
		swappiness = 1;

	while (true) {
		int delta;

		nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
		if (nr_to_scan <= 0)
			break;

		delta = evict_folios(lruvec, sc, swappiness);
		if (!delta)
			break;

		scanned += delta;
		if (scanned >= nr_to_scan)
			break;

		if (should_abort_scan(lruvec, sc))
			break;

		cond_resched();
	}

	/* whether this lruvec should be rotated */
	return nr_to_scan < 0;
}

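/*
 * Shrink one lruvec and its slab caches, honoring memcg min/low protection.
 * The return value is a MEMCG_LRU_* op telling the caller how to reposition
 * this lruvec on the memcg LRU, or 0 to leave it where it is.
 */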
static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
{
	bool success;
	unsigned long scanned = sc->nr_scanned;
	unsigned long reclaimed = sc->nr_reclaimed;
	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
	struct pglist_data *pgdat = lruvec_pgdat(lruvec);

	mem_cgroup_calculate_protection(NULL, memcg);

	if (mem_cgroup_below_min(NULL, memcg))
		return MEMCG_LRU_YOUNG;

	if (mem_cgroup_below_low(NULL, memcg)) {
		/* see the comment on MEMCG_NR_GENS */
		if (lru_gen_memcg_seg(lruvec) != MEMCG_LRU_TAIL)
			return MEMCG_LRU_TAIL;

		memcg_memory_event(memcg, MEMCG_LOW);
	}

	success = try_to_shrink_lruvec(lruvec, sc);

	shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority);

	if (!sc->proactive)
		vmpressure(sc->gfp_mask, memcg, false, sc->nr_scanned - scanned,
			   sc->nr_reclaimed - reclaimed);

	flush_reclaim_state(sc);

	if (success && mem_cgroup_online(memcg))
		return MEMCG_LRU_YOUNG;

	if (!success && lruvec_is_sizable(lruvec, sc))
		return 0;

	/* one retry if offlined or too small */
	return lru_gen_memcg_seg(lruvec) != MEMCG_LRU_TAIL ?
	       MEMCG_LRU_TAIL : MEMCG_LRU_YOUNG;
}

#ifdef CONFIG_MEMCG

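/*
 * Walk this node's memcg LRU: starting from a random bin in the generation
 * that pgdat->memcg_lru.seq points to, shrink each online memcg found there
 * and reposition its lruvec as instructed by shrink_one(). The walk restarts
 * if it races with lru_gen_rotate_memcg(), then moves on to the remaining
 * bins of the same generation.
 */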
static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)
{
	int op;
	int gen;
	int bin;
	int first_bin;
	struct lruvec *lruvec;
	struct lru_gen_folio *lrugen;
	struct mem_cgroup *memcg;
	struct hlist_nulls_node *pos;

	gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq));
	bin = first_bin = get_random_u32_below(MEMCG_NR_BINS);
restart:
	op = 0;
	memcg = NULL;

	rcu_read_lock();

	hlist_nulls_for_each_entry_rcu(lrugen, pos, &pgdat->memcg_lru.fifo[gen][bin], list) {
		if (op) {
			lru_gen_rotate_memcg(lruvec, op);
			op = 0;
		}

		mem_cgroup_put(memcg);
		memcg = NULL;

		if (gen != READ_ONCE(lrugen->gen))
			continue;

		lruvec = container_of(lrugen, struct lruvec, lrugen);
		memcg = lruvec_memcg(lruvec);

		if (!mem_cgroup_tryget(memcg)) {
			lru_gen_release_memcg(memcg);
			memcg = NULL;
			continue;
		}

		rcu_read_unlock();

		op = shrink_one(lruvec, sc);

		rcu_read_lock();

		if (should_abort_scan(lruvec, sc))
			break;
	}

	rcu_read_unlock();

	if (op)
		lru_gen_rotate_memcg(lruvec, op);

	mem_cgroup_put(memcg);

	if (!is_a_nulls(pos))
		return;

	/* restart if raced with lru_gen_rotate_memcg() */
	if (gen != get_nulls_value(pos))
		goto restart;

	/* try the rest of the bins of the current generation */
	bin = get_memcg_bin(bin + 1);
	if (bin != first_bin)
		goto restart;
}

#ifndef CONFIG_HYPERHOLD_FILE_LRU
static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
	struct blk_plug plug;

	VM_WARN_ON_ONCE(root_reclaim(sc));
	VM_WARN_ON_ONCE(!sc->may_writepage || !sc->may_unmap);

	lru_add_drain();

	blk_start_plug(&plug);

	set_mm_walk(NULL, sc->proactive);

	if (try_to_shrink_lruvec(lruvec, sc))
		lru_gen_rotate_memcg(lruvec, MEMCG_LRU_YOUNG);

	clear_mm_walk();

	blk_finish_plug(&plug);
}
#endif

#else /* !CONFIG_MEMCG */

static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)
{
	BUILD_BUG();
}

#ifndef CONFIG_HYPERHOLD_FILE_LRU
static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
	BUILD_BUG();
}
#endif

#endif /* CONFIG_MEMCG */

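/*
 * Estimate a starting priority for root reclaim so that the first pass scans
 * roughly nr_to_reclaim folios instead of everything at DEF_PRIORITY.
 */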
static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc)
{
	int priority;
	unsigned long reclaimable;
	struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat);

	if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH)
		return;
	/*
	 * Determine the initial priority based on
	 * (total >> priority) * reclaimed_to_scanned_ratio = nr_to_reclaim,
	 * where reclaimed_to_scanned_ratio = inactive / total.
	 */
	reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE);
	if (get_swappiness(lruvec, sc))
		reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON);

	/* round down reclaimable and round up sc->nr_to_reclaim */
	priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1);

	sc->priority = clamp(priority, 0, DEF_PRIORITY);
}

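/*
 * Root reclaim entry point: set up the mm walk and the initial priority, then
 * shrink either the root lruvec (when memcgs are disabled) or this node's
 * memcg LRU. For kswapd, sc->nr_reclaimed is zeroed across the walk so that
 * intermediate checks only see this node's progress, then restored afterwards.
 */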
static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
{
	struct blk_plug plug;
	unsigned long reclaimed = sc->nr_reclaimed;

	VM_WARN_ON_ONCE(!root_reclaim(sc));

	/*
	 * Unmapped clean folios are already prioritized. Scanning for more of
	 * them is likely futile and can cause high reclaim latency when there
	 * is a large number of memcgs.
	 */
	if (!sc->may_writepage || !sc->may_unmap)
		goto done;

	lru_add_drain();

	blk_start_plug(&plug);

	set_mm_walk(pgdat, sc->proactive);

	set_initial_priority(pgdat, sc);

	if (current_is_kswapd())
		sc->nr_reclaimed = 0;

	if (mem_cgroup_disabled())
		shrink_one(&pgdat->__lruvec, sc);
	else
		shrink_many(pgdat, sc);

	if (current_is_kswapd())
		sc->nr_reclaimed += reclaimed;

	clear_mm_walk();

	blk_finish_plug(&plug);
done:
	/* kswapd should never fail */
	pgdat->kswapd_failures = 0;
}

/******************************************************************************
 *                          state change
 ******************************************************************************/

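/*
 * Sanity check for lru_gen_change_state(): whichever set of lists is not
 * currently in use (the active/inactive lists when lru_gen is enabled, the
 * multi-gen lists otherwise) must be empty.
 */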
static bool __maybe_unused state_is_valid(struct lruvec *lruvec)
{
	struct lru_gen_folio *lrugen = &lruvec->lrugen;

	if (lrugen->enabled) {
		enum lru_list lru;

		for_each_evictable_lru(lru) {
			if (!list_empty(&lruvec->lists[lru]))
				return false;
		}
	} else {
		int gen, type, zone;

		for_each_gen_type_zone(gen, type, zone) {
			if (!list_empty(&lrugen->folios[gen][type][zone]))
				return false;
		}
	}

	return true;
}

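/*
 * Move folios from the active/inactive lists to the multi-gen lists, at most
 * MAX_LRU_BATCH at a time. Returns false if the batch limit was hit and the
 * caller should reschedule and retry.
 */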
static bool fill_evictable(struct lruvec *lruvec)
{
	enum lru_list lru;
	int remaining = MAX_LRU_BATCH;

	for_each_evictable_lru(lru) {
		int type = is_file_lru(lru);
		bool active = is_active_lru(lru);
		struct list_head *head = &lruvec->lists[lru];

		while (!list_empty(head)) {
			bool success;
			struct folio *folio = lru_to_folio(head);

			VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
			VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio) != active, folio);
			VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
			VM_WARN_ON_ONCE_FOLIO(folio_lru_gen(folio) != -1, folio);

			lruvec_del_folio(lruvec, folio);
			success = lru_gen_add_folio(lruvec, folio, false);
			VM_WARN_ON_ONCE(!success);

			if (!--remaining)
				return false;
		}
	}

	return true;
}

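/*
 * The inverse of fill_evictable(): move folios from the multi-gen lists back
 * to the active/inactive lists, again bounded by MAX_LRU_BATCH per call.
 */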
static bool drain_evictable(struct lruvec *lruvec)
{
	int gen, type, zone;
	int remaining = MAX_LRU_BATCH;

	for_each_gen_type_zone(gen, type, zone) {
		struct list_head *head = &lruvec->lrugen.folios[gen][type][zone];

		while (!list_empty(head)) {
			bool success;
			struct folio *folio = lru_to_folio(head);

			VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
			VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
			VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
			VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);

			success = lru_gen_del_folio(lruvec, folio, false);
			VM_WARN_ON_ONCE(!success);
			lruvec_add_folio(lruvec, folio);

			if (!--remaining)
				return false;
		}
	}

	return true;
}

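/*
 * Flip the LRU_GEN_CORE cap and migrate every lruvec between the two list
 * layouts, dropping the LRU lock between batches. Serialized by state_mutex
 * and run with the cgroup, CPU hotplug and memory hotplug locks held.
 */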
static void lru_gen_change_state(bool enabled)
{
	static DEFINE_MUTEX(state_mutex);

	struct mem_cgroup *memcg;

	cgroup_lock();
	cpus_read_lock();
	get_online_mems();
	mutex_lock(&state_mutex);

	if (enabled == lru_gen_enabled())
		goto unlock;

	if (enabled)
		static_branch_enable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]);
	else
		static_branch_disable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]);

	memcg = mem_cgroup_iter(NULL, NULL, NULL);
	do {
		int nid;

		for_each_node(nid) {
			struct lruvec *lruvec = get_lruvec(memcg, nid);

			spin_lock_irq(&lruvec->lru_lock);

			VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
			VM_WARN_ON_ONCE(!state_is_valid(lruvec));

			lruvec->lrugen.enabled = enabled;

			while (!(enabled ? fill_evictable(lruvec) : drain_evictable(lruvec))) {
				spin_unlock_irq(&lruvec->lru_lock);
				cond_resched();
				spin_lock_irq(&lruvec->lru_lock);
			}

			spin_unlock_irq(&lruvec->lru_lock);
		}

		cond_resched();
	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
unlock:
	mutex_unlock(&state_mutex);
	put_online_mems();
	cpus_read_unlock();
	cgroup_unlock();
}

/******************************************************************************
 *                          sysfs interface
 ******************************************************************************/

static ssize_t min_ttl_ms_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl)));
}

/* see Documentation/admin-guide/mm/multigen_lru.rst for details */
static ssize_t min_ttl_ms_store(struct kobject *kobj, struct kobj_attribute *attr,
				const char *buf, size_t len)
{
	unsigned int msecs;

	if (kstrtouint(buf, 0, &msecs))
		return -EINVAL;

	WRITE_ONCE(lru_gen_min_ttl, msecs_to_jiffies(msecs));

	return len;
}

static struct kobj_attribute lru_gen_min_ttl_attr = __ATTR_RW(min_ttl_ms);

static ssize_t enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
{
	unsigned int caps = 0;

	if (get_cap(LRU_GEN_CORE))
		caps |= BIT(LRU_GEN_CORE);

	if (should_walk_mmu())
		caps |= BIT(LRU_GEN_MM_WALK);

	if (should_clear_pmd_young())
		caps |= BIT(LRU_GEN_NONLEAF_YOUNG);

	return sysfs_emit(buf, "0x%04x\n", caps);
}

/* see Documentation/admin-guide/mm/multigen_lru.rst for details */
static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr,
			     const char *buf, size_t len)
{
	int i;
	unsigned int caps;

	if (tolower(*buf) == 'n')
		caps = 0;
	else if (tolower(*buf) == 'y')
		caps = -1;
	else if (kstrtouint(buf, 0, &caps))
		return -EINVAL;

	for (i = 0; i < NR_LRU_GEN_CAPS; i++) {
		bool enabled = caps & BIT(i);

		if (i == LRU_GEN_CORE)
			lru_gen_change_state(enabled);
		else if (enabled)
			static_branch_enable(&lru_gen_caps[i]);
		else
			static_branch_disable(&lru_gen_caps[i]);
	}

	return len;
}

static struct kobj_attribute lru_gen_enabled_attr = __ATTR_RW(enabled);

static struct attribute *lru_gen_attrs[] = {
	&lru_gen_min_ttl_attr.attr,
	&lru_gen_enabled_attr.attr,
	NULL
};

static const struct attribute_group lru_gen_attr_group = {
	.name = "lru_gen",
	.attrs = lru_gen_attrs,
};

/******************************************************************************
 *                          debugfs interface
 ******************************************************************************/

static void *lru_gen_seq_start(struct seq_file *m, loff_t *pos)
{
	struct mem_cgroup *memcg;
	loff_t nr_to_skip = *pos;

	m->private = kvmalloc(PATH_MAX, GFP_KERNEL);
	if (!m->private)
		return ERR_PTR(-ENOMEM);

	memcg = mem_cgroup_iter(NULL, NULL, NULL);
	do {
		int nid;

		for_each_node_state(nid, N_MEMORY) {
			if (!nr_to_skip--)
				return get_lruvec(memcg, nid);
		}
	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));

	return NULL;
}

static void lru_gen_seq_stop(struct seq_file *m, void *v)
{
	if (!IS_ERR_OR_NULL(v))
		mem_cgroup_iter_break(NULL, lruvec_memcg(v));

	kvfree(m->private);
	m->private = NULL;
}

static void *lru_gen_seq_next(struct seq_file *m, void *v, loff_t *pos)
{
	int nid = lruvec_pgdat(v)->node_id;
	struct mem_cgroup *memcg = lruvec_memcg(v);

	++*pos;

	nid = next_memory_node(nid);
	if (nid == MAX_NUMNODES) {
		memcg = mem_cgroup_iter(NULL, memcg, NULL);
		if (!memcg)
			return NULL;

		nid = first_memory_node;
	}

	return get_lruvec(memcg, nid);
}

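/*
 * Print the per-tier refault statistics and the mm walk stats for one
 * generation; used only by the lru_gen_full interface.
 */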
static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
				  unsigned long max_seq, unsigned long *min_seq,
				  unsigned long seq)
{
	int i;
	int type, tier;
	int hist = lru_hist_from_seq(seq);
	struct lru_gen_folio *lrugen = &lruvec->lrugen;

	for (tier = 0; tier < MAX_NR_TIERS; tier++) {
		seq_printf(m, "            %10d", tier);
		for (type = 0; type < ANON_AND_FILE; type++) {
			const char *s = "   ";
			unsigned long n[3] = {};

			if (seq == max_seq) {
				s = "RT ";
				n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]);
				n[1] = READ_ONCE(lrugen->avg_total[type][tier]);
			} else if (seq == min_seq[type] || NR_HIST_GENS > 1) {
				s = "rep";
				n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]);
				n[1] = atomic_long_read(&lrugen->evicted[hist][type][tier]);
				if (tier)
					n[2] = READ_ONCE(lrugen->protected[hist][type][tier - 1]);
			}

			for (i = 0; i < 3; i++)
				seq_printf(m, " %10lu%c", n[i], s[i]);
		}
		seq_putc(m, '\n');
	}

	seq_puts(m, "                      ");
	for (i = 0; i < NR_MM_STATS; i++) {
		const char *s = "      ";
		unsigned long n = 0;

		if (seq == max_seq && NR_HIST_GENS == 1) {
			s = "LOYNFA";
			n = READ_ONCE(lruvec->mm_state.stats[hist][i]);
		} else if (seq != max_seq && NR_HIST_GENS > 1) {
			s = "loynfa";
			n = READ_ONCE(lruvec->mm_state.stats[hist][i]);
		}

		seq_printf(m, " %10lu%c", n, s[i]);
	}
	seq_putc(m, '\n');
}

/* see Documentation/admin-guide/mm/multigen_lru.rst for details */
static int lru_gen_seq_show(struct seq_file *m, void *v)
{
	unsigned long seq;
	bool full = !debugfs_real_fops(m->file)->write;
	struct lruvec *lruvec = v;
	struct lru_gen_folio *lrugen = &lruvec->lrugen;
	int nid = lruvec_pgdat(lruvec)->node_id;
	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
	DEFINE_MAX_SEQ(lruvec);
	DEFINE_MIN_SEQ(lruvec);

	if (nid == first_memory_node) {
		const char *path = memcg ? m->private : "";

#ifdef CONFIG_MEMCG
		if (memcg)
			cgroup_path(memcg->css.cgroup, m->private, PATH_MAX);
#endif
		seq_printf(m, "memcg %5hu %s\n", mem_cgroup_id(memcg), path);
	}

	seq_printf(m, " node %5d\n", nid);

	if (!full)
		seq = min_seq[LRU_GEN_ANON];
	else if (max_seq >= MAX_NR_GENS)
		seq = max_seq - MAX_NR_GENS + 1;
	else
		seq = 0;

	for (; seq <= max_seq; seq++) {
		int type, zone;
		int gen = lru_gen_from_seq(seq);
		unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);

		seq_printf(m, " %10lu %10u", seq, jiffies_to_msecs(jiffies - birth));

		for (type = 0; type < ANON_AND_FILE; type++) {
			unsigned long size = 0;
			char mark = full && seq < min_seq[type] ? 'x' : ' ';

			for (zone = 0; zone < MAX_NR_ZONES; zone++)
				size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);

			seq_printf(m, " %10lu%c", size, mark);
		}

		seq_putc(m, '\n');

		if (full)
			lru_gen_seq_show_full(m, lruvec, max_seq, min_seq, seq);
	}

	return 0;
}

static const struct seq_operations lru_gen_seq_ops = {
	.start = lru_gen_seq_start,
	.stop = lru_gen_seq_stop,
	.next = lru_gen_seq_next,
	.show = lru_gen_seq_show,
};

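/*
 * Debugfs '+' command: advance max_seq by one. seq must equal max_seq, and
 * unless force_scan is set, there must be room left for a new generation.
 */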
static int run_aging(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc,
		     bool can_swap, bool force_scan)
{
	DEFINE_MAX_SEQ(lruvec);
	DEFINE_MIN_SEQ(lruvec);

	if (seq < max_seq)
		return 0;

	if (seq > max_seq)
		return -EINVAL;

	if (!force_scan && min_seq[!can_swap] + MAX_NR_GENS - 1 <= max_seq)
		return -ERANGE;

	try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, force_scan);

	return 0;
}

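/*
 * Debugfs '-' command: evict older generations until min_seq passes seq or
 * nr_to_reclaim is met, bailing out on a pending signal. seq must be old
 * enough to leave at least MIN_NR_GENS generations behind.
 */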
static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc,
			int swappiness, unsigned long nr_to_reclaim)
{
	DEFINE_MAX_SEQ(lruvec);

	if (seq + MIN_NR_GENS > max_seq)
		return -EINVAL;

	sc->nr_reclaimed = 0;

	while (!signal_pending(current)) {
		DEFINE_MIN_SEQ(lruvec);

		if (seq < min_seq[!swappiness])
			return 0;

		if (sc->nr_reclaimed >= nr_to_reclaim)
			return 0;

		if (!evict_folios(lruvec, sc, swappiness))
			return 0;

		cond_resched();
	}

	return -EINTR;
}

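/*
 * Dispatch one debugfs command. Per Documentation/admin-guide/mm/multigen_lru.rst,
 * a command takes the form
 *   + memcg_id node_id max_gen_nr [can_swap [force_scan]]
 *   - memcg_id node_id min_gen_nr [swappiness [nr_to_reclaim]]
 * A negative swappiness means "use get_swappiness()"; values above 200 are
 * rejected.
 */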
static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq,
		   struct scan_control *sc, int swappiness, unsigned long opt)
{
	struct lruvec *lruvec;
	int err = -EINVAL;
	struct mem_cgroup *memcg = NULL;

	if (nid < 0 || nid >= MAX_NUMNODES || !node_state(nid, N_MEMORY))
		return -EINVAL;

	if (!mem_cgroup_disabled()) {
		rcu_read_lock();

		memcg = mem_cgroup_from_id(memcg_id);
		if (!mem_cgroup_tryget(memcg))
			memcg = NULL;

		rcu_read_unlock();

		if (!memcg)
			return -EINVAL;
	}

	if (memcg_id != mem_cgroup_id(memcg))
		goto done;

	lruvec = get_lruvec(memcg, nid);

	if (swappiness < 0)
		swappiness = get_swappiness(lruvec, sc);
	else if (swappiness > 200)
		goto done;

	switch (cmd) {
	case '+':
		err = run_aging(lruvec, seq, sc, swappiness, opt);
		break;
	case '-':
		err = run_eviction(lruvec, seq, sc, swappiness, opt);
		break;
	}
done:
	mem_cgroup_put(memcg);

	return err;
}

/* see Documentation/admin-guide/mm/multigen_lru.rst for details */
static ssize_t lru_gen_seq_write(struct file *file, const char __user *src,
				 size_t len, loff_t *pos)
{
	void *buf;
	char *cur, *next;
	unsigned int flags;
	struct blk_plug plug;
	int err = -EINVAL;
	struct scan_control sc = {
		.may_writepage = true,
		.may_unmap = true,
		.may_swap = true,
		.reclaim_idx = MAX_NR_ZONES - 1,
		.gfp_mask = GFP_KERNEL,
	};

	buf = kvmalloc(len + 1, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	if (copy_from_user(buf, src, len)) {
		kvfree(buf);
		return -EFAULT;
	}

	set_task_reclaim_state(current, &sc.reclaim_state);
	flags = memalloc_noreclaim_save();
	blk_start_plug(&plug);
	if (!set_mm_walk(NULL, true)) {
		err = -ENOMEM;
		goto done;
	}

	next = buf;
	next[len] = '\0';

	while ((cur = strsep(&next, ",;\n"))) {
		int n;
		int end;
		char cmd;
		unsigned int memcg_id;
		unsigned int nid;
		unsigned long seq;
		unsigned int swappiness = -1;
		unsigned long opt = -1;

		cur = skip_spaces(cur);
		if (!*cur)
			continue;

		n = sscanf(cur, "%c %u %u %lu %n %u %n %lu %n", &cmd, &memcg_id, &nid,
			   &seq, &end, &swappiness, &end, &opt, &end);
		if (n < 4 || cur[end]) {
			err = -EINVAL;
			break;
		}

		err = run_cmd(cmd, memcg_id, nid, seq, &sc, swappiness, opt);
		if (err)
			break;
	}
done:
	clear_mm_walk();
	blk_finish_plug(&plug);
	memalloc_noreclaim_restore(flags);
	set_task_reclaim_state(current, NULL);

	kvfree(buf);

	return err ? : len;
}

static int lru_gen_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &lru_gen_seq_ops);
}

static const struct file_operations lru_gen_rw_fops = {
	.open = lru_gen_seq_open,
	.read = seq_read,
	.write = lru_gen_seq_write,
	.llseek = seq_lseek,
	.release = seq_release,
};

static const struct file_operations lru_gen_ro_fops = {
	.open = lru_gen_seq_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = seq_release,
};

/******************************************************************************
 *                          initialization
 ******************************************************************************/

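/*
 * Start every lruvec with max_seq = MIN_NR_GENS + 1, fresh timestamps for the
 * initial generations, and empty multi-gen lists.
 */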
void lru_gen_init_lruvec(struct lruvec *lruvec)
{
	int i;
	int gen, type, zone;
	struct lru_gen_folio *lrugen = &lruvec->lrugen;

	lrugen->max_seq = MIN_NR_GENS + 1;
	lrugen->enabled = lru_gen_enabled();

	for (i = 0; i <= MIN_NR_GENS + 1; i++)
		lrugen->timestamps[i] = jiffies;

	for_each_gen_type_zone(gen, type, zone)
		INIT_LIST_HEAD(&lrugen->folios[gen][type][zone]);

	lruvec->mm_state.seq = MIN_NR_GENS;
}

#ifdef CONFIG_MEMCG

void lru_gen_init_pgdat(struct pglist_data *pgdat)
{
	int i, j;

	spin_lock_init(&pgdat->memcg_lru.lock);

	for (i = 0; i < MEMCG_NR_GENS; i++) {
		for (j = 0; j < MEMCG_NR_BINS; j++)
			INIT_HLIST_NULLS_HEAD(&pgdat->memcg_lru.fifo[i][j], i);
	}
}

void lru_gen_init_memcg(struct mem_cgroup *memcg)
{
	INIT_LIST_HEAD(&memcg->mm_list.fifo);
	spin_lock_init(&memcg->mm_list.lock);
}

void lru_gen_exit_memcg(struct mem_cgroup *memcg)
{
	int i;
	int nid;

	VM_WARN_ON_ONCE(!list_empty(&memcg->mm_list.fifo));

	for_each_node(nid) {
		struct lruvec *lruvec = get_lruvec(memcg, nid);

		VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0,
					   sizeof(lruvec->lrugen.nr_pages)));

		lruvec->lrugen.list.next = LIST_POISON1;

		for (i = 0; i < NR_BLOOM_FILTERS; i++) {
			bitmap_free(lruvec->mm_state.filters[i]);
			lruvec->mm_state.filters[i] = NULL;
		}
	}
}

#endif /* CONFIG_MEMCG */

static int __init init_lru_gen(void)
{
	BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
	BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);

	if (sysfs_create_group(mm_kobj, &lru_gen_attr_group))
		pr_err("lru_gen: failed to create sysfs group\n");

	debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops);
	debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops);

	return 0;
}
late_initcall(init_lru_gen);

#else /* !CONFIG_LRU_GEN */

static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
{
}

static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
}

static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
{
}

#endif /* CONFIG_LRU_GEN */

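/*
 * Classic active/inactive reclaim for one lruvec; hands off to
 * lru_gen_shrink_lruvec() when the multi-gen LRU is enabled for non-root
 * reclaim.
 */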
void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
	unsigned long nr[NR_LRU_LISTS];
	unsigned long targets[NR_LRU_LISTS];
	unsigned long nr_to_scan;
	enum lru_list lru;
	unsigned long nr_reclaimed = 0;
	unsigned long nr_to_reclaim = sc->nr_to_reclaim;
	bool proportional_reclaim;
	struct blk_plug plug;

	if (lru_gen_enabled() && !root_reclaim(sc)) {
		lru_gen_shrink_lruvec(lruvec, sc);
		return;
	}

	get_scan_count(lruvec, sc, nr);

	/* Record the original scan target for proportional adjustments later */
	memcpy(targets, nr, sizeof(nr));

	/*
	 * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal
	 * event that can occur when there is little memory pressure e.g.
	 * multiple streaming readers/writers. Hence, we do not abort scanning
	 * when the requested number of pages are reclaimed when scanning at
	 * DEF_PRIORITY on the assumption that the fact we are direct
	 * reclaiming implies that kswapd is not keeping up and it is best to
	 * do a batch of work at once. For memcg reclaim one check is made to
	 * abort proportional reclaim if either the file or anon lru has already
	 * dropped to zero at the first pass.
	 */
	proportional_reclaim = (!cgroup_reclaim(sc) && !current_is_kswapd() &&
				sc->priority == DEF_PRIORITY);

	blk_start_plug(&plug);
	while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
					nr[LRU_INACTIVE_FILE]) {
		unsigned long nr_anon, nr_file, percentage;
		unsigned long nr_scanned;

		for_each_evictable_lru(lru) {
			if (nr[lru]) {
				nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
				nr[lru] -= nr_to_scan;

				nr_reclaimed += shrink_list(lru, nr_to_scan,
							    lruvec, sc);
			}
		}

		cond_resched();

		if (nr_reclaimed < nr_to_reclaim || proportional_reclaim)
			continue;

		/*
		 * For kswapd and memcg, reclaim at least the number of pages
		 * requested. Ensure that the anon and file LRUs are scanned
		 * proportionally to what was requested by get_scan_count().
		 * We stop reclaiming one LRU and reduce the amount of
		 * scanning proportionally to the original scan target.
		 */
		nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
		nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];

		/*
		 * It's just vindictive to attack the larger once the smaller
		 * has gone to zero.  And given the way we stop scanning the
		 * smaller below, this makes sure that we only make one nudge
		 * towards proportionality once we've got nr_to_reclaim.
		 */
		if (!nr_file || !nr_anon)
			break;

		if (nr_file > nr_anon) {
			unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
						targets[LRU_ACTIVE_ANON] + 1;
			lru = LRU_BASE;
			percentage = nr_anon * 100 / scan_target;
		} else {
			unsigned long scan_target = targets[LRU_INACTIVE_FILE] +
						targets[LRU_ACTIVE_FILE] + 1;
			lru = LRU_FILE;
			percentage = nr_file * 100 / scan_target;
		}

		/* Stop scanning the smaller of the two LRUs */
		nr[lru] = 0;
		nr[lru + LRU_ACTIVE] = 0;

		/*
		 * Recalculate the other LRU scan count based on its original
		 * scan target and the percentage scanning already complete
		 */
		lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE;
		nr_scanned = targets[lru] - nr[lru];
		nr[lru] = targets[lru] * (100 - percentage) / 100;
		nr[lru] -= min(nr[lru], nr_scanned);

		lru += LRU_ACTIVE;
		nr_scanned = targets[lru] - nr[lru];
		nr[lru] = targets[lru] * (100 - percentage) / 100;
		nr[lru] -= min(nr[lru], nr_scanned);
	}
	blk_finish_plug(&plug);
	sc->nr_reclaimed += nr_reclaimed;

	/*
	 * Even if we did not try to evict anon pages at all, we want to
	 * rebalance the anon lru active/inactive ratio.
	 */
	if (can_age_anon_pages(lruvec_pgdat(lruvec), sc) &&
	    inactive_is_low(lruvec, LRU_INACTIVE_ANON))
		shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
				   sc, LRU_ACTIVE_ANON);
}
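
/*
 * Illustrative sketch (not built; names are hypothetical, not kernel API):
 * the proportional trim at the bottom of the scan loop above, restated as
 * a pure helper. Worked numbers: with target = 1000, nr_left = 800 (so 200
 * pages already scanned) and percentage = 40, the surviving LRU is left
 * with 1000 * (100 - 40) / 100 - 200 = 400 pages still to scan.
 */
#if 0
static unsigned long remaining_scan(unsigned long target,
				    unsigned long nr_left,
				    unsigned long percentage)
{
	unsigned long nr_scanned = target - nr_left;	/* work already done */
	unsigned long nr = target * (100 - percentage) / 100;

	/* clamp: scanning may already be past the trimmed target */
	return nr - min(nr, nr_scanned);
}
#endif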

/* Use reclaim/compaction for costly allocs or under memory pressure */
static bool in_reclaim_compaction(struct scan_control *sc)
{
	if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
			(sc->order > PAGE_ALLOC_COSTLY_ORDER ||
			 sc->priority < DEF_PRIORITY - 2))
		return true;

	return false;
}

/*
 * Reclaim/compaction is used for high-order allocation requests. It reclaims
 * order-0 pages before compacting the zone. should_continue_reclaim() returns
 * true if more pages should be reclaimed such that when the page allocator
 * calls try_to_compact_pages(), it will have enough free pages to succeed.
 * It will give up earlier than that if there is difficulty reclaiming pages.
 */
static inline bool should_continue_reclaim(struct pglist_data *pgdat,
					unsigned long nr_reclaimed,
					struct scan_control *sc)
{
	unsigned long pages_for_compaction;
	unsigned long inactive_lru_pages;
	int z;

	/* If not in reclaim/compaction mode, stop */
	if (!in_reclaim_compaction(sc))
		return false;

	/*
	 * Stop if we failed to reclaim any pages from the last SWAP_CLUSTER_MAX
	 * pages that were scanned. This will return to the caller with the risk
	 * that reclaim/compaction and the resulting allocation attempt fail.
	 * In the past we tried harder for __GFP_RETRY_MAYFAIL allocations by
	 * requiring that the full LRU list had been scanned first, assuming
	 * that a zero delta of sc->nr_scanned meant a full LRU scan, but that
	 * approximation was wrong, and there were corner cases where a
	 * non-zero number of pages was always scanned.
	 */
	if (!nr_reclaimed)
		return false;

	/* If compaction would go ahead or the allocation would succeed, stop */
	for (z = 0; z <= sc->reclaim_idx; z++) {
		struct zone *zone = &pgdat->node_zones[z];
		if (!managed_zone(zone))
			continue;

		/* Allocation can already succeed, nothing to do */
		if (zone_watermark_ok(zone, sc->order, min_wmark_pages(zone),
				      sc->reclaim_idx, 0))
			return false;

		if (compaction_suitable(zone, sc->order, sc->reclaim_idx))
			return false;
	}

	/*
	 * If we have not reclaimed enough pages for compaction and the
	 * inactive lists are large enough, continue reclaiming
	 */
	pages_for_compaction = compact_gap(sc->order);
	inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);
	if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc))
		inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);

	return inactive_lru_pages > pages_for_compaction;
}
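
/*
 * Worked example (hypothetical numbers): for an order-9 THP request,
 * compact_gap() is typically 2UL << 9 = 1024 pages, so the checks above
 * keep reclaim going only while the eligible inactive lists still hold
 * more than ~1024 pages and no zone already meets its min watermark or
 * is deemed compaction_suitable().
 */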

#ifndef CONFIG_HYPERHOLD_FILE_LRU
static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
{
	struct mem_cgroup *target_memcg = sc->target_mem_cgroup;
	struct mem_cgroup *memcg;

	memcg = mem_cgroup_iter(target_memcg, NULL, NULL);
	do {
		struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
		unsigned long reclaimed;
		unsigned long scanned;

		/*
		 * This loop can become CPU-bound when target memcgs
		 * aren't eligible for reclaim - either because they
		 * don't have any reclaimable pages, or because their
		 * memory is explicitly protected. Avoid soft lockups.
		 */
		cond_resched();

		mem_cgroup_calculate_protection(target_memcg, memcg);

		if (mem_cgroup_below_min(target_memcg, memcg)) {
			/*
			 * Hard protection.
			 * If there is no reclaimable memory, OOM.
			 */
			continue;
		} else if (mem_cgroup_below_low(target_memcg, memcg)) {
			/*
			 * Soft protection.
			 * Respect the protection only as long as
			 * there is an unprotected supply
			 * of reclaimable memory from other cgroups.
			 */
			if (!sc->memcg_low_reclaim) {
				sc->memcg_low_skipped = 1;
				continue;
			}
			memcg_memory_event(memcg, MEMCG_LOW);
		}

		reclaimed = sc->nr_reclaimed;
		scanned = sc->nr_scanned;

		shrink_lruvec(lruvec, sc);

		shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
			    sc->priority);

		/* Record the group's reclaim efficiency */
		if (!sc->proactive)
			vmpressure(sc->gfp_mask, memcg, false,
				   sc->nr_scanned - scanned,
				   sc->nr_reclaimed - reclaimed);

	} while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL)));
}
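
/*
 * Illustrative sketch (not built; names are hypothetical): the min/low
 * protection policy from the loop above as a standalone decision helper.
 */
#if 0
static bool memcg_should_scan(bool below_min, bool below_low,
			      struct scan_control *sc)
{
	if (below_min)
		return false;			/* hard protection: skip, may OOM */
	if (below_low && !sc->memcg_low_reclaim) {
		sc->memcg_low_skipped = 1;	/* honoured soft protection */
		return false;
	}
	return true;				/* scan; MEMCG_LOW fires if below low */
}
#endif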

static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
{
	unsigned long nr_reclaimed, nr_scanned, nr_node_reclaimed;
	struct lruvec *target_lruvec;
	bool reclaimable = false;

	if (lru_gen_enabled() && root_reclaim(sc)) {
		lru_gen_shrink_node(pgdat, sc);
		return;
	}

	target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);

again:
	memset(&sc->nr, 0, sizeof(sc->nr));

	nr_reclaimed = sc->nr_reclaimed;
	nr_scanned = sc->nr_scanned;

	prepare_scan_count(pgdat, sc);

	shrink_node_memcgs(pgdat, sc);

	flush_reclaim_state(sc);

	nr_node_reclaimed = sc->nr_reclaimed - nr_reclaimed;

	/* Record the subtree's reclaim efficiency */
	if (!sc->proactive)
		vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
			   sc->nr_scanned - nr_scanned, nr_node_reclaimed);

	if (nr_node_reclaimed)
		reclaimable = true;

	if (current_is_kswapd()) {
		/*
		 * If reclaim is isolating dirty pages under writeback,
		 * it implies that the long-lived page allocation rate
		 * is exceeding the page laundering rate. Either the
		 * global limits are not being effective at throttling
		 * processes due to the page distribution throughout
		 * zones or there is heavy usage of a slow backing
		 * device. The only option is to throttle from reclaim
		 * context, which is not ideal as there is no guarantee
		 * the dirtying process is throttled in the same way
		 * that balance_dirty_pages() does.
		 *
		 * Once a node is flagged PGDAT_WRITEBACK, kswapd will
		 * count the number of pages under writeback that are
		 * flagged for immediate reclaim and stall if any are
		 * encountered in the nr_immediate check below.
		 */
		if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
			set_bit(PGDAT_WRITEBACK, &pgdat->flags);

		/* Allow kswapd to start writing pages during reclaim. */
		if (sc->nr.unqueued_dirty == sc->nr.file_taken)
			set_bit(PGDAT_DIRTY, &pgdat->flags);

		/*
		 * If kswapd scans pages marked for immediate
		 * reclaim and under writeback (nr_immediate), it
		 * implies that pages are cycling through the LRU
		 * faster than they are written so forcibly stall
		 * until some pages complete writeback.
		 */
		if (sc->nr.immediate)
			reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK);
	}

	/*
	 * Tag a node/memcg as congested if all the dirty pages were marked
	 * for writeback and immediate reclaim (counted in nr.congested).
	 *
	 * Legacy memcg will stall in page writeback so avoid forcibly
	 * stalling in reclaim_throttle().
	 */
	if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested) {
		if (cgroup_reclaim(sc) && writeback_throttling_sane(sc))
			set_bit(LRUVEC_CGROUP_CONGESTED, &target_lruvec->flags);

		if (current_is_kswapd())
			set_bit(LRUVEC_NODE_CONGESTED, &target_lruvec->flags);
	}

	/*
	 * Stall direct reclaim for IO completions if the lruvec or
	 * node is congested. Allow kswapd to continue until it
	 * starts encountering unqueued dirty pages or cycling through
	 * the LRU too quickly.
	 */
	if (!current_is_kswapd() && current_may_throttle() &&
	    !sc->hibernation_mode &&
	    (test_bit(LRUVEC_CGROUP_CONGESTED, &target_lruvec->flags) ||
	     test_bit(LRUVEC_NODE_CONGESTED, &target_lruvec->flags)))
		reclaim_throttle(pgdat, VMSCAN_THROTTLE_CONGESTED);

	if (should_continue_reclaim(pgdat, nr_node_reclaimed, sc))
		goto again;

	/*
	 * Kswapd gives up on balancing particular nodes after too
	 * many failures to reclaim anything from them and goes to
	 * sleep. On reclaim progress, reset the failure counter. A
	 * successful direct reclaim run will revive a dormant kswapd.
	 */
	if (reclaimable)
		pgdat->kswapd_failures = 0;
}
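
/*
 * Summary of the heuristics above (illustrative, not new policy):
 *
 *   kswapd, writeback == taken            -> set PGDAT_WRITEBACK
 *   kswapd, unqueued_dirty == file_taken  -> set PGDAT_DIRTY (may write)
 *   kswapd, immediate pages seen          -> reclaim_throttle(WRITEBACK)
 *   dirty == congested                    -> mark lruvec/node congested,
 *                                            which later throttles direct
 *                                            reclaim on CONGESTED
 */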
#endif

/*
 * Returns true if compaction should go ahead for a costly-order request, or
 * if the allocation would already succeed without compaction. Returns false
 * if we should reclaim first.
 */
static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
{
	unsigned long watermark;

	/* Allocation can already succeed, nothing to do */
	if (zone_watermark_ok(zone, sc->order, min_wmark_pages(zone),
			      sc->reclaim_idx, 0))
		return true;

	/* Compaction cannot yet proceed. Do reclaim. */
	if (!compaction_suitable(zone, sc->order, sc->reclaim_idx))
		return false;

	/*
	 * Compaction is already possible, but it takes time to run and there
	 * are potentially other callers using the pages just freed. So proceed
	 * with reclaim to make a buffer of free pages available to give
	 * compaction a reasonable chance of completing and allocating the page.
	 * Note that we won't actually reclaim the whole buffer in one attempt
	 * as the target watermark in should_continue_reclaim() is lower. But if
	 * we are already above the high+gap watermark, don't reclaim at all.
	 */
	watermark = high_wmark_pages(zone) + compact_gap(sc->order);

	return zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx);
}
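
/*
 * Worked example (hypothetical numbers): with high_wmark_pages() = 4096
 * and an order-9 request, the buffer above is 4096 + compact_gap(9) =
 * 4096 + 1024 = 5120 free pages. Below that we keep reclaiming to feed
 * compaction; at or above it we skip reclaim for this zone entirely.
 */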

static void consider_reclaim_throttle(pg_data_t *pgdat, struct scan_control *sc)
{
	/*
	 * If reclaim is making progress at better than 12.5% efficiency
	 * (more than one page reclaimed per eight scanned), wake all the
	 * NOPROGRESS throttled tasks.
	 */
	if (sc->nr_reclaimed > (sc->nr_scanned >> 3)) {
		wait_queue_head_t *wqh;

		wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_NOPROGRESS];
		if (waitqueue_active(wqh))
			wake_up(wqh);

		return;
	}

	/*
	 * Do not throttle kswapd or cgroup reclaim on NOPROGRESS as it will
	 * throttle on VMSCAN_THROTTLE_WRITEBACK if there are too many pages
	 * under writeback and marked for immediate reclaim at the tail of the
	 * LRU.
	 */
	if (current_is_kswapd() || cgroup_reclaim(sc))
		return;

	/* Throttle if making no progress at high priorities. */
	if (sc->priority == 1 && !sc->nr_reclaimed)
		reclaim_throttle(pgdat, VMSCAN_THROTTLE_NOPROGRESS);
}
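
/*
 * The efficiency cut-off above, with numbers: after scanning 1024 pages,
 * reclaiming more than 1024 >> 3 = 128 of them counts as progress and
 * wakes the NOPROGRESS waiters.
 */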

/*
 * This is the direct reclaim path, for page-allocating processes.  We only
 * try to reclaim pages from zones which will satisfy the caller's allocation
 * request.
 *
 * If a zone is deemed to be full of pinned pages then just give it a light
 * scan and then give up on it.
 */
static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
{
	struct zoneref *z;
	struct zone *zone;
	unsigned long nr_soft_reclaimed;
	unsigned long nr_soft_scanned;
	gfp_t orig_mask;
	pg_data_t *last_pgdat = NULL;
	pg_data_t *first_pgdat = NULL;

	/*
	 * If the number of buffer_heads in the machine exceeds the maximum
	 * allowed level, force direct reclaim to scan the highmem zone as
	 * highmem pages could be pinning lowmem pages storing buffer_heads
	 */
	orig_mask = sc->gfp_mask;
	if (buffer_heads_over_limit) {
		sc->gfp_mask |= __GFP_HIGHMEM;
		sc->reclaim_idx = gfp_zone(sc->gfp_mask);
	}

	for_each_zone_zonelist_nodemask(zone, z, zonelist,
					sc->reclaim_idx, sc->nodemask) {
		/*
		 * Take care that memory controller reclaim has only a
		 * small influence on the global LRU.
		 */
		if (!cgroup_reclaim(sc)) {
			if (!cpuset_zone_allowed(zone,
						 GFP_KERNEL | __GFP_HARDWALL))
				continue;

			/*
			 * If we already have plenty of memory free for
			 * compaction in this zone, don't free any more.
			 * Even though compaction is invoked for any
			 * non-zero order, only frequent costly order
			 * reclamation is disruptive enough to become a
			 * noticeable problem, like transparent huge
			 * page allocations.
			 */
			if (IS_ENABLED(CONFIG_COMPACTION) &&
			    sc->order > PAGE_ALLOC_COSTLY_ORDER &&
			    compaction_ready(zone, sc)) {
				sc->compaction_ready = true;
				continue;
			}

			/*
			 * Shrink each node in the zonelist once. If the
			 * zonelist is ordered by zone (not the default) then a
			 * node may be shrunk multiple times but in that case
			 * the user prefers lower zones being preserved.
			 */
			if (zone->zone_pgdat == last_pgdat)
				continue;

			/*
			 * This steals pages from memory cgroups over softlimit
			 * and returns the number of reclaimed pages and
			 * scanned pages. This works for global memory pressure
			 * and balancing, not for a memcg's limit.
			 */
			nr_soft_scanned = 0;
			nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone->zone_pgdat,
						sc->order, sc->gfp_mask,
						&nr_soft_scanned);
			sc->nr_reclaimed += nr_soft_reclaimed;
			sc->nr_scanned += nr_soft_scanned;
			/* need some check to avoid calling shrink_node() again */
		}

		if (!first_pgdat)
			first_pgdat = zone->zone_pgdat;

		/* See comment about same check for global reclaim above */
		if (zone->zone_pgdat == last_pgdat)
			continue;
		last_pgdat = zone->zone_pgdat;
#ifdef CONFIG_HYPERHOLD_FILE_LRU
		shrink_node_hyperhold(zone->zone_pgdat, sc);
#else
		shrink_node(zone->zone_pgdat, sc);
#endif
	}

	if (first_pgdat)
		consider_reclaim_throttle(first_pgdat, sc);

	/*
	 * Restore to original mask to avoid the impact on the caller if we
	 * promoted it to __GFP_HIGHMEM.
	 */
	sc->gfp_mask = orig_mask;
}

static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat)
{
	struct lruvec *target_lruvec;
	unsigned long refaults;
#ifdef CONFIG_HYPERHOLD_FILE_LRU
	struct lruvec *lruvec;
#endif

	if (lru_gen_enabled())
		return;

#ifdef CONFIG_HYPERHOLD_FILE_LRU
	lruvec = node_lruvec(pgdat);
	lruvec->refaults[0] = lruvec_page_state(lruvec, WORKINGSET_ACTIVATE_ANON); /* modified */
	lruvec->refaults[1] = lruvec_page_state(lruvec, WORKINGSET_ACTIVATE_FILE); /* modified */
#endif

	target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
	refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON);
	target_lruvec->refaults[WORKINGSET_ANON] = refaults;
	refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_FILE);
	target_lruvec->refaults[WORKINGSET_FILE] = refaults;
}

/*
 * This is the main entry point to direct page reclaim.
 *
 * If a full scan of the inactive list fails to free enough memory then we
 * are "out of memory" and something needs to be killed.
 *
 * If the caller is !__GFP_FS then the probability of a failure is reasonably
 * high - the zone may be full of dirty or under-writeback pages, which this
 * caller can't do much about.  We kick the writeback threads and take explicit
 * naps in the hope that some of these pages can be written.  But if the
 * allocating task holds filesystem locks which prevent writeout this might not
 * work, and the allocation attempt will fail.
 *
 * returns:	0, if no pages reclaimed
 * 		else, the number of pages reclaimed
 */
static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
					  struct scan_control *sc)
{
	int initial_priority = sc->priority;
	pg_data_t *last_pgdat;
	struct zoneref *z;
	struct zone *zone;
retry:
	delayacct_freepages_start();

	if (!cgroup_reclaim(sc))
		__count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1);

	do {
		if (!sc->proactive)
			vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
					sc->priority);
		sc->nr_scanned = 0;
		shrink_zones(zonelist, sc);

		if (sc->nr_reclaimed >= sc->nr_to_reclaim)
			break;

		if (sc->compaction_ready)
			break;

		/*
		 * If we're having trouble reclaiming, start doing
		 * writepage even in laptop mode.
		 */
		if (sc->priority < DEF_PRIORITY - 2)
			sc->may_writepage = 1;
	} while (--sc->priority >= 0);

	last_pgdat = NULL;
	for_each_zone_zonelist_nodemask(zone, z, zonelist, sc->reclaim_idx,
					sc->nodemask) {
		if (zone->zone_pgdat == last_pgdat)
			continue;
		last_pgdat = zone->zone_pgdat;

		snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat);

		if (cgroup_reclaim(sc)) {
			struct lruvec *lruvec;

			lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup,
						   zone->zone_pgdat);
			clear_bit(LRUVEC_CGROUP_CONGESTED, &lruvec->flags);
		}
	}

	delayacct_freepages_end();

	if (sc->nr_reclaimed)
		return sc->nr_reclaimed;

	/* Aborted reclaim to try compaction? don't OOM, then */
	if (sc->compaction_ready)
		return 1;

	/*
	 * We make inactive:active ratio decisions based on the node's
	 * composition of memory, but a restrictive reclaim_idx or a
	 * memory.low cgroup setting can exempt large amounts of
	 * memory from reclaim. Neither of these is very common, so
	 * instead of doing costly eligibility calculations of the
	 * entire cgroup subtree up front, we assume the estimates are
	 * good, and retry with forcible deactivation if that fails.
	 */
	if (sc->skipped_deactivate) {
		sc->priority = initial_priority;
		sc->force_deactivate = 1;
		sc->skipped_deactivate = 0;
		goto retry;
	}

	/* Untapped cgroup reserves?  Don't OOM, retry. */
	if (sc->memcg_low_skipped) {
		sc->priority = initial_priority;
		sc->force_deactivate = 0;
		sc->memcg_low_reclaim = 1;
		sc->memcg_low_skipped = 0;
		goto retry;
	}

	return 0;
}

static bool allow_direct_reclaim(pg_data_t *pgdat)
{
	struct zone *zone;
	unsigned long pfmemalloc_reserve = 0;
	unsigned long free_pages = 0;
	int i;
	bool wmark_ok;

	if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
		return true;

	for (i = 0; i <= ZONE_NORMAL; i++) {
		zone = &pgdat->node_zones[i];
		if (!managed_zone(zone))
			continue;

		if (!zone_reclaimable_pages(zone))
			continue;

		pfmemalloc_reserve += min_wmark_pages(zone);
		free_pages += zone_page_state_snapshot(zone, NR_FREE_PAGES);
	}

	/* If there are no reserves (unexpected config) then do not throttle */
	if (!pfmemalloc_reserve)
		return true;

	wmark_ok = free_pages > pfmemalloc_reserve / 2;

	/* kswapd must be awake if processes are being throttled */
	if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
		if (READ_ONCE(pgdat->kswapd_highest_zoneidx) > ZONE_NORMAL)
			WRITE_ONCE(pgdat->kswapd_highest_zoneidx, ZONE_NORMAL);

		wake_up_interruptible(&pgdat->kswapd_wait);
	}

	return wmark_ok;
}
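
/*
 * Worked example (hypothetical numbers): if the min watermarks of the
 * eligible zones sum to 2048 pages, direct reclaim is allowed while the
 * snapshot of free pages exceeds 2048 / 2 = 1024; below that, callers
 * throttle on pfmemalloc_wait and kswapd is woken to refill the reserve.
 */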

/*
 * Throttle direct reclaimers if backing storage is backed by the network
 * and the PFMEMALLOC reserve for the preferred node is getting dangerously
 * depleted. kswapd will continue to make progress and wake the processes
 * when the low watermark is reached.
 *
 * Returns true if a fatal signal was delivered during throttling. If this
 * happens, the page allocator should not consider triggering the OOM killer.
 */
static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
					nodemask_t *nodemask)
{
	struct zoneref *z;
	struct zone *zone;
	pg_data_t *pgdat = NULL;

	/*
	 * Kernel threads should not be throttled as they may be indirectly
	 * responsible for cleaning pages necessary for reclaim to make forward
	 * progress. kjournald for example may enter direct reclaim while
	 * committing a transaction where throttling it could force other
	 * processes to block on log_wait_commit().
	 */
	if (current->flags & PF_KTHREAD)
		goto out;

	/*
	 * If a fatal signal is pending, this process should not throttle.
	 * It should return quickly so it can exit and free its memory.
	 */
	if (fatal_signal_pending(current))
		goto out;

	/*
	 * Check if the pfmemalloc reserves are ok by finding the first node
	 * with a usable ZONE_NORMAL or lower zone. The expectation is that
	 * GFP_KERNEL will be required for allocating network buffers when
	 * swapping over the network so ZONE_HIGHMEM is unusable.
	 *
	 * Throttling is based on the first usable node and throttled processes
	 * wait on a queue until kswapd makes progress and wakes them. There
	 * is an affinity then between processes waking up and where reclaim
	 * progress has been made assuming the process wakes on the same node.
	 * More importantly, processes running on remote nodes will not compete
	 * for remote pfmemalloc reserves and processes on different nodes
	 * should make reasonable progress.
	 */
	for_each_zone_zonelist_nodemask(zone, z, zonelist,
					gfp_zone(gfp_mask), nodemask) {
		if (zone_idx(zone) > ZONE_NORMAL)
			continue;

		/* Throttle based on the first usable node */
		pgdat = zone->zone_pgdat;
		if (allow_direct_reclaim(pgdat))
			goto out;
		break;
	}

	/* If no zone was usable by the allocation flags then do not throttle */
	if (!pgdat)
		goto out;

	/* Account for the throttling */
	count_vm_event(PGSCAN_DIRECT_THROTTLE);

	/*
	 * If the caller cannot enter the filesystem, it's possible that it
	 * is due to the caller holding an FS lock or performing a journal
	 * transaction in the case of a filesystem like ext[3|4]. In this case,
	 * it is not safe to block on pfmemalloc_wait as kswapd could be
	 * blocked waiting on the same lock. Instead, throttle for up to a
	 * second before continuing.
	 */
	if (!(gfp_mask & __GFP_FS))
		wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
			allow_direct_reclaim(pgdat), HZ);
	else
		/* Throttle until kswapd wakes the process */
		wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
			allow_direct_reclaim(pgdat));

	if (fatal_signal_pending(current))
		return true;

out:
	return false;
}

unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
				gfp_t gfp_mask, nodemask_t *nodemask)
{
	unsigned long nr_reclaimed;
	struct scan_control sc = {
		.nr_to_reclaim = SWAP_CLUSTER_MAX,
		.gfp_mask = current_gfp_context(gfp_mask),
		.reclaim_idx = gfp_zone(gfp_mask),
		.order = order,
		.nodemask = nodemask,
		.priority = DEF_PRIORITY,
		.may_writepage = !laptop_mode,
		.may_unmap = 1,
		.may_swap = 1,
	};

	/*
	 * scan_control uses s8 fields for order, priority, and reclaim_idx.
	 * Confirm they are large enough for max values.
	 */
	BUILD_BUG_ON(MAX_ORDER >= S8_MAX);
	BUILD_BUG_ON(DEF_PRIORITY > S8_MAX);
	BUILD_BUG_ON(MAX_NR_ZONES > S8_MAX);

	/*
	 * Do not enter reclaim if a fatal signal was delivered while
	 * throttled. 1 is returned so that the page allocator does not OOM
	 * kill at this point.
	 */
	if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask))
		return 1;

	set_task_reclaim_state(current, &sc.reclaim_state);
	trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask);

	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);

	trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
	set_task_reclaim_state(current, NULL);

	return nr_reclaimed;
}

#ifdef CONFIG_MEMCG

/* Only used by soft limit reclaim. Do not reuse for anything else. */
unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
						gfp_t gfp_mask, bool noswap,
						pg_data_t *pgdat,
						unsigned long *nr_scanned)
{
	struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
	struct scan_control sc = {
		.nr_to_reclaim = SWAP_CLUSTER_MAX,
		.target_mem_cgroup = memcg,
		.may_writepage = !laptop_mode,
		.may_unmap = 1,
		.reclaim_idx = MAX_NR_ZONES - 1,
		.may_swap = !noswap,
	};
#ifdef CONFIG_HYPERHOLD_FILE_LRU
	unsigned long nr[NR_LRU_LISTS];
#endif

	WARN_ON_ONCE(!current->reclaim_state);

	sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
			(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);

	trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
						      sc.gfp_mask);

	/*
	 * NOTE: Although we can get the priority field, using it
	 * here is not a good idea, since it limits the pages we can scan.
	 * If we don't reclaim here, the shrink_node from balance_pgdat
	 * will pick up pages from other mem cgroups as well. We hack
	 * the priority and make it zero.
	 */
#ifdef CONFIG_HYPERHOLD_FILE_LRU
	nr[LRU_ACTIVE_ANON] = lruvec_lru_size(lruvec,
			LRU_ACTIVE_ANON, MAX_NR_ZONES);
	nr[LRU_INACTIVE_ANON] = lruvec_lru_size(lruvec,
			LRU_INACTIVE_ANON, MAX_NR_ZONES);
	nr[LRU_ACTIVE_FILE] = 0;
	nr[LRU_INACTIVE_FILE] = 0;
	shrink_anon_memcg(pgdat, memcg, &sc, nr);
#else
	shrink_lruvec(lruvec, &sc);
#endif

	trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);

	*nr_scanned = sc.nr_scanned;

	return sc.nr_reclaimed;
}

unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
					   unsigned long nr_pages,
					   gfp_t gfp_mask,
					   unsigned int reclaim_options)
{
	unsigned long nr_reclaimed;
	unsigned int noreclaim_flag;
	struct scan_control sc = {
		.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
		.gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) |
				(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
		.reclaim_idx = MAX_NR_ZONES - 1,
		.target_mem_cgroup = memcg,
		.priority = DEF_PRIORITY,
		.may_writepage = !laptop_mode,
		.may_unmap = 1,
		.may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP),
		.proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE),
	};
	/*
	 * Traverse the ZONELIST_FALLBACK zonelist of the current node to put
	 * equal pressure on all the nodes. This is based on the assumption that
	 * the reclaim does not bail out early.
	 */
	struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);

	set_task_reclaim_state(current, &sc.reclaim_state);
	trace_mm_vmscan_memcg_reclaim_begin(0, sc.gfp_mask);
	noreclaim_flag = memalloc_noreclaim_save();

	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);

	memalloc_noreclaim_restore(noreclaim_flag);
	trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
	set_task_reclaim_state(current, NULL);

	return nr_reclaimed;
}
#endif

static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc)
{
	struct mem_cgroup *memcg;
	struct lruvec *lruvec;

	if (lru_gen_enabled()) {
		lru_gen_age_node(pgdat, sc);
		return;
	}

	if (!can_age_anon_pages(pgdat, sc))
		return;

	lruvec = mem_cgroup_lruvec(NULL, pgdat);
	if (!inactive_is_low(lruvec, LRU_INACTIVE_ANON))
		return;

	memcg = mem_cgroup_iter(NULL, NULL, NULL);
	do {
		lruvec = mem_cgroup_lruvec(memcg, pgdat);
		shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
				   sc, LRU_ACTIVE_ANON);
		memcg = mem_cgroup_iter(NULL, memcg, NULL);
	} while (memcg);
}

static bool pgdat_watermark_boosted(pg_data_t *pgdat, int highest_zoneidx)
{
	int i;
	struct zone *zone;

	/*
	 * Check for watermark boosts top-down as the higher zones
	 * are more likely to be boosted. Both watermarks and boosts
	 * should not be checked at the same time as reclaim would
	 * start prematurely when there is no boosting and a lower
	 * zone is balanced.
	 */
	for (i = highest_zoneidx; i >= 0; i--) {
		zone = pgdat->node_zones + i;
		if (!managed_zone(zone))
			continue;

		if (zone->watermark_boost)
			return true;
	}

	return false;
}

/*
 * Returns true if there is an eligible zone balanced for the requested order
 * and highest_zoneidx
 */
static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
{
	int i;
	unsigned long mark = -1;
	struct zone *zone;

	/*
	 * Check watermarks bottom-up as lower zones are more likely to
	 * meet watermarks.
	 */
	for (i = 0; i <= highest_zoneidx; i++) {
		zone = pgdat->node_zones + i;

		if (!managed_zone(zone))
			continue;

		if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING)
			mark = wmark_pages(zone, WMARK_PROMO);
		else
			mark = high_wmark_pages(zone);
		if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx))
			return true;
	}

	/*
	 * If a node has no managed zone within highest_zoneidx, it does not
	 * need balancing by definition. This can happen if a zone-restricted
	 * allocation tries to wake a remote kswapd.
	 */
	if (mark == -1)
		return true;

	return false;
}
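
/*
 * Illustrative example (hypothetical numbers): for an order-2 request,
 * a node counts as balanced as soon as one eligible zone passes
 * zone_watermark_ok_safe() against its high (or promo) watermark -
 * roughly, enough free pages above the mark with an order-2 block
 * available - so the remaining zones need not be checked.
 */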

/* Clear pgdat state for congested, dirty or under writeback. */
static void clear_pgdat_congested(pg_data_t *pgdat)
{
	struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat);

	clear_bit(LRUVEC_NODE_CONGESTED, &lruvec->flags);
	clear_bit(LRUVEC_CGROUP_CONGESTED, &lruvec->flags);
	clear_bit(PGDAT_DIRTY, &pgdat->flags);
	clear_bit(PGDAT_WRITEBACK, &pgdat->flags);
}

/*
 * Prepare kswapd for sleeping. This verifies that there are no processes
 * waiting in throttle_direct_reclaim() and that watermarks have been met.
 *
 * Returns true if kswapd is ready to sleep
 */
static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order,
				int highest_zoneidx)
{
	/*
	 * The throttled processes are normally woken up in balance_pgdat() as
	 * soon as allow_direct_reclaim() is true. But there is a potential
	 * race between when kswapd checks the watermarks and a process gets
	 * throttled. There is also a potential race if processes get
	 * throttled, kswapd wakes, a large process exits thereby balancing the
	 * zones, which causes kswapd to exit balance_pgdat() before reaching
	 * the wake up checks. If kswapd is going to sleep, no process should
	 * be sleeping on pfmemalloc_wait, so wake them now if necessary. If
	 * the wake up is premature, processes will wake kswapd and get
	 * throttled again. The difference from wake ups in balance_pgdat() is
	 * that here we are under prepare_to_wait().
	 */
	if (waitqueue_active(&pgdat->pfmemalloc_wait))
		wake_up_all(&pgdat->pfmemalloc_wait);

	/* Hopeless node, leave it to direct reclaim */
	if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
		return true;

	if (pgdat_balanced(pgdat, order, highest_zoneidx)) {
		clear_pgdat_congested(pgdat);
		return true;
	}

	return false;
}
732862306a36Sopenharmony_ci
/*
 * kswapd shrinks a node, reclaiming pages that are at or below the highest
 * usable zone that is currently unbalanced.
 *
 * Returns true if kswapd scanned at least the requested number of pages to
 * reclaim or if the lack of progress was due to pages under writeback.
 * This is used to determine if the scanning priority needs to be raised.
 */
static bool kswapd_shrink_node(pg_data_t *pgdat,
			       struct scan_control *sc)
{
	struct zone *zone;
	int z;

	/* Reclaim a number of pages proportional to the number of zones */
	sc->nr_to_reclaim = 0;
	for (z = 0; z <= sc->reclaim_idx; z++) {
		zone = pgdat->node_zones + z;
		if (!managed_zone(zone))
			continue;

		sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX);
	}

	/*
	 * Historically care was taken to put equal pressure on all zones but
	 * now pressure is applied based on node LRU order.
	 */
#ifdef CONFIG_HYPERHOLD_FILE_LRU
	shrink_node_hyperhold(pgdat, sc);
#else
	shrink_node(pgdat, sc);
#endif

	/*
	 * Fragmentation may mean that the system cannot be rebalanced for
	 * high-order allocations. If twice the allocation size has been
	 * reclaimed then recheck watermarks only at order-0 to prevent
	 * excessive reclaim. Assume that a process that requested a
	 * high-order allocation can direct reclaim/compact.
	 */
	if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order))
		sc->order = 0;

	return sc->nr_scanned >= sc->nr_to_reclaim;
}

/* Page allocator PCP high watermark is lowered if reclaim is active. */
static inline void
update_reclaim_active(pg_data_t *pgdat, int highest_zoneidx, bool active)
{
	int i;
	struct zone *zone;

	for (i = 0; i <= highest_zoneidx; i++) {
		zone = pgdat->node_zones + i;

		if (!managed_zone(zone))
			continue;

		if (active)
			set_bit(ZONE_RECLAIM_ACTIVE, &zone->flags);
		else
			clear_bit(ZONE_RECLAIM_ACTIVE, &zone->flags);
	}
}

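/* Convenience wrappers for marking/unmarking reclaim on all eligible zones. */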
static inline void
set_reclaim_active(pg_data_t *pgdat, int highest_zoneidx)
{
	update_reclaim_active(pgdat, highest_zoneidx, true);
}

static inline void
clear_reclaim_active(pg_data_t *pgdat, int highest_zoneidx)
{
	update_reclaim_active(pgdat, highest_zoneidx, false);
}

/*
 * For kswapd, balance_pgdat() will reclaim pages across a node from zones
 * that are eligible for use by the caller until at least one zone is
 * balanced.
 *
 * Returns the order kswapd finished reclaiming at.
 *
 * kswapd scans the zones in the highmem->normal->dma direction.  It skips
 * zones which have free_pages > high_wmark_pages(zone), but once a zone is
 * found to have free_pages <= high_wmark_pages(zone), any page in that zone
 * or lower is eligible for reclaim until at least one usable zone is
 * balanced.
 */
static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
{
	int i;
	unsigned long nr_soft_reclaimed;
	unsigned long nr_soft_scanned;
	unsigned long pflags;
	unsigned long nr_boost_reclaim;
	unsigned long zone_boosts[MAX_NR_ZONES] = { 0, };
	bool boosted;
	struct zone *zone;
	struct scan_control sc = {
		.gfp_mask = GFP_KERNEL,
		.order = order,
		.may_unmap = 1,
	};

	set_task_reclaim_state(current, &sc.reclaim_state);
	psi_memstall_enter(&pflags);
	__fs_reclaim_acquire(_THIS_IP_);

	count_vm_event(PAGEOUTRUN);

	/*
	 * Account for the reclaim boost. Note that the zone boost is left in
	 * place so that parallel allocations that are near the watermark will
	 * stall or direct reclaim until kswapd is finished.
	 */
	nr_boost_reclaim = 0;
	for (i = 0; i <= highest_zoneidx; i++) {
		zone = pgdat->node_zones + i;
		if (!managed_zone(zone))
			continue;

		nr_boost_reclaim += zone->watermark_boost;
		zone_boosts[i] = zone->watermark_boost;
	}
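	/* boosted is a bool: true if any eligible zone carried a watermark boost. */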
	boosted = nr_boost_reclaim;

restart:
	set_reclaim_active(pgdat, highest_zoneidx);
	sc.priority = DEF_PRIORITY;
	do {
		unsigned long nr_reclaimed = sc.nr_reclaimed;
		bool raise_priority = true;
		bool balanced;
		bool ret;

		sc.reclaim_idx = highest_zoneidx;

		/*
		 * If the number of buffer_heads exceeds the maximum allowed
		 * then consider reclaiming from all zones. This has a dual
		 * purpose -- on 64-bit systems it is expected that
		 * buffer_heads are stripped during active rotation. On 32-bit
		 * systems, highmem pages can pin lowmem memory and shrinking
		 * buffers can relieve lowmem pressure. Reclaim may still not
		 * go ahead if all eligible zones for the original allocation
		 * request are balanced to avoid excessive reclaim from kswapd.
		 */
		if (buffer_heads_over_limit) {
			for (i = MAX_NR_ZONES - 1; i >= 0; i--) {
				zone = pgdat->node_zones + i;
				if (!managed_zone(zone))
					continue;

				sc.reclaim_idx = i;
				break;
			}
		}

		/*
		 * If the pgdat is imbalanced then ignore boosting and preserve
		 * the watermarks for a later time and restart. Note that the
		 * zone watermarks will still be reset at the end of balancing
		 * on the grounds that the normal reclaim should be enough to
		 * re-evaluate if boosting is required when kswapd next wakes.
		 */
		balanced = pgdat_balanced(pgdat, sc.order, highest_zoneidx);
		if (!balanced && nr_boost_reclaim) {
			nr_boost_reclaim = 0;
			goto restart;
		}

		/*
		 * If boosting is not active then only reclaim if there are no
		 * eligible zones. Note that sc.reclaim_idx is not used as
		 * buffer_heads_over_limit may have adjusted it.
		 */
		if (!nr_boost_reclaim && balanced)
			goto out;

		/* Limit the priority of boosting to avoid reclaim writeback */
		if (nr_boost_reclaim && sc.priority == DEF_PRIORITY - 2)
			raise_priority = false;

		/*
		 * Do not writeback or swap pages for boosted reclaim. The
		 * intent is to relieve pressure not issue sub-optimal IO
		 * from reclaim context. If no pages are reclaimed, the
		 * reclaim will be aborted.
		 */
		sc.may_writepage = !laptop_mode && !nr_boost_reclaim;
		sc.may_swap = !nr_boost_reclaim;

		/*
		 * Do some background aging, to give pages a chance to be
		 * referenced before reclaiming. All pages are rotated
		 * regardless of classzone as this is about consistent aging.
		 */
		kswapd_age_node(pgdat, &sc);

		/*
		 * If we're having trouble reclaiming, start doing writepage
		 * even in laptop mode.
		 */
		if (sc.priority < DEF_PRIORITY - 2)
			sc.may_writepage = 1;

		/* Call soft limit reclaim before calling shrink_node. */
		sc.nr_scanned = 0;
		nr_soft_scanned = 0;
		nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(pgdat, sc.order,
						sc.gfp_mask, &nr_soft_scanned);
		sc.nr_reclaimed += nr_soft_reclaimed;

		/*
		 * There should be no need to raise the scanning priority if
		 * enough pages are already being scanned that the high
		 * watermark would be met at 100% efficiency.
		 */
		if (kswapd_shrink_node(pgdat, &sc))
			raise_priority = false;

		/*
		 * If the low watermark is met there is no need for processes
		 * to be throttled on pfmemalloc_wait as they should now be
		 * able to safely make forward progress. Wake them.
		 */
		if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
				allow_direct_reclaim(pgdat))
			wake_up_all(&pgdat->pfmemalloc_wait);

		/* Check if kswapd should be suspending */
		__fs_reclaim_release(_THIS_IP_);
		ret = try_to_freeze();
		__fs_reclaim_acquire(_THIS_IP_);
		if (ret || kthread_should_stop())
			break;

		/*
		 * Raise priority if scanning rate is too low or there was no
		 * progress in reclaiming pages
		 */
		nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
		nr_boost_reclaim -= min(nr_boost_reclaim, nr_reclaimed);

		/*
		 * If reclaim made no progress for a boost, stop reclaim as
		 * IO cannot be queued and it could be an infinite loop in
		 * extreme circumstances.
		 */
		if (nr_boost_reclaim && !nr_reclaimed)
			break;

		if (raise_priority || !nr_reclaimed)
			sc.priority--;
	} while (sc.priority >= 1);

	if (!sc.nr_reclaimed)
		pgdat->kswapd_failures++;

out:
	clear_reclaim_active(pgdat, highest_zoneidx);

	/* If reclaim was boosted, account for the reclaim done in this pass */
	if (boosted) {
		unsigned long flags;

		for (i = 0; i <= highest_zoneidx; i++) {
			if (!zone_boosts[i])
				continue;

			/* Increments are under the zone lock */
			zone = pgdat->node_zones + i;
			spin_lock_irqsave(&zone->lock, flags);
			zone->watermark_boost -= min(zone->watermark_boost, zone_boosts[i]);
			spin_unlock_irqrestore(&zone->lock, flags);
		}

		/*
		 * As there is now likely space, wake up kcompactd to
		 * defragment pageblocks.
		 */
		wakeup_kcompactd(pgdat, pageblock_order, highest_zoneidx);
	}

	snapshot_refaults(NULL, pgdat);
	__fs_reclaim_release(_THIS_IP_);
	psi_memstall_leave(&pflags);
	set_task_reclaim_state(current, NULL);

	/*
	 * Return the order kswapd stopped reclaiming at as
	 * prepare_kswapd_sleep() takes it into account. If another caller
	 * entered the allocator slow path while kswapd was awake, order will
	 * remain at the higher level.
	 */
	return sc.order;
}

/*
 * The pgdat->kswapd_highest_zoneidx is used to pass the highest zone index to
 * be reclaimed by kswapd from the waker. If the value is MAX_NR_ZONES, which
 * is not a valid index, then either kswapd is running for the first time or
 * kswapd couldn't sleep after the previous reclaim attempt (the node is still
 * unbalanced). In that case return the zone index of the previous kswapd
 * reclaim cycle.
 */
static enum zone_type kswapd_highest_zoneidx(pg_data_t *pgdat,
					   enum zone_type prev_highest_zoneidx)
{
	enum zone_type curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx);

	return curr_idx == MAX_NR_ZONES ? prev_highest_zoneidx : curr_idx;
}

static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
				unsigned int highest_zoneidx)
{
	long remaining = 0;
	DEFINE_WAIT(wait);

	if (freezing(current) || kthread_should_stop())
		return;

	prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);

	/*
	 * Try to sleep for a short interval. Note that kcompactd will only be
	 * woken if it is possible to sleep for a short interval. This is
	 * deliberate on the assumption that if reclaim cannot keep an
	 * eligible zone balanced that it's also unlikely that compaction will
	 * succeed.
	 */
	if (prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {
		/*
		 * Compaction records which page blocks it recently failed to
		 * isolate pages from and skips them in future scans. When
		 * kswapd is going to sleep, it is reasonable to assume that
		 * enough pages have been freed that compaction may now
		 * succeed, so reset the cache.
		 */
		reset_isolation_suitable(pgdat);

		/*
		 * We have freed the memory, now we should compact it to make
		 * allocation of the requested order possible.
		 */
		wakeup_kcompactd(pgdat, alloc_order, highest_zoneidx);

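		/* Nap for HZ/10 jiffies (~100ms); a nonzero return means a premature wakeup. */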
		remaining = schedule_timeout(HZ/10);

		/*
		 * If woken prematurely then reset kswapd_highest_zoneidx and
		 * order. The values will either be from a wakeup request or
		 * the previous request that slept prematurely.
		 */
		if (remaining) {
			WRITE_ONCE(pgdat->kswapd_highest_zoneidx,
					kswapd_highest_zoneidx(pgdat,
							highest_zoneidx));

			if (READ_ONCE(pgdat->kswapd_order) < reclaim_order)
				WRITE_ONCE(pgdat->kswapd_order, reclaim_order);
		}

		finish_wait(&pgdat->kswapd_wait, &wait);
		prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
	}

	/*
	 * After a short sleep, check if it was a premature sleep. If not, then
	 * go fully to sleep until explicitly woken up.
	 */
	if (!remaining &&
	    prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {
		trace_mm_vmscan_kswapd_sleep(pgdat->node_id);

		/*
		 * vmstat counters are not perfectly accurate and the estimated
		 * value for counters such as NR_FREE_PAGES can deviate from the
		 * true value by nr_online_cpus * threshold. To avoid the zone
		 * watermarks being breached while under pressure, we reduce the
		 * per-cpu vmstat threshold while kswapd is awake and restore
		 * them before going back to sleep.
		 */
		set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);

		if (!kthread_should_stop())
			schedule();

		set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
	} else {
		if (remaining)
			count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
		else
			count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
	}
	finish_wait(&pgdat->kswapd_wait, &wait);
}

/*
 * The background pageout daemon, started as a kernel thread
 * from the init process.
 *
 * This basically trickles out pages so that we have _some_
 * free memory available even if there is no other activity
 * that frees anything up. This is needed for things like routing
 * etc, where we otherwise might have all activity going on in
 * asynchronous contexts that cannot page things out.
 *
 * If there are applications that are active memory-allocators
 * (most normal use), this basically shouldn't matter.
 */
static int kswapd(void *p)
{
	unsigned int alloc_order, reclaim_order;
	unsigned int highest_zoneidx = MAX_NR_ZONES - 1;
	pg_data_t *pgdat = (pg_data_t *)p;
	struct task_struct *tsk = current;
	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);

	if (!cpumask_empty(cpumask))
		set_cpus_allowed_ptr(tsk, cpumask);

	/*
	 * Tell the memory management that we're a "memory allocator",
	 * and that if we need more memory we should get access to it
	 * regardless (see "__alloc_pages()"). "kswapd" should
	 * never get caught in the normal page freeing logic.
	 *
	 * (Kswapd normally doesn't need memory anyway, but sometimes
	 * you need a small amount of memory in order to be able to
	 * page out something else, and this flag essentially protects
	 * us from recursively trying to free more memory as we're
	 * trying to free the first piece of memory in the first place).
	 */
	tsk->flags |= PF_MEMALLOC | PF_KSWAPD;
	set_freezable();

	WRITE_ONCE(pgdat->kswapd_order, 0);
	WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
	atomic_set(&pgdat->nr_writeback_throttled, 0);
	for ( ; ; ) {
		bool ret;

		alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order);
		highest_zoneidx = kswapd_highest_zoneidx(pgdat,
							highest_zoneidx);

kswapd_try_sleep:
		kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
					highest_zoneidx);

		/* Read the new order and highest_zoneidx */
		alloc_order = READ_ONCE(pgdat->kswapd_order);
		highest_zoneidx = kswapd_highest_zoneidx(pgdat,
							highest_zoneidx);
		WRITE_ONCE(pgdat->kswapd_order, 0);
		WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);

		ret = try_to_freeze();
		if (kthread_should_stop())
			break;

		/*
		 * We can speed up thawing tasks if we don't call balance_pgdat
		 * after returning from the refrigerator
		 */
		if (ret)
			continue;

		/*
		 * Reclaim begins at the requested order but if a high-order
		 * reclaim fails then kswapd falls back to reclaiming for
		 * order-0. If that happens, kswapd will consider sleeping
		 * for the order it finished reclaiming at (reclaim_order)
		 * but kcompactd is woken to compact for the original
		 * request (alloc_order).
		 */
		trace_mm_vmscan_kswapd_wake(pgdat->node_id, highest_zoneidx,
						alloc_order);
#ifdef CONFIG_MEMORY_MONITOR
		kswapd_monitor_wake_up_queue();
#endif
		reclaim_order = balance_pgdat(pgdat, alloc_order,
						highest_zoneidx);
		if (reclaim_order < alloc_order)
			goto kswapd_try_sleep;
	}

	tsk->flags &= ~(PF_MEMALLOC | PF_KSWAPD);

	return 0;
}

/*
 * A zone is low on free memory or too fragmented for high-order memory.  If
 * kswapd should reclaim (direct reclaim is deferred), wake it up for the zone's
 * pgdat.  It will wake up kcompactd after reclaiming memory.  If kswapd reclaim
 * has failed or is not needed, still wake up kcompactd if only compaction is
 * needed.
 */
void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
		   enum zone_type highest_zoneidx)
{
	pg_data_t *pgdat;
	enum zone_type curr_idx;

	if (!managed_zone(zone))
		return;

	if (!cpuset_zone_allowed(zone, gfp_flags))
		return;

	pgdat = zone->zone_pgdat;
	curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx);

	if (curr_idx == MAX_NR_ZONES || curr_idx < highest_zoneidx)
		WRITE_ONCE(pgdat->kswapd_highest_zoneidx, highest_zoneidx);

	if (READ_ONCE(pgdat->kswapd_order) < order)
		WRITE_ONCE(pgdat->kswapd_order, order);

	if (!waitqueue_active(&pgdat->kswapd_wait))
		return;

	/* Hopeless node, leave it to direct reclaim if possible */
	if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
	    (pgdat_balanced(pgdat, order, highest_zoneidx) &&
	     !pgdat_watermark_boosted(pgdat, highest_zoneidx))) {
		/*
		 * There may be plenty of free memory available, but it's too
		 * fragmented for high-order allocations.  Wake up kcompactd
		 * and rely on compaction_suitable() to determine if it's
		 * needed.  If it fails, it will defer subsequent attempts to
		 * ratelimit its work.
		 */
		if (!(gfp_flags & __GFP_DIRECT_RECLAIM))
			wakeup_kcompactd(pgdat, order, highest_zoneidx);
		return;
	}

	trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, highest_zoneidx, order,
				      gfp_flags);
	wake_up_interruptible(&pgdat->kswapd_wait);
}

#ifdef CONFIG_HIBERNATION
/*
 * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
 * freed pages.
 *
 * Rather than trying to age LRUs the aim is to preserve the overall
 * LRU order by reclaiming preferentially
 * inactive > active > active referenced > active mapped
 */
unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
{
	struct scan_control sc = {
		.nr_to_reclaim = nr_to_reclaim,
		.gfp_mask = GFP_HIGHUSER_MOVABLE,
		.reclaim_idx = MAX_NR_ZONES - 1,
		.priority = DEF_PRIORITY,
		.may_writepage = 1,
		.may_unmap = 1,
		.may_swap = 1,
		.hibernation_mode = 1,
	};
	struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
	unsigned long nr_reclaimed;
	unsigned int noreclaim_flag;

	fs_reclaim_acquire(sc.gfp_mask);
	noreclaim_flag = memalloc_noreclaim_save();
	set_task_reclaim_state(current, &sc.reclaim_state);

	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);

	set_task_reclaim_state(current, NULL);
	memalloc_noreclaim_restore(noreclaim_flag);
	fs_reclaim_release(sc.gfp_mask);

	return nr_reclaimed;
}
#endif /* CONFIG_HIBERNATION */

/*
 * This kswapd start function will be called by init and node-hot-add.
 */
void __meminit kswapd_run(int nid)
{
	pg_data_t *pgdat = NODE_DATA(nid);

	pgdat_kswapd_lock(pgdat);
	if (!pgdat->kswapd) {
		pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
		if (IS_ERR(pgdat->kswapd)) {
			/* failure at boot is fatal */
			BUG_ON(system_state < SYSTEM_RUNNING);
			pr_err("Failed to start kswapd on node %d\n", nid);
			pgdat->kswapd = NULL;
		}
	}
	pgdat_kswapd_unlock(pgdat);
}

/*
 * Called by memory hotplug when all memory in a node is offlined.  Caller must
 * be holding mem_hotplug_begin/done().
 */
void __meminit kswapd_stop(int nid)
{
	pg_data_t *pgdat = NODE_DATA(nid);
	struct task_struct *kswapd;

	pgdat_kswapd_lock(pgdat);
	kswapd = pgdat->kswapd;
	if (kswapd) {
		kthread_stop(kswapd);
		pgdat->kswapd = NULL;
	}
	pgdat_kswapd_unlock(pgdat);
}

#ifdef CONFIG_MEM_PURGEABLE_DEBUG
static void __init purgeable_debugfs_init(void);
#endif

static int __init kswapd_init(void)
{
	int nid;

	swap_setup();
	for_each_node_state(nid, N_MEMORY)
		kswapd_run(nid);
#ifdef CONFIG_MEM_PURGEABLE_DEBUG
	purgeable_debugfs_init();
#endif
	return 0;
}

module_init(kswapd_init)

#ifdef CONFIG_NUMA
/*
 * Node reclaim mode
 *
 * If non-zero call node_reclaim when the number of free pages falls below
 * the watermarks.
 */
int node_reclaim_mode __read_mostly;
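/*
 * node_reclaim_mode is exposed to userspace through the vm.zone_reclaim_mode
 * sysctl (the historical name is retained); RECLAIM_WRITE and RECLAIM_UNMAP
 * tested below are bit flags within it.
 */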

/*
 * Priority for NODE_RECLAIM. This determines the fraction of pages
 * of a node considered for each node_reclaim pass. Priority 4 scans
 * 1/16th of the node.
 */
#define NODE_RECLAIM_PRIORITY 4

/*
 * Percentage of pages on a node that must be unmapped for node_reclaim to
 * occur.
 */
int sysctl_min_unmapped_ratio = 1;

/*
 * If the number of slab pages on a node grows beyond this percentage then
 * slab reclaim needs to occur.
 */
int sysctl_min_slab_ratio = 5;
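/*
 * These ratios are converted into the pgdat->min_unmapped_pages and
 * pgdat->min_slab_pages thresholds consumed below; the conversion is
 * performed by the corresponding sysctl handlers outside this file.
 */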

static inline unsigned long node_unmapped_file_pages(struct pglist_data *pgdat)
{
	unsigned long file_mapped = node_page_state(pgdat, NR_FILE_MAPPED);
	unsigned long file_lru = node_page_state(pgdat, NR_INACTIVE_FILE) +
		node_page_state(pgdat, NR_ACTIVE_FILE);

	/*
	 * It's possible for there to be more file mapped pages than
	 * accounted for by the pages on the file LRU lists because
	 * tmpfs pages accounted for as ANON can also be FILE_MAPPED
	 */
	return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
}

/* Work out how many page cache pages we can reclaim in this reclaim_mode */
static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat)
{
	unsigned long nr_pagecache_reclaimable;
	unsigned long delta = 0;

	/*
	 * If RECLAIM_UNMAP is set, then all file pages are considered
	 * potentially reclaimable. Otherwise, we have to worry about
	 * pages like swapcache and node_unmapped_file_pages() provides
	 * a better estimate
	 */
	if (node_reclaim_mode & RECLAIM_UNMAP)
		nr_pagecache_reclaimable = node_page_state(pgdat, NR_FILE_PAGES);
	else
		nr_pagecache_reclaimable = node_unmapped_file_pages(pgdat);

	/* If we can't clean pages, remove dirty pages from consideration */
	if (!(node_reclaim_mode & RECLAIM_WRITE))
		delta += node_page_state(pgdat, NR_FILE_DIRTY);

	/* Watch for any possible underflows due to delta */
	if (unlikely(delta > nr_pagecache_reclaimable))
		delta = nr_pagecache_reclaimable;

	return nr_pagecache_reclaimable - delta;
}

/*
 * Try to free up some pages from this node through reclaim.
 */
static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
{
	/* Minimum pages needed in order to stay on node */
	const unsigned long nr_pages = 1 << order;
	struct task_struct *p = current;
	unsigned int noreclaim_flag;
	struct scan_control sc = {
		.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
		.gfp_mask = current_gfp_context(gfp_mask),
		.order = order,
		.priority = NODE_RECLAIM_PRIORITY,
		.may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
		.may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
		.may_swap = 1,
		.reclaim_idx = gfp_zone(gfp_mask),
	};
	unsigned long pflags;

	trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order,
					   sc.gfp_mask);

	cond_resched();
	psi_memstall_enter(&pflags);
	fs_reclaim_acquire(sc.gfp_mask);
	/*
	 * We need to be able to allocate from the reserves for RECLAIM_UNMAP
	 */
	noreclaim_flag = memalloc_noreclaim_save();
	set_task_reclaim_state(p, &sc.reclaim_state);

	if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages ||
	    node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) > pgdat->min_slab_pages) {
		/*
		 * Free memory by calling shrink node with increasing
		 * priorities until we have enough memory freed.
		 */
		do {
#ifdef CONFIG_HYPERHOLD_FILE_LRU
			shrink_node_hyperhold(pgdat, &sc);
#else
			shrink_node(pgdat, &sc);
#endif
		} while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
	}

	set_task_reclaim_state(p, NULL);
	memalloc_noreclaim_restore(noreclaim_flag);
	fs_reclaim_release(sc.gfp_mask);
	psi_memstall_leave(&pflags);

	trace_mm_vmscan_node_reclaim_end(sc.nr_reclaimed);

	return sc.nr_reclaimed >= nr_pages;
}

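/*
 * Entry point from the page allocator. Returns NODE_RECLAIM_FULL if the node
 * has nothing reclaimable, NODE_RECLAIM_NOSCAN if reclaim was not attempted,
 * otherwise whether __node_reclaim() freed enough pages.
 */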
int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
{
	int ret;

	/*
	 * Node reclaim reclaims unmapped file backed pages and
	 * slab pages if we are over the defined limits.
	 *
	 * A small portion of unmapped file backed pages is needed for
	 * file I/O otherwise pages read by file I/O will be immediately
	 * thrown out if the node is overallocated. So we do not reclaim
	 * if less than a specified percentage of the node is used by
	 * unmapped file backed pages.
	 */
	if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages &&
	    node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) <=
	    pgdat->min_slab_pages)
		return NODE_RECLAIM_FULL;

	/*
	 * Do not scan if the allocation should not be delayed.
	 */
	if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC))
		return NODE_RECLAIM_NOSCAN;

	/*
	 * Only run node reclaim on the local node or on nodes that do not
	 * have associated processors. This will favor the local processor
	 * over remote processors and spread off node memory allocations
	 * as wide as possible.
	 */
	if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id())
		return NODE_RECLAIM_NOSCAN;

	if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
		return NODE_RECLAIM_NOSCAN;

	ret = __node_reclaim(pgdat, gfp_mask, order);
	clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags);

	if (!ret)
		count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);

	return ret;
}
#endif

/**
 * check_move_unevictable_folios - Move evictable folios to appropriate zone
 * lru list
 * @fbatch: Batch of lru folios to check.
 *
 * Checks folios for evictability. If an evictable folio is on the unevictable
 * lru list, it is moved to the appropriate evictable lru list. This function
 * should only be used for lru folios.
 */
void check_move_unevictable_folios(struct folio_batch *fbatch)
{
	struct lruvec *lruvec = NULL;
	int pgscanned = 0;
	int pgrescued = 0;
	int i;

	for (i = 0; i < fbatch->nr; i++) {
		struct folio *folio = fbatch->folios[i];
		int nr_pages = folio_nr_pages(folio);

		pgscanned += nr_pages;

		/* block memcg migration while the folio moves between lrus */
		if (!folio_test_clear_lru(folio))
			continue;

		lruvec = folio_lruvec_relock_irq(folio, lruvec);
		if (folio_evictable(folio) && folio_test_unevictable(folio)) {
			lruvec_del_folio(lruvec, folio);
			folio_clear_unevictable(folio);
			lruvec_add_folio(lruvec, folio);
			pgrescued += nr_pages;
		}
		folio_set_lru(folio);
	}

	if (lruvec) {
		__count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
		__count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
		unlock_page_lruvec_irq(lruvec);
	} else if (pgscanned) {
		count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
	}
}
EXPORT_SYMBOL_GPL(check_move_unevictable_folios);

#ifdef CONFIG_MEM_PURGEABLE_DEBUG
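/*
 * Reclaim the purgeable LRU lists on @pgdata, walking every memcg when
 * CONFIG_MEMCG is enabled. Returns the number of pages reclaimed from the
 * inactive purgeable lists.
 */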
static unsigned long purgeable_node(pg_data_t *pgdata, struct scan_control *sc)
{
	struct mem_cgroup *memcg = NULL;
	unsigned long nr = 0;
#ifdef CONFIG_MEMCG
	while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)))
#endif
	{
		struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdata);

		shrink_list(LRU_ACTIVE_PURGEABLE, -1, lruvec, sc);
		nr += shrink_list(LRU_INACTIVE_PURGEABLE, -1, lruvec, sc);
	}

	pr_info("reclaim %lu purgeable pages.\n", nr);

	return nr;
}

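/*
 * sysctl handler for the "purgeable" entry registered below: any read or
 * write (e.g. `echo 1 > /proc/sys/kernel/purgeable`, assuming the default
 * procfs layout) triggers reclaim of purgeable pages on every memory node.
 * Only the memory manager (GLOBAL_MEMMGR_UID) and root are allowed to do so.
 */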
static int purgeable(struct ctl_table *table, int write, void *buffer,
		size_t *lenp, loff_t *ppos)
{
	struct scan_control sc = {
		.gfp_mask = GFP_KERNEL,
		.order = 0,
		.priority = DEF_PRIORITY,
		.may_deactivate = DEACTIVATE_ANON,
		.may_writepage = 1,
		.may_unmap = 1,
		.may_swap = 1,
		.reclaim_idx = MAX_NR_ZONES - 1,
	};
	int nid = 0;
	const struct cred *cred = current_cred();

	if (!cred)
		return 0;

	if (!uid_eq(cred->euid, GLOBAL_MEMMGR_UID) &&
	    !uid_eq(cred->euid, GLOBAL_ROOT_UID)) {
		pr_err("no permission to shrink purgeable heap!\n");
		return -EINVAL;
	}

	for_each_node_state(nid, N_MEMORY)
		purgeable_node(NODE_DATA(nid), &sc);
	return 0;
}

static struct ctl_table ker_tab[] = {
	{
		.procname = "purgeable",
		.mode = 0666,
		.proc_handler = purgeable,
	},
	{},
};

static struct ctl_table_header *purgeable_header;

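/*
 * Despite the "debugfs" naming, this registers the "purgeable" entry as a
 * sysctl under kernel/ rather than in debugfs.
 */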
static void __init purgeable_debugfs_init(void)
{
	purgeable_header = register_sysctl("kernel", ker_tab);
	if (!purgeable_header)
		pr_err("register purgeable sysctl table failed.\n");
}

static void __exit purgeable_debugfs_exit(void)
{
	unregister_sysctl_table(purgeable_header);
}
#endif /* CONFIG_MEM_PURGEABLE_DEBUG */