162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0
262306a36Sopenharmony_ci#include <linux/init.h>
362306a36Sopenharmony_ci#include <linux/memblock.h>
462306a36Sopenharmony_ci#include <linux/fs.h>
562306a36Sopenharmony_ci#include <linux/sysfs.h>
662306a36Sopenharmony_ci#include <linux/kobject.h>
762306a36Sopenharmony_ci#include <linux/memory_hotplug.h>
862306a36Sopenharmony_ci#include <linux/mm.h>
962306a36Sopenharmony_ci#include <linux/mmzone.h>
1062306a36Sopenharmony_ci#include <linux/pagemap.h>
1162306a36Sopenharmony_ci#include <linux/rmap.h>
1262306a36Sopenharmony_ci#include <linux/mmu_notifier.h>
1362306a36Sopenharmony_ci#include <linux/page_ext.h>
1462306a36Sopenharmony_ci#include <linux/page_idle.h>
1562306a36Sopenharmony_ci
1662306a36Sopenharmony_ci#include "internal.h"
1762306a36Sopenharmony_ci
/*
 * The bitmap handlers in this file treat the user buffer as an array of u64
 * chunks: offsets and lengths must be multiples of BITMAP_CHUNK_SIZE bytes,
 * and each chunk carries BITMAP_CHUNK_BITS page bits.
 */
#define BITMAP_CHUNK_SIZE	(sizeof(u64))
#define BITMAP_CHUNK_BITS	(BITMAP_CHUNK_SIZE * BITS_PER_BYTE)
2062306a36Sopenharmony_ci
2162306a36Sopenharmony_ci/*
2262306a36Sopenharmony_ci * Idle page tracking only considers user memory pages, for other types of
2362306a36Sopenharmony_ci * pages the idle flag is always unset and an attempt to set it is silently
2462306a36Sopenharmony_ci * ignored.
2562306a36Sopenharmony_ci *
2662306a36Sopenharmony_ci * We treat a page as a user memory page if it is on an LRU list, because it is
2762306a36Sopenharmony_ci * always safe to pass such a page to rmap_walk(), which is essential for idle
2862306a36Sopenharmony_ci * page tracking. With such an indicator of user pages we can skip isolated
2962306a36Sopenharmony_ci * pages, but since there are not usually many of them, it will hardly affect
3062306a36Sopenharmony_ci * the overall result.
3162306a36Sopenharmony_ci *
3262306a36Sopenharmony_ci * This function tries to get a user memory page by pfn as described above.
3362306a36Sopenharmony_ci */
3462306a36Sopenharmony_cistatic struct folio *page_idle_get_folio(unsigned long pfn)
3562306a36Sopenharmony_ci{
3662306a36Sopenharmony_ci	struct page *page = pfn_to_online_page(pfn);
3762306a36Sopenharmony_ci	struct folio *folio;
3862306a36Sopenharmony_ci
3962306a36Sopenharmony_ci	if (!page || PageTail(page))
4062306a36Sopenharmony_ci		return NULL;
4162306a36Sopenharmony_ci
4262306a36Sopenharmony_ci	folio = page_folio(page);
4362306a36Sopenharmony_ci	if (!folio_test_lru(folio) || !folio_try_get(folio))
4462306a36Sopenharmony_ci		return NULL;
4562306a36Sopenharmony_ci	if (unlikely(page_folio(page) != folio || !folio_test_lru(folio))) {
4662306a36Sopenharmony_ci		folio_put(folio);
4762306a36Sopenharmony_ci		folio = NULL;
4862306a36Sopenharmony_ci	}
4962306a36Sopenharmony_ci	return folio;
5062306a36Sopenharmony_ci}
5162306a36Sopenharmony_ci
5262306a36Sopenharmony_cistatic bool page_idle_clear_pte_refs_one(struct folio *folio,
5362306a36Sopenharmony_ci					struct vm_area_struct *vma,
5462306a36Sopenharmony_ci					unsigned long addr, void *arg)
5562306a36Sopenharmony_ci{
5662306a36Sopenharmony_ci	DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0);
5762306a36Sopenharmony_ci	bool referenced = false;
5862306a36Sopenharmony_ci
5962306a36Sopenharmony_ci	while (page_vma_mapped_walk(&pvmw)) {
6062306a36Sopenharmony_ci		addr = pvmw.address;
6162306a36Sopenharmony_ci		if (pvmw.pte) {
6262306a36Sopenharmony_ci			/*
6362306a36Sopenharmony_ci			 * For PTE-mapped THP, one sub page is referenced,
6462306a36Sopenharmony_ci			 * the whole THP is referenced.
6562306a36Sopenharmony_ci			 */
6662306a36Sopenharmony_ci			if (ptep_clear_young_notify(vma, addr, pvmw.pte))
6762306a36Sopenharmony_ci				referenced = true;
6862306a36Sopenharmony_ci		} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
6962306a36Sopenharmony_ci			if (pmdp_clear_young_notify(vma, addr, pvmw.pmd))
7062306a36Sopenharmony_ci				referenced = true;
7162306a36Sopenharmony_ci		} else {
7262306a36Sopenharmony_ci			/* unexpected pmd-mapped page? */
7362306a36Sopenharmony_ci			WARN_ON_ONCE(1);
7462306a36Sopenharmony_ci		}
7562306a36Sopenharmony_ci	}
7662306a36Sopenharmony_ci
7762306a36Sopenharmony_ci	if (referenced) {
7862306a36Sopenharmony_ci		folio_clear_idle(folio);
7962306a36Sopenharmony_ci		/*
8062306a36Sopenharmony_ci		 * We cleared the referenced bit in a mapping to this page. To
8162306a36Sopenharmony_ci		 * avoid interference with page reclaim, mark it young so that
8262306a36Sopenharmony_ci		 * folio_referenced() will return > 0.
8362306a36Sopenharmony_ci		 */
8462306a36Sopenharmony_ci		folio_set_young(folio);
8562306a36Sopenharmony_ci	}
8662306a36Sopenharmony_ci	return true;
8762306a36Sopenharmony_ci}
8862306a36Sopenharmony_ci
8962306a36Sopenharmony_cistatic void page_idle_clear_pte_refs(struct folio *folio)
9062306a36Sopenharmony_ci{
9162306a36Sopenharmony_ci	/*
9262306a36Sopenharmony_ci	 * Since rwc.try_lock is unused, rwc is effectively immutable, so we
9362306a36Sopenharmony_ci	 * can make it static to save some cycles and stack.
9462306a36Sopenharmony_ci	 */
9562306a36Sopenharmony_ci	static struct rmap_walk_control rwc = {
9662306a36Sopenharmony_ci		.rmap_one = page_idle_clear_pte_refs_one,
9762306a36Sopenharmony_ci		.anon_lock = folio_lock_anon_vma_read,
9862306a36Sopenharmony_ci	};
9962306a36Sopenharmony_ci	bool need_lock;
10062306a36Sopenharmony_ci
10162306a36Sopenharmony_ci	if (!folio_mapped(folio) || !folio_raw_mapping(folio))
10262306a36Sopenharmony_ci		return;
10362306a36Sopenharmony_ci
10462306a36Sopenharmony_ci	need_lock = !folio_test_anon(folio) || folio_test_ksm(folio);
10562306a36Sopenharmony_ci	if (need_lock && !folio_trylock(folio))
10662306a36Sopenharmony_ci		return;
10762306a36Sopenharmony_ci
10862306a36Sopenharmony_ci	rmap_walk(folio, &rwc);
10962306a36Sopenharmony_ci
11062306a36Sopenharmony_ci	if (need_lock)
11162306a36Sopenharmony_ci		folio_unlock(folio);
11262306a36Sopenharmony_ci}
11362306a36Sopenharmony_ci
11462306a36Sopenharmony_cistatic ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj,
11562306a36Sopenharmony_ci				     struct bin_attribute *attr, char *buf,
11662306a36Sopenharmony_ci				     loff_t pos, size_t count)
11762306a36Sopenharmony_ci{
11862306a36Sopenharmony_ci	u64 *out = (u64 *)buf;
11962306a36Sopenharmony_ci	struct folio *folio;
12062306a36Sopenharmony_ci	unsigned long pfn, end_pfn;
12162306a36Sopenharmony_ci	int bit;
12262306a36Sopenharmony_ci
12362306a36Sopenharmony_ci	if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE)
12462306a36Sopenharmony_ci		return -EINVAL;
12562306a36Sopenharmony_ci
12662306a36Sopenharmony_ci	pfn = pos * BITS_PER_BYTE;
12762306a36Sopenharmony_ci	if (pfn >= max_pfn)
12862306a36Sopenharmony_ci		return 0;
12962306a36Sopenharmony_ci
13062306a36Sopenharmony_ci	end_pfn = pfn + count * BITS_PER_BYTE;
13162306a36Sopenharmony_ci	if (end_pfn > max_pfn)
13262306a36Sopenharmony_ci		end_pfn = max_pfn;
13362306a36Sopenharmony_ci
13462306a36Sopenharmony_ci	for (; pfn < end_pfn; pfn++) {
13562306a36Sopenharmony_ci		bit = pfn % BITMAP_CHUNK_BITS;
13662306a36Sopenharmony_ci		if (!bit)
13762306a36Sopenharmony_ci			*out = 0ULL;
13862306a36Sopenharmony_ci		folio = page_idle_get_folio(pfn);
13962306a36Sopenharmony_ci		if (folio) {
14062306a36Sopenharmony_ci			if (folio_test_idle(folio)) {
14162306a36Sopenharmony_ci				/*
14262306a36Sopenharmony_ci				 * The page might have been referenced via a
14362306a36Sopenharmony_ci				 * pte, in which case it is not idle. Clear
14462306a36Sopenharmony_ci				 * refs and recheck.
14562306a36Sopenharmony_ci				 */
14662306a36Sopenharmony_ci				page_idle_clear_pte_refs(folio);
14762306a36Sopenharmony_ci				if (folio_test_idle(folio))
14862306a36Sopenharmony_ci					*out |= 1ULL << bit;
14962306a36Sopenharmony_ci			}
15062306a36Sopenharmony_ci			folio_put(folio);
15162306a36Sopenharmony_ci		}
15262306a36Sopenharmony_ci		if (bit == BITMAP_CHUNK_BITS - 1)
15362306a36Sopenharmony_ci			out++;
15462306a36Sopenharmony_ci		cond_resched();
15562306a36Sopenharmony_ci	}
15662306a36Sopenharmony_ci	return (char *)out - buf;
15762306a36Sopenharmony_ci}
15862306a36Sopenharmony_ci
15962306a36Sopenharmony_cistatic ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj,
16062306a36Sopenharmony_ci				      struct bin_attribute *attr, char *buf,
16162306a36Sopenharmony_ci				      loff_t pos, size_t count)
16262306a36Sopenharmony_ci{
16362306a36Sopenharmony_ci	const u64 *in = (u64 *)buf;
16462306a36Sopenharmony_ci	struct folio *folio;
16562306a36Sopenharmony_ci	unsigned long pfn, end_pfn;
16662306a36Sopenharmony_ci	int bit;
16762306a36Sopenharmony_ci
16862306a36Sopenharmony_ci	if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE)
16962306a36Sopenharmony_ci		return -EINVAL;
17062306a36Sopenharmony_ci
17162306a36Sopenharmony_ci	pfn = pos * BITS_PER_BYTE;
17262306a36Sopenharmony_ci	if (pfn >= max_pfn)
17362306a36Sopenharmony_ci		return -ENXIO;
17462306a36Sopenharmony_ci
17562306a36Sopenharmony_ci	end_pfn = pfn + count * BITS_PER_BYTE;
17662306a36Sopenharmony_ci	if (end_pfn > max_pfn)
17762306a36Sopenharmony_ci		end_pfn = max_pfn;
17862306a36Sopenharmony_ci
17962306a36Sopenharmony_ci	for (; pfn < end_pfn; pfn++) {
18062306a36Sopenharmony_ci		bit = pfn % BITMAP_CHUNK_BITS;
18162306a36Sopenharmony_ci		if ((*in >> bit) & 1) {
18262306a36Sopenharmony_ci			folio = page_idle_get_folio(pfn);
18362306a36Sopenharmony_ci			if (folio) {
18462306a36Sopenharmony_ci				page_idle_clear_pte_refs(folio);
18562306a36Sopenharmony_ci				folio_set_idle(folio);
18662306a36Sopenharmony_ci				folio_put(folio);
18762306a36Sopenharmony_ci			}
18862306a36Sopenharmony_ci		}
18962306a36Sopenharmony_ci		if (bit == BITMAP_CHUNK_BITS - 1)
19062306a36Sopenharmony_ci			in++;
19162306a36Sopenharmony_ci		cond_resched();
19262306a36Sopenharmony_ci	}
19362306a36Sopenharmony_ci	return (char *)in - buf;
19462306a36Sopenharmony_ci}
19562306a36Sopenharmony_ci
19662306a36Sopenharmony_cistatic struct bin_attribute page_idle_bitmap_attr =
19762306a36Sopenharmony_ci		__BIN_ATTR(bitmap, 0600,
19862306a36Sopenharmony_ci			   page_idle_bitmap_read, page_idle_bitmap_write, 0);
19962306a36Sopenharmony_ci
20062306a36Sopenharmony_cistatic struct bin_attribute *page_idle_bin_attrs[] = {
20162306a36Sopenharmony_ci	&page_idle_bitmap_attr,
20262306a36Sopenharmony_ci	NULL,
20362306a36Sopenharmony_ci};
20462306a36Sopenharmony_ci
20562306a36Sopenharmony_cistatic const struct attribute_group page_idle_attr_group = {
20662306a36Sopenharmony_ci	.bin_attrs = page_idle_bin_attrs,
20762306a36Sopenharmony_ci	.name = "page_idle",
20862306a36Sopenharmony_ci};
20962306a36Sopenharmony_ci
21062306a36Sopenharmony_cistatic int __init page_idle_init(void)
21162306a36Sopenharmony_ci{
21262306a36Sopenharmony_ci	int err;
21362306a36Sopenharmony_ci
21462306a36Sopenharmony_ci	err = sysfs_create_group(mm_kobj, &page_idle_attr_group);
21562306a36Sopenharmony_ci	if (err) {
21662306a36Sopenharmony_ci		pr_err("page_idle: register sysfs failed\n");
21762306a36Sopenharmony_ci		return err;
21862306a36Sopenharmony_ci	}
21962306a36Sopenharmony_ci	return 0;
22062306a36Sopenharmony_ci}
22162306a36Sopenharmony_cisubsys_initcall(page_idle_init);
222