xref: /kernel/linux/linux-5.10/mm/page_idle.c (revision 8c2ecf20)
18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0
28c2ecf20Sopenharmony_ci#include <linux/init.h>
38c2ecf20Sopenharmony_ci#include <linux/memblock.h>
48c2ecf20Sopenharmony_ci#include <linux/fs.h>
58c2ecf20Sopenharmony_ci#include <linux/sysfs.h>
68c2ecf20Sopenharmony_ci#include <linux/kobject.h>
78c2ecf20Sopenharmony_ci#include <linux/memory_hotplug.h>
88c2ecf20Sopenharmony_ci#include <linux/mm.h>
98c2ecf20Sopenharmony_ci#include <linux/mmzone.h>
108c2ecf20Sopenharmony_ci#include <linux/pagemap.h>
118c2ecf20Sopenharmony_ci#include <linux/rmap.h>
128c2ecf20Sopenharmony_ci#include <linux/mmu_notifier.h>
138c2ecf20Sopenharmony_ci#include <linux/page_ext.h>
148c2ecf20Sopenharmony_ci#include <linux/page_idle.h>
158c2ecf20Sopenharmony_ci
/*
 * The idle bitmap is transferred in units of one u64: file offsets and
 * lengths handled by the read/write handlers must be multiples of
 * BITMAP_CHUNK_SIZE bytes.
 */
#define BITMAP_CHUNK_SIZE	sizeof(u64)
/* Number of page frames (one bit each) described by a single chunk. */
#define BITMAP_CHUNK_BITS	(BITMAP_CHUNK_SIZE * BITS_PER_BYTE)
188c2ecf20Sopenharmony_ci
198c2ecf20Sopenharmony_ci/*
208c2ecf20Sopenharmony_ci * Idle page tracking only considers user memory pages, for other types of
218c2ecf20Sopenharmony_ci * pages the idle flag is always unset and an attempt to set it is silently
228c2ecf20Sopenharmony_ci * ignored.
238c2ecf20Sopenharmony_ci *
248c2ecf20Sopenharmony_ci * We treat a page as a user memory page if it is on an LRU list, because it is
258c2ecf20Sopenharmony_ci * always safe to pass such a page to rmap_walk(), which is essential for idle
268c2ecf20Sopenharmony_ci * page tracking. With such an indicator of user pages we can skip isolated
278c2ecf20Sopenharmony_ci * pages, but since there are not usually many of them, it will hardly affect
288c2ecf20Sopenharmony_ci * the overall result.
298c2ecf20Sopenharmony_ci *
308c2ecf20Sopenharmony_ci * This function tries to get a user memory page by pfn as described above.
318c2ecf20Sopenharmony_ci */
328c2ecf20Sopenharmony_cistatic struct page *page_idle_get_page(unsigned long pfn)
338c2ecf20Sopenharmony_ci{
348c2ecf20Sopenharmony_ci	struct page *page = pfn_to_online_page(pfn);
358c2ecf20Sopenharmony_ci	pg_data_t *pgdat;
368c2ecf20Sopenharmony_ci
378c2ecf20Sopenharmony_ci	if (!page || !PageLRU(page) ||
388c2ecf20Sopenharmony_ci	    !get_page_unless_zero(page))
398c2ecf20Sopenharmony_ci		return NULL;
408c2ecf20Sopenharmony_ci
418c2ecf20Sopenharmony_ci	pgdat = page_pgdat(page);
428c2ecf20Sopenharmony_ci	spin_lock_irq(&pgdat->lru_lock);
438c2ecf20Sopenharmony_ci	if (unlikely(!PageLRU(page))) {
448c2ecf20Sopenharmony_ci		put_page(page);
458c2ecf20Sopenharmony_ci		page = NULL;
468c2ecf20Sopenharmony_ci	}
478c2ecf20Sopenharmony_ci	spin_unlock_irq(&pgdat->lru_lock);
488c2ecf20Sopenharmony_ci	return page;
498c2ecf20Sopenharmony_ci}
508c2ecf20Sopenharmony_ci
518c2ecf20Sopenharmony_cistatic bool page_idle_clear_pte_refs_one(struct page *page,
528c2ecf20Sopenharmony_ci					struct vm_area_struct *vma,
538c2ecf20Sopenharmony_ci					unsigned long addr, void *arg)
548c2ecf20Sopenharmony_ci{
558c2ecf20Sopenharmony_ci	struct page_vma_mapped_walk pvmw = {
568c2ecf20Sopenharmony_ci		.page = page,
578c2ecf20Sopenharmony_ci		.vma = vma,
588c2ecf20Sopenharmony_ci		.address = addr,
598c2ecf20Sopenharmony_ci	};
608c2ecf20Sopenharmony_ci	bool referenced = false;
618c2ecf20Sopenharmony_ci
628c2ecf20Sopenharmony_ci	while (page_vma_mapped_walk(&pvmw)) {
638c2ecf20Sopenharmony_ci		addr = pvmw.address;
648c2ecf20Sopenharmony_ci		if (pvmw.pte) {
658c2ecf20Sopenharmony_ci			/*
668c2ecf20Sopenharmony_ci			 * For PTE-mapped THP, one sub page is referenced,
678c2ecf20Sopenharmony_ci			 * the whole THP is referenced.
688c2ecf20Sopenharmony_ci			 */
698c2ecf20Sopenharmony_ci			if (ptep_clear_young_notify(vma, addr, pvmw.pte))
708c2ecf20Sopenharmony_ci				referenced = true;
718c2ecf20Sopenharmony_ci		} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
728c2ecf20Sopenharmony_ci			if (pmdp_clear_young_notify(vma, addr, pvmw.pmd))
738c2ecf20Sopenharmony_ci				referenced = true;
748c2ecf20Sopenharmony_ci		} else {
758c2ecf20Sopenharmony_ci			/* unexpected pmd-mapped page? */
768c2ecf20Sopenharmony_ci			WARN_ON_ONCE(1);
778c2ecf20Sopenharmony_ci		}
788c2ecf20Sopenharmony_ci	}
798c2ecf20Sopenharmony_ci
808c2ecf20Sopenharmony_ci	if (referenced) {
818c2ecf20Sopenharmony_ci		clear_page_idle(page);
828c2ecf20Sopenharmony_ci		/*
838c2ecf20Sopenharmony_ci		 * We cleared the referenced bit in a mapping to this page. To
848c2ecf20Sopenharmony_ci		 * avoid interference with page reclaim, mark it young so that
858c2ecf20Sopenharmony_ci		 * page_referenced() will return > 0.
868c2ecf20Sopenharmony_ci		 */
878c2ecf20Sopenharmony_ci		set_page_young(page);
888c2ecf20Sopenharmony_ci	}
898c2ecf20Sopenharmony_ci	return true;
908c2ecf20Sopenharmony_ci}
918c2ecf20Sopenharmony_ci
928c2ecf20Sopenharmony_cistatic void page_idle_clear_pte_refs(struct page *page)
938c2ecf20Sopenharmony_ci{
948c2ecf20Sopenharmony_ci	/*
958c2ecf20Sopenharmony_ci	 * Since rwc.arg is unused, rwc is effectively immutable, so we
968c2ecf20Sopenharmony_ci	 * can make it static const to save some cycles and stack.
978c2ecf20Sopenharmony_ci	 */
988c2ecf20Sopenharmony_ci	static const struct rmap_walk_control rwc = {
998c2ecf20Sopenharmony_ci		.rmap_one = page_idle_clear_pte_refs_one,
1008c2ecf20Sopenharmony_ci		.anon_lock = page_lock_anon_vma_read,
1018c2ecf20Sopenharmony_ci	};
1028c2ecf20Sopenharmony_ci	bool need_lock;
1038c2ecf20Sopenharmony_ci
1048c2ecf20Sopenharmony_ci	if (!page_mapped(page) ||
1058c2ecf20Sopenharmony_ci	    !page_rmapping(page))
1068c2ecf20Sopenharmony_ci		return;
1078c2ecf20Sopenharmony_ci
1088c2ecf20Sopenharmony_ci	need_lock = !PageAnon(page) || PageKsm(page);
1098c2ecf20Sopenharmony_ci	if (need_lock && !trylock_page(page))
1108c2ecf20Sopenharmony_ci		return;
1118c2ecf20Sopenharmony_ci
1128c2ecf20Sopenharmony_ci	rmap_walk(page, (struct rmap_walk_control *)&rwc);
1138c2ecf20Sopenharmony_ci
1148c2ecf20Sopenharmony_ci	if (need_lock)
1158c2ecf20Sopenharmony_ci		unlock_page(page);
1168c2ecf20Sopenharmony_ci}
1178c2ecf20Sopenharmony_ci
1188c2ecf20Sopenharmony_cistatic ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj,
1198c2ecf20Sopenharmony_ci				     struct bin_attribute *attr, char *buf,
1208c2ecf20Sopenharmony_ci				     loff_t pos, size_t count)
1218c2ecf20Sopenharmony_ci{
1228c2ecf20Sopenharmony_ci	u64 *out = (u64 *)buf;
1238c2ecf20Sopenharmony_ci	struct page *page;
1248c2ecf20Sopenharmony_ci	unsigned long pfn, end_pfn;
1258c2ecf20Sopenharmony_ci	int bit;
1268c2ecf20Sopenharmony_ci
1278c2ecf20Sopenharmony_ci	if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE)
1288c2ecf20Sopenharmony_ci		return -EINVAL;
1298c2ecf20Sopenharmony_ci
1308c2ecf20Sopenharmony_ci	pfn = pos * BITS_PER_BYTE;
1318c2ecf20Sopenharmony_ci	if (pfn >= max_pfn)
1328c2ecf20Sopenharmony_ci		return 0;
1338c2ecf20Sopenharmony_ci
1348c2ecf20Sopenharmony_ci	end_pfn = pfn + count * BITS_PER_BYTE;
1358c2ecf20Sopenharmony_ci	if (end_pfn > max_pfn)
1368c2ecf20Sopenharmony_ci		end_pfn = max_pfn;
1378c2ecf20Sopenharmony_ci
1388c2ecf20Sopenharmony_ci	for (; pfn < end_pfn; pfn++) {
1398c2ecf20Sopenharmony_ci		bit = pfn % BITMAP_CHUNK_BITS;
1408c2ecf20Sopenharmony_ci		if (!bit)
1418c2ecf20Sopenharmony_ci			*out = 0ULL;
1428c2ecf20Sopenharmony_ci		page = page_idle_get_page(pfn);
1438c2ecf20Sopenharmony_ci		if (page) {
1448c2ecf20Sopenharmony_ci			if (page_is_idle(page)) {
1458c2ecf20Sopenharmony_ci				/*
1468c2ecf20Sopenharmony_ci				 * The page might have been referenced via a
1478c2ecf20Sopenharmony_ci				 * pte, in which case it is not idle. Clear
1488c2ecf20Sopenharmony_ci				 * refs and recheck.
1498c2ecf20Sopenharmony_ci				 */
1508c2ecf20Sopenharmony_ci				page_idle_clear_pte_refs(page);
1518c2ecf20Sopenharmony_ci				if (page_is_idle(page))
1528c2ecf20Sopenharmony_ci					*out |= 1ULL << bit;
1538c2ecf20Sopenharmony_ci			}
1548c2ecf20Sopenharmony_ci			put_page(page);
1558c2ecf20Sopenharmony_ci		}
1568c2ecf20Sopenharmony_ci		if (bit == BITMAP_CHUNK_BITS - 1)
1578c2ecf20Sopenharmony_ci			out++;
1588c2ecf20Sopenharmony_ci		cond_resched();
1598c2ecf20Sopenharmony_ci	}
1608c2ecf20Sopenharmony_ci	return (char *)out - buf;
1618c2ecf20Sopenharmony_ci}
1628c2ecf20Sopenharmony_ci
1638c2ecf20Sopenharmony_cistatic ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj,
1648c2ecf20Sopenharmony_ci				      struct bin_attribute *attr, char *buf,
1658c2ecf20Sopenharmony_ci				      loff_t pos, size_t count)
1668c2ecf20Sopenharmony_ci{
1678c2ecf20Sopenharmony_ci	const u64 *in = (u64 *)buf;
1688c2ecf20Sopenharmony_ci	struct page *page;
1698c2ecf20Sopenharmony_ci	unsigned long pfn, end_pfn;
1708c2ecf20Sopenharmony_ci	int bit;
1718c2ecf20Sopenharmony_ci
1728c2ecf20Sopenharmony_ci	if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE)
1738c2ecf20Sopenharmony_ci		return -EINVAL;
1748c2ecf20Sopenharmony_ci
1758c2ecf20Sopenharmony_ci	pfn = pos * BITS_PER_BYTE;
1768c2ecf20Sopenharmony_ci	if (pfn >= max_pfn)
1778c2ecf20Sopenharmony_ci		return -ENXIO;
1788c2ecf20Sopenharmony_ci
1798c2ecf20Sopenharmony_ci	end_pfn = pfn + count * BITS_PER_BYTE;
1808c2ecf20Sopenharmony_ci	if (end_pfn > max_pfn)
1818c2ecf20Sopenharmony_ci		end_pfn = max_pfn;
1828c2ecf20Sopenharmony_ci
1838c2ecf20Sopenharmony_ci	for (; pfn < end_pfn; pfn++) {
1848c2ecf20Sopenharmony_ci		bit = pfn % BITMAP_CHUNK_BITS;
1858c2ecf20Sopenharmony_ci		if ((*in >> bit) & 1) {
1868c2ecf20Sopenharmony_ci			page = page_idle_get_page(pfn);
1878c2ecf20Sopenharmony_ci			if (page) {
1888c2ecf20Sopenharmony_ci				page_idle_clear_pte_refs(page);
1898c2ecf20Sopenharmony_ci				set_page_idle(page);
1908c2ecf20Sopenharmony_ci				put_page(page);
1918c2ecf20Sopenharmony_ci			}
1928c2ecf20Sopenharmony_ci		}
1938c2ecf20Sopenharmony_ci		if (bit == BITMAP_CHUNK_BITS - 1)
1948c2ecf20Sopenharmony_ci			in++;
1958c2ecf20Sopenharmony_ci		cond_resched();
1968c2ecf20Sopenharmony_ci	}
1978c2ecf20Sopenharmony_ci	return (char *)in - buf;
1988c2ecf20Sopenharmony_ci}
1998c2ecf20Sopenharmony_ci
2008c2ecf20Sopenharmony_cistatic struct bin_attribute page_idle_bitmap_attr =
2018c2ecf20Sopenharmony_ci		__BIN_ATTR(bitmap, 0600,
2028c2ecf20Sopenharmony_ci			   page_idle_bitmap_read, page_idle_bitmap_write, 0);
2038c2ecf20Sopenharmony_ci
2048c2ecf20Sopenharmony_cistatic struct bin_attribute *page_idle_bin_attrs[] = {
2058c2ecf20Sopenharmony_ci	&page_idle_bitmap_attr,
2068c2ecf20Sopenharmony_ci	NULL,
2078c2ecf20Sopenharmony_ci};
2088c2ecf20Sopenharmony_ci
2098c2ecf20Sopenharmony_cistatic const struct attribute_group page_idle_attr_group = {
2108c2ecf20Sopenharmony_ci	.bin_attrs = page_idle_bin_attrs,
2118c2ecf20Sopenharmony_ci	.name = "page_idle",
2128c2ecf20Sopenharmony_ci};
2138c2ecf20Sopenharmony_ci
2148c2ecf20Sopenharmony_ci#ifndef CONFIG_64BIT
2158c2ecf20Sopenharmony_cistatic bool need_page_idle(void)
2168c2ecf20Sopenharmony_ci{
2178c2ecf20Sopenharmony_ci	return true;
2188c2ecf20Sopenharmony_ci}
2198c2ecf20Sopenharmony_cistruct page_ext_operations page_idle_ops = {
2208c2ecf20Sopenharmony_ci	.need = need_page_idle,
2218c2ecf20Sopenharmony_ci};
2228c2ecf20Sopenharmony_ci#endif
2238c2ecf20Sopenharmony_ci
2248c2ecf20Sopenharmony_cistatic int __init page_idle_init(void)
2258c2ecf20Sopenharmony_ci{
2268c2ecf20Sopenharmony_ci	int err;
2278c2ecf20Sopenharmony_ci
2288c2ecf20Sopenharmony_ci	err = sysfs_create_group(mm_kobj, &page_idle_attr_group);
2298c2ecf20Sopenharmony_ci	if (err) {
2308c2ecf20Sopenharmony_ci		pr_err("page_idle: register sysfs failed\n");
2318c2ecf20Sopenharmony_ci		return err;
2328c2ecf20Sopenharmony_ci	}
2338c2ecf20Sopenharmony_ci	return 0;
2348c2ecf20Sopenharmony_ci}
2358c2ecf20Sopenharmony_cisubsys_initcall(page_idle_init);
236