18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0 28c2ecf20Sopenharmony_ci#include <linux/init.h> 38c2ecf20Sopenharmony_ci#include <linux/memblock.h> 48c2ecf20Sopenharmony_ci#include <linux/fs.h> 58c2ecf20Sopenharmony_ci#include <linux/sysfs.h> 68c2ecf20Sopenharmony_ci#include <linux/kobject.h> 78c2ecf20Sopenharmony_ci#include <linux/memory_hotplug.h> 88c2ecf20Sopenharmony_ci#include <linux/mm.h> 98c2ecf20Sopenharmony_ci#include <linux/mmzone.h> 108c2ecf20Sopenharmony_ci#include <linux/pagemap.h> 118c2ecf20Sopenharmony_ci#include <linux/rmap.h> 128c2ecf20Sopenharmony_ci#include <linux/mmu_notifier.h> 138c2ecf20Sopenharmony_ci#include <linux/page_ext.h> 148c2ecf20Sopenharmony_ci#include <linux/page_idle.h> 158c2ecf20Sopenharmony_ci 168c2ecf20Sopenharmony_ci#define BITMAP_CHUNK_SIZE sizeof(u64) 178c2ecf20Sopenharmony_ci#define BITMAP_CHUNK_BITS (BITMAP_CHUNK_SIZE * BITS_PER_BYTE) 188c2ecf20Sopenharmony_ci 198c2ecf20Sopenharmony_ci/* 208c2ecf20Sopenharmony_ci * Idle page tracking only considers user memory pages, for other types of 218c2ecf20Sopenharmony_ci * pages the idle flag is always unset and an attempt to set it is silently 228c2ecf20Sopenharmony_ci * ignored. 238c2ecf20Sopenharmony_ci * 248c2ecf20Sopenharmony_ci * We treat a page as a user memory page if it is on an LRU list, because it is 258c2ecf20Sopenharmony_ci * always safe to pass such a page to rmap_walk(), which is essential for idle 268c2ecf20Sopenharmony_ci * page tracking. With such an indicator of user pages we can skip isolated 278c2ecf20Sopenharmony_ci * pages, but since there are not usually many of them, it will hardly affect 288c2ecf20Sopenharmony_ci * the overall result. 298c2ecf20Sopenharmony_ci * 308c2ecf20Sopenharmony_ci * This function tries to get a user memory page by pfn as described above. 318c2ecf20Sopenharmony_ci */ 328c2ecf20Sopenharmony_cistatic struct page *page_idle_get_page(unsigned long pfn) 338c2ecf20Sopenharmony_ci{ 348c2ecf20Sopenharmony_ci struct page *page = pfn_to_online_page(pfn); 358c2ecf20Sopenharmony_ci pg_data_t *pgdat; 368c2ecf20Sopenharmony_ci 378c2ecf20Sopenharmony_ci if (!page || !PageLRU(page) || 388c2ecf20Sopenharmony_ci !get_page_unless_zero(page)) 398c2ecf20Sopenharmony_ci return NULL; 408c2ecf20Sopenharmony_ci 418c2ecf20Sopenharmony_ci pgdat = page_pgdat(page); 428c2ecf20Sopenharmony_ci spin_lock_irq(&pgdat->lru_lock); 438c2ecf20Sopenharmony_ci if (unlikely(!PageLRU(page))) { 448c2ecf20Sopenharmony_ci put_page(page); 458c2ecf20Sopenharmony_ci page = NULL; 468c2ecf20Sopenharmony_ci } 478c2ecf20Sopenharmony_ci spin_unlock_irq(&pgdat->lru_lock); 488c2ecf20Sopenharmony_ci return page; 498c2ecf20Sopenharmony_ci} 508c2ecf20Sopenharmony_ci 518c2ecf20Sopenharmony_cistatic bool page_idle_clear_pte_refs_one(struct page *page, 528c2ecf20Sopenharmony_ci struct vm_area_struct *vma, 538c2ecf20Sopenharmony_ci unsigned long addr, void *arg) 548c2ecf20Sopenharmony_ci{ 558c2ecf20Sopenharmony_ci struct page_vma_mapped_walk pvmw = { 568c2ecf20Sopenharmony_ci .page = page, 578c2ecf20Sopenharmony_ci .vma = vma, 588c2ecf20Sopenharmony_ci .address = addr, 598c2ecf20Sopenharmony_ci }; 608c2ecf20Sopenharmony_ci bool referenced = false; 618c2ecf20Sopenharmony_ci 628c2ecf20Sopenharmony_ci while (page_vma_mapped_walk(&pvmw)) { 638c2ecf20Sopenharmony_ci addr = pvmw.address; 648c2ecf20Sopenharmony_ci if (pvmw.pte) { 658c2ecf20Sopenharmony_ci /* 668c2ecf20Sopenharmony_ci * For PTE-mapped THP, one sub page is referenced, 678c2ecf20Sopenharmony_ci * the whole THP is referenced. 688c2ecf20Sopenharmony_ci */ 698c2ecf20Sopenharmony_ci if (ptep_clear_young_notify(vma, addr, pvmw.pte)) 708c2ecf20Sopenharmony_ci referenced = true; 718c2ecf20Sopenharmony_ci } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { 728c2ecf20Sopenharmony_ci if (pmdp_clear_young_notify(vma, addr, pvmw.pmd)) 738c2ecf20Sopenharmony_ci referenced = true; 748c2ecf20Sopenharmony_ci } else { 758c2ecf20Sopenharmony_ci /* unexpected pmd-mapped page? */ 768c2ecf20Sopenharmony_ci WARN_ON_ONCE(1); 778c2ecf20Sopenharmony_ci } 788c2ecf20Sopenharmony_ci } 798c2ecf20Sopenharmony_ci 808c2ecf20Sopenharmony_ci if (referenced) { 818c2ecf20Sopenharmony_ci clear_page_idle(page); 828c2ecf20Sopenharmony_ci /* 838c2ecf20Sopenharmony_ci * We cleared the referenced bit in a mapping to this page. To 848c2ecf20Sopenharmony_ci * avoid interference with page reclaim, mark it young so that 858c2ecf20Sopenharmony_ci * page_referenced() will return > 0. 868c2ecf20Sopenharmony_ci */ 878c2ecf20Sopenharmony_ci set_page_young(page); 888c2ecf20Sopenharmony_ci } 898c2ecf20Sopenharmony_ci return true; 908c2ecf20Sopenharmony_ci} 918c2ecf20Sopenharmony_ci 928c2ecf20Sopenharmony_cistatic void page_idle_clear_pte_refs(struct page *page) 938c2ecf20Sopenharmony_ci{ 948c2ecf20Sopenharmony_ci /* 958c2ecf20Sopenharmony_ci * Since rwc.arg is unused, rwc is effectively immutable, so we 968c2ecf20Sopenharmony_ci * can make it static const to save some cycles and stack. 978c2ecf20Sopenharmony_ci */ 988c2ecf20Sopenharmony_ci static const struct rmap_walk_control rwc = { 998c2ecf20Sopenharmony_ci .rmap_one = page_idle_clear_pte_refs_one, 1008c2ecf20Sopenharmony_ci .anon_lock = page_lock_anon_vma_read, 1018c2ecf20Sopenharmony_ci }; 1028c2ecf20Sopenharmony_ci bool need_lock; 1038c2ecf20Sopenharmony_ci 1048c2ecf20Sopenharmony_ci if (!page_mapped(page) || 1058c2ecf20Sopenharmony_ci !page_rmapping(page)) 1068c2ecf20Sopenharmony_ci return; 1078c2ecf20Sopenharmony_ci 1088c2ecf20Sopenharmony_ci need_lock = !PageAnon(page) || PageKsm(page); 1098c2ecf20Sopenharmony_ci if (need_lock && !trylock_page(page)) 1108c2ecf20Sopenharmony_ci return; 1118c2ecf20Sopenharmony_ci 1128c2ecf20Sopenharmony_ci rmap_walk(page, (struct rmap_walk_control *)&rwc); 1138c2ecf20Sopenharmony_ci 1148c2ecf20Sopenharmony_ci if (need_lock) 1158c2ecf20Sopenharmony_ci unlock_page(page); 1168c2ecf20Sopenharmony_ci} 1178c2ecf20Sopenharmony_ci 1188c2ecf20Sopenharmony_cistatic ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj, 1198c2ecf20Sopenharmony_ci struct bin_attribute *attr, char *buf, 1208c2ecf20Sopenharmony_ci loff_t pos, size_t count) 1218c2ecf20Sopenharmony_ci{ 1228c2ecf20Sopenharmony_ci u64 *out = (u64 *)buf; 1238c2ecf20Sopenharmony_ci struct page *page; 1248c2ecf20Sopenharmony_ci unsigned long pfn, end_pfn; 1258c2ecf20Sopenharmony_ci int bit; 1268c2ecf20Sopenharmony_ci 1278c2ecf20Sopenharmony_ci if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE) 1288c2ecf20Sopenharmony_ci return -EINVAL; 1298c2ecf20Sopenharmony_ci 1308c2ecf20Sopenharmony_ci pfn = pos * BITS_PER_BYTE; 1318c2ecf20Sopenharmony_ci if (pfn >= max_pfn) 1328c2ecf20Sopenharmony_ci return 0; 1338c2ecf20Sopenharmony_ci 1348c2ecf20Sopenharmony_ci end_pfn = pfn + count * BITS_PER_BYTE; 1358c2ecf20Sopenharmony_ci if (end_pfn > max_pfn) 1368c2ecf20Sopenharmony_ci end_pfn = max_pfn; 1378c2ecf20Sopenharmony_ci 1388c2ecf20Sopenharmony_ci for (; pfn < end_pfn; pfn++) { 1398c2ecf20Sopenharmony_ci bit = pfn % BITMAP_CHUNK_BITS; 1408c2ecf20Sopenharmony_ci if (!bit) 1418c2ecf20Sopenharmony_ci *out = 0ULL; 1428c2ecf20Sopenharmony_ci page = page_idle_get_page(pfn); 1438c2ecf20Sopenharmony_ci if (page) { 1448c2ecf20Sopenharmony_ci if (page_is_idle(page)) { 1458c2ecf20Sopenharmony_ci /* 1468c2ecf20Sopenharmony_ci * The page might have been referenced via a 1478c2ecf20Sopenharmony_ci * pte, in which case it is not idle. Clear 1488c2ecf20Sopenharmony_ci * refs and recheck. 1498c2ecf20Sopenharmony_ci */ 1508c2ecf20Sopenharmony_ci page_idle_clear_pte_refs(page); 1518c2ecf20Sopenharmony_ci if (page_is_idle(page)) 1528c2ecf20Sopenharmony_ci *out |= 1ULL << bit; 1538c2ecf20Sopenharmony_ci } 1548c2ecf20Sopenharmony_ci put_page(page); 1558c2ecf20Sopenharmony_ci } 1568c2ecf20Sopenharmony_ci if (bit == BITMAP_CHUNK_BITS - 1) 1578c2ecf20Sopenharmony_ci out++; 1588c2ecf20Sopenharmony_ci cond_resched(); 1598c2ecf20Sopenharmony_ci } 1608c2ecf20Sopenharmony_ci return (char *)out - buf; 1618c2ecf20Sopenharmony_ci} 1628c2ecf20Sopenharmony_ci 1638c2ecf20Sopenharmony_cistatic ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj, 1648c2ecf20Sopenharmony_ci struct bin_attribute *attr, char *buf, 1658c2ecf20Sopenharmony_ci loff_t pos, size_t count) 1668c2ecf20Sopenharmony_ci{ 1678c2ecf20Sopenharmony_ci const u64 *in = (u64 *)buf; 1688c2ecf20Sopenharmony_ci struct page *page; 1698c2ecf20Sopenharmony_ci unsigned long pfn, end_pfn; 1708c2ecf20Sopenharmony_ci int bit; 1718c2ecf20Sopenharmony_ci 1728c2ecf20Sopenharmony_ci if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE) 1738c2ecf20Sopenharmony_ci return -EINVAL; 1748c2ecf20Sopenharmony_ci 1758c2ecf20Sopenharmony_ci pfn = pos * BITS_PER_BYTE; 1768c2ecf20Sopenharmony_ci if (pfn >= max_pfn) 1778c2ecf20Sopenharmony_ci return -ENXIO; 1788c2ecf20Sopenharmony_ci 1798c2ecf20Sopenharmony_ci end_pfn = pfn + count * BITS_PER_BYTE; 1808c2ecf20Sopenharmony_ci if (end_pfn > max_pfn) 1818c2ecf20Sopenharmony_ci end_pfn = max_pfn; 1828c2ecf20Sopenharmony_ci 1838c2ecf20Sopenharmony_ci for (; pfn < end_pfn; pfn++) { 1848c2ecf20Sopenharmony_ci bit = pfn % BITMAP_CHUNK_BITS; 1858c2ecf20Sopenharmony_ci if ((*in >> bit) & 1) { 1868c2ecf20Sopenharmony_ci page = page_idle_get_page(pfn); 1878c2ecf20Sopenharmony_ci if (page) { 1888c2ecf20Sopenharmony_ci page_idle_clear_pte_refs(page); 1898c2ecf20Sopenharmony_ci set_page_idle(page); 1908c2ecf20Sopenharmony_ci put_page(page); 1918c2ecf20Sopenharmony_ci } 1928c2ecf20Sopenharmony_ci } 1938c2ecf20Sopenharmony_ci if (bit == BITMAP_CHUNK_BITS - 1) 1948c2ecf20Sopenharmony_ci in++; 1958c2ecf20Sopenharmony_ci cond_resched(); 1968c2ecf20Sopenharmony_ci } 1978c2ecf20Sopenharmony_ci return (char *)in - buf; 1988c2ecf20Sopenharmony_ci} 1998c2ecf20Sopenharmony_ci 2008c2ecf20Sopenharmony_cistatic struct bin_attribute page_idle_bitmap_attr = 2018c2ecf20Sopenharmony_ci __BIN_ATTR(bitmap, 0600, 2028c2ecf20Sopenharmony_ci page_idle_bitmap_read, page_idle_bitmap_write, 0); 2038c2ecf20Sopenharmony_ci 2048c2ecf20Sopenharmony_cistatic struct bin_attribute *page_idle_bin_attrs[] = { 2058c2ecf20Sopenharmony_ci &page_idle_bitmap_attr, 2068c2ecf20Sopenharmony_ci NULL, 2078c2ecf20Sopenharmony_ci}; 2088c2ecf20Sopenharmony_ci 2098c2ecf20Sopenharmony_cistatic const struct attribute_group page_idle_attr_group = { 2108c2ecf20Sopenharmony_ci .bin_attrs = page_idle_bin_attrs, 2118c2ecf20Sopenharmony_ci .name = "page_idle", 2128c2ecf20Sopenharmony_ci}; 2138c2ecf20Sopenharmony_ci 2148c2ecf20Sopenharmony_ci#ifndef CONFIG_64BIT 2158c2ecf20Sopenharmony_cistatic bool need_page_idle(void) 2168c2ecf20Sopenharmony_ci{ 2178c2ecf20Sopenharmony_ci return true; 2188c2ecf20Sopenharmony_ci} 2198c2ecf20Sopenharmony_cistruct page_ext_operations page_idle_ops = { 2208c2ecf20Sopenharmony_ci .need = need_page_idle, 2218c2ecf20Sopenharmony_ci}; 2228c2ecf20Sopenharmony_ci#endif 2238c2ecf20Sopenharmony_ci 2248c2ecf20Sopenharmony_cistatic int __init page_idle_init(void) 2258c2ecf20Sopenharmony_ci{ 2268c2ecf20Sopenharmony_ci int err; 2278c2ecf20Sopenharmony_ci 2288c2ecf20Sopenharmony_ci err = sysfs_create_group(mm_kobj, &page_idle_attr_group); 2298c2ecf20Sopenharmony_ci if (err) { 2308c2ecf20Sopenharmony_ci pr_err("page_idle: register sysfs failed\n"); 2318c2ecf20Sopenharmony_ci return err; 2328c2ecf20Sopenharmony_ci } 2338c2ecf20Sopenharmony_ci return 0; 2348c2ecf20Sopenharmony_ci} 2358c2ecf20Sopenharmony_cisubsys_initcall(page_idle_init); 236