18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0 28c2ecf20Sopenharmony_ci/* 38c2ecf20Sopenharmony_ci * Lockless hierarchical page accounting & limiting 48c2ecf20Sopenharmony_ci * 58c2ecf20Sopenharmony_ci * Copyright (C) 2014 Red Hat, Inc., Johannes Weiner 68c2ecf20Sopenharmony_ci */ 78c2ecf20Sopenharmony_ci 88c2ecf20Sopenharmony_ci#include <linux/page_counter.h> 98c2ecf20Sopenharmony_ci#include <linux/atomic.h> 108c2ecf20Sopenharmony_ci#include <linux/kernel.h> 118c2ecf20Sopenharmony_ci#include <linux/string.h> 128c2ecf20Sopenharmony_ci#include <linux/sched.h> 138c2ecf20Sopenharmony_ci#include <linux/bug.h> 148c2ecf20Sopenharmony_ci#include <asm/page.h> 158c2ecf20Sopenharmony_ci 168c2ecf20Sopenharmony_cistatic void propagate_protected_usage(struct page_counter *c, 178c2ecf20Sopenharmony_ci unsigned long usage) 188c2ecf20Sopenharmony_ci{ 198c2ecf20Sopenharmony_ci unsigned long protected, old_protected; 208c2ecf20Sopenharmony_ci unsigned long low, min; 218c2ecf20Sopenharmony_ci long delta; 228c2ecf20Sopenharmony_ci 238c2ecf20Sopenharmony_ci if (!c->parent) 248c2ecf20Sopenharmony_ci return; 258c2ecf20Sopenharmony_ci 268c2ecf20Sopenharmony_ci min = READ_ONCE(c->min); 278c2ecf20Sopenharmony_ci if (min || atomic_long_read(&c->min_usage)) { 288c2ecf20Sopenharmony_ci protected = min(usage, min); 298c2ecf20Sopenharmony_ci old_protected = atomic_long_xchg(&c->min_usage, protected); 308c2ecf20Sopenharmony_ci delta = protected - old_protected; 318c2ecf20Sopenharmony_ci if (delta) 328c2ecf20Sopenharmony_ci atomic_long_add(delta, &c->parent->children_min_usage); 338c2ecf20Sopenharmony_ci } 348c2ecf20Sopenharmony_ci 358c2ecf20Sopenharmony_ci low = READ_ONCE(c->low); 368c2ecf20Sopenharmony_ci if (low || atomic_long_read(&c->low_usage)) { 378c2ecf20Sopenharmony_ci protected = min(usage, low); 388c2ecf20Sopenharmony_ci old_protected = atomic_long_xchg(&c->low_usage, protected); 398c2ecf20Sopenharmony_ci delta = protected - old_protected; 408c2ecf20Sopenharmony_ci if (delta) 418c2ecf20Sopenharmony_ci atomic_long_add(delta, &c->parent->children_low_usage); 428c2ecf20Sopenharmony_ci } 438c2ecf20Sopenharmony_ci} 448c2ecf20Sopenharmony_ci 458c2ecf20Sopenharmony_ci/** 468c2ecf20Sopenharmony_ci * page_counter_cancel - take pages out of the local counter 478c2ecf20Sopenharmony_ci * @counter: counter 488c2ecf20Sopenharmony_ci * @nr_pages: number of pages to cancel 498c2ecf20Sopenharmony_ci */ 508c2ecf20Sopenharmony_civoid page_counter_cancel(struct page_counter *counter, unsigned long nr_pages) 518c2ecf20Sopenharmony_ci{ 528c2ecf20Sopenharmony_ci long new; 538c2ecf20Sopenharmony_ci 548c2ecf20Sopenharmony_ci new = atomic_long_sub_return(nr_pages, &counter->usage); 558c2ecf20Sopenharmony_ci propagate_protected_usage(counter, new); 568c2ecf20Sopenharmony_ci /* More uncharges than charges? */ 578c2ecf20Sopenharmony_ci WARN_ON_ONCE(new < 0); 588c2ecf20Sopenharmony_ci} 598c2ecf20Sopenharmony_ci 608c2ecf20Sopenharmony_ci/** 618c2ecf20Sopenharmony_ci * page_counter_charge - hierarchically charge pages 628c2ecf20Sopenharmony_ci * @counter: counter 638c2ecf20Sopenharmony_ci * @nr_pages: number of pages to charge 648c2ecf20Sopenharmony_ci * 658c2ecf20Sopenharmony_ci * NOTE: This does not consider any configured counter limits. 668c2ecf20Sopenharmony_ci */ 678c2ecf20Sopenharmony_civoid page_counter_charge(struct page_counter *counter, unsigned long nr_pages) 688c2ecf20Sopenharmony_ci{ 698c2ecf20Sopenharmony_ci struct page_counter *c; 708c2ecf20Sopenharmony_ci 718c2ecf20Sopenharmony_ci for (c = counter; c; c = c->parent) { 728c2ecf20Sopenharmony_ci long new; 738c2ecf20Sopenharmony_ci 748c2ecf20Sopenharmony_ci new = atomic_long_add_return(nr_pages, &c->usage); 758c2ecf20Sopenharmony_ci propagate_protected_usage(c, new); 768c2ecf20Sopenharmony_ci /* 778c2ecf20Sopenharmony_ci * This is indeed racy, but we can live with some 788c2ecf20Sopenharmony_ci * inaccuracy in the watermark. 798c2ecf20Sopenharmony_ci */ 808c2ecf20Sopenharmony_ci if (new > READ_ONCE(c->watermark)) 818c2ecf20Sopenharmony_ci WRITE_ONCE(c->watermark, new); 828c2ecf20Sopenharmony_ci } 838c2ecf20Sopenharmony_ci} 848c2ecf20Sopenharmony_ci 858c2ecf20Sopenharmony_ci/** 868c2ecf20Sopenharmony_ci * page_counter_try_charge - try to hierarchically charge pages 878c2ecf20Sopenharmony_ci * @counter: counter 888c2ecf20Sopenharmony_ci * @nr_pages: number of pages to charge 898c2ecf20Sopenharmony_ci * @fail: points first counter to hit its limit, if any 908c2ecf20Sopenharmony_ci * 918c2ecf20Sopenharmony_ci * Returns %true on success, or %false and @fail if the counter or one 928c2ecf20Sopenharmony_ci * of its ancestors has hit its configured limit. 938c2ecf20Sopenharmony_ci */ 948c2ecf20Sopenharmony_cibool page_counter_try_charge(struct page_counter *counter, 958c2ecf20Sopenharmony_ci unsigned long nr_pages, 968c2ecf20Sopenharmony_ci struct page_counter **fail) 978c2ecf20Sopenharmony_ci{ 988c2ecf20Sopenharmony_ci struct page_counter *c; 998c2ecf20Sopenharmony_ci 1008c2ecf20Sopenharmony_ci for (c = counter; c; c = c->parent) { 1018c2ecf20Sopenharmony_ci long new; 1028c2ecf20Sopenharmony_ci /* 1038c2ecf20Sopenharmony_ci * Charge speculatively to avoid an expensive CAS. If 1048c2ecf20Sopenharmony_ci * a bigger charge fails, it might falsely lock out a 1058c2ecf20Sopenharmony_ci * racing smaller charge and send it into reclaim 1068c2ecf20Sopenharmony_ci * early, but the error is limited to the difference 1078c2ecf20Sopenharmony_ci * between the two sizes, which is less than 2M/4M in 1088c2ecf20Sopenharmony_ci * case of a THP locking out a regular page charge. 1098c2ecf20Sopenharmony_ci * 1108c2ecf20Sopenharmony_ci * The atomic_long_add_return() implies a full memory 1118c2ecf20Sopenharmony_ci * barrier between incrementing the count and reading 1128c2ecf20Sopenharmony_ci * the limit. When racing with page_counter_set_max(), 1138c2ecf20Sopenharmony_ci * we either see the new limit or the setter sees the 1148c2ecf20Sopenharmony_ci * counter has changed and retries. 1158c2ecf20Sopenharmony_ci */ 1168c2ecf20Sopenharmony_ci new = atomic_long_add_return(nr_pages, &c->usage); 1178c2ecf20Sopenharmony_ci if (new > c->max) { 1188c2ecf20Sopenharmony_ci atomic_long_sub(nr_pages, &c->usage); 1198c2ecf20Sopenharmony_ci propagate_protected_usage(c, new); 1208c2ecf20Sopenharmony_ci /* 1218c2ecf20Sopenharmony_ci * This is racy, but we can live with some 1228c2ecf20Sopenharmony_ci * inaccuracy in the failcnt which is only used 1238c2ecf20Sopenharmony_ci * to report stats. 1248c2ecf20Sopenharmony_ci */ 1258c2ecf20Sopenharmony_ci data_race(c->failcnt++); 1268c2ecf20Sopenharmony_ci *fail = c; 1278c2ecf20Sopenharmony_ci goto failed; 1288c2ecf20Sopenharmony_ci } 1298c2ecf20Sopenharmony_ci propagate_protected_usage(c, new); 1308c2ecf20Sopenharmony_ci /* 1318c2ecf20Sopenharmony_ci * Just like with failcnt, we can live with some 1328c2ecf20Sopenharmony_ci * inaccuracy in the watermark. 1338c2ecf20Sopenharmony_ci */ 1348c2ecf20Sopenharmony_ci if (new > READ_ONCE(c->watermark)) 1358c2ecf20Sopenharmony_ci WRITE_ONCE(c->watermark, new); 1368c2ecf20Sopenharmony_ci } 1378c2ecf20Sopenharmony_ci return true; 1388c2ecf20Sopenharmony_ci 1398c2ecf20Sopenharmony_cifailed: 1408c2ecf20Sopenharmony_ci for (c = counter; c != *fail; c = c->parent) 1418c2ecf20Sopenharmony_ci page_counter_cancel(c, nr_pages); 1428c2ecf20Sopenharmony_ci 1438c2ecf20Sopenharmony_ci return false; 1448c2ecf20Sopenharmony_ci} 1458c2ecf20Sopenharmony_ci 1468c2ecf20Sopenharmony_ci/** 1478c2ecf20Sopenharmony_ci * page_counter_uncharge - hierarchically uncharge pages 1488c2ecf20Sopenharmony_ci * @counter: counter 1498c2ecf20Sopenharmony_ci * @nr_pages: number of pages to uncharge 1508c2ecf20Sopenharmony_ci */ 1518c2ecf20Sopenharmony_civoid page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages) 1528c2ecf20Sopenharmony_ci{ 1538c2ecf20Sopenharmony_ci struct page_counter *c; 1548c2ecf20Sopenharmony_ci 1558c2ecf20Sopenharmony_ci for (c = counter; c; c = c->parent) 1568c2ecf20Sopenharmony_ci page_counter_cancel(c, nr_pages); 1578c2ecf20Sopenharmony_ci} 1588c2ecf20Sopenharmony_ci 1598c2ecf20Sopenharmony_ci/** 1608c2ecf20Sopenharmony_ci * page_counter_set_max - set the maximum number of pages allowed 1618c2ecf20Sopenharmony_ci * @counter: counter 1628c2ecf20Sopenharmony_ci * @nr_pages: limit to set 1638c2ecf20Sopenharmony_ci * 1648c2ecf20Sopenharmony_ci * Returns 0 on success, -EBUSY if the current number of pages on the 1658c2ecf20Sopenharmony_ci * counter already exceeds the specified limit. 1668c2ecf20Sopenharmony_ci * 1678c2ecf20Sopenharmony_ci * The caller must serialize invocations on the same counter. 1688c2ecf20Sopenharmony_ci */ 1698c2ecf20Sopenharmony_ciint page_counter_set_max(struct page_counter *counter, unsigned long nr_pages) 1708c2ecf20Sopenharmony_ci{ 1718c2ecf20Sopenharmony_ci for (;;) { 1728c2ecf20Sopenharmony_ci unsigned long old; 1738c2ecf20Sopenharmony_ci long usage; 1748c2ecf20Sopenharmony_ci 1758c2ecf20Sopenharmony_ci /* 1768c2ecf20Sopenharmony_ci * Update the limit while making sure that it's not 1778c2ecf20Sopenharmony_ci * below the concurrently-changing counter value. 1788c2ecf20Sopenharmony_ci * 1798c2ecf20Sopenharmony_ci * The xchg implies two full memory barriers before 1808c2ecf20Sopenharmony_ci * and after, so the read-swap-read is ordered and 1818c2ecf20Sopenharmony_ci * ensures coherency with page_counter_try_charge(): 1828c2ecf20Sopenharmony_ci * that function modifies the count before checking 1838c2ecf20Sopenharmony_ci * the limit, so if it sees the old limit, we see the 1848c2ecf20Sopenharmony_ci * modified counter and retry. 1858c2ecf20Sopenharmony_ci */ 1868c2ecf20Sopenharmony_ci usage = atomic_long_read(&counter->usage); 1878c2ecf20Sopenharmony_ci 1888c2ecf20Sopenharmony_ci if (usage > nr_pages) 1898c2ecf20Sopenharmony_ci return -EBUSY; 1908c2ecf20Sopenharmony_ci 1918c2ecf20Sopenharmony_ci old = xchg(&counter->max, nr_pages); 1928c2ecf20Sopenharmony_ci 1938c2ecf20Sopenharmony_ci if (atomic_long_read(&counter->usage) <= usage) 1948c2ecf20Sopenharmony_ci return 0; 1958c2ecf20Sopenharmony_ci 1968c2ecf20Sopenharmony_ci counter->max = old; 1978c2ecf20Sopenharmony_ci cond_resched(); 1988c2ecf20Sopenharmony_ci } 1998c2ecf20Sopenharmony_ci} 2008c2ecf20Sopenharmony_ci 2018c2ecf20Sopenharmony_ci/** 2028c2ecf20Sopenharmony_ci * page_counter_set_min - set the amount of protected memory 2038c2ecf20Sopenharmony_ci * @counter: counter 2048c2ecf20Sopenharmony_ci * @nr_pages: value to set 2058c2ecf20Sopenharmony_ci * 2068c2ecf20Sopenharmony_ci * The caller must serialize invocations on the same counter. 2078c2ecf20Sopenharmony_ci */ 2088c2ecf20Sopenharmony_civoid page_counter_set_min(struct page_counter *counter, unsigned long nr_pages) 2098c2ecf20Sopenharmony_ci{ 2108c2ecf20Sopenharmony_ci struct page_counter *c; 2118c2ecf20Sopenharmony_ci 2128c2ecf20Sopenharmony_ci WRITE_ONCE(counter->min, nr_pages); 2138c2ecf20Sopenharmony_ci 2148c2ecf20Sopenharmony_ci for (c = counter; c; c = c->parent) 2158c2ecf20Sopenharmony_ci propagate_protected_usage(c, atomic_long_read(&c->usage)); 2168c2ecf20Sopenharmony_ci} 2178c2ecf20Sopenharmony_ci 2188c2ecf20Sopenharmony_ci/** 2198c2ecf20Sopenharmony_ci * page_counter_set_low - set the amount of protected memory 2208c2ecf20Sopenharmony_ci * @counter: counter 2218c2ecf20Sopenharmony_ci * @nr_pages: value to set 2228c2ecf20Sopenharmony_ci * 2238c2ecf20Sopenharmony_ci * The caller must serialize invocations on the same counter. 2248c2ecf20Sopenharmony_ci */ 2258c2ecf20Sopenharmony_civoid page_counter_set_low(struct page_counter *counter, unsigned long nr_pages) 2268c2ecf20Sopenharmony_ci{ 2278c2ecf20Sopenharmony_ci struct page_counter *c; 2288c2ecf20Sopenharmony_ci 2298c2ecf20Sopenharmony_ci WRITE_ONCE(counter->low, nr_pages); 2308c2ecf20Sopenharmony_ci 2318c2ecf20Sopenharmony_ci for (c = counter; c; c = c->parent) 2328c2ecf20Sopenharmony_ci propagate_protected_usage(c, atomic_long_read(&c->usage)); 2338c2ecf20Sopenharmony_ci} 2348c2ecf20Sopenharmony_ci 2358c2ecf20Sopenharmony_ci/** 2368c2ecf20Sopenharmony_ci * page_counter_memparse - memparse() for page counter limits 2378c2ecf20Sopenharmony_ci * @buf: string to parse 2388c2ecf20Sopenharmony_ci * @max: string meaning maximum possible value 2398c2ecf20Sopenharmony_ci * @nr_pages: returns the result in number of pages 2408c2ecf20Sopenharmony_ci * 2418c2ecf20Sopenharmony_ci * Returns -EINVAL, or 0 and @nr_pages on success. @nr_pages will be 2428c2ecf20Sopenharmony_ci * limited to %PAGE_COUNTER_MAX. 2438c2ecf20Sopenharmony_ci */ 2448c2ecf20Sopenharmony_ciint page_counter_memparse(const char *buf, const char *max, 2458c2ecf20Sopenharmony_ci unsigned long *nr_pages) 2468c2ecf20Sopenharmony_ci{ 2478c2ecf20Sopenharmony_ci char *end; 2488c2ecf20Sopenharmony_ci u64 bytes; 2498c2ecf20Sopenharmony_ci 2508c2ecf20Sopenharmony_ci if (!strcmp(buf, max)) { 2518c2ecf20Sopenharmony_ci *nr_pages = PAGE_COUNTER_MAX; 2528c2ecf20Sopenharmony_ci return 0; 2538c2ecf20Sopenharmony_ci } 2548c2ecf20Sopenharmony_ci 2558c2ecf20Sopenharmony_ci bytes = memparse(buf, &end); 2568c2ecf20Sopenharmony_ci if (*end != '\0') 2578c2ecf20Sopenharmony_ci return -EINVAL; 2588c2ecf20Sopenharmony_ci 2598c2ecf20Sopenharmony_ci *nr_pages = min(bytes / PAGE_SIZE, (u64)PAGE_COUNTER_MAX); 2608c2ecf20Sopenharmony_ci 2618c2ecf20Sopenharmony_ci return 0; 2628c2ecf20Sopenharmony_ci} 263