162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0 262306a36Sopenharmony_ci/* 362306a36Sopenharmony_ci * Lockless hierarchical page accounting & limiting 462306a36Sopenharmony_ci * 562306a36Sopenharmony_ci * Copyright (C) 2014 Red Hat, Inc., Johannes Weiner 662306a36Sopenharmony_ci */ 762306a36Sopenharmony_ci 862306a36Sopenharmony_ci#include <linux/page_counter.h> 962306a36Sopenharmony_ci#include <linux/atomic.h> 1062306a36Sopenharmony_ci#include <linux/kernel.h> 1162306a36Sopenharmony_ci#include <linux/string.h> 1262306a36Sopenharmony_ci#include <linux/sched.h> 1362306a36Sopenharmony_ci#include <linux/bug.h> 1462306a36Sopenharmony_ci#include <asm/page.h> 1562306a36Sopenharmony_ci 1662306a36Sopenharmony_cistatic void propagate_protected_usage(struct page_counter *c, 1762306a36Sopenharmony_ci unsigned long usage) 1862306a36Sopenharmony_ci{ 1962306a36Sopenharmony_ci unsigned long protected, old_protected; 2062306a36Sopenharmony_ci long delta; 2162306a36Sopenharmony_ci 2262306a36Sopenharmony_ci if (!c->parent) 2362306a36Sopenharmony_ci return; 2462306a36Sopenharmony_ci 2562306a36Sopenharmony_ci protected = min(usage, READ_ONCE(c->min)); 2662306a36Sopenharmony_ci old_protected = atomic_long_read(&c->min_usage); 2762306a36Sopenharmony_ci if (protected != old_protected) { 2862306a36Sopenharmony_ci old_protected = atomic_long_xchg(&c->min_usage, protected); 2962306a36Sopenharmony_ci delta = protected - old_protected; 3062306a36Sopenharmony_ci if (delta) 3162306a36Sopenharmony_ci atomic_long_add(delta, &c->parent->children_min_usage); 3262306a36Sopenharmony_ci } 3362306a36Sopenharmony_ci 3462306a36Sopenharmony_ci protected = min(usage, READ_ONCE(c->low)); 3562306a36Sopenharmony_ci old_protected = atomic_long_read(&c->low_usage); 3662306a36Sopenharmony_ci if (protected != old_protected) { 3762306a36Sopenharmony_ci old_protected = atomic_long_xchg(&c->low_usage, protected); 3862306a36Sopenharmony_ci delta = protected - old_protected; 3962306a36Sopenharmony_ci if (delta) 4062306a36Sopenharmony_ci atomic_long_add(delta, &c->parent->children_low_usage); 4162306a36Sopenharmony_ci } 4262306a36Sopenharmony_ci} 4362306a36Sopenharmony_ci 4462306a36Sopenharmony_ci/** 4562306a36Sopenharmony_ci * page_counter_cancel - take pages out of the local counter 4662306a36Sopenharmony_ci * @counter: counter 4762306a36Sopenharmony_ci * @nr_pages: number of pages to cancel 4862306a36Sopenharmony_ci */ 4962306a36Sopenharmony_civoid page_counter_cancel(struct page_counter *counter, unsigned long nr_pages) 5062306a36Sopenharmony_ci{ 5162306a36Sopenharmony_ci long new; 5262306a36Sopenharmony_ci 5362306a36Sopenharmony_ci new = atomic_long_sub_return(nr_pages, &counter->usage); 5462306a36Sopenharmony_ci /* More uncharges than charges? */ 5562306a36Sopenharmony_ci if (WARN_ONCE(new < 0, "page_counter underflow: %ld nr_pages=%lu\n", 5662306a36Sopenharmony_ci new, nr_pages)) { 5762306a36Sopenharmony_ci new = 0; 5862306a36Sopenharmony_ci atomic_long_set(&counter->usage, new); 5962306a36Sopenharmony_ci } 6062306a36Sopenharmony_ci propagate_protected_usage(counter, new); 6162306a36Sopenharmony_ci} 6262306a36Sopenharmony_ci 6362306a36Sopenharmony_ci/** 6462306a36Sopenharmony_ci * page_counter_charge - hierarchically charge pages 6562306a36Sopenharmony_ci * @counter: counter 6662306a36Sopenharmony_ci * @nr_pages: number of pages to charge 6762306a36Sopenharmony_ci * 6862306a36Sopenharmony_ci * NOTE: This does not consider any configured counter limits. 6962306a36Sopenharmony_ci */ 7062306a36Sopenharmony_civoid page_counter_charge(struct page_counter *counter, unsigned long nr_pages) 7162306a36Sopenharmony_ci{ 7262306a36Sopenharmony_ci struct page_counter *c; 7362306a36Sopenharmony_ci 7462306a36Sopenharmony_ci for (c = counter; c; c = c->parent) { 7562306a36Sopenharmony_ci long new; 7662306a36Sopenharmony_ci 7762306a36Sopenharmony_ci new = atomic_long_add_return(nr_pages, &c->usage); 7862306a36Sopenharmony_ci propagate_protected_usage(c, new); 7962306a36Sopenharmony_ci /* 8062306a36Sopenharmony_ci * This is indeed racy, but we can live with some 8162306a36Sopenharmony_ci * inaccuracy in the watermark. 8262306a36Sopenharmony_ci */ 8362306a36Sopenharmony_ci if (new > READ_ONCE(c->watermark)) 8462306a36Sopenharmony_ci WRITE_ONCE(c->watermark, new); 8562306a36Sopenharmony_ci } 8662306a36Sopenharmony_ci} 8762306a36Sopenharmony_ci 8862306a36Sopenharmony_ci/** 8962306a36Sopenharmony_ci * page_counter_try_charge - try to hierarchically charge pages 9062306a36Sopenharmony_ci * @counter: counter 9162306a36Sopenharmony_ci * @nr_pages: number of pages to charge 9262306a36Sopenharmony_ci * @fail: points first counter to hit its limit, if any 9362306a36Sopenharmony_ci * 9462306a36Sopenharmony_ci * Returns %true on success, or %false and @fail if the counter or one 9562306a36Sopenharmony_ci * of its ancestors has hit its configured limit. 9662306a36Sopenharmony_ci */ 9762306a36Sopenharmony_cibool page_counter_try_charge(struct page_counter *counter, 9862306a36Sopenharmony_ci unsigned long nr_pages, 9962306a36Sopenharmony_ci struct page_counter **fail) 10062306a36Sopenharmony_ci{ 10162306a36Sopenharmony_ci struct page_counter *c; 10262306a36Sopenharmony_ci 10362306a36Sopenharmony_ci for (c = counter; c; c = c->parent) { 10462306a36Sopenharmony_ci long new; 10562306a36Sopenharmony_ci /* 10662306a36Sopenharmony_ci * Charge speculatively to avoid an expensive CAS. If 10762306a36Sopenharmony_ci * a bigger charge fails, it might falsely lock out a 10862306a36Sopenharmony_ci * racing smaller charge and send it into reclaim 10962306a36Sopenharmony_ci * early, but the error is limited to the difference 11062306a36Sopenharmony_ci * between the two sizes, which is less than 2M/4M in 11162306a36Sopenharmony_ci * case of a THP locking out a regular page charge. 11262306a36Sopenharmony_ci * 11362306a36Sopenharmony_ci * The atomic_long_add_return() implies a full memory 11462306a36Sopenharmony_ci * barrier between incrementing the count and reading 11562306a36Sopenharmony_ci * the limit. When racing with page_counter_set_max(), 11662306a36Sopenharmony_ci * we either see the new limit or the setter sees the 11762306a36Sopenharmony_ci * counter has changed and retries. 11862306a36Sopenharmony_ci */ 11962306a36Sopenharmony_ci new = atomic_long_add_return(nr_pages, &c->usage); 12062306a36Sopenharmony_ci if (new > c->max) { 12162306a36Sopenharmony_ci atomic_long_sub(nr_pages, &c->usage); 12262306a36Sopenharmony_ci /* 12362306a36Sopenharmony_ci * This is racy, but we can live with some 12462306a36Sopenharmony_ci * inaccuracy in the failcnt which is only used 12562306a36Sopenharmony_ci * to report stats. 12662306a36Sopenharmony_ci */ 12762306a36Sopenharmony_ci data_race(c->failcnt++); 12862306a36Sopenharmony_ci *fail = c; 12962306a36Sopenharmony_ci goto failed; 13062306a36Sopenharmony_ci } 13162306a36Sopenharmony_ci propagate_protected_usage(c, new); 13262306a36Sopenharmony_ci /* 13362306a36Sopenharmony_ci * Just like with failcnt, we can live with some 13462306a36Sopenharmony_ci * inaccuracy in the watermark. 13562306a36Sopenharmony_ci */ 13662306a36Sopenharmony_ci if (new > READ_ONCE(c->watermark)) 13762306a36Sopenharmony_ci WRITE_ONCE(c->watermark, new); 13862306a36Sopenharmony_ci } 13962306a36Sopenharmony_ci return true; 14062306a36Sopenharmony_ci 14162306a36Sopenharmony_cifailed: 14262306a36Sopenharmony_ci for (c = counter; c != *fail; c = c->parent) 14362306a36Sopenharmony_ci page_counter_cancel(c, nr_pages); 14462306a36Sopenharmony_ci 14562306a36Sopenharmony_ci return false; 14662306a36Sopenharmony_ci} 14762306a36Sopenharmony_ci 14862306a36Sopenharmony_ci/** 14962306a36Sopenharmony_ci * page_counter_uncharge - hierarchically uncharge pages 15062306a36Sopenharmony_ci * @counter: counter 15162306a36Sopenharmony_ci * @nr_pages: number of pages to uncharge 15262306a36Sopenharmony_ci */ 15362306a36Sopenharmony_civoid page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages) 15462306a36Sopenharmony_ci{ 15562306a36Sopenharmony_ci struct page_counter *c; 15662306a36Sopenharmony_ci 15762306a36Sopenharmony_ci for (c = counter; c; c = c->parent) 15862306a36Sopenharmony_ci page_counter_cancel(c, nr_pages); 15962306a36Sopenharmony_ci} 16062306a36Sopenharmony_ci 16162306a36Sopenharmony_ci/** 16262306a36Sopenharmony_ci * page_counter_set_max - set the maximum number of pages allowed 16362306a36Sopenharmony_ci * @counter: counter 16462306a36Sopenharmony_ci * @nr_pages: limit to set 16562306a36Sopenharmony_ci * 16662306a36Sopenharmony_ci * Returns 0 on success, -EBUSY if the current number of pages on the 16762306a36Sopenharmony_ci * counter already exceeds the specified limit. 16862306a36Sopenharmony_ci * 16962306a36Sopenharmony_ci * The caller must serialize invocations on the same counter. 17062306a36Sopenharmony_ci */ 17162306a36Sopenharmony_ciint page_counter_set_max(struct page_counter *counter, unsigned long nr_pages) 17262306a36Sopenharmony_ci{ 17362306a36Sopenharmony_ci for (;;) { 17462306a36Sopenharmony_ci unsigned long old; 17562306a36Sopenharmony_ci long usage; 17662306a36Sopenharmony_ci 17762306a36Sopenharmony_ci /* 17862306a36Sopenharmony_ci * Update the limit while making sure that it's not 17962306a36Sopenharmony_ci * below the concurrently-changing counter value. 18062306a36Sopenharmony_ci * 18162306a36Sopenharmony_ci * The xchg implies two full memory barriers before 18262306a36Sopenharmony_ci * and after, so the read-swap-read is ordered and 18362306a36Sopenharmony_ci * ensures coherency with page_counter_try_charge(): 18462306a36Sopenharmony_ci * that function modifies the count before checking 18562306a36Sopenharmony_ci * the limit, so if it sees the old limit, we see the 18662306a36Sopenharmony_ci * modified counter and retry. 18762306a36Sopenharmony_ci */ 18862306a36Sopenharmony_ci usage = page_counter_read(counter); 18962306a36Sopenharmony_ci 19062306a36Sopenharmony_ci if (usage > nr_pages) 19162306a36Sopenharmony_ci return -EBUSY; 19262306a36Sopenharmony_ci 19362306a36Sopenharmony_ci old = xchg(&counter->max, nr_pages); 19462306a36Sopenharmony_ci 19562306a36Sopenharmony_ci if (page_counter_read(counter) <= usage || nr_pages >= old) 19662306a36Sopenharmony_ci return 0; 19762306a36Sopenharmony_ci 19862306a36Sopenharmony_ci counter->max = old; 19962306a36Sopenharmony_ci cond_resched(); 20062306a36Sopenharmony_ci } 20162306a36Sopenharmony_ci} 20262306a36Sopenharmony_ci 20362306a36Sopenharmony_ci/** 20462306a36Sopenharmony_ci * page_counter_set_min - set the amount of protected memory 20562306a36Sopenharmony_ci * @counter: counter 20662306a36Sopenharmony_ci * @nr_pages: value to set 20762306a36Sopenharmony_ci * 20862306a36Sopenharmony_ci * The caller must serialize invocations on the same counter. 20962306a36Sopenharmony_ci */ 21062306a36Sopenharmony_civoid page_counter_set_min(struct page_counter *counter, unsigned long nr_pages) 21162306a36Sopenharmony_ci{ 21262306a36Sopenharmony_ci struct page_counter *c; 21362306a36Sopenharmony_ci 21462306a36Sopenharmony_ci WRITE_ONCE(counter->min, nr_pages); 21562306a36Sopenharmony_ci 21662306a36Sopenharmony_ci for (c = counter; c; c = c->parent) 21762306a36Sopenharmony_ci propagate_protected_usage(c, atomic_long_read(&c->usage)); 21862306a36Sopenharmony_ci} 21962306a36Sopenharmony_ci 22062306a36Sopenharmony_ci/** 22162306a36Sopenharmony_ci * page_counter_set_low - set the amount of protected memory 22262306a36Sopenharmony_ci * @counter: counter 22362306a36Sopenharmony_ci * @nr_pages: value to set 22462306a36Sopenharmony_ci * 22562306a36Sopenharmony_ci * The caller must serialize invocations on the same counter. 22662306a36Sopenharmony_ci */ 22762306a36Sopenharmony_civoid page_counter_set_low(struct page_counter *counter, unsigned long nr_pages) 22862306a36Sopenharmony_ci{ 22962306a36Sopenharmony_ci struct page_counter *c; 23062306a36Sopenharmony_ci 23162306a36Sopenharmony_ci WRITE_ONCE(counter->low, nr_pages); 23262306a36Sopenharmony_ci 23362306a36Sopenharmony_ci for (c = counter; c; c = c->parent) 23462306a36Sopenharmony_ci propagate_protected_usage(c, atomic_long_read(&c->usage)); 23562306a36Sopenharmony_ci} 23662306a36Sopenharmony_ci 23762306a36Sopenharmony_ci/** 23862306a36Sopenharmony_ci * page_counter_memparse - memparse() for page counter limits 23962306a36Sopenharmony_ci * @buf: string to parse 24062306a36Sopenharmony_ci * @max: string meaning maximum possible value 24162306a36Sopenharmony_ci * @nr_pages: returns the result in number of pages 24262306a36Sopenharmony_ci * 24362306a36Sopenharmony_ci * Returns -EINVAL, or 0 and @nr_pages on success. @nr_pages will be 24462306a36Sopenharmony_ci * limited to %PAGE_COUNTER_MAX. 24562306a36Sopenharmony_ci */ 24662306a36Sopenharmony_ciint page_counter_memparse(const char *buf, const char *max, 24762306a36Sopenharmony_ci unsigned long *nr_pages) 24862306a36Sopenharmony_ci{ 24962306a36Sopenharmony_ci char *end; 25062306a36Sopenharmony_ci u64 bytes; 25162306a36Sopenharmony_ci 25262306a36Sopenharmony_ci if (!strcmp(buf, max)) { 25362306a36Sopenharmony_ci *nr_pages = PAGE_COUNTER_MAX; 25462306a36Sopenharmony_ci return 0; 25562306a36Sopenharmony_ci } 25662306a36Sopenharmony_ci 25762306a36Sopenharmony_ci bytes = memparse(buf, &end); 25862306a36Sopenharmony_ci if (*end != '\0') 25962306a36Sopenharmony_ci return -EINVAL; 26062306a36Sopenharmony_ci 26162306a36Sopenharmony_ci *nr_pages = min(bytes / PAGE_SIZE, (u64)PAGE_COUNTER_MAX); 26262306a36Sopenharmony_ci 26362306a36Sopenharmony_ci return 0; 26462306a36Sopenharmony_ci} 265