18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0
28c2ecf20Sopenharmony_ci/*
38c2ecf20Sopenharmony_ci * Lockless hierarchical page accounting & limiting
48c2ecf20Sopenharmony_ci *
58c2ecf20Sopenharmony_ci * Copyright (C) 2014 Red Hat, Inc., Johannes Weiner
68c2ecf20Sopenharmony_ci */
78c2ecf20Sopenharmony_ci
88c2ecf20Sopenharmony_ci#include <linux/page_counter.h>
98c2ecf20Sopenharmony_ci#include <linux/atomic.h>
108c2ecf20Sopenharmony_ci#include <linux/kernel.h>
118c2ecf20Sopenharmony_ci#include <linux/string.h>
128c2ecf20Sopenharmony_ci#include <linux/sched.h>
138c2ecf20Sopenharmony_ci#include <linux/bug.h>
148c2ecf20Sopenharmony_ci#include <asm/page.h>
158c2ecf20Sopenharmony_ci
168c2ecf20Sopenharmony_cistatic void propagate_protected_usage(struct page_counter *c,
178c2ecf20Sopenharmony_ci				      unsigned long usage)
188c2ecf20Sopenharmony_ci{
198c2ecf20Sopenharmony_ci	unsigned long protected, old_protected;
208c2ecf20Sopenharmony_ci	unsigned long low, min;
218c2ecf20Sopenharmony_ci	long delta;
228c2ecf20Sopenharmony_ci
238c2ecf20Sopenharmony_ci	if (!c->parent)
248c2ecf20Sopenharmony_ci		return;
258c2ecf20Sopenharmony_ci
268c2ecf20Sopenharmony_ci	min = READ_ONCE(c->min);
278c2ecf20Sopenharmony_ci	if (min || atomic_long_read(&c->min_usage)) {
288c2ecf20Sopenharmony_ci		protected = min(usage, min);
298c2ecf20Sopenharmony_ci		old_protected = atomic_long_xchg(&c->min_usage, protected);
308c2ecf20Sopenharmony_ci		delta = protected - old_protected;
318c2ecf20Sopenharmony_ci		if (delta)
328c2ecf20Sopenharmony_ci			atomic_long_add(delta, &c->parent->children_min_usage);
338c2ecf20Sopenharmony_ci	}
348c2ecf20Sopenharmony_ci
358c2ecf20Sopenharmony_ci	low = READ_ONCE(c->low);
368c2ecf20Sopenharmony_ci	if (low || atomic_long_read(&c->low_usage)) {
378c2ecf20Sopenharmony_ci		protected = min(usage, low);
388c2ecf20Sopenharmony_ci		old_protected = atomic_long_xchg(&c->low_usage, protected);
398c2ecf20Sopenharmony_ci		delta = protected - old_protected;
408c2ecf20Sopenharmony_ci		if (delta)
418c2ecf20Sopenharmony_ci			atomic_long_add(delta, &c->parent->children_low_usage);
428c2ecf20Sopenharmony_ci	}
438c2ecf20Sopenharmony_ci}
448c2ecf20Sopenharmony_ci
458c2ecf20Sopenharmony_ci/**
468c2ecf20Sopenharmony_ci * page_counter_cancel - take pages out of the local counter
478c2ecf20Sopenharmony_ci * @counter: counter
488c2ecf20Sopenharmony_ci * @nr_pages: number of pages to cancel
498c2ecf20Sopenharmony_ci */
508c2ecf20Sopenharmony_civoid page_counter_cancel(struct page_counter *counter, unsigned long nr_pages)
518c2ecf20Sopenharmony_ci{
528c2ecf20Sopenharmony_ci	long new;
538c2ecf20Sopenharmony_ci
548c2ecf20Sopenharmony_ci	new = atomic_long_sub_return(nr_pages, &counter->usage);
558c2ecf20Sopenharmony_ci	propagate_protected_usage(counter, new);
568c2ecf20Sopenharmony_ci	/* More uncharges than charges? */
578c2ecf20Sopenharmony_ci	WARN_ON_ONCE(new < 0);
588c2ecf20Sopenharmony_ci}
598c2ecf20Sopenharmony_ci
608c2ecf20Sopenharmony_ci/**
618c2ecf20Sopenharmony_ci * page_counter_charge - hierarchically charge pages
628c2ecf20Sopenharmony_ci * @counter: counter
638c2ecf20Sopenharmony_ci * @nr_pages: number of pages to charge
648c2ecf20Sopenharmony_ci *
658c2ecf20Sopenharmony_ci * NOTE: This does not consider any configured counter limits.
668c2ecf20Sopenharmony_ci */
678c2ecf20Sopenharmony_civoid page_counter_charge(struct page_counter *counter, unsigned long nr_pages)
688c2ecf20Sopenharmony_ci{
698c2ecf20Sopenharmony_ci	struct page_counter *c;
708c2ecf20Sopenharmony_ci
718c2ecf20Sopenharmony_ci	for (c = counter; c; c = c->parent) {
728c2ecf20Sopenharmony_ci		long new;
738c2ecf20Sopenharmony_ci
748c2ecf20Sopenharmony_ci		new = atomic_long_add_return(nr_pages, &c->usage);
758c2ecf20Sopenharmony_ci		propagate_protected_usage(c, new);
768c2ecf20Sopenharmony_ci		/*
778c2ecf20Sopenharmony_ci		 * This is indeed racy, but we can live with some
788c2ecf20Sopenharmony_ci		 * inaccuracy in the watermark.
798c2ecf20Sopenharmony_ci		 */
808c2ecf20Sopenharmony_ci		if (new > READ_ONCE(c->watermark))
818c2ecf20Sopenharmony_ci			WRITE_ONCE(c->watermark, new);
828c2ecf20Sopenharmony_ci	}
838c2ecf20Sopenharmony_ci}
848c2ecf20Sopenharmony_ci
858c2ecf20Sopenharmony_ci/**
868c2ecf20Sopenharmony_ci * page_counter_try_charge - try to hierarchically charge pages
878c2ecf20Sopenharmony_ci * @counter: counter
888c2ecf20Sopenharmony_ci * @nr_pages: number of pages to charge
898c2ecf20Sopenharmony_ci * @fail: points first counter to hit its limit, if any
908c2ecf20Sopenharmony_ci *
918c2ecf20Sopenharmony_ci * Returns %true on success, or %false and @fail if the counter or one
928c2ecf20Sopenharmony_ci * of its ancestors has hit its configured limit.
938c2ecf20Sopenharmony_ci */
948c2ecf20Sopenharmony_cibool page_counter_try_charge(struct page_counter *counter,
958c2ecf20Sopenharmony_ci			     unsigned long nr_pages,
968c2ecf20Sopenharmony_ci			     struct page_counter **fail)
978c2ecf20Sopenharmony_ci{
988c2ecf20Sopenharmony_ci	struct page_counter *c;
998c2ecf20Sopenharmony_ci
1008c2ecf20Sopenharmony_ci	for (c = counter; c; c = c->parent) {
1018c2ecf20Sopenharmony_ci		long new;
1028c2ecf20Sopenharmony_ci		/*
1038c2ecf20Sopenharmony_ci		 * Charge speculatively to avoid an expensive CAS.  If
1048c2ecf20Sopenharmony_ci		 * a bigger charge fails, it might falsely lock out a
1058c2ecf20Sopenharmony_ci		 * racing smaller charge and send it into reclaim
1068c2ecf20Sopenharmony_ci		 * early, but the error is limited to the difference
1078c2ecf20Sopenharmony_ci		 * between the two sizes, which is less than 2M/4M in
1088c2ecf20Sopenharmony_ci		 * case of a THP locking out a regular page charge.
1098c2ecf20Sopenharmony_ci		 *
1108c2ecf20Sopenharmony_ci		 * The atomic_long_add_return() implies a full memory
1118c2ecf20Sopenharmony_ci		 * barrier between incrementing the count and reading
1128c2ecf20Sopenharmony_ci		 * the limit.  When racing with page_counter_set_max(),
1138c2ecf20Sopenharmony_ci		 * we either see the new limit or the setter sees the
1148c2ecf20Sopenharmony_ci		 * counter has changed and retries.
1158c2ecf20Sopenharmony_ci		 */
1168c2ecf20Sopenharmony_ci		new = atomic_long_add_return(nr_pages, &c->usage);
1178c2ecf20Sopenharmony_ci		if (new > c->max) {
1188c2ecf20Sopenharmony_ci			atomic_long_sub(nr_pages, &c->usage);
1198c2ecf20Sopenharmony_ci			propagate_protected_usage(c, new);
1208c2ecf20Sopenharmony_ci			/*
1218c2ecf20Sopenharmony_ci			 * This is racy, but we can live with some
1228c2ecf20Sopenharmony_ci			 * inaccuracy in the failcnt which is only used
1238c2ecf20Sopenharmony_ci			 * to report stats.
1248c2ecf20Sopenharmony_ci			 */
1258c2ecf20Sopenharmony_ci			data_race(c->failcnt++);
1268c2ecf20Sopenharmony_ci			*fail = c;
1278c2ecf20Sopenharmony_ci			goto failed;
1288c2ecf20Sopenharmony_ci		}
1298c2ecf20Sopenharmony_ci		propagate_protected_usage(c, new);
1308c2ecf20Sopenharmony_ci		/*
1318c2ecf20Sopenharmony_ci		 * Just like with failcnt, we can live with some
1328c2ecf20Sopenharmony_ci		 * inaccuracy in the watermark.
1338c2ecf20Sopenharmony_ci		 */
1348c2ecf20Sopenharmony_ci		if (new > READ_ONCE(c->watermark))
1358c2ecf20Sopenharmony_ci			WRITE_ONCE(c->watermark, new);
1368c2ecf20Sopenharmony_ci	}
1378c2ecf20Sopenharmony_ci	return true;
1388c2ecf20Sopenharmony_ci
1398c2ecf20Sopenharmony_cifailed:
1408c2ecf20Sopenharmony_ci	for (c = counter; c != *fail; c = c->parent)
1418c2ecf20Sopenharmony_ci		page_counter_cancel(c, nr_pages);
1428c2ecf20Sopenharmony_ci
1438c2ecf20Sopenharmony_ci	return false;
1448c2ecf20Sopenharmony_ci}
1458c2ecf20Sopenharmony_ci
1468c2ecf20Sopenharmony_ci/**
1478c2ecf20Sopenharmony_ci * page_counter_uncharge - hierarchically uncharge pages
1488c2ecf20Sopenharmony_ci * @counter: counter
1498c2ecf20Sopenharmony_ci * @nr_pages: number of pages to uncharge
1508c2ecf20Sopenharmony_ci */
1518c2ecf20Sopenharmony_civoid page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages)
1528c2ecf20Sopenharmony_ci{
1538c2ecf20Sopenharmony_ci	struct page_counter *c;
1548c2ecf20Sopenharmony_ci
1558c2ecf20Sopenharmony_ci	for (c = counter; c; c = c->parent)
1568c2ecf20Sopenharmony_ci		page_counter_cancel(c, nr_pages);
1578c2ecf20Sopenharmony_ci}
1588c2ecf20Sopenharmony_ci
1598c2ecf20Sopenharmony_ci/**
1608c2ecf20Sopenharmony_ci * page_counter_set_max - set the maximum number of pages allowed
1618c2ecf20Sopenharmony_ci * @counter: counter
1628c2ecf20Sopenharmony_ci * @nr_pages: limit to set
1638c2ecf20Sopenharmony_ci *
1648c2ecf20Sopenharmony_ci * Returns 0 on success, -EBUSY if the current number of pages on the
1658c2ecf20Sopenharmony_ci * counter already exceeds the specified limit.
1668c2ecf20Sopenharmony_ci *
1678c2ecf20Sopenharmony_ci * The caller must serialize invocations on the same counter.
1688c2ecf20Sopenharmony_ci */
1698c2ecf20Sopenharmony_ciint page_counter_set_max(struct page_counter *counter, unsigned long nr_pages)
1708c2ecf20Sopenharmony_ci{
1718c2ecf20Sopenharmony_ci	for (;;) {
1728c2ecf20Sopenharmony_ci		unsigned long old;
1738c2ecf20Sopenharmony_ci		long usage;
1748c2ecf20Sopenharmony_ci
1758c2ecf20Sopenharmony_ci		/*
1768c2ecf20Sopenharmony_ci		 * Update the limit while making sure that it's not
1778c2ecf20Sopenharmony_ci		 * below the concurrently-changing counter value.
1788c2ecf20Sopenharmony_ci		 *
1798c2ecf20Sopenharmony_ci		 * The xchg implies two full memory barriers before
1808c2ecf20Sopenharmony_ci		 * and after, so the read-swap-read is ordered and
1818c2ecf20Sopenharmony_ci		 * ensures coherency with page_counter_try_charge():
1828c2ecf20Sopenharmony_ci		 * that function modifies the count before checking
1838c2ecf20Sopenharmony_ci		 * the limit, so if it sees the old limit, we see the
1848c2ecf20Sopenharmony_ci		 * modified counter and retry.
1858c2ecf20Sopenharmony_ci		 */
1868c2ecf20Sopenharmony_ci		usage = atomic_long_read(&counter->usage);
1878c2ecf20Sopenharmony_ci
1888c2ecf20Sopenharmony_ci		if (usage > nr_pages)
1898c2ecf20Sopenharmony_ci			return -EBUSY;
1908c2ecf20Sopenharmony_ci
1918c2ecf20Sopenharmony_ci		old = xchg(&counter->max, nr_pages);
1928c2ecf20Sopenharmony_ci
1938c2ecf20Sopenharmony_ci		if (atomic_long_read(&counter->usage) <= usage)
1948c2ecf20Sopenharmony_ci			return 0;
1958c2ecf20Sopenharmony_ci
1968c2ecf20Sopenharmony_ci		counter->max = old;
1978c2ecf20Sopenharmony_ci		cond_resched();
1988c2ecf20Sopenharmony_ci	}
1998c2ecf20Sopenharmony_ci}
2008c2ecf20Sopenharmony_ci
2018c2ecf20Sopenharmony_ci/**
2028c2ecf20Sopenharmony_ci * page_counter_set_min - set the amount of protected memory
2038c2ecf20Sopenharmony_ci * @counter: counter
2048c2ecf20Sopenharmony_ci * @nr_pages: value to set
2058c2ecf20Sopenharmony_ci *
2068c2ecf20Sopenharmony_ci * The caller must serialize invocations on the same counter.
2078c2ecf20Sopenharmony_ci */
2088c2ecf20Sopenharmony_civoid page_counter_set_min(struct page_counter *counter, unsigned long nr_pages)
2098c2ecf20Sopenharmony_ci{
2108c2ecf20Sopenharmony_ci	struct page_counter *c;
2118c2ecf20Sopenharmony_ci
2128c2ecf20Sopenharmony_ci	WRITE_ONCE(counter->min, nr_pages);
2138c2ecf20Sopenharmony_ci
2148c2ecf20Sopenharmony_ci	for (c = counter; c; c = c->parent)
2158c2ecf20Sopenharmony_ci		propagate_protected_usage(c, atomic_long_read(&c->usage));
2168c2ecf20Sopenharmony_ci}
2178c2ecf20Sopenharmony_ci
2188c2ecf20Sopenharmony_ci/**
2198c2ecf20Sopenharmony_ci * page_counter_set_low - set the amount of protected memory
2208c2ecf20Sopenharmony_ci * @counter: counter
2218c2ecf20Sopenharmony_ci * @nr_pages: value to set
2228c2ecf20Sopenharmony_ci *
2238c2ecf20Sopenharmony_ci * The caller must serialize invocations on the same counter.
2248c2ecf20Sopenharmony_ci */
2258c2ecf20Sopenharmony_civoid page_counter_set_low(struct page_counter *counter, unsigned long nr_pages)
2268c2ecf20Sopenharmony_ci{
2278c2ecf20Sopenharmony_ci	struct page_counter *c;
2288c2ecf20Sopenharmony_ci
2298c2ecf20Sopenharmony_ci	WRITE_ONCE(counter->low, nr_pages);
2308c2ecf20Sopenharmony_ci
2318c2ecf20Sopenharmony_ci	for (c = counter; c; c = c->parent)
2328c2ecf20Sopenharmony_ci		propagate_protected_usage(c, atomic_long_read(&c->usage));
2338c2ecf20Sopenharmony_ci}
2348c2ecf20Sopenharmony_ci
2358c2ecf20Sopenharmony_ci/**
2368c2ecf20Sopenharmony_ci * page_counter_memparse - memparse() for page counter limits
2378c2ecf20Sopenharmony_ci * @buf: string to parse
2388c2ecf20Sopenharmony_ci * @max: string meaning maximum possible value
2398c2ecf20Sopenharmony_ci * @nr_pages: returns the result in number of pages
2408c2ecf20Sopenharmony_ci *
2418c2ecf20Sopenharmony_ci * Returns -EINVAL, or 0 and @nr_pages on success.  @nr_pages will be
2428c2ecf20Sopenharmony_ci * limited to %PAGE_COUNTER_MAX.
2438c2ecf20Sopenharmony_ci */
2448c2ecf20Sopenharmony_ciint page_counter_memparse(const char *buf, const char *max,
2458c2ecf20Sopenharmony_ci			  unsigned long *nr_pages)
2468c2ecf20Sopenharmony_ci{
2478c2ecf20Sopenharmony_ci	char *end;
2488c2ecf20Sopenharmony_ci	u64 bytes;
2498c2ecf20Sopenharmony_ci
2508c2ecf20Sopenharmony_ci	if (!strcmp(buf, max)) {
2518c2ecf20Sopenharmony_ci		*nr_pages = PAGE_COUNTER_MAX;
2528c2ecf20Sopenharmony_ci		return 0;
2538c2ecf20Sopenharmony_ci	}
2548c2ecf20Sopenharmony_ci
2558c2ecf20Sopenharmony_ci	bytes = memparse(buf, &end);
2568c2ecf20Sopenharmony_ci	if (*end != '\0')
2578c2ecf20Sopenharmony_ci		return -EINVAL;
2588c2ecf20Sopenharmony_ci
2598c2ecf20Sopenharmony_ci	*nr_pages = min(bytes / PAGE_SIZE, (u64)PAGE_COUNTER_MAX);
2608c2ecf20Sopenharmony_ci
2618c2ecf20Sopenharmony_ci	return 0;
2628c2ecf20Sopenharmony_ci}
263