1// SPDX-License-Identifier: GPL-2.0
2#define _GNU_SOURCE
3
4#include <linux/limits.h>
5#include <fcntl.h>
6#include <stdio.h>
7#include <stdlib.h>
8#include <string.h>
9#include <sys/stat.h>
10#include <sys/types.h>
11#include <unistd.h>
12#include <sys/wait.h>
13#include <errno.h>
14#include <sys/sysinfo.h>
15#include <pthread.h>
16
17#include "../kselftest.h"
18#include "cgroup_util.h"
19
20
/*
 * Memory cgroup charging and vmstat data aggregation are performed using
 * percpu batches 32 pages big (look at MEMCG_CHARGE_BATCH). So the maximum
 * discrepancy between charge and vmstat entries is the number of cpus
 * multiplied by 32 pages multiplied by 2.
 *
 * The batch is counted in pages, so the bound is computed from the page
 * size reported at run time instead of a hard-coded 4096 bytes. On systems
 * with 4K pages the value is unchanged. The expression has type long.
 */
#define MAX_VMSTAT_ERROR (sysconf(_SC_PAGESIZE) * 32 * 2 * get_nprocs())
28
29
/*
 * Call stat() on a series of unique paths that are not expected to exist.
 * The number of lookups is passed in @arg as an integer stored in the
 * pointer value; @cgroup is not used. The result of stat() is ignored:
 * only the lookups themselves matter (callers use them to produce
 * negative dentries). Always returns 0.
 */
static int alloc_dcache(const char *cgroup, void *arg)
{
	const unsigned long nr_lookups = (unsigned long)arg;
	unsigned long n = 0;
	char path[128];
	struct stat sb;

	while (n < nr_lookups) {
		/* Lookup index and pid make every path unique. */
		snprintf(path, sizeof(path),
			 "/something-non-existent-with-a-long-name-%64lu-%d",
			 n, getpid());
		stat(path, &sb);
		n++;
	}

	return 0;
}
45
46/*
47 * This test allocates 100000 of negative dentries with long names.
48 * Then it checks that "slab" in memory.stat is larger than 1M.
49 * Then it sets memory.high to 1M and checks that at least 1/2
50 * of slab memory has been reclaimed.
51 */
52static int test_kmem_basic(const char *root)
53{
54	int ret = KSFT_FAIL;
55	char *cg = NULL;
56	long slab0, slab1, current;
57
58	cg = cg_name(root, "kmem_basic_test");
59	if (!cg)
60		goto cleanup;
61
62	if (cg_create(cg))
63		goto cleanup;
64
65	if (cg_run(cg, alloc_dcache, (void *)100000))
66		goto cleanup;
67
68	slab0 = cg_read_key_long(cg, "memory.stat", "slab ");
69	if (slab0 < (1 << 20))
70		goto cleanup;
71
72	cg_write(cg, "memory.high", "1M");
73	slab1 = cg_read_key_long(cg, "memory.stat", "slab ");
74	if (slab1 <= 0)
75		goto cleanup;
76
77	current = cg_read_long(cg, "memory.current");
78	if (current <= 0)
79		goto cleanup;
80
81	if (slab1 < slab0 / 2 && current < slab0 / 2)
82		ret = KSFT_PASS;
83cleanup:
84	cg_destroy(cg);
85	free(cg);
86
87	return ret;
88}
89
/*
 * Thread body used by alloc_kmem_smp(): performs 100 alloc_dcache()
 * lookups. The thread argument is not used and the thread result is
 * always NULL.
 */
static void *alloc_kmem_fn(void *arg)
{
	void *nr_lookups = (void *)100;

	(void)arg;
	alloc_dcache(NULL, nr_lookups);

	return NULL;
}
95
96static int alloc_kmem_smp(const char *cgroup, void *arg)
97{
98	int nr_threads = 2 * get_nprocs();
99	pthread_t *tinfo;
100	unsigned long i;
101	int ret = -1;
102
103	tinfo = calloc(nr_threads, sizeof(pthread_t));
104	if (tinfo == NULL)
105		return -1;
106
107	for (i = 0; i < nr_threads; i++) {
108		if (pthread_create(&tinfo[i], NULL, &alloc_kmem_fn,
109				   (void *)i)) {
110			free(tinfo);
111			return -1;
112		}
113	}
114
115	for (i = 0; i < nr_threads; i++) {
116		ret = pthread_join(tinfo[i], NULL);
117		if (ret)
118			break;
119	}
120
121	free(tinfo);
122	return ret;
123}
124
/*
 * Sequentially, @times times: build a child cgroup name with
 * cg_name_indexed(@parent, "child", i), create that cgroup, run @fn with
 * @arg in it through cg_run() and destroy it again.
 *
 * Returns 0 on success and -1 as soon as naming, creating or running in
 * one of the children fails. The child is destroyed and its name freed on
 * every path.
 */
static int cg_run_in_subcgroups(const char *parent,
				int (*fn)(const char *cgroup, void *arg),
				void *arg, int times)
{
	char *child;
	int ret = 0;
	int i;

	for (i = 0; i < times && !ret; i++) {
		child = cg_name_indexed(parent, "child", i);
		if (!child)
			return -1;

		/* Hand the caller's @arg on to @fn. */
		if (cg_create(child) || cg_run(child, fn, arg))
			ret = -1;

		cg_destroy(child);
		free(child);
	}

	return ret;
}
155
156/*
157 * The test creates and destroys a large number of cgroups. In each cgroup it
158 * allocates some slab memory (mostly negative dentries) using 2 * NR_CPUS
159 * threads. Then it checks the sanity of numbers on the parent level:
160 * the total size of the cgroups should be roughly equal to
161 * anon + file + slab + kernel_stack.
162 */
163static int test_kmem_memcg_deletion(const char *root)
164{
165	long current, slab, anon, file, kernel_stack, sum;
166	int ret = KSFT_FAIL;
167	char *parent;
168
169	parent = cg_name(root, "kmem_memcg_deletion_test");
170	if (!parent)
171		goto cleanup;
172
173	if (cg_create(parent))
174		goto cleanup;
175
176	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
177		goto cleanup;
178
179	if (cg_run_in_subcgroups(parent, alloc_kmem_smp, NULL, 100))
180		goto cleanup;
181
182	current = cg_read_long(parent, "memory.current");
183	slab = cg_read_key_long(parent, "memory.stat", "slab ");
184	anon = cg_read_key_long(parent, "memory.stat", "anon ");
185	file = cg_read_key_long(parent, "memory.stat", "file ");
186	kernel_stack = cg_read_key_long(parent, "memory.stat", "kernel_stack ");
187	if (current < 0 || slab < 0 || anon < 0 || file < 0 ||
188	    kernel_stack < 0)
189		goto cleanup;
190
191	sum = slab + anon + file + kernel_stack;
192	if (abs(sum - current) < MAX_VMSTAT_ERROR) {
193		ret = KSFT_PASS;
194	} else {
195		printf("memory.current = %ld\n", current);
196		printf("slab + anon + file + kernel_stack = %ld\n", sum);
197		printf("slab = %ld\n", slab);
198		printf("anon = %ld\n", anon);
199		printf("file = %ld\n", file);
200		printf("kernel_stack = %ld\n", kernel_stack);
201	}
202
203cleanup:
204	cg_destroy(parent);
205	free(parent);
206
207	return ret;
208}
209
210/*
211 * The test reads the entire /proc/kpagecgroup. If the operation went
212 * successfully (and the kernel didn't panic), the test is treated as passed.
213 */
214static int test_kmem_proc_kpagecgroup(const char *root)
215{
216	unsigned long buf[128];
217	int ret = KSFT_FAIL;
218	ssize_t len;
219	int fd;
220
221	fd = open("/proc/kpagecgroup", O_RDONLY);
222	if (fd < 0)
223		return ret;
224
225	do {
226		len = read(fd, buf, sizeof(buf));
227	} while (len > 0);
228
229	if (len == 0)
230		ret = KSFT_PASS;
231
232	close(fd);
233	return ret;
234}
235
/*
 * Thread body for spawn_1000_threads(): sleeps for 100 seconds, which
 * keeps the thread alive in the meantime, then ends with a NULL result.
 * @arg is not used.
 */
static void *pthread_wait_fn(void *arg)
{
	(void)arg;
	sleep(100);

	return NULL;
}
241
242static int spawn_1000_threads(const char *cgroup, void *arg)
243{
244	int nr_threads = 1000;
245	pthread_t *tinfo;
246	unsigned long i;
247	long stack;
248	int ret = -1;
249
250	tinfo = calloc(nr_threads, sizeof(pthread_t));
251	if (tinfo == NULL)
252		return -1;
253
254	for (i = 0; i < nr_threads; i++) {
255		if (pthread_create(&tinfo[i], NULL, &pthread_wait_fn,
256				   (void *)i)) {
257			free(tinfo);
258			return(-1);
259		}
260	}
261
262	stack = cg_read_key_long(cgroup, "memory.stat", "kernel_stack ");
263	if (stack >= 4096 * 1000)
264		ret = 0;
265
266	free(tinfo);
267	return ret;
268}
269
270/*
271 * The test spawns a process, which spawns 1000 threads. Then it checks
272 * that memory.stat's kernel_stack is at least 1000 pages large.
273 */
274static int test_kmem_kernel_stacks(const char *root)
275{
276	int ret = KSFT_FAIL;
277	char *cg = NULL;
278
279	cg = cg_name(root, "kmem_kernel_stacks_test");
280	if (!cg)
281		goto cleanup;
282
283	if (cg_create(cg))
284		goto cleanup;
285
286	if (cg_run(cg, spawn_1000_threads, NULL))
287		goto cleanup;
288
289	ret = KSFT_PASS;
290cleanup:
291	cg_destroy(cg);
292	free(cg);
293
294	return ret;
295}
296
297/*
298 * This test sequentionally creates 30 child cgroups, allocates some
299 * kernel memory in each of them, and deletes them. Then it checks
300 * that the number of dying cgroups on the parent level is 0.
301 */
302static int test_kmem_dead_cgroups(const char *root)
303{
304	int ret = KSFT_FAIL;
305	char *parent;
306	long dead;
307	int i;
308
309	parent = cg_name(root, "kmem_dead_cgroups_test");
310	if (!parent)
311		goto cleanup;
312
313	if (cg_create(parent))
314		goto cleanup;
315
316	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
317		goto cleanup;
318
319	if (cg_run_in_subcgroups(parent, alloc_dcache, (void *)100, 30))
320		goto cleanup;
321
322	for (i = 0; i < 5; i++) {
323		dead = cg_read_key_long(parent, "cgroup.stat",
324					"nr_dying_descendants ");
325		if (dead == 0) {
326			ret = KSFT_PASS;
327			break;
328		}
329		/*
330		 * Reclaiming cgroups might take some time,
331		 * let's wait a bit and repeat.
332		 */
333		sleep(1);
334	}
335
336cleanup:
337	cg_destroy(parent);
338	free(parent);
339
340	return ret;
341}
342
343/*
344 * This test creates a sub-tree with 1000 memory cgroups.
345 * Then it checks that the memory.current on the parent level
346 * is greater than 0 and approximates matches the percpu value
347 * from memory.stat.
348 */
349static int test_percpu_basic(const char *root)
350{
351	int ret = KSFT_FAIL;
352	char *parent, *child;
353	long current, percpu;
354	int i;
355
356	parent = cg_name(root, "percpu_basic_test");
357	if (!parent)
358		goto cleanup;
359
360	if (cg_create(parent))
361		goto cleanup;
362
363	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
364		goto cleanup;
365
366	for (i = 0; i < 1000; i++) {
367		child = cg_name_indexed(parent, "child", i);
368		if (!child)
369			return -1;
370
371		if (cg_create(child))
372			goto cleanup_children;
373
374		free(child);
375	}
376
377	current = cg_read_long(parent, "memory.current");
378	percpu = cg_read_key_long(parent, "memory.stat", "percpu ");
379
380	if (current > 0 && percpu > 0 && abs(current - percpu) <
381	    MAX_VMSTAT_ERROR)
382		ret = KSFT_PASS;
383	else
384		printf("memory.current %ld\npercpu %ld\n",
385		       current, percpu);
386
387cleanup_children:
388	for (i = 0; i < 1000; i++) {
389		child = cg_name_indexed(parent, "child", i);
390		cg_destroy(child);
391		free(child);
392	}
393
394cleanup:
395	cg_destroy(parent);
396	free(parent);
397
398	return ret;
399}
400
401#define T(x) { x, #x }
402struct kmem_test {
403	int (*fn)(const char *root);
404	const char *name;
405} tests[] = {
406	T(test_kmem_basic),
407	T(test_kmem_memcg_deletion),
408	T(test_kmem_proc_kpagecgroup),
409	T(test_kmem_kernel_stacks),
410	T(test_kmem_dead_cgroups),
411	T(test_percpu_basic),
412};
413#undef T
414
415int main(int argc, char **argv)
416{
417	char root[PATH_MAX];
418	int i, ret = EXIT_SUCCESS;
419
420	if (cg_find_unified_root(root, sizeof(root)))
421		ksft_exit_skip("cgroup v2 isn't mounted\n");
422
423	/*
424	 * Check that memory controller is available:
425	 * memory is listed in cgroup.controllers
426	 */
427	if (cg_read_strstr(root, "cgroup.controllers", "memory"))
428		ksft_exit_skip("memory controller isn't available\n");
429
430	if (cg_read_strstr(root, "cgroup.subtree_control", "memory"))
431		if (cg_write(root, "cgroup.subtree_control", "+memory"))
432			ksft_exit_skip("Failed to set memory controller\n");
433
434	for (i = 0; i < ARRAY_SIZE(tests); i++) {
435		switch (tests[i].fn(root)) {
436		case KSFT_PASS:
437			ksft_test_result_pass("%s\n", tests[i].name);
438			break;
439		case KSFT_SKIP:
440			ksft_test_result_skip("%s\n", tests[i].name);
441			break;
442		default:
443			ret = EXIT_FAILURE;
444			ksft_test_result_fail("%s\n", tests[i].name);
445			break;
446		}
447	}
448
449	return ret;
450}
451