// SPDX-License-Identifier: GPL-2.0-only
/*
 * COW (Copy On Write) tests.
 *
 * Copyright 2022, Red Hat, Inc.
 *
 * Author(s): David Hildenbrand <david@redhat.com>
 */
#define _GNU_SOURCE
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <stdint.h>
#include <unistd.h>
#include <errno.h>
#include <fcntl.h>
#include <assert.h>
#include <linux/mman.h>
#include <sys/mman.h>
#include <sys/ioctl.h>
#include <sys/wait.h>
#include <linux/memfd.h>

#include "local_config.h"
#ifdef LOCAL_CONFIG_HAVE_LIBURING
#include <liburing.h>
#endif /* LOCAL_CONFIG_HAVE_LIBURING */

#include "../../../../mm/gup_test.h"
#include "../kselftest.h"
#include "vm_util.h"

static size_t pagesize;
static int pagemap_fd;
static size_t thpsize;
static int nr_hugetlbsizes;
static size_t hugetlbsizes[10];
static int gup_fd;
static bool has_huge_zeropage;

static void detect_huge_zeropage(void)
{
	int fd = open("/sys/kernel/mm/transparent_hugepage/use_zero_page",
		      O_RDONLY);
	size_t enabled = 0;
	char buf[15];
	int ret;

	if (fd < 0)
		return;

	ret = pread(fd, buf, sizeof(buf), 0);
	if (ret > 0 && ret < sizeof(buf)) {
		buf[ret] = 0;

		enabled = strtoul(buf, NULL, 10);
		if (enabled == 1) {
			has_huge_zeropage = true;
			ksft_print_msg("[INFO] huge zeropage is enabled\n");
		}
	}

	close(fd);
}

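/*
 * Check whether the whole range is currently swapped out. Assumes that both
 * addr and size are aligned to the base page size.
 */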
static bool range_is_swapped(void *addr, size_t size)
{
	for (; size; addr += pagesize, size -= pagesize)
		if (!pagemap_is_swapped(pagemap_fd, addr))
			return false;
	return true;
}

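/*
 * A pair of pipes for parent/child synchronization: the child signals that
 * it is ready via child_ready[] and the parent unblocks the child via
 * parent_ready[].
 */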
struct comm_pipes {
	int child_ready[2];
	int parent_ready[2];
};

static int setup_comm_pipes(struct comm_pipes *comm_pipes)
{
	if (pipe(comm_pipes->child_ready) < 0)
		return -errno;
	if (pipe(comm_pipes->parent_ready) < 0) {
		close(comm_pipes->child_ready[0]);
		close(comm_pipes->child_ready[1]);
		return -errno;
	}

	return 0;
}

static void close_comm_pipes(struct comm_pipes *comm_pipes)
{
	close(comm_pipes->child_ready[0]);
	close(comm_pipes->child_ready[1]);
	close(comm_pipes->parent_ready[0]);
	close(comm_pipes->parent_ready[1]);
}

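/*
 * Child function: back up the original content, wait until the parent
 * modified the page, and report via the exit status whether the child still
 * observes the old content.
 */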
static int child_memcmp_fn(char *mem, size_t size,
			   struct comm_pipes *comm_pipes)
{
	char *old = malloc(size);
	char buf;

	if (!old)
		return -ENOMEM;

	/* Backup the original content. */
	memcpy(old, mem, size);

	/* Wait until the parent modified the page. */
	write(comm_pipes->child_ready[1], "0", 1);
	while (read(comm_pipes->parent_ready[0], &buf, 1) != 1)
		;

	/* See if we still read the old values. */
	return memcmp(old, mem, size);
}

static int child_vmsplice_memcmp_fn(char *mem, size_t size,
				    struct comm_pipes *comm_pipes)
{
	struct iovec iov = {
		.iov_base = mem,
		.iov_len = size,
	};
	ssize_t cur, total, transferred;
	char *old, *new;
	int fds[2];
	char buf;

	old = malloc(size);
	new = malloc(size);
	if (!old || !new)
		return -ENOMEM;

	/* Backup the original content. */
	memcpy(old, mem, size);

	if (pipe(fds) < 0)
		return -errno;

	/* Trigger a read-only pin. */
	transferred = vmsplice(fds[1], &iov, 1, 0);
	if (transferred < 0)
		return -errno;
	if (transferred == 0)
		return -EINVAL;

	/* Unmap it from our page tables. */
	if (munmap(mem, size) < 0)
		return -errno;

	/* Wait until the parent modified it. */
	write(comm_pipes->child_ready[1], "0", 1);
	while (read(comm_pipes->parent_ready[0], &buf, 1) != 1)
		;

	/* See if we still read the old values via the pipe. */
	for (total = 0; total < transferred; total += cur) {
		cur = read(fds[0], new + total, transferred - total);
		if (cur < 0)
			return -errno;
	}

	return memcmp(old, new, transferred);
}
typedef int (*child_fn)(char *mem, size_t size, struct comm_pipes *comm_pipes);

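/*
 * Fork a child that runs @fn and modify the memory in the parent. If COW is
 * broken properly, the child must not observe the parent's modification.
 */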
static void do_test_cow_in_parent(char *mem, size_t size, bool do_mprotect,
				  child_fn fn)
{
	struct comm_pipes comm_pipes;
	char buf;
	int ret;

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		ksft_test_result_fail("pipe() failed\n");
		return;
	}

	ret = fork();
	if (ret < 0) {
		ksft_test_result_fail("fork() failed\n");
		goto close_comm_pipes;
	} else if (!ret) {
		exit(fn(mem, size, &comm_pipes));
	}

	while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
		;

	if (do_mprotect) {
		/*
		 * mprotect() optimizations might try avoiding
		 * write-faults by directly mapping pages writable.
		 */
		ret = mprotect(mem, size, PROT_READ);
		ret |= mprotect(mem, size, PROT_READ|PROT_WRITE);
		if (ret) {
			ksft_test_result_fail("mprotect() failed\n");
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			goto close_comm_pipes;
		}
	}

	/* Modify the page. */
	memset(mem, 0xff, size);
	write(comm_pipes.parent_ready[1], "0", 1);

	wait(&ret);
	if (WIFEXITED(ret))
		ret = WEXITSTATUS(ret);
	else
		ret = -EINVAL;

	ksft_test_result(!ret, "No leak from parent into child\n");
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
}

static void test_cow_in_parent(char *mem, size_t size)
{
	do_test_cow_in_parent(mem, size, false, child_memcmp_fn);
}

static void test_cow_in_parent_mprotect(char *mem, size_t size)
{
	do_test_cow_in_parent(mem, size, true, child_memcmp_fn);
}

static void test_vmsplice_in_child(char *mem, size_t size)
{
	do_test_cow_in_parent(mem, size, false, child_vmsplice_memcmp_fn);
}

static void test_vmsplice_in_child_mprotect(char *mem, size_t size)
{
	do_test_cow_in_parent(mem, size, true, child_vmsplice_memcmp_fn);
}

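/*
 * Take a R/O vmsplice() pin in the parent (either before or after fork()),
 * let the child modify the memory, and check whether the pipe still yields
 * the old content.
 */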
static void do_test_vmsplice_in_parent(char *mem, size_t size,
				       bool before_fork)
{
	struct iovec iov = {
		.iov_base = mem,
		.iov_len = size,
	};
	ssize_t cur, total, transferred;
	struct comm_pipes comm_pipes;
	char *old, *new;
	int ret, fds[2];
	char buf;

	old = malloc(size);
	new = malloc(size);
	if (!old || !new) {
		ksft_test_result_fail("malloc() failed\n");
		goto free;
	}

	memcpy(old, mem, size);

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		ksft_test_result_fail("pipe() failed\n");
		goto free;
	}

	if (pipe(fds) < 0) {
		ksft_test_result_fail("pipe() failed\n");
		goto close_comm_pipes;
	}

	if (before_fork) {
		transferred = vmsplice(fds[1], &iov, 1, 0);
		if (transferred <= 0) {
			ksft_test_result_fail("vmsplice() failed\n");
			goto close_pipe;
		}
	}

	ret = fork();
	if (ret < 0) {
		ksft_test_result_fail("fork() failed\n");
		goto close_pipe;
	} else if (!ret) {
		write(comm_pipes.child_ready[1], "0", 1);
		while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
			;
		/* Modify page content in the child. */
		memset(mem, 0xff, size);
		exit(0);
	}

	if (!before_fork) {
		transferred = vmsplice(fds[1], &iov, 1, 0);
		if (transferred <= 0) {
			ksft_test_result_fail("vmsplice() failed\n");
			wait(&ret);
			goto close_pipe;
		}
	}

	while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
		;
	if (munmap(mem, size) < 0) {
		ksft_test_result_fail("munmap() failed\n");
		goto close_pipe;
	}
	write(comm_pipes.parent_ready[1], "0", 1);

	/* Wait until the child is done writing. */
	wait(&ret);
	if (!WIFEXITED(ret)) {
		ksft_test_result_fail("wait() failed\n");
		goto close_pipe;
	}

	/* See if we still read the old values. */
	for (total = 0; total < transferred; total += cur) {
		cur = read(fds[0], new + total, transferred - total);
		if (cur < 0) {
			ksft_test_result_fail("read() failed\n");
			goto close_pipe;
		}
	}

	ksft_test_result(!memcmp(old, new, transferred),
			 "No leak from child into parent\n");
close_pipe:
	close(fds[0]);
	close(fds[1]);
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
free:
	free(old);
	free(new);
}

static void test_vmsplice_before_fork(char *mem, size_t size)
{
	do_test_vmsplice_in_parent(mem, size, true);
}

static void test_vmsplice_after_fork(char *mem, size_t size)
{
	do_test_vmsplice_in_parent(mem, size, false);
}

#ifdef LOCAL_CONFIG_HAVE_LIBURING
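/*
 * Register the memory as an io_uring fixed buffer (a longterm R/W pin) and
 * verify that modifications via the page table remain visible through the
 * pin, with and without a child sharing the pages.
 */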
static void do_test_iouring(char *mem, size_t size, bool use_fork)
{
	struct comm_pipes comm_pipes;
	struct io_uring_cqe *cqe;
	struct io_uring_sqe *sqe;
	struct io_uring ring;
	ssize_t cur, total;
	struct iovec iov;
	char *tmp, buf;
	int ret, fd;
	FILE *file;

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		ksft_test_result_fail("pipe() failed\n");
		return;
	}

	file = tmpfile();
	if (!file) {
		ksft_test_result_fail("tmpfile() failed\n");
		goto close_comm_pipes;
	}
	fd = fileno(file);
	assert(fd >= 0);

	tmp = malloc(size);
	if (!tmp) {
		ksft_test_result_fail("malloc() failed\n");
		goto close_file;
	}

	/* Skip on errors, as we might just lack kernel support. */
	ret = io_uring_queue_init(1, &ring, 0);
	if (ret < 0) {
		ksft_test_result_skip("io_uring_queue_init() failed\n");
		goto free_tmp;
	}

	/*
	 * Register the range as a fixed buffer. This will pin the range with
	 * FOLL_WRITE | FOLL_PIN | FOLL_LONGTERM.
	 *
	 * Skip on errors, as we might just lack kernel support or might not
	 * have sufficient MEMLOCK permissions.
	 */
	iov.iov_base = mem;
	iov.iov_len = size;
	ret = io_uring_register_buffers(&ring, &iov, 1);
	if (ret) {
		ksft_test_result_skip("io_uring_register_buffers() failed\n");
		goto queue_exit;
	}

	if (use_fork) {
		/*
		 * fork() and keep the child alive until we're done. Note that
		 * we expect the pinned page to not get shared with the child.
		 */
		ret = fork();
		if (ret < 0) {
			ksft_test_result_fail("fork() failed\n");
			goto unregister_buffers;
		} else if (!ret) {
			write(comm_pipes.child_ready[1], "0", 1);
			while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
				;
			exit(0);
		}

		while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
			;
	} else {
		/*
		 * Map the page R/O into the page table. Enable softdirty
		 * tracking to stop the page from getting mapped R/W immediately
		 * again by mprotect() optimizations. Note that we don't have an
		 * easy way to test if that worked (the pagemap does not export
		 * if the page is mapped R/O vs. R/W).
		 */
		ret = mprotect(mem, size, PROT_READ);
		clear_softdirty();
		ret |= mprotect(mem, size, PROT_READ | PROT_WRITE);
		if (ret) {
			ksft_test_result_fail("mprotect() failed\n");
			goto unregister_buffers;
		}
	}

	/*
	 * Modify the page and write page content as observed by the fixed
	 * buffer pin to the file so we can verify it.
	 */
	memset(mem, 0xff, size);
	sqe = io_uring_get_sqe(&ring);
	if (!sqe) {
		ksft_test_result_fail("io_uring_get_sqe() failed\n");
		goto quit_child;
	}
	io_uring_prep_write_fixed(sqe, fd, mem, size, 0, 0);

	ret = io_uring_submit(&ring);
	if (ret < 0) {
		ksft_test_result_fail("io_uring_submit() failed\n");
		goto quit_child;
	}

	ret = io_uring_wait_cqe(&ring, &cqe);
	if (ret < 0) {
		ksft_test_result_fail("io_uring_wait_cqe() failed\n");
		goto quit_child;
	}

	if (cqe->res != size) {
		ksft_test_result_fail("write_fixed failed\n");
		goto quit_child;
	}
	io_uring_cqe_seen(&ring, cqe);

	/* Read back the file content to the temporary buffer. */
	total = 0;
	while (total < size) {
		cur = pread(fd, tmp + total, size - total, total);
		if (cur < 0) {
			ksft_test_result_fail("pread() failed\n");
			goto quit_child;
		}
		total += cur;
	}

	/* Finally, check if we read what we expected. */
	ksft_test_result(!memcmp(mem, tmp, size),
			 "Longterm R/W pin is reliable\n");

quit_child:
	if (use_fork) {
		write(comm_pipes.parent_ready[1], "0", 1);
		wait(&ret);
	}
unregister_buffers:
	io_uring_unregister_buffers(&ring);
queue_exit:
	io_uring_queue_exit(&ring);
free_tmp:
	free(tmp);
close_file:
	fclose(file);
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
}

static void test_iouring_ro(char *mem, size_t size)
{
	do_test_iouring(mem, size, false);
}

static void test_iouring_fork(char *mem, size_t size)
{
	do_test_iouring(mem, size, true);
}

#endif /* LOCAL_CONFIG_HAVE_LIBURING */

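/*
 * The R/O pin tests use the kernel's gup_test debugfs interface
 * (PIN_LONGTERM_TEST_START/READ/STOP) to take a longterm R/O pin from
 * kernel space; they get skipped when /sys/kernel/debug/gup_test is
 * unavailable.
 */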
enum ro_pin_test {
	RO_PIN_TEST,
	RO_PIN_TEST_SHARED,
	RO_PIN_TEST_PREVIOUSLY_SHARED,
	RO_PIN_TEST_RO_EXCLUSIVE,
};

static void do_test_ro_pin(char *mem, size_t size, enum ro_pin_test test,
			   bool fast)
{
	struct pin_longterm_test args;
	struct comm_pipes comm_pipes;
	char *tmp, buf;
	__u64 tmp_val;
	int ret;

	if (gup_fd < 0) {
		ksft_test_result_skip("gup_test not available\n");
		return;
	}

	tmp = malloc(size);
	if (!tmp) {
		ksft_test_result_fail("malloc() failed\n");
		return;
	}

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		ksft_test_result_fail("pipe() failed\n");
		goto free_tmp;
	}

	switch (test) {
	case RO_PIN_TEST:
		break;
	case RO_PIN_TEST_SHARED:
	case RO_PIN_TEST_PREVIOUSLY_SHARED:
		/*
		 * Share the pages with our child. As the pages are not pinned,
		 * this should just work.
		 */
		ret = fork();
		if (ret < 0) {
			ksft_test_result_fail("fork() failed\n");
			goto close_comm_pipes;
		} else if (!ret) {
			write(comm_pipes.child_ready[1], "0", 1);
			while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
				;
			exit(0);
		}

		/* Wait until our child is ready. */
		while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
			;

		if (test == RO_PIN_TEST_PREVIOUSLY_SHARED) {
			/*
			 * Tell the child to quit now and wait until it quit.
			 * The pages should now be mapped R/O into our page
			 * tables, but they are no longer shared.
			 */
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			if (!WIFEXITED(ret))
				ksft_print_msg("[INFO] wait() failed\n");
		}
		break;
	case RO_PIN_TEST_RO_EXCLUSIVE:
		/*
		 * Map the page R/O into the page table. Enable softdirty
		 * tracking to stop the page from getting mapped R/W immediately
		 * again by mprotect() optimizations. Note that we don't have an
		 * easy way to test if that worked (the pagemap does not export
		 * if the page is mapped R/O vs. R/W).
		 */
		ret = mprotect(mem, size, PROT_READ);
		clear_softdirty();
		ret |= mprotect(mem, size, PROT_READ | PROT_WRITE);
		if (ret) {
			ksft_test_result_fail("mprotect() failed\n");
			goto close_comm_pipes;
		}
		break;
	default:
		assert(false);
	}

	/* Take a R/O pin. This should trigger unsharing. */
	args.addr = (__u64)(uintptr_t)mem;
	args.size = size;
	args.flags = fast ? PIN_LONGTERM_TEST_FLAG_USE_FAST : 0;
	ret = ioctl(gup_fd, PIN_LONGTERM_TEST_START, &args);
	if (ret) {
		if (errno == EINVAL)
			ksft_test_result_skip("PIN_LONGTERM_TEST_START failed\n");
		else
			ksft_test_result_fail("PIN_LONGTERM_TEST_START failed\n");
		goto wait;
	}

	/* Modify the page. */
	memset(mem, 0xff, size);

	/*
	 * Read back the content via the pin to the temporary buffer and
	 * test if we observed the modification.
	 */
	tmp_val = (__u64)(uintptr_t)tmp;
	ret = ioctl(gup_fd, PIN_LONGTERM_TEST_READ, &tmp_val);
	if (ret)
		ksft_test_result_fail("PIN_LONGTERM_TEST_READ failed\n");
	else
		ksft_test_result(!memcmp(mem, tmp, size),
				 "Longterm R/O pin is reliable\n");

	ret = ioctl(gup_fd, PIN_LONGTERM_TEST_STOP);
	if (ret)
		ksft_print_msg("[INFO] PIN_LONGTERM_TEST_STOP failed\n");
wait:
	switch (test) {
	case RO_PIN_TEST_SHARED:
		write(comm_pipes.parent_ready[1], "0", 1);
		wait(&ret);
		if (!WIFEXITED(ret))
			ksft_print_msg("[INFO] wait() failed\n");
		break;
	default:
		break;
	}
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
free_tmp:
	free(tmp);
}

static void test_ro_pin_on_shared(char *mem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, false);
}

static void test_ro_fast_pin_on_shared(char *mem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, true);
}

static void test_ro_pin_on_ro_previously_shared(char *mem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, false);
}

static void test_ro_fast_pin_on_ro_previously_shared(char *mem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, true);
}

static void test_ro_pin_on_ro_exclusive(char *mem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, false);
}

static void test_ro_fast_pin_on_ro_exclusive(char *mem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, true);
}

typedef void (*test_fn)(char *mem, size_t size);

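/*
 * Run the test on a single base page; with @swapout, the page is swapped
 * out via MADV_PAGEOUT first.
 */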
static void do_run_with_base_page(test_fn fn, bool swapout)
{
	char *mem;
	int ret;

	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		return;
	}

	ret = madvise(mem, pagesize, MADV_NOHUGEPAGE);
	/* Ignore EINVAL: MADV_NOHUGEPAGE might not be supported by this kernel. */
	if (ret && errno != EINVAL) {
		ksft_test_result_fail("MADV_NOHUGEPAGE failed\n");
		goto munmap;
	}

	/* Populate a base page. */
	memset(mem, 0, pagesize);

	if (swapout) {
		madvise(mem, pagesize, MADV_PAGEOUT);
		if (!pagemap_is_swapped(pagemap_fd, mem)) {
			ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n");
			goto munmap;
		}
	}

	fn(mem, pagesize);
munmap:
	munmap(mem, pagesize);
}

static void run_with_base_page(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with base page\n", desc);
	do_run_with_base_page(fn, false);
}

static void run_with_base_page_swap(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with swapped out base page\n", desc);
	do_run_with_base_page(fn, true);
}

enum thp_run {
	THP_RUN_PMD,
	THP_RUN_PMD_SWAPOUT,
	THP_RUN_PTE,
	THP_RUN_PTE_SWAPOUT,
	THP_RUN_SINGLE_PTE,
	THP_RUN_SINGLE_PTE_SWAPOUT,
	THP_RUN_PARTIAL_MREMAP,
	THP_RUN_PARTIAL_SHARED,
};

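/*
 * Populate a PMD-mapped THP and transform it according to @thp_run
 * (PTE-map it, swap it out, mremap() or COW-share parts of it) before
 * running the actual test on the resulting range.
 */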
static void do_run_with_thp(test_fn fn, enum thp_run thp_run)
{
	char *mem, *mmap_mem, *tmp, *mremap_mem = MAP_FAILED;
	size_t size, mmap_size, mremap_size;
	int ret;

	/* For alignment purposes, we need twice the thp size. */
	mmap_size = 2 * thpsize;
	mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mmap_mem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		return;
	}

	/* We need a THP-aligned memory area. */
	mem = (char *)(((uintptr_t)mmap_mem + thpsize) & ~(thpsize - 1));

	ret = madvise(mem, thpsize, MADV_HUGEPAGE);
	if (ret) {
		ksft_test_result_fail("MADV_HUGEPAGE failed\n");
		goto munmap;
	}

	/*
	 * Try to populate a THP. Touch the first sub-page and test if we get
	 * another sub-page populated automatically.
	 */
	mem[0] = 0;
	if (!pagemap_is_populated(pagemap_fd, mem + pagesize)) {
		ksft_test_result_skip("Did not get a THP populated\n");
		goto munmap;
	}
	memset(mem, 0, thpsize);

	size = thpsize;
	switch (thp_run) {
	case THP_RUN_PMD:
	case THP_RUN_PMD_SWAPOUT:
		break;
	case THP_RUN_PTE:
	case THP_RUN_PTE_SWAPOUT:
		/*
		 * Trigger PTE-mapping the THP by temporarily mapping a single
		 * subpage R/O.
		 */
		ret = mprotect(mem + pagesize, pagesize, PROT_READ);
		if (ret) {
			ksft_test_result_fail("mprotect() failed\n");
			goto munmap;
		}
		ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE);
		if (ret) {
			ksft_test_result_fail("mprotect() failed\n");
			goto munmap;
		}
		break;
	case THP_RUN_SINGLE_PTE:
	case THP_RUN_SINGLE_PTE_SWAPOUT:
		/*
		 * Discard all but a single subpage of that PTE-mapped THP. What
		 * remains is a single PTE mapping a single subpage.
		 */
		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTNEED);
		if (ret) {
			ksft_test_result_fail("MADV_DONTNEED failed\n");
			goto munmap;
		}
		size = pagesize;
		break;
	case THP_RUN_PARTIAL_MREMAP:
		/*
		 * Remap half of the THP. We need some new memory location
		 * for that.
		 */
		mremap_size = thpsize / 2;
		mremap_mem = mmap(NULL, mremap_size, PROT_NONE,
				  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (mremap_mem == MAP_FAILED) {
			ksft_test_result_fail("mmap() failed\n");
			goto munmap;
		}
		tmp = mremap(mem + mremap_size, mremap_size, mremap_size,
			     MREMAP_MAYMOVE | MREMAP_FIXED, mremap_mem);
		if (tmp != mremap_mem) {
			ksft_test_result_fail("mremap() failed\n");
			goto munmap;
		}
		size = mremap_size;
		break;
	case THP_RUN_PARTIAL_SHARED:
		/*
		 * Share the first page of the THP with a child and quit the
		 * child. This will result in some parts of the THP never
		 * having been shared.
		 */
		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTFORK);
		if (ret) {
			ksft_test_result_fail("MADV_DONTFORK failed\n");
			goto munmap;
		}
		ret = fork();
		if (ret < 0) {
			ksft_test_result_fail("fork() failed\n");
			goto munmap;
		} else if (!ret) {
			exit(0);
		}
		wait(&ret);
		/* Allow for sharing all pages again. */
		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DOFORK);
		if (ret) {
			ksft_test_result_fail("MADV_DOFORK failed\n");
			goto munmap;
		}
		break;
	default:
		assert(false);
	}

	switch (thp_run) {
	case THP_RUN_PMD_SWAPOUT:
	case THP_RUN_PTE_SWAPOUT:
	case THP_RUN_SINGLE_PTE_SWAPOUT:
		madvise(mem, size, MADV_PAGEOUT);
		if (!range_is_swapped(mem, size)) {
			ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n");
			goto munmap;
		}
		break;
	default:
		break;
	}

	fn(mem, size);
munmap:
	munmap(mmap_mem, mmap_size);
	if (mremap_mem != MAP_FAILED)
		munmap(mremap_mem, mremap_size);
}

static void run_with_thp(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with THP\n", desc);
	do_run_with_thp(fn, THP_RUN_PMD);
}

static void run_with_thp_swap(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with swapped-out THP\n", desc);
	do_run_with_thp(fn, THP_RUN_PMD_SWAPOUT);
}

static void run_with_pte_mapped_thp(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with PTE-mapped THP\n", desc);
	do_run_with_thp(fn, THP_RUN_PTE);
}

static void run_with_pte_mapped_thp_swap(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with swapped-out, PTE-mapped THP\n", desc);
	do_run_with_thp(fn, THP_RUN_PTE_SWAPOUT);
}

static void run_with_single_pte_of_thp(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with single PTE of THP\n", desc);
	do_run_with_thp(fn, THP_RUN_SINGLE_PTE);
}

static void run_with_single_pte_of_thp_swap(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with single PTE of swapped-out THP\n", desc);
	do_run_with_thp(fn, THP_RUN_SINGLE_PTE_SWAPOUT);
}

static void run_with_partial_mremap_thp(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with partially mremap()'ed THP\n", desc);
	do_run_with_thp(fn, THP_RUN_PARTIAL_MREMAP);
}

static void run_with_partial_shared_thp(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with partially shared THP\n", desc);
	do_run_with_thp(fn, THP_RUN_PARTIAL_SHARED);
}

static void run_with_hugetlb(test_fn fn, const char *desc, size_t hugetlbsize)
{
	int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB;
	char *mem, *dummy;

	ksft_print_msg("[RUN] %s ... with hugetlb (%zu kB)\n", desc,
		       hugetlbsize / 1024);

	flags |= __builtin_ctzll(hugetlbsize) << MAP_HUGE_SHIFT;

	mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0);
	if (mem == MAP_FAILED) {
		ksft_test_result_skip("need more free huge pages\n");
		return;
	}

	/* Populate a huge page. */
	memset(mem, 0, hugetlbsize);

	/*
	 * We need a total of two hugetlb pages to handle COW/unsharing
	 * properly, otherwise we might get killed by SIGBUS.
	 */
	dummy = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0);
	if (dummy == MAP_FAILED) {
		ksft_test_result_skip("need more free huge pages\n");
		goto munmap;
	}
	munmap(dummy, hugetlbsize);

	fn(mem, hugetlbsize);
munmap:
	munmap(mem, hugetlbsize);
}

struct test_case {
	const char *desc;
	test_fn fn;
};

/*
 * Test cases that are specific to anonymous pages: pages in private mappings
 * that may get shared via COW during fork().
 */
static const struct test_case anon_test_cases[] = {
	/*
	 * Basic COW tests for fork() without any GUP. If we fail to break COW,
	 * either the child can observe modifications by the parent or the
	 * other way around.
	 */
	{
		"Basic COW after fork()",
		test_cow_in_parent,
	},
	/*
	 * Basic test, but do an additional mprotect(PROT_READ)+
	 * mprotect(PROT_READ|PROT_WRITE) in the parent before write access.
	 */
	{
		"Basic COW after fork() with mprotect() optimization",
		test_cow_in_parent_mprotect,
	},
	/*
	 * vmsplice() [R/O GUP] + unmap in the child; modify in the parent. If
	 * we fail to break COW, the child observes modifications by the parent.
	 * This is CVE-2020-29374 reported by Jann Horn.
	 */
	{
		"vmsplice() + unmap in child",
		test_vmsplice_in_child
	},
	/*
	 * vmsplice() test, but do an additional mprotect(PROT_READ)+
	 * mprotect(PROT_READ|PROT_WRITE) in the parent before write access.
	 */
	{
		"vmsplice() + unmap in child with mprotect() optimization",
		test_vmsplice_in_child_mprotect
	},
	/*
	 * vmsplice() [R/O GUP] in parent before fork(), unmap in parent after
	 * fork(); modify in the child. If we fail to break COW, the parent
	 * observes modifications by the child.
	 */
	{
		"vmsplice() before fork(), unmap in parent after fork()",
		test_vmsplice_before_fork,
	},
	/*
	 * vmsplice() [R/O GUP] + unmap in parent after fork(); modify in the
	 * child. If we fail to break COW, the parent observes modifications by
	 * the child.
	 */
	{
		"vmsplice() + unmap in parent after fork()",
		test_vmsplice_after_fork,
	},
#ifdef LOCAL_CONFIG_HAVE_LIBURING
	/*
	 * Take a R/W longterm pin and then map the page R/O into the page
	 * table to trigger a write fault on next access. When modifying the
	 * page, the page content must be visible via the pin.
	 */
	{
		"R/O-mapping a page registered as iouring fixed buffer",
		test_iouring_ro,
	},
	/*
	 * Take a R/W longterm pin and then fork() a child. When modifying the
	 * page, the page content must be visible via the pin. We expect the
	 * pinned page to not get shared with the child.
	 */
	{
		"fork() with an iouring fixed buffer",
		test_iouring_fork,
	},

#endif /* LOCAL_CONFIG_HAVE_LIBURING */
	/*
	 * Take a R/O longterm pin on a R/O-mapped shared anonymous page.
	 * When modifying the page via the page table, the page content change
	 * must be visible via the pin.
	 */
	{
		"R/O GUP pin on R/O-mapped shared page",
		test_ro_pin_on_shared,
	},
	/* Same as above, but using GUP-fast. */
	{
		"R/O GUP-fast pin on R/O-mapped shared page",
		test_ro_fast_pin_on_shared,
	},
	/*
	 * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page that
	 * was previously shared. When modifying the page via the page table,
	 * the page content change must be visible via the pin.
	 */
	{
		"R/O GUP pin on R/O-mapped previously-shared page",
		test_ro_pin_on_ro_previously_shared,
	},
	/* Same as above, but using GUP-fast. */
	{
		"R/O GUP-fast pin on R/O-mapped previously-shared page",
		test_ro_fast_pin_on_ro_previously_shared,
	},
	/*
	 * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page.
	 * When modifying the page via the page table, the page content change
	 * must be visible via the pin.
	 */
	{
		"R/O GUP pin on R/O-mapped exclusive page",
		test_ro_pin_on_ro_exclusive,
	},
	/* Same as above, but using GUP-fast. */
	{
		"R/O GUP-fast pin on R/O-mapped exclusive page",
		test_ro_fast_pin_on_ro_exclusive,
	},
};

static void run_anon_test_case(struct test_case const *test_case)
{
	int i;

	run_with_base_page(test_case->fn, test_case->desc);
	run_with_base_page_swap(test_case->fn, test_case->desc);
	if (thpsize) {
		run_with_thp(test_case->fn, test_case->desc);
		run_with_thp_swap(test_case->fn, test_case->desc);
		run_with_pte_mapped_thp(test_case->fn, test_case->desc);
		run_with_pte_mapped_thp_swap(test_case->fn, test_case->desc);
		run_with_single_pte_of_thp(test_case->fn, test_case->desc);
		run_with_single_pte_of_thp_swap(test_case->fn, test_case->desc);
		run_with_partial_mremap_thp(test_case->fn, test_case->desc);
		run_with_partial_shared_thp(test_case->fn, test_case->desc);
	}
	for (i = 0; i < nr_hugetlbsizes; i++)
		run_with_hugetlb(test_case->fn, test_case->desc,
				 hugetlbsizes[i]);
}

static void run_anon_test_cases(void)
{
	int i;

	ksft_print_msg("[INFO] Anonymous memory tests in private mappings\n");

	for (i = 0; i < ARRAY_SIZE(anon_test_cases); i++)
		run_anon_test_case(&anon_test_cases[i]);
}

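/*
 * Number of ksft results each anon test case produces: base page + swapped
 * base page, the 8 THP variants if THPs are available, and one run per
 * detected hugetlb size.
 */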
static int tests_per_anon_test_case(void)
{
	int tests = 2 + nr_hugetlbsizes;

	if (thpsize)
		tests += 8;
	return tests;
}

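/*
 * MADV_COLLAPSE tests: PTE-map a THP, optionally COW-share (parts of) it
 * with a child, collapse it back into a PMD mapping, and verify that COW
 * still works as expected afterwards.
 */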
enum anon_thp_collapse_test {
	ANON_THP_COLLAPSE_UNSHARED,
	ANON_THP_COLLAPSE_FULLY_SHARED,
	ANON_THP_COLLAPSE_LOWER_SHARED,
	ANON_THP_COLLAPSE_UPPER_SHARED,
};

static void do_test_anon_thp_collapse(char *mem, size_t size,
				      enum anon_thp_collapse_test test)
{
	struct comm_pipes comm_pipes;
	char buf;
	int ret;

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		ksft_test_result_fail("pipe() failed\n");
		return;
	}

	/*
	 * Trigger PTE-mapping the THP by temporarily mapping a single subpage
	 * R/O, such that we can try collapsing it later.
	 */
	ret = mprotect(mem + pagesize, pagesize, PROT_READ);
	if (ret) {
		ksft_test_result_fail("mprotect() failed\n");
		goto close_comm_pipes;
	}
	ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE);
	if (ret) {
		ksft_test_result_fail("mprotect() failed\n");
		goto close_comm_pipes;
	}

	switch (test) {
	case ANON_THP_COLLAPSE_UNSHARED:
		/* Collapse before actually COW-sharing the page. */
		ret = madvise(mem, size, MADV_COLLAPSE);
		if (ret) {
			ksft_test_result_skip("MADV_COLLAPSE failed: %s\n",
					      strerror(errno));
			goto close_comm_pipes;
		}
		break;
	case ANON_THP_COLLAPSE_FULLY_SHARED:
		/* COW-share the full PTE-mapped THP. */
		break;
	case ANON_THP_COLLAPSE_LOWER_SHARED:
		/* Don't COW-share the upper part of the THP. */
		ret = madvise(mem + size / 2, size / 2, MADV_DONTFORK);
		if (ret) {
			ksft_test_result_fail("MADV_DONTFORK failed\n");
			goto close_comm_pipes;
		}
		break;
	case ANON_THP_COLLAPSE_UPPER_SHARED:
		/* Don't COW-share the lower part of the THP. */
		ret = madvise(mem, size / 2, MADV_DONTFORK);
		if (ret) {
			ksft_test_result_fail("MADV_DONTFORK failed\n");
			goto close_comm_pipes;
		}
		break;
	default:
		assert(false);
	}

	ret = fork();
	if (ret < 0) {
		ksft_test_result_fail("fork() failed\n");
		goto close_comm_pipes;
	} else if (!ret) {
		switch (test) {
		case ANON_THP_COLLAPSE_UNSHARED:
		case ANON_THP_COLLAPSE_FULLY_SHARED:
			exit(child_memcmp_fn(mem, size, &comm_pipes));
			break;
		case ANON_THP_COLLAPSE_LOWER_SHARED:
			exit(child_memcmp_fn(mem, size / 2, &comm_pipes));
			break;
		case ANON_THP_COLLAPSE_UPPER_SHARED:
			exit(child_memcmp_fn(mem + size / 2, size / 2,
					     &comm_pipes));
			break;
		default:
			assert(false);
		}
	}

	while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
		;

	switch (test) {
	case ANON_THP_COLLAPSE_UNSHARED:
		break;
	case ANON_THP_COLLAPSE_UPPER_SHARED:
	case ANON_THP_COLLAPSE_LOWER_SHARED:
		/*
		 * Revert MADV_DONTFORK such that we merge the VMAs and are
		 * able to actually collapse.
		 */
		ret = madvise(mem, size, MADV_DOFORK);
		if (ret) {
			ksft_test_result_fail("MADV_DOFORK failed\n");
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			goto close_comm_pipes;
		}
		/* FALLTHROUGH */
	case ANON_THP_COLLAPSE_FULLY_SHARED:
		/* Collapse before anyone modified the COW-shared page. */
		ret = madvise(mem, size, MADV_COLLAPSE);
		if (ret) {
			ksft_test_result_skip("MADV_COLLAPSE failed: %s\n",
					      strerror(errno));
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			goto close_comm_pipes;
		}
		break;
	default:
		assert(false);
	}

	/* Modify the page. */
	memset(mem, 0xff, size);
	write(comm_pipes.parent_ready[1], "0", 1);

	wait(&ret);
	if (WIFEXITED(ret))
		ret = WEXITSTATUS(ret);
	else
		ret = -EINVAL;

	ksft_test_result(!ret, "No leak from parent into child\n");
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
}

static void test_anon_thp_collapse_unshared(char *mem, size_t size)
{
	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UNSHARED);
}

static void test_anon_thp_collapse_fully_shared(char *mem, size_t size)
{
	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_FULLY_SHARED);
}

static void test_anon_thp_collapse_lower_shared(char *mem, size_t size)
{
	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_LOWER_SHARED);
}

static void test_anon_thp_collapse_upper_shared(char *mem, size_t size)
{
	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UPPER_SHARED);
}

/*
 * Test cases that are specific to anonymous THP: pages in private mappings
 * that may get shared via COW during fork().
 */
static const struct test_case anon_thp_test_cases[] = {
	/*
	 * Basic COW test for fork() without any GUP when collapsing a THP
	 * before fork().
	 *
	 * Re-mapping a PTE-mapped anon THP using a single PMD ("in-place
	 * collapse") might easily get COW handling wrong when not collapsing
	 * exclusivity information properly.
	 */
	{
		"Basic COW after fork() when collapsing before fork()",
		test_anon_thp_collapse_unshared,
	},
	/* Basic COW test, but collapse after COW-sharing a full THP. */
	{
		"Basic COW after fork() when collapsing after fork() (fully shared)",
		test_anon_thp_collapse_fully_shared,
	},
	/*
	 * Basic COW test, but collapse after COW-sharing the lower half of a
	 * THP.
	 */
	{
		"Basic COW after fork() when collapsing after fork() (lower shared)",
		test_anon_thp_collapse_lower_shared,
	},
	/*
	 * Basic COW test, but collapse after COW-sharing the upper half of a
	 * THP.
	 */
	{
		"Basic COW after fork() when collapsing after fork() (upper shared)",
		test_anon_thp_collapse_upper_shared,
	},
};

static void run_anon_thp_test_cases(void)
{
	int i;

	if (!thpsize)
		return;

	ksft_print_msg("[INFO] Anonymous THP tests\n");

	for (i = 0; i < ARRAY_SIZE(anon_thp_test_cases); i++) {
		struct test_case const *test_case = &anon_thp_test_cases[i];

		ksft_print_msg("[RUN] %s\n", test_case->desc);
		do_run_with_thp(test_case->fn, THP_RUN_PMD);
	}
}

static int tests_per_anon_thp_test_case(void)
{
	return thpsize ? 1 : 0;
}

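/*
 * Non-anon tests operate on two mappings of the same page: @mem is a
 * private mapping that gets modified and @smem is a second mapping used to
 * observe whether the original content stays intact.
 */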
typedef void (*non_anon_test_fn)(char *mem, const char *smem, size_t size);

static void test_cow(char *mem, const char *smem, size_t size)
{
	char *old = malloc(size);

	if (!old) {
		ksft_test_result_fail("malloc() failed\n");
		return;
	}

	/* Backup the original content. */
	memcpy(old, smem, size);

	/* Modify the page. */
	memset(mem, 0xff, size);

	/* See if we still read the old values via the other mapping. */
	ksft_test_result(!memcmp(smem, old, size),
			 "Other mapping not modified\n");
	free(old);
}

static void test_ro_pin(char *mem, const char *smem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST, false);
}

static void test_ro_fast_pin(char *mem, const char *smem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST, true);
}

static void run_with_zeropage(non_anon_test_fn fn, const char *desc)
{
	char *mem, *smem, tmp;

	ksft_print_msg("[RUN] %s ... with shared zeropage\n", desc);

	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANON, -1, 0);
	if (mem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		return;
	}

	smem = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANON, -1, 0);
	if (smem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto munmap;
	}

	/* Read from the page to populate the shared zeropage. */
	tmp = *mem + *smem;
	asm volatile("" : "+r" (tmp));

	fn(mem, smem, pagesize);
munmap:
	munmap(mem, pagesize);
	if (smem != MAP_FAILED)
		munmap(smem, pagesize);
}

static void run_with_huge_zeropage(non_anon_test_fn fn, const char *desc)
{
	char *mem, *smem, *mmap_mem, *mmap_smem, tmp;
	size_t mmap_size;
	int ret;

	ksft_print_msg("[RUN] %s ... with huge zeropage\n", desc);

	if (!has_huge_zeropage) {
		ksft_test_result_skip("Huge zeropage not enabled\n");
		return;
	}

	/* For alignment purposes, we need twice the thp size. */
	mmap_size = 2 * thpsize;
	mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mmap_mem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		return;
	}
	mmap_smem = mmap(NULL, mmap_size, PROT_READ,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mmap_smem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto munmap;
	}

	/* We need a THP-aligned memory area. */
	mem = (char *)(((uintptr_t)mmap_mem + thpsize) & ~(thpsize - 1));
	smem = (char *)(((uintptr_t)mmap_smem + thpsize) & ~(thpsize - 1));

	ret = madvise(mem, thpsize, MADV_HUGEPAGE);
	ret |= madvise(smem, thpsize, MADV_HUGEPAGE);
	if (ret) {
		ksft_test_result_fail("MADV_HUGEPAGE failed\n");
		goto munmap;
	}

	/*
	 * Read from the memory to populate the huge shared zeropage. Read from
	 * the first sub-page and test if we get another sub-page populated
	 * automatically.
	 */
	tmp = *mem + *smem;
	asm volatile("" : "+r" (tmp));
	if (!pagemap_is_populated(pagemap_fd, mem + pagesize) ||
	    !pagemap_is_populated(pagemap_fd, smem + pagesize)) {
		ksft_test_result_skip("Did not get THPs populated\n");
		goto munmap;
	}

	fn(mem, smem, thpsize);
munmap:
	munmap(mmap_mem, mmap_size);
	if (mmap_smem != MAP_FAILED)
		munmap(mmap_smem, mmap_size);
}

static void run_with_memfd(non_anon_test_fn fn, const char *desc)
{
	char *mem, *smem, tmp;
	int fd;

	ksft_print_msg("[RUN] %s ... with memfd\n", desc);

	fd = memfd_create("test", 0);
	if (fd < 0) {
		ksft_test_result_fail("memfd_create() failed\n");
		return;
	}

	/* File consists of a single page filled with zeroes. */
	if (fallocate(fd, 0, 0, pagesize)) {
		ksft_test_result_fail("fallocate() failed\n");
		goto close;
	}

	/* Create a private mapping of the memfd. */
	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
	if (mem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto close;
	}
	smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0);
	if (smem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto munmap;
	}

	/* Fault the page in. */
	tmp = *mem + *smem;
	asm volatile("" : "+r" (tmp));

	fn(mem, smem, pagesize);
munmap:
	munmap(mem, pagesize);
	if (smem != MAP_FAILED)
		munmap(smem, pagesize);
close:
	close(fd);
}

static void run_with_tmpfile(non_anon_test_fn fn, const char *desc)
{
	char *mem, *smem, tmp;
	FILE *file;
	int fd;

	ksft_print_msg("[RUN] %s ... with tmpfile\n", desc);

	file = tmpfile();
	if (!file) {
		ksft_test_result_fail("tmpfile() failed\n");
		return;
	}

	fd = fileno(file);
	if (fd < 0) {
		ksft_test_result_skip("fileno() failed\n");
		goto close;
	}

	/* File consists of a single page filled with zeroes. */
	if (fallocate(fd, 0, 0, pagesize)) {
		ksft_test_result_fail("fallocate() failed\n");
		goto close;
	}

	/* Create a private mapping of the tmpfile. */
	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
	if (mem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto close;
	}
	smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0);
	if (smem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto munmap;
	}

	/* Fault the page in. */
	tmp = *mem + *smem;
	asm volatile("" : "+r" (tmp));

	fn(mem, smem, pagesize);
munmap:
	munmap(mem, pagesize);
	if (smem != MAP_FAILED)
		munmap(smem, pagesize);
close:
	fclose(file);
}

static void run_with_memfd_hugetlb(non_anon_test_fn fn, const char *desc,
				   size_t hugetlbsize)
{
	int flags = MFD_HUGETLB;
	char *mem, *smem, tmp;
	int fd;

	ksft_print_msg("[RUN] %s ... with memfd hugetlb (%zu kB)\n", desc,
		       hugetlbsize / 1024);

	flags |= __builtin_ctzll(hugetlbsize) << MFD_HUGE_SHIFT;

	fd = memfd_create("test", flags);
	if (fd < 0) {
		ksft_test_result_skip("memfd_create() failed\n");
		return;
	}

	/* File consists of a single huge page filled with zeroes. */
	if (fallocate(fd, 0, 0, hugetlbsize)) {
		ksft_test_result_skip("need more free huge pages\n");
		goto close;
	}

	/* Create a private mapping of the memfd. */
	mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd,
		   0);
	if (mem == MAP_FAILED) {
		ksft_test_result_skip("need more free huge pages\n");
		goto close;
	}
	smem = mmap(NULL, hugetlbsize, PROT_READ, MAP_SHARED, fd, 0);
	if (smem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto munmap;
	}

	/* Fault the page in. */
	tmp = *mem + *smem;
	asm volatile("" : "+r" (tmp));

	fn(mem, smem, hugetlbsize);
munmap:
	munmap(mem, hugetlbsize);
	if (smem != MAP_FAILED)
		munmap(smem, hugetlbsize);
close:
	close(fd);
}

struct non_anon_test_case {
	const char *desc;
	non_anon_test_fn fn;
};

/*
 * Test cases that target any pages in private mappings that are not anonymous:
 * pages that may get shared via COW independent of fork(). This includes
 * the shared zeropage(s), pagecache pages, ...
 */
static const struct non_anon_test_case non_anon_test_cases[] = {
	/*
	 * Basic COW test without any GUP. If we fail to break COW, changes are
	 * visible via other private/shared mappings.
	 */
	{
		"Basic COW",
		test_cow,
	},
	/*
	 * Take a R/O longterm pin. When modifying the page via the page table,
	 * the page content change must be visible via the pin.
	 */
	{
		"R/O longterm GUP pin",
		test_ro_pin,
	},
	/* Same as above, but using GUP-fast. */
	{
		"R/O longterm GUP-fast pin",
		test_ro_fast_pin,
	},
};

static void run_non_anon_test_case(struct non_anon_test_case const *test_case)
{
	int i;

	run_with_zeropage(test_case->fn, test_case->desc);
	run_with_memfd(test_case->fn, test_case->desc);
	run_with_tmpfile(test_case->fn, test_case->desc);
	if (thpsize)
		run_with_huge_zeropage(test_case->fn, test_case->desc);
	for (i = 0; i < nr_hugetlbsizes; i++)
		run_with_memfd_hugetlb(test_case->fn, test_case->desc,
				       hugetlbsizes[i]);
}

static void run_non_anon_test_cases(void)
{
	int i;

	ksft_print_msg("[INFO] Non-anonymous memory tests in private mappings\n");

	for (i = 0; i < ARRAY_SIZE(non_anon_test_cases); i++)
		run_non_anon_test_case(&non_anon_test_cases[i]);
}

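/*
 * Number of ksft results per non-anon test case: zeropage, memfd and
 * tmpfile, plus the huge zeropage (if THPs are available) and one run per
 * detected hugetlb size.
 */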
static int tests_per_non_anon_test_case(void)
{
	int tests = 3 + nr_hugetlbsizes;

	if (thpsize)
		tests += 1;
	return tests;
}

int main(int argc, char **argv)
{
	int err;

	ksft_print_header();

	pagesize = getpagesize();
	thpsize = read_pmd_pagesize();
	if (thpsize)
		ksft_print_msg("[INFO] detected THP size: %zu KiB\n",
			       thpsize / 1024);
	nr_hugetlbsizes = detect_hugetlb_page_sizes(hugetlbsizes,
						    ARRAY_SIZE(hugetlbsizes));
	detect_huge_zeropage();

	ksft_set_plan(ARRAY_SIZE(anon_test_cases) * tests_per_anon_test_case() +
		      ARRAY_SIZE(anon_thp_test_cases) * tests_per_anon_thp_test_case() +
		      ARRAY_SIZE(non_anon_test_cases) * tests_per_non_anon_test_case());

	gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR);
	pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
	if (pagemap_fd < 0)
		ksft_exit_fail_msg("opening pagemap failed\n");

	run_anon_test_cases();
	run_anon_thp_test_cases();
	run_non_anon_test_cases();

	err = ksft_get_fail_cnt();
	if (err)
		ksft_exit_fail_msg("%d out of %d tests failed\n",
				   err, ksft_test_num());
	return ksft_exit_pass();
}