// SPDX-License-Identifier: GPL-2.0-only
/*
 * Userfaultfd tests util functions
 *
 * Copyright (C) 2015-2023  Red Hat, Inc.
 */

#include "uffd-common.h"

#define BASE_PMD_ADDR ((void *)(1UL << 30))

volatile bool test_uffdio_copy_eexist = true;
unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size;
char *area_src, *area_src_alias, *area_dst, *area_dst_alias, *area_remap;
int uffd = -1, uffd_flags, finished, *pipefd, test_type;
bool map_shared;
bool test_uffdio_wp = true;
unsigned long long *count_verify;
uffd_test_ops_t *uffd_test_ops;

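/*
 * Create the memfd backing a test area.  Punching a hole over the whole
 * file keeps it fully sparse, so every page starts out unallocated and
 * the first access is guaranteed to fault.
 */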
static int uffd_mem_fd_create(off_t mem_size, bool hugetlb)
{
	unsigned int memfd_flags = 0;
	int mem_fd;

	if (hugetlb)
		memfd_flags = MFD_HUGETLB;
	mem_fd = memfd_create("uffd-test", memfd_flags);
	if (mem_fd < 0)
		err("memfd_create");
	if (ftruncate(mem_fd, mem_size))
		err("ftruncate");
	if (fallocate(mem_fd,
		      FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0,
		      mem_size))
		err("fallocate");

	return mem_fd;
}

static void anon_release_pages(char *rel_area)
{
	if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED))
		err("madvise(MADV_DONTNEED) failed");
}

static int anon_allocate_area(void **alloc_area, bool is_src)
{
	*alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
			   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
	if (*alloc_area == MAP_FAILED) {
		*alloc_area = NULL;
		return -errno;
	}
	return 0;
}

static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset)
{
}

static void hugetlb_release_pages(char *rel_area)
{
	if (!map_shared) {
		if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED))
			err("madvise(MADV_DONTNEED) failed");
	} else {
		if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE))
			err("madvise(MADV_REMOVE) failed");
	}
}

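/*
 * Allocate a hugetlb test area from a double-sized memfd: the src area
 * maps offset 0 and the dst area maps offset "size", so both can share
 * one file.  For shared mappings, also keep an alias mapping around so
 * the test can reach the same pages through a second address range.
 */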
static int hugetlb_allocate_area(void **alloc_area, bool is_src)
{
	off_t size = nr_pages * page_size;
	off_t offset = is_src ? 0 : size;
	void *area_alias = NULL;
	char **alloc_area_alias;
	int mem_fd = uffd_mem_fd_create(size * 2, true);

	*alloc_area = mmap(NULL, size, PROT_READ | PROT_WRITE,
			   (map_shared ? MAP_SHARED : MAP_PRIVATE) |
			   (is_src ? 0 : MAP_NORESERVE),
			   mem_fd, offset);
	if (*alloc_area == MAP_FAILED) {
		*alloc_area = NULL;
		return -errno;
	}

	if (map_shared) {
		area_alias = mmap(NULL, size, PROT_READ | PROT_WRITE,
				  MAP_SHARED, mem_fd, offset);
		if (area_alias == MAP_FAILED)
			return -errno;
	}

	if (is_src) {
		alloc_area_alias = &area_src_alias;
	} else {
		alloc_area_alias = &area_dst_alias;
	}
	if (area_alias)
		*alloc_area_alias = area_alias;

	close(mem_fd);
	return 0;
}

static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset)
{
	if (!map_shared)
		return;

	*start = (unsigned long) area_dst_alias + offset;
}

static void shmem_release_pages(char *rel_area)
{
	if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE))
		err("madvise(MADV_REMOVE) failed");
}

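/*
 * Allocate a shmem test area at a fixed virtual address.  The layout is
 * src area, src alias, dst area, dst alias, with one huge page of gap
 * between neighbours so that the VMAs cannot merge and a THP cannot
 * straddle two areas.
 */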
static int shmem_allocate_area(void **alloc_area, bool is_src)
{
	void *area_alias = NULL;
	size_t bytes = nr_pages * page_size, hpage_size = read_pmd_pagesize();
	unsigned long offset = is_src ? 0 : bytes;
	char *p = NULL, *p_alias = NULL;
	int mem_fd = uffd_mem_fd_create(bytes * 2, false);

	/* TODO: clean this up.  Using a static addr is ugly */
	p = BASE_PMD_ADDR;
	if (!is_src)
		/* src map + alias + interleaved hpages */
		p += 2 * (bytes + hpage_size);
	p_alias = p;
	p_alias += bytes;
	p_alias += hpage_size;  /* Prevent src/dst VMA merge */

	*alloc_area = mmap(p, bytes, PROT_READ | PROT_WRITE, MAP_SHARED,
			   mem_fd, offset);
	if (*alloc_area == MAP_FAILED) {
		*alloc_area = NULL;
		return -errno;
	}
	if (*alloc_area != p)
		err("mmap of memfd failed at %p", p);

	area_alias = mmap(p_alias, bytes, PROT_READ | PROT_WRITE, MAP_SHARED,
			  mem_fd, offset);
	if (area_alias == MAP_FAILED) {
		munmap(*alloc_area, bytes);
		*alloc_area = NULL;
		return -errno;
	}
	if (area_alias != p_alias)
		err("mmap of memfd alias failed at %p", p_alias);

	if (is_src)
		area_src_alias = area_alias;
	else
		area_dst_alias = area_alias;

	close(mem_fd);
	return 0;
}

static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset)
{
	*start = (unsigned long)area_dst_alias + offset;
}

static void shmem_check_pmd_mapping(void *p, int expect_nr_hpages)
{
	if (!check_huge_shmem(area_dst_alias, expect_nr_hpages,
			      read_pmd_pagesize()))
		err("Did not find expected %d hugepages",
		    expect_nr_hpages);
}

struct uffd_test_ops anon_uffd_test_ops = {
	.allocate_area = anon_allocate_area,
	.release_pages = anon_release_pages,
	.alias_mapping = noop_alias_mapping,
	.check_pmd_mapping = NULL,
};

struct uffd_test_ops shmem_uffd_test_ops = {
	.allocate_area = shmem_allocate_area,
	.release_pages = shmem_release_pages,
	.alias_mapping = shmem_alias_mapping,
	.check_pmd_mapping = shmem_check_pmd_mapping,
};

struct uffd_test_ops hugetlb_uffd_test_ops = {
	.allocate_area = hugetlb_allocate_area,
	.release_pages = hugetlb_release_pages,
	.alias_mapping = hugetlb_alias_mapping,
	.check_pmd_mapping = NULL,
};

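/*
 * Print the per-cpu fault counters collected by the uffd threads, e.g.
 * (sample output only):
 *
 *   userfaults: 127 missing (32+31+33+31) 64 wp (16+16+16+16)
 *
 * The trailing "\b" eats the '+' left over after the last per-cpu count.
 */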
void uffd_stats_report(struct uffd_args *args, int n_cpus)
{
	int i;
	unsigned long long miss_total = 0, wp_total = 0, minor_total = 0;

	for (i = 0; i < n_cpus; i++) {
		miss_total += args[i].missing_faults;
		wp_total += args[i].wp_faults;
		minor_total += args[i].minor_faults;
	}

	printf("userfaults: ");
	if (miss_total) {
		printf("%llu missing (", miss_total);
		for (i = 0; i < n_cpus; i++)
			printf("%lu+", args[i].missing_faults);
		printf("\b) ");
	}
	if (wp_total) {
		printf("%llu wp (", wp_total);
		for (i = 0; i < n_cpus; i++)
			printf("%lu+", args[i].wp_faults);
		printf("\b) ");
	}
	if (minor_total) {
		printf("%llu minor (", minor_total);
		for (i = 0; i < n_cpus; i++)
			printf("%lu+", args[i].minor_faults);
		printf("\b)");
	}
	printf("\n");
}

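/*
 * Open the global uffd descriptor and negotiate the API.  On success,
 * *features is updated to the feature set the kernel supports.  A sketch
 * of typical usage (error handling omitted; skipping on failure is up to
 * the caller):
 *
 *	uint64_t features = UFFD_FEATURE_PAGEFAULT_FLAG_WP;
 *
 *	if (userfaultfd_open(&features))
 *		return KSFT_SKIP;
 *
 * Returns 0 on success, -1 on failure (often a lack of privilege).
 */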
int userfaultfd_open(uint64_t *features)
{
	struct uffdio_api uffdio_api;

	uffd = uffd_open(UFFD_FLAGS);
	if (uffd < 0)
		return -1;
	uffd_flags = fcntl(uffd, F_GETFD, NULL);

	uffdio_api.api = UFFD_API;
	uffdio_api.features = *features;
	if (ioctl(uffd, UFFDIO_API, &uffdio_api))
		/* Probably lack of CAP_SYS_PTRACE? */
		return -1;
	if (uffdio_api.api != UFFD_API)
		err("UFFDIO_API error: %" PRIu64, (uint64_t)uffdio_api.api);

	*features = uffdio_api.features;
	return 0;
}

static inline void munmap_area(void **area)
{
	if (*area)
		if (munmap(*area, nr_pages * page_size))
			err("munmap");

	*area = NULL;
}

static void uffd_test_ctx_clear(void)
{
	size_t i;

	if (pipefd) {
		for (i = 0; i < nr_cpus * 2; ++i) {
			if (close(pipefd[i]))
				err("close pipefd");
		}
		free(pipefd);
		pipefd = NULL;
	}

	if (count_verify) {
		free(count_verify);
		count_verify = NULL;
	}

	if (uffd != -1) {
		if (close(uffd))
			err("close uffd");
		uffd = -1;
	}

	munmap_area((void **)&area_src);
	munmap_area((void **)&area_src_alias);
	munmap_area((void **)&area_dst);
	munmap_area((void **)&area_dst_alias);
	munmap_area((void **)&area_remap);
}

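/*
 * (Re)build the whole test context: tear down any previous state,
 * allocate the src/dst areas, open the uffd, seed area_src with per-page
 * counters and mutexes, and create one control pipe pair per cpu.
 * Returns 0 on success; on failure, *errmsg (if non-NULL) points to a
 * short reason string.
 */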
int uffd_test_ctx_init(uint64_t features, const char **errmsg)
{
	unsigned long nr, cpu;
	int ret;

	uffd_test_ctx_clear();

	ret = uffd_test_ops->allocate_area((void **)&area_src, true);
	ret |= uffd_test_ops->allocate_area((void **)&area_dst, false);
	if (ret) {
		if (errmsg)
			*errmsg = "memory allocation failed";
		return ret;
	}

	ret = userfaultfd_open(&features);
	if (ret) {
		if (errmsg)
			*errmsg = "possible lack of privilege";
		return ret;
	}

	count_verify = malloc(nr_pages * sizeof(unsigned long long));
	if (!count_verify)
		err("count_verify");

	for (nr = 0; nr < nr_pages; nr++) {
		*area_mutex(area_src, nr) =
			(pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER;
		count_verify[nr] = *area_count(area_src, nr) = 1;
		/*
		 * In the transition from 255 to 256, powerpc will
		 * read out of order in my_bcmp and see both bytes as
		 * zero, so leave a placeholder below that is always
		 * non-zero after the count, to avoid my_bcmp
		 * triggering false positives.
		 */
		*(area_count(area_src, nr) + 1) = 1;
	}

	/*
	 * After initializing area_src, we must explicitly release pages
	 * for area_dst to make sure it's fully empty.  Otherwise some
	 * area_dst pages could be erroneously initialized with zero pages,
	 * and we could hit memory corruption later in the test.
	 *
	 * For example, when THP is globally enabled, the allocate_area()
	 * calls above could have the two areas merged into a single VMA
	 * (as they have the same VMA flags, they're mergeable).  When we
	 * then initialize area_src, part of area_dst could have been
	 * faulted in via one huge THP shared between area_src and
	 * area_dst, so that part of area_dst would no longer trigger
	 * missing userfaults.
	 *
	 * This release_pages() guarantees that even if that happened,
	 * we'll proactively split the THP and drop any accidentally
	 * initialized pages within area_dst.
	 */
	uffd_test_ops->release_pages(area_dst);

	pipefd = malloc(sizeof(int) * nr_cpus * 2);
	if (!pipefd)
		err("pipefd");
	for (cpu = 0; cpu < nr_cpus; cpu++)
		if (pipe2(&pipefd[cpu * 2], O_CLOEXEC | O_NONBLOCK))
			err("pipe");

	return 0;
}

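/*
 * Apply (wp=true) or remove (wp=false) userfault write-protection on a
 * range that was registered with UFFDIO_REGISTER_MODE_WP.
 */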
void wp_range(int ufd, __u64 start, __u64 len, bool wp)
{
	struct uffdio_writeprotect prms;

	prms.range.start = start;
	prms.range.len = len;
	/* Write-protect if @wp is set; otherwise un-protect (which also wakes) */
	prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0;

	if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms))
		err("UFFDIO_WRITEPROTECT failed: address=0x%"PRIx64,
		    (uint64_t)start);
}

static void continue_range(int ufd, __u64 start, __u64 len, bool wp)
{
	struct uffdio_continue req;
	int ret;

	req.range.start = start;
	req.range.len = len;
	req.mode = 0;
	if (wp)
		req.mode |= UFFDIO_CONTINUE_MODE_WP;

	if (ioctl(ufd, UFFDIO_CONTINUE, &req))
		err("UFFDIO_CONTINUE failed for address 0x%" PRIx64,
		    (uint64_t)start);

	/*
	 * Error handling within the kernel for continue is subtly different
	 * from copy or zeropage, so it may be a source of bugs. Trigger an
	 * error (-EEXIST) on purpose, to verify doing so doesn't cause a BUG.
	 */
	req.mapped = 0;
	ret = ioctl(ufd, UFFDIO_CONTINUE, &req);
	if (ret >= 0 || req.mapped != -EEXIST)
		err("failed to exercise UFFDIO_CONTINUE error handling, ret=%d, mapped=%" PRId64,
		    ret, (int64_t) req.mapped);
}

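/*
 * Read one event from the uffd.  Returns 0 when a message was read, or 1
 * when the read was interrupted (EAGAIN/EINTR) and the caller should
 * retry; any other failure aborts the test.
 */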
int uffd_read_msg(int ufd, struct uffd_msg *msg)
{
	int ret = read(ufd, msg, sizeof(*msg));

	if (ret != sizeof(*msg)) {
		if (ret < 0) {
			if (errno == EAGAIN || errno == EINTR)
				return 1;
			err("blocking read error");
		} else {
			err("short read");
		}
	}

	return 0;
}

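/*
 * Default fault handler used by uffd_poll_thread(): resolves wp faults by
 * un-protecting the page, minor faults by bit-flipping the page contents
 * through area_dst and issuing UFFDIO_CONTINUE, and missing faults with
 * UFFDIO_COPY, bumping the matching per-thread counter each time.
 */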
void uffd_handle_page_fault(struct uffd_msg *msg, struct uffd_args *args)
{
	unsigned long offset;

	if (msg->event != UFFD_EVENT_PAGEFAULT)
		err("unexpected msg event %u", msg->event);

	if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) {
		/* Write protect page faults */
		wp_range(uffd, msg->arg.pagefault.address, page_size, false);
		args->wp_faults++;
	} else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) {
		uint8_t *area;
		int b;

		/*
		 * Minor page faults
		 *
		 * To prove we can modify the original range for testing
		 * purposes, we're going to bit flip this range before
		 * continuing.
		 *
		 * Note that this requires all minor page fault tests operate on
		 * area_dst (non-UFFD-registered) and area_dst_alias
		 * (UFFD-registered).
		 */

		area = (uint8_t *)(area_dst +
				   ((char *)msg->arg.pagefault.address -
				    area_dst_alias));
		for (b = 0; b < page_size; ++b)
			area[b] = ~area[b];
		continue_range(uffd, msg->arg.pagefault.address, page_size,
			       args->apply_wp);
		args->minor_faults++;
	} else {
		/*
		 * Missing page faults.
		 *
		 * Here we force a write check for each of the missing mode
		 * faults.  It's guaranteed because the only threads that
		 * will trigger uffd faults are the locking threads, and
		 * their first instruction to touch the missing page will
		 * always be pthread_mutex_lock().
		 *
		 * Note that here we rely on an NPTL glibc implementation
		 * detail: the lock type is always read at the entry of the
		 * lock op (pthread_mutex_t.__data.__type, offset 0x10)
		 * before any locking operation is done.  Relying on this
		 * is not ideal, since a pthread-compatible library could
		 * legitimately implement its locks without a type field,
		 * and this check would fail when linking against it.
		 * However, this strict check has found bugs before, so we
		 * keep it around as a hint for when it trips again.  If it
		 * ever breaks on some other libc implementation, we'll
		 * revisit.
		 */
		if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
			err("unexpected write fault");

		offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst;
		offset &= ~(page_size-1);

		if (copy_page(uffd, offset, args->apply_wp))
			args->missing_faults++;
	}
}

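/*
 * Per-cpu event loop: poll the uffd and the cpu's control pipe, dispatch
 * page faults to args->handle_fault(), track fork/remap/remove events,
 * and exit once a byte arrives on the pipe.
 */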
void *uffd_poll_thread(void *arg)
{
	struct uffd_args *args = (struct uffd_args *)arg;
	unsigned long cpu = args->cpu;
	struct pollfd pollfd[2];
	struct uffd_msg msg;
	struct uffdio_register uffd_reg;
	int ret;
	char tmp_chr;

	if (!args->handle_fault)
		args->handle_fault = uffd_handle_page_fault;

	pollfd[0].fd = uffd;
	pollfd[0].events = POLLIN;
	pollfd[1].fd = pipefd[cpu*2];
	pollfd[1].events = POLLIN;

	for (;;) {
		ret = poll(pollfd, 2, -1);
		if (ret <= 0) {
			if (errno == EINTR || errno == EAGAIN)
				continue;
			err("poll error: %d", ret);
		}
		if (pollfd[1].revents) {
			if (!(pollfd[1].revents & POLLIN))
				err("pollfd[1].revents %d", pollfd[1].revents);
			if (read(pollfd[1].fd, &tmp_chr, 1) != 1)
				err("read pipefd error");
			break;
		}
		if (!(pollfd[0].revents & POLLIN))
			err("pollfd[0].revents %d", pollfd[0].revents);
		if (uffd_read_msg(uffd, &msg))
			continue;
		switch (msg.event) {
		default:
			err("unexpected msg event %u", msg.event);
			break;
		case UFFD_EVENT_PAGEFAULT:
			args->handle_fault(&msg, args);
			break;
		case UFFD_EVENT_FORK:
			close(uffd);
			uffd = msg.arg.fork.ufd;
			pollfd[0].fd = uffd;
			break;
		case UFFD_EVENT_REMOVE:
			uffd_reg.range.start = msg.arg.remove.start;
			uffd_reg.range.len = msg.arg.remove.end -
				msg.arg.remove.start;
			if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range))
				err("remove failure");
			break;
		case UFFD_EVENT_REMAP:
			area_remap = area_dst;  /* save for later unmap */
			area_dst = (char *)(unsigned long)msg.arg.remap.to;
			break;
		}
	}

	return NULL;
}

static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy,
			    unsigned long offset)
{
	uffd_test_ops->alias_mapping(&uffdio_copy->dst,
				     uffdio_copy->len,
				     offset);
	if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) {
		/* real retval in uffdio_copy.copy */
		if (uffdio_copy->copy != -EEXIST)
			err("UFFDIO_COPY retry error: %"PRId64,
			    (int64_t)uffdio_copy->copy);
	} else {
		err("UFFDIO_COPY retry unexpected: %"PRId64,
		    (int64_t)uffdio_copy->copy);
	}
}

static void wake_range(int ufd, unsigned long addr, unsigned long len)
{
	struct uffdio_range uffdio_wake;

	uffdio_wake.start = addr;
	uffdio_wake.len = len;

	if (ioctl(ufd, UFFDIO_WAKE, &uffdio_wake))
		err("error waking %lu", addr);
}

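/*
 * Resolve one missing fault with UFFDIO_COPY.  Returns 1 if this call
 * installed the page, 0 if somebody else raced and installed it first
 * (-EEXIST), in which case the stuck thread only needs a wakeup.  With
 * @retry set, the first successful copy is re-issued once on the alias
 * mapping to exercise the -EEXIST path deliberately.
 */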
int __copy_page(int ufd, unsigned long offset, bool retry, bool wp)
{
	struct uffdio_copy uffdio_copy;

	if (offset >= nr_pages * page_size)
		err("unexpected offset %lu", offset);
	uffdio_copy.dst = (unsigned long) area_dst + offset;
	uffdio_copy.src = (unsigned long) area_src + offset;
	uffdio_copy.len = page_size;
	if (wp)
		uffdio_copy.mode = UFFDIO_COPY_MODE_WP;
	else
		uffdio_copy.mode = 0;
	uffdio_copy.copy = 0;
	if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) {
		/* real retval in uffdio_copy.copy */
		if (uffdio_copy.copy != -EEXIST)
			err("UFFDIO_COPY error: %"PRId64,
			    (int64_t)uffdio_copy.copy);
		wake_range(ufd, uffdio_copy.dst, page_size);
	} else if (uffdio_copy.copy != page_size) {
		err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy);
	} else {
		if (test_uffdio_copy_eexist && retry) {
			test_uffdio_copy_eexist = false;
			retry_copy_page(ufd, &uffdio_copy, offset);
		}
		return 1;
	}
	return 0;
}

int copy_page(int ufd, unsigned long offset, bool wp)
{
	return __copy_page(ufd, offset, false, wp);
}

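/*
 * Create a userfaultfd through the /dev/userfaultfd device node rather
 * than the syscall; useful where the syscall is restricted but the
 * device is accessible.
 */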
int uffd_open_dev(unsigned int flags)
{
	int fd, uffd;

	fd = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC);
	if (fd < 0)
		return fd;
	uffd = ioctl(fd, USERFAULTFD_IOC_NEW, flags);
	close(fd);

	return uffd;
}

int uffd_open_sys(unsigned int flags)
{
#ifdef __NR_userfaultfd
	return syscall(__NR_userfaultfd, flags);
#else
	return -1;
#endif
}

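/*
 * Preferred way to get a uffd: try the syscall first and fall back to
 * /dev/userfaultfd if the syscall is unavailable or refused.
 */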
int uffd_open(unsigned int flags)
{
	int uffd = uffd_open_sys(flags);

	if (uffd < 0)
		uffd = uffd_open_dev(flags);

	return uffd;
}

int uffd_get_features(uint64_t *features)
{
	struct uffdio_api uffdio_api = { .api = UFFD_API, .features = 0 };
	/*
	 * This should work by default on most kernels; the feature list
	 * returned will be the same no matter what we pass in here.
	 */
	int fd = uffd_open(UFFD_USER_MODE_ONLY);

	if (fd < 0)
		/* Maybe the kernel predates UFFD_USER_MODE_ONLY? */
		fd = uffd_open(0);

	if (fd < 0)
		return fd;

	if (ioctl(fd, UFFDIO_API, &uffdio_api)) {
		close(fd);
		return -errno;
	}

	*features = uffdio_api.features;
	close(fd);

	return 0;
}