// SPDX-License-Identifier: GPL-2.0-only
/*
 * Userfaultfd tests util functions
 *
 * Copyright (C) 2015-2023  Red Hat, Inc.
 */
762306a36Sopenharmony_ci
862306a36Sopenharmony_ci#include "uffd-common.h"
962306a36Sopenharmony_ci
1062306a36Sopenharmony_ci#define BASE_PMD_ADDR ((void *)(1UL << 30))
1162306a36Sopenharmony_ci
1262306a36Sopenharmony_civolatile bool test_uffdio_copy_eexist = true;
1362306a36Sopenharmony_ciunsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size;
1462306a36Sopenharmony_cichar *area_src, *area_src_alias, *area_dst, *area_dst_alias, *area_remap;
1562306a36Sopenharmony_ciint uffd = -1, uffd_flags, finished, *pipefd, test_type;
1662306a36Sopenharmony_cibool map_shared;
1762306a36Sopenharmony_cibool test_uffdio_wp = true;
1862306a36Sopenharmony_ciunsigned long long *count_verify;
1962306a36Sopenharmony_ciuffd_test_ops_t *uffd_test_ops;
2062306a36Sopenharmony_ci
2162306a36Sopenharmony_cistatic int uffd_mem_fd_create(off_t mem_size, bool hugetlb)
2262306a36Sopenharmony_ci{
2362306a36Sopenharmony_ci	unsigned int memfd_flags = 0;
2462306a36Sopenharmony_ci	int mem_fd;
2562306a36Sopenharmony_ci
2662306a36Sopenharmony_ci	if (hugetlb)
2762306a36Sopenharmony_ci		memfd_flags = MFD_HUGETLB;
2862306a36Sopenharmony_ci	mem_fd = memfd_create("uffd-test", memfd_flags);
2962306a36Sopenharmony_ci	if (mem_fd < 0)
3062306a36Sopenharmony_ci		err("memfd_create");
3162306a36Sopenharmony_ci	if (ftruncate(mem_fd, mem_size))
3262306a36Sopenharmony_ci		err("ftruncate");
3362306a36Sopenharmony_ci	if (fallocate(mem_fd,
3462306a36Sopenharmony_ci		      FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0,
3562306a36Sopenharmony_ci		      mem_size))
3662306a36Sopenharmony_ci		err("fallocate");
3762306a36Sopenharmony_ci
3862306a36Sopenharmony_ci	return mem_fd;
3962306a36Sopenharmony_ci}
4062306a36Sopenharmony_ci
4162306a36Sopenharmony_cistatic void anon_release_pages(char *rel_area)
4262306a36Sopenharmony_ci{
4362306a36Sopenharmony_ci	if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED))
4462306a36Sopenharmony_ci		err("madvise(MADV_DONTNEED) failed");
4562306a36Sopenharmony_ci}
4662306a36Sopenharmony_ci
4762306a36Sopenharmony_cistatic int anon_allocate_area(void **alloc_area, bool is_src)
4862306a36Sopenharmony_ci{
4962306a36Sopenharmony_ci	*alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
5062306a36Sopenharmony_ci			   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
5162306a36Sopenharmony_ci	if (*alloc_area == MAP_FAILED) {
5262306a36Sopenharmony_ci		*alloc_area = NULL;
5362306a36Sopenharmony_ci		return -errno;
5462306a36Sopenharmony_ci	}
5562306a36Sopenharmony_ci	return 0;
5662306a36Sopenharmony_ci}
5762306a36Sopenharmony_ci
/*
 * alias_mapping hook for memory types without an alias mapping: all
 * arguments are ignored and *start is left untouched.
 */
static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset)
{
	(void)start;
	(void)len;
	(void)offset;
}
6162306a36Sopenharmony_ci
6262306a36Sopenharmony_cistatic void hugetlb_release_pages(char *rel_area)
6362306a36Sopenharmony_ci{
6462306a36Sopenharmony_ci	if (!map_shared) {
6562306a36Sopenharmony_ci		if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED))
6662306a36Sopenharmony_ci			err("madvise(MADV_DONTNEED) failed");
6762306a36Sopenharmony_ci	} else {
6862306a36Sopenharmony_ci		if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE))
6962306a36Sopenharmony_ci			err("madvise(MADV_REMOVE) failed");
7062306a36Sopenharmony_ci	}
7162306a36Sopenharmony_ci}
7262306a36Sopenharmony_ci
7362306a36Sopenharmony_cistatic int hugetlb_allocate_area(void **alloc_area, bool is_src)
7462306a36Sopenharmony_ci{
7562306a36Sopenharmony_ci	off_t size = nr_pages * page_size;
7662306a36Sopenharmony_ci	off_t offset = is_src ? 0 : size;
7762306a36Sopenharmony_ci	void *area_alias = NULL;
7862306a36Sopenharmony_ci	char **alloc_area_alias;
7962306a36Sopenharmony_ci	int mem_fd = uffd_mem_fd_create(size * 2, true);
8062306a36Sopenharmony_ci
8162306a36Sopenharmony_ci	*alloc_area = mmap(NULL, size, PROT_READ | PROT_WRITE,
8262306a36Sopenharmony_ci			   (map_shared ? MAP_SHARED : MAP_PRIVATE) |
8362306a36Sopenharmony_ci			   (is_src ? 0 : MAP_NORESERVE),
8462306a36Sopenharmony_ci			   mem_fd, offset);
8562306a36Sopenharmony_ci	if (*alloc_area == MAP_FAILED) {
8662306a36Sopenharmony_ci		*alloc_area = NULL;
8762306a36Sopenharmony_ci		return -errno;
8862306a36Sopenharmony_ci	}
8962306a36Sopenharmony_ci
9062306a36Sopenharmony_ci	if (map_shared) {
9162306a36Sopenharmony_ci		area_alias = mmap(NULL, size, PROT_READ | PROT_WRITE,
9262306a36Sopenharmony_ci				  MAP_SHARED, mem_fd, offset);
9362306a36Sopenharmony_ci		if (area_alias == MAP_FAILED)
9462306a36Sopenharmony_ci			return -errno;
9562306a36Sopenharmony_ci	}
9662306a36Sopenharmony_ci
9762306a36Sopenharmony_ci	if (is_src) {
9862306a36Sopenharmony_ci		alloc_area_alias = &area_src_alias;
9962306a36Sopenharmony_ci	} else {
10062306a36Sopenharmony_ci		alloc_area_alias = &area_dst_alias;
10162306a36Sopenharmony_ci	}
10262306a36Sopenharmony_ci	if (area_alias)
10362306a36Sopenharmony_ci		*alloc_area_alias = area_alias;
10462306a36Sopenharmony_ci
10562306a36Sopenharmony_ci	close(mem_fd);
10662306a36Sopenharmony_ci	return 0;
10762306a36Sopenharmony_ci}
10862306a36Sopenharmony_ci
10962306a36Sopenharmony_cistatic void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset)
11062306a36Sopenharmony_ci{
11162306a36Sopenharmony_ci	if (!map_shared)
11262306a36Sopenharmony_ci		return;
11362306a36Sopenharmony_ci
11462306a36Sopenharmony_ci	*start = (unsigned long) area_dst_alias + offset;
11562306a36Sopenharmony_ci}
11662306a36Sopenharmony_ci
11762306a36Sopenharmony_cistatic void shmem_release_pages(char *rel_area)
11862306a36Sopenharmony_ci{
11962306a36Sopenharmony_ci	if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE))
12062306a36Sopenharmony_ci		err("madvise(MADV_REMOVE) failed");
12162306a36Sopenharmony_ci}
12262306a36Sopenharmony_ci
12362306a36Sopenharmony_cistatic int shmem_allocate_area(void **alloc_area, bool is_src)
12462306a36Sopenharmony_ci{
12562306a36Sopenharmony_ci	void *area_alias = NULL;
12662306a36Sopenharmony_ci	size_t bytes = nr_pages * page_size, hpage_size = read_pmd_pagesize();
12762306a36Sopenharmony_ci	unsigned long offset = is_src ? 0 : bytes;
12862306a36Sopenharmony_ci	char *p = NULL, *p_alias = NULL;
12962306a36Sopenharmony_ci	int mem_fd = uffd_mem_fd_create(bytes * 2, false);
13062306a36Sopenharmony_ci
13162306a36Sopenharmony_ci	/* TODO: clean this up.  Use a static addr is ugly */
13262306a36Sopenharmony_ci	p = BASE_PMD_ADDR;
13362306a36Sopenharmony_ci	if (!is_src)
13462306a36Sopenharmony_ci		/* src map + alias + interleaved hpages */
13562306a36Sopenharmony_ci		p += 2 * (bytes + hpage_size);
13662306a36Sopenharmony_ci	p_alias = p;
13762306a36Sopenharmony_ci	p_alias += bytes;
13862306a36Sopenharmony_ci	p_alias += hpage_size;  /* Prevent src/dst VMA merge */
13962306a36Sopenharmony_ci
14062306a36Sopenharmony_ci	*alloc_area = mmap(p, bytes, PROT_READ | PROT_WRITE, MAP_SHARED,
14162306a36Sopenharmony_ci			   mem_fd, offset);
14262306a36Sopenharmony_ci	if (*alloc_area == MAP_FAILED) {
14362306a36Sopenharmony_ci		*alloc_area = NULL;
14462306a36Sopenharmony_ci		return -errno;
14562306a36Sopenharmony_ci	}
14662306a36Sopenharmony_ci	if (*alloc_area != p)
14762306a36Sopenharmony_ci		err("mmap of memfd failed at %p", p);
14862306a36Sopenharmony_ci
14962306a36Sopenharmony_ci	area_alias = mmap(p_alias, bytes, PROT_READ | PROT_WRITE, MAP_SHARED,
15062306a36Sopenharmony_ci			  mem_fd, offset);
15162306a36Sopenharmony_ci	if (area_alias == MAP_FAILED) {
15262306a36Sopenharmony_ci		munmap(*alloc_area, bytes);
15362306a36Sopenharmony_ci		*alloc_area = NULL;
15462306a36Sopenharmony_ci		return -errno;
15562306a36Sopenharmony_ci	}
15662306a36Sopenharmony_ci	if (area_alias != p_alias)
15762306a36Sopenharmony_ci		err("mmap of anonymous memory failed at %p", p_alias);
15862306a36Sopenharmony_ci
15962306a36Sopenharmony_ci	if (is_src)
16062306a36Sopenharmony_ci		area_src_alias = area_alias;
16162306a36Sopenharmony_ci	else
16262306a36Sopenharmony_ci		area_dst_alias = area_alias;
16362306a36Sopenharmony_ci
16462306a36Sopenharmony_ci	close(mem_fd);
16562306a36Sopenharmony_ci	return 0;
16662306a36Sopenharmony_ci}
16762306a36Sopenharmony_ci
16862306a36Sopenharmony_cistatic void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset)
16962306a36Sopenharmony_ci{
17062306a36Sopenharmony_ci	*start = (unsigned long)area_dst_alias + offset;
17162306a36Sopenharmony_ci}
17262306a36Sopenharmony_ci
17362306a36Sopenharmony_cistatic void shmem_check_pmd_mapping(void *p, int expect_nr_hpages)
17462306a36Sopenharmony_ci{
17562306a36Sopenharmony_ci	if (!check_huge_shmem(area_dst_alias, expect_nr_hpages,
17662306a36Sopenharmony_ci			      read_pmd_pagesize()))
17762306a36Sopenharmony_ci		err("Did not find expected %d number of hugepages",
17862306a36Sopenharmony_ci		    expect_nr_hpages);
17962306a36Sopenharmony_ci}
18062306a36Sopenharmony_ci
18162306a36Sopenharmony_cistruct uffd_test_ops anon_uffd_test_ops = {
18262306a36Sopenharmony_ci	.allocate_area = anon_allocate_area,
18362306a36Sopenharmony_ci	.release_pages = anon_release_pages,
18462306a36Sopenharmony_ci	.alias_mapping = noop_alias_mapping,
18562306a36Sopenharmony_ci	.check_pmd_mapping = NULL,
18662306a36Sopenharmony_ci};
18762306a36Sopenharmony_ci
18862306a36Sopenharmony_cistruct uffd_test_ops shmem_uffd_test_ops = {
18962306a36Sopenharmony_ci	.allocate_area = shmem_allocate_area,
19062306a36Sopenharmony_ci	.release_pages = shmem_release_pages,
19162306a36Sopenharmony_ci	.alias_mapping = shmem_alias_mapping,
19262306a36Sopenharmony_ci	.check_pmd_mapping = shmem_check_pmd_mapping,
19362306a36Sopenharmony_ci};
19462306a36Sopenharmony_ci
19562306a36Sopenharmony_cistruct uffd_test_ops hugetlb_uffd_test_ops = {
19662306a36Sopenharmony_ci	.allocate_area = hugetlb_allocate_area,
19762306a36Sopenharmony_ci	.release_pages = hugetlb_release_pages,
19862306a36Sopenharmony_ci	.alias_mapping = hugetlb_alias_mapping,
19962306a36Sopenharmony_ci	.check_pmd_mapping = NULL,
20062306a36Sopenharmony_ci};
20162306a36Sopenharmony_ci
20262306a36Sopenharmony_civoid uffd_stats_report(struct uffd_args *args, int n_cpus)
20362306a36Sopenharmony_ci{
20462306a36Sopenharmony_ci	int i;
20562306a36Sopenharmony_ci	unsigned long long miss_total = 0, wp_total = 0, minor_total = 0;
20662306a36Sopenharmony_ci
20762306a36Sopenharmony_ci	for (i = 0; i < n_cpus; i++) {
20862306a36Sopenharmony_ci		miss_total += args[i].missing_faults;
20962306a36Sopenharmony_ci		wp_total += args[i].wp_faults;
21062306a36Sopenharmony_ci		minor_total += args[i].minor_faults;
21162306a36Sopenharmony_ci	}
21262306a36Sopenharmony_ci
21362306a36Sopenharmony_ci	printf("userfaults: ");
21462306a36Sopenharmony_ci	if (miss_total) {
21562306a36Sopenharmony_ci		printf("%llu missing (", miss_total);
21662306a36Sopenharmony_ci		for (i = 0; i < n_cpus; i++)
21762306a36Sopenharmony_ci			printf("%lu+", args[i].missing_faults);
21862306a36Sopenharmony_ci		printf("\b) ");
21962306a36Sopenharmony_ci	}
22062306a36Sopenharmony_ci	if (wp_total) {
22162306a36Sopenharmony_ci		printf("%llu wp (", wp_total);
22262306a36Sopenharmony_ci		for (i = 0; i < n_cpus; i++)
22362306a36Sopenharmony_ci			printf("%lu+", args[i].wp_faults);
22462306a36Sopenharmony_ci		printf("\b) ");
22562306a36Sopenharmony_ci	}
22662306a36Sopenharmony_ci	if (minor_total) {
22762306a36Sopenharmony_ci		printf("%llu minor (", minor_total);
22862306a36Sopenharmony_ci		for (i = 0; i < n_cpus; i++)
22962306a36Sopenharmony_ci			printf("%lu+", args[i].minor_faults);
23062306a36Sopenharmony_ci		printf("\b)");
23162306a36Sopenharmony_ci	}
23262306a36Sopenharmony_ci	printf("\n");
23362306a36Sopenharmony_ci}
23462306a36Sopenharmony_ci
23562306a36Sopenharmony_ciint userfaultfd_open(uint64_t *features)
23662306a36Sopenharmony_ci{
23762306a36Sopenharmony_ci	struct uffdio_api uffdio_api;
23862306a36Sopenharmony_ci
23962306a36Sopenharmony_ci	uffd = uffd_open(UFFD_FLAGS);
24062306a36Sopenharmony_ci	if (uffd < 0)
24162306a36Sopenharmony_ci		return -1;
24262306a36Sopenharmony_ci	uffd_flags = fcntl(uffd, F_GETFD, NULL);
24362306a36Sopenharmony_ci
24462306a36Sopenharmony_ci	uffdio_api.api = UFFD_API;
24562306a36Sopenharmony_ci	uffdio_api.features = *features;
24662306a36Sopenharmony_ci	if (ioctl(uffd, UFFDIO_API, &uffdio_api))
24762306a36Sopenharmony_ci		/* Probably lack of CAP_PTRACE? */
24862306a36Sopenharmony_ci		return -1;
24962306a36Sopenharmony_ci	if (uffdio_api.api != UFFD_API)
25062306a36Sopenharmony_ci		err("UFFDIO_API error: %" PRIu64, (uint64_t)uffdio_api.api);
25162306a36Sopenharmony_ci
25262306a36Sopenharmony_ci	*features = uffdio_api.features;
25362306a36Sopenharmony_ci	return 0;
25462306a36Sopenharmony_ci}
25562306a36Sopenharmony_ci
25662306a36Sopenharmony_cistatic inline void munmap_area(void **area)
25762306a36Sopenharmony_ci{
25862306a36Sopenharmony_ci	if (*area)
25962306a36Sopenharmony_ci		if (munmap(*area, nr_pages * page_size))
26062306a36Sopenharmony_ci			err("munmap");
26162306a36Sopenharmony_ci
26262306a36Sopenharmony_ci	*area = NULL;
26362306a36Sopenharmony_ci}
26462306a36Sopenharmony_ci
26562306a36Sopenharmony_cistatic void uffd_test_ctx_clear(void)
26662306a36Sopenharmony_ci{
26762306a36Sopenharmony_ci	size_t i;
26862306a36Sopenharmony_ci
26962306a36Sopenharmony_ci	if (pipefd) {
27062306a36Sopenharmony_ci		for (i = 0; i < nr_cpus * 2; ++i) {
27162306a36Sopenharmony_ci			if (close(pipefd[i]))
27262306a36Sopenharmony_ci				err("close pipefd");
27362306a36Sopenharmony_ci		}
27462306a36Sopenharmony_ci		free(pipefd);
27562306a36Sopenharmony_ci		pipefd = NULL;
27662306a36Sopenharmony_ci	}
27762306a36Sopenharmony_ci
27862306a36Sopenharmony_ci	if (count_verify) {
27962306a36Sopenharmony_ci		free(count_verify);
28062306a36Sopenharmony_ci		count_verify = NULL;
28162306a36Sopenharmony_ci	}
28262306a36Sopenharmony_ci
28362306a36Sopenharmony_ci	if (uffd != -1) {
28462306a36Sopenharmony_ci		if (close(uffd))
28562306a36Sopenharmony_ci			err("close uffd");
28662306a36Sopenharmony_ci		uffd = -1;
28762306a36Sopenharmony_ci	}
28862306a36Sopenharmony_ci
28962306a36Sopenharmony_ci	munmap_area((void **)&area_src);
29062306a36Sopenharmony_ci	munmap_area((void **)&area_src_alias);
29162306a36Sopenharmony_ci	munmap_area((void **)&area_dst);
29262306a36Sopenharmony_ci	munmap_area((void **)&area_dst_alias);
29362306a36Sopenharmony_ci	munmap_area((void **)&area_remap);
29462306a36Sopenharmony_ci}
29562306a36Sopenharmony_ci
29662306a36Sopenharmony_ciint uffd_test_ctx_init(uint64_t features, const char **errmsg)
29762306a36Sopenharmony_ci{
29862306a36Sopenharmony_ci	unsigned long nr, cpu;
29962306a36Sopenharmony_ci	int ret;
30062306a36Sopenharmony_ci
30162306a36Sopenharmony_ci	uffd_test_ctx_clear();
30262306a36Sopenharmony_ci
30362306a36Sopenharmony_ci	ret = uffd_test_ops->allocate_area((void **)&area_src, true);
30462306a36Sopenharmony_ci	ret |= uffd_test_ops->allocate_area((void **)&area_dst, false);
30562306a36Sopenharmony_ci	if (ret) {
30662306a36Sopenharmony_ci		if (errmsg)
30762306a36Sopenharmony_ci			*errmsg = "memory allocation failed";
30862306a36Sopenharmony_ci		return ret;
30962306a36Sopenharmony_ci	}
31062306a36Sopenharmony_ci
31162306a36Sopenharmony_ci	ret = userfaultfd_open(&features);
31262306a36Sopenharmony_ci	if (ret) {
31362306a36Sopenharmony_ci		if (errmsg)
31462306a36Sopenharmony_ci			*errmsg = "possible lack of priviledge";
31562306a36Sopenharmony_ci		return ret;
31662306a36Sopenharmony_ci	}
31762306a36Sopenharmony_ci
31862306a36Sopenharmony_ci	count_verify = malloc(nr_pages * sizeof(unsigned long long));
31962306a36Sopenharmony_ci	if (!count_verify)
32062306a36Sopenharmony_ci		err("count_verify");
32162306a36Sopenharmony_ci
32262306a36Sopenharmony_ci	for (nr = 0; nr < nr_pages; nr++) {
32362306a36Sopenharmony_ci		*area_mutex(area_src, nr) =
32462306a36Sopenharmony_ci			(pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER;
32562306a36Sopenharmony_ci		count_verify[nr] = *area_count(area_src, nr) = 1;
32662306a36Sopenharmony_ci		/*
32762306a36Sopenharmony_ci		 * In the transition between 255 to 256, powerpc will
32862306a36Sopenharmony_ci		 * read out of order in my_bcmp and see both bytes as
32962306a36Sopenharmony_ci		 * zero, so leave a placeholder below always non-zero
33062306a36Sopenharmony_ci		 * after the count, to avoid my_bcmp to trigger false
33162306a36Sopenharmony_ci		 * positives.
33262306a36Sopenharmony_ci		 */
33362306a36Sopenharmony_ci		*(area_count(area_src, nr) + 1) = 1;
33462306a36Sopenharmony_ci	}
33562306a36Sopenharmony_ci
33662306a36Sopenharmony_ci	/*
33762306a36Sopenharmony_ci	 * After initialization of area_src, we must explicitly release pages
33862306a36Sopenharmony_ci	 * for area_dst to make sure it's fully empty.  Otherwise we could have
33962306a36Sopenharmony_ci	 * some area_dst pages be errornously initialized with zero pages,
34062306a36Sopenharmony_ci	 * hence we could hit memory corruption later in the test.
34162306a36Sopenharmony_ci	 *
34262306a36Sopenharmony_ci	 * One example is when THP is globally enabled, above allocate_area()
34362306a36Sopenharmony_ci	 * calls could have the two areas merged into a single VMA (as they
34462306a36Sopenharmony_ci	 * will have the same VMA flags so they're mergeable).  When we
34562306a36Sopenharmony_ci	 * initialize the area_src above, it's possible that some part of
34662306a36Sopenharmony_ci	 * area_dst could have been faulted in via one huge THP that will be
34762306a36Sopenharmony_ci	 * shared between area_src and area_dst.  It could cause some of the
34862306a36Sopenharmony_ci	 * area_dst won't be trapped by missing userfaults.
34962306a36Sopenharmony_ci	 *
35062306a36Sopenharmony_ci	 * This release_pages() will guarantee even if that happened, we'll
35162306a36Sopenharmony_ci	 * proactively split the thp and drop any accidentally initialized
35262306a36Sopenharmony_ci	 * pages within area_dst.
35362306a36Sopenharmony_ci	 */
35462306a36Sopenharmony_ci	uffd_test_ops->release_pages(area_dst);
35562306a36Sopenharmony_ci
35662306a36Sopenharmony_ci	pipefd = malloc(sizeof(int) * nr_cpus * 2);
35762306a36Sopenharmony_ci	if (!pipefd)
35862306a36Sopenharmony_ci		err("pipefd");
35962306a36Sopenharmony_ci	for (cpu = 0; cpu < nr_cpus; cpu++)
36062306a36Sopenharmony_ci		if (pipe2(&pipefd[cpu * 2], O_CLOEXEC | O_NONBLOCK))
36162306a36Sopenharmony_ci			err("pipe");
36262306a36Sopenharmony_ci
36362306a36Sopenharmony_ci	return 0;
36462306a36Sopenharmony_ci}
36562306a36Sopenharmony_ci
/*
 * Issue UFFDIO_WRITEPROTECT on [start, start + len).
 *
 * @ufd:   userfaultfd descriptor.
 * @start: first address of the range.
 * @len:   length of the range in bytes.
 * @wp:    true requests UFFDIO_WRITEPROTECT_MODE_WP; false passes mode 0.
 *
 * A failing ioctl is reported through err().  Note the message reads
 * "clear WP failed" regardless of @wp.
 */
void wp_range(int ufd, __u64 start, __u64 len, bool wp)
{
	struct uffdio_writeprotect req = {
		.range = { .start = start, .len = len },
		.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0,
	};

	if (ioctl(ufd, UFFDIO_WRITEPROTECT, &req) != 0)
		err("clear WP failed: address=0x%"PRIx64, (uint64_t)start);
}
37962306a36Sopenharmony_ci
38062306a36Sopenharmony_cistatic void continue_range(int ufd, __u64 start, __u64 len, bool wp)
38162306a36Sopenharmony_ci{
38262306a36Sopenharmony_ci	struct uffdio_continue req;
38362306a36Sopenharmony_ci	int ret;
38462306a36Sopenharmony_ci
38562306a36Sopenharmony_ci	req.range.start = start;
38662306a36Sopenharmony_ci	req.range.len = len;
38762306a36Sopenharmony_ci	req.mode = 0;
38862306a36Sopenharmony_ci	if (wp)
38962306a36Sopenharmony_ci		req.mode |= UFFDIO_CONTINUE_MODE_WP;
39062306a36Sopenharmony_ci
39162306a36Sopenharmony_ci	if (ioctl(ufd, UFFDIO_CONTINUE, &req))
39262306a36Sopenharmony_ci		err("UFFDIO_CONTINUE failed for address 0x%" PRIx64,
39362306a36Sopenharmony_ci		    (uint64_t)start);
39462306a36Sopenharmony_ci
39562306a36Sopenharmony_ci	/*
39662306a36Sopenharmony_ci	 * Error handling within the kernel for continue is subtly different
39762306a36Sopenharmony_ci	 * from copy or zeropage, so it may be a source of bugs. Trigger an
39862306a36Sopenharmony_ci	 * error (-EEXIST) on purpose, to verify doing so doesn't cause a BUG.
39962306a36Sopenharmony_ci	 */
40062306a36Sopenharmony_ci	req.mapped = 0;
40162306a36Sopenharmony_ci	ret = ioctl(ufd, UFFDIO_CONTINUE, &req);
40262306a36Sopenharmony_ci	if (ret >= 0 || req.mapped != -EEXIST)
40362306a36Sopenharmony_ci		err("failed to exercise UFFDIO_CONTINUE error handling, ret=%d, mapped=%" PRId64,
40462306a36Sopenharmony_ci		    ret, (int64_t) req.mapped);
40562306a36Sopenharmony_ci}
40662306a36Sopenharmony_ci
/*
 * Read one userfaultfd message from @ufd into @msg.
 *
 * @ufd: userfaultfd descriptor to read from.  (The descriptor passed in
 *       is now honoured; previously the global uffd was read instead.)
 * @msg: buffer receiving exactly one struct uffd_msg.
 *
 * Returns 0 when a complete message was read, 1 when the read should be
 * retried (EAGAIN or EINTR).  Any other read error and short reads are
 * reported through err().
 */
int uffd_read_msg(int ufd, struct uffd_msg *msg)
{
	/* ssize_t, not int: that is what read() returns. */
	ssize_t ret = read(ufd, msg, sizeof(*msg));

	if (ret == (ssize_t)sizeof(*msg))
		return 0;

	if (ret < 0) {
		if (errno == EAGAIN || errno == EINTR)
			return 1;
		err("blocking read error");
	} else {
		err("short read");
	}

	return 0;
}
42362306a36Sopenharmony_ci
42462306a36Sopenharmony_civoid uffd_handle_page_fault(struct uffd_msg *msg, struct uffd_args *args)
42562306a36Sopenharmony_ci{
42662306a36Sopenharmony_ci	unsigned long offset;
42762306a36Sopenharmony_ci
42862306a36Sopenharmony_ci	if (msg->event != UFFD_EVENT_PAGEFAULT)
42962306a36Sopenharmony_ci		err("unexpected msg event %u", msg->event);
43062306a36Sopenharmony_ci
43162306a36Sopenharmony_ci	if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) {
43262306a36Sopenharmony_ci		/* Write protect page faults */
43362306a36Sopenharmony_ci		wp_range(uffd, msg->arg.pagefault.address, page_size, false);
43462306a36Sopenharmony_ci		args->wp_faults++;
43562306a36Sopenharmony_ci	} else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) {
43662306a36Sopenharmony_ci		uint8_t *area;
43762306a36Sopenharmony_ci		int b;
43862306a36Sopenharmony_ci
43962306a36Sopenharmony_ci		/*
44062306a36Sopenharmony_ci		 * Minor page faults
44162306a36Sopenharmony_ci		 *
44262306a36Sopenharmony_ci		 * To prove we can modify the original range for testing
44362306a36Sopenharmony_ci		 * purposes, we're going to bit flip this range before
44462306a36Sopenharmony_ci		 * continuing.
44562306a36Sopenharmony_ci		 *
44662306a36Sopenharmony_ci		 * Note that this requires all minor page fault tests operate on
44762306a36Sopenharmony_ci		 * area_dst (non-UFFD-registered) and area_dst_alias
44862306a36Sopenharmony_ci		 * (UFFD-registered).
44962306a36Sopenharmony_ci		 */
45062306a36Sopenharmony_ci
45162306a36Sopenharmony_ci		area = (uint8_t *)(area_dst +
45262306a36Sopenharmony_ci				   ((char *)msg->arg.pagefault.address -
45362306a36Sopenharmony_ci				    area_dst_alias));
45462306a36Sopenharmony_ci		for (b = 0; b < page_size; ++b)
45562306a36Sopenharmony_ci			area[b] = ~area[b];
45662306a36Sopenharmony_ci		continue_range(uffd, msg->arg.pagefault.address, page_size,
45762306a36Sopenharmony_ci			       args->apply_wp);
45862306a36Sopenharmony_ci		args->minor_faults++;
45962306a36Sopenharmony_ci	} else {
46062306a36Sopenharmony_ci		/*
46162306a36Sopenharmony_ci		 * Missing page faults.
46262306a36Sopenharmony_ci		 *
46362306a36Sopenharmony_ci		 * Here we force a write check for each of the missing mode
46462306a36Sopenharmony_ci		 * faults.  It's guaranteed because the only threads that
46562306a36Sopenharmony_ci		 * will trigger uffd faults are the locking threads, and
46662306a36Sopenharmony_ci		 * their first instruction to touch the missing page will
46762306a36Sopenharmony_ci		 * always be pthread_mutex_lock().
46862306a36Sopenharmony_ci		 *
46962306a36Sopenharmony_ci		 * Note that here we relied on an NPTL glibc impl detail to
47062306a36Sopenharmony_ci		 * always read the lock type at the entry of the lock op
47162306a36Sopenharmony_ci		 * (pthread_mutex_t.__data.__type, offset 0x10) before
47262306a36Sopenharmony_ci		 * doing any locking operations to guarantee that.  It's
47362306a36Sopenharmony_ci		 * actually not good to rely on this impl detail because
47462306a36Sopenharmony_ci		 * logically a pthread-compatible lib can implement the
47562306a36Sopenharmony_ci		 * locks without types and we can fail when linking with
47662306a36Sopenharmony_ci		 * them.  However since we used to find bugs with this
47762306a36Sopenharmony_ci		 * strict check we still keep it around.  Hopefully this
47862306a36Sopenharmony_ci		 * could be a good hint when it fails again.  If one day
47962306a36Sopenharmony_ci		 * it'll break on some other impl of glibc we'll revisit.
48062306a36Sopenharmony_ci		 */
48162306a36Sopenharmony_ci		if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
48262306a36Sopenharmony_ci			err("unexpected write fault");
48362306a36Sopenharmony_ci
48462306a36Sopenharmony_ci		offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst;
48562306a36Sopenharmony_ci		offset &= ~(page_size-1);
48662306a36Sopenharmony_ci
48762306a36Sopenharmony_ci		if (copy_page(uffd, offset, args->apply_wp))
48862306a36Sopenharmony_ci			args->missing_faults++;
48962306a36Sopenharmony_ci	}
49062306a36Sopenharmony_ci}
49162306a36Sopenharmony_ci
49262306a36Sopenharmony_civoid *uffd_poll_thread(void *arg)
49362306a36Sopenharmony_ci{
49462306a36Sopenharmony_ci	struct uffd_args *args = (struct uffd_args *)arg;
49562306a36Sopenharmony_ci	unsigned long cpu = args->cpu;
49662306a36Sopenharmony_ci	struct pollfd pollfd[2];
49762306a36Sopenharmony_ci	struct uffd_msg msg;
49862306a36Sopenharmony_ci	struct uffdio_register uffd_reg;
49962306a36Sopenharmony_ci	int ret;
50062306a36Sopenharmony_ci	char tmp_chr;
50162306a36Sopenharmony_ci
50262306a36Sopenharmony_ci	if (!args->handle_fault)
50362306a36Sopenharmony_ci		args->handle_fault = uffd_handle_page_fault;
50462306a36Sopenharmony_ci
50562306a36Sopenharmony_ci	pollfd[0].fd = uffd;
50662306a36Sopenharmony_ci	pollfd[0].events = POLLIN;
50762306a36Sopenharmony_ci	pollfd[1].fd = pipefd[cpu*2];
50862306a36Sopenharmony_ci	pollfd[1].events = POLLIN;
50962306a36Sopenharmony_ci
51062306a36Sopenharmony_ci	for (;;) {
51162306a36Sopenharmony_ci		ret = poll(pollfd, 2, -1);
51262306a36Sopenharmony_ci		if (ret <= 0) {
51362306a36Sopenharmony_ci			if (errno == EINTR || errno == EAGAIN)
51462306a36Sopenharmony_ci				continue;
51562306a36Sopenharmony_ci			err("poll error: %d", ret);
51662306a36Sopenharmony_ci		}
51762306a36Sopenharmony_ci		if (pollfd[1].revents) {
51862306a36Sopenharmony_ci			if (!(pollfd[1].revents & POLLIN))
51962306a36Sopenharmony_ci				err("pollfd[1].revents %d", pollfd[1].revents);
52062306a36Sopenharmony_ci			if (read(pollfd[1].fd, &tmp_chr, 1) != 1)
52162306a36Sopenharmony_ci				err("read pipefd error");
52262306a36Sopenharmony_ci			break;
52362306a36Sopenharmony_ci		}
52462306a36Sopenharmony_ci		if (!(pollfd[0].revents & POLLIN))
52562306a36Sopenharmony_ci			err("pollfd[0].revents %d", pollfd[0].revents);
52662306a36Sopenharmony_ci		if (uffd_read_msg(uffd, &msg))
52762306a36Sopenharmony_ci			continue;
52862306a36Sopenharmony_ci		switch (msg.event) {
52962306a36Sopenharmony_ci		default:
53062306a36Sopenharmony_ci			err("unexpected msg event %u\n", msg.event);
53162306a36Sopenharmony_ci			break;
53262306a36Sopenharmony_ci		case UFFD_EVENT_PAGEFAULT:
53362306a36Sopenharmony_ci			args->handle_fault(&msg, args);
53462306a36Sopenharmony_ci			break;
53562306a36Sopenharmony_ci		case UFFD_EVENT_FORK:
53662306a36Sopenharmony_ci			close(uffd);
53762306a36Sopenharmony_ci			uffd = msg.arg.fork.ufd;
53862306a36Sopenharmony_ci			pollfd[0].fd = uffd;
53962306a36Sopenharmony_ci			break;
54062306a36Sopenharmony_ci		case UFFD_EVENT_REMOVE:
54162306a36Sopenharmony_ci			uffd_reg.range.start = msg.arg.remove.start;
54262306a36Sopenharmony_ci			uffd_reg.range.len = msg.arg.remove.end -
54362306a36Sopenharmony_ci				msg.arg.remove.start;
54462306a36Sopenharmony_ci			if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range))
54562306a36Sopenharmony_ci				err("remove failure");
54662306a36Sopenharmony_ci			break;
54762306a36Sopenharmony_ci		case UFFD_EVENT_REMAP:
54862306a36Sopenharmony_ci			area_remap = area_dst;  /* save for later unmap */
54962306a36Sopenharmony_ci			area_dst = (char *)(unsigned long)msg.arg.remap.to;
55062306a36Sopenharmony_ci			break;
55162306a36Sopenharmony_ci		}
55262306a36Sopenharmony_ci	}
55362306a36Sopenharmony_ci
55462306a36Sopenharmony_ci	return NULL;
55562306a36Sopenharmony_ci}
55662306a36Sopenharmony_ci
55762306a36Sopenharmony_cistatic void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy,
55862306a36Sopenharmony_ci			    unsigned long offset)
55962306a36Sopenharmony_ci{
56062306a36Sopenharmony_ci	uffd_test_ops->alias_mapping(&uffdio_copy->dst,
56162306a36Sopenharmony_ci				     uffdio_copy->len,
56262306a36Sopenharmony_ci				     offset);
56362306a36Sopenharmony_ci	if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) {
56462306a36Sopenharmony_ci		/* real retval in ufdio_copy.copy */
56562306a36Sopenharmony_ci		if (uffdio_copy->copy != -EEXIST)
56662306a36Sopenharmony_ci			err("UFFDIO_COPY retry error: %"PRId64,
56762306a36Sopenharmony_ci			    (int64_t)uffdio_copy->copy);
56862306a36Sopenharmony_ci	} else {
56962306a36Sopenharmony_ci		err("UFFDIO_COPY retry unexpected: %"PRId64,
57062306a36Sopenharmony_ci		    (int64_t)uffdio_copy->copy);
57162306a36Sopenharmony_ci	}
57262306a36Sopenharmony_ci}
57362306a36Sopenharmony_ci
/*
 * Issue UFFDIO_WAKE on [addr, addr + len).
 *
 * On ioctl failure a message is written to stderr and the process exits
 * with status 1.
 */
static void wake_range(int ufd, unsigned long addr, unsigned long len)
{
	struct uffdio_range req = {
		.start = addr,
		.len = len,
	};

	if (ioctl(ufd, UFFDIO_WAKE, &req) != 0) {
		fprintf(stderr, "error waking %lu\n", addr);
		exit(1);
	}
}
58562306a36Sopenharmony_ci
58662306a36Sopenharmony_ciint __copy_page(int ufd, unsigned long offset, bool retry, bool wp)
58762306a36Sopenharmony_ci{
58862306a36Sopenharmony_ci	struct uffdio_copy uffdio_copy;
58962306a36Sopenharmony_ci
59062306a36Sopenharmony_ci	if (offset >= nr_pages * page_size)
59162306a36Sopenharmony_ci		err("unexpected offset %lu\n", offset);
59262306a36Sopenharmony_ci	uffdio_copy.dst = (unsigned long) area_dst + offset;
59362306a36Sopenharmony_ci	uffdio_copy.src = (unsigned long) area_src + offset;
59462306a36Sopenharmony_ci	uffdio_copy.len = page_size;
59562306a36Sopenharmony_ci	if (wp)
59662306a36Sopenharmony_ci		uffdio_copy.mode = UFFDIO_COPY_MODE_WP;
59762306a36Sopenharmony_ci	else
59862306a36Sopenharmony_ci		uffdio_copy.mode = 0;
59962306a36Sopenharmony_ci	uffdio_copy.copy = 0;
60062306a36Sopenharmony_ci	if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) {
60162306a36Sopenharmony_ci		/* real retval in ufdio_copy.copy */
60262306a36Sopenharmony_ci		if (uffdio_copy.copy != -EEXIST)
60362306a36Sopenharmony_ci			err("UFFDIO_COPY error: %"PRId64,
60462306a36Sopenharmony_ci			    (int64_t)uffdio_copy.copy);
60562306a36Sopenharmony_ci		wake_range(ufd, uffdio_copy.dst, page_size);
60662306a36Sopenharmony_ci	} else if (uffdio_copy.copy != page_size) {
60762306a36Sopenharmony_ci		err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy);
60862306a36Sopenharmony_ci	} else {
60962306a36Sopenharmony_ci		if (test_uffdio_copy_eexist && retry) {
61062306a36Sopenharmony_ci			test_uffdio_copy_eexist = false;
61162306a36Sopenharmony_ci			retry_copy_page(ufd, &uffdio_copy, offset);
61262306a36Sopenharmony_ci		}
61362306a36Sopenharmony_ci		return 1;
61462306a36Sopenharmony_ci	}
61562306a36Sopenharmony_ci	return 0;
61662306a36Sopenharmony_ci}
61762306a36Sopenharmony_ci
/*
 * Copy one page at @offset from area_src to area_dst without the -EEXIST
 * retry exercise.  See __copy_page() for the return value.
 */
int copy_page(int ufd, unsigned long offset, bool wp)
{
	const bool retry = false;

	return __copy_page(ufd, offset, retry, wp);
}
62262306a36Sopenharmony_ci
62362306a36Sopenharmony_ciint uffd_open_dev(unsigned int flags)
62462306a36Sopenharmony_ci{
62562306a36Sopenharmony_ci	int fd, uffd;
62662306a36Sopenharmony_ci
62762306a36Sopenharmony_ci	fd = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC);
62862306a36Sopenharmony_ci	if (fd < 0)
62962306a36Sopenharmony_ci		return fd;
63062306a36Sopenharmony_ci	uffd = ioctl(fd, USERFAULTFD_IOC_NEW, flags);
63162306a36Sopenharmony_ci	close(fd);
63262306a36Sopenharmony_ci
63362306a36Sopenharmony_ci	return uffd;
63462306a36Sopenharmony_ci}
63562306a36Sopenharmony_ci
/*
 * Create a userfaultfd through the userfaultfd system call.
 *
 * @flags: passed as the argument of the system call.
 *
 * Returns the result of the system call, or -1 when the build
 * environment does not define __NR_userfaultfd.
 */
int uffd_open_sys(unsigned int flags)
{
#ifndef __NR_userfaultfd
	return -1;
#else
	return syscall(__NR_userfaultfd, flags);
#endif
}
64462306a36Sopenharmony_ci
/*
 * Create a userfaultfd, trying the system call first and falling back to
 * /dev/userfaultfd if that fails.
 *
 * @flags: forwarded to uffd_open_sys() / uffd_open_dev().
 *
 * Returns the new descriptor, or a negative value if both methods fail.
 */
int uffd_open(unsigned int flags)
{
	int fd = uffd_open_sys(flags);

	return fd >= 0 ? fd : uffd_open_dev(flags);
}
65462306a36Sopenharmony_ci
/*
 * Query the userfaultfd features reported by the running kernel.
 *
 * A temporary userfaultfd is opened (user-mode-only first, then with no
 * flags), UFFDIO_API is issued with an empty feature request, and the
 * descriptor is closed again.
 *
 * @features: receives the feature bits reported by the kernel; left
 *            untouched on failure.
 *
 * Returns 0 on success, the negative value from uffd_open() if no
 * descriptor could be opened, or -errno of a failing UFFDIO_API ioctl.
 */
int uffd_get_features(uint64_t *features)
{
	struct uffdio_api uffdio_api = { .api = UFFD_API, .features = 0 };
	/*
	 * This should by default work in most kernels; the feature list
	 * will be the same no matter what we pass in here.
	 */
	int fd = uffd_open(UFFD_USER_MODE_ONLY);
	int ret = 0;

	if (fd < 0)
		/* Maybe the kernel is older than user-only mode? */
		fd = uffd_open(0);

	if (fd < 0)
		return fd;

	/* Record errno before close(), which is allowed to overwrite it. */
	if (ioctl(fd, UFFDIO_API, &uffdio_api))
		ret = -errno;
	else
		*features = uffdio_api.features;

	close(fd);

	return ret;
}
681