162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-only 262306a36Sopenharmony_ci/* 362306a36Sopenharmony_ci * Userfaultfd tests util functions 462306a36Sopenharmony_ci * 562306a36Sopenharmony_ci * Copyright (C) 2015-2023 Red Hat, Inc. 662306a36Sopenharmony_ci */ 762306a36Sopenharmony_ci 862306a36Sopenharmony_ci#include "uffd-common.h" 962306a36Sopenharmony_ci 1062306a36Sopenharmony_ci#define BASE_PMD_ADDR ((void *)(1UL << 30)) 1162306a36Sopenharmony_ci 1262306a36Sopenharmony_civolatile bool test_uffdio_copy_eexist = true; 1362306a36Sopenharmony_ciunsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size; 1462306a36Sopenharmony_cichar *area_src, *area_src_alias, *area_dst, *area_dst_alias, *area_remap; 1562306a36Sopenharmony_ciint uffd = -1, uffd_flags, finished, *pipefd, test_type; 1662306a36Sopenharmony_cibool map_shared; 1762306a36Sopenharmony_cibool test_uffdio_wp = true; 1862306a36Sopenharmony_ciunsigned long long *count_verify; 1962306a36Sopenharmony_ciuffd_test_ops_t *uffd_test_ops; 2062306a36Sopenharmony_ci 2162306a36Sopenharmony_cistatic int uffd_mem_fd_create(off_t mem_size, bool hugetlb) 2262306a36Sopenharmony_ci{ 2362306a36Sopenharmony_ci unsigned int memfd_flags = 0; 2462306a36Sopenharmony_ci int mem_fd; 2562306a36Sopenharmony_ci 2662306a36Sopenharmony_ci if (hugetlb) 2762306a36Sopenharmony_ci memfd_flags = MFD_HUGETLB; 2862306a36Sopenharmony_ci mem_fd = memfd_create("uffd-test", memfd_flags); 2962306a36Sopenharmony_ci if (mem_fd < 0) 3062306a36Sopenharmony_ci err("memfd_create"); 3162306a36Sopenharmony_ci if (ftruncate(mem_fd, mem_size)) 3262306a36Sopenharmony_ci err("ftruncate"); 3362306a36Sopenharmony_ci if (fallocate(mem_fd, 3462306a36Sopenharmony_ci FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, 3562306a36Sopenharmony_ci mem_size)) 3662306a36Sopenharmony_ci err("fallocate"); 3762306a36Sopenharmony_ci 3862306a36Sopenharmony_ci return mem_fd; 3962306a36Sopenharmony_ci} 4062306a36Sopenharmony_ci 4162306a36Sopenharmony_cistatic void anon_release_pages(char *rel_area) 4262306a36Sopenharmony_ci{ 4362306a36Sopenharmony_ci if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED)) 4462306a36Sopenharmony_ci err("madvise(MADV_DONTNEED) failed"); 4562306a36Sopenharmony_ci} 4662306a36Sopenharmony_ci 4762306a36Sopenharmony_cistatic int anon_allocate_area(void **alloc_area, bool is_src) 4862306a36Sopenharmony_ci{ 4962306a36Sopenharmony_ci *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, 5062306a36Sopenharmony_ci MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); 5162306a36Sopenharmony_ci if (*alloc_area == MAP_FAILED) { 5262306a36Sopenharmony_ci *alloc_area = NULL; 5362306a36Sopenharmony_ci return -errno; 5462306a36Sopenharmony_ci } 5562306a36Sopenharmony_ci return 0; 5662306a36Sopenharmony_ci} 5762306a36Sopenharmony_ci 5862306a36Sopenharmony_cistatic void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset) 5962306a36Sopenharmony_ci{ 6062306a36Sopenharmony_ci} 6162306a36Sopenharmony_ci 6262306a36Sopenharmony_cistatic void hugetlb_release_pages(char *rel_area) 6362306a36Sopenharmony_ci{ 6462306a36Sopenharmony_ci if (!map_shared) { 6562306a36Sopenharmony_ci if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED)) 6662306a36Sopenharmony_ci err("madvise(MADV_DONTNEED) failed"); 6762306a36Sopenharmony_ci } else { 6862306a36Sopenharmony_ci if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE)) 6962306a36Sopenharmony_ci err("madvise(MADV_REMOVE) failed"); 7062306a36Sopenharmony_ci } 7162306a36Sopenharmony_ci} 7262306a36Sopenharmony_ci 7362306a36Sopenharmony_cistatic int hugetlb_allocate_area(void **alloc_area, bool is_src) 7462306a36Sopenharmony_ci{ 7562306a36Sopenharmony_ci off_t size = nr_pages * page_size; 7662306a36Sopenharmony_ci off_t offset = is_src ? 0 : size; 7762306a36Sopenharmony_ci void *area_alias = NULL; 7862306a36Sopenharmony_ci char **alloc_area_alias; 7962306a36Sopenharmony_ci int mem_fd = uffd_mem_fd_create(size * 2, true); 8062306a36Sopenharmony_ci 8162306a36Sopenharmony_ci *alloc_area = mmap(NULL, size, PROT_READ | PROT_WRITE, 8262306a36Sopenharmony_ci (map_shared ? MAP_SHARED : MAP_PRIVATE) | 8362306a36Sopenharmony_ci (is_src ? 0 : MAP_NORESERVE), 8462306a36Sopenharmony_ci mem_fd, offset); 8562306a36Sopenharmony_ci if (*alloc_area == MAP_FAILED) { 8662306a36Sopenharmony_ci *alloc_area = NULL; 8762306a36Sopenharmony_ci return -errno; 8862306a36Sopenharmony_ci } 8962306a36Sopenharmony_ci 9062306a36Sopenharmony_ci if (map_shared) { 9162306a36Sopenharmony_ci area_alias = mmap(NULL, size, PROT_READ | PROT_WRITE, 9262306a36Sopenharmony_ci MAP_SHARED, mem_fd, offset); 9362306a36Sopenharmony_ci if (area_alias == MAP_FAILED) 9462306a36Sopenharmony_ci return -errno; 9562306a36Sopenharmony_ci } 9662306a36Sopenharmony_ci 9762306a36Sopenharmony_ci if (is_src) { 9862306a36Sopenharmony_ci alloc_area_alias = &area_src_alias; 9962306a36Sopenharmony_ci } else { 10062306a36Sopenharmony_ci alloc_area_alias = &area_dst_alias; 10162306a36Sopenharmony_ci } 10262306a36Sopenharmony_ci if (area_alias) 10362306a36Sopenharmony_ci *alloc_area_alias = area_alias; 10462306a36Sopenharmony_ci 10562306a36Sopenharmony_ci close(mem_fd); 10662306a36Sopenharmony_ci return 0; 10762306a36Sopenharmony_ci} 10862306a36Sopenharmony_ci 10962306a36Sopenharmony_cistatic void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset) 11062306a36Sopenharmony_ci{ 11162306a36Sopenharmony_ci if (!map_shared) 11262306a36Sopenharmony_ci return; 11362306a36Sopenharmony_ci 11462306a36Sopenharmony_ci *start = (unsigned long) area_dst_alias + offset; 11562306a36Sopenharmony_ci} 11662306a36Sopenharmony_ci 11762306a36Sopenharmony_cistatic void shmem_release_pages(char *rel_area) 11862306a36Sopenharmony_ci{ 11962306a36Sopenharmony_ci if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE)) 12062306a36Sopenharmony_ci err("madvise(MADV_REMOVE) failed"); 12162306a36Sopenharmony_ci} 12262306a36Sopenharmony_ci 12362306a36Sopenharmony_cistatic int shmem_allocate_area(void **alloc_area, bool is_src) 12462306a36Sopenharmony_ci{ 12562306a36Sopenharmony_ci void *area_alias = NULL; 12662306a36Sopenharmony_ci size_t bytes = nr_pages * page_size, hpage_size = read_pmd_pagesize(); 12762306a36Sopenharmony_ci unsigned long offset = is_src ? 0 : bytes; 12862306a36Sopenharmony_ci char *p = NULL, *p_alias = NULL; 12962306a36Sopenharmony_ci int mem_fd = uffd_mem_fd_create(bytes * 2, false); 13062306a36Sopenharmony_ci 13162306a36Sopenharmony_ci /* TODO: clean this up. Use a static addr is ugly */ 13262306a36Sopenharmony_ci p = BASE_PMD_ADDR; 13362306a36Sopenharmony_ci if (!is_src) 13462306a36Sopenharmony_ci /* src map + alias + interleaved hpages */ 13562306a36Sopenharmony_ci p += 2 * (bytes + hpage_size); 13662306a36Sopenharmony_ci p_alias = p; 13762306a36Sopenharmony_ci p_alias += bytes; 13862306a36Sopenharmony_ci p_alias += hpage_size; /* Prevent src/dst VMA merge */ 13962306a36Sopenharmony_ci 14062306a36Sopenharmony_ci *alloc_area = mmap(p, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, 14162306a36Sopenharmony_ci mem_fd, offset); 14262306a36Sopenharmony_ci if (*alloc_area == MAP_FAILED) { 14362306a36Sopenharmony_ci *alloc_area = NULL; 14462306a36Sopenharmony_ci return -errno; 14562306a36Sopenharmony_ci } 14662306a36Sopenharmony_ci if (*alloc_area != p) 14762306a36Sopenharmony_ci err("mmap of memfd failed at %p", p); 14862306a36Sopenharmony_ci 14962306a36Sopenharmony_ci area_alias = mmap(p_alias, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, 15062306a36Sopenharmony_ci mem_fd, offset); 15162306a36Sopenharmony_ci if (area_alias == MAP_FAILED) { 15262306a36Sopenharmony_ci munmap(*alloc_area, bytes); 15362306a36Sopenharmony_ci *alloc_area = NULL; 15462306a36Sopenharmony_ci return -errno; 15562306a36Sopenharmony_ci } 15662306a36Sopenharmony_ci if (area_alias != p_alias) 15762306a36Sopenharmony_ci err("mmap of anonymous memory failed at %p", p_alias); 15862306a36Sopenharmony_ci 15962306a36Sopenharmony_ci if (is_src) 16062306a36Sopenharmony_ci area_src_alias = area_alias; 16162306a36Sopenharmony_ci else 16262306a36Sopenharmony_ci area_dst_alias = area_alias; 16362306a36Sopenharmony_ci 16462306a36Sopenharmony_ci close(mem_fd); 16562306a36Sopenharmony_ci return 0; 16662306a36Sopenharmony_ci} 16762306a36Sopenharmony_ci 16862306a36Sopenharmony_cistatic void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset) 16962306a36Sopenharmony_ci{ 17062306a36Sopenharmony_ci *start = (unsigned long)area_dst_alias + offset; 17162306a36Sopenharmony_ci} 17262306a36Sopenharmony_ci 17362306a36Sopenharmony_cistatic void shmem_check_pmd_mapping(void *p, int expect_nr_hpages) 17462306a36Sopenharmony_ci{ 17562306a36Sopenharmony_ci if (!check_huge_shmem(area_dst_alias, expect_nr_hpages, 17662306a36Sopenharmony_ci read_pmd_pagesize())) 17762306a36Sopenharmony_ci err("Did not find expected %d number of hugepages", 17862306a36Sopenharmony_ci expect_nr_hpages); 17962306a36Sopenharmony_ci} 18062306a36Sopenharmony_ci 18162306a36Sopenharmony_cistruct uffd_test_ops anon_uffd_test_ops = { 18262306a36Sopenharmony_ci .allocate_area = anon_allocate_area, 18362306a36Sopenharmony_ci .release_pages = anon_release_pages, 18462306a36Sopenharmony_ci .alias_mapping = noop_alias_mapping, 18562306a36Sopenharmony_ci .check_pmd_mapping = NULL, 18662306a36Sopenharmony_ci}; 18762306a36Sopenharmony_ci 18862306a36Sopenharmony_cistruct uffd_test_ops shmem_uffd_test_ops = { 18962306a36Sopenharmony_ci .allocate_area = shmem_allocate_area, 19062306a36Sopenharmony_ci .release_pages = shmem_release_pages, 19162306a36Sopenharmony_ci .alias_mapping = shmem_alias_mapping, 19262306a36Sopenharmony_ci .check_pmd_mapping = shmem_check_pmd_mapping, 19362306a36Sopenharmony_ci}; 19462306a36Sopenharmony_ci 19562306a36Sopenharmony_cistruct uffd_test_ops hugetlb_uffd_test_ops = { 19662306a36Sopenharmony_ci .allocate_area = hugetlb_allocate_area, 19762306a36Sopenharmony_ci .release_pages = hugetlb_release_pages, 19862306a36Sopenharmony_ci .alias_mapping = hugetlb_alias_mapping, 19962306a36Sopenharmony_ci .check_pmd_mapping = NULL, 20062306a36Sopenharmony_ci}; 20162306a36Sopenharmony_ci 20262306a36Sopenharmony_civoid uffd_stats_report(struct uffd_args *args, int n_cpus) 20362306a36Sopenharmony_ci{ 20462306a36Sopenharmony_ci int i; 20562306a36Sopenharmony_ci unsigned long long miss_total = 0, wp_total = 0, minor_total = 0; 20662306a36Sopenharmony_ci 20762306a36Sopenharmony_ci for (i = 0; i < n_cpus; i++) { 20862306a36Sopenharmony_ci miss_total += args[i].missing_faults; 20962306a36Sopenharmony_ci wp_total += args[i].wp_faults; 21062306a36Sopenharmony_ci minor_total += args[i].minor_faults; 21162306a36Sopenharmony_ci } 21262306a36Sopenharmony_ci 21362306a36Sopenharmony_ci printf("userfaults: "); 21462306a36Sopenharmony_ci if (miss_total) { 21562306a36Sopenharmony_ci printf("%llu missing (", miss_total); 21662306a36Sopenharmony_ci for (i = 0; i < n_cpus; i++) 21762306a36Sopenharmony_ci printf("%lu+", args[i].missing_faults); 21862306a36Sopenharmony_ci printf("\b) "); 21962306a36Sopenharmony_ci } 22062306a36Sopenharmony_ci if (wp_total) { 22162306a36Sopenharmony_ci printf("%llu wp (", wp_total); 22262306a36Sopenharmony_ci for (i = 0; i < n_cpus; i++) 22362306a36Sopenharmony_ci printf("%lu+", args[i].wp_faults); 22462306a36Sopenharmony_ci printf("\b) "); 22562306a36Sopenharmony_ci } 22662306a36Sopenharmony_ci if (minor_total) { 22762306a36Sopenharmony_ci printf("%llu minor (", minor_total); 22862306a36Sopenharmony_ci for (i = 0; i < n_cpus; i++) 22962306a36Sopenharmony_ci printf("%lu+", args[i].minor_faults); 23062306a36Sopenharmony_ci printf("\b)"); 23162306a36Sopenharmony_ci } 23262306a36Sopenharmony_ci printf("\n"); 23362306a36Sopenharmony_ci} 23462306a36Sopenharmony_ci 23562306a36Sopenharmony_ciint userfaultfd_open(uint64_t *features) 23662306a36Sopenharmony_ci{ 23762306a36Sopenharmony_ci struct uffdio_api uffdio_api; 23862306a36Sopenharmony_ci 23962306a36Sopenharmony_ci uffd = uffd_open(UFFD_FLAGS); 24062306a36Sopenharmony_ci if (uffd < 0) 24162306a36Sopenharmony_ci return -1; 24262306a36Sopenharmony_ci uffd_flags = fcntl(uffd, F_GETFD, NULL); 24362306a36Sopenharmony_ci 24462306a36Sopenharmony_ci uffdio_api.api = UFFD_API; 24562306a36Sopenharmony_ci uffdio_api.features = *features; 24662306a36Sopenharmony_ci if (ioctl(uffd, UFFDIO_API, &uffdio_api)) 24762306a36Sopenharmony_ci /* Probably lack of CAP_PTRACE? */ 24862306a36Sopenharmony_ci return -1; 24962306a36Sopenharmony_ci if (uffdio_api.api != UFFD_API) 25062306a36Sopenharmony_ci err("UFFDIO_API error: %" PRIu64, (uint64_t)uffdio_api.api); 25162306a36Sopenharmony_ci 25262306a36Sopenharmony_ci *features = uffdio_api.features; 25362306a36Sopenharmony_ci return 0; 25462306a36Sopenharmony_ci} 25562306a36Sopenharmony_ci 25662306a36Sopenharmony_cistatic inline void munmap_area(void **area) 25762306a36Sopenharmony_ci{ 25862306a36Sopenharmony_ci if (*area) 25962306a36Sopenharmony_ci if (munmap(*area, nr_pages * page_size)) 26062306a36Sopenharmony_ci err("munmap"); 26162306a36Sopenharmony_ci 26262306a36Sopenharmony_ci *area = NULL; 26362306a36Sopenharmony_ci} 26462306a36Sopenharmony_ci 26562306a36Sopenharmony_cistatic void uffd_test_ctx_clear(void) 26662306a36Sopenharmony_ci{ 26762306a36Sopenharmony_ci size_t i; 26862306a36Sopenharmony_ci 26962306a36Sopenharmony_ci if (pipefd) { 27062306a36Sopenharmony_ci for (i = 0; i < nr_cpus * 2; ++i) { 27162306a36Sopenharmony_ci if (close(pipefd[i])) 27262306a36Sopenharmony_ci err("close pipefd"); 27362306a36Sopenharmony_ci } 27462306a36Sopenharmony_ci free(pipefd); 27562306a36Sopenharmony_ci pipefd = NULL; 27662306a36Sopenharmony_ci } 27762306a36Sopenharmony_ci 27862306a36Sopenharmony_ci if (count_verify) { 27962306a36Sopenharmony_ci free(count_verify); 28062306a36Sopenharmony_ci count_verify = NULL; 28162306a36Sopenharmony_ci } 28262306a36Sopenharmony_ci 28362306a36Sopenharmony_ci if (uffd != -1) { 28462306a36Sopenharmony_ci if (close(uffd)) 28562306a36Sopenharmony_ci err("close uffd"); 28662306a36Sopenharmony_ci uffd = -1; 28762306a36Sopenharmony_ci } 28862306a36Sopenharmony_ci 28962306a36Sopenharmony_ci munmap_area((void **)&area_src); 29062306a36Sopenharmony_ci munmap_area((void **)&area_src_alias); 29162306a36Sopenharmony_ci munmap_area((void **)&area_dst); 29262306a36Sopenharmony_ci munmap_area((void **)&area_dst_alias); 29362306a36Sopenharmony_ci munmap_area((void **)&area_remap); 29462306a36Sopenharmony_ci} 29562306a36Sopenharmony_ci 29662306a36Sopenharmony_ciint uffd_test_ctx_init(uint64_t features, const char **errmsg) 29762306a36Sopenharmony_ci{ 29862306a36Sopenharmony_ci unsigned long nr, cpu; 29962306a36Sopenharmony_ci int ret; 30062306a36Sopenharmony_ci 30162306a36Sopenharmony_ci uffd_test_ctx_clear(); 30262306a36Sopenharmony_ci 30362306a36Sopenharmony_ci ret = uffd_test_ops->allocate_area((void **)&area_src, true); 30462306a36Sopenharmony_ci ret |= uffd_test_ops->allocate_area((void **)&area_dst, false); 30562306a36Sopenharmony_ci if (ret) { 30662306a36Sopenharmony_ci if (errmsg) 30762306a36Sopenharmony_ci *errmsg = "memory allocation failed"; 30862306a36Sopenharmony_ci return ret; 30962306a36Sopenharmony_ci } 31062306a36Sopenharmony_ci 31162306a36Sopenharmony_ci ret = userfaultfd_open(&features); 31262306a36Sopenharmony_ci if (ret) { 31362306a36Sopenharmony_ci if (errmsg) 31462306a36Sopenharmony_ci *errmsg = "possible lack of priviledge"; 31562306a36Sopenharmony_ci return ret; 31662306a36Sopenharmony_ci } 31762306a36Sopenharmony_ci 31862306a36Sopenharmony_ci count_verify = malloc(nr_pages * sizeof(unsigned long long)); 31962306a36Sopenharmony_ci if (!count_verify) 32062306a36Sopenharmony_ci err("count_verify"); 32162306a36Sopenharmony_ci 32262306a36Sopenharmony_ci for (nr = 0; nr < nr_pages; nr++) { 32362306a36Sopenharmony_ci *area_mutex(area_src, nr) = 32462306a36Sopenharmony_ci (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER; 32562306a36Sopenharmony_ci count_verify[nr] = *area_count(area_src, nr) = 1; 32662306a36Sopenharmony_ci /* 32762306a36Sopenharmony_ci * In the transition between 255 to 256, powerpc will 32862306a36Sopenharmony_ci * read out of order in my_bcmp and see both bytes as 32962306a36Sopenharmony_ci * zero, so leave a placeholder below always non-zero 33062306a36Sopenharmony_ci * after the count, to avoid my_bcmp to trigger false 33162306a36Sopenharmony_ci * positives. 33262306a36Sopenharmony_ci */ 33362306a36Sopenharmony_ci *(area_count(area_src, nr) + 1) = 1; 33462306a36Sopenharmony_ci } 33562306a36Sopenharmony_ci 33662306a36Sopenharmony_ci /* 33762306a36Sopenharmony_ci * After initialization of area_src, we must explicitly release pages 33862306a36Sopenharmony_ci * for area_dst to make sure it's fully empty. Otherwise we could have 33962306a36Sopenharmony_ci * some area_dst pages be errornously initialized with zero pages, 34062306a36Sopenharmony_ci * hence we could hit memory corruption later in the test. 34162306a36Sopenharmony_ci * 34262306a36Sopenharmony_ci * One example is when THP is globally enabled, above allocate_area() 34362306a36Sopenharmony_ci * calls could have the two areas merged into a single VMA (as they 34462306a36Sopenharmony_ci * will have the same VMA flags so they're mergeable). When we 34562306a36Sopenharmony_ci * initialize the area_src above, it's possible that some part of 34662306a36Sopenharmony_ci * area_dst could have been faulted in via one huge THP that will be 34762306a36Sopenharmony_ci * shared between area_src and area_dst. It could cause some of the 34862306a36Sopenharmony_ci * area_dst won't be trapped by missing userfaults. 34962306a36Sopenharmony_ci * 35062306a36Sopenharmony_ci * This release_pages() will guarantee even if that happened, we'll 35162306a36Sopenharmony_ci * proactively split the thp and drop any accidentally initialized 35262306a36Sopenharmony_ci * pages within area_dst. 35362306a36Sopenharmony_ci */ 35462306a36Sopenharmony_ci uffd_test_ops->release_pages(area_dst); 35562306a36Sopenharmony_ci 35662306a36Sopenharmony_ci pipefd = malloc(sizeof(int) * nr_cpus * 2); 35762306a36Sopenharmony_ci if (!pipefd) 35862306a36Sopenharmony_ci err("pipefd"); 35962306a36Sopenharmony_ci for (cpu = 0; cpu < nr_cpus; cpu++) 36062306a36Sopenharmony_ci if (pipe2(&pipefd[cpu * 2], O_CLOEXEC | O_NONBLOCK)) 36162306a36Sopenharmony_ci err("pipe"); 36262306a36Sopenharmony_ci 36362306a36Sopenharmony_ci return 0; 36462306a36Sopenharmony_ci} 36562306a36Sopenharmony_ci 36662306a36Sopenharmony_civoid wp_range(int ufd, __u64 start, __u64 len, bool wp) 36762306a36Sopenharmony_ci{ 36862306a36Sopenharmony_ci struct uffdio_writeprotect prms; 36962306a36Sopenharmony_ci 37062306a36Sopenharmony_ci /* Write protection page faults */ 37162306a36Sopenharmony_ci prms.range.start = start; 37262306a36Sopenharmony_ci prms.range.len = len; 37362306a36Sopenharmony_ci /* Undo write-protect, do wakeup after that */ 37462306a36Sopenharmony_ci prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0; 37562306a36Sopenharmony_ci 37662306a36Sopenharmony_ci if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms)) 37762306a36Sopenharmony_ci err("clear WP failed: address=0x%"PRIx64, (uint64_t)start); 37862306a36Sopenharmony_ci} 37962306a36Sopenharmony_ci 38062306a36Sopenharmony_cistatic void continue_range(int ufd, __u64 start, __u64 len, bool wp) 38162306a36Sopenharmony_ci{ 38262306a36Sopenharmony_ci struct uffdio_continue req; 38362306a36Sopenharmony_ci int ret; 38462306a36Sopenharmony_ci 38562306a36Sopenharmony_ci req.range.start = start; 38662306a36Sopenharmony_ci req.range.len = len; 38762306a36Sopenharmony_ci req.mode = 0; 38862306a36Sopenharmony_ci if (wp) 38962306a36Sopenharmony_ci req.mode |= UFFDIO_CONTINUE_MODE_WP; 39062306a36Sopenharmony_ci 39162306a36Sopenharmony_ci if (ioctl(ufd, UFFDIO_CONTINUE, &req)) 39262306a36Sopenharmony_ci err("UFFDIO_CONTINUE failed for address 0x%" PRIx64, 39362306a36Sopenharmony_ci (uint64_t)start); 39462306a36Sopenharmony_ci 39562306a36Sopenharmony_ci /* 39662306a36Sopenharmony_ci * Error handling within the kernel for continue is subtly different 39762306a36Sopenharmony_ci * from copy or zeropage, so it may be a source of bugs. Trigger an 39862306a36Sopenharmony_ci * error (-EEXIST) on purpose, to verify doing so doesn't cause a BUG. 39962306a36Sopenharmony_ci */ 40062306a36Sopenharmony_ci req.mapped = 0; 40162306a36Sopenharmony_ci ret = ioctl(ufd, UFFDIO_CONTINUE, &req); 40262306a36Sopenharmony_ci if (ret >= 0 || req.mapped != -EEXIST) 40362306a36Sopenharmony_ci err("failed to exercise UFFDIO_CONTINUE error handling, ret=%d, mapped=%" PRId64, 40462306a36Sopenharmony_ci ret, (int64_t) req.mapped); 40562306a36Sopenharmony_ci} 40662306a36Sopenharmony_ci 40762306a36Sopenharmony_ciint uffd_read_msg(int ufd, struct uffd_msg *msg) 40862306a36Sopenharmony_ci{ 40962306a36Sopenharmony_ci int ret = read(uffd, msg, sizeof(*msg)); 41062306a36Sopenharmony_ci 41162306a36Sopenharmony_ci if (ret != sizeof(*msg)) { 41262306a36Sopenharmony_ci if (ret < 0) { 41362306a36Sopenharmony_ci if (errno == EAGAIN || errno == EINTR) 41462306a36Sopenharmony_ci return 1; 41562306a36Sopenharmony_ci err("blocking read error"); 41662306a36Sopenharmony_ci } else { 41762306a36Sopenharmony_ci err("short read"); 41862306a36Sopenharmony_ci } 41962306a36Sopenharmony_ci } 42062306a36Sopenharmony_ci 42162306a36Sopenharmony_ci return 0; 42262306a36Sopenharmony_ci} 42362306a36Sopenharmony_ci 42462306a36Sopenharmony_civoid uffd_handle_page_fault(struct uffd_msg *msg, struct uffd_args *args) 42562306a36Sopenharmony_ci{ 42662306a36Sopenharmony_ci unsigned long offset; 42762306a36Sopenharmony_ci 42862306a36Sopenharmony_ci if (msg->event != UFFD_EVENT_PAGEFAULT) 42962306a36Sopenharmony_ci err("unexpected msg event %u", msg->event); 43062306a36Sopenharmony_ci 43162306a36Sopenharmony_ci if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) { 43262306a36Sopenharmony_ci /* Write protect page faults */ 43362306a36Sopenharmony_ci wp_range(uffd, msg->arg.pagefault.address, page_size, false); 43462306a36Sopenharmony_ci args->wp_faults++; 43562306a36Sopenharmony_ci } else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) { 43662306a36Sopenharmony_ci uint8_t *area; 43762306a36Sopenharmony_ci int b; 43862306a36Sopenharmony_ci 43962306a36Sopenharmony_ci /* 44062306a36Sopenharmony_ci * Minor page faults 44162306a36Sopenharmony_ci * 44262306a36Sopenharmony_ci * To prove we can modify the original range for testing 44362306a36Sopenharmony_ci * purposes, we're going to bit flip this range before 44462306a36Sopenharmony_ci * continuing. 44562306a36Sopenharmony_ci * 44662306a36Sopenharmony_ci * Note that this requires all minor page fault tests operate on 44762306a36Sopenharmony_ci * area_dst (non-UFFD-registered) and area_dst_alias 44862306a36Sopenharmony_ci * (UFFD-registered). 44962306a36Sopenharmony_ci */ 45062306a36Sopenharmony_ci 45162306a36Sopenharmony_ci area = (uint8_t *)(area_dst + 45262306a36Sopenharmony_ci ((char *)msg->arg.pagefault.address - 45362306a36Sopenharmony_ci area_dst_alias)); 45462306a36Sopenharmony_ci for (b = 0; b < page_size; ++b) 45562306a36Sopenharmony_ci area[b] = ~area[b]; 45662306a36Sopenharmony_ci continue_range(uffd, msg->arg.pagefault.address, page_size, 45762306a36Sopenharmony_ci args->apply_wp); 45862306a36Sopenharmony_ci args->minor_faults++; 45962306a36Sopenharmony_ci } else { 46062306a36Sopenharmony_ci /* 46162306a36Sopenharmony_ci * Missing page faults. 46262306a36Sopenharmony_ci * 46362306a36Sopenharmony_ci * Here we force a write check for each of the missing mode 46462306a36Sopenharmony_ci * faults. It's guaranteed because the only threads that 46562306a36Sopenharmony_ci * will trigger uffd faults are the locking threads, and 46662306a36Sopenharmony_ci * their first instruction to touch the missing page will 46762306a36Sopenharmony_ci * always be pthread_mutex_lock(). 46862306a36Sopenharmony_ci * 46962306a36Sopenharmony_ci * Note that here we relied on an NPTL glibc impl detail to 47062306a36Sopenharmony_ci * always read the lock type at the entry of the lock op 47162306a36Sopenharmony_ci * (pthread_mutex_t.__data.__type, offset 0x10) before 47262306a36Sopenharmony_ci * doing any locking operations to guarantee that. It's 47362306a36Sopenharmony_ci * actually not good to rely on this impl detail because 47462306a36Sopenharmony_ci * logically a pthread-compatible lib can implement the 47562306a36Sopenharmony_ci * locks without types and we can fail when linking with 47662306a36Sopenharmony_ci * them. However since we used to find bugs with this 47762306a36Sopenharmony_ci * strict check we still keep it around. Hopefully this 47862306a36Sopenharmony_ci * could be a good hint when it fails again. If one day 47962306a36Sopenharmony_ci * it'll break on some other impl of glibc we'll revisit. 48062306a36Sopenharmony_ci */ 48162306a36Sopenharmony_ci if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) 48262306a36Sopenharmony_ci err("unexpected write fault"); 48362306a36Sopenharmony_ci 48462306a36Sopenharmony_ci offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst; 48562306a36Sopenharmony_ci offset &= ~(page_size-1); 48662306a36Sopenharmony_ci 48762306a36Sopenharmony_ci if (copy_page(uffd, offset, args->apply_wp)) 48862306a36Sopenharmony_ci args->missing_faults++; 48962306a36Sopenharmony_ci } 49062306a36Sopenharmony_ci} 49162306a36Sopenharmony_ci 49262306a36Sopenharmony_civoid *uffd_poll_thread(void *arg) 49362306a36Sopenharmony_ci{ 49462306a36Sopenharmony_ci struct uffd_args *args = (struct uffd_args *)arg; 49562306a36Sopenharmony_ci unsigned long cpu = args->cpu; 49662306a36Sopenharmony_ci struct pollfd pollfd[2]; 49762306a36Sopenharmony_ci struct uffd_msg msg; 49862306a36Sopenharmony_ci struct uffdio_register uffd_reg; 49962306a36Sopenharmony_ci int ret; 50062306a36Sopenharmony_ci char tmp_chr; 50162306a36Sopenharmony_ci 50262306a36Sopenharmony_ci if (!args->handle_fault) 50362306a36Sopenharmony_ci args->handle_fault = uffd_handle_page_fault; 50462306a36Sopenharmony_ci 50562306a36Sopenharmony_ci pollfd[0].fd = uffd; 50662306a36Sopenharmony_ci pollfd[0].events = POLLIN; 50762306a36Sopenharmony_ci pollfd[1].fd = pipefd[cpu*2]; 50862306a36Sopenharmony_ci pollfd[1].events = POLLIN; 50962306a36Sopenharmony_ci 51062306a36Sopenharmony_ci for (;;) { 51162306a36Sopenharmony_ci ret = poll(pollfd, 2, -1); 51262306a36Sopenharmony_ci if (ret <= 0) { 51362306a36Sopenharmony_ci if (errno == EINTR || errno == EAGAIN) 51462306a36Sopenharmony_ci continue; 51562306a36Sopenharmony_ci err("poll error: %d", ret); 51662306a36Sopenharmony_ci } 51762306a36Sopenharmony_ci if (pollfd[1].revents) { 51862306a36Sopenharmony_ci if (!(pollfd[1].revents & POLLIN)) 51962306a36Sopenharmony_ci err("pollfd[1].revents %d", pollfd[1].revents); 52062306a36Sopenharmony_ci if (read(pollfd[1].fd, &tmp_chr, 1) != 1) 52162306a36Sopenharmony_ci err("read pipefd error"); 52262306a36Sopenharmony_ci break; 52362306a36Sopenharmony_ci } 52462306a36Sopenharmony_ci if (!(pollfd[0].revents & POLLIN)) 52562306a36Sopenharmony_ci err("pollfd[0].revents %d", pollfd[0].revents); 52662306a36Sopenharmony_ci if (uffd_read_msg(uffd, &msg)) 52762306a36Sopenharmony_ci continue; 52862306a36Sopenharmony_ci switch (msg.event) { 52962306a36Sopenharmony_ci default: 53062306a36Sopenharmony_ci err("unexpected msg event %u\n", msg.event); 53162306a36Sopenharmony_ci break; 53262306a36Sopenharmony_ci case UFFD_EVENT_PAGEFAULT: 53362306a36Sopenharmony_ci args->handle_fault(&msg, args); 53462306a36Sopenharmony_ci break; 53562306a36Sopenharmony_ci case UFFD_EVENT_FORK: 53662306a36Sopenharmony_ci close(uffd); 53762306a36Sopenharmony_ci uffd = msg.arg.fork.ufd; 53862306a36Sopenharmony_ci pollfd[0].fd = uffd; 53962306a36Sopenharmony_ci break; 54062306a36Sopenharmony_ci case UFFD_EVENT_REMOVE: 54162306a36Sopenharmony_ci uffd_reg.range.start = msg.arg.remove.start; 54262306a36Sopenharmony_ci uffd_reg.range.len = msg.arg.remove.end - 54362306a36Sopenharmony_ci msg.arg.remove.start; 54462306a36Sopenharmony_ci if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range)) 54562306a36Sopenharmony_ci err("remove failure"); 54662306a36Sopenharmony_ci break; 54762306a36Sopenharmony_ci case UFFD_EVENT_REMAP: 54862306a36Sopenharmony_ci area_remap = area_dst; /* save for later unmap */ 54962306a36Sopenharmony_ci area_dst = (char *)(unsigned long)msg.arg.remap.to; 55062306a36Sopenharmony_ci break; 55162306a36Sopenharmony_ci } 55262306a36Sopenharmony_ci } 55362306a36Sopenharmony_ci 55462306a36Sopenharmony_ci return NULL; 55562306a36Sopenharmony_ci} 55662306a36Sopenharmony_ci 55762306a36Sopenharmony_cistatic void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy, 55862306a36Sopenharmony_ci unsigned long offset) 55962306a36Sopenharmony_ci{ 56062306a36Sopenharmony_ci uffd_test_ops->alias_mapping(&uffdio_copy->dst, 56162306a36Sopenharmony_ci uffdio_copy->len, 56262306a36Sopenharmony_ci offset); 56362306a36Sopenharmony_ci if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) { 56462306a36Sopenharmony_ci /* real retval in ufdio_copy.copy */ 56562306a36Sopenharmony_ci if (uffdio_copy->copy != -EEXIST) 56662306a36Sopenharmony_ci err("UFFDIO_COPY retry error: %"PRId64, 56762306a36Sopenharmony_ci (int64_t)uffdio_copy->copy); 56862306a36Sopenharmony_ci } else { 56962306a36Sopenharmony_ci err("UFFDIO_COPY retry unexpected: %"PRId64, 57062306a36Sopenharmony_ci (int64_t)uffdio_copy->copy); 57162306a36Sopenharmony_ci } 57262306a36Sopenharmony_ci} 57362306a36Sopenharmony_ci 57462306a36Sopenharmony_cistatic void wake_range(int ufd, unsigned long addr, unsigned long len) 57562306a36Sopenharmony_ci{ 57662306a36Sopenharmony_ci struct uffdio_range uffdio_wake; 57762306a36Sopenharmony_ci 57862306a36Sopenharmony_ci uffdio_wake.start = addr; 57962306a36Sopenharmony_ci uffdio_wake.len = len; 58062306a36Sopenharmony_ci 58162306a36Sopenharmony_ci if (ioctl(ufd, UFFDIO_WAKE, &uffdio_wake)) 58262306a36Sopenharmony_ci fprintf(stderr, "error waking %lu\n", 58362306a36Sopenharmony_ci addr), exit(1); 58462306a36Sopenharmony_ci} 58562306a36Sopenharmony_ci 58662306a36Sopenharmony_ciint __copy_page(int ufd, unsigned long offset, bool retry, bool wp) 58762306a36Sopenharmony_ci{ 58862306a36Sopenharmony_ci struct uffdio_copy uffdio_copy; 58962306a36Sopenharmony_ci 59062306a36Sopenharmony_ci if (offset >= nr_pages * page_size) 59162306a36Sopenharmony_ci err("unexpected offset %lu\n", offset); 59262306a36Sopenharmony_ci uffdio_copy.dst = (unsigned long) area_dst + offset; 59362306a36Sopenharmony_ci uffdio_copy.src = (unsigned long) area_src + offset; 59462306a36Sopenharmony_ci uffdio_copy.len = page_size; 59562306a36Sopenharmony_ci if (wp) 59662306a36Sopenharmony_ci uffdio_copy.mode = UFFDIO_COPY_MODE_WP; 59762306a36Sopenharmony_ci else 59862306a36Sopenharmony_ci uffdio_copy.mode = 0; 59962306a36Sopenharmony_ci uffdio_copy.copy = 0; 60062306a36Sopenharmony_ci if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) { 60162306a36Sopenharmony_ci /* real retval in ufdio_copy.copy */ 60262306a36Sopenharmony_ci if (uffdio_copy.copy != -EEXIST) 60362306a36Sopenharmony_ci err("UFFDIO_COPY error: %"PRId64, 60462306a36Sopenharmony_ci (int64_t)uffdio_copy.copy); 60562306a36Sopenharmony_ci wake_range(ufd, uffdio_copy.dst, page_size); 60662306a36Sopenharmony_ci } else if (uffdio_copy.copy != page_size) { 60762306a36Sopenharmony_ci err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy); 60862306a36Sopenharmony_ci } else { 60962306a36Sopenharmony_ci if (test_uffdio_copy_eexist && retry) { 61062306a36Sopenharmony_ci test_uffdio_copy_eexist = false; 61162306a36Sopenharmony_ci retry_copy_page(ufd, &uffdio_copy, offset); 61262306a36Sopenharmony_ci } 61362306a36Sopenharmony_ci return 1; 61462306a36Sopenharmony_ci } 61562306a36Sopenharmony_ci return 0; 61662306a36Sopenharmony_ci} 61762306a36Sopenharmony_ci 61862306a36Sopenharmony_ciint copy_page(int ufd, unsigned long offset, bool wp) 61962306a36Sopenharmony_ci{ 62062306a36Sopenharmony_ci return __copy_page(ufd, offset, false, wp); 62162306a36Sopenharmony_ci} 62262306a36Sopenharmony_ci 62362306a36Sopenharmony_ciint uffd_open_dev(unsigned int flags) 62462306a36Sopenharmony_ci{ 62562306a36Sopenharmony_ci int fd, uffd; 62662306a36Sopenharmony_ci 62762306a36Sopenharmony_ci fd = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC); 62862306a36Sopenharmony_ci if (fd < 0) 62962306a36Sopenharmony_ci return fd; 63062306a36Sopenharmony_ci uffd = ioctl(fd, USERFAULTFD_IOC_NEW, flags); 63162306a36Sopenharmony_ci close(fd); 63262306a36Sopenharmony_ci 63362306a36Sopenharmony_ci return uffd; 63462306a36Sopenharmony_ci} 63562306a36Sopenharmony_ci 63662306a36Sopenharmony_ciint uffd_open_sys(unsigned int flags) 63762306a36Sopenharmony_ci{ 63862306a36Sopenharmony_ci#ifdef __NR_userfaultfd 63962306a36Sopenharmony_ci return syscall(__NR_userfaultfd, flags); 64062306a36Sopenharmony_ci#else 64162306a36Sopenharmony_ci return -1; 64262306a36Sopenharmony_ci#endif 64362306a36Sopenharmony_ci} 64462306a36Sopenharmony_ci 64562306a36Sopenharmony_ciint uffd_open(unsigned int flags) 64662306a36Sopenharmony_ci{ 64762306a36Sopenharmony_ci int uffd = uffd_open_sys(flags); 64862306a36Sopenharmony_ci 64962306a36Sopenharmony_ci if (uffd < 0) 65062306a36Sopenharmony_ci uffd = uffd_open_dev(flags); 65162306a36Sopenharmony_ci 65262306a36Sopenharmony_ci return uffd; 65362306a36Sopenharmony_ci} 65462306a36Sopenharmony_ci 65562306a36Sopenharmony_ciint uffd_get_features(uint64_t *features) 65662306a36Sopenharmony_ci{ 65762306a36Sopenharmony_ci struct uffdio_api uffdio_api = { .api = UFFD_API, .features = 0 }; 65862306a36Sopenharmony_ci /* 65962306a36Sopenharmony_ci * This should by default work in most kernels; the feature list 66062306a36Sopenharmony_ci * will be the same no matter what we pass in here. 66162306a36Sopenharmony_ci */ 66262306a36Sopenharmony_ci int fd = uffd_open(UFFD_USER_MODE_ONLY); 66362306a36Sopenharmony_ci 66462306a36Sopenharmony_ci if (fd < 0) 66562306a36Sopenharmony_ci /* Maybe the kernel is older than user-only mode? */ 66662306a36Sopenharmony_ci fd = uffd_open(0); 66762306a36Sopenharmony_ci 66862306a36Sopenharmony_ci if (fd < 0) 66962306a36Sopenharmony_ci return fd; 67062306a36Sopenharmony_ci 67162306a36Sopenharmony_ci if (ioctl(fd, UFFDIO_API, &uffdio_api)) { 67262306a36Sopenharmony_ci close(fd); 67362306a36Sopenharmony_ci return -errno; 67462306a36Sopenharmony_ci } 67562306a36Sopenharmony_ci 67662306a36Sopenharmony_ci *features = uffdio_api.features; 67762306a36Sopenharmony_ci close(fd); 67862306a36Sopenharmony_ci 67962306a36Sopenharmony_ci return 0; 68062306a36Sopenharmony_ci} 681