162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0 262306a36Sopenharmony_ci/* 362306a36Sopenharmony_ci * memfd GUP test-case 462306a36Sopenharmony_ci * This tests memfd interactions with get_user_pages(). We require the 562306a36Sopenharmony_ci * fuse_mnt.c program to provide a fake direct-IO FUSE mount-point for us. This 662306a36Sopenharmony_ci * file-system delays _all_ reads by 1s and forces direct-IO. This means, any 762306a36Sopenharmony_ci * read() on files in that file-system will pin the receive-buffer pages for at 862306a36Sopenharmony_ci * least 1s via get_user_pages(). 962306a36Sopenharmony_ci * 1062306a36Sopenharmony_ci * We use this trick to race ADD_SEALS against a write on a memfd object. The 1162306a36Sopenharmony_ci * ADD_SEALS must fail if the memfd pages are still pinned. Note that we use 1262306a36Sopenharmony_ci * the read() syscall with our memory-mapped memfd object as receive buffer to 1362306a36Sopenharmony_ci * force the kernel to write into our memfd object. 1462306a36Sopenharmony_ci */ 1562306a36Sopenharmony_ci 1662306a36Sopenharmony_ci#define _GNU_SOURCE 1762306a36Sopenharmony_ci#define __EXPORTED_HEADERS__ 1862306a36Sopenharmony_ci 1962306a36Sopenharmony_ci#include <errno.h> 2062306a36Sopenharmony_ci#include <inttypes.h> 2162306a36Sopenharmony_ci#include <limits.h> 2262306a36Sopenharmony_ci#include <linux/falloc.h> 2362306a36Sopenharmony_ci#include <fcntl.h> 2462306a36Sopenharmony_ci#include <linux/memfd.h> 2562306a36Sopenharmony_ci#include <linux/types.h> 2662306a36Sopenharmony_ci#include <sched.h> 2762306a36Sopenharmony_ci#include <stdio.h> 2862306a36Sopenharmony_ci#include <stdlib.h> 2962306a36Sopenharmony_ci#include <signal.h> 3062306a36Sopenharmony_ci#include <string.h> 3162306a36Sopenharmony_ci#include <sys/mman.h> 3262306a36Sopenharmony_ci#include <sys/stat.h> 3362306a36Sopenharmony_ci#include <sys/syscall.h> 3462306a36Sopenharmony_ci#include <sys/wait.h> 3562306a36Sopenharmony_ci#include <unistd.h> 3662306a36Sopenharmony_ci 3762306a36Sopenharmony_ci#include "common.h" 3862306a36Sopenharmony_ci 3962306a36Sopenharmony_ci#define MFD_DEF_SIZE 8192 4062306a36Sopenharmony_ci#define STACK_SIZE 65536 4162306a36Sopenharmony_ci 4262306a36Sopenharmony_cistatic size_t mfd_def_size = MFD_DEF_SIZE; 4362306a36Sopenharmony_ci 4462306a36Sopenharmony_cistatic int mfd_assert_new(const char *name, loff_t sz, unsigned int flags) 4562306a36Sopenharmony_ci{ 4662306a36Sopenharmony_ci int r, fd; 4762306a36Sopenharmony_ci 4862306a36Sopenharmony_ci fd = sys_memfd_create(name, flags); 4962306a36Sopenharmony_ci if (fd < 0) { 5062306a36Sopenharmony_ci printf("memfd_create(\"%s\", %u) failed: %m\n", 5162306a36Sopenharmony_ci name, flags); 5262306a36Sopenharmony_ci abort(); 5362306a36Sopenharmony_ci } 5462306a36Sopenharmony_ci 5562306a36Sopenharmony_ci r = ftruncate(fd, sz); 5662306a36Sopenharmony_ci if (r < 0) { 5762306a36Sopenharmony_ci printf("ftruncate(%llu) failed: %m\n", (unsigned long long)sz); 5862306a36Sopenharmony_ci abort(); 5962306a36Sopenharmony_ci } 6062306a36Sopenharmony_ci 6162306a36Sopenharmony_ci return fd; 6262306a36Sopenharmony_ci} 6362306a36Sopenharmony_ci 6462306a36Sopenharmony_cistatic __u64 mfd_assert_get_seals(int fd) 6562306a36Sopenharmony_ci{ 6662306a36Sopenharmony_ci long r; 6762306a36Sopenharmony_ci 6862306a36Sopenharmony_ci r = fcntl(fd, F_GET_SEALS); 6962306a36Sopenharmony_ci if (r < 0) { 7062306a36Sopenharmony_ci printf("GET_SEALS(%d) failed: %m\n", fd); 7162306a36Sopenharmony_ci abort(); 7262306a36Sopenharmony_ci } 7362306a36Sopenharmony_ci 7462306a36Sopenharmony_ci return r; 7562306a36Sopenharmony_ci} 7662306a36Sopenharmony_ci 7762306a36Sopenharmony_cistatic void mfd_assert_has_seals(int fd, __u64 seals) 7862306a36Sopenharmony_ci{ 7962306a36Sopenharmony_ci __u64 s; 8062306a36Sopenharmony_ci 8162306a36Sopenharmony_ci s = mfd_assert_get_seals(fd); 8262306a36Sopenharmony_ci if (s != seals) { 8362306a36Sopenharmony_ci printf("%llu != %llu = GET_SEALS(%d)\n", 8462306a36Sopenharmony_ci (unsigned long long)seals, (unsigned long long)s, fd); 8562306a36Sopenharmony_ci abort(); 8662306a36Sopenharmony_ci } 8762306a36Sopenharmony_ci} 8862306a36Sopenharmony_ci 8962306a36Sopenharmony_cistatic void mfd_assert_add_seals(int fd, __u64 seals) 9062306a36Sopenharmony_ci{ 9162306a36Sopenharmony_ci long r; 9262306a36Sopenharmony_ci __u64 s; 9362306a36Sopenharmony_ci 9462306a36Sopenharmony_ci s = mfd_assert_get_seals(fd); 9562306a36Sopenharmony_ci r = fcntl(fd, F_ADD_SEALS, seals); 9662306a36Sopenharmony_ci if (r < 0) { 9762306a36Sopenharmony_ci printf("ADD_SEALS(%d, %llu -> %llu) failed: %m\n", 9862306a36Sopenharmony_ci fd, (unsigned long long)s, (unsigned long long)seals); 9962306a36Sopenharmony_ci abort(); 10062306a36Sopenharmony_ci } 10162306a36Sopenharmony_ci} 10262306a36Sopenharmony_ci 10362306a36Sopenharmony_cistatic int mfd_busy_add_seals(int fd, __u64 seals) 10462306a36Sopenharmony_ci{ 10562306a36Sopenharmony_ci long r; 10662306a36Sopenharmony_ci __u64 s; 10762306a36Sopenharmony_ci 10862306a36Sopenharmony_ci r = fcntl(fd, F_GET_SEALS); 10962306a36Sopenharmony_ci if (r < 0) 11062306a36Sopenharmony_ci s = 0; 11162306a36Sopenharmony_ci else 11262306a36Sopenharmony_ci s = r; 11362306a36Sopenharmony_ci 11462306a36Sopenharmony_ci r = fcntl(fd, F_ADD_SEALS, seals); 11562306a36Sopenharmony_ci if (r < 0 && errno != EBUSY) { 11662306a36Sopenharmony_ci printf("ADD_SEALS(%d, %llu -> %llu) didn't fail as expected with EBUSY: %m\n", 11762306a36Sopenharmony_ci fd, (unsigned long long)s, (unsigned long long)seals); 11862306a36Sopenharmony_ci abort(); 11962306a36Sopenharmony_ci } 12062306a36Sopenharmony_ci 12162306a36Sopenharmony_ci return r; 12262306a36Sopenharmony_ci} 12362306a36Sopenharmony_ci 12462306a36Sopenharmony_cistatic void *mfd_assert_mmap_shared(int fd) 12562306a36Sopenharmony_ci{ 12662306a36Sopenharmony_ci void *p; 12762306a36Sopenharmony_ci 12862306a36Sopenharmony_ci p = mmap(NULL, 12962306a36Sopenharmony_ci mfd_def_size, 13062306a36Sopenharmony_ci PROT_READ | PROT_WRITE, 13162306a36Sopenharmony_ci MAP_SHARED, 13262306a36Sopenharmony_ci fd, 13362306a36Sopenharmony_ci 0); 13462306a36Sopenharmony_ci if (p == MAP_FAILED) { 13562306a36Sopenharmony_ci printf("mmap() failed: %m\n"); 13662306a36Sopenharmony_ci abort(); 13762306a36Sopenharmony_ci } 13862306a36Sopenharmony_ci 13962306a36Sopenharmony_ci return p; 14062306a36Sopenharmony_ci} 14162306a36Sopenharmony_ci 14262306a36Sopenharmony_cistatic void *mfd_assert_mmap_private(int fd) 14362306a36Sopenharmony_ci{ 14462306a36Sopenharmony_ci void *p; 14562306a36Sopenharmony_ci 14662306a36Sopenharmony_ci p = mmap(NULL, 14762306a36Sopenharmony_ci mfd_def_size, 14862306a36Sopenharmony_ci PROT_READ | PROT_WRITE, 14962306a36Sopenharmony_ci MAP_PRIVATE, 15062306a36Sopenharmony_ci fd, 15162306a36Sopenharmony_ci 0); 15262306a36Sopenharmony_ci if (p == MAP_FAILED) { 15362306a36Sopenharmony_ci printf("mmap() failed: %m\n"); 15462306a36Sopenharmony_ci abort(); 15562306a36Sopenharmony_ci } 15662306a36Sopenharmony_ci 15762306a36Sopenharmony_ci return p; 15862306a36Sopenharmony_ci} 15962306a36Sopenharmony_ci 16062306a36Sopenharmony_cistatic int global_mfd = -1; 16162306a36Sopenharmony_cistatic void *global_p = NULL; 16262306a36Sopenharmony_ci 16362306a36Sopenharmony_cistatic int sealing_thread_fn(void *arg) 16462306a36Sopenharmony_ci{ 16562306a36Sopenharmony_ci int sig, r; 16662306a36Sopenharmony_ci 16762306a36Sopenharmony_ci /* 16862306a36Sopenharmony_ci * This thread first waits 200ms so any pending operation in the parent 16962306a36Sopenharmony_ci * is correctly started. After that, it tries to seal @global_mfd as 17062306a36Sopenharmony_ci * SEAL_WRITE. This _must_ fail as the parent thread has a read() into 17162306a36Sopenharmony_ci * that memory mapped object still ongoing. 17262306a36Sopenharmony_ci * We then wait one more second and try sealing again. This time it 17362306a36Sopenharmony_ci * must succeed as there shouldn't be anyone else pinning the pages. 17462306a36Sopenharmony_ci */ 17562306a36Sopenharmony_ci 17662306a36Sopenharmony_ci /* wait 200ms for FUSE-request to be active */ 17762306a36Sopenharmony_ci usleep(200000); 17862306a36Sopenharmony_ci 17962306a36Sopenharmony_ci /* unmount mapping before sealing to avoid i_mmap_writable failures */ 18062306a36Sopenharmony_ci munmap(global_p, mfd_def_size); 18162306a36Sopenharmony_ci 18262306a36Sopenharmony_ci /* Try sealing the global file; expect EBUSY or success. Current 18362306a36Sopenharmony_ci * kernels will never succeed, but in the future, kernels might 18462306a36Sopenharmony_ci * implement page-replacements or other fancy ways to avoid racing 18562306a36Sopenharmony_ci * writes. */ 18662306a36Sopenharmony_ci r = mfd_busy_add_seals(global_mfd, F_SEAL_WRITE); 18762306a36Sopenharmony_ci if (r >= 0) { 18862306a36Sopenharmony_ci printf("HURRAY! This kernel fixed GUP races!\n"); 18962306a36Sopenharmony_ci } else { 19062306a36Sopenharmony_ci /* wait 1s more so the FUSE-request is done */ 19162306a36Sopenharmony_ci sleep(1); 19262306a36Sopenharmony_ci 19362306a36Sopenharmony_ci /* try sealing the global file again */ 19462306a36Sopenharmony_ci mfd_assert_add_seals(global_mfd, F_SEAL_WRITE); 19562306a36Sopenharmony_ci } 19662306a36Sopenharmony_ci 19762306a36Sopenharmony_ci return 0; 19862306a36Sopenharmony_ci} 19962306a36Sopenharmony_ci 20062306a36Sopenharmony_cistatic pid_t spawn_sealing_thread(void) 20162306a36Sopenharmony_ci{ 20262306a36Sopenharmony_ci uint8_t *stack; 20362306a36Sopenharmony_ci pid_t pid; 20462306a36Sopenharmony_ci 20562306a36Sopenharmony_ci stack = malloc(STACK_SIZE); 20662306a36Sopenharmony_ci if (!stack) { 20762306a36Sopenharmony_ci printf("malloc(STACK_SIZE) failed: %m\n"); 20862306a36Sopenharmony_ci abort(); 20962306a36Sopenharmony_ci } 21062306a36Sopenharmony_ci 21162306a36Sopenharmony_ci pid = clone(sealing_thread_fn, 21262306a36Sopenharmony_ci stack + STACK_SIZE, 21362306a36Sopenharmony_ci SIGCHLD | CLONE_FILES | CLONE_FS | CLONE_VM, 21462306a36Sopenharmony_ci NULL); 21562306a36Sopenharmony_ci if (pid < 0) { 21662306a36Sopenharmony_ci printf("clone() failed: %m\n"); 21762306a36Sopenharmony_ci abort(); 21862306a36Sopenharmony_ci } 21962306a36Sopenharmony_ci 22062306a36Sopenharmony_ci return pid; 22162306a36Sopenharmony_ci} 22262306a36Sopenharmony_ci 22362306a36Sopenharmony_cistatic void join_sealing_thread(pid_t pid) 22462306a36Sopenharmony_ci{ 22562306a36Sopenharmony_ci waitpid(pid, NULL, 0); 22662306a36Sopenharmony_ci} 22762306a36Sopenharmony_ci 22862306a36Sopenharmony_ciint main(int argc, char **argv) 22962306a36Sopenharmony_ci{ 23062306a36Sopenharmony_ci char *zero; 23162306a36Sopenharmony_ci int fd, mfd, r; 23262306a36Sopenharmony_ci void *p; 23362306a36Sopenharmony_ci int was_sealed; 23462306a36Sopenharmony_ci pid_t pid; 23562306a36Sopenharmony_ci 23662306a36Sopenharmony_ci if (argc < 2) { 23762306a36Sopenharmony_ci printf("error: please pass path to file in fuse_mnt mount-point\n"); 23862306a36Sopenharmony_ci abort(); 23962306a36Sopenharmony_ci } 24062306a36Sopenharmony_ci 24162306a36Sopenharmony_ci if (argc >= 3) { 24262306a36Sopenharmony_ci if (!strcmp(argv[2], "hugetlbfs")) { 24362306a36Sopenharmony_ci unsigned long hpage_size = default_huge_page_size(); 24462306a36Sopenharmony_ci 24562306a36Sopenharmony_ci if (!hpage_size) { 24662306a36Sopenharmony_ci printf("Unable to determine huge page size\n"); 24762306a36Sopenharmony_ci abort(); 24862306a36Sopenharmony_ci } 24962306a36Sopenharmony_ci 25062306a36Sopenharmony_ci hugetlbfs_test = 1; 25162306a36Sopenharmony_ci mfd_def_size = hpage_size * 2; 25262306a36Sopenharmony_ci } else { 25362306a36Sopenharmony_ci printf("Unknown option: %s\n", argv[2]); 25462306a36Sopenharmony_ci abort(); 25562306a36Sopenharmony_ci } 25662306a36Sopenharmony_ci } 25762306a36Sopenharmony_ci 25862306a36Sopenharmony_ci zero = calloc(sizeof(*zero), mfd_def_size); 25962306a36Sopenharmony_ci 26062306a36Sopenharmony_ci /* open FUSE memfd file for GUP testing */ 26162306a36Sopenharmony_ci printf("opening: %s\n", argv[1]); 26262306a36Sopenharmony_ci fd = open(argv[1], O_RDONLY | O_CLOEXEC); 26362306a36Sopenharmony_ci if (fd < 0) { 26462306a36Sopenharmony_ci printf("cannot open(\"%s\"): %m\n", argv[1]); 26562306a36Sopenharmony_ci abort(); 26662306a36Sopenharmony_ci } 26762306a36Sopenharmony_ci 26862306a36Sopenharmony_ci /* create new memfd-object */ 26962306a36Sopenharmony_ci mfd = mfd_assert_new("kern_memfd_fuse", 27062306a36Sopenharmony_ci mfd_def_size, 27162306a36Sopenharmony_ci MFD_CLOEXEC | MFD_ALLOW_SEALING); 27262306a36Sopenharmony_ci 27362306a36Sopenharmony_ci /* mmap memfd-object for writing */ 27462306a36Sopenharmony_ci p = mfd_assert_mmap_shared(mfd); 27562306a36Sopenharmony_ci 27662306a36Sopenharmony_ci /* pass mfd+mapping to a separate sealing-thread which tries to seal 27762306a36Sopenharmony_ci * the memfd objects with SEAL_WRITE while we write into it */ 27862306a36Sopenharmony_ci global_mfd = mfd; 27962306a36Sopenharmony_ci global_p = p; 28062306a36Sopenharmony_ci pid = spawn_sealing_thread(); 28162306a36Sopenharmony_ci 28262306a36Sopenharmony_ci /* Use read() on the FUSE file to read into our memory-mapped memfd 28362306a36Sopenharmony_ci * object. This races the other thread which tries to seal the 28462306a36Sopenharmony_ci * memfd-object. 28562306a36Sopenharmony_ci * If @fd is on the memfd-fake-FUSE-FS, the read() is delayed by 1s. 28662306a36Sopenharmony_ci * This guarantees that the receive-buffer is pinned for 1s until the 28762306a36Sopenharmony_ci * data is written into it. The racing ADD_SEALS should thus fail as 28862306a36Sopenharmony_ci * the pages are still pinned. */ 28962306a36Sopenharmony_ci r = read(fd, p, mfd_def_size); 29062306a36Sopenharmony_ci if (r < 0) { 29162306a36Sopenharmony_ci printf("read() failed: %m\n"); 29262306a36Sopenharmony_ci abort(); 29362306a36Sopenharmony_ci } else if (!r) { 29462306a36Sopenharmony_ci printf("unexpected EOF on read()\n"); 29562306a36Sopenharmony_ci abort(); 29662306a36Sopenharmony_ci } 29762306a36Sopenharmony_ci 29862306a36Sopenharmony_ci was_sealed = mfd_assert_get_seals(mfd) & F_SEAL_WRITE; 29962306a36Sopenharmony_ci 30062306a36Sopenharmony_ci /* Wait for sealing-thread to finish and verify that it 30162306a36Sopenharmony_ci * successfully sealed the file after the second try. */ 30262306a36Sopenharmony_ci join_sealing_thread(pid); 30362306a36Sopenharmony_ci mfd_assert_has_seals(mfd, F_SEAL_WRITE); 30462306a36Sopenharmony_ci 30562306a36Sopenharmony_ci /* *IF* the memfd-object was sealed at the time our read() returned, 30662306a36Sopenharmony_ci * then the kernel did a page-replacement or canceled the read() (or 30762306a36Sopenharmony_ci * whatever magic it did..). In that case, the memfd object is still 30862306a36Sopenharmony_ci * all zero. 30962306a36Sopenharmony_ci * In case the memfd-object was *not* sealed, the read() was successfull 31062306a36Sopenharmony_ci * and the memfd object must *not* be all zero. 31162306a36Sopenharmony_ci * Note that in real scenarios, there might be a mixture of both, but 31262306a36Sopenharmony_ci * in this test-cases, we have explicit 200ms delays which should be 31362306a36Sopenharmony_ci * enough to avoid any in-flight writes. */ 31462306a36Sopenharmony_ci 31562306a36Sopenharmony_ci p = mfd_assert_mmap_private(mfd); 31662306a36Sopenharmony_ci if (was_sealed && memcmp(p, zero, mfd_def_size)) { 31762306a36Sopenharmony_ci printf("memfd sealed during read() but data not discarded\n"); 31862306a36Sopenharmony_ci abort(); 31962306a36Sopenharmony_ci } else if (!was_sealed && !memcmp(p, zero, mfd_def_size)) { 32062306a36Sopenharmony_ci printf("memfd sealed after read() but data discarded\n"); 32162306a36Sopenharmony_ci abort(); 32262306a36Sopenharmony_ci } 32362306a36Sopenharmony_ci 32462306a36Sopenharmony_ci close(mfd); 32562306a36Sopenharmony_ci close(fd); 32662306a36Sopenharmony_ci 32762306a36Sopenharmony_ci printf("fuse: DONE\n"); 32862306a36Sopenharmony_ci free(zero); 32962306a36Sopenharmony_ci 33062306a36Sopenharmony_ci return 0; 33162306a36Sopenharmony_ci} 332