/*
 * Sample: supervising a sandboxed task with seccomp user notifications.
 *
 * A worker process installs a seccomp filter that traps mount(2) with
 * SECCOMP_RET_USER_NOTIF, drops privileges and hands the notification
 * listener fd to its parent over a unix socket. A tracer process then
 * receives the notifications, inspects the worker's arguments through
 * /proc/<pid>/mem and performs bind mounts inside /tmp on its behalf.
 */

/* syscall(2), kill(2) and friends are only declared with feature macros. */
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <fcntl.h>
#include <string.h>
#include <stddef.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/user.h>
#include <sys/ioctl.h>
#include <sys/ptrace.h>
#include <sys/mount.h>
#include <linux/limits.h>
#include <linux/filter.h>
#include <linux/seccomp.h>

#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))

/*
 * Thin wrapper around the raw seccomp(2) syscall.
 *
 * errno is cleared before the call; the syscall's return value is passed
 * through unchanged (-1 with errno set on failure).
 */
static int seccomp(unsigned int op, unsigned int flags, void *args)
{
	errno = 0;
	return syscall(__NR_seccomp, op, flags, args);
}

/*
 * Send file descriptor @fd over the unix socket @sock as SCM_RIGHTS
 * ancillary data, accompanied by a single payload byte.
 *
 * Returns 0 on success, -1 (after printing a diagnostic) on failure.
 */
static int send_fd(int sock, int fd)
{
	struct msghdr msg = {0};
	struct cmsghdr *cmsg;
	/* The union guarantees the control buffer is aligned for a cmsghdr. */
	union {
		char buf[CMSG_SPACE(sizeof(int))];
		struct cmsghdr align;
	} ctl;
	char c = 'c';
	struct iovec io = {
		.iov_base = &c,
		.iov_len = 1,
	};

	memset(&ctl, 0, sizeof(ctl));

	msg.msg_iov = &io;
	msg.msg_iovlen = 1;
	msg.msg_control = ctl.buf;
	msg.msg_controllen = sizeof(ctl.buf);

	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	/* CMSG_DATA() is an unsigned char pointer; copy instead of casting. */
	memcpy(CMSG_DATA(cmsg), &fd, sizeof(fd));
	msg.msg_controllen = cmsg->cmsg_len;

	if (sendmsg(sock, &msg, 0) < 0) {
		perror("sendmsg");
		return -1;
	}

	return 0;
}

/*
 * Receive a file descriptor sent with send_fd() from the unix socket @sock.
 *
 * Returns the received descriptor, or -1 (after printing a diagnostic) if
 * recvmsg() fails or the message carries no SCM_RIGHTS descriptor, e.g.
 * because the peer exited before sending anything.
 */
static int recv_fd(int sock)
{
	struct msghdr msg = {0};
	struct cmsghdr *cmsg;
	/* The union guarantees the control buffer is aligned for a cmsghdr. */
	union {
		char buf[CMSG_SPACE(sizeof(int))];
		struct cmsghdr align;
	} ctl;
	char c = 'c';
	struct iovec io = {
		.iov_base = &c,
		.iov_len = 1,
	};
	int fd;

	memset(&ctl, 0, sizeof(ctl));

	msg.msg_iov = &io;
	msg.msg_iovlen = 1;
	msg.msg_control = ctl.buf;
	msg.msg_controllen = sizeof(ctl.buf);

	if (recvmsg(sock, &msg, 0) < 0) {
		perror("recvmsg");
		return -1;
	}

	/*
	 * On EOF or a message without ancillary data there is no header to
	 * look at; make sure we really got exactly what send_fd() sends.
	 */
	cmsg = CMSG_FIRSTHDR(&msg);
	if (!cmsg || cmsg->cmsg_level != SOL_SOCKET ||
	    cmsg->cmsg_type != SCM_RIGHTS ||
	    cmsg->cmsg_len < CMSG_LEN(sizeof(int))) {
		fprintf(stderr, "recvmsg: no file descriptor received\n");
		return -1;
	}

	memcpy(&fd, CMSG_DATA(cmsg), sizeof(fd));
	return fd;
}

/*
 * Install a seccomp filter that returns SECCOMP_RET_USER_NOTIF for syscall
 * number @nr and SECCOMP_RET_ALLOW for everything else.
 *
 * @flags is passed straight to seccomp(SECCOMP_SET_MODE_FILTER); the return
 * value is that of the seccomp() call.
 */
static int user_trap_syscall(int nr, unsigned int flags)
{
	struct sock_filter filter[] = {
		BPF_STMT(BPF_LD+BPF_W+BPF_ABS,
			offsetof(struct seccomp_data, nr)),
		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, nr, 0, 1),
		BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_USER_NOTIF),
		BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
	};

	struct sock_fprog prog = {
		.len = (unsigned short)ARRAY_SIZE(filter),
		.filter = filter,
	};

	return seccomp(SECCOMP_SET_MODE_FILTER, flags, &prog);
}

/*
 * Read a string from the traced task's memory.
 *
 * @mem is an open /proc/<pid>/mem descriptor and @addr the address of the
 * string in the task. At most @size - 1 bytes are read into @buf and the
 * result is always NUL-terminated, so it is safe to hand to string functions
 * even when the read is short or the task's data contains no terminator.
 *
 * Returns 0 on success, -1 (after printing a diagnostic) on failure.
 */
static int read_task_string(int mem, unsigned long long addr, char *buf,
			    size_t size)
{
	ssize_t n;

	if (lseek(mem, addr, SEEK_SET) < 0) {
		perror("seek");
		return -1;
	}

	n = read(mem, buf, size - 1);
	if (n < 0) {
		perror("read");
		return -1;
	}

	buf[n] = '\0';
	return 0;
}

/*
 * Decide how to answer one notification.
 *
 * @req is the notification received from @listener; @resp is filled in with
 * the reply to send. The reply defaults to -EPERM and is switched to success
 * only when the mount was a bind mount inside /tmp that we carried out
 * ourselves.
 *
 * Returns 0 when @resp is ready to be sent (whether it allows or denies the
 * syscall), -1 when handling failed and no response should be sent.
 */
static int handle_req(struct seccomp_notif *req,
		      struct seccomp_notif_resp *resp, int listener)
{
	char path[PATH_MAX], source[PATH_MAX], target[PATH_MAX];
	int ret = -1, mem;

	resp->id = req->id;
	resp->error = -EPERM;
	resp->val = 0;

	if (req->data.nr != __NR_mount) {
		fprintf(stderr, "huh? trapped something besides mount? %d\n", req->data.nr);
		return -1;
	}

	/* Only allow bind mounts. */
	if (!(req->data.args[3] & MS_BIND))
		return 0;

	/*
	 * Ok, let's read the task's memory to see where they wanted their
	 * mount to go.
	 */
	snprintf(path, sizeof(path), "/proc/%d/mem", req->pid);
	mem = open(path, O_RDONLY);
	if (mem < 0) {
		perror("open mem");
		return -1;
	}

	/*
	 * Now we avoid a TOCTOU: we referred to a pid by its pid, but since
	 * the pid that made the syscall may have died, we need to confirm that
	 * the pid is still valid after we open its /proc/pid/mem file. We can
	 * ask the listener fd this as follows.
	 *
	 * Note that this check should occur *after* any task-specific
	 * resources are opened, to make sure that the task has not died and
	 * we're not wrongly reading someone else's state in order to make
	 * decisions.
	 */
	if (ioctl(listener, SECCOMP_IOCTL_NOTIF_ID_VALID, &req->id) < 0) {
		fprintf(stderr, "task died before we could map its memory\n");
		goto out;
	}

	/*
	 * Phew, we've got the right /proc/pid/mem. Now we can read it. Note
	 * that to avoid another TOCTOU, we should read all of the pointer args
	 * before we decide to allow the syscall.
	 */
	if (read_task_string(mem, req->data.args[0], source, sizeof(source)) < 0)
		goto out;

	if (read_task_string(mem, req->data.args[1], target, sizeof(target)) < 0)
		goto out;

	/*
	 * Our policy is to only allow bind mounts inside /tmp. This isn't very
	 * interesting, because we could do unprivileged bind mounts with user
	 * namespaces already, but you get the idea.
	 *
	 * Note that this is a plain prefix comparison: the paths are not
	 * canonicalized, so something like "/tmp/../etc" passes the check.
	 */
	if (!strncmp(source, "/tmp/", 5) && !strncmp(target, "/tmp/", 5)) {
		if (mount(source, target, NULL, req->data.args[3], NULL) < 0) {
			perror("actual mount");
			goto out;
		}
		resp->error = 0;
	}

	/*
	 * Even if we didn't allow it because of policy, generating the
	 * response was a success, because we want to tell the worker EPERM.
	 */
	ret = 0;

out:
	close(mem);
	return ret;
}

/*
 * Fork a sandboxed worker and a tracer that services its mount(2) calls,
 * then wait for the worker and clean up.
 *
 * Returns 0 if the worker exited successfully, 1 otherwise.
 */
int main(void)
{
	int sk_pair[2], ret = 1, status, listener;
	pid_t worker = 0, tracer = 0;

	if (socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair) < 0) {
		perror("socketpair");
		return 1;
	}

	worker = fork();
	if (worker < 0) {
		perror("fork");
		goto close_pair;
	}

	if (worker == 0) {
		listener = user_trap_syscall(__NR_mount,
					     SECCOMP_FILTER_FLAG_NEW_LISTENER);
		if (listener < 0) {
			perror("seccomp");
			exit(1);
		}

		/*
		 * Drop privileges. We definitely can't mount as uid 1000.
		 */
		if (setuid(1000) < 0) {
			perror("setuid");
			exit(1);
		}

		/*
		 * Send the listener to the parent; also serves as
		 * synchronization.
		 */
		if (send_fd(sk_pair[1], listener) < 0)
			exit(1);
		close(listener);

		if (mkdir("/tmp/foo", 0755) < 0) {
			perror("mkdir");
			exit(1);
		}

		/*
		 * Try a bad mount just for grins.
		 */
		if (mount("/dev/sda", "/tmp/foo", NULL, 0, NULL) != -1) {
			fprintf(stderr, "huh? mounted /dev/sda?\n");
			exit(1);
		}

		if (errno != EPERM) {
			perror("bad error from mount");
			exit(1);
		}

		/*
		 * Ok, we expect this one to succeed.
		 */
		if (mount("/tmp/foo", "/tmp/foo", NULL, MS_BIND, NULL) < 0) {
			perror("mount");
			exit(1);
		}

		exit(0);
	}

	/*
	 * Get the listener from the child.
	 */
	listener = recv_fd(sk_pair[0]);
	if (listener < 0)
		goto out_kill;

	/*
	 * Fork a task to handle the requests. This isn't strictly necessary,
	 * but it makes the particular writing of this sample easier, since we
	 * can just wait for the tracee to exit and kill the tracer.
	 */
	tracer = fork();
	if (tracer < 0) {
		perror("fork");
		goto out_kill;
	}

	if (tracer == 0) {
		struct seccomp_notif *req;
		struct seccomp_notif_resp *resp;
		struct seccomp_notif_sizes sizes;

		/* The kernel tells us how large its notification structs are. */
		if (seccomp(SECCOMP_GET_NOTIF_SIZES, 0, &sizes) < 0) {
			perror("seccomp(GET_NOTIF_SIZES)");
			goto out_close;
		}

		req = malloc(sizes.seccomp_notif);
		if (!req)
			goto out_close;

		resp = malloc(sizes.seccomp_notif_resp);
		if (!resp)
			goto out_req;
		memset(resp, 0, sizes.seccomp_notif_resp);

		while (1) {
			/* The receive buffer is cleared before every request. */
			memset(req, 0, sizes.seccomp_notif);
			if (ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, req)) {
				perror("ioctl recv");
				goto out_resp;
			}

			if (handle_req(req, resp, listener) < 0)
				goto out_resp;

			/*
			 * ENOENT here means that the task may have gotten a
			 * signal and restarted the syscall. It's up to the
			 * handler to decide what to do in this case, but for
			 * the sample code, we just ignore it. Probably
			 * something better should happen, like undoing the
			 * mount, or keeping track of the args to make sure we
			 * don't do it again.
			 */
			if (ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, resp) < 0 &&
			    errno != ENOENT) {
				perror("ioctl send");
				goto out_resp;
			}
		}
out_resp:
		free(resp);
out_req:
		free(req);
out_close:
		close(listener);
		exit(1);
	}

	close(listener);

	if (waitpid(worker, &status, 0) != worker) {
		perror("waitpid");
		goto out_kill;
	}

	/*
	 * The worker has been reaped, so its pid may be handed to an unrelated
	 * process at any time: make sure the cleanup below never signals it.
	 */
	worker = 0;

	if (umount2("/tmp/foo", MNT_DETACH) < 0 && errno != EINVAL) {
		perror("umount2");
		goto out_kill;
	}

	if (remove("/tmp/foo") < 0 && errno != ENOENT) {
		perror("remove");
		goto out_kill;
	}

	if (!WIFEXITED(status) || WEXITSTATUS(status)) {
		fprintf(stderr, "worker exited nonzero\n");
		goto out_kill;
	}

	ret = 0;

out_kill:
	/* Kill and reap whichever children are still ours. */
	if (tracer > 0) {
		kill(tracer, SIGKILL);
		waitpid(tracer, NULL, 0);
	}
	if (worker > 0) {
		kill(worker, SIGKILL);
		waitpid(worker, NULL, 0);
	}

close_pair:
	close(sk_pair[0]);
	close(sk_pair[1]);
	return ret;
}