1/* Authors: Gregory P. Smith & Jeffrey Yasskin */
2#ifndef Py_BUILD_CORE_BUILTIN
3#  define Py_BUILD_CORE_MODULE 1
4#endif
5
6#include "Python.h"
7#include "pycore_fileutils.h"
8#if defined(HAVE_PIPE2) && !defined(_GNU_SOURCE)
9# define _GNU_SOURCE
10#endif
11#include <unistd.h>
12#include <fcntl.h>
13#ifdef HAVE_SYS_TYPES_H
14#include <sys/types.h>
15#endif
16#if defined(HAVE_SYS_STAT_H)
17#include <sys/stat.h>
18#endif
19#ifdef HAVE_SYS_SYSCALL_H
20#include <sys/syscall.h>
21#endif
22#if defined(HAVE_SYS_RESOURCE_H)
23#include <sys/resource.h>
24#endif
25#ifdef HAVE_DIRENT_H
26#include <dirent.h>
27#endif
28#ifdef HAVE_GRP_H
29#include <grp.h>
30#endif /* HAVE_GRP_H */
31
32#include "posixmodule.h"
33
34#ifdef _Py_MEMORY_SANITIZER
35# include <sanitizer/msan_interface.h>
36#endif
37
38#if defined(__ANDROID__) && __ANDROID_API__ < 21 && !defined(SYS_getdents64)
39# include <sys/linux-syscalls.h>
40# define SYS_getdents64  __NR_getdents64
41#endif
42
43#if defined(__linux__) && defined(HAVE_VFORK) && defined(HAVE_SIGNAL_H) && \
44    defined(HAVE_PTHREAD_SIGMASK) && !defined(HAVE_BROKEN_PTHREAD_SIGMASK)
45/* If this is ever expanded to non-Linux platforms, verify what calls are
46 * allowed after vfork(). Ex: setsid() may be disallowed on macOS? */
47# include <signal.h>
48# define VFORK_USABLE 1
49#endif
50
51#if defined(__sun) && defined(__SVR4)
52/* readdir64 is used to work around Solaris 9 bug 6395699. */
53# define readdir readdir64
54# define dirent dirent64
55# if !defined(HAVE_DIRFD)
56/* Some versions of Solaris lack dirfd(). */
57#  define dirfd(dirp) ((dirp)->dd_fd)
58#  define HAVE_DIRFD
59# endif
60#endif
61
62#if defined(__FreeBSD__) || (defined(__APPLE__) && defined(__MACH__)) || defined(__DragonFly__)
63# define FD_DIR "/dev/fd"
64#else
65# define FD_DIR "/proc/self/fd"
66#endif
67
68#ifdef NGROUPS_MAX
69#define MAX_GROUPS NGROUPS_MAX
70#else
71#define MAX_GROUPS 64
72#endif
73
/* Evaluate `call` once and jump to the enclosing function's `error` label
 * when it yields -1.  Only usable in functions that define that label. */
#define POSIX_CALL(call)        \
    do {                        \
        if ((call) == -1) {     \
            goto error;         \
        }                       \
    } while (0)
75
76static struct PyModuleDef _posixsubprocessmodule;
77
/* Convert an ASCII decimal string to a non-negative int without calling
 * into libc, so it can be used between fork() and exec().
 *
 * Returns the parsed value, or -1 when `name` contains a non-digit
 * character or when the value does not fit in an int.  An empty string
 * yields 0.
 */
static int
_pos_int_from_ascii(const char *name)
{
    int num = 0;
    while (*name >= '0' && *name <= '9') {
        int digit = *name - '0';
        /* Signed overflow is undefined behaviour, so refuse the input
         * before num * 10 + digit could exceed INT_MAX. */
        if (num > (INT_MAX - digit) / 10)
            return -1;
        num = num * 10 + digit;
        ++name;
    }
    if (*name)
        return -1;  /* Non digit found, not a number. */
    return num;
}
91
92
93#if defined(__FreeBSD__) || defined(__DragonFly__)
94/* When /dev/fd isn't mounted it is often a static directory populated
95 * with 0 1 2 or entries for 0 .. 63 on FreeBSD, NetBSD, OpenBSD and DragonFlyBSD.
96 * NetBSD and OpenBSD have a /proc fs available (though not necessarily
97 * mounted) and do not have fdescfs for /dev/fd.  MacOS X has a devfs
98 * that properly supports /dev/fd.
99 */
100static int
101_is_fdescfs_mounted_on_dev_fd(void)
102{
103    struct stat dev_stat;
104    struct stat dev_fd_stat;
105    if (stat("/dev", &dev_stat) != 0)
106        return 0;
107    if (stat(FD_DIR, &dev_fd_stat) != 0)
108        return 0;
109    if (dev_stat.st_dev == dev_fd_stat.st_dev)
110        return 0;  /* / == /dev == /dev/fd means it is static. #fail */
111    return 1;
112}
113#endif
114
115
116/* Returns 1 if there is a problem with fd_sequence, 0 otherwise. */
117static int
118_sanity_check_python_fd_sequence(PyObject *fd_sequence)
119{
120    Py_ssize_t seq_idx;
121    long prev_fd = -1;
122    for (seq_idx = 0; seq_idx < PyTuple_GET_SIZE(fd_sequence); ++seq_idx) {
123        PyObject* py_fd = PyTuple_GET_ITEM(fd_sequence, seq_idx);
124        long iter_fd;
125        if (!PyLong_Check(py_fd)) {
126            return 1;
127        }
128        iter_fd = PyLong_AsLong(py_fd);
129        if (iter_fd < 0 || iter_fd <= prev_fd || iter_fd > INT_MAX) {
130            /* Negative, overflow, unsorted, too big for a fd. */
131            return 1;
132        }
133        prev_fd = iter_fd;
134    }
135    return 0;
136}
137
138
139/* Is fd found in the sorted Python Sequence? */
140static int
141_is_fd_in_sorted_fd_sequence(int fd, int *fd_sequence,
142                             Py_ssize_t fd_sequence_len)
143{
144    /* Binary search. */
145    Py_ssize_t search_min = 0;
146    Py_ssize_t search_max = fd_sequence_len - 1;
147    if (search_max < 0)
148        return 0;
149    do {
150        long middle = (search_min + search_max) / 2;
151        long middle_fd = fd_sequence[middle];
152        if (fd == middle_fd)
153            return 1;
154        if (fd > middle_fd)
155            search_min = middle + 1;
156        else
157            search_max = middle - 1;
158    } while (search_min <= search_max);
159    return 0;
160}
161
162/*
163 * Do all the Python C API calls in the parent process to turn the pass_fds
164 * "py_fds_to_keep" tuple into a C array.  The caller owns allocation and
165 * freeing of the array.
166 *
167 * On error an unknown number of array elements may have been filled in.
168 * A Python exception has been set when an error is returned.
169 *
170 * Returns: -1 on error, 0 on success.
171 */
172static int
173convert_fds_to_keep_to_c(PyObject *py_fds_to_keep, int *c_fds_to_keep)
174{
175    Py_ssize_t i, len;
176
177    len = PyTuple_GET_SIZE(py_fds_to_keep);
178    for (i = 0; i < len; ++i) {
179        PyObject* fdobj = PyTuple_GET_ITEM(py_fds_to_keep, i);
180        long fd = PyLong_AsLong(fdobj);
181        if (fd == -1 && PyErr_Occurred()) {
182            return -1;
183        }
184        if (fd < 0 || fd > INT_MAX) {
185            PyErr_SetString(PyExc_ValueError,
186                            "fd out of range in fds_to_keep.");
187            return -1;
188        }
189        c_fds_to_keep[i] = (int)fd;
190    }
191    return 0;
192}
193
194
195/* This function must be async-signal-safe as it is called from child_exec()
196 * after fork() or vfork().
197 */
198static int
199make_inheritable(int *c_fds_to_keep, Py_ssize_t len, int errpipe_write)
200{
201    Py_ssize_t i;
202
203    for (i = 0; i < len; ++i) {
204        int fd = c_fds_to_keep[i];
205        if (fd == errpipe_write) {
206            /* errpipe_write is part of fds_to_keep. It must be closed at
207               exec(), but kept open in the child process until exec() is
208               called. */
209            continue;
210        }
211        if (_Py_set_inheritable_async_safe(fd, 1, NULL) < 0)
212            return -1;
213    }
214    return 0;
215}
216
217
/* Return the maximum file descriptor that could be opened by this process.
 * Each platform-specific query is tried in turn; when none of them gives
 * an answer the result is 256.
 *
 * This function is async signal safe for use between fork() and exec().
 */
static long
safe_get_max_fd(void)
{
#if defined(__NetBSD__)
    {
        long netbsd_max = fcntl(0, F_MAXFD);
        if (netbsd_max >= 0) {
            return netbsd_max;
        }
    }
#endif
#if defined(HAVE_SYS_RESOURCE_H) && defined(__OpenBSD__)
    {
        struct rlimit nofile_limit;
        /* Not on the POSIX async signal safe functions list but likely
         * safe.  TODO - Someone should audit OpenBSD to make sure. */
        if (getrlimit(RLIMIT_NOFILE, &nofile_limit) >= 0) {
            return (long) nofile_limit.rlim_max;
        }
    }
#endif
#ifdef _SC_OPEN_MAX
    {
        long open_max = sysconf(_SC_OPEN_MAX);
        if (open_max != -1) {
            return open_max;
        }
    }
#endif
    return 256;  /* Matches legacy Lib/subprocess.py behavior. */
}
244
245
246/* Close all file descriptors in the given range except for those in
247 * fds_to_keep by invoking closer on each subrange.
248 *
249 * If end_fd == -1, it's guessed via safe_get_max_fd(), but it isn't
250 * possible to know for sure what the max fd to go up to is for
251 * processes with the capability of raising their maximum, or in case
252 * a process opened a high fd and then lowered its maximum.
253 */
254static int
255_close_range_except(int start_fd,
256                    int end_fd,
257                    int *fds_to_keep,
258                    Py_ssize_t fds_to_keep_len,
259                    int (*closer)(int, int))
260{
261    if (end_fd == -1) {
262        end_fd = Py_MIN(safe_get_max_fd(), INT_MAX);
263    }
264    Py_ssize_t keep_seq_idx;
265    /* As fds_to_keep is sorted we can loop through the list closing
266     * fds in between any in the keep list falling within our range. */
267    for (keep_seq_idx = 0; keep_seq_idx < fds_to_keep_len; ++keep_seq_idx) {
268        int keep_fd = fds_to_keep[keep_seq_idx];
269        if (keep_fd < start_fd)
270            continue;
271        if (closer(start_fd, keep_fd - 1) != 0)
272            return -1;
273        start_fd = keep_fd + 1;
274    }
275    if (start_fd <= end_fd) {
276        if (closer(start_fd, end_fd) != 0)
277            return -1;
278    }
279    return 0;
280}
281
282#if defined(__linux__) && defined(HAVE_SYS_SYSCALL_H)
/* Directory record as filled in by the getdents64 system call.
 *
 * Whether d_name has room for NAME_MAX chars is irrelevant here: the
 * structure is only used to read a directory of short file descriptor
 * number names, and the kernel will return an error if it was not given
 * enough space.  Highly Unlikely.
 *
 * This structure is very old and stable: It will not change unless the
 * kernel chooses to break compatibility with all existing binaries.
 * Highly Unlikely.
 */
struct linux_dirent64 {
    unsigned long long d_ino;
    long long          d_off;
    unsigned short     d_reclen;     /* Length of this linux_dirent */
    unsigned char      d_type;
    char               d_name[256];  /* Filename (null-terminated) */
};
296
/* closer callback for _close_range_except(): call close() on every
 * descriptor from first through last, one at a time.  Failures of close()
 * are deliberately ignored, so this always returns 0. */
static int
_brute_force_closer(int first, int last)
{
    int fd = first;

    while (fd <= last) {
        /* Ignore errors */
        (void)close(fd);
        fd++;
    }
    return 0;
}
306
307/* Close all open file descriptors in the range from start_fd and higher
308 * Do not close any in the sorted fds_to_keep list.
309 *
310 * This version is async signal safe as it does not make any unsafe C library
311 * calls, malloc calls or handle any locks.  It is _unfortunate_ to be forced
312 * to resort to making a kernel system call directly but this is the ONLY api
313 * available that does no harm.  opendir/readdir/closedir perform memory
314 * allocation and locking so while they usually work they are not guaranteed
315 * to (especially if you have replaced your malloc implementation).  A version
316 * of this function that uses those can be found in the _maybe_unsafe variant.
317 *
318 * This is Linux specific because that is all I am ready to test it on.  It
319 * should be easy to add OS specific dirent or dirent64 structures and modify
320 * it with some cpp #define magic to work on other OSes as well if you want.
321 */
322static void
323_close_open_fds_safe(int start_fd, int *fds_to_keep, Py_ssize_t fds_to_keep_len)
324{
325    int fd_dir_fd;
326
327    fd_dir_fd = _Py_open_noraise(FD_DIR, O_RDONLY);
328    if (fd_dir_fd == -1) {
329        /* No way to get a list of open fds. */
330        _close_range_except(start_fd, -1,
331                            fds_to_keep, fds_to_keep_len,
332                            _brute_force_closer);
333        return;
334    } else {
335        char buffer[sizeof(struct linux_dirent64)];
336        int bytes;
337        while ((bytes = syscall(SYS_getdents64, fd_dir_fd,
338                                (struct linux_dirent64 *)buffer,
339                                sizeof(buffer))) > 0) {
340            struct linux_dirent64 *entry;
341            int offset;
342#ifdef _Py_MEMORY_SANITIZER
343            __msan_unpoison(buffer, bytes);
344#endif
345            for (offset = 0; offset < bytes; offset += entry->d_reclen) {
346                int fd;
347                entry = (struct linux_dirent64 *)(buffer + offset);
348                if ((fd = _pos_int_from_ascii(entry->d_name)) < 0)
349                    continue;  /* Not a number. */
350                if (fd != fd_dir_fd && fd >= start_fd &&
351                    !_is_fd_in_sorted_fd_sequence(fd, fds_to_keep,
352                                                  fds_to_keep_len)) {
353                    close(fd);
354                }
355            }
356        }
357        close(fd_dir_fd);
358    }
359}
360
361#define _close_open_fds_fallback _close_open_fds_safe
362
363#else  /* NOT (defined(__linux__) && defined(HAVE_SYS_SYSCALL_H)) */
364
/* closer callback for _close_range_except(): hand the whole range
 * first..last to _Py_closerange().  Always returns 0. */
static int
_unsafe_closer(int first, int last)
{
    _Py_closerange(first, last);
    return 0;
}
371
372/* Close all open file descriptors from start_fd and higher.
373 * Do not close any in the sorted fds_to_keep tuple.
374 *
375 * This function violates the strict use of async signal safe functions. :(
376 * It calls opendir(), readdir() and closedir().  Of these, the one most
377 * likely to ever cause a problem is opendir() as it performs an internal
378 * malloc().  Practically this should not be a problem.  The Java VM makes the
379 * same calls between fork and exec in its own UNIXProcess_md.c implementation.
380 *
381 * readdir_r() is not used because it provides no benefit.  It is typically
382 * implemented as readdir() followed by memcpy().  See also:
383 *   http://womble.decadent.org.uk/readdir_r-advisory.html
384 */
385static void
386_close_open_fds_maybe_unsafe(int start_fd, int *fds_to_keep,
387                             Py_ssize_t fds_to_keep_len)
388{
389    DIR *proc_fd_dir;
390#ifndef HAVE_DIRFD
391    while (_is_fd_in_sorted_fd_sequence(start_fd, fds_to_keep,
392                                        fds_to_keep_len)) {
393        ++start_fd;
394    }
395    /* Close our lowest fd before we call opendir so that it is likely to
396     * reuse that fd otherwise we might close opendir's file descriptor in
397     * our loop.  This trick assumes that fd's are allocated on a lowest
398     * available basis. */
399    close(start_fd);
400    ++start_fd;
401#endif
402
403#if defined(__FreeBSD__) || defined(__DragonFly__)
404    if (!_is_fdescfs_mounted_on_dev_fd())
405        proc_fd_dir = NULL;
406    else
407#endif
408        proc_fd_dir = opendir(FD_DIR);
409    if (!proc_fd_dir) {
410        /* No way to get a list of open fds. */
411        _close_range_except(start_fd, -1, fds_to_keep, fds_to_keep_len,
412                            _unsafe_closer);
413    } else {
414        struct dirent *dir_entry;
415#ifdef HAVE_DIRFD
416        int fd_used_by_opendir = dirfd(proc_fd_dir);
417#else
418        int fd_used_by_opendir = start_fd - 1;
419#endif
420        errno = 0;
421        while ((dir_entry = readdir(proc_fd_dir))) {
422            int fd;
423            if ((fd = _pos_int_from_ascii(dir_entry->d_name)) < 0)
424                continue;  /* Not a number. */
425            if (fd != fd_used_by_opendir && fd >= start_fd &&
426                !_is_fd_in_sorted_fd_sequence(fd, fds_to_keep,
427                                              fds_to_keep_len)) {
428                close(fd);
429            }
430            errno = 0;
431        }
432        if (errno) {
433            /* readdir error, revert behavior. Highly Unlikely. */
434            _close_range_except(start_fd, -1, fds_to_keep, fds_to_keep_len,
435                                _unsafe_closer);
436        }
437        closedir(proc_fd_dir);
438    }
439}
440
441#define _close_open_fds_fallback _close_open_fds_maybe_unsafe
442
443#endif  /* else NOT (defined(__linux__) && defined(HAVE_SYS_SYSCALL_H)) */
444
445/* We can use close_range() library function only if it's known to be
446 * async-signal-safe.
447 *
448 * On Linux, glibc explicitly documents it to be a thin wrapper over
449 * the system call, and other C libraries are likely to follow glibc.
450 */
451#if defined(HAVE_CLOSE_RANGE) && \
452    (defined(__linux__) || defined(__FreeBSD__))
453#define HAVE_ASYNC_SAFE_CLOSE_RANGE
454
/* closer callback for _close_range_except(): forward first..last to
 * close_range() with no flags and return its result unchanged. */
static int
_close_range_closer(int first, int last)
{
    return close_range(first, last, 0);
}
460#endif
461
462static void
463_close_open_fds(int start_fd, int *fds_to_keep, Py_ssize_t fds_to_keep_len)
464{
465#ifdef HAVE_ASYNC_SAFE_CLOSE_RANGE
466    if (_close_range_except(
467            start_fd, INT_MAX, fds_to_keep, fds_to_keep_len,
468            _close_range_closer) == 0) {
469        return;
470    }
471#endif
472    _close_open_fds_fallback(start_fd, fds_to_keep, fds_to_keep_len);
473}
474
475#ifdef VFORK_USABLE
476/* Reset dispositions for all signals to SIG_DFL except for ignored
477 * signals. This way we ensure that no signal handlers can run
478 * after we unblock signals in a child created by vfork().
479 */
480static void
481reset_signal_handlers(const sigset_t *child_sigmask)
482{
483    struct sigaction sa_dfl = {.sa_handler = SIG_DFL};
484    for (int sig = 1; sig < _NSIG; sig++) {
485        /* Dispositions for SIGKILL and SIGSTOP can't be changed. */
486        if (sig == SIGKILL || sig == SIGSTOP) {
487            continue;
488        }
489
490        /* There is no need to reset the disposition of signals that will
491         * remain blocked across execve() since the kernel will do it. */
492        if (sigismember(child_sigmask, sig) == 1) {
493            continue;
494        }
495
496        struct sigaction sa;
497        /* C libraries usually return EINVAL for signals used
498         * internally (e.g. for thread cancellation), so simply
499         * skip errors here. */
500        if (sigaction(sig, NULL, &sa) == -1) {
501            continue;
502        }
503
504        /* void *h works as these fields are both pointer types already. */
505        void *h = (sa.sa_flags & SA_SIGINFO ? (void *)sa.sa_sigaction :
506                                              (void *)sa.sa_handler);
507        if (h == SIG_IGN || h == SIG_DFL) {
508            continue;
509        }
510
511        /* This call can't reasonably fail, but if it does, terminating
512         * the child seems to be too harsh, so ignore errors. */
513        (void) sigaction(sig, &sa_dfl, NULL);
514    }
515}
516#endif /* VFORK_USABLE */
517
518
519/*
520 * This function is code executed in the child process immediately after
521 * (v)fork to set things up and call exec().
522 *
523 * All of the code in this function must only use async-signal-safe functions,
524 * listed at `man 7 signal` or
525 * http://www.opengroup.org/onlinepubs/009695399/functions/xsh_chap02_04.html.
526 *
527 * This restriction is documented at
528 * http://www.opengroup.org/onlinepubs/009695399/functions/fork.html.
529 *
530 * If this function is called after vfork(), even more care must be taken.
531 * The lack of preparations that C libraries normally take on fork(),
532 * as well as sharing the address space with the parent, might make even
533 * async-signal-safe functions vfork-unsafe. In particular, on Linux,
534 * set*id() and setgroups() library functions must not be called, since
535 * they have to interact with the library-level thread list and send
536 * library-internal signals to implement per-process credentials semantics
537 * required by POSIX but not supported natively on Linux. Another reason to
538 * avoid this family of functions is that sharing an address space between
539 * processes running with different privileges is inherently insecure.
540 * See bpo-35823 for further discussion and references.
541 *
542 * In some C libraries, setrlimit() has the same thread list/signalling
543 * behavior since resource limits were per-thread attributes before
544 * Linux 2.6.10. Musl, as of 1.2.1, is known to have this issue
545 * (https://www.openwall.com/lists/musl/2020/10/15/6).
546 *
547 * If vfork-unsafe functionality is desired after vfork(), consider using
548 * syscall() to obtain it.
549 */
550Py_NO_INLINE static void
551child_exec(char *const exec_array[],
552           char *const argv[],
553           char *const envp[],
554           const char *cwd,
555           int p2cread, int p2cwrite,
556           int c2pread, int c2pwrite,
557           int errread, int errwrite,
558           int errpipe_read, int errpipe_write,
559           int close_fds, int restore_signals,
560           int call_setsid, pid_t pgid_to_set,
561           int call_setgid, gid_t gid,
562           int call_setgroups, size_t groups_size, const gid_t *groups,
563           int call_setuid, uid_t uid, int child_umask,
564           const void *child_sigmask,
565           int *fds_to_keep, Py_ssize_t fds_to_keep_len,
566           PyObject *preexec_fn,
567           PyObject *preexec_fn_args_tuple)
568{
569    int i, saved_errno, reached_preexec = 0;
570    PyObject *result;
571    const char* err_msg = "";
572    /* Buffer large enough to hold a hex integer.  We can't malloc. */
573    char hex_errno[sizeof(saved_errno)*2+1];
574
575    if (make_inheritable(fds_to_keep, fds_to_keep_len, errpipe_write) < 0)
576        goto error;
577
578    /* Close parent's pipe ends. */
579    if (p2cwrite != -1)
580        POSIX_CALL(close(p2cwrite));
581    if (c2pread != -1)
582        POSIX_CALL(close(c2pread));
583    if (errread != -1)
584        POSIX_CALL(close(errread));
585    POSIX_CALL(close(errpipe_read));
586
587    /* When duping fds, if there arises a situation where one of the fds is
588       either 0, 1 or 2, it is possible that it is overwritten (#12607). */
589    if (c2pwrite == 0) {
590        POSIX_CALL(c2pwrite = dup(c2pwrite));
591        /* issue32270 */
592        if (_Py_set_inheritable_async_safe(c2pwrite, 0, NULL) < 0) {
593            goto error;
594        }
595    }
596    while (errwrite == 0 || errwrite == 1) {
597        POSIX_CALL(errwrite = dup(errwrite));
598        /* issue32270 */
599        if (_Py_set_inheritable_async_safe(errwrite, 0, NULL) < 0) {
600            goto error;
601        }
602    }
603
604    /* Dup fds for child.
605       dup2() removes the CLOEXEC flag but we must do it ourselves if dup2()
606       would be a no-op (issue #10806). */
607    if (p2cread == 0) {
608        if (_Py_set_inheritable_async_safe(p2cread, 1, NULL) < 0)
609            goto error;
610    }
611    else if (p2cread != -1)
612        POSIX_CALL(dup2(p2cread, 0));  /* stdin */
613
614    if (c2pwrite == 1) {
615        if (_Py_set_inheritable_async_safe(c2pwrite, 1, NULL) < 0)
616            goto error;
617    }
618    else if (c2pwrite != -1)
619        POSIX_CALL(dup2(c2pwrite, 1));  /* stdout */
620
621    if (errwrite == 2) {
622        if (_Py_set_inheritable_async_safe(errwrite, 1, NULL) < 0)
623            goto error;
624    }
625    else if (errwrite != -1)
626        POSIX_CALL(dup2(errwrite, 2));  /* stderr */
627
628    /* We no longer manually close p2cread, c2pwrite, and errwrite here as
629     * _close_open_fds takes care when it is not already non-inheritable. */
630
631    if (cwd)
632        POSIX_CALL(chdir(cwd));
633
634    if (child_umask >= 0)
635        umask(child_umask);  /* umask() always succeeds. */
636
637    if (restore_signals)
638        _Py_RestoreSignals();
639
640#ifdef VFORK_USABLE
641    if (child_sigmask) {
642        reset_signal_handlers(child_sigmask);
643        if ((errno = pthread_sigmask(SIG_SETMASK, child_sigmask, NULL))) {
644            goto error;
645        }
646    }
647#endif
648
649#ifdef HAVE_SETSID
650    if (call_setsid)
651        POSIX_CALL(setsid());
652#endif
653
654#ifdef HAVE_SETPGID
655    if (pgid_to_set >= 0)
656        POSIX_CALL(setpgid(0, pgid_to_set));
657#endif
658
659#ifdef HAVE_SETGROUPS
660    if (call_setgroups)
661        POSIX_CALL(setgroups(groups_size, groups));
662#endif /* HAVE_SETGROUPS */
663
664#ifdef HAVE_SETREGID
665    if (call_setgid)
666        POSIX_CALL(setregid(gid, gid));
667#endif /* HAVE_SETREGID */
668
669#ifdef HAVE_SETREUID
670    if (call_setuid)
671        POSIX_CALL(setreuid(uid, uid));
672#endif /* HAVE_SETREUID */
673
674
675    reached_preexec = 1;
676    if (preexec_fn != Py_None && preexec_fn_args_tuple) {
677        /* This is where the user has asked us to deadlock their program. */
678        result = PyObject_Call(preexec_fn, preexec_fn_args_tuple, NULL);
679        if (result == NULL) {
680            /* Stringifying the exception or traceback would involve
681             * memory allocation and thus potential for deadlock.
682             * We've already faced potential deadlock by calling back
683             * into Python in the first place, so it probably doesn't
684             * matter but we avoid it to minimize the possibility. */
685            err_msg = "Exception occurred in preexec_fn.";
686            errno = 0;  /* We don't want to report an OSError. */
687            goto error;
688        }
689        /* Py_DECREF(result); - We're about to exec so why bother? */
690    }
691
692    /* close FDs after executing preexec_fn, which might open FDs */
693    if (close_fds) {
694        /* TODO HP-UX could use pstat_getproc() if anyone cares about it. */
695        _close_open_fds(3, fds_to_keep, fds_to_keep_len);
696    }
697
698    /* This loop matches the Lib/os.py _execvpe()'s PATH search when */
699    /* given the executable_list generated by Lib/subprocess.py.     */
700    saved_errno = 0;
701    for (i = 0; exec_array[i] != NULL; ++i) {
702        const char *executable = exec_array[i];
703        if (envp) {
704            execve(executable, argv, envp);
705        } else {
706            execv(executable, argv);
707        }
708        if (errno != ENOENT && errno != ENOTDIR && saved_errno == 0) {
709            saved_errno = errno;
710        }
711    }
712    /* Report the first exec error, not the last. */
713    if (saved_errno)
714        errno = saved_errno;
715
716error:
717    saved_errno = errno;
718    /* Report the posix error to our parent process. */
719    /* We ignore all write() return values as the total size of our writes is
720       less than PIPEBUF and we cannot do anything about an error anyways.
721       Use _Py_write_noraise() to retry write() if it is interrupted by a
722       signal (fails with EINTR). */
723    if (saved_errno) {
724        char *cur;
725        _Py_write_noraise(errpipe_write, "OSError:", 8);
726        cur = hex_errno + sizeof(hex_errno);
727        while (saved_errno != 0 && cur != hex_errno) {
728            *--cur = Py_hexdigits[saved_errno % 16];
729            saved_errno /= 16;
730        }
731        _Py_write_noraise(errpipe_write, cur, hex_errno + sizeof(hex_errno) - cur);
732        _Py_write_noraise(errpipe_write, ":", 1);
733        if (!reached_preexec) {
734            /* Indicate to the parent that the error happened before exec(). */
735            _Py_write_noraise(errpipe_write, "noexec", 6);
736        }
737        /* We can't call strerror(saved_errno).  It is not async signal safe.
738         * The parent process will look the error message up. */
739    } else {
740        _Py_write_noraise(errpipe_write, "SubprocessError:0:", 18);
741        _Py_write_noraise(errpipe_write, err_msg, strlen(err_msg));
742    }
743}
744
745
746/* The main purpose of this wrapper function is to isolate vfork() from both
747 * subprocess_fork_exec() and child_exec(). A child process created via
748 * vfork() executes on the same stack as the parent process while the latter is
749 * suspended, so this function should not be inlined to avoid compiler bugs
750 * that might clobber data needed by the parent later. Additionally,
751 * child_exec() should not be inlined to avoid spurious -Wclobber warnings from
752 * GCC (see bpo-35823).
753 */
754Py_NO_INLINE static pid_t
755do_fork_exec(char *const exec_array[],
756             char *const argv[],
757             char *const envp[],
758             const char *cwd,
759             int p2cread, int p2cwrite,
760             int c2pread, int c2pwrite,
761             int errread, int errwrite,
762             int errpipe_read, int errpipe_write,
763             int close_fds, int restore_signals,
764             int call_setsid, pid_t pgid_to_set,
765             int call_setgid, gid_t gid,
766             int call_setgroups, size_t groups_size, const gid_t *groups,
767             int call_setuid, uid_t uid, int child_umask,
768             const void *child_sigmask,
769             int *fds_to_keep, Py_ssize_t fds_to_keep_len,
770             PyObject *preexec_fn,
771             PyObject *preexec_fn_args_tuple)
772{
773
774    pid_t pid;
775
776#ifdef VFORK_USABLE
777    if (child_sigmask) {
778        /* These are checked by our caller; verify them in debug builds. */
779        assert(!call_setuid);
780        assert(!call_setgid);
781        assert(!call_setgroups);
782        assert(preexec_fn == Py_None);
783
784        pid = vfork();
785        if (pid == -1) {
786            /* If vfork() fails, fall back to using fork(). When it isn't
787             * allowed in a process by the kernel, vfork can return -1
788             * with errno EINVAL. https://bugs.python.org/issue47151. */
789            pid = fork();
790        }
791    } else
792#endif
793    {
794        pid = fork();
795    }
796
797    if (pid != 0) {
798        return pid;
799    }
800
801    /* Child process.
802     * See the comment above child_exec() for restrictions imposed on
803     * the code below.
804     */
805
806    if (preexec_fn != Py_None) {
807        /* We'll be calling back into Python later so we need to do this.
808         * This call may not be async-signal-safe but neither is calling
809         * back into Python.  The user asked us to use hope as a strategy
810         * to avoid deadlock... */
811        PyOS_AfterFork_Child();
812    }
813
814    child_exec(exec_array, argv, envp, cwd,
815               p2cread, p2cwrite, c2pread, c2pwrite,
816               errread, errwrite, errpipe_read, errpipe_write,
817               close_fds, restore_signals, call_setsid, pgid_to_set,
818               call_setgid, gid, call_setgroups, groups_size, groups,
819               call_setuid, uid, child_umask, child_sigmask,
820               fds_to_keep, fds_to_keep_len,
821               preexec_fn, preexec_fn_args_tuple);
822    _exit(255);
823    return 0;  /* Dead code to avoid a potential compiler warning. */
824}
825
826
827static PyObject *
828subprocess_fork_exec(PyObject *module, PyObject *args)
829{
830    PyObject *gc_module = NULL;
831    PyObject *executable_list, *py_fds_to_keep;
832    PyObject *env_list, *preexec_fn;
833    PyObject *process_args, *converted_args = NULL, *fast_args = NULL;
834    PyObject *preexec_fn_args_tuple = NULL;
835    PyObject *groups_list;
836    PyObject *uid_object, *gid_object;
837    int p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite;
838    int errpipe_read, errpipe_write, close_fds, restore_signals;
839    int call_setsid;
840    pid_t pgid_to_set = -1;
841    int call_setgid = 0, call_setgroups = 0, call_setuid = 0;
842    uid_t uid;
843    gid_t gid, *groups = NULL;
844    int child_umask;
845    PyObject *cwd_obj, *cwd_obj2 = NULL;
846    const char *cwd;
847    pid_t pid = -1;
848    int need_to_reenable_gc = 0;
849    char *const *exec_array, *const *argv = NULL, *const *envp = NULL;
850    Py_ssize_t arg_num, num_groups = 0;
851    int need_after_fork = 0;
852    int saved_errno = 0;
853    int allow_vfork;
854    int *c_fds_to_keep = NULL;
855
856    if (!PyArg_ParseTuple(
857            args, "OOpO!OOiiiiiiiiii" _Py_PARSE_PID "OOOiOp:fork_exec",
858            &process_args, &executable_list,
859            &close_fds, &PyTuple_Type, &py_fds_to_keep,
860            &cwd_obj, &env_list,
861            &p2cread, &p2cwrite, &c2pread, &c2pwrite,
862            &errread, &errwrite, &errpipe_read, &errpipe_write,
863            &restore_signals, &call_setsid, &pgid_to_set,
864            &gid_object, &groups_list, &uid_object, &child_umask,
865            &preexec_fn, &allow_vfork))
866        return NULL;
867
868    if ((preexec_fn != Py_None) &&
869            (PyInterpreterState_Get() != PyInterpreterState_Main())) {
870        PyErr_SetString(PyExc_RuntimeError,
871                        "preexec_fn not supported within subinterpreters");
872        return NULL;
873    }
874
875    if (close_fds && errpipe_write < 3) {  /* precondition */
876        PyErr_SetString(PyExc_ValueError, "errpipe_write must be >= 3");
877        return NULL;
878    }
879    if (_sanity_check_python_fd_sequence(py_fds_to_keep)) {
880        PyErr_SetString(PyExc_ValueError, "bad value(s) in fds_to_keep");
881        return NULL;
882    }
883
884    PyInterpreterState *interp = PyInterpreterState_Get();
885    const PyConfig *config = _PyInterpreterState_GetConfig(interp);
886    if (config->_isolated_interpreter) {
887        PyErr_SetString(PyExc_RuntimeError,
888                        "subprocess not supported for isolated subinterpreters");
889        return NULL;
890    }
891
892    /* We need to call gc.disable() when we'll be calling preexec_fn */
893    if (preexec_fn != Py_None) {
894        need_to_reenable_gc = PyGC_Disable();
895    }
896
897    exec_array = _PySequence_BytesToCharpArray(executable_list);
898    if (!exec_array)
899        goto cleanup;
900
901    /* Convert args and env into appropriate arguments for exec() */
902    /* These conversions are done in the parent process to avoid allocating
903       or freeing memory in the child process. */
904    if (process_args != Py_None) {
905        Py_ssize_t num_args;
906        /* Equivalent to:  */
907        /*  tuple(PyUnicode_FSConverter(arg) for arg in process_args)  */
908        fast_args = PySequence_Fast(process_args, "argv must be a tuple");
909        if (fast_args == NULL)
910            goto cleanup;
911        num_args = PySequence_Fast_GET_SIZE(fast_args);
912        converted_args = PyTuple_New(num_args);
913        if (converted_args == NULL)
914            goto cleanup;
915        for (arg_num = 0; arg_num < num_args; ++arg_num) {
916            PyObject *borrowed_arg, *converted_arg;
917            if (PySequence_Fast_GET_SIZE(fast_args) != num_args) {
918                PyErr_SetString(PyExc_RuntimeError, "args changed during iteration");
919                goto cleanup;
920            }
921            borrowed_arg = PySequence_Fast_GET_ITEM(fast_args, arg_num);
922            if (PyUnicode_FSConverter(borrowed_arg, &converted_arg) == 0)
923                goto cleanup;
924            PyTuple_SET_ITEM(converted_args, arg_num, converted_arg);
925        }
926
927        argv = _PySequence_BytesToCharpArray(converted_args);
928        Py_CLEAR(converted_args);
929        Py_CLEAR(fast_args);
930        if (!argv)
931            goto cleanup;
932    }
933
934    if (env_list != Py_None) {
935        envp = _PySequence_BytesToCharpArray(env_list);
936        if (!envp)
937            goto cleanup;
938    }
939
940    if (cwd_obj != Py_None) {
941        if (PyUnicode_FSConverter(cwd_obj, &cwd_obj2) == 0)
942            goto cleanup;
943        cwd = PyBytes_AsString(cwd_obj2);
944    } else {
945        cwd = NULL;
946    }
947
948    if (groups_list != Py_None) {
949#ifdef HAVE_SETGROUPS
950        Py_ssize_t i;
951        gid_t gid;
952
953        if (!PyList_Check(groups_list)) {
954            PyErr_SetString(PyExc_TypeError,
955                    "setgroups argument must be a list");
956            goto cleanup;
957        }
958        num_groups = PySequence_Size(groups_list);
959
960        if (num_groups < 0)
961            goto cleanup;
962
963        if (num_groups > MAX_GROUPS) {
964            PyErr_SetString(PyExc_ValueError, "too many groups");
965            goto cleanup;
966        }
967
968        if ((groups = PyMem_RawMalloc(num_groups * sizeof(gid_t))) == NULL) {
969            PyErr_SetString(PyExc_MemoryError,
970                    "failed to allocate memory for group list");
971            goto cleanup;
972        }
973
974        for (i = 0; i < num_groups; i++) {
975            PyObject *elem;
976            elem = PySequence_GetItem(groups_list, i);
977            if (!elem)
978                goto cleanup;
979            if (!PyLong_Check(elem)) {
980                PyErr_SetString(PyExc_TypeError,
981                                "groups must be integers");
982                Py_DECREF(elem);
983                goto cleanup;
984            } else {
985                if (!_Py_Gid_Converter(elem, &gid)) {
986                    Py_DECREF(elem);
987                    PyErr_SetString(PyExc_ValueError, "invalid group id");
988                    goto cleanup;
989                }
990                groups[i] = gid;
991            }
992            Py_DECREF(elem);
993        }
994        call_setgroups = 1;
995
996#else /* HAVE_SETGROUPS */
997        PyErr_BadInternalCall();
998        goto cleanup;
999#endif /* HAVE_SETGROUPS */
1000    }
1001
1002    if (gid_object != Py_None) {
1003#ifdef HAVE_SETREGID
1004        if (!_Py_Gid_Converter(gid_object, &gid))
1005            goto cleanup;
1006
1007        call_setgid = 1;
1008
1009#else /* HAVE_SETREGID */
1010        PyErr_BadInternalCall();
1011        goto cleanup;
1012#endif /* HAVE_SETREUID */
1013    }
1014
1015    if (uid_object != Py_None) {
1016#ifdef HAVE_SETREUID
1017        if (!_Py_Uid_Converter(uid_object, &uid))
1018            goto cleanup;
1019
1020        call_setuid = 1;
1021
1022#else /* HAVE_SETREUID */
1023        PyErr_BadInternalCall();
1024        goto cleanup;
1025#endif /* HAVE_SETREUID */
1026    }
1027
1028    Py_ssize_t fds_to_keep_len = PyTuple_GET_SIZE(py_fds_to_keep);
1029    c_fds_to_keep = PyMem_Malloc(fds_to_keep_len * sizeof(int));
1030    if (c_fds_to_keep == NULL) {
1031        PyErr_SetString(PyExc_MemoryError, "failed to malloc c_fds_to_keep");
1032        goto cleanup;
1033    }
1034    if (convert_fds_to_keep_to_c(py_fds_to_keep, c_fds_to_keep) < 0) {
1035        goto cleanup;
1036    }
1037
1038    /* This must be the last thing done before fork() because we do not
1039     * want to call PyOS_BeforeFork() if there is any chance of another
1040     * error leading to the cleanup: code without calling fork(). */
1041    if (preexec_fn != Py_None) {
1042        preexec_fn_args_tuple = PyTuple_New(0);
1043        if (!preexec_fn_args_tuple)
1044            goto cleanup;
1045        PyOS_BeforeFork();
1046        need_after_fork = 1;
1047    }
1048
1049    /* NOTE: When old_sigmask is non-NULL, do_fork_exec() may use vfork(). */
1050    const void *old_sigmask = NULL;
1051#ifdef VFORK_USABLE
1052    /* Use vfork() only if it's safe. See the comment above child_exec(). */
1053    sigset_t old_sigs;
1054    if (preexec_fn == Py_None && allow_vfork &&
1055        !call_setuid && !call_setgid && !call_setgroups) {
1056        /* Block all signals to ensure that no signal handlers are run in the
1057         * child process while it shares memory with us. Note that signals
1058         * used internally by C libraries won't be blocked by
1059         * pthread_sigmask(), but signal handlers installed by C libraries
1060         * normally service only signals originating from *within the process*,
1061         * so it should be sufficient to consider any library function that
1062         * might send such a signal to be vfork-unsafe and do not call it in
1063         * the child.
1064         */
1065        sigset_t all_sigs;
1066        sigfillset(&all_sigs);
1067        if ((saved_errno = pthread_sigmask(SIG_BLOCK, &all_sigs, &old_sigs))) {
1068            goto cleanup;
1069        }
1070        old_sigmask = &old_sigs;
1071    }
1072#endif
1073
1074    pid = do_fork_exec(exec_array, argv, envp, cwd,
1075                       p2cread, p2cwrite, c2pread, c2pwrite,
1076                       errread, errwrite, errpipe_read, errpipe_write,
1077                       close_fds, restore_signals, call_setsid, pgid_to_set,
1078                       call_setgid, gid, call_setgroups, num_groups, groups,
1079                       call_setuid, uid, child_umask, old_sigmask,
1080                       c_fds_to_keep, fds_to_keep_len,
1081                       preexec_fn, preexec_fn_args_tuple);
1082
1083    /* Parent (original) process */
1084    if (pid == -1) {
1085        /* Capture errno for the exception. */
1086        saved_errno = errno;
1087    }
1088
1089#ifdef VFORK_USABLE
1090    if (old_sigmask) {
1091        /* vfork() semantics guarantees that the parent is blocked
1092         * until the child performs _exit() or execve(), so it is safe
1093         * to unblock signals once we're here.
1094         * Note that in environments where vfork() is implemented as fork(),
1095         * such as QEMU user-mode emulation, the parent won't be blocked,
1096         * but it won't share the address space with the child,
1097         * so it's still safe to unblock the signals.
1098         *
1099         * We don't handle errors here because this call can't fail
1100         * if valid arguments are given, and because there is no good
1101         * way for the caller to deal with a failure to restore
1102         * the thread signal mask. */
1103        (void) pthread_sigmask(SIG_SETMASK, old_sigmask, NULL);
1104    }
1105#endif
1106
1107    if (need_after_fork)
1108        PyOS_AfterFork_Parent();
1109
1110cleanup:
1111    if (c_fds_to_keep != NULL) {
1112        PyMem_Free(c_fds_to_keep);
1113    }
1114
1115    if (saved_errno != 0) {
1116        errno = saved_errno;
1117        /* We can't call this above as PyOS_AfterFork_Parent() calls back
1118         * into Python code which would see the unreturned error. */
1119        PyErr_SetFromErrno(PyExc_OSError);
1120    }
1121
1122    Py_XDECREF(preexec_fn_args_tuple);
1123    PyMem_RawFree(groups);
1124    Py_XDECREF(cwd_obj2);
1125    if (envp)
1126        _Py_FreeCharPArray(envp);
1127    Py_XDECREF(converted_args);
1128    Py_XDECREF(fast_args);
1129    if (argv)
1130        _Py_FreeCharPArray(argv);
1131    if (exec_array)
1132        _Py_FreeCharPArray(exec_array);
1133
1134    if (need_to_reenable_gc) {
1135        PyGC_Enable();
1136    }
1137    Py_XDECREF(gc_module);
1138
1139    return pid == -1 ? NULL : PyLong_FromPid(pid);
1140}
1141
1142
1143PyDoc_STRVAR(subprocess_fork_exec_doc,
1144"fork_exec(args, executable_list, close_fds, pass_fds, cwd, env,\n\
1145          p2cread, p2cwrite, c2pread, c2pwrite,\n\
1146          errread, errwrite, errpipe_read, errpipe_write,\n\
1147          restore_signals, call_setsid, pgid_to_set,\n\
1148          gid, groups_list, uid,\n\
1149          preexec_fn)\n\
1150\n\
1151Forks a child process, closes parent file descriptors as appropriate in the\n\
1152child and dups the few that are needed before calling exec() in the child\n\
1153process.\n\
1154\n\
1155If close_fds is true, close file descriptors 3 and higher, except those listed\n\
1156in the sorted tuple pass_fds.\n\
1157\n\
1158The preexec_fn, if supplied, will be called immediately before closing file\n\
1159descriptors and exec.\n\
1160WARNING: preexec_fn is NOT SAFE if your application uses threads.\n\
1161         It may trigger infrequent, difficult to debug deadlocks.\n\
1162\n\
1163If an error occurs in the child process before the exec, it is\n\
1164serialized and written to the errpipe_write fd per subprocess.py.\n\
1165\n\
1166Returns: the child process's PID.\n\
1167\n\
1168Raises: Only on an error in the parent process.\n\
1169");
1170
1171/* module level code ********************************************************/
1172
1173PyDoc_STRVAR(module_doc,
1174"A POSIX helper for the subprocess module.");
1175
1176static PyMethodDef module_methods[] = {
1177    {"fork_exec", subprocess_fork_exec, METH_VARARGS, subprocess_fork_exec_doc},
1178    {NULL, NULL}  /* sentinel */
1179};
1180
1181static PyModuleDef_Slot _posixsubprocess_slots[] = {
1182    {0, NULL}
1183};
1184
1185static struct PyModuleDef _posixsubprocessmodule = {
1186        PyModuleDef_HEAD_INIT,
1187        .m_name = "_posixsubprocess",
1188        .m_doc = module_doc,
1189        .m_size = 0,
1190        .m_methods = module_methods,
1191        .m_slots = _posixsubprocess_slots,
1192};
1193
1194PyMODINIT_FUNC
1195PyInit__posixsubprocess(void)
1196{
1197    return PyModuleDef_Init(&_posixsubprocessmodule);
1198}
1199