cosmopolitan/libc/proc/posix_spawn.c
Justine Tunney 791f79fcb3
Make improvements
- We now serialize the file descriptor table when spawning / executing
  processes on Windows. This means you can now inherit more stuff than
  just standard i/o. It's needed by bash, which duplicates the console
  to file descriptor #255. We also now do a better job serializing the
  environment variables, so you're less likely to encounter E2BIG when
  using your bash shell. We also no longer coerce environ to uppercase

- execve() on Windows now remotely controls its parent process to make
  them spawn a replacement for itself. Then it'll be able to terminate
  immediately once the spawn succeeds, without having to linger around
  for the lifetime as a shell process for proxying the exit code. When
  the process worker thread running in the parent sees the child die, it's
  given a handle to the new child, to replace it in the process table.

- execve() and posix_spawn() on Windows will now provide CreateProcess
  an explicit handle list. This allows us to remove handle locks which
  enables better fork/spawn concurrency, with seriously correct thread
  safety. Other codebases like Go use the same technique. On the other
  hand fork() still favors the conventional WIN32 inheritance approach
  which can be a little bit messy, but is *controlled* by guaranteeing
  perfectly clean slates at both the spawning and execution boundaries

- sigset_t is now 64 bits. Having it be 128 bits was a mistake because
  there's no reason to use that and it's only supported by FreeBSD. By
  using the system word size, signal mask manipulation on Windows goes
  very fast. Furthermore @asyncsignalsafe funcs have been rewritten on
  Windows to take advantage of signal masking, now that it's much more
  pleasant to use.

- All the overlapped i/o code on Windows has been rewritten for pretty
  good signal and cancelation safety. We're now able to ensure overlap
  data structures are cleaned up so long as you don't longjmp() out of
  a signal handler that interrupted an i/o operation. Latencies
  are also improved thanks to the removal of lots of "busy wait" code.
  Waits should be optimal for everything except poll(), which shall be
  the last and final demon we slay in the win32 i/o horror show.

- getrusage() on Windows is now able to report RUSAGE_CHILDREN as well
  as RUSAGE_SELF, thanks to aggregation in the process manager thread.
2023-10-08 08:59:53 -07:00

558 lines
18 KiB
C

/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
│vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi│
╞══════════════════════════════════════════════════════════════════════════════╡
│ Copyright 2021 Justine Alexandra Roberts Tunney │
│ │
│ Permission to use, copy, modify, and/or distribute this software for │
│ any purpose with or without fee is hereby granted, provided that the │
│ above copyright notice and this permission notice appear in all copies. │
│ │
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
│ PERFORMANCE OF THIS SOFTWARE. │
╚─────────────────────────────────────────────────────────────────────────────*/
#include "libc/proc/posix_spawn.h"
#include "libc/assert.h"
#include "libc/atomic.h"
#include "libc/calls/calls.h"
#include "libc/calls/internal.h"
#include "libc/calls/state.internal.h"
#include "libc/calls/struct/fd.internal.h"
#include "libc/calls/struct/rlimit.h"
#include "libc/calls/struct/rlimit.internal.h"
#include "libc/calls/struct/rusage.internal.h"
#include "libc/calls/struct/sigaction.h"
#include "libc/calls/struct/sigset.h"
#include "libc/calls/struct/sigset.internal.h"
#include "libc/calls/syscall-sysv.internal.h"
#include "libc/calls/syscall_support-nt.internal.h"
#include "libc/dce.h"
#include "libc/errno.h"
#include "libc/fmt/itoa.h"
#include "libc/fmt/magnumstrs.internal.h"
#include "libc/intrin/asan.internal.h"
#include "libc/intrin/atomic.h"
#include "libc/intrin/describeflags.internal.h"
#include "libc/intrin/dll.h"
#include "libc/intrin/strace.internal.h"
#include "libc/intrin/weaken.h"
#include "libc/mem/alloca.h"
#include "libc/mem/mem.h"
#include "libc/nt/createfile.h"
#include "libc/nt/enum/processcreationflags.h"
#include "libc/nt/enum/startf.h"
#include "libc/nt/files.h"
#include "libc/nt/runtime.h"
#include "libc/nt/struct/processinformation.h"
#include "libc/nt/struct/startupinfo.h"
#include "libc/proc/describefds.internal.h"
#include "libc/proc/ntspawn.h"
#include "libc/proc/posix_spawn.h"
#include "libc/proc/posix_spawn.internal.h"
#include "libc/proc/proc.internal.h"
#include "libc/runtime/runtime.h"
#include "libc/sock/sock.h"
#include "libc/stdio/stdio.h"
#include "libc/stdio/sysparam.h"
#include "libc/str/str.h"
#include "libc/sysv/consts/at.h"
#include "libc/sysv/consts/f.h"
#include "libc/sysv/consts/fd.h"
#include "libc/sysv/consts/limits.h"
#include "libc/sysv/consts/o.h"
#include "libc/sysv/consts/ok.h"
#include "libc/sysv/consts/sig.h"
#include "libc/sysv/errfuns.h"
#include "libc/thread/thread.h"
#include "libc/thread/tls.h"
// Unless SYSDEBUG is defined, the plain names below are remapped onto
// sys_* / __sys_* identifiers for the remainder of this file, so a later
// call such as close() or fcntl() compiles into a call to the prefixed
// function instead.
// NOTE(review): presumably the prefixed functions are the raw system call
// wrappers, and SYSDEBUG keeps the regular (traceable) libc wrappers for
// debugging — confirm against their declarations.
#ifndef SYSDEBUG
#define read sys_read
#define write sys_write
#define close sys_close
#define pipe2 sys_pipe2
#define getgid sys_getgid
#define setgid sys_setgid
#define getuid sys_getuid
#define setuid sys_setuid
#define setsid sys_setsid
#define setpgid sys_setpgid
#define fcntl __sys_fcntl
#define wait4 __sys_wait4
#define openat __sys_openat
#define setrlimit sys_setrlimit
#define sigprocmask sys_sigprocmask
#endif
// Maps a list element back to the struct Closer that embeds it as `elem`.
#define CLOSER_CONTAINER(e) DLL_CONTAINER(struct Closer, elem, e)
// One node in the list of handles that spawnfds_destroy() passes to
// CloseHandle() before freeing the node.
struct Closer {
// handle to be closed during cleanup
int64_t handle;
// linkage into the `closers` list of struct SpawnFds
struct Dll elem;
};
// Private file descriptor table assembled for the process being spawned,
// together with the handles that were created while assembling it.
struct SpawnFds {
// number of entries allocated in `p`
int n;
// entries indexed by file descriptor number; grown by spawnfds_ensure()
struct Fd *p;
// list of struct Closer nodes, drained by spawnfds_destroy()
struct Dll *closers;
};
// Set to true by posix_spawn() once the parent observes a store that was
// made by its vfork() child. While it is false, posix_spawn() creates a
// pipe and reads the child's errno from it rather than from `status`.
static atomic_bool has_vfork; // i.e. not qemu/wsl/xnu/openbsd
/**
 * Returns handle the spawned process should receive for `fd`.
 *
 * @param fds is the private descriptor table being assembled
 * @param fd is the file descriptor number to look up
 * @return handle stored for `fd`, or -1 if `fd` lies outside the table
 *     or __is_cloexec() reports true for its entry
 */
static textwindows int64_t spawnfds_handle(struct SpawnFds *fds, int fd) {
  // the table only extends to the highest descriptor that was copied or
  // created, so it can be shorter than `fd` or even empty (null `p`);
  // treat such descriptors the same as ones that aren't inherited
  if (fd < 0 || fd >= fds->n) return -1;
  if (__is_cloexec(fds->p + fd)) return -1;
  return fds->p[fd].handle;
}
/**
 * Grows descriptor table so that index `fd` is addressable.
 *
 * Newly added entries are zeroed. The table is never shrunk.
 *
 * @return 0 on success, EBADF if `fd` is negative, or ENOMEM if the
 *     table couldn't be reallocated
 */
static textwindows errno_t spawnfds_ensure(struct SpawnFds *fds, int fd) {
  if (fd < 0) {
    return EBADF;
  }
  if (fd >= fds->n) {
    int want = fd + 1;
    struct Fd *grown = realloc(fds->p, want * sizeof(*fds->p));
    if (!grown) {
      return ENOMEM;
    }
    // clear only the tail that didn't exist before
    bzero(grown + fds->n, (want - fds->n) * sizeof(*fds->p));
    fds->p = grown;
    fds->n = want;
  }
  return 0;
}
/**
 * Releases everything owned by `fds`.
 *
 * Each handle queued on the closers list is passed to CloseHandle() and
 * its node is freed; afterwards the descriptor array itself is freed.
 */
static textwindows void spawnfds_destroy(struct SpawnFds *fds) {
  for (;;) {
    struct Dll *head = dll_first(fds->closers);
    if (!head) break;
    struct Closer *node = CLOSER_CONTAINER(head);
    dll_remove(&fds->closers, head);
    CloseHandle(node->handle);
    free(node);
  }
  free(fds->p);
}
/**
 * Queues `handle` to be closed by spawnfds_destroy().
 *
 * @return 0 on success, or ENOMEM if the list node couldn't be allocated
 *     (in which case `handle` is not queued)
 */
static textwindows int spawnfds_closelater(struct SpawnFds *fds,
                                           int64_t handle) {
  struct Closer *node = malloc(sizeof(struct Closer));
  if (node) {
    node->handle = handle;
    dll_init(&node->elem);
    dll_make_last(&fds->closers, &node->elem);
    return 0;
  }
  return ENOMEM;
}
/**
 * Tells whether `fildes` indexes an entry of `fds` whose kind is nonzero.
 *
 * Negative descriptors wrap to large unsigned values and are therefore
 * rejected by the same bounds comparison.
 */
static textwindows bool spawnfds_exists(struct SpawnFds *fds, int fildes) {
  if ((unsigned)fildes >= (unsigned)fds->n) {
    return false;
  }
  return fds->p[fildes].kind != 0;
}
/**
 * Clears entry `fildes` in the private table, if such an entry exists.
 *
 * Only the table entry is zeroed; no handle is closed here.
 */
static textwindows void spawnfds_close(struct SpawnFds *fds, int fildes) {
  if (!spawnfds_exists(fds, fildes)) {
    return;
  }
  struct Fd *slot = fds->p + fildes;
  *slot = (struct Fd){0};
}
/**
 * Applies a dup2() file action to the private descriptor table.
 *
 * The source entry is looked up in `fds` first and in the global `g_fds`
 * table second. Its handle is duplicated as an inheritable handle, which
 * is queued for closing by spawnfds_destroy(), and the resulting entry is
 * stored at `newfildes` with O_CLOEXEC cleared.
 *
 * The target entry is only written once every step has succeeded.
 *
 * @return 0 on success, EBADF if `fildes` isn't open (or `newfildes` is
 *     negative), EMFILE if the handle couldn't be duplicated, or ENOMEM
 */
static textwindows errno_t spawnfds_dup2(struct SpawnFds *fds, int fildes,
                                         int newfildes) {
  errno_t err;
  int64_t hDup;
  struct Fd entry;
  // snapshot the source entry by value, because spawnfds_ensure() below
  // may realloc() fds->p and invalidate any pointer into the old array
  if (spawnfds_exists(fds, fildes)) {
    entry = fds->p[fildes];
  } else if (__isfdopen(fildes)) {
    entry = g_fds.p[fildes];
  } else {
    return EBADF;
  }
  if ((err = spawnfds_ensure(fds, newfildes))) return err;
  if (!DuplicateHandle(GetCurrentProcess(), entry.handle, GetCurrentProcess(),
                       &hDup, 0, true, kNtDuplicateSameAccess)) {
    return EMFILE;
  }
  // if the handle can't be tracked for cleanup, close it now so that it
  // doesn't leak into this process
  if ((err = spawnfds_closelater(fds, hDup))) {
    CloseHandle(hDup);
    return err;
  }
  entry.handle = hDup;
  entry.flags &= ~O_CLOEXEC;
  fds->p[newfildes] = entry;
  return 0;
}
/**
 * Applies an open() file action to the private descriptor table.
 *
 * The path is opened with an inheritable handle, which is queued for
 * closing by spawnfds_destroy() and recorded at index `fildes` as a
 * kFdFile entry carrying `oflag` and `mode`.
 *
 * @return 0 on success, an error from spawnfds_ensure(), ENOMEM if the
 *     handle couldn't be queued, or the errno left by the failing
 *     path-conversion / flag-translation / CreateFile step
 */
static textwindows errno_t spawnfds_open(struct SpawnFds *fds, int fildes,
                                         const char *path, int oflag,
                                         int mode) {
  int64_t h;
  errno_t err;
  char16_t path16[PATH_MAX];
  uint32_t perm, share, disp, attr;
  if ((err = spawnfds_ensure(fds, fildes))) return err;
  if (__mkntpathat(AT_FDCWD, path, 0, path16) == -1) return errno;
  if (GetNtOpenFlags(oflag, mode, &perm, &share, &disp, &attr) == -1) {
    return errno;
  }
  // CreateFile reports failure as -1 (INVALID_HANDLE_VALUE), not zero
  h = CreateFile(path16, perm, share, &kNtIsInheritable, disp, attr, 0);
  if (h == -1) return errno;
  // if the handle can't be tracked for cleanup, close it now so that it
  // doesn't leak into this process
  if ((err = spawnfds_closelater(fds, h))) {
    CloseHandle(h);
    return err;
  }
  fds->p[fildes].kind = kFdFile;
  fds->p[fildes].flags = oflag;
  fds->p[fildes].mode = mode;
  fds->p[fildes].handle = h;
  return 0;
}
/**
 * Implements posix_spawn() for Windows.
 *
 * A private descriptor table is built from the entries of `g_fds` for
 * which __is_cloexec() is false; the user's file actions are applied to
 * that copy, and the result is described to the new process by way of
 * __describe_fds() and ntspawn(). On success the process is registered
 * with __proc_add() and `*pid` is stored if `pid` is non-null.
 *
 * All exits, including the successful one, pass through the `ReturnErr`
 * label, which releases the temporary resources, calls __sig_unblock()
 * with the value obtained from __sig_block(), and restores `errno` to
 * the value it had on entry.
 *
 * @return 0 on success, or an errno value on failure
 */
static textwindows errno_t posix_spawn_nt_impl(
    int *pid, const char *path, const posix_spawn_file_actions_t *file_actions,
    const posix_spawnattr_t *attrp, char *const argv[], char *const envp[]) {

  // signals, locks, and resources
  char *fdspec = 0;
  errno_t e = errno;
  struct Proc *proc = 0;
  struct SpawnFds fds = {0};
  int64_t *lpExplicitHandles = 0;
  uint32_t dwExplicitHandleCount = 0;
  int64_t hCreatorProcess = GetCurrentProcess();
  sigset_t m = __sig_block();

  // reserve process tracking object
  __proc_lock();
  proc = __proc_new();
  __proc_unlock();

  // setup return path
  //
  // `err` must start out as zero: the table-copy loop below doesn't
  // assign it when every descriptor is skipped, yet the file actions
  // loop reads it in its condition
  errno_t err = 0;
  if (!proc) {
    err = ENOMEM;
  ReturnErr:
    __undescribe_fds(hCreatorProcess, lpExplicitHandles, dwExplicitHandleCount);
    free(fdspec);
    if (proc) {
      // only reached with a live `proc` when spawning failed; the
      // success path nulls it after handing it to __proc_add()
      __proc_lock();
      __proc_free(proc);
      __proc_unlock();
    }
    spawnfds_destroy(&fds);
    __sig_unblock(m);
    errno = e;
    return err;
  }

  // fork file descriptor table
  //
  // iterating from the highest descriptor downward lets the first call
  // to spawnfds_ensure() size the array for all remaining entries
  for (int fd = g_fds.n; fd--;) {
    if (__is_cloexec(g_fds.p + fd)) continue;
    if ((err = spawnfds_ensure(&fds, fd))) goto ReturnErr;
    fds.p[fd] = g_fds.p[fd];
  }

  // apply user file actions
  if (file_actions) {
    for (struct _posix_faction *a = *file_actions; a && !err; a = a->next) {
      switch (a->action) {
        case _POSIX_SPAWN_CLOSE:
          spawnfds_close(&fds, a->fildes);
          break;
        case _POSIX_SPAWN_DUP2:
          err = spawnfds_dup2(&fds, a->fildes, a->newfildes);
          if (err) {
            STRACE("spawnfds_dup2(%d, %d) failed", a->fildes, a->newfildes);
            goto ReturnErr;
          }
          break;
        case _POSIX_SPAWN_OPEN:
          err = spawnfds_open(&fds, a->fildes, a->path, a->oflag, a->mode);
          if (err) {
            STRACE("spawnfds_open(%d, %#s) failed", a->fildes, a->path);
            goto ReturnErr;
          }
          break;
        default:
          __builtin_unreachable();
      }
    }
  }

  // figure out flags
  uint32_t dwCreationFlags = 0;
  if (attrp && *attrp) {
    if ((*attrp)->flags & POSIX_SPAWN_SETSID) {
      dwCreationFlags |= kNtDetachedProcess;
    }
    if ((*attrp)->flags & POSIX_SPAWN_SETPGROUP) {
      dwCreationFlags |= kNtCreateNewProcessGroup;
    }
  }

  // create process startinfo
  struct NtStartupInfo startinfo = {
      .cb = sizeof(struct NtStartupInfo),
      .dwFlags = kNtStartfUsestdhandles,
      .hStdInput = spawnfds_handle(&fds, 0),
      .hStdOutput = spawnfds_handle(&fds, 1),
      .hStdError = spawnfds_handle(&fds, 2),
  };

  // launch process
  int rc = -1;
  struct NtProcessInformation procinfo;
  if (!envp) envp = environ;
  if ((fdspec = __describe_fds(fds.p, fds.n, &startinfo, hCreatorProcess,
                               &lpExplicitHandles, &dwExplicitHandleCount))) {
    rc = ntspawn(path, argv, envp, (char *[]){fdspec, 0}, dwCreationFlags, 0, 0,
                 lpExplicitHandles, dwExplicitHandleCount, &startinfo,
                 &procinfo);
  }
  if (rc == -1) {
    err = errno;
    goto ReturnErr;
  }

  // return result
  CloseHandle(procinfo.hThread);
  proc->pid = procinfo.dwProcessId;
  proc->handle = procinfo.hProcess;
  if (pid) *pid = proc->pid;
  __proc_lock();
  __proc_add(proc);
  __proc_unlock();
  proc = 0;  // ownership moved to the process table
  err = 0;
  goto ReturnErr;  // shared cleanup also serves the success path
}
/**
 * Formats the `pid` out-parameter of posix_spawn() for logging.
 *
 * @param buf receives the decimal rendering when one is produced
 * @param err is the result of the spawn; nonzero yields "n/a"
 * @param pid is the caller's out-parameter; null yields "NULL"
 * @return static string, or `buf` holding `*pid` formatted by FormatInt32()
 */
static const char *DescribePid(char buf[12], int err, int *pid) {
  const char *res;
  if (err) {
    res = "n/a";
  } else if (!pid) {
    res = "NULL";
  } else {
    FormatInt32(buf, *pid);
    res = buf;
  }
  return res;
}
/**
 * Validates arguments, spawns via posix_spawn_nt_impl(), and logs result.
 *
 * `path` and `argv` must be non-null. When IsAsan() is true, `path`,
 * `argv`, and (if non-null) `envp` are additionally checked with the
 * __asan_is_valid_*() helpers. A failed check yields EFAULT without
 * attempting to spawn.
 *
 * @return 0 on success, or an errno value on failure
 */
static textwindows dontinline errno_t posix_spawn_nt(
    int *pid, const char *path, const posix_spawn_file_actions_t *file_actions,
    const posix_spawnattr_t *attrp, char *const argv[], char *const envp[]) {
  int err;
  bool bad = !path || !argv;
  if (!bad && IsAsan()) {
    bad = !__asan_is_valid_str(path) ||     //
          !__asan_is_valid_strlist(argv) || //
          (envp && !__asan_is_valid_strlist(envp));
  }
  if (bad) {
    err = EFAULT;
  } else {
    err = posix_spawn_nt_impl(pid, path, file_actions, attrp, argv, envp);
  }
  STRACE("posix_spawn([%s], %#s, %s, %s) → %s",
         DescribePid(alloca(12), err, pid), path, DescribeStringList(argv),
         DescribeStringList(envp), !err ? "0" : _strerrno(err));
  return err;
}
/**
 * Spawns process, the POSIX way, e.g.
 *
 *     int pid, status;
 *     posix_spawnattr_t sa;
 *     posix_spawnattr_init(&sa);
 *     posix_spawnattr_setflags(&sa, POSIX_SPAWN_SETPGROUP);
 *     posix_spawn_file_actions_t fa;
 *     posix_spawn_file_actions_init(&fa);
 *     posix_spawn_file_actions_addopen(&fa, 0, "/dev/null", O_RDWR, 0644);
 *     posix_spawn_file_actions_adddup2(&fa, 0, 1);
 *     posix_spawnp(&pid, "lol", &fa, &sa, (char *[]){"lol", 0}, 0);
 *     posix_spawnp(&pid, "cat", &fa, &sa, (char *[]){"cat", 0}, 0);
 *     posix_spawn_file_actions_destroy(&fa);
 *     posix_spawnattr_destroy(&sa);
 *     while (wait(&status) != -1);
 *
 * This provides superior process creation performance across systems
 *
 * Processes are normally spawned by calling fork() and execve(), but
 * that goes slow on Windows if the caller has allocated a nontrivial
 * number of memory mappings, all of which need to be copied into the
 * forked child, only to be destroyed a moment later. On UNIX systems
 * fork() bears a similar cost that's 100x less bad, which is copying
 * the page tables. So what this implementation does is on Windows it
 * calls CreateProcess() directly and on UNIX it uses vfork() if it's
 * possible (XNU and OpenBSD don't have it). On UNIX this API has the
 * benefit of avoiding the footguns of using vfork() directly because
 * this implementation will ensure signal handlers can't be called in
 * the child process since that'd likely corrupt the parent's memory.
 * On systems with a real vfork() implementation, the execve() status
 * code is returned by this function via shared memory; otherwise, it
 * gets passed via a temporary pipe (on systems like QEmu, Blink, and
 * XNU/OpenBSD) whose support is auto-detected at runtime.
 *
 * @param pid if non-null shall be set to child pid on success
 * @param path is resolved path of program which is not `$PATH` searched
 * @param file_actions specifies close(), dup2(), and open() operations
 * @param attrp specifies signal masks, user ids, scheduling, etc.
 * @param argv is the null-terminated argument vector passed to execve()
 * @param envp is environment variables, or `environ` if null
 * @return 0 on success or error number on failure
 * @raise ETXTBSY if another process has `path` open in write mode
 * @raise ENOEXEC if file is executable but not a valid format
 * @raise ENOMEM if remaining stack memory is insufficient
 * @raise EACCES if execute permission was denied
 * @see posix_spawnp() for `$PATH` searching
 * @returnserrno
 * @tlsrequired
 */
errno_t posix_spawn(int *pid, const char *path,
                    const posix_spawn_file_actions_t *file_actions,
                    const posix_spawnattr_t *attrp, char *const argv[],
                    char *const envp[]) {
  if (IsWindows()) {
    return posix_spawn_nt(pid, path, file_actions, attrp, argv, envp);
  }
  int pfds[2];
  bool use_pipe;
  volatile int status = 0;
  sigset_t blockall, oldmask;
  int child, res, cs, e = errno;
  volatile bool can_clobber = false;

  // block every signal and disable cancelation for the duration, so the
  // child can't run handlers while it may be sharing our memory
  sigfillset(&blockall);
  sigprocmask(SIG_SETMASK, &blockall, &oldmask);
  pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &cs);

  // until vfork() memory sharing has been observed, report the child's
  // errno through a close-on-exec pipe
  if ((use_pipe = !atomic_load_explicit(&has_vfork, memory_order_acquire))) {
    if (pipe2(pfds, O_CLOEXEC)) {
      res = errno;
      goto ParentFailed;
    }
  }

  if (!(child = vfork())) {
    // child process
    can_clobber = true;  // visible to the parent only if memory is shared
    sigset_t *childmask;
    bool lost_cloexec = 0;
    struct sigaction dfl = {0};
    short flags = attrp && *attrp ? (*attrp)->flags : 0;
    if (use_pipe) close(pfds[0]);

    // reset caught signals to their default disposition; ignored ones
    // are left alone unless the caller listed them in sigdefault
    for (int sig = 1; sig < _NSIG; sig++) {
      if (__sighandrvas[sig] != (long)SIG_DFL &&
          (__sighandrvas[sig] != (long)SIG_IGN ||
           ((flags & POSIX_SPAWN_SETSIGDEF) &&
            sigismember(&(*attrp)->sigdefault, sig) == 1))) {
        sigaction(sig, &dfl, 0);
      }
    }

    if (flags & POSIX_SPAWN_SETSID) {
      setsid();
    }
    if ((flags & POSIX_SPAWN_SETPGROUP) && setpgid(0, (*attrp)->pgroup)) {
      goto ChildFailed;
    }
    if ((flags & POSIX_SPAWN_RESETIDS) && setgid(getgid())) {
      goto ChildFailed;
    }
    if ((flags & POSIX_SPAWN_RESETIDS) && setuid(getuid())) {
      goto ChildFailed;
    }

    if (file_actions) {
      struct _posix_faction *a;
      for (a = *file_actions; a; a = a->next) {
        // move our status pipe out of the way when this action is about
        // to operate on its descriptor number; for dup2() the descriptor
        // that gets replaced is `newfildes`, so check that one as well
        if (use_pipe &&
            (pfds[1] == a->fildes ||
             (a->action == _POSIX_SPAWN_DUP2 && pfds[1] == a->newfildes))) {
          int p2;
          if ((p2 = dup(pfds[1])) == -1) {
            goto ChildFailed;
          }
          lost_cloexec = true;  // dup() result needs FD_CLOEXEC re-applied
          close(pfds[1]);
          pfds[1] = p2;
        }
        switch (a->action) {
          case _POSIX_SPAWN_CLOSE:
            if (close(a->fildes)) {
              goto ChildFailed;
            }
            break;
          case _POSIX_SPAWN_DUP2:
            if (dup2(a->fildes, a->newfildes) == -1) {
              goto ChildFailed;
            }
            break;
          case _POSIX_SPAWN_OPEN: {
            int t;
            if ((t = openat(AT_FDCWD, a->path, a->oflag, a->mode)) == -1) {
              goto ChildFailed;
            }
            if (t != a->fildes) {
              if (dup2(t, a->fildes) == -1) {
                close(t);
                goto ChildFailed;
              }
              if (close(t)) {
                goto ChildFailed;
              }
            }
            break;
          }
          default:
            __builtin_unreachable();
        }
      }
    }

    if (IsLinux() || IsFreebsd() || IsNetbsd()) {
      if (flags & POSIX_SPAWN_SETSCHEDULER) {
        if (sched_setscheduler(0, (*attrp)->schedpolicy,
                               &(*attrp)->schedparam) == -1) {
          goto ChildFailed;
        }
      }
      if (flags & POSIX_SPAWN_SETSCHEDPARAM) {
        if (sched_setparam(0, &(*attrp)->schedparam)) {
          goto ChildFailed;
        }
      }
    }

    if (flags & POSIX_SPAWN_SETRLIMIT) {
      // strictly less-than: index ARRAYLEN would be one past the array
      for (int rez = 0; rez < ARRAYLEN((*attrp)->rlim); ++rez) {
        if ((*attrp)->rlimset & (1u << rez)) {
          if (setrlimit(rez, (*attrp)->rlim + rez)) {
            goto ChildFailed;
          }
        }
      }
    }

    if (lost_cloexec) {
      fcntl(pfds[1], F_SETFD, FD_CLOEXEC);
    }
    if (flags & POSIX_SPAWN_SETSIGMASK) {
      childmask = &(*attrp)->sigmask;
    } else {
      childmask = &oldmask;
    }
    sigprocmask(SIG_SETMASK, childmask, 0);
    if (!envp) envp = environ;
    execve(path, argv, envp);

  ChildFailed:
    // hand errno to the parent, via shared memory or via the pipe
    res = errno;
    if (!use_pipe) {
      status = res;
    } else {
      write(pfds[1], &res, sizeof(res));
    }
    _Exit(127);
  }

  // parent process
  if (use_pipe) {
    close(pfds[1]);
  }
  if (child != -1) {
    if (!use_pipe) {
      res = status;
    } else {
      if (can_clobber) {
        // the child's store reached us, so vfork() shares memory here
        // and later calls can skip the pipe
        atomic_store_explicit(&has_vfork, true, memory_order_release);
      }
      // nothing is read when the pipe was closed by a successful exec,
      // which leaves res at zero
      res = 0;
      read(pfds[0], &res, sizeof(res));
    }
    if (!res) {
      if (pid) *pid = child;
    } else {
      // the child failed before exec; reap it so no zombie remains
      wait4(child, 0, 0, 0);
    }
  } else {
    res = errno;
  }
  if (use_pipe) {
    close(pfds[0]);
  }

ParentFailed:
  sigprocmask(SIG_SETMASK, &oldmask, 0);
  pthread_setcancelstate(cs, 0);
  errno = e;
  return res;
}