cosmopolitan/libc/runtime/mmap.c
Justine Tunney 791f79fcb3
Make improvements
- We now serialize the file descriptor table when spawning / executing
  processes on Windows. This means you can now inherit more stuff than
  just standard i/o. It's needed by bash, which duplicates the console
  to file descriptor #255. We also now do a better job serializing the
  environment variables, so you're less likely to encounter E2BIG when
  using your bash shell. We also no longer coerce environ to uppercase

- execve() on Windows now remotely controls its parent process to make
  them spawn a replacement for itself. Then it'll be able to terminate
  immediately once the spawn succeeds, without having to linger around
  for the lifetime as a shell process for proxying the exit code. When
  process worker thread running in the parent sees the child die, it's
  given a handle to the new child, to replace it in the process table.

- execve() and posix_spawn() on Windows will now provide CreateProcess
  an explicit handle list. This allows us to remove handle locks which
  enables better fork/spawn concurrency, with seriously correct thread
  safety. Other codebases like Go use the same technique. On the other
  hand fork() still favors the conventional WIN32 inheritence approach
  which can be a little bit messy, but is *controlled* by guaranteeing
  perfectly clean slates at both the spawning and execution boundaries

- sigset_t is now 64 bits. Having it be 128 bits was a mistake because
  there's no reason to use that and it's only supported by FreeBSD. By
  using the system word size, signal mask manipulation on Windows goes
  very fast. Furthermore @asyncsignalsafe funcs have been rewritten on
  Windows to take advantage of signal masking, now that it's much more
  pleasant to use.

- All the overlapped i/o code on Windows has been rewritten for pretty
  good signal and cancelation safety. We're now able to ensure overlap
  data structures are cleaned up so long as you don't longjmp() out of
  out of a signal handler that interrupted an i/o operation. Latencies
  are also improved thanks to the removal of lots of "busy wait" code.
  Waits should be optimal for everything except poll(), which shall be
  the last and final demon we slay in the win32 i/o horror show.

- getrusage() on Windows is now able to report RUSAGE_CHILDREN as well
  as RUSAGE_SELF, thanks to aggregation in the process manager thread.
2023-10-08 08:59:53 -07:00

497 lines
18 KiB
C

/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
│vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi│
╞══════════════════════════════════════════════════════════════════════════════╡
│ Copyright 2020 Justine Alexandra Roberts Tunney │
│ │
│ Permission to use, copy, modify, and/or distribute this software for │
│ any purpose with or without fee is hereby granted, provided that the │
│ above copyright notice and this permission notice appear in all copies. │
│ │
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
│ PERFORMANCE OF THIS SOFTWARE. │
╚─────────────────────────────────────────────────────────────────────────────*/
#include "libc/assert.h"
#include "libc/calls/calls.h"
#include "libc/calls/internal.h"
#include "libc/calls/struct/sigset.h"
#include "libc/calls/syscall-sysv.internal.h"
#include "libc/dce.h"
#include "libc/errno.h"
#include "libc/intrin/asan.internal.h"
#include "libc/intrin/asancodes.h"
#include "libc/intrin/bits.h"
#include "libc/intrin/bsr.h"
#include "libc/intrin/describeflags.internal.h"
#include "libc/intrin/directmap.internal.h"
#include "libc/intrin/kprintf.h"
#include "libc/intrin/likely.h"
#include "libc/intrin/safemacros.internal.h"
#include "libc/intrin/strace.internal.h"
#include "libc/intrin/weaken.h"
#include "libc/limits.h"
#include "libc/log/backtrace.internal.h"
#include "libc/log/libfatal.internal.h"
#include "libc/log/log.h"
#include "libc/macros.internal.h"
#include "libc/nt/enum/memflags.h"
#include "libc/nt/enum/pageflags.h"
#include "libc/nt/memory.h"
#include "libc/nt/process.h"
#include "libc/nt/runtime.h"
#include "libc/nt/struct/processmemorycounters.h"
#include "libc/runtime/internal.h"
#include "libc/runtime/memtrack.internal.h"
#include "libc/runtime/runtime.h"
#include "libc/runtime/zipos.internal.h"
#include "libc/stdckdint.h"
#include "libc/stdio/rand.h"
#include "libc/str/str.h"
#include "libc/sysv/consts/auxv.h"
#include "libc/sysv/consts/map.h"
#include "libc/sysv/consts/o.h"
#include "libc/sysv/consts/prot.h"
#include "libc/sysv/consts/sig.h"
#include "libc/sysv/consts/ss.h"
#include "libc/sysv/errfuns.h"
#include "libc/thread/thread.h"
#define MAP_ANONYMOUS_linux 0x00000020
#define MAP_ANONYMOUS_openbsd 0x00001000
#define MAP_GROWSDOWN_linux 0x00000100
#define MAP_STACK_freebsd 0x00000400
#define MAP_STACK_openbsd 0x00004000
#define IP(X) (intptr_t)(X)
#define VIP(X) (void *)IP(X)
#define ALIGNED(p) (!(IP(p) & (FRAMESIZE - 1)))
#define SHADE(x) (((intptr_t)(x) >> 3) + 0x7fff8000)
#define FRAME(x) ((int)((intptr_t)(x) >> 16))
static inline pureconst unsigned long __rounddown2pow(unsigned long x) {
return x ? 1ul << _bsrl(x) : 0;
}
static wontreturn void __mmap_die(const char *s) {
if (_weaken(__die)) _weaken(__die)();
STRACE("%s %m", s);
_Exit(199);
}
static inline bool __overlaps_existing_mapping(char *p, size_t n) {
int a, b, i;
unassert(n > 0);
a = FRAME(p);
b = FRAME(p + (n - 1));
i = __find_memory(&_mmi, a);
if (i < _mmi.i) {
if (a <= _mmi.p[i].x && _mmi.p[i].x <= b) return true;
if (a <= _mmi.p[i].y && _mmi.p[i].y <= b) return true;
if (_mmi.p[i].x <= a && b <= _mmi.p[i].y) return true;
}
return false;
}
static bool __choose_memory(int x, int n, int align, int *res) {
// TODO: improve performance
int i, start, end;
unassert(align > 0);
if (_mmi.i) {
// find the start of the automap memory region
i = __find_memory(&_mmi, x);
if (i < _mmi.i) {
// check to see if there's space available before the first entry
if (!ckd_add(&start, x, align - 1)) {
start &= -align;
if (!ckd_add(&end, start, n - 1)) {
if (end < _mmi.p[i].x) {
*res = start;
return true;
}
}
}
// check to see if there's space available between two entries
while (++i < _mmi.i) {
if (!ckd_add(&start, _mmi.p[i - 1].y, 1) &&
!ckd_add(&start, start, align - 1)) {
start &= -align;
if (!ckd_add(&end, start, n - 1)) {
if (end < _mmi.p[i].x) {
*res = start;
return true;
}
}
}
}
}
// otherwise append after the last entry if space is available
if (!ckd_add(&start, _mmi.p[i - 1].y, 1) &&
!ckd_add(&start, start, align - 1)) {
start &= -align;
if (!ckd_add(&end, start, n - 1)) {
*res = start;
return true;
}
}
} else {
// if memtrack is empty, then just assign the requested address
// assuming it doesn't overflow
if (!ckd_add(&start, x, align - 1)) {
start &= -align;
if (!ckd_add(&end, start, n - 1)) {
*res = start;
return true;
}
}
}
return false;
}
static bool __auto_map(int count, int align, int *res) {
return __choose_memory(FRAME(kAutomapStart), count, align, res) &&
*res + count <= FRAME(kAutomapStart + (kAutomapSize - 1));
}
static void *__finish_memory(void *addr, size_t size, int prot, int flags,
int fd, int64_t off, int f, int x, int n,
struct DirectMap dm) {
if (!IsWindows() && (flags & MAP_FIXED)) {
if (__untrack_memories(addr, size)) {
__mmap_die("FIXED UNTRACK FAILED");
}
}
if (__track_memory(&_mmi, x, x + (n - 1), dm.maphandle, prot, flags, false,
false, off, size)) {
if (sys_munmap(addr, n) == -1) {
__mmap_die("TRACK MUNMAP FAILED");
}
return MAP_FAILED;
}
if (_weaken(__asan_map_shadow) && !OverlapsShadowSpace(addr, size)) {
_weaken(__asan_map_shadow)((intptr_t)addr, size);
}
return addr;
}
static void *__map_memory(void *addr, size_t size, int prot, int flags, int fd,
int64_t off, int f, int x, int n) {
struct DirectMap dm;
dm = sys_mmap(addr, size, prot, f, fd, off);
if (VERY_UNLIKELY(dm.addr == MAP_FAILED)) {
if (IsWindows() && (flags & MAP_FIXED)) {
__mmap_die("can't recover from MAP_FIXED errors on Windows");
}
return MAP_FAILED;
}
if (VERY_UNLIKELY(dm.addr != addr)) {
__mmap_die("KERNEL DIDN'T RESPECT MAP_FIXED");
}
return __finish_memory(addr, size, prot, flags, fd, off, f, x, n, dm);
}
/**
* Maps memory from system, one frame at a time.
*
* This is useful on Windows since it allows us to partially unmap or
* punch holes into existing mappings.
*/
static textwindows dontinline void *__map_memories(char *addr, size_t size,
int prot, int flags, int fd,
int64_t off, int f, int x,
int n) {
size_t i, m;
int64_t oi, sz;
struct DirectMap dm;
bool iscow, readonlyfile;
m = (size_t)(n - 1) << 16;
unassert(m < size);
unassert(m + FRAMESIZE >= size);
oi = fd == -1 ? 0 : off + m;
sz = size - m;
dm = sys_mmap(addr + m, sz, prot, f, fd, oi);
if (dm.addr == MAP_FAILED) return MAP_FAILED;
iscow = (flags & MAP_TYPE) != MAP_SHARED && fd != -1;
readonlyfile = (flags & MAP_TYPE) == MAP_SHARED && fd != -1 &&
(g_fds.p[fd].flags & O_ACCMODE) == O_RDONLY;
if (__track_memory(&_mmi, x + (n - 1), x + (n - 1), dm.maphandle, prot, flags,
readonlyfile, iscow, oi, sz) == -1) {
__mmap_die("__map_memories unrecoverable #1");
}
for (i = 0; i < m; i += FRAMESIZE) {
oi = fd == -1 ? 0 : off + i;
sz = FRAMESIZE;
dm = sys_mmap(addr + i, sz, prot, f, fd, oi);
if (dm.addr == MAP_FAILED ||
__track_memory(&_mmi, x + i / FRAMESIZE, x + i / FRAMESIZE,
dm.maphandle, prot, flags, readonlyfile, iscow, oi,
sz) == -1) {
__mmap_die("__map_memories unrecoverable #2");
}
}
if (_weaken(__asan_map_shadow) && !OverlapsShadowSpace(addr, size)) {
_weaken(__asan_map_shadow)((intptr_t)addr, size);
}
return addr;
}
inline void *__mmap_unlocked(void *addr, size_t size, int prot, int flags,
int fd, int64_t off) {
char *p = addr;
struct DirectMap dm;
size_t requested_size;
bool needguard, clashes;
int a, f, n, x, pagesize;
size_t virtualused, virtualneed;
if (VERY_UNLIKELY(!size)) {
STRACE("can't mmap zero bytes");
return VIP(einval());
}
if (VERY_UNLIKELY(!ALIGNED(p))) {
STRACE("cosmo mmap is 64kb aligned");
return VIP(einval());
}
if (VERY_UNLIKELY(!IsLegalSize(size))) {
STRACE("mmap size isn't legal");
return VIP(einval());
}
if (VERY_UNLIKELY(!IsLegalPointer(p))) {
STRACE("mmap addr isn't 48-bit");
return VIP(einval());
}
if ((flags & (MAP_SHARED | MAP_PRIVATE)) == (MAP_SHARED | MAP_PRIVATE)) {
flags = MAP_SHARED; // cf. MAP_SHARED_VALIDATE
}
requested_size = size;
pagesize = getauxval(AT_PAGESZ);
if (flags & MAP_ANONYMOUS) {
fd = -1;
off = 0;
size = ROUNDUP(size, FRAMESIZE);
if ((flags & MAP_TYPE) == MAP_FILE) {
STRACE("need MAP_PRIVATE or MAP_SHARED");
return VIP(einval());
}
} else if (__isfdkind(fd, kFdZip)) {
STRACE("mmap fd is zipos handle");
return VIP(einval());
} else if (VERY_UNLIKELY(off < 0)) {
STRACE("mmap negative offset");
return VIP(einval());
} else if (off & ((IsWindows() ? FRAMESIZE : pagesize) - 1)) {
STRACE("mmap offset isn't properly aligned");
return VIP(einval());
} else if (VERY_UNLIKELY(INT64_MAX - size < off)) {
STRACE("mmap too large");
return VIP(einval());
}
if (__virtualmax < LONG_MAX &&
(ckd_add(&virtualneed, (virtualused = __get_memtrack_size(&_mmi)),
size) ||
virtualneed > __virtualmax)) {
STRACE("mmap %'zu size + %'zu inuse exceeds virtual memory limit %'zu",
size, virtualused, __virtualmax);
return VIP(enomem());
}
clashes = OverlapsImageSpace(p, size) || __overlaps_existing_mapping(p, size);
if ((flags & MAP_FIXED_NOREPLACE) == MAP_FIXED_NOREPLACE && clashes) {
STRACE("mmap noreplace overlaps existing");
return VIP(eexist());
}
if (ckd_add(&n, (int)(size >> 16), (int)!!(size & (FRAMESIZE - 1)))) {
STRACE("mmap range overflows");
return VIP(einval());
}
a = MAX(1, __rounddown2pow(size) >> 16);
f = (flags & ~MAP_FIXED_NOREPLACE) | MAP_FIXED;
if (flags & MAP_FIXED) {
x = FRAME(p);
if (IsWindows()) {
if (__untrack_memories(p, size)) {
__mmap_die("FIXED UNTRACK FAILED");
}
}
} else if (p && !clashes && !OverlapsShadowSpace(p, size)) {
x = FRAME(p);
} else if (!__auto_map(n, a, &x)) {
STRACE("automap has no room for %d frames with %d alignment", n, a);
return VIP(enomem());
}
needguard = false;
p = (char *)ADDR_32_TO_48(x);
if ((f & MAP_TYPE) == MAP_STACK) {
if (~f & MAP_ANONYMOUS) {
STRACE("MAP_STACK must be anonymous");
return VIP(einval());
}
f &= ~MAP_TYPE;
f |= MAP_PRIVATE;
if (IsOpenbsd()) { // openbsd:dubstack
// on openbsd this is less about scalability of threads, and more
// about defining the legal intervals for the RSP register. sadly
// openbsd doesn't let us create a new fixed stack mapping. but..
// openbsd does allow us to overwrite existing fixed mappings, to
// authorize its usage as a stack.
if (sys_mmap(p, size, prot, f, fd, off).addr == MAP_FAILED) {
return MAP_FAILED;
}
f |= MAP_STACK_openbsd;
needguard = true;
} else if (IsLinux()) {
// make sure there's no existing stuff existing between our stack
// starting page and the bottom guard page, since that would stop
// our stack page from growing down.
npassert(!sys_munmap(p, size));
int guardsize = pagesize, e = errno;
if ((dm = sys_mmap(p + size - guardsize, guardsize, prot,
f | MAP_GROWSDOWN_linux, fd, off))
.addr != MAP_FAILED) {
npassert(sys_mmap(p, pagesize, PROT_NONE,
MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS, -1, 0)
.addr == p);
dm.addr = p;
p = __finish_memory(p, size, prot, flags, fd, off, f, x, n, dm);
if (IsAsan() && p != MAP_FAILED) {
__asan_poison(p, pagesize, kAsanStackOverflow);
}
return p;
} else if (errno == ENOTSUP) {
// WSL and Blink don't support MAP_GROWSDOWN
needguard = true;
errno = e;
}
} else {
if (IsFreebsd()) {
f |= MAP_STACK_freebsd;
}
needguard = true;
}
}
if (!IsWindows()) {
p = __map_memory(p, size, prot, flags, fd, off, f, x, n);
} else {
p = __map_memories(p, size, prot, flags, fd, off, f, x, n);
}
if (p != MAP_FAILED) {
if (IsAsan()) {
__asan_poison(p + requested_size, size - requested_size,
kAsanMmapSizeOverrun);
}
if (needguard) {
if (!IsWindows()) {
unassert(!mprotect(p, pagesize, PROT_NONE));
if (IsAsan()) {
__asan_poison(p, pagesize, kAsanStackOverflow);
}
} else {
uint32_t oldattr;
unassert(VirtualProtect(p, pagesize, kNtPageReadwrite | kNtPageGuard,
&oldattr));
}
}
}
return p;
}
/**
* Creates virtual memory, e.g.
*
* char *m;
* m = mmap(NULL, FRAMESIZE, PROT_READ | PROT_WRITE,
* MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
* munmap(m, FRAMESIZE);
*
* @param addr should be 0 to let your memory manager choose address;
* unless MAP_FIXED or MAP_FIXED_NOREPLACE are specified in flags
* in which case this function will do precicely as you ask, even
* if p=0 (in which you need -fno-delete-null-pointer-checks); it
* needs to be 64kb aligned because it's a wise choice that sadly
* needs to be made mandatory because of Windows although you can
* use __sys_mmap() to circumvent it on System Five in which case
* runtime support services, e.g. asan memory safety, could break
* @param size must be >0 otherwise EINVAL is raised
* @param prot can have PROT_READ/PROT_WRITE/PROT_EXEC/PROT_NONE/etc.
* @param flags should have one of the following masked by `MAP_TYPE`
* - `MAP_FILE` in which case `MAP_ANONYMOUS` shouldn't be used
* - `MAP_PRIVATE` for copy-on-write behavior of writeable pages
* - `MAP_SHARED` to create shared memory between processes
* - `MAP_STACK` to create a grows-down alloc, where a guard page
* is automatically protected at the bottom, sized as AT_PAGESZ
* Your `flags` may optionally bitwise or any of the following:
* - `MAP_ANONYMOUS` in which case `fd` and `off` are ignored
* - `MAP_FIXED` in which case `addr` becomes more than a hint
* - `MAP_FIXED_NOREPLACE` to protect existing mappings; this is
* always polyfilled by mmap() which tracks its own memory and
* removed before passing to the kernel, in order to support
* old versions; if you believe mappings exist which only the
* kernel knows, then this flag may be passed to sys_mmap() on
* Linux 4.17+ and FreeBSD (where it has multiple bits)
* - `MAP_CONCEAL` is FreeBSD/NetBSD/OpenBSD-only
* - `MAP_NORESERVE` is Linux/XNU/NetBSD-only
* - `MAP_POPULATE` is Linux/FreeBSD-only
* - `MAP_NONBLOCK` is Linux-only
* - `MAP_NOSYNC` is FreeBSD-only
* - `MAP_INHERIT` is NetBSD-only
* - `MAP_LOCKED` is Linux-only
* @param fd is an open()'d file descriptor, whose contents shall be
* made available w/ automatic reading at the chosen address
* @param off specifies absolute byte index of fd's file for mapping,
* should be zero if MAP_ANONYMOUS is specified, which SHOULD be
* aligned to FRAMESIZE
* @return virtual base address of new mapping, or MAP_FAILED w/ errno
*/
void *mmap(void *addr, size_t size, int prot, int flags, int fd, int64_t off) {
void *res;
#ifdef SYSDEBUG
size_t toto = 0;
#if _KERNTRACE || _NTTRACE
if (IsWindows()) {
STRACE("mmap(%p, %'zu, %s, %s, %d, %'ld) → ...", addr, size,
DescribeProtFlags(prot), DescribeMapFlags(flags), fd, off);
}
#endif
#endif
__mmi_lock();
if (!__isfdkind(fd, kFdZip)) {
res = __mmap_unlocked(addr, size, prot, flags, fd, off);
} else {
res = _weaken(__zipos_mmap)(
addr, size, prot, flags,
(struct ZiposHandle *)(intptr_t)g_fds.p[fd].handle, off);
}
#if SYSDEBUG
toto = __strace > 0 ? __get_memtrack_size(&_mmi) : 0;
#endif
__mmi_unlock();
STRACE("mmap(%p, %'zu, %s, %s, %d, %'ld) → %p% m (%'zu bytes total)", addr,
size, DescribeProtFlags(prot), DescribeMapFlags(flags), fd, off, res,
toto);
return res;
}
__strong_reference(mmap, mmap64);