Make mmap() scalable

It's now possible to create thousands upon thousands of sparse independent
memory mappings without any slowdown. The memory manager is now better at
tracking memory protection, particularly on Windows, where protections are
recorded precisely enough to be restored during fork(). You now have the
highest quality memory manager possible. It's even better than some OSes
like XNU, where mmap() is implemented as an O(n) operation, which sadly
means things aren't much improved over there. With this change, the
llamafile HTTP server endpoint at /tokenize, given a 50-token prompt, can
now handle 2.6 million requests per second.
Justine Tunney 2024-07-05 23:13:20 -07:00
parent 3756870635
commit 8c645fa1ee
59 changed files with 1238 additions and 1067 deletions
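The scalability claim is straightforward to exercise with a micro-benchmark. The sketch below is not part of this commit; the mapping count and the 4096-byte length are arbitrary assumptions, and the host kernel may coalesce adjacent anonymous mappings, so it measures mmap() call throughput rather than kernel VMA count. If per-call bookkeeping were O(n), as the message notes for XNU, total time would grow quadratically with N; with cheaper tracking it stays near-linear.

// hypothetical micro-benchmark (not from this commit): time N independent
// anonymous mappings; N and the page-sized length are illustrative
#include <stdio.h>
#include <sys/mman.h>
#include <time.h>

#define N 100000

int main(void) {
  static void *p[N];  // static so the pointer table doesn't live on the stack
  struct timespec t0, t1;
  clock_gettime(CLOCK_MONOTONIC, &t0);
  for (int i = 0; i < N; ++i) {
    // every call adds one more entry the memory manager must track
    p[i] = mmap(0, 4096, PROT_READ | PROT_WRITE,
                MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (p[i] == MAP_FAILED) {
      perror("mmap");
      return 1;
    }
  }
  clock_gettime(CLOCK_MONOTONIC, &t1);
  double secs = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) * 1e-9;
  printf("%d mappings in %.3fs (%.0f mmap calls/sec)\n", N, secs, N / secs);
  for (int i = 0; i < N; ++i)
    munmap(p[i], 4096);
  return 0;
}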

@@ -30,6 +30,7 @@
#include "libc/intrin/kprintf.h"
#include "libc/intrin/maps.h"
#include "libc/intrin/strace.internal.h"
#include "libc/intrin/tree.h"
#include "libc/intrin/weaken.h"
#include "libc/macros.internal.h"
#include "libc/nt/createfile.h"
@@ -95,7 +96,7 @@ static inline textwindows ssize_t ForkIo(int64_t h, char *p, size_t n,
size_t i;
uint32_t x;
for (i = 0; i < n; i += x) {
if (!f(h, p + i, n - i, &x, NULL))
if (!f(h, p + i, n - i, &x, 0))
return __winerr();
if (!x)
break;
@@ -109,10 +110,11 @@ static dontinline textwindows ssize_t ForkIo2(
const char *sf, bool ischild) {
ssize_t rc = ForkIo(h, buf, n, fn);
if (ischild) {
__tls_enabled_set(false); // prevent tls crash in kprintf
// prevent crashes
__tls_enabled_set(false);
__pid = __imp_GetCurrentProcessId();
__klog_handle = 0;
__maps.used = 0;
__maps.maps = 0;
}
NTTRACE("%s(%ld, %p, %'zu) → %'zd% m", sf, h, buf, n, rc);
return rc;
@@ -121,9 +123,11 @@ static dontinline textwindows ssize_t ForkIo2(
static dontinline textwindows bool WriteAll(int64_t h, void *buf, size_t n) {
bool ok;
ok = ForkIo2(h, buf, n, (void *)WriteFile, "WriteFile", false) != -1;
if (!ok)
AbortFork("WriteAll");
// Sleep(10);
if (!ok) {
STRACE("fork() failed in parent due to WriteAll(%ld, %p, %'zu) → %u", h,
buf, n, GetLastError());
__print_maps(0);
}
return ok;
}
@@ -185,30 +189,6 @@ static textwindows void *Malloc(size_t size) {
return HeapAlloc(GetProcessHeap(), 0, size);
}
static textwindows void Free(void *addr) {
HeapFree(GetProcessHeap(), 0, addr);
}
static int CountMaps(struct Dll *maps) {
int count = 0;
for (struct Dll *e = dll_first(maps); e; e = dll_next(maps, e))
++count;
return count;
}
static struct Map **SortMaps(struct Dll *maps, int count) {
int j, i = 0;
struct Map **sorted = Malloc(count * sizeof(struct Map *));
for (struct Dll *e = dll_first(maps); e; e = dll_next(maps, e)) {
struct Map *map = MAP_CONTAINER(e);
for (j = i; j > 0 && sorted[j - 1]->addr > map->addr; --j)
sorted[j] = sorted[j - 1];
sorted[j] = map;
++i;
}
return sorted;
}
textwindows void WinMainForked(void) {
jmp_buf jb;
int64_t reader;
@@ -233,35 +213,30 @@ textwindows void WinMainForked(void) {
ReadOrDie(reader, jb, sizeof(jb));
// read memory mappings from parent process
int n = 0;
struct Dll *maps = 0;
struct Tree *maps = 0;
for (;;) {
struct Map *map = Malloc(sizeof(struct Map));
ReadOrDie(reader, map, sizeof(struct Map));
if (map->addr == MAP_FAILED) {
Free(map);
if (map->addr == MAP_FAILED)
break;
}
dll_init(&map->elem);
dll_make_first(&maps, &map->elem);
++n;
tree_insert(&maps, &map->tree, __maps_compare);
}
// created sorted array of maps
struct Map **sorted = SortMaps(maps, n);
// map memory into process
int granularity = __granularity();
for (int i = 0; i < n; ++i) {
struct Map *map = sorted[i];
for (struct Tree *e = tree_first(maps); e; e = tree_next(e)) {
struct Map *map = MAP_TREE_CONTAINER(e);
if ((uintptr_t)map->addr & (granularity - 1))
continue;
size_t size = map->size;
// get true length in case mprotect() chopped up actual win32 map
for (int j = i + 1;
j < n && sorted[j]->hand == -1 && map->addr + size == sorted[j]->addr;
++j) {
size += sorted[j]->size;
size_t size = map->size;
for (struct Tree *e2 = tree_next(e); e2; e2 = tree_next(e2)) {
struct Map *map2 = MAP_TREE_CONTAINER(e2);
if (map2->hand == -1 && map->addr + size == map2->addr) {
size += map2->size;
} else {
break;
}
}
// obtain the most permissive access possible
unsigned prot, access;
@@ -295,11 +270,11 @@ textwindows void WinMainForked(void) {
// fixup memory manager
__maps.free = 0;
__maps.used = 0;
__maps.maps = 0;
__maps.count = 0;
__maps.pages = 0;
for (int i = 0; i < n; ++i) {
struct Map *map = sorted[i];
for (struct Tree *e = tree_first(maps); e; e = tree_next(e)) {
struct Map *map = MAP_TREE_CONTAINER(e);
__maps.count += 1;
__maps.pages += (map->size + getpagesize() - 1) / getpagesize();
unsigned old_protect;
@@ -307,8 +282,7 @@ textwindows void WinMainForked(void) {
&old_protect))
AbortFork("VirtualProtect");
}
Free(sorted);
__maps.used = maps;
__maps.maps = maps;
__maps_init();
// mitosis complete
@@ -393,19 +367,13 @@ textwindows int sys_fork_nt(uint32_t dwCreationFlags) {
if (spawnrc != -1) {
CloseHandle(procinfo.hThread);
ok = WriteAll(writer, jb, sizeof(jb));
int count = 0;
// this list will be populated with the maps we're transferring
struct Dll *e2, *maps = 0;
for (struct Dll *e = dll_first(__maps.used); ok && e; e = e2) {
e2 = dll_next(__maps.used, e);
struct Map *map = MAP_CONTAINER(e);
for (struct Map *map = __maps_first(); ok && map;
map = __maps_next(map)) {
if (MAX((char *)__executable_start, map->addr) <
MIN((char *)_end, map->addr + map->size))
continue; // executable image is loaded by windows
dll_remove(&__maps.used, e);
dll_make_last(&maps, e);
ok = WriteAll(writer, map, sizeof(*map));
++count;
}
// send a terminating Map struct to child
if (ok) {
@@ -415,40 +383,44 @@ textwindows int sys_fork_nt(uint32_t dwCreationFlags) {
}
// now write content of each map to child
int granularity = __granularity();
struct Map **sorted = SortMaps(maps, count);
uint32_t *old_protect = Malloc(count * 4);
for (int i = 0; ok && i < count; ++i) {
struct Map *map = sorted[i];
for (struct Map *map = __maps_first(); ok && map;
map = __maps_next(map)) {
// we only need to worry about the base mapping
if ((uintptr_t)map->addr & (granularity - 1))
continue;
if (MAX((char *)__executable_start, map->addr) <
MIN((char *)_end, map->addr + map->size))
continue; // executable image is loaded by windows
// shared mappings don't need to be copied
if ((map->flags & MAP_TYPE) == MAP_SHARED)
continue;
// get true length in case mprotect() chopped up actual win32 map
int j;
size_t size = map->size;
for (j = i + 1; j < count && sorted[j]->hand == -1 &&
map->addr + size == sorted[j]->addr;
++j) {
size += sorted[j]->size;
for (struct Map *map2 = __maps_next(map); map2;
map2 = __maps_next(map2)) {
if (map2->hand == -1 && map->addr + size == map2->addr) {
size += map2->size;
} else {
break;
}
}
for (struct Map *map2 = map; ok && map2; map2 = __maps_next(map2)) {
if (!(map2->prot & PROT_READ))
if (map->addr >= map2->addr && map->addr < map->addr + size)
ok = VirtualProtect(
map2->addr, map2->size,
__prot2nt(map2->prot | PROT_READ, map2->iscow),
&map2->oldprot);
}
for (int k = i; ok && k < j; ++k)
if (!(sorted[k]->prot & PROT_READ))
ok = VirtualProtect(
sorted[k]->addr, sorted[k]->size,
__prot2nt(sorted[k]->prot | PROT_READ, map->iscow),
&old_protect[k]);
if (ok)
ok = WriteAll(writer, map->addr, size);
for (int k = i; ok && k < j; ++k)
if (!(sorted[k]->prot & PROT_READ))
ok = VirtualProtect(sorted[k]->addr, sorted[k]->size,
old_protect[k], &old_protect[k]);
for (struct Map *map2 = map; ok && map2; map2 = __maps_next(map2)) {
if (!(map2->prot & PROT_READ))
if (map->addr >= map2->addr && map->addr < map->addr + size)
ok = VirtualProtect(map2->addr, map2->size, map2->oldprot,
&map2->oldprot);
}
}
Free(old_protect);
Free(sorted);
dll_make_first(&__maps.used, maps);
if (ok)
ok = WriteAll(writer, __data_start, __data_end - __data_start);
if (ok)
@@ -466,6 +438,7 @@ textwindows int sys_fork_nt(uint32_t dwCreationFlags) {
} else {
TerminateProcess(procinfo.hProcess, SIGKILL);
CloseHandle(procinfo.hProcess);
rc = -1;
}
}
}
@@ -473,9 +446,8 @@ textwindows int sys_fork_nt(uint32_t dwCreationFlags) {
CloseHandle(reader);
if (writer != -1)
CloseHandle(writer);
if (rc == -1 && errno != ENOMEM) {
if (rc == -1 && errno != ENOMEM)
eagain(); // posix fork() only specifies two errors
}
} else {
rc = 0;
// re-apply code morphing for thread-local storage

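As an aside on the hunks above: the old code collected the transferred maps in a doubly linked Dll and insertion-sorted them (CountMaps/SortMaps), while the new code walks the map tree in address order with tree_first()/tree_next() (or __maps_next()) and folds together sub-maps that mprotect() split off a single win32 reservation (hand == -1, addresses adjacent). Below is a standalone sketch of that coalescing step using a plain sorted array and made-up Span/arena names rather than the commit's tree API.

/* hypothetical illustration, not the commit's code: fold address-adjacent
   sub-maps (hand == -1) back into the win32 reservation they were split
   from, mirroring the tree_next() loops above */
#include <stdio.h>
#include <stddef.h>

struct Span {
  char *addr;
  size_t size;
  long hand;  /* -1 means carved out of the previous reservation */
};

int main(void) {
  static char arena[0x4000];  /* stand-in address space for the example */
  struct Span spans[] = {
      {arena + 0x0000, 0x1000, 100},  /* owns a handle: starts a reservation */
      {arena + 0x1000, 0x1000, -1},   /* adjacent split: folds into it */
      {arena + 0x2000, 0x1000, -1},   /* adjacent split: folds in too */
      {arena + 0x3800, 0x0800, 200},  /* gap before it: new reservation */
  };
  int n = sizeof(spans) / sizeof(spans[0]);
  for (int i = 0; i < n;) {
    size_t size = spans[i].size;
    int j = i + 1;
    /* extend while the next span was split from this reservation and
       begins exactly where the accumulated range ends */
    while (j < n && spans[j].hand == -1 &&
           spans[i].addr + size == spans[j].addr)
      size += spans[j++].size;
    printf("reservation %p..%p covers %d span(s)\n", (void *)spans[i].addr,
           (void *)(spans[i].addr + size), j - i);
    i = j;
  }
  return 0;
}
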
@@ -22,6 +22,7 @@
#include "libc/calls/state.internal.h"
#include "libc/calls/struct/sigset.h"
#include "libc/calls/struct/sigset.internal.h"
#include "libc/calls/struct/timespec.h"
#include "libc/calls/syscall-nt.internal.h"
#include "libc/calls/syscall-sysv.internal.h"
#include "libc/dce.h"
@@ -45,7 +46,6 @@
#include "libc/thread/posixthread.internal.h"
#include "libc/thread/tls.h"
extern pthread_mutex_t nsync_waiters_mu;
extern pthread_mutex_t _pthread_lock_obj;
static void _onfork_prepare(void) {
@@ -54,11 +54,10 @@ static void _onfork_prepare(void) {
_pthread_lock();
__maps_lock();
__fds_lock();
pthread_mutex_lock(&nsync_waiters_mu);
LOCKTRACE("READY TO ROCK AND ROLL");
}
static void _onfork_parent(void) {
pthread_mutex_unlock(&nsync_waiters_mu);
__fds_unlock();
__maps_unlock();
_pthread_unlock();
@@ -68,7 +67,6 @@ static void _onfork_parent(void) {
static void _onfork_child(void) {
__fds_lock_obj = (pthread_mutex_t)PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP;
nsync_waiters_mu = (pthread_mutex_t)PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP;
_pthread_lock_obj = (pthread_mutex_t)PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP;
atomic_store_explicit(&__maps.lock, 0, memory_order_relaxed);
atomic_store_explicit(&__get_tls()->tib_relock_maps, 0, memory_order_relaxed);
@@ -77,7 +75,9 @@ static void _onfork_child(void) {
}
int _fork(uint32_t dwCreationFlags) {
long micros;
struct Dll *e;
struct timespec started;
int ax, dx, tid, parent;
parent = __pid;
BLOCK_SIGNALS;
@@ -85,11 +85,13 @@ int _fork(uint32_t dwCreationFlags) {
__proc_lock();
if (__threaded)
_onfork_prepare();
started = timespec_real();
if (!IsWindows()) {
ax = sys_fork();
} else {
ax = sys_fork_nt(dwCreationFlags);
}
micros = timespec_tomicros(timespec_sub(timespec_real(), started));
if (!ax) {
// get new process id
@@ -136,15 +138,14 @@ int _fork(uint32_t dwCreationFlags) {
// run user fork callbacks
if (__threaded)
_onfork_child();
STRACE("fork() → 0 (child of %d)", parent);
STRACE("fork() → 0 (child of %d; took %ld us)", parent, micros);
} else {
// this is the parent process
if (__threaded) {
if (__threaded)
_onfork_parent();
}
if (IsWindows())
__proc_unlock();
STRACE("fork() → %d% m", ax);
STRACE("fork() → %d% m (took %ld us)", ax, micros);
}
ALLOW_SIGNALS;
return ax;

@@ -482,9 +482,8 @@ errno_t posix_spawn(int *pid, const char *path,
const posix_spawn_file_actions_t *file_actions,
const posix_spawnattr_t *attrp, char *const argv[],
char *const envp[]) {
if (IsWindows()) {
if (IsWindows())
return posix_spawn_nt(pid, path, file_actions, attrp, argv, envp);
}
int pfds[2];
bool use_pipe;
volatile int status = 0;
@@ -516,66 +515,55 @@ errno_t posix_spawn(int *pid, const char *path,
sigaction(sig, &dfl, 0);
}
}
if (flags & POSIX_SPAWN_SETSID) {
if (flags & POSIX_SPAWN_SETSID)
setsid();
}
if ((flags & POSIX_SPAWN_SETPGROUP) && setpgid(0, (*attrp)->pgroup)) {
if ((flags & POSIX_SPAWN_SETPGROUP) && setpgid(0, (*attrp)->pgroup))
goto ChildFailed;
}
if ((flags & POSIX_SPAWN_RESETIDS) && setgid(getgid())) {
if ((flags & POSIX_SPAWN_RESETIDS) && setgid(getgid()))
goto ChildFailed;
}
if ((flags & POSIX_SPAWN_RESETIDS) && setuid(getuid())) {
if ((flags & POSIX_SPAWN_RESETIDS) && setuid(getuid()))
goto ChildFailed;
}
if (file_actions) {
struct _posix_faction *a;
for (a = *file_actions; a; a = a->next) {
if (use_pipe && pfds[1] == a->fildes) {
int p2;
if ((p2 = dup(pfds[1])) == -1) {
if ((p2 = dup(pfds[1])) == -1)
goto ChildFailed;
}
lost_cloexec = true;
close(pfds[1]);
pfds[1] = p2;
}
switch (a->action) {
case _POSIX_SPAWN_CLOSE:
if (close(a->fildes)) {
if (close(a->fildes))
goto ChildFailed;
}
break;
case _POSIX_SPAWN_DUP2:
if (dup2(a->fildes, a->newfildes) == -1) {
if (dup2(a->fildes, a->newfildes) == -1)
goto ChildFailed;
}
break;
case _POSIX_SPAWN_OPEN: {
int t;
if ((t = openat(AT_FDCWD, a->path, a->oflag, a->mode)) == -1) {
if ((t = openat(AT_FDCWD, a->path, a->oflag, a->mode)) == -1)
goto ChildFailed;
}
if (t != a->fildes) {
if (dup2(t, a->fildes) == -1) {
close(t);
goto ChildFailed;
}
if (close(t)) {
if (close(t))
goto ChildFailed;
}
}
break;
}
case _POSIX_SPAWN_CHDIR:
if (chdir(a->path) == -1) {
if (chdir(a->path) == -1)
goto ChildFailed;
}
break;
case _POSIX_SPAWN_FCHDIR:
if (fchdir(a->fildes) == -1) {
if (fchdir(a->fildes) == -1)
goto ChildFailed;
}
break;
default:
__builtin_unreachable();
@@ -583,17 +571,13 @@ errno_t posix_spawn(int *pid, const char *path,
}
}
if (IsLinux() || IsFreebsd() || IsNetbsd()) {
if (flags & POSIX_SPAWN_SETSCHEDULER) {
if (flags & POSIX_SPAWN_SETSCHEDULER)
if (sched_setscheduler(0, (*attrp)->schedpolicy,
&(*attrp)->schedparam) == -1) {
&(*attrp)->schedparam) == -1)
goto ChildFailed;
}
}
if (flags & POSIX_SPAWN_SETSCHEDPARAM) {
if (sched_setparam(0, &(*attrp)->schedparam)) {
if (flags & POSIX_SPAWN_SETSCHEDPARAM)
if (sched_setparam(0, &(*attrp)->schedparam))
goto ChildFailed;
}
}
}
if (flags & POSIX_SPAWN_SETRLIMIT) {
int rlimset = (*attrp)->rlimset;
@@ -608,9 +592,8 @@ errno_t posix_spawn(int *pid, const char *path,
}
}
}
if (lost_cloexec) {
if (lost_cloexec)
fcntl(pfds[1], F_SETFD, FD_CLOEXEC);
}
if (flags & POSIX_SPAWN_SETSIGMASK) {
childmask = (*attrp)->sigmask;
} else {
@ -636,9 +619,8 @@ errno_t posix_spawn(int *pid, const char *path,
if (!use_pipe) {
res = status;
} else {
if (can_clobber) {
if (can_clobber)
atomic_store_explicit(&has_vfork, true, memory_order_release);
}
res = 0;
read(pfds[0], &res, sizeof(res));
}
@@ -651,9 +633,8 @@ errno_t posix_spawn(int *pid, const char *path,
} else {
res = errno;
}
if (use_pipe) {
if (use_pipe)
close(pfds[0]);
}
ParentFailed:
sigprocmask(SIG_SETMASK, &oldmask, 0);
pthread_setcancelstate(cs, 0);